diff --git a/.dockerignore b/.dockerignore index fe1ff54b7..22a52b8b8 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,5 +4,4 @@ */.mypy_cache */.pytest_cache */build -*/*/_mars **/node_modules \ No newline at end of file diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 04edad397..25b9f87eb 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -43,9 +43,11 @@ jobs: - uses: isort/isort-action@master with: sortPaths: "python/xorbits" - configuration: "--check-only --profile black --diff --skip-glob xorbits/_mars/" + configuration: "--check-only --profile black --diff --skip python/xorbits/_mars/" - name: mypy run: pip install mypy && cd python && mypy xorbits + - name: codespell + run: pip install codespell && cd python && codespell xorbits - name: Set up Node.js uses: actions/setup-node@v1 with: diff --git a/.gitignore b/.gitignore index aa3aa9542..88c75be57 100644 --- a/.gitignore +++ b/.gitignore @@ -119,6 +119,16 @@ venv.bak/ # mkdocs documentation /site +# cython compiled files +python/xorbits/_mars/*.c* +python/xorbits/_mars/*.h* +python/xorbits/_mars/core/**/*.c* +python/xorbits/_mars/learn/cluster/*.c* +python/xorbits/_mars/learn/utils/*.c* +python/xorbits/_mars/lib/*.c* +python/xorbits/_mars/oscar/**/*.c* +python/xorbits/_mars/serialization/*.c* + # mypy .mypy_cache/ .dmypy.json @@ -132,9 +142,6 @@ dmypy.json .vscode *.iml -# soft link -python/xorbits/_mars - # web staff node_modules/ static/ diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 97bb4bb60..000000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "third_party/_mars"] - path = third_party/_mars - url = https://github.com/xprobe-inc/mars.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 30f2f5c25..a3988130a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,15 +18,23 @@ repos: rev: 5.12.0 hooks: - id: isort - args: [--profile=black] + args: [--sp, python/setup.cfg] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.991 + rev: v1.0.0 hooks: - id: mypy additional_dependencies: [tokenize-rt==3.2.0] - args: [--config-file, python/setup.cfg] + exclude: _mars + args: [--ignore-missing-imports, --follow-imports, skip] - repo: https://github.com/pre-commit/mirrors-prettier rev: v3.0.0-alpha.4 # Use the sha or tag you want to point at hooks: - id: prettier types_or: [html, javascript] + args: [--ignore-path, python/xorbits/_mars] + - repo: https://github.com/codespell-project/codespell + rev: v2.2.2 + hooks: + - id: codespell + exclude: _mars/lib + args: [ --config, python/setup.cfg] diff --git a/python/setup.cfg b/python/setup.cfg index b8f471686..22084a4dd 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -168,13 +168,16 @@ exclude = ci/ dist/ docs/ - xorbits/_mars/* + xorbits/_mars/lib/nvutils.py + xorbits/_mars/lib/uhashring/* + xorbits/_mars/lib/version.py + per-file-ignores = */core/adapter.py: F401 [codespell] ignore-words-list = hist,rcall,fpr,ser,nd,inout,ot,Ba,ba,asend,hart,coo,splitted,datas,fro -skip = .idea,.git,./build,./docs/build,./xorbits/_mars/lib,node_modules,static,generated,*.po,*.ts,*.json,*.c,*.cpp,*.cfg +skip = .idea,.git,./build,./docs/build,xorbits/_mars/lib,node_modules,static,generated,*.po,*.ts,*.json,*.c,*.cpp,*.cfg [isort] profile = black diff --git a/python/setup.py b/python/setup.py index 5fdba7d35..ab2f504bb 100644 --- a/python/setup.py +++ b/python/setup.py @@ -61,15 +61,6 @@ repo_root = os.path.dirname(os.path.abspath(__file__)) 
os.chdir(repo_root) -# create symlink for mars -absolute_path = os.path.join(repo_root, os.path.join("xorbits", "_mars")) -source_path = os.path.join("..", "..", "third_party", "_mars", "mars") -try: - os.symlink(source_path, absolute_path, target_is_directory=True) -except FileExistsError: - # symlink exists already, skip - pass - cythonize_kw = dict(language_level=sys.version_info[0]) cy_extension_kw = dict() diff --git a/python/xorbits/_mars/__init__.py b/python/xorbits/_mars/__init__.py new file mode 100644 index 000000000..3e3beccbb --- /dev/null +++ b/python/xorbits/_mars/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import _version +from .config import options +from .core.context import get_context +from .deploy.oscar import new_cluster_in_ray, new_ray_session +from .session import execute, fetch, fetch_log, new_session, stop_server + +__version__ = _version.get_versions()["version"] diff --git a/python/xorbits/_mars/_resource.pyx b/python/xorbits/_mars/_resource.pyx new file mode 100644 index 000000000..63803adcf --- /dev/null +++ b/python/xorbits/_mars/_resource.pyx @@ -0,0 +1,73 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cdef class Resource: + cdef readonly: + float num_cpus + float num_gpus + float mem_bytes + + def __init__(self, float num_cpus=0, float num_gpus=0, float mem_bytes=0): + self.num_cpus = num_cpus + self.num_gpus = num_gpus + self.mem_bytes = mem_bytes + + def __eq__(self, Resource other): + cdef bint ret = ( + self.mem_bytes == other.mem_bytes + and self.num_gpus == other.num_gpus + and self.num_cpus == other.num_cpus + ) + return ret + + cdef bint _le(self, Resource other) nogil: + # memory first, then gpu, cpu last + cdef bint ret = ( + self.mem_bytes <= other.mem_bytes + and self.num_gpus <= other.num_gpus + and self.num_cpus <= other.num_cpus + ) + return ret + + def __gt__(self, Resource other): + return not self._le(other) + + def __le__(self, Resource other): + return self._le(other) + + def __add__(self, Resource other): + return Resource( + num_cpus=self.num_cpus + other.num_cpus, + num_gpus=self.num_gpus + other.num_gpus, + mem_bytes=self.mem_bytes + other.mem_bytes, + ) + + def __sub__(self, Resource other): + return Resource( + num_cpus=self.num_cpus - other.num_cpus, + num_gpus=self.num_gpus - other.num_gpus, + mem_bytes=self.mem_bytes - other.mem_bytes, + ) + + def __neg__(self): + return Resource( + num_cpus=-self.num_cpus, + num_gpus=-self.num_gpus, + mem_bytes=-self.mem_bytes, + ) + + def __repr__(self): + return f"Resource(num_cpus={self.num_cpus}, num_gpus={self.num_gpus}, mem_bytes={self.mem_bytes})" + +ZeroResource = Resource(num_cpus=0, num_gpus=0, mem_bytes=0) diff --git a/python/xorbits/_mars/_utils.pxd b/python/xorbits/_mars/_utils.pxd new file mode 100644 index 000000000..d875ff78c --- /dev/null +++ b/python/xorbits/_mars/_utils.pxd @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +cdef class TypeDispatcher: + cdef dict _handlers + cdef dict _lazy_handlers + cdef dict _inherit_handlers + cdef object __weakref__ + + cpdef void register(self, object type_, object handler) + cpdef void unregister(self, object type_) + cdef _reload_lazy_handlers(self) + cpdef get_handler(self, object type_) + + +cpdef str to_str(s, encoding=*) +cpdef bytes to_binary(s, encoding=*) +cpdef unicode to_text(s, encoding=*) +cpdef register_tokenizer(cls, handler) +cpdef void reset_id_random_seed() except * +cpdef bytes new_random_id(int byte_len) diff --git a/python/xorbits/_mars/_utils.pyx b/python/xorbits/_mars/_utils.pyx new file mode 100644 index 000000000..a7740f8df --- /dev/null +++ b/python/xorbits/_mars/_utils.pyx @@ -0,0 +1,508 @@ +# distutils: language = c++ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import importlib
+import itertools
+import os
+import pickle
+import pkgutil
+import time
+import types
+import uuid
+import warnings
+from datetime import date, datetime, timedelta, tzinfo
+from enum import Enum
+from functools import lru_cache, partial
+from random import getrandbits
+from weakref import WeakSet
+
+import cloudpickle
+import numpy as np
+import pandas as pd
+
+cimport cython
+from cpython cimport PyBytes_FromStringAndSize
+from libc.stdint cimport uint8_t, uint32_t, uint_fast64_t
+from libc.stdlib cimport free, malloc
+
+from .lib.cython.libcpp cimport mt19937_64
+
+try:
+    from pandas.tseries.offsets import Tick as PDTick
+except ImportError:
+    PDTick = None
+
+from .lib.mmh3 import hash as mmh_hash
+from .lib.mmh3 import hash_bytes as mmh_hash_bytes
+from .lib.mmh3 import hash_from_buffer as mmh3_hash_from_buffer
+
+
+cdef bint _has_cupy = bool(pkgutil.find_loader('cupy'))
+cdef bint _has_cudf = bool(pkgutil.find_loader('cudf'))
+cdef bint _has_sqlalchemy = bool(pkgutil.find_loader('sqlalchemy'))
+cdef bint _has_interval_array_inclusive = hasattr(
+    pd.arrays.IntervalArray, "inclusive"
+)
+
+
+cdef extern from "MurmurHash3.h":
+    void MurmurHash3_x64_128(const void * key, Py_ssize_t len, uint32_t seed, void * out)
+
+
+cdef bytes _get_mars_key(const uint8_t[:] bufferview):
+    cdef const uint8_t *data = &bufferview[0]
+    cdef uint8_t out[16]
+    MurmurHash3_x64_128(data, len(bufferview), 0, out)
+    out[0] |= 0xC0
+    return PyBytes_FromStringAndSize(<char*>out, 16)
+
+
+cpdef str to_str(s, encoding='utf-8'):
+    if type(s) is str:
+        return s
+    elif isinstance(s, bytes):
+        return (<bytes>s).decode(encoding)
+    elif isinstance(s, str):
+        return str(s)
+    elif s is None:
+        return s
+    else:
+        raise TypeError(f"Could not convert from {s} to str.")
+
+
+cpdef bytes to_binary(s, encoding='utf-8'):
+    if type(s) is bytes:
+        return s
+    elif isinstance(s, unicode):
+        return (<unicode>s).encode(encoding)
+    elif isinstance(s, bytes):
+        return bytes(s)
+    elif s is None:
+        return None
+    else:
+        raise TypeError(f"Could not convert from {s} to bytes.")
+
+
+cpdef unicode to_text(s, encoding='utf-8'):
+    if type(s) is unicode:
+        return s
+    elif isinstance(s, bytes):
+        return (<bytes>s).decode('utf-8')
+    elif isinstance(s, unicode):
+        return unicode(s)
+    elif s is None:
+        return None
+    else:
+        raise TypeError(f"Could not convert from {s} to unicode.")
+
+
+_type_dispatchers = WeakSet()
+
+
+NamedType = collections.namedtuple("NamedType", ["name", "type_"])
+
+
+cdef class TypeDispatcher:
+    def __init__(self):
+        self._handlers = dict()
+        self._lazy_handlers = dict()
+        # store inherited handlers to facilitate unregistering
+        self._inherit_handlers = dict()
+
+        _type_dispatchers.add(self)
+
+    cpdef void register(self, object type_, object handler):
+        if isinstance(type_, str):
+            self._lazy_handlers[type_] = handler
+        elif type(type_) is not NamedType and isinstance(type_, tuple):
+            for t in type_:
+                self.register(t, handler)
+        else:
+            self._handlers[type_] = handler
+
+    cpdef void unregister(self, object type_):
+        if type(type_) is not NamedType and isinstance(type_, tuple):
+            for t in type_:
+                self.unregister(t)
+        else:
+            self._lazy_handlers.pop(type_, None)
+            self._handlers.pop(type_, None)
+            self._inherit_handlers.clear()
+
+    cdef _reload_lazy_handlers(self):
+        for k, v in self._lazy_handlers.items():
+            mod_name, obj_name = k.rsplit('.', 1)
+            with warnings.catch_warnings():
+                # the lazy imported cudf will warn no device found,
+                # when we set visible device to -1 for CPU processes,
+                # ignore the warning to not distract users
+                warnings.simplefilter("ignore")
+                mod = importlib.import_module(mod_name, __name__)
+            self.register(getattr(mod, obj_name), v)
+        self._lazy_handlers = dict()
+
+    cpdef get_handler(self, object type_):
+        try:
+            return self._handlers[type_]
+        except KeyError:
+            pass
+
+        try:
+            return self._inherit_handlers[type_]
+        except KeyError:
+            self._reload_lazy_handlers()
+            if type(type_) is NamedType:
+                named_type = partial(NamedType, type_.name)
+                mro = itertools.chain(
+                    *zip(map(named_type, type_.type_.__mro__),
+                         type_.type_.__mro__)
+                )
+            else:
+                mro = type_.__mro__
+            for clz in mro:
+                # only lookup self._handlers for mro clz
+                handler = self._handlers.get(clz)
+                if handler is not None:
+                    self._inherit_handlers[type_] = handler
+                    return handler
+            raise KeyError(f'Cannot dispatch type {type_}')
+
+    def __call__(self, object obj, *args, **kwargs):
+        return self.get_handler(type(obj))(obj, *args, **kwargs)
+
+    @staticmethod
+    def reload_all_lazy_handlers():
+        for dispatcher in _type_dispatchers:
+            (<TypeDispatcher>dispatcher)._reload_lazy_handlers()
+
+
+cdef inline build_canonical_bytes(tuple args, kwargs):
+    if kwargs:
+        args = args + (kwargs,)
+    return pickle.dumps(tokenize_handler(args))
+
+
+def tokenize(*args, **kwargs):
+    return _get_mars_key(build_canonical_bytes(args, kwargs)).hex()
+
+
+def tokenize_int(*args, **kwargs):
+    return mmh_hash(build_canonical_bytes(args, kwargs))
+
+
+cdef class Tokenizer(TypeDispatcher):
+    def __call__(self, object obj, *args, **kwargs):
+        try:
+            return self.get_handler(type(obj))(obj, *args, **kwargs)
+        except KeyError:
+            if hasattr(obj, '__mars_tokenize__') and not isinstance(obj, type):
+                if len(args) == 0 and len(kwargs) == 0:
+                    return obj.__mars_tokenize__()
+                else:
+                    obj = obj.__mars_tokenize__()
+                    return self.get_handler(type(obj))(obj, *args, **kwargs)
+            if callable(obj):
+                if PDTick is not None and not isinstance(obj, PDTick):
+                    return tokenize_function(obj)
+
+            try:
+                return cloudpickle.dumps(obj)
+            except:
+                raise TypeError(f'Cannot generate token for {obj}, type: {type(obj)}') from None
+
+
+cdef inline list iterative_tokenize(object ob):
+    cdef list dq = [ob]
+    cdef int dq_pos = 0
+    cdef list h_list = []
+    while dq_pos < len(dq):
+        x = dq[dq_pos]
+        dq_pos += 1
+        if type(x) in _primitive_types:
+            h_list.append(x)
+        elif isinstance(x, (list, tuple)):
+            dq.extend(x)
+        elif isinstance(x, set):
+            dq.extend(sorted(x))
+        elif isinstance(x, dict):
+            dq.extend(sorted(x.items()))
+        else:
+            h_list.append(tokenize_handler(x))
+
+        if dq_pos >= 64 and len(dq) < dq_pos * 2:  # pragma: no cover
+            dq = dq[dq_pos:]
+            dq_pos = 0
+    return h_list
+
+
+cdef inline tuple tokenize_numpy(ob):
+    cdef int offset
+
+    if not ob.shape:
+        return str(ob), ob.dtype
+    if hasattr(ob, 'mode') and getattr(ob, 'filename', None):
+        if hasattr(ob.base, 'ctypes'):
+            offset = (ob.ctypes.get_as_parameter().value -
+                      ob.base.ctypes.get_as_parameter().value)
+        else:
+            offset = 0  # root memmaps have mmap object as base
+        return (ob.filename, os.path.getmtime(ob.filename), ob.dtype,
+                ob.shape, ob.strides, offset)
+    if ob.dtype.hasobject:
+        try:
+            data = 
mmh_hash_bytes('-'.join(ob.flat).encode('utf-8', errors='surrogatepass')) + except UnicodeDecodeError: + data = mmh_hash_bytes(b'-'.join([to_binary(x) for x in ob.flat])) + except TypeError: + try: + data = mmh_hash_bytes(pickle.dumps(ob, pickle.HIGHEST_PROTOCOL)) + except: + # nothing can do, generate uuid + data = uuid.uuid4().hex + else: + try: + data = mmh_hash_bytes(ob.ravel().view('i1').data) + except (BufferError, AttributeError, ValueError): + data = mmh_hash_bytes(ob.copy().ravel().view('i1').data) + return data, ob.dtype, ob.shape, ob.strides + + +cdef inline _extract_range_index_attr(object range_index, str attr): + try: + return getattr(range_index, attr) + except AttributeError: # pragma: no cover + return getattr(range_index, '_' + attr) + + +cdef list tokenize_pandas_index(ob): + cdef long long start + cdef long long stop + cdef long long end + if isinstance(ob, pd.RangeIndex): + start = _extract_range_index_attr(ob, 'start') + stop = _extract_range_index_attr(ob, 'stop') + step = _extract_range_index_attr(ob, 'step') + # for range index, there is no need to get the values + return iterative_tokenize([ob.name, getattr(ob, 'names', None), slice(start, stop, step)]) + else: + return iterative_tokenize([ob.name, getattr(ob, 'names', None), ob.values]) + + +cdef list tokenize_pandas_series(ob): + return iterative_tokenize([ob.name, ob.dtype, ob.values, ob.index]) + + +cdef list tokenize_pandas_dataframe(ob): + l = [block.values for block in ob._data.blocks] + l.extend([ob.columns, ob.index]) + return iterative_tokenize(l) + + +cdef list tokenize_pandas_categorical(ob): + l = ob.to_list() + l.append(ob.shape) + return iterative_tokenize(l) + + +cdef list tokenize_pd_extension_dtype(ob): + return iterative_tokenize([ob.name]) + + +cdef list tokenize_categories_dtype(ob): + return iterative_tokenize([ob.categories, ob.ordered]) + + +cdef list tokenize_interval_dtype(ob): + return iterative_tokenize([type(ob).__name__, ob.subtype]) + + +cdef list tokenize_pandas_time_arrays(ob): + return iterative_tokenize([ob.asi8, ob.dtype]) + + +cdef list tokenize_pandas_tick(ob): + return iterative_tokenize([ob.freqstr]) + + +cdef list tokenize_pandas_interval_arrays(ob): # pragma: no cover + if _has_interval_array_inclusive: + return iterative_tokenize([ob.left, ob.right, ob.inclusive]) + else: + return iterative_tokenize([ob.left, ob.right, ob.closed]) + + +cdef list tokenize_sqlalchemy_data_type(ob): + return iterative_tokenize([repr(ob)]) + + +cdef list tokenize_sqlalchemy_selectable(ob): + return iterative_tokenize([str(ob)]) + + +cdef list tokenize_enum(ob): + cls = type(ob) + return iterative_tokenize([id(cls), cls.__name__, ob.name]) + + +@lru_cache(500) +def tokenize_function(ob): + if isinstance(ob, partial): + args = iterative_tokenize(ob.args) + keywords = iterative_tokenize(ob.keywords.items()) if ob.keywords else None + return tokenize_function(ob.func), args, keywords + else: + try: + if isinstance(ob, types.FunctionType): + return iterative_tokenize([pickle.dumps(ob, protocol=0), id(ob)]) + else: + return pickle.dumps(ob, protocol=0) + except: + pass + try: + return cloudpickle.dumps(ob, protocol=0) + except: + return str(ob) + + +@lru_cache(500) +def tokenize_pickled_with_cache(ob): + return pickle.dumps(ob) + + +def tokenize_cupy(ob): + from .serialization import serialize + header, _buffers = serialize(ob) + return iterative_tokenize([header, ob.data.ptr]) + + +def tokenize_cudf(ob): + from .serialization import serialize + header, buffers = serialize(ob) + return 
iterative_tokenize([header] + [(buf.ptr, buf.size) for buf in buffers])
+
+
+cdef Tokenizer tokenize_handler = Tokenizer()
+
+cdef set _primitive_types = {
+    int, float, str, unicode, bytes, complex, type(None), type, slice, date, datetime, timedelta
+}
+for t in _primitive_types:
+    tokenize_handler.register(t, lambda ob: ob)
+
+for t in (np.dtype, np.generic):
+    tokenize_handler.register(t, lambda ob: ob)
+
+for t in (list, tuple, dict, set):
+    tokenize_handler.register(t, iterative_tokenize)
+
+tokenize_handler.register(np.ndarray, tokenize_numpy)
+tokenize_handler.register(np.random.RandomState, lambda ob: iterative_tokenize(ob.get_state()))
+tokenize_handler.register(memoryview, lambda ob: mmh3_hash_from_buffer(ob))
+tokenize_handler.register(Enum, tokenize_enum)
+tokenize_handler.register(pd.Index, tokenize_pandas_index)
+tokenize_handler.register(pd.Series, tokenize_pandas_series)
+tokenize_handler.register(pd.DataFrame, tokenize_pandas_dataframe)
+tokenize_handler.register(pd.Categorical, tokenize_pandas_categorical)
+tokenize_handler.register(pd.CategoricalDtype, tokenize_categories_dtype)
+tokenize_handler.register(pd.IntervalDtype, tokenize_interval_dtype)
+tokenize_handler.register(tzinfo, tokenize_pickled_with_cache)
+tokenize_handler.register(pd.arrays.DatetimeArray, tokenize_pandas_time_arrays)
+tokenize_handler.register(pd.arrays.TimedeltaArray, tokenize_pandas_time_arrays)
+tokenize_handler.register(pd.arrays.PeriodArray, tokenize_pandas_time_arrays)
+tokenize_handler.register(pd.arrays.IntervalArray, tokenize_pandas_interval_arrays)
+tokenize_handler.register(pd.api.extensions.ExtensionDtype, tokenize_pd_extension_dtype)
+if _has_cupy:
+    tokenize_handler.register('cupy.ndarray', tokenize_cupy)
+if _has_cudf:
+    tokenize_handler.register('cudf.DataFrame', tokenize_cudf)
+    tokenize_handler.register('cudf.Series', tokenize_cudf)
+    tokenize_handler.register('cudf.Index', tokenize_cudf)
+
+if PDTick is not None:
+    tokenize_handler.register(PDTick, tokenize_pandas_tick)
+if _has_sqlalchemy:
+    tokenize_handler.register(
+        "sqlalchemy.sql.sqltypes.TypeEngine", tokenize_sqlalchemy_data_type
+    )
+    tokenize_handler.register(
+        "sqlalchemy.sql.Selectable", tokenize_sqlalchemy_selectable
+    )
+
+cpdef register_tokenizer(cls, handler):
+    tokenize_handler.register(cls, handler)
+
+
+@cython.nonecheck(False)
+@cython.cdivision(True)
+cpdef long long ceildiv(long long x, long long y) nogil:
+    return x // y + (x % y != 0)
+
+
+cdef class Timer:
+    cdef object _start
+    cdef readonly object duration
+
+    def __enter__(self):
+        self._start = time.time()
+        return self
+
+    def __exit__(self, *_):
+        self.duration = time.time() - self._start
+
+
+cdef mt19937_64 _rnd_gen
+cdef bint _rnd_is_seed_set = False
+
+
+cpdef void reset_id_random_seed() except *:
+    cdef bytes seed_bytes
+    global _rnd_is_seed_set
+
+    seed_bytes = getrandbits(64).to_bytes(8, "little")
+    _rnd_gen.seed((<uint_fast64_t *><char *>seed_bytes)[0])
+    _rnd_is_seed_set = True
+
+
+cpdef bytes new_random_id(int byte_len):
+    cdef uint_fast64_t *res_ptr
+    cdef uint_fast64_t res_data[4]
+    cdef int i, qw_num = byte_len >> 3
+    cdef bytes res
+
+    if not _rnd_is_seed_set:
+        reset_id_random_seed()
+
+    if (qw_num << 3) < byte_len:
+        qw_num += 1
+
+    if qw_num <= 4:
+        # use stack memory to accelerate
+        res_ptr = res_data
+    else:
+        res_ptr = <uint_fast64_t *>malloc(qw_num << 3)
+
+    try:
+        for i in range(qw_num):
+            res_ptr[i] = _rnd_gen()
+        return <bytes>((<char *>&(res_ptr[0]))[:byte_len])
+    finally:
+        # free memory if allocated by malloc
+        if res_ptr != res_data:
+            free(res_ptr)
+
+
+__all__ = ['to_str', 
'to_binary', 'to_text', 'TypeDispatcher', 'tokenize', 'tokenize_int', + 'register_tokenizer', 'ceildiv', 'Timer', 'reset_id_random_seed', 'new_random_id'] diff --git a/python/xorbits/_mars/_version.py b/python/xorbits/_mars/_version.py new file mode 100644 index 000000000..742480dba --- /dev/null +++ b/python/xorbits/_mars/_version.py @@ -0,0 +1,692 @@ +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.23 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys +from typing import Callable, Dict +import functools + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). + git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "pymars-" + cfg.versionfile_source = "mars/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + process = None + + popen_kwargs = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: + try: + dispcmd = str([command] + args) + # remember shell=False, so use git.cmd on windows, not just git + process = subprocess.Popen( + [command] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + **popen_kwargs, + ) + break + except OSError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = process.communicate()[0].strip().decode() + if 
process.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, process.returncode + return stdout, process.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for _ in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r"\d", r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix) :] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r"\d", r): + continue + if verbose: + print("picking %s" % r) + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + f"{tag_prefix}[[:digit:]]*", + ], + cwd=root, + ) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. 
+ branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. + branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[: git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + if not mo: + # unparsable. Maybe git-describe is misbehaving? + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix) :] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_branch(pieces): + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). + + Exceptions: + 1: no tags. 
0[.dev0]+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver): + """Split pep440 version string at the post-release segment. + + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). + """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces): + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + if pieces["distance"]: + # update the post release segment + tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + rendered = tag_version + if post_version is not None: + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) + else: + rendered += ".post0.dev%d" % (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_post_branch(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for _ in cfg.versionfile_source.split("/"): + root = os.path.dirname(root) + except NameError: + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/python/xorbits/_mars/config.py b/python/xorbits/_mars/config.py new file mode 100644 index 000000000..f20513450 --- /dev/null +++ b/python/xorbits/_mars/config.py @@ -0,0 +1,443 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import functools +import operator +import os +import threading +import warnings +from copy import deepcopy +from typing import Dict, Union + +_DEFAULT_REDIRECT_WARN = ( + "Option {source} has been replaced by {target} and " + "might be removed in a future release." +) + + +class OptionError(Exception): + pass + + +class Redirection: + def __init__(self, item, warn=None): + self._items = item.split(".") + self._warn = warn + self._warned = True + self._parent = None + + def bind(self, attr_dict): + self._parent = attr_dict + self.getvalue() + self._warned = False + + def getvalue(self): + if self._warn and not self._warned: + self._warned = True + warnings.warn(self._warn) + conf = self._parent.root + for it in self._items: + conf = getattr(conf, it) + return conf + + def setvalue(self, value): + if self._warn and not self._warned: + self._warned = True + warnings.warn(self._warn) + conf = self._parent.root + for it in self._items[:-1]: + conf = getattr(conf, it) + setattr(conf, self._items[-1], value) + + +class AttributeDict(dict): + def __init__(self, *args, **kwargs): + self._inited = False + self._parent = kwargs.pop("_parent", None) + self._root = None + super().__init__(*args, **kwargs) + self._inited = True + + @property + def root(self): + if self._root is not None: + return self._root + if self._parent is None: + self._root = self + else: + self._root = self._parent.root + return self._root + + def __getattr__(self, item): + if item in self: + val = self[item] + if isinstance(val, AttributeDict): + return val + elif isinstance(val[0], Redirection): + return val[0].getvalue() + else: + return val[0] + return object.__getattribute__(self, item) + + def __dir__(self): + return list(self.keys()) + + def register(self, key, value, validator=None): + if isinstance(validator, tuple): + validator = any_validator(*validator) + self[key] = value, validator + if isinstance(value, Redirection): + value.bind(self) + + def unregister(self, key): + del self[key] + + def _setattr(self, key, value, silent=False): + splits = key.split(".") + target = self + 
for k in splits[:-1]: + if not silent and ( + not isinstance(target, AttributeDict) or k not in target + ): + raise OptionError("You can only set the value of existing options") + target = target[k] + key = splits[-1] + + if not isinstance(value, AttributeDict): + validate = None + if key in target: + val = target[key] + validate = target[key][1] + if validate is not None: + if not validate(value): + raise ValueError(f"Cannot set value `{value}`") + if isinstance(val[0], Redirection): + val[0].setvalue(value) + else: + target[key] = value, validate + else: + target[key] = value, validate + else: + target[key] = value + + def __setattr__(self, key, value): + if key == "_inited": + super().__setattr__(key, value) + return + try: + object.__getattribute__(self, key) + super().__setattr__(key, value) + return + except AttributeError: + pass + + if not self._inited: + super().__setattr__(key, value) + else: + self._setattr(key, value) + + def to_dict(self): + d = dict() + for k, v in self.items(): + if isinstance(v, AttributeDict): + d.update( + {k + "." + sub_k: sub_v for sub_k, sub_v in v.to_dict().items()} + ) + elif isinstance(v[0], Redirection): + continue + else: + d[k] = v[0] + return d + + +class Config: + def __init__(self, config=None): + self._config = config or AttributeDict() + self._serialize_options = [] + + def __dir__(self): + return list(self._config.keys()) + + def __getattr__(self, item): + config = object.__getattribute__(self, "_config") + return getattr(config, item) + + def __setattr__(self, key, value): + if key.startswith("_"): + object.__setattr__(self, key, value) + return + setattr(self._config, key, value) + + def register_option(self, option, value, validator=None, serialize=False): + splits = option.split(".") + conf = self._config + if isinstance(validator, tuple): + validator = any_validator(*validator) + + for name in splits[:-1]: + config = conf.get(name) + if config is None: + val = AttributeDict(_parent=conf) + conf[name] = val + conf = val + elif not isinstance(config, dict): + raise AttributeError( + f"Fail to set option: {option}, conflict has encountered" + ) + else: + conf = config + + key = splits[-1] + if conf.get(key) is not None: + raise AttributeError(f"Fail to set option: {option}, option has been set") + + conf.register(key, value, validator) + if serialize: + self._serialize_options.append(option) + + def redirect_option(self, option, target, warn=_DEFAULT_REDIRECT_WARN): + redir = Redirection(target, warn=warn.format(source=option, target=target)) + self.register_option(option, redir) + + def unregister_option(self, option): + splits = option.split(".") + conf = self._config + for name in splits[:-1]: + config = conf.get(name) + if not isinstance(config, dict): + raise AttributeError( + f"Fail to unregister option: {option}, conflict has encountered" + ) + else: + conf = config + + key = splits[-1] + if key not in conf: + raise AttributeError( + f"Option {option} not configured, thus failed to unregister." 
+ ) + conf.unregister(key) + + def copy(self): + new_options = Config(deepcopy(self._config)) + return new_options + + def update(self, new_config: Union["Config", Dict]): + if not isinstance(new_config, dict): + new_config = new_config._config + for option, value in new_config.items(): + try: + self.register_option(option, value) + except AttributeError: + setattr(self, option, value) + + def get_serializable(self): + d = dict() + for k in self._serialize_options: + parts = k.split(".") + v = self + for p in parts: + v = getattr(v, p) + d[k] = v + return d + + def fill_serialized(self, d): + for k, v in d.items(): + parts = k.split(".") + cf = self + for p in parts[:-1]: + cf = getattr(cf, p) + setattr(cf, parts[-1], v) + + def to_dict(self): + return self._config.to_dict() + + +@contextlib.contextmanager +def option_context(config=None): + global_options = get_global_option() + + try: + config = config or dict() + local_options = Config(deepcopy(global_options._config)) + local_options.update(config) + _options_local.default_options = local_options + yield local_options + finally: + _options_local.default_options = global_options + + +def is_interactive(): + import __main__ as main + + return not hasattr(main, "__file__") + + +# validators +def any_validator(*validators): + def validate(x): + return any(validator(x) for validator in validators) + + return validate + + +def all_validator(*validators): + def validate(x): + return all(validator(x) for validator in validators) + + return validate + + +def _instance_check(typ, v): + return isinstance(v, typ) + + +is_null = functools.partial(operator.is_, None) +is_bool = functools.partial(_instance_check, bool) +is_integer = functools.partial(_instance_check, int) +is_float = functools.partial(_instance_check, float) +is_numeric = functools.partial(_instance_check, (float, int)) +is_string = functools.partial(_instance_check, str) +is_dict = functools.partial(_instance_check, dict) +is_list = functools.partial(_instance_check, list) + + +def is_in(vals): + def validate(x): + return x in vals + + return validate + + +default_options = Config() +default_options.register_option("tcp_timeout", 30, validator=is_integer) +default_options.register_option("verbose", False, validator=is_bool) +default_options.register_option("kv_store", ":inproc:", validator=is_string) +default_options.register_option("check_interval", 20, validator=is_integer) +default_options.register_option( + "show_progress", "auto", validator=any_validator(is_bool, is_string) +) +default_options.register_option("serialize_method", "pickle") + +# dataframe-related options +default_options.register_option( + "dataframe.mode.use_inf_as_na", False, validator=is_bool +) +default_options.register_option( + "dataframe.use_arrow_dtype", None, validator=any_validator(is_null, is_bool) +) +default_options.register_option( + "dataframe.arrow_array.pandas_only", None, validator=any_validator(is_null, is_bool) +) + +# learn options +assume_finite = os.environ.get("SKLEARN_ASSUME_FINITE") +if assume_finite is not None: + assume_finite = bool(assume_finite) +working_memory = os.environ.get("SKLEARN_WORKING_MEMORY") +if working_memory is not None: + working_memory = int(working_memory) +default_options.register_option( + "learn.assume_finite", assume_finite, validator=any_validator(is_null, is_bool) +) +default_options.register_option( + "learn.working_memory", working_memory, validator=any_validator(is_null, is_integer) +) + +# the number of combined chunks in tree reduction or tree add 
+default_options.register_option("combine_size", 4, validator=is_integer, serialize=True) + +# the default chunk store size +default_options.register_option( + "chunk_store_limit", 128 * 1024**2, validator=is_numeric +) +default_options.register_option( + "chunk_size", None, validator=any_validator(is_null, is_integer), serialize=True +) + +# rechunk +default_options.register_option( + "rechunk.threshold", 4, validator=is_integer, serialize=True +) +default_options.register_option( + "rechunk.chunk_size_limit", int(1e8), validator=is_integer, serialize=True +) + +default_options.register_option( + "bincount.chunk_size_limit", int(1e8), validator=is_integer, serialize=True +) + +# deploy +default_options.register_option("deploy.open_browser", True, validator=is_bool) + +# optimization +default_options.register_option("optimize_tileable_graph", True, validator=is_bool) + +# eager mode +default_options.register_option("eager_mode", False, validator=is_bool) + +# optimization +default_options.register_option( + "optimize.head_optimize_threshold", 1000, validator=is_integer +) + +# debug +default_options.register_option("warn_duplicated_execution", False, validator=is_bool) + +# client serialize type +default_options.register_option("client.serial_type", "arrow", validator=is_string) + +# custom log dir +default_options.register_option( + "custom_log_dir", None, validator=any_validator(is_null, is_string) +) + +# vineyard +default_options.register_option( + "vineyard.socket", os.environ.get("VINEYARD_IPC_SOCKET", None) +) +default_options.register_option( + "vineyard.enabled", os.environ.get("WITH_VINEYARD", None) is not None +) + +_options_local = threading.local() +_options_local.default_options = default_options + + +def get_global_option(): + ret = getattr(_options_local, "default_options", None) + if ret is None: + ret = _options_local.default_options = Config(deepcopy(default_options._config)) + + return ret + + +class OptionsProxy: + def __dir__(self): + return dir(get_global_option()) + + def __getattribute__(self, attr): + return getattr(get_global_option(), attr) + + def __setattr__(self, key, value): + setattr(get_global_option(), key, value) + + +options = OptionsProxy() + +options.redirect_option("tensor.chunk_store_limit", "chunk_store_limit") +options.redirect_option("tensor.chunk_size", "chunk_size") +options.redirect_option("tensor.rechunk.threshold", "rechunk.threshold") +options.redirect_option("tensor.rechunk.chunk_size_limit", "rechunk.chunk_size_limit") diff --git a/python/xorbits/_mars/config.yml b/python/xorbits/_mars/config.yml new file mode 100644 index 000000000..9f808f479 --- /dev/null +++ b/python/xorbits/_mars/config.yml @@ -0,0 +1 @@ +"@inherits": "@mars/deploy/oscar/config.yml" diff --git a/python/xorbits/_mars/conftest.py b/python/xorbits/_mars/conftest.py new file mode 100644 index 000000000..767010588 --- /dev/null +++ b/python/xorbits/_mars/conftest.py @@ -0,0 +1,290 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import concurrent.futures +import os +import subprocess +import time + +import psutil +import pytest +from mars.config import option_context +from mars.core.mode import is_build_mode, is_kernel_mode +from mars.lib.aio.lru import clear_all_alru_caches +from mars.oscar.backends.ray.communication import RayServer +from mars.oscar.backends.router import Router +from mars.utils import lazy_import + +ray = lazy_import("ray") +MARS_CI_BACKEND = os.environ.get("MARS_CI_BACKEND", "mars") + + +@pytest.fixture(autouse=True) +def auto_cleanup(request): + request.addfinalizer(clear_all_alru_caches) + + +@pytest.fixture(scope="module", autouse=True) +def check_router_cleaned(request): + def route_checker(): + if Router.get_instance() is not None: + assert len(Router.get_instance()._mapping) == 0 + assert len(Router.get_instance()._local_mapping) == 0 + + request.addfinalizer(route_checker) + + +@pytest.fixture(scope="module") +def ray_start_regular_shared(request): # pragma: no cover + yield from _ray_start_regular(request) + + +@pytest.fixture(scope="module") +def ray_start_regular_shared2(request): # pragma: no cover + os.environ["RAY_kill_idle_workers_interval_ms"] = "0" + param = getattr(request, "param", {}) + num_cpus = param.get("num_cpus", 64) + total_memory_mb = num_cpus * 2 * 1024**2 + try: + try: + job_config = ray.job_config.JobConfig(total_memory_mb=total_memory_mb) + except TypeError: + job_config = None + yield ray.init(num_cpus=num_cpus, job_config=job_config) + finally: + ray.shutdown() + Router.set_instance(None) + os.environ.pop("RAY_kill_idle_workers_interval_ms", None) + + +@pytest.fixture +def ray_start_regular(request): # pragma: no cover + yield from _ray_start_regular(request) + + +def _ray_start_regular(request): # pragma: no cover + param = getattr(request, "param", {}) + if not param.get("enable", True): + yield + elif ray and ray.is_initialized(): + yield + else: + num_cpus = param.get("num_cpus", 64) + total_memory_mb = num_cpus * 2 * 1024**2 + try: + try: + job_config = ray.job_config.JobConfig(total_memory_mb=total_memory_mb) + except TypeError: + job_config = None + yield ray.init(num_cpus=num_cpus, job_config=job_config) + finally: + ray.shutdown() + Router.set_instance(None) + RayServer.clear() + if "COV_CORE_SOURCE" in os.environ: + # Remove this when https://github.com/ray-project/ray/issues/16802 got fixed + subprocess.check_call(["ray", "stop", "--force"]) + + +@pytest.fixture(scope="module") +def ray_large_cluster_shared(request): # pragma: no cover + yield from _ray_large_cluster(request) + + +@pytest.fixture +def ray_large_cluster(request): # pragma: no cover + yield from _ray_large_cluster(request) + + +def _ray_large_cluster(request): # pragma: no cover + param = getattr(request, "param", {}) + num_nodes = param.get("num_nodes", 3) + num_cpus = param.get("num_cpus", 16) + from ray.cluster_utils import Cluster + + cluster = Cluster() + remote_nodes = [] + for i in range(num_nodes): + remote_nodes.append( + cluster.add_node(num_cpus=num_cpus, memory=num_cpus * 2 * 1024**3) + ) + if len(remote_nodes) == 1: + try: + job_config = ray.job_config.JobConfig( + total_memory_mb=num_nodes * 32 * 1024**3 + ) + except TypeError: + job_config = None + ray.init(address=cluster.address, job_config=job_config) + try: + yield cluster + finally: + Router.set_instance(None) + RayServer.clear() + ray.shutdown() + cluster.shutdown() + if "COV_CORE_SOURCE" in os.environ: + # Remove this when https://github.com/ray-project/ray/issues/16802 got fixed + subprocess.check_call(["ray", 
"stop", "--force"]) + + +@pytest.fixture +def stop_ray(request): # pragma: no cover + yield + if ray.is_initialized(): + ray.shutdown() + Router.set_instance(None) + + +@pytest.fixture +async def ray_create_mars_cluster(request, check_router_cleaned): + from mars.deploy.oscar.ray import _load_config, new_cluster + + ray_config = _load_config() + param = getattr(request, "param", {}) + supervisor_mem = param.get("supervisor_mem", 1 * 1024**3) + worker_num = param.get("worker_num", 2) + worker_cpu = param.get("worker_cpu", 2) + worker_mem = param.get("worker_mem", 256 * 1024**2) + ray_config.update(param.get("config", {})) + client = await new_cluster( + supervisor_mem=supervisor_mem, + worker_num=worker_num, + worker_cpu=worker_cpu, + worker_mem=worker_mem, + config=ray_config, + ) + try: + async with client: + yield client + finally: + Router.set_instance(None) + + +@pytest.fixture +def stop_mars(): + try: + yield + finally: + import mars + + mars.stop_server() + + +@pytest.fixture(scope="module") +def _new_test_session(check_router_cleaned): + from .deploy.oscar.tests.session import new_test_session + + sess = new_test_session( + address="test://127.0.0.1", + backend=MARS_CI_BACKEND, + init_local=True, + default=True, + timeout=300, + ) + with option_context({"show_progress": False}): + try: + yield sess + finally: + sess.stop_server(isolation=False) + Router.set_instance(None) + + +@pytest.fixture(scope="module") +def _new_integrated_test_session(check_router_cleaned): + from .deploy.oscar.tests.session import new_test_session + + sess = None + for i in range(3): + try: + sess = new_test_session( + address="127.0.0.1", + backend=MARS_CI_BACKEND, + init_local=True, + n_worker=2, + default=True, + timeout=300, + ) + except ChildProcessError: + time.sleep(1) + if i == 2: + raise + else: + continue + else: + break + with option_context({"show_progress": False}): + try: + yield sess + finally: + try: + sess.stop_server(isolation=False) + except concurrent.futures.TimeoutError: + Router.set_instance(None) + subprocesses = psutil.Process().children(recursive=True) + for proc in subprocesses: + proc.terminate() + for proc in subprocesses: + try: + proc.wait(1) + except (psutil.TimeoutExpired, psutil.NoSuchProcess): + pass + try: + proc.kill() + except psutil.NoSuchProcess: + pass + + +@pytest.fixture(scope="module") +def _new_gpu_test_session(check_router_cleaned): # pragma: no cover + from .deploy.oscar.tests.session import new_test_session + from .resource import cuda_count + + cuda_devices = list(range(min(cuda_count(), 2))) + + sess = new_test_session( + address="127.0.0.1", + backend=MARS_CI_BACKEND, + init_local=True, + n_worker=1, + n_cpu=1, + cuda_devices=cuda_devices, + default=True, + timeout=300, + ) + with option_context({"show_progress": False}): + try: + yield sess + finally: + sess.stop_server(isolation=False) + Router.set_instance(None) + + +@pytest.fixture +def setup(_new_test_session): + _new_test_session.as_default() + yield _new_test_session + assert not (is_build_mode() or is_kernel_mode()) + + +@pytest.fixture +def setup_cluster(_new_integrated_test_session): + _new_integrated_test_session.as_default() + yield _new_integrated_test_session + + +@pytest.fixture +def setup_gpu(_new_gpu_test_session): # pragma: no cover + _new_gpu_test_session.as_default() + yield _new_test_session diff --git a/python/xorbits/_mars/constants.py b/python/xorbits/_mars/constants.py new file mode 100644 index 000000000..b4b56cab4 --- /dev/null +++ b/python/xorbits/_mars/constants.py @@ -0,0 +1,20 
@@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Env key for Mars log absolute path +MARS_LOG_PATH_KEY = "MARS_LOG_PATH" +# Mars log file name prefix +MARS_LOG_PREFIX = "mars_" +# The prefix of the temporary directory where the Mars log is located +MARS_TMP_DIR_PREFIX = "mars_tmp" diff --git a/python/xorbits/_mars/contrib/__init__.py b/python/xorbits/_mars/contrib/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/contrib/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/contrib/dask/__init__.py b/python/xorbits/_mars/contrib/dask/__init__.py new file mode 100644 index 000000000..7a4ea89b8 --- /dev/null +++ b/python/xorbits/_mars/contrib/dask/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# noinspection PyUnresolvedReferences + +from ...utils import ModulePlaceholder + +try: + import dask +except ImportError: + convert_dask_collection = mars_scheduler = ModulePlaceholder("dask") +else: + from .converter import convert_dask_collection + from .scheduler import mars_scheduler diff --git a/python/xorbits/_mars/contrib/dask/converter.py b/python/xorbits/_mars/contrib/dask/converter.py new file mode 100644 index 000000000..e41f5e33f --- /dev/null +++ b/python/xorbits/_mars/contrib/dask/converter.py @@ -0,0 +1,58 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dask import is_dask_collection, optimize
+from dask.bag import Bag
+
+from ...remote import spawn
+from .scheduler import mars_dask_get
+from .utils import reduce
+
+
+def convert_dask_collection(dc):
+    """
+    Convert a dask collection object into mars.core.Object via the remote API
+
+    Parameters
+    ----------
+    dc: dask collection
+        Dask collection object to be converted.
+
+    Returns
+    -------
+    Object
+        Mars Object.
+    """
+    if not is_dask_collection(dc):
+        raise TypeError(f"'{type(dc).__name__}' object is not a valid dask collection")
+
+    dc.__dask_graph__().validate()
+    dsk = optimize(dc)[0].__dask_graph__()
+
+    first_key = next(iter(dsk.keys()))
+    if isinstance(first_key, str):
+        key = [first_key]
+    elif isinstance(first_key, tuple):
+        key = sorted(
+            [i for i in dsk.keys() if i[0] == first_key[0]], key=lambda x: x[1]
+        )
+    else:
+        raise ValueError(
+            f"Dask collection object seems to be broken, with unexpected key type: '{type(first_key).__name__}'"
+        )
+    res = reduce(mars_dask_get(dsk, [key]))
+    if isinstance(dc, Bag):
+        return spawn(lambda x: list(x[0][0]), args=(res,))
+    else:
+        return res
diff --git a/python/xorbits/_mars/contrib/dask/scheduler.py b/python/xorbits/_mars/contrib/dask/scheduler.py
new file mode 100644
index 000000000..bdfe17f47
--- /dev/null
+++ b/python/xorbits/_mars/contrib/dask/scheduler.py
@@ -0,0 +1,102 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Tuple, Union
+
+from dask.core import ishashable, istask
+
+from ...deploy.oscar.session import execute
+from ...remote import spawn
+from .utils import reduce
+
+
+def mars_scheduler(dsk: dict, keys: Union[List[List[str]], List[str]]):
+    """
+    A Dask-Mars scheduler
+
+    This scheduler is intended to be compatible with the existing
+    Dask user interface; no callbacks are implemented.
+
+    Parameters
+    ----------
+    dsk: Dict
+        Dask graph, represented as a task DAG dictionary.
+    keys: Union[List[List[str]], List[str]]
+        1d or 2d list of Dask graph keys whose values we wish to compute and return.
+
+    Returns
+    -------
+    Object
+        Computed values corresponding to the provided keys with the same dimension.
+    """
+
+    if isinstance(keys, List) and not isinstance(keys[0], List):  # 1d keys
+        task = execute(mars_dask_get(dsk, keys))
+        if not isinstance(task, List):
+            task = [task]
+        return map(lambda x: x.fetch(), task)
+    else:  # 2d keys
+        res = execute(reduce(mars_dask_get(dsk, keys))).fetch()
+        if not isinstance(res, List):
+            return [[res]]
+        else:
+            return res
+
+
+def mars_dask_get(dsk: dict, keys: Union[List[List[str]], List[str]]):
+    """
+    A Dask-Mars conversion function. This function sends the dask graph layers
+    to the Mars Remote API, generating Mars objects corresponding to the provided keys.
+
+    Parameters
+    ----------
+    dsk: Dict
+        Dask graph, represented as a task DAG dictionary.
+ keys: Union[List[List[str]], List[str]] + 1d or 2d list of Dask graph keys whose values we wish to compute and return. + + Returns + ------- + Object + Spawned mars objects corresponding to the provided keys with same dimension. + """ + + def _get_arg(a): + # if arg contains layer index or callable objs, handle it + if ishashable(a) and a in dsk.keys(): + while ishashable(a) and a in dsk.keys(): + a = dsk[a] + return _spawn_task(a) + elif not isinstance(a, str) and hasattr(a, "__getitem__"): + if istask( + a + ): # TODO:Handle `SubgraphCallable`, which may contains dsk in it + return spawn(a[0], args=tuple(_get_arg(i) for i in a[1:])) + elif isinstance(a, dict): + return {k: _get_arg(v) for k, v in a.items()} + elif isinstance(a, List) or isinstance(a, Tuple): + return type(a)(_get_arg(i) for i in a) + return a + + def _spawn_task(task: tuple): + if not istask(task): + return _get_arg(task) + return spawn(task[0], args=tuple(_get_arg(a) for a in task[1:])) + + return [ + [_spawn_task(dsk[k]) for k in keys_d] + if isinstance(keys_d, List) + else _spawn_task(dsk[keys_d]) + for keys_d in keys + ] diff --git a/python/xorbits/_mars/contrib/dask/tests/__init__.py b/python/xorbits/_mars/contrib/dask/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/contrib/dask/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/contrib/dask/tests/test_dask.py b/python/xorbits/_mars/contrib/dask/tests/test_dask.py new file mode 100644 index 000000000..967807410 --- /dev/null +++ b/python/xorbits/_mars/contrib/dask/tests/test_dask.py @@ -0,0 +1,183 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ....utils import lazy_import +from .. 
import convert_dask_collection, mars_scheduler + +dask_installed = lazy_import("dask") is not None +mimesis_installed = lazy_import("mimesis") is not None + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_delayed(setup_cluster): + import numpy as np + from dask import delayed + + def calc_chunk(n: int, i: int): + rs = np.random.RandomState(i) + a = rs.uniform(-1, 1, size=(n, 2)) + d = np.linalg.norm(a, axis=1) + return (d < 1).sum() + + def calc_pi(fs, N): + return sum(fs) * 4 / N + + N = 200_000_000 + n = 10_000_000 + + fs = [delayed(calc_chunk)(n, i) for i in range(N // n)] + pi = delayed(calc_pi)(fs, N) + + dask_res = pi.compute() + assert dask_res == pi.compute(scheduler=mars_scheduler) + assert dask_res == convert_dask_collection(pi).execute().fetch() + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_partitioned_dataframe(setup_cluster): + import numpy as np + import pandas as pd + from dask import dataframe as dd + from pandas._testing import assert_frame_equal + + data = np.random.randn(10000, 100) + df = dd.from_pandas( + pd.DataFrame(data, columns=[f"col{i}" for i in range(100)]), npartitions=4 + ) + df["col0"] = df["col0"] + df["col1"] / 2 + col2_mean = df["col2"].mean() + df = df[df["col2"] > col2_mean] + + dask_res = df.compute() + assert_frame_equal( + dask_res, df.compute(scheduler=mars_scheduler), check_index_type=False + ) + assert_frame_equal( + dask_res, convert_dask_collection(df).execute().fetch(), check_index_type=False + ) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_unpartitioned_dataframe(setup_cluster): + import pandas as pd + from dask import dataframe as dd + from pandas._testing import assert_frame_equal + from sklearn.datasets import load_iris + + boston = load_iris() + pd.DataFrame(boston.data, columns=boston["feature_names"]).to_csv( + "./boston_housing_data.csv" + ) + + df = dd.read_csv(r"./boston_housing_data.csv") + df["sepal length (cm)"] = df["sepal length (cm)"] / 2 + + dask_res = df.compute() + assert_frame_equal(dask_res, df.compute(scheduler=mars_scheduler)) + assert_frame_equal(dask_res, convert_dask_collection(df).execute().fetch()) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_array(setup_cluster): + import dask.array as da + from numpy.core.numeric import array_equal + + x = da.random.random((10000, 10000), chunks=(1000, 1000)) + y = x + x.T + z = y[::2, 5000:].mean(axis=1) + + dask_res = z.compute() + assert array_equal(dask_res, z.compute(scheduler=mars_scheduler)) + assert array_equal(dask_res, convert_dask_collection(z).execute().fetch()) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +@pytest.mark.skipif(not mimesis_installed, reason="mimesis not installed") +def test_bag(setup_cluster): + import dask + + b = dask.datasets.make_people() # Make records of people + result = ( + b.filter(lambda record: record["age"] > 30) + .map(lambda record: record["occupation"]) + .frequencies(sort=True) + .topk(10, key=1) + ) + + dask_res = result.compute() + assert dask_res == result.compute(scheduler=mars_scheduler) + assert dask_res == convert_dask_collection(result).execute().fetch() + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_dask_errors(): + with pytest.raises(TypeError): + convert_dask_collection({"foo": 0, "bar": 1}) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_multiple_objects(setup_cluster): + import 
dask + + def inc(x: int): + return x + 1 + + test_list = [dask.delayed(inc)(i) for i in range(10)] + test_tuple = tuple(dask.delayed(inc)(i) for i in range(10)) + test_dict = {str(i): dask.delayed(inc)(i) for i in range(10)} + + for test_obj in (test_list, test_tuple, test_dict): + assert dask.compute(test_obj) == dask.compute( + test_obj, scheduler=mars_scheduler + ) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_persist(setup_cluster): + import dask + + def inc(x): + return x + 1 + + a = dask.delayed(inc)(1) + task_mars_persist = dask.delayed(inc)(a.persist(scheduler=mars_scheduler)) + task_dask_persist = dask.delayed(inc)(a.persist()) + + assert task_dask_persist.compute() == task_mars_persist.compute( + scheduler=mars_scheduler + ) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_partitioned_dataframe_persist(setup_cluster): + import numpy as np + import pandas as pd + from dask import dataframe as dd + from pandas._testing import assert_frame_equal + + data = np.random.randn(10000, 100) + df = dd.from_pandas( + pd.DataFrame(data, columns=[f"col{i}" for i in range(100)]), npartitions=4 + ) + df["col0"] = df["col0"] + df["col1"] / 2 + col2_mean = df["col2"].mean() + + df_mars_persist = df[df["col2"] > col2_mean.persist(scheduler=mars_scheduler)] + df_dask_persist = df[df["col2"] > col2_mean.persist()] + + assert_frame_equal( + df_dask_persist.compute(), df_mars_persist.compute(scheduler=mars_scheduler) + ) diff --git a/python/xorbits/_mars/contrib/dask/utils.py b/python/xorbits/_mars/contrib/dask/utils.py new file mode 100644 index 000000000..53f73cb28 --- /dev/null +++ b/python/xorbits/_mars/contrib/dask/utils.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from dask import is_dask_collection +from dask.array.core import _concatenate2 as array_concat +from dask.dataframe import concat as df_concat +from dask.utils import is_arraylike, is_dataframe_like, is_index_like, is_series_like + +from ...remote import spawn + + +def concat(objs: List): + """ + Concat the results of partitioned dask task executions. This function guess the + types of resulting list, then calls the corresponding native dask concat functions. + + Parameters + ---------- + objs: List + List of the partitioned dask task execution results, which will be concat. + + Returns + ------- + obj: + The concat result + + """ + if is_arraylike(objs[0]): + res = array_concat(objs, axes=[0]) # TODO: Add concat with args support + elif any( + (is_dataframe_like(objs[0]), is_series_like(objs[0]), is_index_like(objs[0])) + ): + res = df_concat(objs) + else: + res = objs + return res.compute() if is_dask_collection(res) else res + + +def reduce(objs: List[List]): + """ + Spawn a concat task for 2d-list objects + + Parameters + ---------- + objs: List + 2d-list of the partitioned dask task execution results, which will be concat. 
+ + Returns + ------- + obj: + The spawning concat task. + """ + return spawn(concat, args=([spawn(concat, args=(objs_d,)) for objs_d in objs],)) diff --git a/python/xorbits/_mars/core/__init__.py b/python/xorbits/_mars/core/__init__.py new file mode 100644 index 000000000..b26fc8950 --- /dev/null +++ b/python/xorbits/_mars/core/__init__.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# noinspection PyUnresolvedReferences +from ..typing import ChunkType, EntityType, OperandType, TileableType +from .base import ExecutionError +from .entity import ( + CHUNK_TYPE, + ENTITY_TYPE, + FUSE_CHUNK_TYPE, + OBJECT_CHUNK_TYPE, + OBJECT_TYPE, + TILEABLE_TYPE, + Chunk, + ChunkData, + Entity, + EntityData, + ExecutableTuple, + FuseChunk, + FuseChunkData, + HasShapeTileable, + HasShapeTileableData, + NotSupportTile, + Object, + ObjectChunk, + ObjectChunkData, + ObjectData, + OutputType, + Tileable, + TileableData, + _ExecuteAndFetchMixin, + get_chunk_types, + get_fetch_class, + get_output_types, + get_tileable_types, + recursive_tile, + register, + register_fetch_class, + register_output_types, + tile, + unregister, +) + +# noinspection PyUnresolvedReferences +from .graph import ( + DAG, + ChunkGraph, + ChunkGraphBuilder, + DirectedGraph, + GraphContainsCycleError, + TileableGraph, + TileableGraphBuilder, + TileContext, + TileStatus, +) +from .mode import enter_mode, is_build_mode, is_eager_mode, is_kernel_mode diff --git a/python/xorbits/_mars/core/base.py b/python/xorbits/_mars/core/base.py new file mode 100644 index 000000000..c3022c83f --- /dev/null +++ b/python/xorbits/_mars/core/base.py @@ -0,0 +1,149 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
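A compact sketch of the two dask-on-Mars entry points defined in contrib/dask above, mirroring the usage exercised in test_dask.py (illustrative; it assumes dask is installed and a Mars session is available):

import dask
from xorbits._mars.contrib.dask import convert_dask_collection, mars_scheduler

def inc(x):
    return x + 1

total = dask.delayed(sum)([dask.delayed(inc)(i) for i in range(5)])

# run the dask graph through Mars remote tasks instead of dask's own scheduler
print(total.compute(scheduler=mars_scheduler))

# or convert the collection into a Mars object and drive it with Mars directly
print(convert_dask_collection(total).execute().fetch())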
+ +from typing import Dict, Tuple, Type + +from ..serialization.serializables import Serializable, StringField +from ..serialization.serializables.core import SerializableSerializer +from ..utils import tokenize + + +class Base(Serializable): + _no_copy_attrs_ = {"_id"} + _init_update_key_ = True + + _key = StringField("key", default=None) + _id = StringField("id") + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self._init_update_key_ and (not hasattr(self, "_key") or not self._key): + self._update_key() + if not hasattr(self, "_id") or not self._id: + self._id = str(id(self)) + + @property + def _keys_(self): + cls = type(self) + member = "__keys_" + cls.__name__ + try: + return getattr(cls, member) + except AttributeError: + slots = sorted(self._FIELDS) + setattr(cls, member, slots) + return slots + + @property + def _copy_tags_(self): + cls = type(self) + member = f"__copy_tags_{cls.__name__}" + try: + return getattr(cls, member) + except AttributeError: + slots = sorted( + f.name for k, f in self._FIELDS.items() if k not in self._no_copy_attrs_ + ) + setattr(cls, member, slots) + return slots + + @property + def _values_(self): + values = [] + fields = self._FIELDS + for k in self._copy_tags_: + try: + values.append(fields[k].get(self)) + except AttributeError: + values.append(None) + return values + + def __mars_tokenize__(self): + try: + return self._key + except AttributeError: # pragma: no cover + self._update_key() + return self._key + + def _obj_set(self, k, v): + object.__setattr__(self, k, v) + + def _update_key(self): + self._obj_set("_key", tokenize(type(self).__name__, *self._values_)) + return self + + def reset_key(self): + self._obj_set("_key", None) + return self + + def __copy__(self): + return self.copy() + + def copy(self): + return self.copy_to(type(self)(_key=self.key)) + + def copy_to(self, target: "Base"): + target_fields = target._FIELDS + no_copy_attrs = self._no_copy_attrs_ + for k, field in self._FIELDS.items(): + if k in no_copy_attrs: + continue + try: + # Slightly faster than getattr. + value = field.__get__(self, k) + target_fields[k].set(target, value) + except AttributeError: + continue + + return target + + def copy_from(self, obj): + obj.copy_to(self) + + @property + def key(self): + return self._key + + @property + def id(self): + return self._id + + def to_kv(self, exclude_fields: Tuple[str], accept_value_types: Tuple[Type]): + fields = self._FIELDS + kv = {} + no_value = object() + for name, field in fields.items(): + if name not in exclude_fields: + value = getattr(self, name, no_value) + if value is not no_value and isinstance(value, accept_value_types): + kv[field.tag] = value + return kv + + +class BaseSerializer(SerializableSerializer): + def serial(self, obj: Base, context: Dict): + return super().serial(obj, context) + + +BaseSerializer.register(Base) + + +class MarsError(Exception): + pass + + +class ExecutionError(MarsError): + def __init__(self, nested_error: BaseException): + super().__init__(nested_error) + self.nested_error = nested_error diff --git a/python/xorbits/_mars/core/context.py b/python/xorbits/_mars/core/context.py new file mode 100644 index 000000000..6379d29ed --- /dev/null +++ b/python/xorbits/_mars/core/context.py @@ -0,0 +1,304 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from abc import ABC, abstractmethod +from typing import Dict, List + +from ..storage.base import StorageLevel +from ..typing import BandType, SessionType +from ..utils import classproperty + + +class Context(ABC): + """ + Context that providing API that can be + used inside `tile` and `execute`. + """ + + all_contexts = [] + + def __init__( + self, + session_id: str = None, + supervisor_address: str = None, + worker_address: str = None, + local_address: str = None, + band: BandType = None, + ): + if session_id is None: + # try to get session id from environment + session_id = os.environ.get("MARS_SESSION_ID") + if session_id is None: + raise ValueError("session_id should be provided to create a context") + if supervisor_address is None: + # try to get supervisor address from environment + supervisor_address = os.environ.get("MARS_SUPERVISOR_ADDRESS") + if supervisor_address is None: + raise ValueError( + "supervisor_address should be provided to create a context" + ) + + self.session_id = session_id + self.supervisor_address = supervisor_address + self.worker_address = worker_address + self.local_address = local_address + self.band = band + + @abstractmethod + def get_current_session(self) -> SessionType: + """ + Get current session + + Returns + ------- + session + """ + + @abstractmethod + def get_local_host_ip(self) -> str: + """ + Get local worker's host ip + + Returns + ------- + host_ip : str + """ + + @abstractmethod + def get_supervisor_addresses(self) -> List[str]: + """ + Get supervisor addresses. + + Returns + ------- + supervisor_addresses : list + """ + + @abstractmethod + def get_worker_addresses(self) -> List[str]: + """ + Get worker addresses. + + Returns + ------- + worker_addresses : list + """ + + @abstractmethod + def get_worker_bands(self) -> List[BandType]: + """ + Get worker bands. + + Returns + ------- + worker_bands : list + """ + + @abstractmethod + def get_total_n_cpu(self) -> int: + """ + Get number of cpus. + + Returns + ------- + number_of_cpu: int + """ + + @abstractmethod + def get_slots(self) -> int: + """ + Get num of slots of current band + + Returns + ------- + number_of_bands: int + """ + + @abstractmethod + def get_chunks_result(self, data_keys: List[str], fetch_only: bool = False) -> List: + """ + Get result of chunks. + + Parameters + ---------- + data_keys : list + Data keys. + fetch_only : bool + If fetch_only, only fetch data but not return. + + Returns + ------- + results : list + Result of chunks if not fetch_only, else return None + """ + + @abstractmethod + def get_chunks_meta( + self, data_keys: List[str], fields: List[str] = None, error="raise" + ) -> List[Dict]: + """ + Get meta of chunks. + + Parameters + ---------- + data_keys : list + Data keys. + fields : list + Fields to filter. + error : str + raise, ignore + + Returns + ------- + meta_list : list + Meta list. + """ + + @abstractmethod + def get_storage_info(self, address: str, level: StorageLevel): + """ + Get the customized storage backend info of requested storage backend level at given worker. + + Parameters + ---------- + address: str + The worker address. 
+ level: StorageLevel + The storage level to fetch the backend info. + + Returns + ------- + info: dict + Customized storage backend info dict of all workers. The key is + worker address, the value is the backend info dict. + """ + + @abstractmethod + def create_remote_object(self, name: str, object_cls, *args, **kwargs): + """ + Create remote object. + + Parameters + ---------- + name : str + Object name. + object_cls + Object class. + args + kwargs + + Returns + ------- + ref + """ + + @abstractmethod + def get_remote_object(self, name: str): + """ + Get remote object + + Parameters + ---------- + name : str + Object name. + + Returns + ------- + ref + """ + + @abstractmethod + def destroy_remote_object(self, name: str): + """ + Destroy remote object. + + Parameters + ---------- + name : str + Object name. + """ + + @abstractmethod + def register_custom_log_path( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key: str, + worker_address: str, + log_path: str, + ): + """ + Register custom log path. + + Parameters + ---------- + session_id : str + Session ID. + tileable_op_key : str + Key of tileable's op. + chunk_op_key : str + Kye of chunk's op. + worker_address : str + Worker address. + log_path : str + Log path. + """ + + def new_custom_log_dir(self) -> str: + """ + New custom log dir. + + Returns + ------- + custom_log_dir : str + Custom log dir. + """ + + def set_running_operand_key(self, session_id: str, op_key: str): + """ + Set key of running operand. + + Parameters + ---------- + session_id : str + op_key : str + """ + + def set_progress(self, progress: float): + """ + Set progress of running operand. + + Parameters + ---------- + progress : float + """ + + def __enter__(self): + Context.all_contexts.append(self) + + def __exit__(self, *_): + Context.all_contexts.pop() + + @classproperty + def current(cls): + return cls.all_contexts[-1] if cls.all_contexts else None + + +def set_context(context: Context): + Context.all_contexts.append(context) + + +def get_context() -> Context: + return Context.current diff --git a/python/xorbits/_mars/core/custom_log.py b/python/xorbits/_mars/core/custom_log.py new file mode 100644 index 000000000..7b918ef21 --- /dev/null +++ b/python/xorbits/_mars/core/custom_log.py @@ -0,0 +1,188 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
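The Context API above is consumed by code that runs on workers during execution. A sketch of the typical access pattern (illustrative; the concrete Context subclass is supplied by the scheduling backend, and execute_on_worker is a hypothetical function):

from xorbits._mars.core.context import get_context

def execute_on_worker(data_keys):
    # inside an operand's execute(), a backend-provided Context is current
    ctx = get_context()
    print("session", ctx.session_id, "band", ctx.band)
    print(ctx.get_total_n_cpu(), "CPUs across",
          len(ctx.get_worker_addresses()), "workers")
    # fetch peer chunk results through the context instead of touching storage
    return ctx.get_chunks_result(data_keys)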
+ +import functools +import io +import os +import sys +import textwrap +import weakref +from typing import Callable, List, Type + +from ..typing import OperandType, SessionType, TileableType +from .context import Context + + +class _LogWrapper: + def __init__(self, ctx: Context, op: OperandType, log_path: str): + self.ctx = ctx + self.op = op + self.log_path = log_path + + self.file = open(log_path, "w") + self.stdout = sys.stdout + + self.raw_stdout = self.stdout + while isinstance(self.raw_stdout, _LogWrapper): + self.raw_stdout = self.raw_stdout.stdout + + # flag about registering log path + self.is_log_path_registered = False + + def __enter__(self): + self.file.__enter__() + # set stdout + sys.stdout = self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.file.__exit__(exc_type, exc_val, exc_tb) + # set back stdout + sys.stdout = self.stdout + + def _register_log_path(self): + if self.is_log_path_registered: + return + + # register log path + session_id = self.ctx.session_id + tileable_op_key = self.op.tileable_op_key + chunk_op_key = self.op.key + worker_addr = self.ctx.local_address + log_path = self.log_path + + self.ctx.register_custom_log_path( + session_id, tileable_op_key, chunk_op_key, worker_addr, log_path + ) + + self.is_log_path_registered = True + + def write(self, data): + self._register_log_path() + + # write into file + self.file.write(data) + # force flush to make sure `fetch_log` can get stdout in time + self.file.flush() + # write into previous stdout + self.raw_stdout.write(data) + + def flush(self): + self.raw_stdout.flush() + + +def redirect_custom_log(func: Callable[[Type, Context, OperandType], None]): + """ + Redirect stdout to a file by wrapping ``Operand.execute(ctx, op)`` + """ + + @functools.wraps(func) + def wrap(cls, ctx: Context, op: OperandType): + custom_log_dir = ctx.new_custom_log_dir() + + if custom_log_dir is None: + return func(cls, ctx, op) + + log_path = os.path.join(custom_log_dir, op.key) + + with _LogWrapper(ctx, op, log_path): + return func(cls, ctx, op) + + return wrap + + +_tileable_to_log_fetcher = weakref.WeakKeyDictionary() + + +class LogFetcher: + def __init__(self, tileable_op_key: str, session: SessionType): + self._tileable_op_key = tileable_op_key + self._session = session + self._chunk_op_key_to_result = dict() + self._chunk_op_key_to_offsets = dict() + + def __len__(self): + return len(self._chunk_op_key_to_result) + + @property + def chunk_op_keys(self) -> List[str]: + return list(self._chunk_op_key_to_result.keys()) + + @property + def results(self) -> list: + return list(self._chunk_op_key_to_result.values()) + + @property + def offsets(self) -> List[List[int]]: + return list(self._chunk_op_key_to_offsets.values()) + + def fetch(self, offsets: List[int] = None, sizes: List[int] = None): + if offsets is None: + offsets = self._chunk_op_key_to_offsets + + if sizes is None: + sizes = 1 * 1024**2 # 1M each time + + result: dict = self._session.fetch_tileable_op_logs( + self._tileable_op_key, offsets=offsets, sizes=sizes + ) + + if result is None: + return + + for chunk_key, chunk_result in result.items(): + self._chunk_op_key_to_result[chunk_key] = chunk_result["log"] + self._chunk_op_key_to_offsets[chunk_key] = chunk_result["offset"] + + def _display(self, representation: bool = True): + if len(self) == 1: + content = next(iter(self._chunk_op_key_to_result.values())) + return repr(content) if representation else str(content) + + sio = io.StringIO() + for chunk_op_key, content in self._chunk_op_key_to_result.items(): + 
sio.write( + textwrap.dedent( + f""" + Chunk op key: {chunk_op_key} + Out: + {content}""" + ) + ) + result = sio.getvalue() + return repr(result) if representation else str(result) + + def __repr__(self): + return self._display(True) + + def __str__(self): + return self._display(False) + + +def fetch( + tileables: List[TileableType], + session: SessionType, + offsets: List[int] = None, + sizes: List[int] = None, +): + log_fetchers = [] + for tileable in tileables: + tileable = tileable.data if hasattr(tileable, "data") else tileable + + if tileable not in _tileable_to_log_fetcher: + _tileable_to_log_fetcher[tileable] = LogFetcher(tileable.op.key, session) + + log_fetcher = _tileable_to_log_fetcher[tileable] + log_fetcher.fetch(offsets=offsets, sizes=sizes) + log_fetchers.append(log_fetcher) + return log_fetchers diff --git a/python/xorbits/_mars/core/entity/__init__.py b/python/xorbits/_mars/core/entity/__init__.py new file mode 100644 index 000000000..e0a4ee754 --- /dev/null +++ b/python/xorbits/_mars/core/entity/__init__.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .chunks import CHUNK_TYPE, Chunk, ChunkData +from .core import ENTITY_TYPE, Entity, EntityData +from .executable import ExecutableTuple, _ExecuteAndFetchMixin +from .fuse import FUSE_CHUNK_TYPE, FuseChunk, FuseChunkData +from .objects import ( + OBJECT_CHUNK_TYPE, + OBJECT_TYPE, + Object, + ObjectChunk, + ObjectChunkData, + ObjectData, +) +from .output_types import ( + OutputType, + get_chunk_types, + get_fetch_class, + get_output_types, + get_tileable_types, + register_fetch_class, + register_output_types, +) +from .tileables import ( + TILEABLE_TYPE, + HasShapeTileable, + HasShapeTileableData, + NotSupportTile, + Tileable, + TileableData, + register, + unregister, +) +from .utils import recursive_tile, tile diff --git a/python/xorbits/_mars/core/entity/chunks.py b/python/xorbits/_mars/core/entity/chunks.py new file mode 100644 index 000000000..e96b87aa3 --- /dev/null +++ b/python/xorbits/_mars/core/entity/chunks.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
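redirect_custom_log and LogFetcher above cooperate: the decorator captures an operand's stdout into a per-chunk file and registers its path through the context, and the fetch helper later pulls the captured text back via the session. A sketch of the decorator side (illustrative; DemoOperand is a hypothetical operand class):

from xorbits._mars.core.custom_log import redirect_custom_log

class DemoOperand:  # hypothetical operand, for illustration only
    @classmethod
    @redirect_custom_log
    def execute(cls, ctx, op):
        # anything printed here is written to a per-chunk log file under the
        # session's custom log dir and can be read back later via fetch_log
        print(f"executing chunk of op {op.key}")
        ...  # a real operand would compute and store its results here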
+
+from ...serialization.serializables import BoolField, FieldTypes, TupleField
+from ...utils import tokenize
+from .core import Entity, EntityData
+
+
+class ChunkData(EntityData):
+    __slots__ = ()
+
+    is_broadcaster = BoolField("is_broadcaster", default=False)
+    # If the operand is a shuffle mapper, this flag indicates whether the current chunk is a mapper chunk when
+    # the operand produces multiple chunks, such as TensorUnique.
+    is_mapper = BoolField("is_mapper", default=None)
+    # optional fields
+    _index = TupleField("index", FieldTypes.uint32)
+
+    def __repr__(self):
+        if self.op.stage is None:
+            return (
+                f"{type(self).__name__} <op={type(self.op).__name__}, key={self.key}>"
+            )
+        else:
+            return (
+                f"{type(self).__name__} <op={type(self.op).__name__}, stage={self.op.stage.name}, key={self.key}>"
+            )
+
+    @property
+    def index(self):
+        return getattr(self, "_index", None)
+
+    @property
+    def device(self):
+        return self.op.device
+
+    def _update_key(self):
+        object.__setattr__(
+            self,
+            "_key",
+            tokenize(
+                type(self).__name__,
+                *(getattr(self, k, None) for k in self._keys_ if k != "_index"),
+            ),
+        )
+
+
+class Chunk(Entity):
+    _allow_data_type_ = (ChunkData,)
+
+    def __repr__(self):
+        return f"{type(self).__name__}({self._data.__repr__()})"
+
+
+CHUNK_TYPE = (Chunk, ChunkData)
diff --git a/python/xorbits/_mars/core/entity/core.py b/python/xorbits/_mars/core/entity/core.py
new file mode 100644
index 000000000..b6bbeff56
--- /dev/null
+++ b/python/xorbits/_mars/core/entity/core.py
@@ -0,0 +1,152 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from ...serialization.serializables import ( + DictField, + FieldTypes, + ReferenceField, + Serializable, +) +from ...utils import AttributeDict +from ..base import Base + + +class EntityData(Base): + __slots__ = ("_siblings",) + type_name = None + + # required fields + _op = ReferenceField("op", "mars.core.operand.base.Operand") + # optional fields + _extra_params = DictField("extra_params", key_type=FieldTypes.string) + + def __init__(self, *args, **kwargs): + extras = AttributeDict( + (k, kwargs.pop(k)) for k in set(kwargs) - set(self._FIELDS) + ) + kwargs["_extra_params"] = kwargs.pop("_extra_params", extras) + super().__init__(*args, **kwargs) + + @property + def op(self): + return self._op + + @property + def inputs(self): + return self.op.inputs + + @inputs.setter + def inputs(self, new_inputs): + self.op.inputs = new_inputs + + def is_sparse(self): + return self.op.is_sparse() + + issparse = is_sparse + + @property + def extra_params(self): + return self._extra_params + + def build_graph(self, **kw): + from ..graph.builder.utils import build_graph + + return build_graph([self], **kw) + + def visualize(self, graph_attrs=None, node_attrs=None, **kw): + from graphviz import Source + + g = self.build_graph(**kw) + dot = g.to_dot( + graph_attrs=graph_attrs, + node_attrs=node_attrs, + result_chunk_keys={c.key for c in self.chunks}, + ) + + return Source(dot) + + def _need_execution(self): # pylint: disable=no-self-use + # some tileable may generate unknown meta, + # they need to be executed first + return False + + +class Entity(Serializable): + _allow_data_type_ = () + type_name = None + + _data = ReferenceField("data", EntityData) + + def __init__(self, data=None, **kw): + super().__init__(_data=data, **kw) + + def __dir__(self): + obj_dir = object.__dir__(self) + if self._data is not None: + obj_dir = sorted(set(dir(self._data) + obj_dir)) + return obj_dir + + def __str__(self): + return self._data.__str__() + + def __repr__(self): + return self._data.__repr__() + + def _check_data(self, data): + if data is not None and not isinstance(data, self._allow_data_type_): + raise TypeError(f"Expect {self._allow_data_type_}, got {type(data)}") + + @property + def data(self): + return self._data + + @data.setter + def data(self, new_data): + self._check_data(new_data) + self._data = new_data + + def __copy__(self): + return self.copy() + + def copy(self): + return self.copy_to(type(self)(None)) + + def copy_to(self, target): + target.data = self._data + return target + + def copy_from(self, obj): + self.data = obj.data + + def tiles(self): + from .tileables import handler + + new_entity = self.copy() + new_entity.data = handler.tiles(self.data) + return new_entity + + def __getattr__(self, attr): + return getattr(self._data, attr) + + def __setattr__(self, key, value): + try: + object.__setattr__(self, key, value) + except AttributeError: + return setattr(self._data, key, value) + + def _need_execution(self): + return self._data._need_execution() + + +ENTITY_TYPE = (Entity, EntityData) diff --git a/python/xorbits/_mars/core/entity/executable.py b/python/xorbits/_mars/core/entity/executable.py new file mode 100644 index 000000000..2b420efa5 --- /dev/null +++ b/python/xorbits/_mars/core/entity/executable.py @@ -0,0 +1,337 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import atexit +import concurrent.futures +import queue +import threading +from typing import List +from weakref import WeakKeyDictionary, ref + +from ...lib.aio import get_isolation +from ...typing import SessionType, TileableType +from ..mode import enter_mode + + +class DecrefRunner: + def __init__(self): + self._decref_thread = None + self._queue = queue.Queue() + + def start(self): + self._decref_thread = threading.Thread( + target=self._thread_body, name="DecrefThread" + ) + self._decref_thread.daemon = True + self._decref_thread.start() + + def _thread_body(self): + from ...deploy.oscar.session import SyncSession + from ...oscar.errors import ActorNotExist + + while True: + key, session_ref, fut = self._queue.get() + if key is None: + break + + session = session_ref() + if session is None: + fut.set_result(None) + continue + try: + s = SyncSession.from_isolated_session(session) + s.decref(key) + fut.set_result(None) + except (RuntimeError, ConnectionError, KeyError, ActorNotExist): + fut.set_result(None) + except ( + Exception + ) as ex: # pragma: no cover # noqa: E722 # nosec # pylint: disable=bare-except + fut.set_exception(ex) + finally: + del session + + def stop(self): + if self._decref_thread: # pragma: no branch + self._queue.put_nowait((None, None, None)) + self._decref_thread.join(1) + + def put(self, key: str, session_ref: ref): + if self._decref_thread is None: + self.start() + + fut = concurrent.futures.Future() + self._queue.put_nowait((key, session_ref, fut)) + return fut + + +_decref_runner = DecrefRunner() +atexit.register(_decref_runner.stop) + + +class _TileableSession: + def __init__(self, tileable: TileableType, session: SessionType): + self._sess_id = id(session) + key = tileable.key + + def cb(_, sess=ref(session)): + try: + cur_thread_ident = threading.current_thread().ident + decref_in_isolation = get_isolation().thread_ident == cur_thread_ident + except KeyError: + # isolation destroyed, no need to decref + return + + fut = _decref_runner.put(key, sess) + if not decref_in_isolation: + # if decref in isolation, means that this tileable + # is not required for main thread, thus we do not need + # to wait for decref, otherwise, wait a bit + try: + fut.result(0.5) + except concurrent.futures.TimeoutError: + # ignore timeout + pass + + self.tileable = ref(tileable, cb) + + def __eq__(self, other: "_TileableSession"): + return self._sess_id == other._sess_id + + +class _TileableDataCleaner: + def __init__(self): + self._tileable_to_sessions = WeakKeyDictionary() + + @enter_mode(build=True) + def register(self, tileable: TileableType, session: SessionType): + if tileable in self._tileable_to_sessions: + self._tileable_to_sessions[tileable].append( + _TileableSession(tileable, session) + ) + else: + self._tileable_to_sessions[tileable] = [_TileableSession(tileable, session)] + + +# we don't use __del__ to avoid potential Circular reference +_cleaner = _TileableDataCleaner() + + +def _get_session(executable: "_ExecutableMixin", session: SessionType = None): + from ...deploy.oscar.session import get_default_session + + # if session is not 
specified, use default session + if session is None: + session = get_default_session() + + return session + + +class _ExecutableMixin: + __slots__ = () + _executed_sessions: List[SessionType] + + def execute(self, session: SessionType = None, **kw): + from ...deploy.oscar.session import execute + + session = _get_session(self, session) + return execute(self, session=session, **kw) + + def _check_session(self, session: SessionType, action: str): + if session is None: + if isinstance(self, tuple): + key = self[0].key + else: + key = self.key + raise ValueError( + f"Tileable object {key} must be executed first before {action}" + ) + + def _fetch(self, session: SessionType = None, **kw): + from ...deploy.oscar.session import fetch + + session = _get_session(self, session) + self._check_session(session, "fetch") + return fetch(self, session=session, **kw) + + def fetch(self, session: SessionType = None, **kw): + return self._fetch(session=session, **kw) + + def fetch_log( + self, + session: SessionType = None, + offsets: List[int] = None, + sizes: List[int] = None, + ): + from ...deploy.oscar.session import fetch_log + + session = _get_session(self, session) + self._check_session(session, "fetch_log") + return fetch_log(self, session=session, offsets=offsets, sizes=sizes)[0] + + def _fetch_infos(self, fields=None, session=None, **kw): + from ...deploy.oscar.session import fetch_infos + + session = _get_session(self, session) + self._check_session(session, "fetch_infos") + return fetch_infos(self, fields=fields, session=session, **kw) + + def _attach_session(self, session: SessionType): + if session not in self._executed_sessions: + _cleaner.register(self, session) + self._executed_sessions.append(session) + + def _detach_session(self, session: SessionType): + if session in self._executed_sessions: + sessions = _cleaner._tileable_to_sessions.get(self, []) + if sessions: + sessions.remove(_TileableSession(self, session)) + if len(sessions) == 0: + del _cleaner._tileable_to_sessions[self] + self._executed_sessions.remove(session) + + +class _ExecuteAndFetchMixin: + __slots__ = () + + def _execute_and_fetch(self, session: SessionType = None, **kw): + from ...deploy.oscar.session import ExecutionInfo, SyncSession, fetch + + session = _get_session(self, session) + fetch_kwargs = kw.pop("fetch_kwargs", dict()) + if session in self._executed_sessions: + # if has been executed, fetch directly. 
+ return self.fetch(session=session, **fetch_kwargs) + ret = self.execute(session=session, **kw) + if isinstance(ret, ExecutionInfo): + # wait=False + aio_task = ret.aio_task + + async def _wait(): + await aio_task + + def run(): + asyncio.run_coroutine_threadsafe(_wait(), loop=ret.loop).result() + return fetch(self, session=session, **fetch_kwargs) + + return SyncSession._execution_pool.submit(run) + else: + # wait=True + return self.fetch(session=session, **fetch_kwargs) + + +class _ToObjectMixin(_ExecuteAndFetchMixin): + __slots__ = () + + def to_object(self, session: SessionType = None, **kw): + return self._execute_and_fetch(session=session, **kw) + + +class ExecutableTuple(tuple, _ExecutableMixin, _ToObjectMixin): + def __init__(self, *args): + tuple.__init__(*args) + + self._fields_to_idx = None + self._fields = None + self._raw_type = None + + if len(args) == 1 and isinstance(args[0], tuple): + self._fields = getattr(args[0], "_fields", None) + if self._fields is not None: + self._raw_type = type(args[0]) + self._fields_to_idx = {f: idx for idx, f in enumerate(self._fields)} + + self._executed_sessions = [] + + def __getattr__(self, item): + if self._fields_to_idx is None or item not in self._fields_to_idx: + raise AttributeError(item) + return self[self._fields_to_idx[item]] + + def __dir__(self): + result = list(super().__dir__()) + list(self._fields or []) + return sorted(result) + + def __repr__(self): + if not self._fields: + return super().__repr__() + items = [] + for k, v in zip(self._fields, self): + items.append(f"{k}={v!r}") + return "%s(%s)" % (self._raw_type.__name__, ", ".join(items)) + + def execute(self, session: SessionType = None, **kw): + from ...deploy.oscar.session import execute + + if len(self) == 0: + return self + + session = _get_session(self, session) + ret = execute(*self, session=session, **kw) + + if session not in self._executed_sessions: + self._executed_sessions.append(session) + + if kw.get("wait", True): + return self + else: + return ret + + def _fetch(self, session: SessionType = None, **kw): + from ...deploy.oscar.session import fetch + + session = _get_session(self, session) + self._check_session(session, "fetch") + return fetch(*self, session=session, **kw) + + def _fetch_infos(self, fields=None, session=None, **kw): + from ...deploy.oscar.session import fetch_infos + + session = _get_session(self, session) + self._check_session(session, "fetch_infos") + return fetch_infos(*self, fields=fields, session=session, **kw) + + def fetch(self, session: SessionType = None, **kw): + if len(self) == 0: + return tuple() + + session = _get_session(self, session) + ret = super().fetch(session=session, **kw) + if self._raw_type is not None: + ret = self._raw_type(*ret) + if len(self) == 1: + return (ret,) + return ret + + def fetch_log( + self, + session: SessionType = None, + offsets: List[int] = None, + sizes: List[int] = None, + ): + from ...deploy.oscar.session import fetch_log + + if len(self) == 0: + return [] + session = self._get_session(session=session) + return fetch_log(*self, session=session, offsets=offsets, sizes=sizes) + + def _get_session(self, session: SessionType = None): + if session is None: + for item in self: + session = _get_session(item, session) + if session is not None: + return session + return session diff --git a/python/xorbits/_mars/core/entity/fuse.py b/python/xorbits/_mars/core/entity/fuse.py new file mode 100644 index 000000000..0fafe2d9c --- /dev/null +++ b/python/xorbits/_mars/core/entity/fuse.py @@ -0,0 +1,73 @@ +# 
Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...serialization.serializables import ReferenceField +from .chunks import CHUNK_TYPE, Chunk, ChunkData + + +class FuseChunkData(ChunkData): + __slots__ = ("_inited",) + + _chunk = ReferenceField( + "chunk", CHUNK_TYPE, on_serialize=lambda x: x.data if hasattr(x, "data") else x + ) + + def __init__(self, *args, **kwargs): + self._inited = False + super().__init__(*args, **kwargs) + self._extra_params = {} + self._inited = True + + @property + def chunk(self): + return self._chunk + + @property + def composed(self): + # for compatibility, just return the topological ordering, + # once we apply optimization on the subgraph, + # `composed` is not needed any more and should be removed then. + assert getattr(self._op, "fuse_graph", None) is not None + fuse_graph = self._op.fuse_graph + return list(fuse_graph.topological_iter()) + + def __getattr__(self, attr): + if not self._inited: + return object.__getattribute__(self, attr) + if attr in self._extra_params: + return self._extra_params[attr] + try: + return getattr(self._chunk, attr) + except AttributeError: + return object.__getattribute__(self, attr) + + def __setattr__(self, attr, value): + if attr == "params": + self._chunk.params = value + else: + super().__setattr__(attr, value) + + @property + def nbytes(self): + return np.prod(self.shape) * self.dtype.itemsize + + +class FuseChunk(Chunk): + __slots__ = () + _allow_data_type_ = (FuseChunkData,) + + +FUSE_CHUNK_TYPE = (FuseChunkData, FuseChunk) diff --git a/python/xorbits/_mars/core/entity/objects.py b/python/xorbits/_mars/core/entity/objects.py new file mode 100644 index 000000000..cefe7d629 --- /dev/null +++ b/python/xorbits/_mars/core/entity/objects.py @@ -0,0 +1,98 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
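The `ExecutableTuple` defined above remembers the field names of a namedtuple it wraps and exposes them as attributes through `_fields_to_idx`. Below is a stripped-down, self-contained sketch of just that lookup; the class name `FieldTuple` and the sample namedtuple are illustrative, not part of the codebase:

from collections import namedtuple

class FieldTuple(tuple):
    """Stripped-down illustration of the named-field lookup in ExecutableTuple."""

    def __init__(self, *args):
        self._fields = None
        self._fields_to_idx = None
        if len(args) == 1 and isinstance(args[0], tuple):
            self._fields = getattr(args[0], "_fields", None)
            if self._fields is not None:
                # map each field name to its position in the tuple
                self._fields_to_idx = {f: i for i, f in enumerate(self._fields)}

    def __getattr__(self, item):
        if self._fields_to_idx is None or item not in self._fields_to_idx:
            raise AttributeError(item)
        return self[self._fields_to_idx[item]]

Pair = namedtuple("Pair", ["left", "right"])
ft = FieldTuple(Pair(1, 2))
assert tuple(ft) == (1, 2)
assert ft.left == 1 and ft.right == 2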
+ +from typing import Any, Dict + +from ...serialization.serializables import FieldTypes, ListField +from .chunks import Chunk, ChunkData +from .executable import _ToObjectMixin +from .tileables import Tileable, TileableData + + +class ObjectChunkData(ChunkData): + # chunk whose data could be any serializable + __slots__ = () + type_name = "Object" + + def __init__(self, op=None, index=None, **kw): + super().__init__(_op=op, _index=index, **kw) + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "index": self.index, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + @classmethod + def get_params_from_data(cls, data: Any) -> Dict[str, Any]: + return dict() + + +class ObjectChunk(Chunk): + __slots__ = () + _allow_data_type_ = (ObjectChunkData,) + type_name = "Object" + + +class ObjectData(TileableData, _ToObjectMixin): + __slots__ = () + type_name = "Object" + + # optional fields + _chunks = ListField( + "chunks", + FieldTypes.reference(ObjectChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [ObjectChunk(it) for it in x] if x is not None else x, + ) + + def __init__(self, op=None, nsplits=None, chunks=None, **kw): + super().__init__(_op=op, _nsplits=nsplits, _chunks=chunks, **kw) + + def __repr__(self): + return f"Object " + + @property + def params(self): + # params return the properties which useful to rebuild a new tileable object + return dict() + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + def refresh_params(self): + # refresh params when chunks updated + # nothing needs to do for Object + pass + + +class Object(Tileable, _ToObjectMixin): + __slots__ = () + _allow_data_type_ = (ObjectData,) + type_name = "Object" + + +OBJECT_TYPE = (Object, ObjectData) +OBJECT_CHUNK_TYPE = (ObjectChunk, ObjectChunkData) diff --git a/python/xorbits/_mars/core/entity/output_types.py b/python/xorbits/_mars/core/entity/output_types.py new file mode 100644 index 000000000..b63f59508 --- /dev/null +++ b/python/xorbits/_mars/core/entity/output_types.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
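`ObjectChunkData.params` above validates updates by copying the incoming dict, popping every recognized key, and raising on whatever is left over. A small standalone sketch of that pop-then-raise pattern; the names here are invented for the example, and unlike this demo the Object chunk above simply discards `index` on update:

from typing import Any, Dict

class ParamsDemo:
    """Illustrates the pop-then-raise validation used by the params setters above."""

    def __init__(self, index=None):
        self.index = index

    @property
    def params(self) -> Dict[str, Any]:
        # only the properties needed to rebuild an equivalent chunk
        return {"index": self.index}

    @params.setter
    def params(self, new_params: Dict[str, Any]):
        params = new_params.copy()
        # consume the keys we understand; anything left is an error
        self.index = params.pop("index", self.index)
        if params:
            raise TypeError(f"Unknown params: {list(params)}")

c = ParamsDemo(index=(0, 1))
c.params = {"index": (1, 0)}
assert c.params == {"index": (1, 0)}
try:
    c.params = {"bogus": 42}
except TypeError:
    pass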
+ +import functools +from enum import Enum + +from .fuse import FUSE_CHUNK_TYPE +from .objects import OBJECT_CHUNK_TYPE, OBJECT_TYPE + + +class OutputType(Enum): + object = 1 + tensor = 2 + dataframe = 3 + series = 4 + index = 5 + scalar = 6 + categorical = 7 + dataframe_groupby = 8 + series_groupby = 9 + df_or_series = 10 + + @classmethod + def serialize_list(cls, output_types): + return [ot.value for ot in output_types] if output_types is not None else None + + @classmethod + def deserialize_list(cls, output_types): + return [cls(ot) for ot in output_types] if output_types is not None else None + + +_OUTPUT_TYPE_TO_CHUNK_TYPES = {OutputType.object: OBJECT_CHUNK_TYPE} +_OUTPUT_TYPE_TO_TILEABLE_TYPES = {OutputType.object: OBJECT_TYPE} +_OUTPUT_TYPE_TO_FETCH_CLS = {} + + +def register_output_types(output_type, tileable_types, chunk_types): + _OUTPUT_TYPE_TO_TILEABLE_TYPES[output_type] = tileable_types + _OUTPUT_TYPE_TO_CHUNK_TYPES[output_type] = chunk_types + + +def register_fetch_class(output_type, fetch_cls, fetch_shuffle_cls): + _OUTPUT_TYPE_TO_FETCH_CLS[output_type] = (fetch_cls, fetch_shuffle_cls) + + +def get_tileable_types(output_type): + return _OUTPUT_TYPE_TO_TILEABLE_TYPES[output_type] + + +def get_chunk_types(output_type): + return _OUTPUT_TYPE_TO_CHUNK_TYPES[output_type] + + +def get_fetch_class(output_type): + return _OUTPUT_TYPE_TO_FETCH_CLS[output_type] + + +@functools.lru_cache(100) +def _get_output_type_by_cls(cls): + for tp in OutputType.__members__.values(): + try: + tileable_types = _OUTPUT_TYPE_TO_TILEABLE_TYPES[tp] + chunk_types = _OUTPUT_TYPE_TO_CHUNK_TYPES[tp] + if issubclass(cls, (tileable_types, chunk_types)): + return tp + except KeyError: # pragma: no cover + continue + raise TypeError("Output can only be tensor, dataframe or series") + + +def get_output_types(*objs, unknown_as=None): + output_types = [] + for obj in objs: + if obj is None: + continue + elif isinstance(obj, FUSE_CHUNK_TYPE): + obj = obj.chunk + + try: + output_types.append(_get_output_type_by_cls(type(obj))) + except TypeError: + if unknown_as is not None: + output_types.append(unknown_as) + else: # pragma: no cover + raise + return output_types diff --git a/python/xorbits/_mars/core/entity/tests/__init__.py b/python/xorbits/_mars/core/entity/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/core/entity/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/core/entity/tests/test_utils.py b/python/xorbits/_mars/core/entity/tests/test_utils.py new file mode 100644 index 000000000..ac54b3cd6 --- /dev/null +++ b/python/xorbits/_mars/core/entity/tests/test_utils.py @@ -0,0 +1,81 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from .... import tensor as mt +from ....tensor.operands import TensorOperand, TensorOperandMixin +from ....utils import has_unknown_shape +from ... import recursive_tile + + +class _TestOperand(TensorOperand, TensorOperandMixin): + @classmethod + def tile(cls, op: "_TestOperand"): + data1, data2 = op.inputs + + data1 = mt.sort(data1) + data2 = mt.sort(data2) + data_all = mt.concatenate([data1, data2]) + s1 = mt.searchsorted(data1, data_all) + s2 = mt.searchsorted(data2, data_all) + result = yield from recursive_tile(mt.concatenate([s1, s2])) + # data1 will be yield by s1 + assert not has_unknown_shape(data1) + assert not has_unknown_shape(data2) + assert not has_unknown_shape(data_all) + return result + + +def test_recursive_tile(setup): + d1 = mt.random.rand(10, chunk_size=5) + d2 = mt.random.rand(10, chunk_size=5) + op = _TestOperand() + t = op.new_tensor([d1, d2], dtype=d1.dtype, shape=(20,), order=d1.order) + t.execute(extra_config={"check_duplicated_operand_keys": True}) + + +class _TestOperandWithDuplicatedSubmission(TensorOperand, TensorOperandMixin): + @classmethod + def tile(cls, op: "_TestOperand"): + data1 = op.inputs[0] + + data2 = yield from recursive_tile(data1 + 1) + yield data2.chunks + data3 = yield from recursive_tile(data1 + 2) + yield data3.chunks + + return (yield from recursive_tile(data2 + data3)) + + +def test_recursive_tile_with_duplicated_submission(setup): + raw = np.random.RandomState(0).rand(10) + d1 = mt.tensor(raw, chunk_size=5) + op = _TestOperandWithDuplicatedSubmission() + t = op.new_tensor( + [ + d1, + ], + dtype=d1.dtype, + shape=(10,), + order=d1.order, + ) + + with pytest.raises(RuntimeError, match="submitted repeatedly"): + t.execute(extra_config={"check_duplicated_submission": True}) + + result = t.execute(extra_config={"check_duplicated_submission": False}) + expected = 2 * raw + 3 + np.testing.assert_array_equal(result, expected) diff --git a/python/xorbits/_mars/core/entity/tileables.py b/python/xorbits/_mars/core/entity/tileables.py new file mode 100644 index 000000000..7f7718a2a --- /dev/null +++ b/python/xorbits/_mars/core/entity/tileables.py @@ -0,0 +1,470 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
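`output_types.py` above keeps module-level registries from each `OutputType` member to its tileable and chunk classes, and resolves an object's output type with an `lru_cache`d walk over those registries. A cut-down, self-contained sketch of the same lookup using toy classes in place of real tileables:

import functools
from enum import Enum

class ToyOutputType(Enum):
    object = 1
    tensor = 2

class ToyObject:
    pass

class ToyTensor:
    pass

_TYPE_REGISTRY = {}

def register_output_types(output_type, classes):
    _TYPE_REGISTRY[output_type] = classes

@functools.lru_cache(100)
def get_output_type_by_cls(cls):
    # walk the registry and return the first member whose classes match
    for tp, classes in _TYPE_REGISTRY.items():
        if issubclass(cls, classes):
            return tp
    raise TypeError(f"no output type registered for {cls}")

register_output_types(ToyOutputType.object, (ToyObject,))
register_output_types(ToyOutputType.tensor, (ToyTensor,))
assert get_output_type_by_cls(ToyTensor) is ToyOutputType.tensor
assert get_output_type_by_cls(ToyObject) is ToyOutputType.object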
+ +import builtins +import inspect +import itertools +from operator import attrgetter +from typing import Callable, Generator, List +from weakref import WeakKeyDictionary, WeakSet + +import numpy as np + +from ...serialization.serializables import BoolField, FieldTypes, TupleField +from ...typing import ChunkType, OperandType, TileableType +from ...utils import on_deserialize_shape, on_serialize_nsplits, on_serialize_shape +from ..base import Base +from ..mode import enter_mode +from .chunks import Chunk +from .core import Entity, EntityData +from .executable import _ExecutableMixin + + +class NotSupportTile(Exception): + pass + + +class OperandTilesHandler: + _handlers = dict() + + @classmethod + def _get_op_cls(cls, op: OperandType): + if isinstance(op, type): + return op + return type(op) + + @classmethod + def register( + cls, op: OperandType, tile_handler: Callable[[OperandType], TileableType] + ): + cls._handlers[cls._get_op_cls(op)] = tile_handler + + @classmethod + def unregister(cls, op: OperandType): + del cls._handlers[cls._get_op_cls(op)] + + @classmethod + def get_handler( + cls, op: OperandType + ) -> Callable[[OperandType], List[TileableType]]: + op_cls = cls._get_op_cls(op) + return cls._handlers.get(op_cls, op_cls.tile) + + @classmethod + def tile( + cls, tileables: List[TileableType] + ) -> Generator[List[ChunkType], List[ChunkType], List[TileableType]]: + op = tileables[0].op + # pre tile + op.pre_tile(op) + tiled_result = None + try: + tile_handler = cls.get_handler(op) + if inspect.isgeneratorfunction(tile_handler): + # op.tile can be a generator function, + # each time an operand yield some chunks, + # they will be put into ChunkGraph and executed first. + # After execution, resume from the yield place. + tiled_result = yield from tile_handler(op) + else: + # without iterative tiling + tiled_result = tile_handler(op) + finally: + op.post_tile(op, tiled_result) + + if not isinstance(tiled_result, list): + tiled_result = [tiled_result] + tiled_results = [t.data if hasattr(t, "data") else t for t in tiled_result] + assert len(tileables) == len(tiled_results) + if any(inspect.isgenerator(r) for r in tiled_results): # pragma: no cover + raise TypeError(f"tiled result cannot be generator when tiling {op}") + cls._assign_to(tiled_results, tileables) + return tileables + + @classmethod + def _assign_to( + cls, + tile_after_tensor_datas: List["TileableData"], + tile_before_tensor_datas: List["TileableData"], + ): + assert len(tile_after_tensor_datas) == len(tile_before_tensor_datas) + + for tile_after_tensor_data, tile_before_tensor_data in zip( + tile_after_tensor_datas, tile_before_tensor_datas + ): + if tile_before_tensor_data is None: + # garbage collected + continue + tile_after_tensor_data.copy_to(tile_before_tensor_data) + tile_before_tensor_data.op.outputs = tile_before_tensor_datas + + @enter_mode(kernel=True) + def dispatch(self, op: OperandType): + op_cls = self._get_op_cls(op) + tiled = None + cause = None + + if op_cls in self._handlers: + tiled = self._handlers[op_cls](op) + else: + try: + tiled = op_cls.tile(op) + except NotImplementedError as ex: + cause = ex + for super_cls in op_cls.__mro__: + if super_cls in self._handlers: + h = self._handlers[op_cls] = self._handlers[super_cls] + tiled = h(op) + break + + if tiled is not None: + return tiled if isinstance(tiled, list) else [tiled] + else: + raise NotImplementedError(f"{type(op)} does not support tile") from cause + + +handler = OperandTilesHandler() +register = OperandTilesHandler.register +unregister = 
OperandTilesHandler.unregister + + +class _ChunksIndexer: + __slots__ = ("_tileable",) + + def __init__(self, tileable): + self._tileable = tileable + + def __getitem__(self, item): + """ + The indices for `cix` can be [x, y] or [x, :]. + For the former the result will be a single chunk, + and for the later the result will be a list of chunks (flattened). + + The length of indices must be the same with `chunk_shape` of tileable. + """ + if isinstance(item, int): + item = (item,) + if isinstance(item, tuple): + if len(item) == 0 and self._tileable.is_scalar(): + return self._tileable.chunks[0] + if len(item) != self._tileable.ndim: + raise ValueError( + f"Cannot get chunk by {item}, " + f"expect length {self._tileable.ndim}" + ) + slices, singleton = [], True + for it, dim in zip(item, self._tileable.chunk_shape): + if isinstance(it, slice): + slices.append(range(dim)[it]) + singleton = False + elif np.issubdtype(type(it), np.integer): + slices.append([it if it >= 0 else dim + it]) + else: + raise TypeError( + f"Cannot get chunk by {it}, " + f"invalid value has type {type(it)}" + ) + + indexes = tuple(zip(*itertools.product(*slices))) + + flat_index = np.ravel_multi_index(indexes, self._tileable.chunk_shape) + if singleton: + return self._tileable._chunks[flat_index[0]] + else: + return [self._tileable._chunks[idx] for idx in flat_index] + + raise ValueError(f"Cannot get {type(self._tileable).__name__} chunk by {item}") + + +class EntityDataModificationHandler: + def __init__(self): + self._data_to_entities = WeakKeyDictionary() + + def _add_observer(self, data, entity): + # only tileable data should be considered + assert isinstance(data, TileableData) + assert isinstance(entity, Tileable) + + if data not in self._data_to_entities: + self._data_to_entities[data] = WeakSet() + + self._data_to_entities[data].add(entity) + + @enter_mode(build=True) + def add_observer(self, data, entity): + self._add_observer(data, entity) + + def _update_observe_data(self, observer, data, new_data): + self._data_to_entities.get(data, set()).discard(observer) + self._add_observer(new_data, observer) + + @staticmethod + def _set_data(entity, data): + entity._data.detach(entity) + entity._data = data + data.attach(entity) + + @staticmethod + def _get_data(obj): + return obj.data if isinstance(obj, Entity) else obj + + @enter_mode(build=True) + def data_changed(self, old_data, new_data): + notified = set() + processed_data = set() + old_to_new = {old_data: new_data} + q = [old_data] + while len(q) > 0: + data = q.pop() + + # handle entities + for entity in data.entities: + self._set_data(entity, old_to_new[data]) + notified.add(entity) + + observers = { + ob + for ob in self._data_to_entities.pop(data, set()) + if ob not in notified + } + for ob in observers: + new_data = self._get_data(ob.op.on_input_modify(old_to_new[data])) + old_data = ob.data + self._update_observe_data(ob, ob.data, new_data) + old_to_new[old_data] = new_data + if old_data not in processed_data: + q.append(old_data) + processed_data.add(old_data) + notified.add(ob) + + if data.op.create_view: + old_input_data = data.inputs[0] + new_input_data = self._get_data( + data.op.on_output_modify(old_to_new[data]) + ) + old_to_new[old_input_data] = new_input_data + if old_input_data not in processed_data: + q.append(old_input_data) + processed_data.add(old_input_data) + + +entity_view_handler = EntityDataModificationHandler() + + +class TileableData(EntityData, _ExecutableMixin): + __slots__ = "_cix", "_entities", "_executed_sessions" + 
_no_copy_attrs_ = Base._no_copy_attrs_ | {"_cix"} + + # optional fields + # `nsplits` means the sizes of chunks for each dimension + _nsplits = TupleField( + "nsplits", + FieldTypes.tuple(FieldTypes.tuple(FieldTypes.uint64)), + on_serialize=on_serialize_nsplits, + ) + # cache tileable data, if true, this data will be materialized + cache = BoolField("cache", default=False) + + def __init__(self: TileableType, *args, **kwargs): + if kwargs.get("_nsplits", None) is not None: + kwargs["_nsplits"] = tuple(tuple(s) for s in kwargs["_nsplits"]) + + super().__init__(*args, **kwargs) + + try: + chunks = self._chunks + if chunks: + self._chunks = sorted(chunks, key=attrgetter("index")) + except AttributeError: # pragma: no cover + pass + self._entities = WeakSet() + self._executed_sessions = [] + + def __on_deserialize__(self): + super(TileableData, self).__on_deserialize__() + self._entities = WeakSet() + self._executed_sessions = [] + + @property + def chunk_shape(self): + if hasattr(self, "_nsplits") and self._nsplits is not None: + return tuple(map(len, self._nsplits)) + + @property + def chunks(self) -> List[Chunk]: + return getattr(self, "_chunks", None) + + @property + def nsplits(self): + return getattr(self, "_nsplits", None) + + @nsplits.setter + def nsplits(self, new_nsplits): + self._nsplits = new_nsplits + + @property + def params(self) -> dict: + # params return the properties which useful to rebuild a new tileable object + return dict() + + @property + def cix(self): + if self.ndim == 0: + return _ChunksIndexer(self) + + try: + if getattr(self, "_cix", None) is None: + self._cix = _ChunksIndexer(self) + return self._cix + except (TypeError, ValueError): + return _ChunksIndexer(self) + + @property + def entities(self): + return self._entities + + def is_coarse(self): + if not hasattr(self, "_chunks"): + return True + if not self._chunks: + return True + return False + + def attach(self, entity): + self._entities.add(entity) + + def detach(self, entity): + self._entities.discard(entity) + + +class Tileable(Entity): + def __init__(self, data: TileableType = None, **kw): + super().__init__(data=data, **kw) + data = self._data + if data is not None: + data.attach(self) + if data.op.create_view: + entity_view_handler.add_observer(data.inputs[0], self) + + def __copy__(self): + return self._view() + + def _view(self): + return super().copy() + + def copy(self: TileableType) -> TileableType: + new_op = self.op.copy() + if new_op.create_view: + # if the operand is a view, make it a copy + new_op.create_view = False + params = [] + for o in self.op.outputs: + param = o.params + param["_key"] = o.key + param.update(o.extra_params) + params.append(param) + new_outs = new_op.new_tileables( + self.op.inputs, kws=params, output_limit=len(params) + ) + pos = -1 + for i, out in enumerate(self.op.outputs): + # create a ref to copied one + new_out = new_outs[i] + if not hasattr(new_out.data, "_siblings"): + new_out.data._siblings = [] + new_out.data._siblings.append(out) + + if self._data is out: + pos = i + break + assert pos >= 0 + return new_outs[pos] + + @Entity.data.setter + def data(self, new_data): + self._check_data(new_data) + if self._data is None: + self._data = new_data + self._data.attach(self) + else: + entity_view_handler.data_changed(self._data, new_data) + + def execute(self, session=None, **kw): + result = self.data.execute(session=session, **kw) + if isinstance(result, TILEABLE_TYPE): + return self + else: + return result + + +TILEABLE_TYPE = (Tileable, TileableData) + + +class 
HasShapeTileableData(TileableData): + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + + @property + def ndim(self): + return len(self.shape) + + def __len__(self): + try: + return int(self.shape[0]) + except (IndexError, ValueError): # pragma: no cover + return 0 + + @property + def shape(self): + if hasattr(self, "_shape") and self._shape is not None: + return self._shape + if hasattr(self, "_nsplits") and self._nsplits is not None: + self._shape = tuple(builtins.sum(nsplit) for nsplit in self._nsplits) + return self._shape + + def _update_shape(self, new_shape): + self._shape = new_shape + + @property + def size(self): + return np.prod(self.shape).item() + + @property + def params(self): + # params return the properties which useful to rebuild a new tileable object + return {"shape": self.shape} + + def _equals(self, o): + return self is o + + +class HasShapeTileable(Tileable): + __slots__ = () + + @property + def shape(self): + return self._data.shape + + @property + def ndim(self): + return self._data.ndim + + @property + def size(self): + return self._data.size diff --git a/python/xorbits/_mars/core/entity/utils.py b/python/xorbits/_mars/core/entity/utils.py new file mode 100644 index 000000000..02b603f90 --- /dev/null +++ b/python/xorbits/_mars/core/entity/utils.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
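`HasShapeTileableData` recovers its shape by summing `nsplits` along each dimension, while `TileableData.chunk_shape` is just the number of splits per dimension. A quick worked example of those two relationships; the sample `nsplits` is arbitrary:

import numpy as np

# nsplits: for each dimension, the sizes of the chunks along that dimension
nsplits = ((5, 5), (4, 4, 2))

shape = tuple(sum(ns) for ns in nsplits)   # (10, 10)
chunk_shape = tuple(map(len, nsplits))     # (2, 3) -> 6 chunks in total
size = np.prod(shape).item()               # 100 elements

assert shape == (10, 10)
assert chunk_shape == (2, 3)
assert size == 100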
+ +from typing import Generator, List, Union + +from ...typing import ChunkType, TileableType +from ...utils import calc_nsplits, has_unknown_shape + + +def refresh_tileable_shape(tileable): + if tileable.shape is None or has_unknown_shape(tileable): + # update shape + nsplits = calc_nsplits({c.index: c.shape for c in tileable.chunks}) + shape = tuple(sum(ns) for ns in nsplits) + tileable._nsplits = nsplits + tileable._shape = shape + + +def tile(tileable, *tileables: TileableType): + from ..graph import ( + ChunkGraphBuilder, + TileableGraph, + TileableGraphBuilder, + TileContext, + ) + + raw_tileables = target_tileables = [tileable] + list(tileables) + target_tileables = [t.data if hasattr(t, "data") else t for t in target_tileables] + + tileable_graph = TileableGraph(target_tileables) + tileable_graph_builder = TileableGraphBuilder(tileable_graph) + next(tileable_graph_builder.build()) + + # tile + tile_context = TileContext() + chunk_graph_builder = ChunkGraphBuilder( + tileable_graph, fuse_enabled=False, tile_context=tile_context + ) + next(chunk_graph_builder.build()) + + if len(tileables) == 0: + return type(tileable)(tile_context[target_tileables[0]]) + else: + return [ + type(raw_t)(tile_context[t]) + for raw_t, t in zip(raw_tileables, target_tileables) + ] + + +def recursive_tile( + tileable: TileableType, *tileables: TileableType +) -> Generator[ + List[ChunkType], List[ChunkType], Union[TileableType, List[TileableType]] +]: + from .tileables import handler + + return_list = len(tileables) > 0 + if not return_list and isinstance(tileable, (list, tuple)): + return_list = True + raw = tileable + tileable = raw[0] + tileables = raw[1:] + + to_tile = [tileable] + list(tileables) + q = [t for t in to_tile if t.is_coarse()] + while q: + t = q[-1] + if t.is_coarse(): + # t may be put into q repeatedly, + # so we check if it's tiled or not + cs = [c for c in t.inputs if c.is_coarse()] + if cs: + q.extend(cs) + continue + for obj in handler.tile(t.op.outputs): + to_update_inputs = [] + chunks = [] + for inp in t.op.inputs: + chunks.extend(inp.chunks) + if has_unknown_shape(inp): + to_update_inputs.append(inp) + if obj is None: + yield chunks + to_update_inputs + else: + yield obj + to_update_inputs + q.pop() + + if not return_list: + return tileable + else: + return [tileable] + list(tileables) diff --git a/python/xorbits/_mars/core/entrypoints.py b/python/xorbits/_mars/core/entrypoints.py new file mode 100644 index 000000000..53ea7ee6b --- /dev/null +++ b/python/xorbits/_mars/core/entrypoints.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
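`refresh_tileable_shape` above rebuilds `nsplits` from the tiled chunks via `calc_nsplits({c.index: c.shape ...})`. The helper itself lives in the `_mars` utils module; the following is only a rough sketch of what such a computation has to do, not the actual implementation:

from typing import Dict, Tuple

def calc_nsplits_sketch(
    chunk_idx_to_shape: Dict[Tuple[int, ...], Tuple[int, ...]]
) -> Tuple[Tuple[int, ...], ...]:
    """Derive per-dimension chunk sizes from a mapping of chunk index -> chunk shape."""
    ndim = len(next(iter(chunk_idx_to_shape)))
    nsplits = []
    for dim in range(ndim):
        # walk along one axis while keeping all other indices at 0
        sizes = {
            idx[dim]: shape[dim]
            for idx, shape in chunk_idx_to_shape.items()
            if all(i == 0 for d, i in enumerate(idx) if d != dim)
        }
        nsplits.append(tuple(sizes[i] for i in sorted(sizes)))
    return tuple(nsplits)

# a 10x10 tileable split into 2x2 chunks of size 5x5
chunks = {(0, 0): (5, 5), (0, 1): (5, 5), (1, 0): (5, 5), (1, 1): (5, 5)}
assert calc_nsplits_sketch(chunks) == ((5, 5), (5, 5))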
+ +import functools +import logging +import warnings + +logger = logging.getLogger(__name__) + + +# from https://github.com/numba/numba/blob/master/numba/core/entrypoints.py +# Must put this here to avoid extensions re-triggering initialization +@functools.lru_cache(maxsize=None) +def init_extension_entrypoints(): + """Execute all `mars_extensions` entry points with the name `init` + If extensions have already been initialized, this function does nothing. + """ + from pkg_resources import iter_entry_points + + for entry_point in iter_entry_points("mars_extensions", "init"): + logger.info("Loading extension: %s", entry_point) + try: + func = entry_point.load() + func() + except Exception as e: + msg = "Mars extension module '{}' failed to load due to '{}({})'." + warnings.warn( + msg.format(entry_point.module_name, type(e).__name__, str(e)), + stacklevel=2, + ) + logger.info("Extension loading failed for: %s", entry_point) diff --git a/python/xorbits/_mars/core/graph/__init__.py b/python/xorbits/_mars/core/graph/__init__.py new file mode 100644 index 000000000..e298eabdd --- /dev/null +++ b/python/xorbits/_mars/core/graph/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .builder import ChunkGraphBuilder, TileableGraphBuilder, TileContext, TileStatus +from .core import DAG, DirectedGraph, GraphContainsCycleError +from .entity import ChunkGraph, EntityGraph, TileableGraph diff --git a/python/xorbits/_mars/core/graph/builder/__init__.py b/python/xorbits/_mars/core/graph/builder/__init__.py new file mode 100644 index 000000000..2e5f05c3e --- /dev/null +++ b/python/xorbits/_mars/core/graph/builder/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .chunk import ChunkGraphBuilder, TileContext, TileStatus +from .tileable import TileableGraphBuilder diff --git a/python/xorbits/_mars/core/graph/builder/base.py b/python/xorbits/_mars/core/graph/builder/base.py new file mode 100644 index 000000000..dbf56e3bc --- /dev/null +++ b/python/xorbits/_mars/core/graph/builder/base.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from abc import ABC, abstractmethod +from typing import Generator, List, Set, Union + +from ....typing import EntityType +from ..entity import ChunkGraph, EntityGraph, TileableGraph + + +def _default_inputs_selector(inputs: List[EntityType]) -> List[EntityType]: + return inputs + + +class AbstractGraphBuilder(ABC): + _graph: EntityGraph + + def __init__(self, graph: EntityGraph): + self._graph = graph + + def _process_node(self, entity: EntityType): + return entity + + def _select_inputs(self, inputs: List[EntityType]): + return inputs + + def _if_add_node( + self, node: EntityType, visited: Set + ): # pylint: disable=no-self-use + return node not in visited + + def _add_nodes( + self, + graph: Union[ChunkGraph, TileableGraph], + nodes: List[EntityType], + visited: Set, + ): + # update visited + visited.update(nodes) + + while len(nodes) > 0: + node = nodes.pop() + node = self._process_node(node) + + # mark node as visited + visited.add(node) + + # add node to graph if possible + if not graph.contains(node): + graph.add_node(node) + + children = self._select_inputs(node.inputs or []) + if children: + node.inputs = children + for c in children: + c = self._process_node(c) + if not graph.contains(c): + graph.add_node(c) + if not graph.has_successor(c, node): + graph.add_edge(c, node) + for out in c.op.outputs: + if self._if_add_node(out, visited): + nodes.append(out) + + @abstractmethod + def build(self) -> Generator[Union[EntityGraph, ChunkGraph], None, None]: + """ + Build a entity graph. + + Returns + ------- + graph : EntityGraph + Entity graph. + """ diff --git a/python/xorbits/_mars/core/graph/builder/chunk.py b/python/xorbits/_mars/core/graph/builder/chunk.py new file mode 100644 index 000000000..06d0a7a87 --- /dev/null +++ b/python/xorbits/_mars/core/graph/builder/chunk.py @@ -0,0 +1,430 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
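`AbstractGraphBuilder._add_nodes` walks backwards from the requested nodes through their inputs, adding each input plus an input-to-node edge and queueing further nodes to visit. A toy, dependency-free sketch of that traversal with a stand-in `Node` type; the real builder additionally consults `op.outputs` and the graph's own containment checks:

from dataclasses import dataclass, field
from typing import Dict, List, Set

@dataclass(eq=False)
class Node:
    name: str
    inputs: List["Node"] = field(default_factory=list)

def add_nodes(edges: Dict[Node, Set[Node]], nodes: List[Node], visited: Set[Node]):
    """Walk from `nodes` towards their inputs, recording input -> node edges."""
    visited.update(nodes)
    stack = list(nodes)
    while stack:
        node = stack.pop()
        visited.add(node)
        edges.setdefault(node, set())
        for inp in node.inputs:
            edges.setdefault(inp, set()).add(node)
            if inp not in visited:
                visited.add(inp)
                stack.append(inp)

a = Node("a")
b = Node("b", inputs=[a])
c = Node("c", inputs=[a, b])
graph: Dict[Node, Set[Node]] = {}
add_nodes(graph, [c], set())
assert graph[a] == {b, c} and graph[b] == {c}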
+ +import dataclasses +import functools +from typing import Callable, Dict, Generator, Iterable, List, Optional, Set, Type, Union + +from ....core import CHUNK_TYPE, FUSE_CHUNK_TYPE, TILEABLE_TYPE +from ....typing import ChunkType, EntityType, TileableType +from ....utils import build_fetch, copy_tileables +from ...entity.tileables import handler +from ...mode import enter_mode +from ..entity import ChunkGraph, TileableGraph +from .base import AbstractGraphBuilder + +tile_gen_type = Generator[List[ChunkType], List[ChunkType], List[TileableType]] +DEFAULT_UPDATED_PROGRESS = 0.4 + + +@dataclasses.dataclass +class _TileableHandler: + tileable: TileableType + handler: tile_gen_type + last_need_processes: List[EntityType] = None + + +@dataclasses.dataclass +class _TileableTileInfo: + curr_iter: int + # incremental progress for this iteration + tile_progress: float + # newly generated chunks by a tileable in this iteration + generated_chunks: List[ChunkType] = dataclasses.field(default_factory=list) + + +class TileContext(Dict[TileableType, TileableType]): + _tileables = Set[TileableType] + _tileable_to_progress: Dict[TileableType, float] + _tileable_to_tile_infos: Dict[TileableType, List[_TileableTileInfo]] + + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + self._tileables = None + self._tileable_to_progress = dict() + self._tileable_to_tile_infos = dict() + + def set_tileables(self, tileables: Set[TileableType]): + self._tileables = tileables + + def __setitem__(self, key, value): + self._tileable_to_progress.pop(key, None) + return super().__setitem__(key, value) + + def set_progress(self, tileable: TileableType, progress: float): + assert 0.0 <= progress <= 1.0 + last_progress = self._tileable_to_progress.get(tileable, 0.0) + self._tileable_to_progress[tileable] = max(progress, last_progress) + + def get_progress(self, tileable: TileableType) -> float: + if tileable in self: + return 1.0 + else: + return self._tileable_to_progress.get(tileable, 0.0) + + def get_all_progress(self) -> float: + return sum(self.get_progress(t) for t in self._tileables) / len(self._tileables) + + def record_tileable_tile_info( + self, tileable: TileableType, curr_iter: int, generated_chunks: List[ChunkType] + ): + if tileable not in self._tileable_to_tile_infos: + self._tileable_to_tile_infos[tileable] = [] + prev_progress = sum( + info.tile_progress for info in self._tileable_to_tile_infos[tileable] + ) + curr_progress = self.get_progress(tileable) + infos = self._tileable_to_tile_infos[tileable] + infos.append( + _TileableTileInfo( + curr_iter=curr_iter, + tile_progress=curr_progress - prev_progress, + generated_chunks=generated_chunks, + ) + ) + + def get_tileable_tile_infos(self) -> Dict[TileableType, List[_TileableTileInfo]]: + return {t: self._tileable_to_tile_infos.get(t, list()) for t in self._tileables} + + +@dataclasses.dataclass +class TileStatus: + entities: List[EntityType] = None + progress: float = None + + +class Tiler: + _cur_iter: int + _cur_chunk_graph: Optional[ChunkGraph] + _tileable_handlers: Iterable[_TileableHandler] + + def __init__( + self, + tileable_graph: TileableGraph, + tile_context: TileContext, + processed_chunks: Set[str], + chunk_to_fetch: Dict[ChunkType, ChunkType], + add_nodes: Callable, + ): + self._tileable_graph = tileable_graph + self._tile_context = tile_context + self._processed_chunks = processed_chunks + self._chunk_to_fetch = chunk_to_fetch + self._add_nodes = self._wrap_add_nodes(add_nodes) + self._curr_iter = 0 + self._cur_chunk_graph = None + 
self._tileable_handlers = ( + _TileableHandler(tileable, self._tile_handler(tileable)) + for tileable in tileable_graph.topological_iter() + ) + + def _wrap_add_nodes(self, add_nodes: Callable): + @functools.wraps(add_nodes) + def inner( + chunk_graph: ChunkGraph, + chunks: List[ChunkType], + visited: Set[ChunkType], + tileable: TileableType, + ): + prev_chunks = set(chunk_graph) + add_nodes(chunk_graph, chunks, visited) + new_chunks = set(chunk_graph) + self._tile_context.record_tileable_tile_info( + tileable, self._curr_iter, list(new_chunks - prev_chunks) + ) + + return inner + + @staticmethod + def _get_data(entity: EntityType): + return entity.data if hasattr(entity, "data") else entity + + def _tile_handler(self, tileable: TileableType) -> tile_gen_type: + from ....core.operand import Fetch + + tileable = self._get_data(tileable) + + if isinstance(tileable.op, Fetch) and not tileable.is_coarse(): + return [tileable] + + assert tileable.is_coarse() + + # copy tileable + tiled_tileables = copy_tileables( + tileable.op.outputs, + inputs=[self._tile_context[inp] for inp in tileable.inputs], + copy_key=True, + copy_id=False, + ) + tiled_tileables = [self._get_data(t) for t in tiled_tileables] + # start to tile + tiled_tileables = yield from handler.tile(tiled_tileables) + return tiled_tileables + + def _gen_tileable_handlers(self, next_tileable_handlers: List[_TileableHandler]): + for tile_handler in self._tileable_handlers: + tileable, handler = tile_handler.tileable, tile_handler.handler + if tileable in self._tile_context: + continue + if any( + inp not in self._tile_context + for inp in self._tileable_graph.predecessors(tileable) + ): + # predecessors not finished yet + next_tileable_handlers.append(_TileableHandler(tileable, handler)) + continue + + yield _TileableHandler(tileable, handler) + + def _tile( + self, + chunk_graph: ChunkGraph, + tileable: TileableType, + tile_handler: tile_gen_type, + next_tileable_handlers: List[_TileableHandler], + to_update_tileables: List[TileableType], + visited: Set[EntityType], + ): + try: + need_process = next(tile_handler) + + if isinstance(need_process, TileStatus): + # process tile that returns progress + self._tile_context.set_progress(tileable, need_process.progress) + need_process = need_process.entities + else: + # if progress not specified, we just update 0.4 * rest progress + progress = self._tile_context.get_progress(tileable) + new_progress = progress + (1.0 - progress) * DEFAULT_UPDATED_PROGRESS + self._tile_context.set_progress(tileable, new_progress) + + chunks = [] + if need_process is not None: + for t in need_process: + if isinstance(t, CHUNK_TYPE): + chunks.append(self._get_data(t)) + elif isinstance(t, TILEABLE_TYPE): + to_update_tileables.append(self._get_data(t)) + # not finished yet + self._add_nodes(chunk_graph, chunks.copy(), visited, tileable) + next_tileable_handlers.append( + _TileableHandler(tileable, tile_handler, need_process) + ) + # add intermediate chunks into result chunks + # to prevent them being pruned + chunk_graph.result_chunks.extend(c for c in chunks if c in chunk_graph) + except StopIteration as e: + # tile done + tiled_tileables = e.value + for out, tiled_tileable in zip(tileable.op.outputs, tiled_tileables): + out = self._get_data(out) + tiled_tileable = self._get_data(tiled_tileable) + + chunks = tiled_tileable.chunks + if chunks is None: # pragma: no cover + raise ValueError(f"tileable({out}) is still coarse after tile") + chunks = [self._get_data(c) for c in chunks] + self._tile_context[out] = 
tiled_tileable + self._add_nodes(chunk_graph, chunks, visited, tileable) + + def _gen_result_chunks( + self, + chunk_graph: ChunkGraph, + next_tileable_handlers: List[_TileableHandler], + ): + result_chunks = chunk_graph.result_chunks + tileable_graph = self._tileable_graph + result_chunk_set = set(result_chunks) + + def _add_result_chunk(c): + if c not in result_chunk_set: + result_chunks.append(c) + result_chunk_set.add(c) + + if next_tileable_handlers: + for tileable_handler in next_tileable_handlers: + tileable = tileable_handler.tileable + # tileable that tile not completed, scan their inputs + for inp_tileable in tileable_graph.iter_predecessors(tileable): + if ( + tileable_handler.last_need_processes is None + or tileable_graph.count_successors(inp_tileable) > 1 + ): + # if nothing yielded inside its tile, + # or the input has more than 1 successors, + # make sure their chunks in result, + # so that they will not be executed repeatedly + if inp_tileable in self._tile_context: + for chunk in self._tile_context[inp_tileable].chunks: + chunk = self._get_data(chunk) + if chunk in chunk_graph: + _add_result_chunk(chunk) + for tileable in tileable_graph.result_tileables: + if tileable in self._tile_context: + for chunk in self._tile_context[tileable].chunks: + chunk = self._get_data(chunk) + if chunk in chunk_graph: + _add_result_chunk(chunk) + if ( + chunk in self._chunk_to_fetch + and self._chunk_to_fetch[chunk] in chunk_graph + ): + _add_result_chunk(self._chunk_to_fetch[chunk]) + + def _iter(self): + chunk_graph = self._cur_chunk_graph + + to_update_tileables = [] + visited = set() + + if chunk_graph is not None: + # last tiled chunks, add them to processed + # so that fetch chunk can be generated. + # Use chunk key as the key to make sure the copied chunk can be build to a fetch. 
+ processed_chunks = ( + c.chunk.key if isinstance(c, FUSE_CHUNK_TYPE) else c.key + for c in chunk_graph.result_chunks + ) + self._processed_chunks.update(processed_chunks) + + result_chunks = [] + chunk_graph = self._cur_chunk_graph = ChunkGraph(result_chunks) + + next_tileable_handlers = [] + # tile + for tile_handler in self._gen_tileable_handlers(next_tileable_handlers): + self._tile( + chunk_graph, + tile_handler.tileable, + tile_handler.handler, + next_tileable_handlers, + to_update_tileables, + visited, + ) + self._tileable_handlers = next_tileable_handlers + # gen result chunks + self._gen_result_chunks(chunk_graph, next_tileable_handlers) + # prune unused chunks + prune_chunk_graph(chunk_graph) + + self._curr_iter += 1 + + return to_update_tileables + + def __iter__(self): + while self._tileable_handlers: + to_update_tileables = self._iter() + yield self._cur_chunk_graph + for t in to_update_tileables: + t.refresh_params() + + +def prune_chunk_graph(chunk_graph: ChunkGraph): + from ....core.operand import Fetch, ShuffleProxy, VirtualOperand + + result_set = set(chunk_graph.result_chunks) + stack = list(chunk_graph.result_chunks) + used = set() + while stack: + n = stack.pop() + if n in used: + continue + used.add(n) + stack.extend(chunk_graph.predecessors(n)) + if isinstance(n.op, ShuffleProxy): + stack.extend( + succ for succ in chunk_graph.iter_successors(n) if succ not in used + ) + + unused = {n for n in chunk_graph if n not in used} + for n in unused: + # for pruned chunks, we assume we will use them later, + # so we add the inputs of them into result chunks, + # to prevent from duplicated submission + for inp in chunk_graph.iter_predecessors(n): + if ( + inp in used + and inp not in result_set + and not isinstance(inp.op, (Fetch, VirtualOperand)) + ): + chunk_graph.result_chunks.append(inp) + result_set.add(inp) + # prune chunk + chunk_graph.remove_node(n) + + +class ChunkGraphBuilder(AbstractGraphBuilder): + _graph: TileableGraph + + def __init__( + self, + graph: TileableGraph, + fuse_enabled: bool = True, + tile_context: TileContext = None, + tiler_cls: Union[Type[Tiler], Callable] = None, + ): + super().__init__(graph) + self.fuse_enabled = fuse_enabled + self.tile_context = TileContext() if tile_context is None else tile_context + self.tile_context.set_tileables(set(graph)) + + self._processed_chunks: Set[str] = set() + self._chunk_to_fetch: Dict[ChunkType, ChunkType] = dict() + + tiler_cls = Tiler if tiler_cls is None else tiler_cls + self.tiler = tiler_cls( + self._graph, + self.tile_context, + self._processed_chunks, + self._chunk_to_fetch, + self._add_nodes, + ) + + def _process_node(self, entity: EntityType): + if entity.key in self._processed_chunks: + if entity not in self._chunk_to_fetch: + # gen fetch + fetch_chunk = build_fetch(entity).data + self._chunk_to_fetch[entity] = fetch_chunk + return self._chunk_to_fetch[entity] + return entity + + def _select_inputs(self, inputs: List[ChunkType]): + new_inputs = [] + for inp in inputs: + if inp.key in self._processed_chunks: + # gen fetch + if inp not in self._chunk_to_fetch: + fetch_chunk = build_fetch(inp).data + self._chunk_to_fetch[inp] = fetch_chunk + new_inputs.append(self._chunk_to_fetch[inp]) + else: + new_inputs.append(inp) + return new_inputs + + def _if_add_node(self, node: EntityType, visited: Set): + return node not in visited and node.key not in self._processed_chunks + + def _build(self) -> Iterable[Union[TileableGraph, ChunkGraph]]: + tile_iterator = iter(self.tiler) + while True: + try: + with 
enter_mode(build=True, kernel=True): + graph = next(tile_iterator) + yield graph + except StopIteration: + break + + def build(self) -> Generator[Union[TileableGraph, ChunkGraph], None, None]: + yield from self._build() diff --git a/python/xorbits/_mars/core/graph/builder/tileable.py b/python/xorbits/_mars/core/graph/builder/tileable.py new file mode 100644 index 000000000..65d3c9ecb --- /dev/null +++ b/python/xorbits/_mars/core/graph/builder/tileable.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Generator, Union + +from ...mode import enter_mode +from ..entity import ChunkGraph, TileableGraph +from .base import AbstractGraphBuilder + + +class TileableGraphBuilder(AbstractGraphBuilder): + _graph: TileableGraph + + def __init__(self, graph: TileableGraph): + super().__init__(graph=graph) + + @enter_mode(build=True, kernel=True) + def _build(self) -> Union[TileableGraph, ChunkGraph]: + self._add_nodes(self._graph, list(self._graph.result_tileables), set()) + return self._graph + + def build(self) -> Generator[Union[TileableGraph, ChunkGraph], None, None]: + yield self._build() diff --git a/python/xorbits/_mars/core/graph/builder/utils.py b/python/xorbits/_mars/core/graph/builder/utils.py new file mode 100644 index 000000000..e32d9e4f5 --- /dev/null +++ b/python/xorbits/_mars/core/graph/builder/utils.py @@ -0,0 +1,41 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
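Driving the two builders by hand follows the same recipe as the `tile()` helper in `entity/utils.py`: build the coarse `TileableGraph` first, then hand it to a `ChunkGraphBuilder` and iterate its generator. A usage sketch, assuming a compiled `xorbits._mars` build is importable:

# usage sketch; requires a built xorbits._mars package
from xorbits._mars import tensor as mt
from xorbits._mars.core.graph import (
    ChunkGraphBuilder,
    TileableGraph,
    TileableGraphBuilder,
    TileContext,
)

t = mt.random.rand(10, chunk_size=5) + 1

# 1. build the coarse-grained tileable graph
tileable_graph = TileableGraph([t.data])
next(TileableGraphBuilder(tileable_graph).build())

# 2. tile it into chunk graphs; each iteration of the generator yields
#    the chunk graph produced by that round of tiling
tile_context = TileContext()
builder = ChunkGraphBuilder(tileable_graph, fuse_enabled=False, tile_context=tile_context)
for chunk_graph in builder.build():
    print(len(chunk_graph), "chunks in this iteration")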
+ +import itertools +from typing import List, Union + +from ....typing import TileableType +from ...mode import enter_mode +from ..entity import ChunkGraph, TileableGraph +from .chunk import ChunkGraphBuilder +from .tileable import TileableGraphBuilder + + +@enter_mode(kernel=True) +def build_graph( + tileables: List[TileableType], + tile: bool = False, + fuse_enabled: bool = True, + **chunk_graph_build_kwargs +) -> Union[TileableGraph, ChunkGraph]: + tileables = list(itertools.chain(*(tileable.op.outputs for tileable in tileables))) + tileable_graph = TileableGraph(tileables) + tileable_graph_builder = TileableGraphBuilder(tileable_graph) + tileable_graph = next(tileable_graph_builder.build()) + if not tile: + return tileable_graph + chunk_graph_builder = ChunkGraphBuilder( + tileable_graph, fuse_enabled=fuse_enabled, **chunk_graph_build_kwargs + ) + return next(chunk_graph_builder.build()) diff --git a/python/xorbits/_mars/core/graph/core.pyx b/python/xorbits/_mars/core/graph/core.pyx new file mode 100644 index 000000000..9134ae4c2 --- /dev/null +++ b/python/xorbits/_mars/core/graph/core.pyx @@ -0,0 +1,464 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +from collections import deque +from io import StringIO + +logger = logging.getLogger(__name__) + + +cdef class DirectedGraph: + cdef readonly: + dict _nodes + dict _predecessors + dict _successors + + def __init__(self): + self._nodes = dict() + self._predecessors = dict() + self._successors = dict() + + def __iter__(self): + return iter(self._nodes) + + def __contains__(self, n): + return n in self._nodes + + def __len__(self): + return len(self._nodes) + + def __getitem__(self, n): + return self._successors[n] + + def contains(self, node): + return node in self._nodes + + def add_node(self, node, node_attr=None, **node_attrs): + if node_attr is None: + node_attr = node_attrs + else: + try: + node_attr.update(node_attrs) + except AttributeError: + raise TypeError('The node_attr argument must be a dictionary') + self._add_node(node, node_attr) + + cdef inline _add_node(self, node, dict node_attr=None): + if node_attr is None: + node_attr = dict() + if node not in self._nodes: + self._nodes[node] = node_attr + self._successors[node] = dict() + self._predecessors[node] = dict() + else: + self._nodes[node].update(node_attr) + + def remove_node(self, node): + if node not in self._nodes: + raise KeyError(f'Node {node} does not exist ' + f'in the directed graph') + + del self._nodes[node] + + for succ in self._successors[node]: + del self._predecessors[succ][node] + del self._successors[node] + + for pred in self._predecessors[node]: + del self._successors[pred][node] + del self._predecessors[node] + + def add_edge(self, u, v, edge_attr=None, **edge_attrs): + if edge_attr is None: + edge_attr = edge_attrs + else: + try: + edge_attr.update(edge_attrs) + except AttributeError: + raise TypeError('The edge_attr argument must be a dictionary') + self._add_edge(u, v, edge_attr) + + cdef inline 
_add_edge(self, u, v, edge_attr=None): + cdef: + dict u_succ, v_pred + + if u not in self._nodes: + raise KeyError(f'Node {u} does not exist in the directed graph') + if v not in self._nodes: + raise KeyError(f'Node {v} does not exist in the directed graph') + + if edge_attr is None: + edge_attr = dict() + + u_succ = self._successors[u] + if v in u_succ: + u_succ[v].update(edge_attr) + else: + u_succ[v] = edge_attr + + v_pred = self._predecessors[v] + if u not in v_pred: + # `update` is not necessary, as they point to the same object + v_pred[u] = edge_attr + + def remove_edge(self, u, v): + try: + del self._successors[u][v] + del self._predecessors[v][u] + except KeyError: + raise KeyError(f'Edge {u}->{v} does not exist in the directed graph') + + def has_successor(self, u, v): + return (u in self._successors) and (v in self._successors[u]) + + def has_predecessor(self, u, v): + return (u in self._predecessors) and (v in self._predecessors[u]) + + def iter_nodes(self, data=False): + if data: + return iter(self._nodes.items()) + return iter(self._nodes) + + def iter_successors(self, n): + try: + return iter(self._successors[n]) + except KeyError: + raise KeyError(f'Node {n} does not exist in the directed graph') + + cpdef list successors(self, n): + try: + return list(self._successors[n]) + except KeyError: + raise KeyError(f'Node {n} does not exist in the directed graph') + + def iter_predecessors(self, n): + try: + return iter(self._predecessors[n]) + except KeyError: + raise KeyError(f'Node {n} does not exist in the directed graph') + + cpdef list predecessors(self, n): + try: + return list(self._predecessors[n]) + except KeyError: + raise KeyError(f'Node {n} does not exist in the directed graph') + + cpdef int count_successors(self, n): + return len(self._successors[n]) + + cpdef int count_predecessors(self, n): + return len(self._predecessors[n]) + + def iter_indep(self, bint reverse=False): + cdef dict preds + preds = self._predecessors if not reverse else self._successors + for n, p in preds.items(): + if len(p) == 0: + yield n + + cpdef int count_indep(self, reverse=False): + cdef: + dict preds + int result = 0 + preds = self._predecessors if not reverse else self._successors + for n, p in preds.items(): + if len(p) == 0: + result += 1 + return result + + def dfs(self, start=None, visit_predicate=None, successors=None, reverse=False): + cdef: + set visited = set() + list stack + bint visit_all = False + + if reverse: + pred_fun, succ_fun = self.successors, self.predecessors + else: + pred_fun, succ_fun = self.predecessors, self.successors + + if start: + if not isinstance(start, (list, tuple)): + start = [start] + stack = list(start) + else: + stack = list(self.iter_indep(reverse=reverse)) + + def _default_visit_predicate(n, visited): + cdef list preds + preds = pred_fun(n) + return not preds or all(pred in visited for pred in preds) + + successors = successors or succ_fun + visit_all = (visit_predicate == 'all') + visit_predicate = visit_predicate or _default_visit_predicate + + while stack: + node = stack.pop() + if node in visited: + continue + preds = self.predecessors(node) + if visit_all or visit_predicate(node, visited): + yield node + visited.add(node) + stack.extend(n for n in successors(node) if n not in visited) + else: + stack.append(node) + stack.extend(n for n in preds if n not in visited) + + def bfs(self, start=None, visit_predicate=None, successors=None, reverse=False): + cdef: + object queue + object node + set visited = set() + bint visit_all = False + + if 
reverse: + pred_fun, succ_fun = self.successors, self.predecessors + else: + pred_fun, succ_fun = self.predecessors, self.successors + + if start is not None: + if not isinstance(start, (list, tuple)): + start = [start] + queue = deque(start) + else: + queue = deque(self.iter_indep(reverse=reverse)) + + def _default_visit_predicate(n, visited): + preds = pred_fun(n) + return not preds or all(pred in visited for pred in preds) + + successors = successors or succ_fun + visit_all = (visit_predicate == 'all') + visit_predicate = visit_predicate or _default_visit_predicate + + while queue: + node = queue.popleft() + if node in visited: + continue + preds = pred_fun(node) + if visit_all or visit_predicate(node, visited): + yield node + visited.add(node) + queue.extend(n for n in successors(node) if n not in visited) + else: + queue.append(node) + queue.extend(n for n in preds if n not in visited) + + def copy(self): + cdef DirectedGraph graph = type(self)() + for n in self: + if n not in graph._nodes: + graph._add_node(n) + for succ in self.iter_successors(n): + if succ not in graph._nodes: + graph._add_node(succ) + graph._add_edge(n, succ) + return graph + + def copyto(self, DirectedGraph other_graph): + if other_graph is self: + return + + other_graph._nodes = self._nodes.copy() + other_graph._predecessors = self._predecessors.copy() + other_graph._successors = self._successors.copy() + + def build_undirected(self): + cdef DirectedGraph graph = DirectedGraph() + for n in self: + if n not in graph._nodes: + graph._add_node(n) + for succ in self._successors[n]: + if succ not in graph._nodes: + graph._add_node(succ) + graph._add_edge(n, succ) + graph._add_edge(succ, n) + return graph + + def build_reversed(self): + cdef DirectedGraph graph = type(self)() + for n in self: + if n not in graph._nodes: + graph._add_node(n) + for succ in self._successors[n]: + if succ not in graph._nodes: + graph._add_node(succ) + graph._add_edge(succ, n) + return graph + + @classmethod + def _repr_in_dot(cls, val): + if isinstance(val, bool): + return 'true' if val else 'false' + if isinstance(val, str): + return f'"{val}"' + return val + + def _extract_operands(self, node): + return [node.op] + + def to_dot( + self, + graph_attrs=None, + node_attrs=None, + trunc_key=5, result_chunk_keys=None, show_columns=False): + + sio = StringIO() + sio.write('digraph {\n') + sio.write('splines=curved\n') + sio.write('rankdir=BT\n') + + if graph_attrs: + sio.write('graph [{0}];\n'.format( + ' '.join(f'{k}={self._repr_in_dot(v)}' for k, v in graph_attrs.items()))) + if node_attrs: + sio.write('node [{0}];\n'.format( + ' '.join(f'{k}={self._repr_in_dot(v)}' for k, v in node_attrs.items()))) + + chunk_style = '[shape=box]' + operand_style = '[shape=circle]' + + visited = set() + + def get_col_names(obj): + if hasattr(obj, "dtypes"): + return f"\"{','.join(list(obj.dtypes.index))}\"" + elif hasattr(obj, "name"): + return f"\"{obj.name}\"" + else: + return "\"N/A\"" + + for node in self.iter_nodes(): + for op in self._extract_operands(node): + op_name = type(op).__name__ + if op.stage is not None: + op_name = f'{op_name}:{op.stage.name}' + if op.key in visited: + continue + for input_chunk in (op.inputs or []): + if input_chunk.key not in visited: + sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" {chunk_style}\n') + visited.add(input_chunk.key) + if op.key not in visited: + sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operand_style}\n') + visited.add(op.key) + sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, 
trunc_key)}" -> ' + f'"{op_name}:{op.key[:trunc_key]}"\n') + + for output_chunk in (op.outputs or []): + if output_chunk.key not in visited: + tmp_chunk_style = chunk_style + if result_chunk_keys and output_chunk.key in result_chunk_keys: + tmp_chunk_style = '[shape=box,style=filled,fillcolor=cadetblue1]' + sio.write(f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}" {tmp_chunk_style}\n') + visited.add(output_chunk.key) + if op.key not in visited: + sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operand_style}\n') + visited.add(op.key) + sio.write(f'"{op_name}:{op.key[:trunc_key]}" -> ' + f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}"') + if show_columns: + sio.write(f' [ label={get_col_names(output_chunk)} ]') + sio.write("\n") + + sio.write('}') + return sio.getvalue() + + @classmethod + def _gen_chunk_key(cls, chunk, trunc_key): + if "_" in chunk.key: + key, index = chunk.key.split("_", 1) + return "_".join([key[:trunc_key], index]) + else: # pragma: no cover + return chunk.key[:trunc_key] + + def _repr_svg_(self): # pragma: no cover + from graphviz import Source + return Source(self.to_dot())._repr_svg_() + + def compose(self, list keys=None): + from ...optimizes.chunk_graph.fuse import Fusion + + return Fusion(self).compose(keys=keys) + + def decompose(self, nodes=None): + from ...optimizes.chunk_graph.fuse import Fusion + + Fusion(self).decompose(nodes=nodes) + + def view(self, filename='default', graph_attrs=None, node_attrs=None, result_chunk_keys=None, show_columns=False): # pragma: no cover + from graphviz import Source + + g = Source(self.to_dot(graph_attrs, node_attrs, result_chunk_keys=result_chunk_keys, show_columns=show_columns)) + g.view(filename=filename, cleanup=True) + + def to_dag(self): + dag = DAG() + dag._nodes = self._nodes.copy() + dag._predecessors = self._predecessors.copy() + dag._successors = self._successors.copy() + return dag + + +class GraphContainsCycleError(Exception): + pass + + +cdef class DAG(DirectedGraph): + def to_dag(self): + return self + + def topological_iter(self, succ_checker=None, reverse=False): + cdef: + dict preds, succs + set visited = set() + list stack + + if len(self) == 0: + return + + if reverse: + preds, succs = self._successors, self._predecessors + else: + preds, succs = self._predecessors, self._successors + + # copy predecessors and successors + succs = dict((k, set(v)) for k, v in succs.items()) + preds = dict((k, set(v)) for k, v in preds.items()) + + def _default_succ_checker(_, predecessors): + return len(predecessors) == 0 + + succ_checker = succ_checker or _default_succ_checker + + stack = list((p for p, l in preds.items() if len(l) == 0)) + if not stack: + raise GraphContainsCycleError + while stack: + node = stack.pop() + yield node + visited.add(node) + for succ in succs.get(node, {}): + if succ in visited: + raise GraphContainsCycleError + succ_preds = preds[succ] + succ_preds.remove(node) + if succ_checker(succ, succ_preds): + stack.append(succ) + if len(visited) != len(self): + raise GraphContainsCycleError diff --git a/python/xorbits/_mars/core/graph/entity.py b/python/xorbits/_mars/core/graph/entity.py new file mode 100644 index 000000000..8672cdbec --- /dev/null +++ b/python/xorbits/_mars/core/graph/entity.py @@ -0,0 +1,170 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABCMeta, abstractmethod +from typing import Dict, Iterable, List, Tuple, Union + +from ...core import Chunk, Tileable +from ...serialization.core import buffered +from ...serialization.serializables import BoolField, DictField, ListField, Serializable +from ...serialization.serializables.core import SerializableSerializer +from ...utils import tokenize +from .core import DAG + + +class EntityGraph(DAG, metaclass=ABCMeta): + @property + @abstractmethod + def results(self): + """ + Return result tileables or chunks. + + Returns + ------- + results + """ + + @results.setter + @abstractmethod + def results(self, new_results): + """ + Set result tileables or chunks. + + Parameters + ---------- + new_results + + Returns + ------- + + """ + + def copy(self) -> "EntityGraph": + graph = super().copy() + graph.results = self.results.copy() + return graph + + +class TileableGraph(EntityGraph, Iterable[Tileable]): + _result_tileables: List[Tileable] + # logic key is a unique and deterministic key for `TileableGraph`. For + # multiple runs the logic key will remain same if the computational logic + # doesn't change. And it can be used to some optimization when running a + # same `execute`, like HBO. + _logic_key: str + + def __init__(self, result_tileables: List[Tileable] = None): + super().__init__() + self._result_tileables = result_tileables + + @property + def result_tileables(self): + return self._result_tileables + + @property + def results(self): + return self._result_tileables + + @results.setter + def results(self, new_results): + self._result_tileables = new_results + + @property + def logic_key(self): + if not hasattr(self, "_logic_key") or self._logic_key is None: + token_keys = [] + for node in self.bfs(): + logic_key = node.op.get_logic_key() + if hasattr(node.op, "logic_key") and node.op.logic_key is None: + node.op.logic_key = logic_key + token_keys.append( + tokenize(logic_key, **node.extra_params) + if node.extra_params + else logic_key + ) + self._logic_key = tokenize(*token_keys) + return self._logic_key + + +class ChunkGraph(EntityGraph, Iterable[Chunk]): + _result_chunks: List[Chunk] + + def __init__(self, result_chunks: List[Chunk] = None): + super().__init__() + self._result_chunks = result_chunks + + @property + def result_chunks(self): + return self._result_chunks + + @property + def results(self): + return self._result_chunks + + @results.setter + def results(self, new_results): + self._result_chunks = new_results + + +class SerializableGraph(Serializable): + _is_chunk = BoolField("is_chunk") + # TODO(qinxuye): remove this logic when we handle fetch elegantly, + # now, the node in the graph and inputs for operand may be inconsistent, + # for example, an operand's inputs may be chunks, + # but in the graph, the predecessors are all fetch chunks, + # we serialize the fetch chunks first to make sure when operand's inputs + # are serialized, they will just be marked as serialized and skip serialization. 
+ _fetch_nodes = ListField("fetch_nodes") + _nodes = DictField("nodes") + _predecessors = DictField("predecessors") + _successors = DictField("successors") + _results = ListField("results") + + @classmethod + def from_graph(cls, graph: Union[TileableGraph, ChunkGraph]) -> "SerializableGraph": + from ..operand import Fetch + + is_chunk = isinstance(graph, ChunkGraph) + return SerializableGraph( + _is_chunk=is_chunk, + _fetch_nodes=[chunk for chunk in graph if isinstance(chunk.op, Fetch)], + _nodes=graph._nodes, + _predecessors=graph._predecessors, + _successors=graph._successors, + _results=graph.results, + ) + + def to_graph(self) -> Union[TileableGraph, ChunkGraph]: + graph_cls = ChunkGraph if self._is_chunk else TileableGraph + graph = graph_cls(self._results) + graph._nodes.update(self._nodes) + graph._predecessors.update(self._predecessors) + graph._successors.update(self._successors) + return graph + + +class GraphSerializer(SerializableSerializer): + @buffered + def serial(self, obj: Union[TileableGraph, ChunkGraph], context: Dict): + serializable_graph = SerializableGraph.from_graph(obj) + return (), [serializable_graph], False + + def deserial( + self, serialized: Tuple, context: Dict, subs: List + ) -> Union[TileableGraph, ChunkGraph]: + serializable_graph: SerializableGraph = subs[0] + return serializable_graph.to_graph() + + +GraphSerializer.register(EntityGraph) diff --git a/python/xorbits/_mars/core/graph/tests/__init__.py b/python/xorbits/_mars/core/graph/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/core/graph/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/core/graph/tests/test_graph.py b/python/xorbits/_mars/core/graph/tests/test_graph.py new file mode 100644 index 000000000..ed9be63bf --- /dev/null +++ b/python/xorbits/_mars/core/graph/tests/test_graph.py @@ -0,0 +1,211 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from .... import dataframe as md +from .... import tensor as mt +from ....tests.core import flaky +from ....utils import to_str +from .. 
import DAG, GraphContainsCycleError + + +def test_dag(): + r""" + 1 --- 4 + 2 --- 6 + \ / + 5 + / + 3 + """ + + dag = DAG() + [dag.add_node(i) for i in range(1, 7)] + dag.add_edge(1, 4) + dag.add_edge(2, 6) + dag.add_edge(2, 5) + dag.add_edge(5, 6) + dag.add_edge(3, 5) + + with pytest.raises(KeyError): + dag.add_edge(1, 10) + with pytest.raises(KeyError): + dag.add_edge(10, 1) + + assert set(dag[2]) == {5, 6} + assert list(dag.topological_iter()) == [3, 2, 5, 6, 1, 4] + + assert list(dag.dfs()) == [3, 2, 5, 6, 1, 4] + assert list(dag.bfs()) == [1, 2, 3, 4, 5, 6] + + dag.add_edge(6, 1) + dag.add_edge(1, 2) + + with pytest.raises(KeyError): + for _ in dag.iter_predecessors(-1): + pass + + with pytest.raises(KeyError): + for _ in dag.iter_successors(-1): + pass + + with pytest.raises(GraphContainsCycleError): + _ = list(dag.topological_iter()) + + dag.remove_edge(2, 5) + assert dag.has_successor(2, 5) is False + with pytest.raises(KeyError): + dag.remove_edge(2, 5) + + rev_dag = dag.build_reversed() + for n in dag: + assert n in rev_dag + assert ( + all(rev_dag.has_successor(n, pred) for pred in dag.predecessors(n)) is True + ) + + undigraph = dag.build_undirected() + for n in dag: + assert n in undigraph + assert ( + all(undigraph.has_predecessor(pred, n) for pred in dag.predecessors(n)) + is True + ) + assert ( + all(undigraph.has_successor(n, pred) for pred in dag.predecessors(n)) + is True + ) + + dag_copy = dag.copy() + for n in dag: + assert n in dag_copy + assert ( + all(dag_copy.has_successor(pred, n) for pred in dag_copy.predecessors(n)) + is True + ) + + +@flaky(max_runs=3) +def test_to_dot(): + arr = mt.random.randint(10, size=(10, 8), chunk_size=4) + arr_add = mt.random.randint(10, size=(10, 8), chunk_size=4) + arr2 = arr + arr_add + graph = arr2.build_graph(fuse_enabled=False, tile=True) + + dot = to_str(graph.to_dot(trunc_key=5)) + assert all(to_str(n.key)[:5] in dot for n in graph) is True + + +def test_tileable_graph_logic_key(): + # Tensor + t1 = mt.random.randint(10, size=(10, 8), chunk_size=4) + t2 = mt.random.randint(10, size=(10, 8), chunk_size=5) + graph1 = (t1 + t2).build_graph(tile=False) + tt1 = mt.random.randint(10, size=(10, 8), chunk_size=4) + tt2 = mt.random.randint(10, size=(10, 8), chunk_size=5) + graph2 = (tt1 + tt2).build_graph(tile=False) + assert graph1.logic_key == graph2.logic_key + t3 = mt.random.randint(10, size=(10, 8), chunk_size=6) + tt3 = mt.random.randint(10, size=(10, 8), chunk_size=6) + graph3 = (t1 + t3).build_graph(tile=False) + graph4 = (t1 + tt3).build_graph(tile=False) + assert graph1.logic_key != graph3.logic_key + assert graph3.logic_key == graph4.logic_key + t4 = mt.random.randint(10, size=(10, 8)) + graph5 = (t1 + t4).build_graph(tile=False) + assert graph1.logic_key != graph5.logic_key + + # Series + s1 = md.Series([1, 3, 5, mt.nan, 6, 8]) + s2 = md.Series(np.random.randn(1000), chunk_size=100) + graph1 = (s1 + s2).build_graph(tile=False) + ss1 = md.Series([1, 3, 5, mt.nan, 6, 8]) + ss2 = md.Series(np.random.randn(1000), chunk_size=100) + graph2 = (ss1 + ss2).build_graph(tile=False) + assert graph1.logic_key == graph2.logic_key + s3 = md.Series(np.random.randn(1000), chunk_size=200) + ss3 = md.Series(np.random.randn(1000), chunk_size=200) + graph3 = (s1 + s3).build_graph(tile=False) + graph4 = (s1 + ss3).build_graph(tile=False) + assert graph1.logic_key != graph3.logic_key + assert graph3.logic_key == graph4.logic_key + s4 = md.Series(np.random.randn(1000)) + graph5 = (s1 + s4).build_graph(tile=False) + assert graph1.logic_key != 
graph5.logic_key + + # DataFrame + df1 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=5 + ) + df2 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=4 + ) + graph1 = (df1 + df2).build_graph(tile=False) + ddf1 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=5 + ) + ddf2 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=4 + ) + graph2 = (ddf1 + ddf2).build_graph(tile=False) + assert graph1.logic_key == graph2.logic_key + df3 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=3 + ) + ddf3 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=3 + ) + graph3 = (df1 + df3).build_graph(tile=False) + graph4 = (df1 + ddf3).build_graph(tile=False) + assert graph1.logic_key != graph3.logic_key + assert graph3.logic_key == graph4.logic_key + df5 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD") + ) + graph5 = (df1 + df5).build_graph(tile=False) + assert graph1.logic_key != graph5.logic_key + graph6 = df1.describe().build_graph(tile=False) + graph7 = df2.describe().build_graph(tile=False) + assert graph6.logic_key != graph7.logic_key + graph8 = df1.apply(lambda x: x.max() - x.min()).build_graph(tile=False) + graph9 = df2.apply(lambda x: x.max() - x.min()).build_graph(tile=False) + assert graph8.logic_key != graph9.logic_key + assert ( + graph8.result_tileables[0].op.logic_key + == graph9.result_tileables[0].op.logic_key + ) + pieces1 = [df1[:3], df1[3:7], df1[7:]] + graph10 = md.concat(pieces1).build_graph(tile=False) + pieces2 = [df2[:3], df2[3:7], df2[7:]] + graph11 = md.concat(pieces2).build_graph(tile=False) + assert graph10.logic_key != graph11.logic_key + graph12 = md.merge(df1, df2, on="A", how="left").build_graph(tile=False) + graph13 = md.merge(df1, df3, on="A", how="left").build_graph(tile=False) + assert graph12.logic_key != graph13.logic_key + graph14 = df2.groupby("A").sum().build_graph(tile=False) + graph15 = df3.groupby("A").sum().build_graph(tile=False) + assert graph14.logic_key != graph15.logic_key + graph16 = ( + df2.groupby("A").apply(lambda x: x.max() - x.min()).build_graph(tile=False) + ) + graph17 = ( + df3.groupby("A").apply(lambda x: x.max() - x.min()).build_graph(tile=False) + ) + assert graph16.logic_key != graph17.logic_key + assert ( + graph16.result_tileables[0].op.logic_key + == graph17.result_tileables[0].op.logic_key + ) diff --git a/python/xorbits/_mars/core/mode.py b/python/xorbits/_mars/core/mode.py new file mode 100644 index 000000000..09ab86faf --- /dev/null +++ b/python/xorbits/_mars/core/mode.py @@ -0,0 +1,96 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
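The logic-key tests in the hunk above rely on one property: a graph's logic key is a hash over each operand's structural description (type, stage, extra params) in traversal order, so rebuilding the same computation yields the same key, while changing e.g. `chunk_size` yields a different one. A simplified illustration of that property, using `hashlib` as a stand-in for the real `tokenize` utility (the helpers below are hypothetical and not part of the diff):

import hashlib


def fake_tokenize(*values):
    # simplified stand-in for the `tokenize` utility used by the graph code
    return hashlib.sha1(repr(values).encode()).hexdigest()


def graph_logic_key(ops):
    # ops: iterable of (op_type_name, params) in a deterministic traversal order
    token_keys = [fake_tokenize(name, sorted(params.items())) for name, params in ops]
    return fake_tokenize(*token_keys)


a = graph_logic_key([("TensorRandint", {"chunk_size": 4}), ("TensorAdd", {})])
b = graph_logic_key([("TensorRandint", {"chunk_size": 4}), ("TensorAdd", {})])
c = graph_logic_key([("TensorRandint", {"chunk_size": 6}), ("TensorAdd", {})])
assert a == b and a != c  # same logic -> same key, different chunk_size -> different key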
+ +import functools +import inspect +import threading + +from ..config import options + +_internal_mode = threading.local() + + +def is_eager_mode(): + in_kernel = is_kernel_mode() + if not in_kernel: + return options.eager_mode + else: + # in kernel, eager mode always False + return False + + +def is_kernel_mode(): + try: + return bool(_internal_mode.kernel) + except AttributeError: + _internal_mode.kernel = None + return False + + +def is_build_mode(): + return bool(getattr(_internal_mode, "build", False)) + + +class _EnterModeFuncWrapper: + def __init__(self, mode_name_to_value): + self.mode_name_to_value = mode_name_to_value + + # as the wrapper may enter for many times + # record old values for each time + self.mode_name_to_value_list = list() + + def __enter__(self): + mode_name_to_old_value = dict() + for mode_name, value in self.mode_name_to_value.items(): + # record mode's old values + mode_name_to_old_value[mode_name] = getattr(_internal_mode, mode_name, None) + if value is None: + continue + # set value + setattr(_internal_mode, mode_name, value) + self.mode_name_to_value_list.append(mode_name_to_old_value) + + def __exit__(self, *_): + mode_name_to_old_value = self.mode_name_to_value_list.pop() + for mode_name in self.mode_name_to_value.keys(): + # set back old values + setattr(_internal_mode, mode_name, mode_name_to_old_value[mode_name]) + + def __call__(self, func): + mode_name_to_value = self.mode_name_to_value.copy() + if not inspect.iscoroutinefunction(func): + # sync + @functools.wraps(func) + def _inner(*args, **kwargs): + with enter_mode(**mode_name_to_value): + return func(*args, **kwargs) + + else: + # async + @functools.wraps(func) + async def _inner(*args, **kwargs): + with enter_mode(**mode_name_to_value): + return await func(*args, **kwargs) + + return _inner + + +def enter_mode(kernel=None, build=None): + mode_name_to_value = { + "kernel": kernel, + "build": build, + } + mode_name_to_value = {k: v for k, v in mode_name_to_value.items() if v is not None} + + return _EnterModeFuncWrapper(mode_name_to_value) diff --git a/python/xorbits/_mars/core/operand/__init__.py b/python/xorbits/_mars/core/operand/__init__.py new file mode 100644 index 000000000..7f86d5ace --- /dev/null +++ b/python/xorbits/_mars/core/operand/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
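`enter_mode` above works both as a context manager and as a decorator for sync or async functions, recording the previous thread-local flag values on a per-entry list so nested uses unwind correctly. A minimal usage sketch, assuming the package layout of this diff (`xorbits._mars.core.mode`):

from xorbits._mars.core.mode import enter_mode, is_build_mode, is_kernel_mode


@enter_mode(kernel=True)
def run_in_kernel():
    # inside the decorated call the kernel flag is set for this thread
    return is_kernel_mode()


assert run_in_kernel() is True
assert is_kernel_mode() is False  # restored after the call

with enter_mode(build=True):
    assert is_build_mode() is True
assert is_build_mode() is False  # restored on exit

Because the old values are pushed per entry, re-entering the same wrapper (for example through recursive decorated calls) restores the flags in LIFO order.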
+ +from .base import ( + HasInput, + LogicKeyGenerator, + Operand, + OperandStage, + OperatorLogicKeyGeneratorMixin, + VirtualOperand, +) +from .core import TileableOperandMixin, estimate_size, execute +from .fetch import Fetch, FetchMixin, FetchShuffle, ShuffleFetchType +from .fuse import Fuse, FuseChunkMixin +from .objects import ( + MergeDictOperand, + ObjectFetch, + ObjectFuseChunk, + ObjectFuseChunkMixin, + ObjectOperand, + ObjectOperandMixin, +) +from .shuffle import MapReduceOperand, ShuffleProxy diff --git a/python/xorbits/_mars/core/operand/base.py b/python/xorbits/_mars/core/operand/base.py new file mode 100644 index 000000000..e009d25cf --- /dev/null +++ b/python/xorbits/_mars/core/operand/base.py @@ -0,0 +1,387 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import weakref +from copy import deepcopy +from enum import Enum +from functools import partial +from typing import Any, Dict, List, Tuple, Type, Union + +from ...serialization.core import Placeholder +from ...serialization.serializables import ( + BoolField, + DictField, + FieldTypes, + Float32Field, + Int32Field, + ListField, + ReferenceField, + Serializable, + SerializableMeta, + StringField, + TupleField, +) +from ...serialization.serializables.core import SerializableSerializer +from ...typing import OperandType +from ...utils import AttributeDict, classproperty, tokenize +from ..base import Base +from ..entity.chunks import Chunk +from ..entity.core import ENTITY_TYPE, Entity, EntityData +from ..entity.output_types import OutputType +from ..entity.tileables import Tileable +from ..mode import enter_mode + + +class OperandMetaclass(SerializableMeta): + def __new__(mcs, name: str, bases: Tuple[Type], properties: Dict): + if "__call__" in properties: + # if __call__ is specified for an operand, + # make sure that entering user space + properties["__call__"] = enter_mode(kernel=False)(properties["__call__"]) + + return super().__new__(mcs, name, bases, properties) + + +class OperandStage(Enum): + map = 0 + reduce = 1 + combine = 2 + agg = 3 + + +class SchedulingHint(Serializable): + # worker to execute, only work for chunk op, + # if specified, the op should be executed on the specified worker + # only work for those operand that has no input + expect_worker = StringField("expect_worker", default=None) + # band to execute, only work for chunk op, + # if specified, the op should be executed on the specified band + # only work for those operand that has no input + expect_band = TupleField( + "expect_band", + FieldTypes.tuple(FieldTypes.string, FieldTypes.string), + default=None, + ) + # will this operand be assigned a worker or not + reassign_worker = BoolField("reassign_worker", default=False) + # mark a op as fuseable + fuseable = BoolField("fuseable", default=True) + # True means control dependency, False means data dependency + _pure_depends = ListField("pure_depends", FieldTypes.bool, default=None) + # useful when setting chunk index as priority, + # useful for 
those ops like read_csv: the first chunk + # needs to be executed no later than the later ones, + # because the range index of a later chunk is accumulated from + # the indexes of previous ones + # `gpu` indicates whether the operand should be executed on the GPU. + gpu = BoolField("gpu", default=None) + priority = Int32Field("priority", default=None) + + @classproperty + @functools.lru_cache(1) + def all_hint_names(cls): + return list(cls._FIELDS) + + def can_be_fused(self) -> bool: + if not self.fuseable: + return False + if self.reassign_worker: + return False + if self._pure_depends and any(depend for depend in self._pure_depends): + # control dependency exists + return False + return True + + + def _install_scheduling_hint_properties(cls: Type["Operand"]): + def get_hint(name): + def _get_val(operand: "Operand"): + if operand.scheduling_hint: + return getattr(operand.scheduling_hint, name) + + def _set_val(operand: "Operand", val: Any): + if not operand.scheduling_hint: + operand.scheduling_hint = SchedulingHint(**{name: val}) + else: + setattr(operand.scheduling_hint, name, val) + + return property(_get_val, _set_val) + + for hint_name in SchedulingHint.all_hint_names: + setattr(cls, hint_name, get_hint(hint_name)) + return cls + + + class OperatorLogicKeyGeneratorMixin: + """ + This generator generates a unique and deterministic key for an operator's compute logic. The key stays the + same across different runs if the compute logic doesn't change. It is used in subtask speculative + execution, HBO scheduling and so on. + """ + + def get_logic_key(self): + """Subclasses may need to override this method to ensure the key is unique and deterministic.""" + fields = self._get_logic_key_token_values() + try: + return tokenize(*fields) + except Exception as e: # pragma: no cover + raise ValueError( + f"Cannot generate logic key for operator {self} with fields {fields}" + ) from e + + def _get_logic_key_token_values(self): + token_values = [type(self).__module__, type(self).__name__] + if self.stage is not None: + token_values.append(self.stage.name) + return token_values + + + class LogicKeyGenerator: + def __init__(self): + self.operator_id_to_logic_key = {} + + def get_logic_key(self, op: "Operand"): + assert isinstance(op, Operand) + logic_key = self.operator_id_to_logic_key.get(op.id) + if logic_key is None: + logic_key = self.operator_id_to_logic_key[op.id] = op.get_logic_key() + return logic_key + + + @_install_scheduling_hint_properties + class Operand(Base, OperatorLogicKeyGeneratorMixin, metaclass=OperandMetaclass): + """ + Operand base class. All operands should have a type, which can be Add, Subtract etc. + `sparse` indicates whether the operand is applied on a sparse tensor/chunk. + `device`: 0 means the CPU, otherwise it refers to a GPU device. + Operands can have inputs and outputs, + which should be :class:`mars.tensor.core.TensorData`, :class:`mars.tensor.core.ChunkData` etc.
+ """ + + attr_tag = "attr" + _init_update_key_ = False + _output_type_ = None + _no_copy_attrs_ = Base._no_copy_attrs_ | {"scheduling_hint"} + _cache_primitive_serial = True + + sparse = BoolField("sparse", default=False) + device = Int32Field("device", default=None) + # will this operand create a view of input data or not + create_view = BoolField("create_view", default=False) + stage = ReferenceField("stage", OperandStage, default=None) + memory_scale = Float32Field("memory_scale", default=None) + tileable_op_key = StringField("tileable_op_key", default=None) + extra_params = DictField("extra_params", key_type=FieldTypes.string) + # scheduling hint + scheduling_hint = ReferenceField("scheduling_hint", SchedulingHint, default=None) + + _inputs = ListField( + "inputs", FieldTypes.reference(EntityData), default_factory=list + ) + # outputs are weak-refs which are not pickle-able + _outputs = ListField( + "outputs", default=None, on_serialize=lambda outputs: [o() for o in outputs] + ) + _output_types = ListField( + "output_type", FieldTypes.reference(OutputType), default=None + ) + + def __init__(self: OperandType, *args, **kwargs): + self._parse_kwargs(kwargs) + super().__init__(*args, **kwargs) + + @classmethod + def _parse_kwargs(cls, kwargs: Dict[str, Any]): + extra_params = kwargs.pop("extra_params", {}) + kwargs["extra_params"] = extras = AttributeDict(extra_params) + kwargs["scheduling_hint"] = scheduling_hint = kwargs.get( + "scheduling_hint", SchedulingHint() + ) + for k in set(kwargs): + if k in cls._FIELDS: + continue + elif k in SchedulingHint.all_hint_names: + setattr(scheduling_hint, k, kwargs.pop(k)) + else: + extras[k] = kwargs.pop(k) + + def __repr__(self): + if self.stage is None: + return f"{type(self).__name__} " + else: + return f"{type(self).__name__} " + + @classmethod + def _get_entity_data(cls, entity): + if isinstance(entity, Entity): + return entity.data + return entity + + @classmethod + def _get_inputs_data(cls, inputs): + return [cls._get_entity_data(inp) for inp in inputs] + + def _set_inputs(self, inputs): + if inputs is not None: + inputs = self._get_inputs_data(inputs) + if hasattr(self, "check_inputs"): + self.check_inputs(inputs) + setattr(self, "_inputs", inputs) + + @property + def inputs(self) -> List[Union[ENTITY_TYPE]]: + inputs = self._inputs + if inputs is None: + inputs = self._inputs = [] + return inputs + + @inputs.setter + def inputs(self, vals): + self._set_inputs(vals) + + @property + def output_limit(self) -> int: + return 1 + + @property + def pure_depends(self): + val = self._pure_depends # pylint: disable=access-member-before-definition + if not val: + val = self._pure_depends = [False] * len(self.inputs or ()) + return val + + @property + def output_types(self): + return self._output_types + + @output_types.setter + def output_types(self, value): + self._output_types = value + + def _attach_outputs(self, *outputs): + self._outputs = [ + weakref.ref(self._get_entity_data(o)) if o is not None else o + for o in outputs + ] + + if len(self._outputs) > self.output_limit: + raise ValueError("Outputs' size exceeds limitation") + + @property + def outputs(self) -> List[Union[Chunk, Tileable]]: + outputs = self._outputs + if outputs: + return [ref() for ref in outputs] + + @outputs.setter + def outputs(self, outputs): + self._attach_outputs(*outputs) + + def is_sparse(self) -> bool: + return self.sparse + + issparse = is_sparse + + def is_gpu(self) -> bool: + return self.gpu + + @property + def retryable(self) -> bool: + return True + + def 
get_dependent_data_keys(self): + return [dep.key for dep in self.inputs or ()] + + def _get_output_type(self, output_idx): + if self.output_types: + try: + return self.output_types[output_idx] + except IndexError: + return self.output_types[0] + else: + return self._output_type_ + + def copy(self: OperandType) -> OperandType: + new_op = super().copy() + new_op.outputs = [] + # copy scheduling_hint + new_op.scheduling_hint = self.scheduling_hint.copy() + extra_params = self.extra_params + if extra_params: + new_op.extra_params = deepcopy(extra_params) + return new_op + + def on_output_modify(self, new_output): + # when `create_view` is True, if the output is modified, + # the modification should be set back to the input. + # This function is for this sort of usage. + # Remember, if `create_view` is False, this function should take no effect. + raise NotImplementedError + + def on_input_modify(self, new_input): + # when `create_view` is True, if the input is modified, + # this function could be used to respond the modification. + # Remember, if `create_view` is False, this function should take no effect. + raise NotImplementedError + + +class OperandSerializer(SerializableSerializer): + def serial(self, obj: Serializable, context: Dict): + res = super().serial(obj, context) + return res + + def deserial(self, serialized: Tuple, context: Dict, subs: List) -> Operand: + # convert outputs back to weak-refs + operand: Operand = super().deserial(serialized, context, subs) + for i, out in enumerate(operand._outputs): + + def cb(o, index): + outputs = operand._outputs + outputs[index] = weakref.ref(o) + + if len(outputs) > 1 and all( + not isinstance(o, Placeholder) for o in outputs + ): + # all replaced + # add siblings for multiple outputs + outputs = operand.outputs + for j in range(len(outputs)): + outputs[j]._siblings = outputs[:j] + outputs[j + 1 :] + + if isinstance(out, Placeholder): + out.callbacks.append(partial(cb, index=i)) + else: + cb(out, i) + return operand + + +OperandSerializer.register(Operand) + + +class VirtualOperand(Operand): + def get_dependent_data_keys(self): + return [] + + +class HasInput(Operand): + __slots__ = () + + @property + def input(self): + return self._input + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] diff --git a/python/xorbits/_mars/core/operand/core.py b/python/xorbits/_mars/core/operand/core.py new file mode 100644 index 000000000..3a05a55e8 --- /dev/null +++ b/python/xorbits/_mars/core/operand/core.py @@ -0,0 +1,529 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
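`_install_scheduling_hint_properties` in base.py above is a small property factory: for every `SchedulingHint` field it installs a property on `Operand` that reads through, and lazily creates, the operand's `scheduling_hint`, which is why `op.priority = 3` or `op.gpu = True` works without touching the hint object directly. A stripped-down sketch of that pattern with hypothetical `Hint`/`Op` classes (plain attributes instead of serializable fields, illustrative only):

class Hint:
    def __init__(self):
        self.priority = None
        self.gpu = None


def install_hint_properties(cls, names=("priority", "gpu")):
    def make_property(name):
        def _get(op):
            # read through the hint object when present
            if op.hint is not None:
                return getattr(op.hint, name)

        def _set(op, value):
            # create the hint lazily on first assignment
            if op.hint is None:
                op.hint = Hint()
            setattr(op.hint, name, value)

        return property(_get, _set)

    for name in names:
        setattr(cls, name, make_property(name))
    return cls


@install_hint_properties
class Op:
    def __init__(self):
        self.hint = None


op = Op()
op.priority = 3  # routed into the Hint object
assert op.hint.priority == 3 and op.priority == 3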
+ +import sys +from typing import Any, Callable, Dict, List, Tuple, Type, Union + +import numpy as np + +try: + from numpy.core._exceptions import UFuncTypeError +except ImportError: # pragma: no cover + UFuncTypeError = None + +from ...metrics import Metrics +from ...typing import ChunkType, OperandType, TileableType +from ...utils import calc_data_size +from ..context import Context +from ..entity import ( + ExecutableTuple, + OutputType, + get_chunk_types, + get_fetch_class, + get_output_types, + get_tileable_types, +) +from ..mode import is_eager_mode + +_op_type_to_executor: Dict[Type[OperandType], Callable] = dict() +_op_type_to_size_estimator: Dict[Type[OperandType], Callable] = dict() + + +op_executed_number = Metrics.counter( + "mars.operand.executed_number", "The number of executed operands.", ("op",) +) + + +class TileableOperandMixin: + __slots__ = () + + def check_inputs(self, inputs: List[TileableType]): + if not inputs: + return + + for inp in inputs: + if inp is not None and inp._need_execution(): + raise ValueError( + f"{inp} has unknown dtypes, " + f"it must be executed first before {str(type(self))}" + ) + + @classmethod + def _check_if_gpu(cls, inputs: List[TileableType]): + if not inputs: + return None + true_num = 0 + for inp in inputs: + op = getattr(inp, "op", None) + if op is None or op.gpu is None: + return None + true_num += int(op.gpu) + if true_num == len(inputs): + return True + elif true_num == 0: + return False + return None + + def _tokenize_output(self, output_idx: int, **kw): + return f"{self._key}_{output_idx}" + + def _create_chunk(self, output_idx: int, index: Tuple[int], **kw) -> ChunkType: + output_type = kw.pop("output_type", None) or self._get_output_type(output_idx) + if not output_type: + raise ValueError("output_type should be specified") + + if isinstance(output_type, (list, tuple)): + output_type = output_type[output_idx] + chunk_type, chunk_data_type = get_chunk_types(output_type) + kw["_i"] = output_idx + kw["op"] = self + kw["index"] = index + if output_type == OutputType.scalar: + # tensor + kw["order"] = "C_ORDER" + + # key of output chunks may only contain keys for its output ids + if "_key" not in kw: + kw["_key"] = self._tokenize_output(output_idx, **kw) + + data = chunk_data_type(**kw) + return chunk_type(data) + + def _new_chunks( + self, inputs: List[ChunkType], kws: List[Dict] = None, **kw + ) -> List[ChunkType]: + output_limit = kw.pop("output_limit", None) + if output_limit is None: + output_limit = self.output_limit + if isinstance(output_limit, float) and kws: + output_limit = len(kws) + + self.check_inputs(inputs) + self._set_inputs(inputs) + if self.gpu is None: + self.gpu = self._check_if_gpu(self._inputs) + if self._key is None: + self._update_key() + + chunks = [] + for j in range(output_limit): + create_chunk_kw = kw.copy() + if kws: + create_chunk_kw.update(kws[j]) + index = create_chunk_kw.pop("index", None) + chunk = self._create_chunk(j, index, **create_chunk_kw) + chunks.append(chunk) + + self.outputs = chunks + if len(chunks) > 1: + # for each output chunk, hold the reference to the other outputs + # so that either no one or everyone are gc collected + for j, t in enumerate(chunks): + t.data._siblings = [c.data for c in chunks[:j] + chunks[j + 1 :]] + return chunks + + def new_chunks( + self, inputs: List[ChunkType], kws: List[Dict] = None, **kwargs + ) -> List[ChunkType]: + """ + Create chunks. 
+ + A chunk is a node in a fine grained graph, all the chunk objects are created by + calling this function, it happens mostly in tiles. + The generated chunks will be set as this operand's outputs and each chunk will + hold this operand as it's op. + + Parameters + ---------- + inputs : list + Input chunks. + kws : dict + Kwargs for each output. + kwargs : dict + common kwargs for all outputs + + Returns + ------- + chunks : list + Output chunks. + + .. note:: + It's a final method, do not override. + Override the method `_new_chunks` if needed. + """ + return self._new_chunks(inputs, kws=kws, **kwargs) + + def new_chunk( + self, inputs: List[ChunkType], kws: List[Dict] = None, **kw + ) -> ChunkType: + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new chunk with more than 1 outputs") + + return self.new_chunks(inputs, kws=kws, **kw)[0] + + @staticmethod + def _fill_nan_shape(kw: dict): + nsplits = kw.get("nsplits") + shape = kw.get("shape") + if nsplits is not None and shape is not None: + nsplits = tuple(nsplits) + shape = list(shape) + for idx, (s, sp) in enumerate(zip(shape, nsplits)): + if not np.isnan(s): + continue + s = sum(sp) + if not np.isnan(s): + shape[idx] = s + kw["shape"] = tuple(shape) + kw["nsplits"] = nsplits + return kw + + def _create_tileable(self, output_idx: int, **kw) -> TileableType: + output_type = kw.pop("output_type", self._get_output_type(output_idx)) + if output_type is None: + raise ValueError("output_type should be specified") + + if isinstance(output_type, (list, tuple)): + output_type = output_type[output_idx] + + tileable_type, tileable_data_type = get_tileable_types(output_type) + kw["_i"] = output_idx + kw["op"] = self + if output_type == OutputType.scalar: + # tensor + kw["order"] = "C_ORDER" + + kw = self._fill_nan_shape(kw) + + # key of output chunks may only contain keys for its output ids + if "_key" not in kw: + kw["_key"] = self._tokenize_output(output_idx, **kw) + + data = tileable_data_type(**kw) + return tileable_type(data) + + def _new_tileables( + self, inputs: List[TileableType], kws: List[dict] = None, **kw + ) -> List[TileableType]: + output_limit = kw.pop("output_limit", None) + if output_limit is None: + output_limit = getattr(self, "output_limit") + + self._set_inputs(inputs) + if self.gpu is None: + self.gpu = self._check_if_gpu(self._inputs) + if getattr(self, "_key", None) is None: + self._update_key() # update key when inputs are set + + tileables = [] + for j in range(output_limit): + create_tensor_kw = kw.copy() + if kws: + create_tensor_kw.update(kws[j]) + tileable = self._create_tileable(j, **create_tensor_kw) + tileables.append(tileable) + + self.outputs = tileables + if len(tileables) > 1: + # for each output tileable, hold the reference to the other outputs + # so that either no one or everyone are gc collected + for j, t in enumerate(tileables): + t.data._siblings = [ + tileable.data for tileable in tileables[:j] + tileables[j + 1 :] + ] + return tileables + + def new_tileables( + self, inputs: List[TileableType], kws: List[dict] = None, **kw + ) -> List[TileableType]: + """ + Create tileable objects(Tensors or DataFrames). + + This is a base function for create tileable objects like tensors or dataframes, + it will be called inside the `new_tensors` and `new_dataframes`. + If eager mode is on, it will trigger the execution after tileable objects are created. + + Parameters + ---------- + inputs : list + Input tileables + kws : List[dict] + Kwargs for each output. 
+ kw : dict + Common kwargs for all outputs. + + Returns + ------- + tileables : list + Output tileables. + + .. note:: + It's a final method, do not override. + Override the method `_new_tileables` if needed. + """ + tileables = self._new_tileables(inputs, kws=kws, **kw) + if is_eager_mode(): + ExecutableTuple(tileables).execute() + return tileables + + def new_tileable( + self, inputs: List[TileableType], kws: List[Dict] = None, **kw + ) -> TileableType: + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new chunk with more than 1 outputs") + + return self.new_tileables(inputs, kws=kws, **kw)[0] + + @classmethod + def pre_tile(cls, op: OperandType): + """ + Operation before tile. + + Parameters + ---------- + op : OperandType + Operand to tile + """ + + @classmethod + def tile(cls, op: OperandType): + raise NotImplementedError + + @classmethod + def post_tile(cls, op: OperandType, results: List[TileableType]): + """ + Operation after tile. + + Parameters + ---------- + op : OperandType + Operand to tile. + results: list + List of tiled results. + """ + + @classmethod + def pre_execute(cls, ctx: Union[dict, Context], op: OperandType): + """ + Operation before execute. + + Parameters + ---------- + ctx : dict + Data store. + op : OperandType + Operand to execute. + """ + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: OperandType): + raise NotImplementedError + + @classmethod + def post_execute(cls, ctx: Union[dict, Context], op: OperandType): + """ + Operand before execute. + + Parameters + ---------- + ctx : dict + Data store + op : OperandType + Operand to execute. + """ + + @classmethod + def estimate_size(cls, ctx: dict, op: OperandType): + from .fetch import FetchShuffle + + # when sizes of all outputs are deterministic, return directly + outputs = op.outputs + if all( + not c.is_sparse() and hasattr(c, "nbytes") and not np.isnan(c.nbytes) + for c in outputs + ): + for out in outputs: + ctx[out.key] = (out.nbytes, out.nbytes) + return + + pure_dep_keys = set( + inp.key + for inp, is_dep in zip(op.inputs or (), op.pure_depends or ()) + if is_dep + ) + exec_sizes = [0] + for inp in op.inputs or (): + if inp.key in pure_dep_keys: + continue + try: + if isinstance(inp.op, FetchShuffle): + keys_and_shapes = inp.extra_params.get("_shapes", dict()).items() + else: + keys_and_shapes = [(inp.key, getattr(inp, "shape", None))] + + # execution size of a specific data chunk may be + # larger than stored type due to objects + for key, shape in keys_and_shapes: + exec_sizes.append(ctx[key][0]) + except KeyError: + if not op.sparse: + inp_size = calc_data_size(inp) + if not np.isnan(inp_size): + exec_sizes.append(inp_size) + if any(c.is_sparse() for c in op.inputs): + exec_size = sum(exec_sizes) + else: + exec_size = max(exec_sizes) + + total_out_size = 0 + chunk_sizes = dict() + for out in outputs: + try: + if not out.is_sparse(): + chunk_size = calc_data_size(out) + else: + chunk_size = exec_size + if np.isnan(chunk_size): + raise TypeError + chunk_sizes[out.key] = chunk_size + total_out_size += chunk_size + except (AttributeError, TypeError, ValueError): + pass + + exec_size = max(exec_size, total_out_size) + memory_scale = op.memory_scale or 1.0 + for out in outputs: + if out.key in ctx: + continue + if out.key in chunk_sizes: + result_size = chunk_sizes[out.key] + else: + result_size = max( + exec_size // len(outputs), + total_out_size // max(len(chunk_sizes), 1), + ) + try: + if getattr(out, "dtype", None) is not None and out.is_sparse(): + max_sparse_size = ( 
+ out.nbytes + + np.dtype(np.int64).itemsize * np.prod(out.shape) * out.ndim + ) + else: + max_sparse_size = np.nan + except TypeError: # pragma: no cover + max_sparse_size = np.nan + if not np.isnan(max_sparse_size): + result_size = min(result_size, max_sparse_size) + ctx[out.key] = (result_size, int(exec_size * memory_scale // len(outputs))) + + @classmethod + def concat_tileable_chunks(cls, tileable: TileableType): + raise NotImplementedError + + @classmethod + def create_tileable_from_chunks( + cls, chunks: List[ChunkType], inputs: List[TileableType] = None, **kw + ) -> TileableType: + raise NotImplementedError + + def get_fetch_op_cls(self, obj: ChunkType): + from .shuffle import ShuffleProxy + + output_types = get_output_types(obj, unknown_as=OutputType.object) + fetch_cls, fetch_shuffle_cls = get_fetch_class(output_types[0]) + if isinstance(self, ShuffleProxy): + cls = fetch_shuffle_cls + else: + cls = fetch_cls + + def _inner(**kw): + return cls(output_types=output_types, **kw) + + return _inner + + def get_fuse_op_cls(self, obj: ChunkType): + raise NotImplementedError + + @classmethod + def register_executor(cls, executor: Callable): + _op_type_to_executor[cls] = executor + + @classmethod + def unregister_executor(cls): + del _op_type_to_executor[cls] + + @classmethod + def register_size_estimator(cls, size_estimator: Callable): + _op_type_to_size_estimator[cls] = size_estimator + + @classmethod + def unregister_size_estimator(cls): + del _op_type_to_size_estimator[cls] + + +def execute(results: Dict[str, Any], op: OperandType): + try: + executor = _op_type_to_executor[type(op)] + except KeyError: + executor = type(op).execute + + # pre execute + op.pre_execute(results, op) + succeeded = False + try: + if UFuncTypeError is None: # pragma: no cover + return executor(results, op) + else: + # Cast `UFuncTypeError` to `TypeError` since subclasses of the former is unpickleable. + # The `UFuncTypeError` was introduced by numpy#12593 since v1.17.0. + try: + result = executor(results, op) + succeeded = True + if op.stage is not None: + op_name = f"{op.__class__.__name__}:{op.stage.name}" + else: + op_name = op.__class__.__name__ + op_executed_number.record(1, {"op": op_name}) + return result + except UFuncTypeError as e: # pragma: no cover + raise TypeError(str(e)).with_traceback(sys.exc_info()[2]) from None + except NotImplementedError: + for op_cls in type(op).__mro__: + if op_cls in _op_type_to_executor: + executor = _op_type_to_executor[op_cls] + _op_type_to_executor[type(op)] = executor + result = executor(results, op) + succeeded = True + return result + raise KeyError(f"No handler found for op: {op}") + finally: + if succeeded: + op.post_execute(results, op) + + +def estimate_size(results: Dict[str, Any], op: OperandType): + try: + size_estimator = _op_type_to_size_estimator[type(op)] + except KeyError: + size_estimator = type(op).estimate_size + + try: + return size_estimator(results, op) + except NotImplementedError: + for op_cls in type(op).__mro__: + if op_cls in _op_type_to_size_estimator: + size_estimator = _op_type_to_size_estimator[op_cls] + _op_type_to_size_estimator[type(op)] = size_estimator + return size_estimator(results, op) + raise KeyError(f"No handler found for op: {op} to estimate size") diff --git a/python/xorbits/_mars/core/operand/fetch.py b/python/xorbits/_mars/core/operand/fetch.py new file mode 100644 index 000000000..1e06f0922 --- /dev/null +++ b/python/xorbits/_mars/core/operand/fetch.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. 
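The module-level `execute()` in core.py above resolves an operand's executor from `_op_type_to_executor`, falls back to the operand class's own `execute`, and on `NotImplementedError` walks `type(op).__mro__` to find and cache a handler registered for a base class. A self-contained sketch of that dispatch-and-fallback idea with a toy registry (simplified signatures that drop the `results`/`ctx` mapping, not the mars API):

_registry = {}


def register_executor(op_cls, func):
    _registry[op_cls] = func


def run(op):
    executor = _registry.get(type(op), type(op).execute)
    try:
        return executor(op)
    except NotImplementedError:
        # fall back to a handler registered for a base class and cache it
        for op_cls in type(op).__mro__:
            if op_cls in _registry:
                _registry[type(op)] = _registry[op_cls]
                return _registry[op_cls](op)
        raise KeyError(f"No handler found for op: {op!r}")


class BaseOp:
    @staticmethod
    def execute(op):
        raise NotImplementedError


class AddOne(BaseOp):
    pass


register_executor(BaseOp, lambda op: "handled by BaseOp executor")
assert run(AddOne()) == "handled by BaseOp executor"
assert AddOne in _registry  # cached for subsequent dispatches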
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import enum + +from ... import opcodes +from ...serialization.serializables import ( + FieldTypes, + Int32Field, + ListField, + ReferenceField, + StringField, +) +from .base import Operand +from .core import TileableOperandMixin + + +class Fetch(Operand): + _op_type_ = opcodes.FETCH + + source_key = StringField("source_key", default=None) + + +class FetchMixin(TileableOperandMixin): + def check_inputs(self, inputs): + # no inputs + if inputs and len(inputs) > 0: + raise ValueError(f"{type(self).__name__} has no inputs") + + @classmethod + def tile(cls, op): + raise NotImplementedError("Fetch tile cannot be handled by operand itself") + + @classmethod + def execute(cls, ctx, op): + """ + Fetch operand needs nothing to do. + """ + + +class FetchShuffle(Operand): + _op_type_ = opcodes.FETCH_SHUFFLE + + source_keys = ListField("source_keys", FieldTypes.string) + n_mappers = Int32Field("n_mappers") + n_reducers = Int32Field("n_reducers") + shuffle_fetch_type = ReferenceField("shuffle_fetch_type") + + +class ShuffleFetchType(enum.Enum): + FETCH_BY_KEY = 0 + FETCH_BY_INDEX = 1 diff --git a/python/xorbits/_mars/core/operand/fuse.py b/python/xorbits/_mars/core/operand/fuse.py new file mode 100644 index 000000000..d1d78e65a --- /dev/null +++ b/python/xorbits/_mars/core/operand/fuse.py @@ -0,0 +1,38 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes +from ...serialization.serializables import ReferenceField +from ..entity import FuseChunk, FuseChunkData, NotSupportTile +from ..graph import ChunkGraph +from .base import Operand + + +class Fuse(Operand): + __slots__ = ("_fuse_graph",) + _op_type_ = opcodes.FUSE + + fuse_graph = ReferenceField("fuse_graph", ChunkGraph) + + +class FuseChunkMixin: + __slots__ = () + + def _create_chunk(self, output_idx, index, **kw): + data = FuseChunkData(_index=index, _op=self, **kw) + return FuseChunk(data) + + @classmethod + def tile(cls, op): + raise NotSupportTile("FuseChunk is a chunk operand which does not support tile") diff --git a/python/xorbits/_mars/core/operand/objects.py b/python/xorbits/_mars/core/operand/objects.py new file mode 100644 index 000000000..091464c48 --- /dev/null +++ b/python/xorbits/_mars/core/operand/objects.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...serialization.serializables import BoolField +from ..entity import OutputType, register_fetch_class +from .base import Operand +from .core import TileableOperandMixin +from .fetch import Fetch, FetchMixin +from .fuse import Fuse, FuseChunkMixin + + +class ObjectOperand(Operand): + pass + + +class ObjectOperandMixin(TileableOperandMixin): + _output_type_ = OutputType.object + + def get_fuse_op_cls(self, obj): + return ObjectFuseChunk + + +class ObjectFuseChunkMixin(FuseChunkMixin, ObjectOperandMixin): + __slots__ = () + + +class ObjectFuseChunk(ObjectFuseChunkMixin, Fuse): + pass + + +class ObjectFetch(FetchMixin, ObjectOperandMixin, Fetch): + _output_type_ = OutputType.object + + def __init__(self, **kw): + kw.pop("output_types", None) + kw.pop("_output_types", None) + super().__init__(**kw) + + def _new_chunks(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + return super()._new_chunks(inputs, kws=kws, **kw) + + def _new_tileables(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + return super()._new_tileables(inputs, kws=kws, **kw) + + +register_fetch_class(OutputType.object, ObjectFetch, None) + + +class MergeDictOperand(ObjectOperand, ObjectOperandMixin): + _merge = BoolField("merge") + + def __init__(self, merge=None, **kw): + super().__init__(_merge=merge, **kw) + + @property + def merge(self): + return self._merge + + @classmethod + def concat_tileable_chunks(cls, tileable): + assert not tileable.is_coarse() + + op = cls(merge=True) + chunk = cls(merge=True).new_chunk(tileable.chunks) + return op.new_tileable([tileable], chunks=[chunk], nsplits=((1,),)) + + @classmethod + def execute(cls, ctx, op): + assert op.merge + inputs = [ctx[inp.key] for inp in op.inputs] + ctx[op.outputs[0].key] = next(inp for inp in inputs if inp) diff --git a/python/xorbits/_mars/core/operand/shuffle.py b/python/xorbits/_mars/core/operand/shuffle.py new file mode 100644 index 000000000..8c4404657 --- /dev/null +++ b/python/xorbits/_mars/core/operand/shuffle.py @@ -0,0 +1,130 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes +from ...serialization.serializables import ( + FieldTypes, + Int32Field, + StringField, + TupleField, +) +from . 
import FetchShuffle, ShuffleFetchType + from .base import Operand, OperandStage, VirtualOperand + + + class ShuffleProxy(VirtualOperand): + _op_type_ = opcodes.SHUFFLE_PROXY + n_mappers = Int32Field("n_mappers", default=0) + # `n_reducers` will be updated in `MapReduceOperand._new_chunks` + n_reducers = Int32Field("n_reducers", default=0) + + def _new_chunks(self, inputs, kws=None, **kw): + self.n_mappers = len(inputs) + return super()._new_chunks(inputs, kws, **kw) + + + class MapReduceOperand(Operand): + """ + An operand for shuffle execution which partitions data by the value in each record's partition key, and + sends the partitioned data from all mappers to all reducers. + """ + + # for reducer + reducer_index = TupleField("reducer_index", FieldTypes.uint64) + # Total number of reducers, which is also the number of shuffle blocks per mapper. + n_reducers = Int32Field("n_reducers") + # The reducer ordinal in all reducers. It's different from reducer_index, + # which might be a tuple. + # `reducer_ordinal` will be set in `_new_chunks`. + reducer_ordinal = Int32Field("reducer_ordinal") + reducer_phase = StringField("reducer_phase", default=None) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.stage == OperandStage.reduce: + # for reducers, the worker is assigned up front + self.scheduling_hint.reassign_worker = True + + def _new_chunks(self, inputs, kws=None, **kw): + if getattr(self, "reducer_index", None) is None: + index = None + if kws: + index = kws[0].get("index") + self.reducer_index = index or kw.get("index") + if self.stage == OperandStage.reduce: + # Operands such as `TensorIndexSetValue` may have multiple inputs, some of which won't be proxy chunks + proxy_operands = [c.op for c in inputs if isinstance(c.op, ShuffleProxy)] + if proxy_operands: + # For reduce chunks created with `FetchShuffle`, `proxy_operands` will be empty. + proxy = proxy_operands[0] + self.reducer_ordinal = proxy.n_reducers + proxy.n_reducers += 1 + return super()._new_chunks(inputs, kws, **kw) + + def get_dependent_data_keys(self): + from .fetch import FetchShuffle + + if self.stage == OperandStage.reduce: + inputs = self.inputs or () + deps = [] + for inp in inputs: + if isinstance(inp.op, ShuffleProxy): + deps.extend( + [(chunk.key, self.reducer_index) for chunk in inp.inputs or ()] + ) + elif isinstance(inp.op, FetchShuffle): + # fetch shuffle by index doesn't store data keys, so it won't run into this function. + assert inp.op.shuffle_fetch_type == ShuffleFetchType.FETCH_BY_KEY + deps.extend([(k, self.reducer_index) for k in inp.op.source_keys]) + else: + deps.append(inp.key) + return deps + return super().get_dependent_data_keys() + + def iter_mapper_keys(self, input_id=0): + # key is mapper chunk key, index is mapper chunk index. + input_chunk = self.inputs[input_id] + if isinstance(input_chunk.op, ShuffleProxy): + keys = [inp.key for inp in input_chunk.inputs] + else: + assert isinstance(input_chunk.op, FetchShuffle), input_chunk.op + if input_chunk.op.shuffle_fetch_type == ShuffleFetchType.FETCH_BY_INDEX: + # For fetch shuffle by index, all shuffle blocks of the same reducer are + # identified by their index, so chunk keys are not needed any more + # and mock keys are used here. + # Keep this in sync with the ray executor `execute_subtask`.
+                return list(range(input_chunk.op.n_mappers))
+            keys = input_chunk.op.source_keys
+        return keys
+
+    def iter_mapper_data(self, ctx, input_id=0, pop=False, skip_none=False):
+        for key in self.iter_mapper_keys(input_id):
+            try:
+                if pop:
+                    yield ctx.pop((key, self.reducer_index))
+                else:
+                    yield ctx[key, self.reducer_index]
+            except KeyError:
+                if not skip_none:  # pragma: no cover
+                    raise
+                if not pop:
+                    ctx[key, self.reducer_index] = None
+
+    def execute(self, ctx, op):
+        """The mapper stage must ensure that all mapper blocks are inserted into ctx
+        and that no reducer's blocks are missing. This is required by shuffle fetch
+        by index, where shuffle blocks are identified by their index instead of by
+        data keys. To keep operand implementations simple, we can sort `ctx` by key,
+        which is a (chunk key, reducer index) tuple, and relax the insertion order
+        requirements.
+        """
diff --git a/python/xorbits/_mars/core/operand/tests/__init__.py b/python/xorbits/_mars/core/operand/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/core/operand/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/core/operand/tests/test_core.py b/python/xorbits/_mars/core/operand/tests/test_core.py
new file mode 100644
index 000000000..736b84faf
--- /dev/null
+++ b/python/xorbits/_mars/core/operand/tests/test_core.py
@@ -0,0 +1,151 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from ....dataframe import core  # noqa: F401  # pylint: disable=unused-variable
+from ... import OutputType
+from ..
import Operand, ShuffleProxy, TileableOperandMixin, estimate_size, execute + + +class MyOperand(Operand, TileableOperandMixin): + @classmethod + def execute(cls, ctx, op): + return 1 + + @classmethod + def estimate_size(cls, ctx, op): + return 1 + + +class MyOperand2(MyOperand): + @classmethod + def execute(cls, ctx, op): + raise NotImplementedError + + @classmethod + def estimate_size(cls, ctx, op): + raise NotImplementedError + + +class _OperandMixin(TileableOperandMixin): + @classmethod + def tile(cls, op): + out = op.outputs[0] + params = out.params.copy() + params["index"] = (0,) * out.ndim + chunk = op.copy().reset_key().new_chunk(None, kws=[params]) + new_params = out.params.copy() + new_params["chunks"] = [chunk] + new_params["nsplits"] = () + return op.copy().new_tileables(op.inputs, kws=[new_params]) + + +class MyOperand3(Operand, _OperandMixin): + @classmethod + def execute(cls, ctx, op): + raise ValueError("intend to fail") + + @classmethod + def post_execute(cls, ctx, op): # pragma: no cover + ctx[op.outputs[0].key] += 1 + + +class MyOperand4(Operand, _OperandMixin): + @classmethod + def post_execute(cls, ctx, op): + ctx[op.outputs[0].key] += 1 + + +class MyOperand5(MyOperand4): + pass + + +def test_execute(): + op = MyOperand(extra_params={"my_extra_params": 1}) + assert op.extra_params["my_extra_params"] == 1 + MyOperand.register_executor(lambda *_: 2) + assert execute(dict(), MyOperand(_key="1")) == 2 + assert execute(dict(), MyOperand2(_key="1")) == 2 + + MyOperand.unregister_executor() + assert execute(dict(), MyOperand(_key="1")) == 1 + MyOperand2.unregister_executor() + with pytest.raises(KeyError): + execute(dict(), MyOperand2(_key="1")) + + +def test_estimate_size(): + MyOperand.register_size_estimator(lambda *_: 2) + assert estimate_size(dict(), MyOperand(_key="1")) == 2 + assert estimate_size(dict(), MyOperand2(_key="1")) == 2 + + MyOperand.unregister_size_estimator() + assert estimate_size(dict(), MyOperand(_key="1")) == 1 + MyOperand2.unregister_size_estimator() + with pytest.raises(KeyError): + estimate_size(dict(), MyOperand2(_key="1")) + + +def test_unknown_dtypes(): + op = MyOperand(_output_types=[OutputType.dataframe]) + df = op.new_tileable(None, dtypes=None) + op2 = MyOperand(_output_types=[OutputType.scalar]) + with pytest.raises(ValueError) as exc_info: + op2.new_tileable([df]) + assert "executed first" in exc_info.value.args[0] + + +def test_post_execute(setup): + op = MyOperand3(_output_types=[OutputType.tensor]) + t = op.new_tileable(None, dtype=np.dtype(float), shape=()) + with pytest.raises(ValueError, match="intend to fail"): + t.execute() + + op = MyOperand5(_output_types=[OutputType.tensor]) + t2 = op.new_tileable(None, dtype=np.dtype(float), shape=()) + + def execute_error(*_): + raise ValueError("intend to fail again") + + with pytest.raises(ValueError, match="intend to fail again"): + operand_executors = {MyOperand4: execute_error} + t2.execute(extra_config={"operand_executors": operand_executors}).fetch() + + def execute_normally(ctx, op): + ctx[op.outputs[0].key] = 1 + + operand_executors = {MyOperand5: execute_normally} + assert ( + t2.execute(extra_config={"operand_executors": operand_executors}).fetch() == 2 + ) + + +def test_shuffle(setup): + from ....dataframe import DataFrame + + chunk_size, n_rows = 10, 100 + df = DataFrame( + pd.DataFrame(np.random.rand(n_rows, 3), columns=list("abc")), + chunk_size=chunk_size, + ) + chunk_graph = df.groupby(["a"]).apply(lambda x: x).build_graph(tile=True) + [proxy_chunk] = [c for c in chunk_graph if 
isinstance(c.op, ShuffleProxy)] + successors = chunk_graph.successors(proxy_chunk) + n_reducers = successors[0].op.n_reducers + assert n_reducers == len(successors), (n_reducers, len(successors)) + assert len(set(c.op.n_reducers for c in successors)) == 1 + assert sorted([c.op.reducer_ordinal for c in successors]) == list(range(n_reducers)) diff --git a/python/xorbits/_mars/core/tests/__init__.py b/python/xorbits/_mars/core/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/core/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/core/tests/test_context.py b/python/xorbits/_mars/core/tests/test_context.py new file mode 100644 index 000000000..07e065fe1 --- /dev/null +++ b/python/xorbits/_mars/core/tests/test_context.py @@ -0,0 +1,31 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import remote as mr +from ..context import get_context + + +def test_context(setup): + def func(): + ctx = get_context() + assert ctx is not None + + # no error should happen + mr.spawn(func).execute() + + # context should be reset after execution + # for test backend(test://xxx), + # the worker pool and client are in the same process + # if context is not reset, get_context() will still get one + assert get_context() is None diff --git a/python/xorbits/_mars/core/tests/test_entrypoints.py b/python/xorbits/_mars/core/tests/test_entrypoints.py new file mode 100644 index 000000000..250783851 --- /dev/null +++ b/python/xorbits/_mars/core/tests/test_entrypoints.py @@ -0,0 +1,125 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
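+
+# These tests register a fake "mars_extensions" entry point against the pymars
+# distribution and verify that extension initialization runs exactly once, and
+# that a broken extension only emits a warning instead of failing startup.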
+ +import sys +import types +import warnings + +import pkg_resources + + +class _DummyClass(object): + def __init__(self, value): + self.value = value + + def __repr__(self): + return "_DummyClass(%f, %f)" % self.value + + +def test_init_entrypoint(): + # FIXME: Python 2 workaround because nonlocal doesn't exist + counters = {"init": 0} + + def init_function(): + counters["init"] += 1 + + mod = types.ModuleType("_test_mars_extension") + mod.init_func = init_function + + try: + # will remove this module at the end of the test + sys.modules[mod.__name__] = mod + + # We are registering an entry point using the "mars" package + # ("distribution" in pkg_resources-speak) itself, though these are + # normally registered by other packages. + dist = "pymars" + entrypoints = pkg_resources.get_entry_map(dist) + my_entrypoint = pkg_resources.EntryPoint( + "init", # name of entry point + mod.__name__, # module with entry point object + attrs=["init_func"], # name of entry point object + dist=pkg_resources.get_distribution(dist), + ) + entrypoints.setdefault("mars_extensions", {})["init"] = my_entrypoint + + from .. import entrypoints + + # Allow reinitialization + entrypoints.init_extension_entrypoints.cache_clear() + + entrypoints.init_extension_entrypoints() + + # was our init function called? + assert counters["init"] == 1 + + # ensure we do not initialize twice + entrypoints.init_extension_entrypoints() + assert counters["init"] == 1 + finally: + # remove fake module + if mod.__name__ in sys.modules: + del sys.modules[mod.__name__] + + +def test_entrypoint_tolerance(): + # FIXME: Python 2 workaround because nonlocal doesn't exist + counters = {"init": 0} + + def init_function(): + counters["init"] += 1 + raise ValueError("broken") + + mod = types.ModuleType("_test_mars_bad_extension") + mod.init_func = init_function + + try: + # will remove this module at the end of the test + sys.modules[mod.__name__] = mod + + # We are registering an entry point using the "mars" package + # ("distribution" in pkg_resources-speak) itself, though these are + # normally registered by other packages. + dist = "pymars" + entrypoints = pkg_resources.get_entry_map(dist) + my_entrypoint = pkg_resources.EntryPoint( + "init", # name of entry point + mod.__name__, # module with entry point object + attrs=["init_func"], # name of entry point object + dist=pkg_resources.get_distribution(dist), + ) + entrypoints.setdefault("mars_extensions", {})["init"] = my_entrypoint + + from .. import entrypoints + + # Allow reinitialization + entrypoints.init_extension_entrypoints.cache_clear() + + with warnings.catch_warnings(record=True) as w: + entrypoints.init_extension_entrypoints() + + bad_str = "Mars extension module '_test_mars_bad_extension'" + for x in w: + if bad_str in str(x): + break + else: + raise ValueError("Expected warning message not found") + + # was our init function called? + assert counters["init"] == 1 + + finally: + # remove fake module + if mod.__name__ in sys.modules: + del sys.modules[mod.__name__] diff --git a/python/xorbits/_mars/core/tests/test_mode.py b/python/xorbits/_mars/core/tests/test_mode.py new file mode 100644 index 000000000..569bf85a5 --- /dev/null +++ b/python/xorbits/_mars/core/tests/test_mode.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from .. import enter_mode, is_build_mode, is_eager_mode, is_kernel_mode + + +def test_enter_mode(): + from ...config import option_context, options + + @enter_mode(kernel=True) + def wrapped(): + return is_eager_mode() + + assert not options.eager_mode + assert not wrapped() + + with option_context({"eager_mode": True}): + assert options.eager_mode + assert not wrapped() + + @enter_mode(kernel=True) + def wrapped2(): + wrapped() + with option_context({"eager_mode": True}): + assert options.eager_mode + assert not is_eager_mode() + with enter_mode(kernel=False): + assert not is_kernel_mode() + assert is_kernel_mode() + + wrapped2() + + assert not is_kernel_mode() + assert not is_build_mode() + + @enter_mode(kernel=False) + def wrapped3(): + wrapped() + with option_context({"eager_mode": True}): + assert options.eager_mode + assert not is_kernel_mode() + with enter_mode(kernel=True, build=True): + assert is_kernel_mode() + assert is_build_mode() + assert not is_kernel_mode() + assert not is_build_mode() + with pytest.raises(ValueError): + with enter_mode(kernel=True, build=True): + raise ValueError("meant to raise error") + assert not is_kernel_mode() + assert not is_build_mode() + + @enter_mode(kernel=True) + def wrapped4(): + raise ValueError("meant to raise error") + + with pytest.raises(ValueError): + wrapped4() + assert not is_kernel_mode() + assert not is_build_mode() + + wrapped3() diff --git a/python/xorbits/_mars/dataframe/__init__.py b/python/xorbits/_mars/dataframe/__init__.py new file mode 100644 index 000000000..826df3298 --- /dev/null +++ b/python/xorbits/_mars/dataframe/__init__.py @@ -0,0 +1,119 @@ +# isort: skip_file +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .initializer import DataFrame, Series, Index + +# do imports to register operands +from .base.cut import cut +from .base.eval import mars_eval as eval # pylint: disable=redefined-builtin +from .base.get_dummies import get_dummies +from .base.melt import melt +from .base.qcut import qcut +from .base.to_numeric import to_numeric +from .base.value_counts import value_counts +from .contrib.raydataset import to_ray_mldataset, to_ray_dataset +from .datasource.from_tensor import dataframe_from_tensor, series_from_tensor +from .datasource.from_index import series_from_index +from .datasource.from_records import from_records +from .datasource.from_vineyard import from_vineyard +from .datasource.read_csv import read_csv +from .datasource.read_sql import read_sql, read_sql_table, read_sql_query +from .datasource.read_parquet import read_parquet +from .datasource.read_raydataset import ( + read_raydataset, + read_ray_dataset, + read_ray_mldataset, +) +from .datasource.date_range import date_range +from .fetch import DataFrameFetch, DataFrameFetchShuffle +from .merge import concat, merge +from .missing.checkna import isna, isnull, notna, notnull +from .reduction import CustomReduction, unique +from .tseries.to_datetime import to_datetime + +from . import arithmetic +from . import base +from . import indexing +from . import merge as merge_ +from . import missing +from . import reduction +from . import statistics +from . import sort +from . import groupby +from . import ufunc +from . import datastore +from . import window +from . import plotting + +del ( + reduction, + statistics, + arithmetic, + indexing, + merge_, + base, + groupby, + missing, + ufunc, + datastore, + sort, + window, + plotting, +) +del DataFrameFetch, DataFrameFetchShuffle + +# noinspection PyUnresolvedReferences +from .arrays import ArrowStringDtype, ArrowStringArray, ArrowListDtype, ArrowListArray +from .core import ( + CategoricalIndex, + DatetimeIndex, + Float64Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + TimedeltaIndex, + UInt64Index, +) + +# noinspection PyUnresolvedReferences +from pandas import ( + Timedelta, + Timestamp, + offsets, + NaT, + Interval, + DateOffset, + BooleanDtype, + CategoricalDtype, + DatetimeTZDtype, + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + IntervalDtype, + SparseDtype, + StringDtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + PeriodDtype, +) + +try: + from pandas import NA, NamedAgg +except ImportError: # pragma: no cover + pass diff --git a/python/xorbits/_mars/dataframe/align.py b/python/xorbits/_mars/dataframe/align.py new file mode 100644 index 000000000..53f82ffcb --- /dev/null +++ b/python/xorbits/_mars/dataframe/align.py @@ -0,0 +1,975 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import operator + +import numpy as np +import pandas as pd + +from .. 
import opcodes as OperandDef +from ..core import OutputType +from ..core.operand import MapReduceOperand, OperandStage +from ..serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + Int32Field, + KeyField, + ListField, +) +from .core import SERIES_CHUNK_TYPE +from .operands import DataFrameOperandMixin, DataFrameShuffleProxy +from .utils import ( + build_split_idx_to_origin_idx, + filter_dtypes, + filter_index_value, + hash_dtypes, + hash_index, + is_index_value_identical, + parse_index, + split_monotonic_index_min_max, + validate_axis, +) + + +class DataFrameIndexAlign(MapReduceOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_INDEX_ALIGN + + index_min = AnyField("index_min") + index_min_close = BoolField("index_min_close") + index_max = AnyField("index_max") + index_max_close = BoolField("index_max_close") + index_shuffle_size = Int32Field("index_shuffle_size", default=None) + column_min = AnyField("column_min") + column_min_close = BoolField("column_min_close") + column_max = AnyField("column_max") + column_max_close = BoolField("column_max_close") + column_shuffle_size = Int32Field("column_shuffle_size", default=None) + column_shuffle_segments = ListField("column_shuffle_segments", FieldTypes.series) + + input = KeyField("input") + + def __init__( + self, index_min_max=None, column_min_max=None, output_types=None, **kw + ): + if index_min_max is not None: + kw.update( + dict( + index_min=index_min_max[0], + index_min_close=index_min_max[1], + index_max=index_min_max[2], + index_max_close=index_min_max[3], + ) + ) + if column_min_max is not None: + kw.update( + dict( + column_min=column_min_max[0], + column_min_close=column_min_max[1], + column_max=column_min_max[2], + column_max_close=column_min_max[3], + ) + ) + super().__init__(_output_types=output_types, **kw) + + @property + def index_min_max(self): + if getattr(self, "index_min", None) is None: + return None + return ( + self.index_min, + self.index_min_close, + self.index_max, + self.index_max_close, + ) + + @property + def column_min_max(self): + if getattr(self, "column_min", None) is None: + return None + return ( + self.column_min, + self.column_min_close, + self.column_max, + self.column_max_close, + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.input = self._inputs[0] + + def build_map_chunk_kw(self, inputs, **kw): + if kw.get("index_value", None) is None and inputs[0].index_value is not None: + input_index_value = inputs[0].index_value + index_min_max = self.index_min_max + if index_min_max is not None: + kw["index_value"] = filter_index_value(input_index_value, index_min_max) + else: + kw["index_value"] = parse_index( + inputs[0].index_value.to_pandas(), + input_index_value, + type(self).__name__, + ) + if self.output_types[0] == OutputType.dataframe: + if ( + kw.get("columns_value", None) is None + and getattr(inputs[0], "columns_value", None) is not None + ): + input_columns_value = inputs[0].columns_value + input_dtypes = inputs[0].dtypes + column_min_max = self.column_min_max + if column_min_max is not None: + kw["columns_value"] = filter_index_value( + input_columns_value, column_min_max, store_data=True + ) + else: + kw["columns_value"] = parse_index( + inputs[0].columns_value.to_pandas(), + input_columns_value, + type(self).__name__, + ) + kw["dtypes"] = input_dtypes[kw["columns_value"].to_pandas()] + column_shuffle_size = self.column_shuffle_size + if column_shuffle_size is not None: + self.column_shuffle_segments = hash_dtypes( + input_dtypes, 
column_shuffle_size + ) + else: + if ( + kw.get("dtype", None) is None + and getattr(inputs[0], "dtype", None) is not None + ): + kw["dtype"] = inputs[0].dtype + if ( + kw.get("name", None) is None + and getattr(inputs[0], "name", None) is not None + ): + kw["name"] = inputs[0].name + return kw + + def build_reduce_chunk_kw(self, inputs, index, **kw): + kw["index"] = index + if ( + kw.get("index_value", None) is None + and inputs[0].inputs[0].index_value is not None + ): + index_align_map_chunks = inputs[0].inputs + if index_align_map_chunks[0].op.index_min_max is not None: + # shuffle on columns, all the DataFrameIndexAlignMap has the same index + kw["index_value"] = filter_index_value( + index_align_map_chunks[0].index_value, + index_align_map_chunks[0].op.index_min_max, + ) + else: + # shuffle on index + kw["index_value"] = parse_index( + index_align_map_chunks[0].index_value.to_pandas(), + [c.key for c in index_align_map_chunks], + type(self).__name__, + ) + if self.output_types[0] == OutputType.dataframe: + if ( + kw.get("columns_value", None) is None + and getattr(inputs[0].inputs[0], "columns_value", None) is not None + ): + index_align_map_chunks = inputs[0].inputs + if index_align_map_chunks[0].op.column_min_max is not None: + # shuffle on index + kw["columns_value"] = filter_index_value( + index_align_map_chunks[0].columns_value, + index_align_map_chunks[0].op.column_min_max, + store_data=True, + ) + kw["dtypes"] = index_align_map_chunks[0].dtypes[ + kw["columns_value"].to_pandas() + ] + else: + # shuffle on columns + all_dtypes = [ + c.op.column_shuffle_segments[index[1]] + for c in index_align_map_chunks + if c.index[0] == index_align_map_chunks[0].index[0] + ] + kw["dtypes"] = pd.concat(all_dtypes) + kw["columns_value"] = parse_index( + kw["dtypes"].index, store_data=True + ) + else: + if ( + kw.get("dtype", None) is None + and getattr(inputs[0].inputs[0], "dtype", None) is not None + ): + kw["dtype"] = inputs[0].inputs[0].dtype + if ( + kw.get("name", None) is None + and getattr(inputs[0].inputs[0], "name", None) is not None + ): + kw["name"] = inputs[0].inputs[0].name + return kw + + @classmethod + def execute_map(cls, ctx, op): + # TODO(QIN): add GPU support here + df = ctx[op.inputs[0].key] + + filters = [[], []] + + chunk = op.outputs[0] + if op.index_shuffle_size == -1: + # no shuffle and no min-max filter on index + filters[0].append(slice(None, None, None)) + elif op.index_shuffle_size is None: + # no shuffle on index + comp_op = operator.ge if op.index_min_close else operator.gt + index_cond = comp_op(df.index, op.index_min) + comp_op = operator.le if op.index_max_close else operator.lt + index_cond = index_cond & comp_op(df.index, op.index_max) + filters[0].append(index_cond) + else: + # shuffle on index + shuffle_size = op.index_shuffle_size + filters[0].extend(hash_index(df.index, shuffle_size)) + + if chunk.ndim == 1: + if len(filters[0]) == 1: + # no shuffle + ctx[chunk.key] = df.loc[filters[0][0]] + else: + for index_idx, index_filter in enumerate(filters[0]): + ctx[chunk.key, (index_idx,)] = ( + ctx.get_current_chunk().index, + df.loc[index_filter], + ) + return + + if op.column_shuffle_size == -1: + # no shuffle and no min-max filter on columns + filters[1].append(slice(None, None, None)) + if op.column_shuffle_size is None: + # no shuffle on columns + comp_op = operator.ge if op.column_min_close else operator.gt + columns_cond = comp_op(df.columns, op.column_min) + comp_op = operator.le if op.column_max_close else operator.lt + columns_cond = columns_cond & 
comp_op(df.columns, op.column_max) + filters[1].append(columns_cond) + else: + # shuffle on columns + shuffle_size = op.column_shuffle_size + filters[1].extend(hash_index(df.columns, shuffle_size)) + + if all(len(it) == 1 for it in filters): + # no shuffle + ctx[chunk.key] = df.loc[filters[0][0], filters[1][0]] + elif len(filters[0]) == 1: + # shuffle on columns + for column_idx, column_filter in enumerate(filters[1]): + shuffle_index = (chunk.index[0], column_idx) + ctx[chunk.key, shuffle_index] = ( + ctx.get_current_chunk().index, + df.loc[filters[0][0], column_filter], + ) + elif len(filters[1]) == 1: + # shuffle on index + for index_idx, index_filter in enumerate(filters[0]): + shuffle_index = (index_idx, chunk.index[1]) + ctx[chunk.key, shuffle_index] = ( + ctx.get_current_chunk().index, + df.loc[index_filter, filters[1][0]], + ) + else: + # full shuffle + shuffle_index_size = op.index_shuffle_size + shuffle_column_size = op.column_shuffle_size + out_idxes = itertools.product( + range(shuffle_index_size), range(shuffle_column_size) + ) + out_index_columns = itertools.product(*filters) + for out_idx, out_index_column in zip(out_idxes, out_index_columns): + index_filter, column_filter = out_index_column + ctx[chunk.key, out_idx] = ( + ctx.get_current_chunk().index, + df.loc[index_filter, column_filter], + ) + + @classmethod + def execute_reduce(cls, ctx, op: "DataFrameIndexAlign"): + chunk = op.outputs[0] + input_idx_to_df = dict(op.iter_mapper_data(ctx)) + row_idxes = sorted({idx[0] for idx in input_idx_to_df}) + if chunk.ndim == 2: + col_idxes = sorted({idx[1] for idx in input_idx_to_df}) + + ress = [] + for row_idx in row_idxes: + if chunk.ndim == 2: + row_dfs = [] + for col_idx in col_idxes: + row_dfs.append(input_idx_to_df[row_idx, col_idx]) + row_df = pd.concat(row_dfs, axis=1) + else: + row_df = input_idx_to_df[(row_idx,)] + + ress.append(row_df) + + ctx[chunk.key] = pd.concat(ress, axis=0) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls.execute_map(ctx, op) + else: + cls.execute_reduce(ctx, op) + + +class _AxisMinMaxSplitInfo(object): + def __init__( + self, left_split, left_increase, right_split, right_increase, dummy=False + ): + self._left_split = left_split + self._right_split = right_split + self._dummy = dummy + + self._left_split_idx_to_origin_idx = build_split_idx_to_origin_idx( + self._left_split, left_increase + ) + self._right_split_idx_to_origin_idx = build_split_idx_to_origin_idx( + self._right_split, right_increase + ) + + def isdummy(self): + return self._dummy + + def get_origin_left_idx(self, idx): + return self._left_split_idx_to_origin_idx[idx][0] + + def get_origin_left_split(self, idx): + left_idx, left_inner_idx = self._left_split_idx_to_origin_idx[idx] + return self._left_split[left_idx][left_inner_idx] + + def get_origin_right_idx(self, idx): + return self._right_split_idx_to_origin_idx[idx][0] + + def get_origin_right_split(self, idx): + right_idx, right_inner_idx = self._right_split_idx_to_origin_idx[idx] + return self._right_split[right_idx][right_inner_idx] + + +class _MinMaxSplitInfo(object): + def __init__(self, row_min_max_split_info=None, col_min_max_split_info=None): + self.row_min_max_split_info = row_min_max_split_info + self.col_min_max_split_info = col_min_max_split_info + + def all_axes_can_split(self): + return ( + self.row_min_max_split_info is not None + and self.col_min_max_split_info is not None + ) + + def one_axis_can_split(self): + return (self.row_min_max_split_info is None) ^ ( + 
self.col_min_max_split_info is None + ) + + def no_axis_can_split(self): + return ( + self.row_min_max_split_info is None and self.col_min_max_split_info is None + ) + + def __getitem__(self, i): + return [self.row_min_max_split_info, self.col_min_max_split_info][i] + + def __setitem__(self, axis, axis_min_max_split_info): + assert axis in {0, 1} + if axis == 0: + self.row_min_max_split_info = axis_min_max_split_info + else: + self.col_min_max_split_info = axis_min_max_split_info + + def get_row_left_idx(self, out_idx): + return self.row_min_max_split_info.get_origin_left_idx(out_idx) + + def get_row_left_split(self, out_idx): + return self.row_min_max_split_info.get_origin_left_split(out_idx) + + def get_col_left_idx(self, out_idx): + return self.col_min_max_split_info.get_origin_left_idx(out_idx) + + def get_col_left_split(self, out_idx): + return self.col_min_max_split_info.get_origin_left_split(out_idx) + + def get_row_right_idx(self, out_idx): + return self.row_min_max_split_info.get_origin_right_idx(out_idx) + + def get_row_right_split(self, out_idx): + return self.row_min_max_split_info.get_origin_right_split(out_idx) + + def get_col_right_idx(self, out_idx): + return self.col_min_max_split_info.get_origin_right_idx(out_idx) + + def get_col_right_split(self, out_idx): + return self.col_min_max_split_info.get_origin_right_split(out_idx) + + def get_axis_idx(self, axis, left_or_right, out_idx): + if axis == 0: + if left_or_right == 0: + return self.get_row_left_idx(out_idx) + else: + assert left_or_right == 1 + return self.get_row_right_idx(out_idx) + else: + assert axis == 1 + if left_or_right == 0: + return self.get_col_left_idx(out_idx) + else: + assert left_or_right == 1 + return self.get_col_right_idx(out_idx) + + def get_axis_split(self, axis, left_or_right, out_idx): + if axis == 0: + if left_or_right == 0: + return self.get_row_left_split(out_idx) + else: + assert left_or_right == 1 + return self.get_row_right_split(out_idx) + else: + assert axis == 1 + if left_or_right == 0: + return self.get_col_left_split(out_idx) + else: + assert left_or_right == 1 + return self.get_col_right_split(out_idx) + + +def _get_chunk_index_min_max(index_chunks): + chunk_index_min_max = [] + for chunk in index_chunks: + min_val = chunk.min_val + min_val_close = chunk.min_val_close + max_val = chunk.max_val + max_val_close = chunk.max_val_close + if min_val is None or max_val is None: + chunk_index_min_max.append((None, True, None, True)) + else: + chunk_index_min_max.append((min_val, min_val_close, max_val, max_val_close)) + return chunk_index_min_max + + +def _get_monotonic_chunk_index_min_max(index, index_chunks): + chunk_index_min_max = _get_chunk_index_min_max(index_chunks) + if index.is_monotonic_decreasing: + return list(reversed(chunk_index_min_max)), False + + for j in range(len(chunk_index_min_max) - 1): + # overlap only if the prev max is close and curr min is close + # and they are identical + prev_max, prev_max_close = chunk_index_min_max[j][2:] + curr_min, curr_min_close = chunk_index_min_max[j + 1][:2] + if prev_max_close and curr_min_close and prev_max == curr_min: + return + return chunk_index_min_max, True + + +def _need_align_map( + input_chunk, + index_min_max, + column_min_max, + dummy_index_splits=False, + dummy_column_splits=False, +): + if isinstance(input_chunk, SERIES_CHUNK_TYPE): + if input_chunk.index_value is None: + return True + if input_chunk.index_value.min_max != index_min_max: + return True + else: + if not dummy_index_splits: + if ( + input_chunk.index_value is 
None + or input_chunk.index_value.min_max != index_min_max + ): + return True + if not dummy_column_splits: + if ( + input_chunk.columns_value is None + or input_chunk.columns_value.min_max != column_min_max + ): + return True + return False + + +def _is_index_identical(left, right): + if len(left) != len(right): + return False + for left_item, right_item in zip(left, right): + if left_item.key != right_item.key: + return False + return True + + +def _axis_need_shuffle(left_axis, right_axis, left_axis_chunks, right_axis_chunks): + if _is_index_identical(left_axis_chunks, right_axis_chunks): + return False + if ( + not left_axis.is_monotonic_increasing_or_decreasing + and len(left_axis_chunks) > 1 + ): + return True + if ( + not right_axis.is_monotonic_increasing_or_decreasing + and len(right_axis_chunks) > 1 + ): + return True + return False + + +def _calc_axis_splits(left_axis, right_axis, left_axis_chunks, right_axis_chunks): + if _axis_need_shuffle(left_axis, right_axis, left_axis_chunks, right_axis_chunks): + # do shuffle + out_chunk_size = max(len(left_axis_chunks), len(right_axis_chunks)) + return None, [np.nan for _ in range(out_chunk_size)] + else: + # no need to do shuffle on this axis + if _is_index_identical(left_axis_chunks, right_axis_chunks): + left_chunk_index_min_max = _get_chunk_index_min_max(left_axis_chunks) + right_splits = left_splits = [[c] for c in left_chunk_index_min_max] + right_increase = left_increase = None + elif len(left_axis_chunks) == 1 and len(right_axis_chunks) == 1: + left_splits = [_get_chunk_index_min_max(left_axis_chunks)] + left_increase = left_axis_chunks[0].is_monotonic_decreasing + right_splits = [_get_chunk_index_min_max(right_axis_chunks)] + right_increase = right_axis_chunks[0].is_monotonic_decreasing + else: + ( + left_chunk_index_min_max, + left_increase, + ) = _get_monotonic_chunk_index_min_max(left_axis, left_axis_chunks) + ( + right_chunk_index_min_max, + right_increase, + ) = _get_monotonic_chunk_index_min_max(right_axis, right_axis_chunks) + left_splits, right_splits = split_monotonic_index_min_max( + left_chunk_index_min_max, + left_increase, + right_chunk_index_min_max, + right_increase, + ) + splits = _AxisMinMaxSplitInfo( + left_splits, left_increase, right_splits, right_increase + ) + return splits, None + + +def _build_dummy_axis_split(chunk_shape): + axis_index_min_max, axis_increase = ( + [(i, True, i + 1, True) for i in range(chunk_shape)], + True, + ) + if len(axis_index_min_max) == 1: + left_splits, right_splits = [axis_index_min_max], [axis_index_min_max] + else: + left_splits, right_splits = split_monotonic_index_min_max( + axis_index_min_max, axis_increase, axis_index_min_max, axis_increase + ) + return _AxisMinMaxSplitInfo( + left_splits, axis_increase, right_splits, axis_increase, dummy=True + ) + + +def _gen_series_chunks(splits, out_shape, left_or_right, series): + out_chunks = [] + if splits[0] is not None: + # need no shuffle + for out_idx in range(out_shape[0]): + idx = splits.get_axis_idx(0, left_or_right, out_idx) + index_min_max = splits.get_axis_split(0, left_or_right, out_idx) + chunk = series.cix[(idx,)] + if _need_align_map(chunk, index_min_max, None): + align_op = DataFrameIndexAlign( + stage=OperandStage.map, + index_min_max=index_min_max, + column_min_max=None, + dtype=chunk.dtype, + sparse=series.issparse(), + output_types=[OutputType.series], + ) + params = align_op.build_map_chunk_kw( + [chunk], shape=(np.nan,), index=(out_idx,) + ) + out_chunk = align_op.new_chunk([chunk], **params) + else: + out_chunk 
= chunk + out_chunks.append(out_chunk) + else: + # gen map chunks + map_chunks = [] + for chunk in series.chunks: + map_op = DataFrameIndexAlign( + stage=OperandStage.map, + sparse=chunk.issparse(), + index_shuffle_size=out_shape[0], + output_types=[OutputType.series], + ) + params = map_op.build_map_chunk_kw( + [chunk], shape=(np.nan,), index=chunk.index + ) + map_chunks.append(map_op.new_chunk([chunk], **params)) + + proxy_chunk = DataFrameShuffleProxy(output_types=[OutputType.series]).new_chunk( + map_chunks, shape=() + ) + + # gen reduce chunks + for out_idx in range(out_shape[0]): + reduce_op = DataFrameIndexAlign( + stage=OperandStage.reduce, + n_reducers=out_shape[0], + i=out_idx, + sparse=proxy_chunk.issparse(), + output_types=[OutputType.series], + ) + params = reduce_op.build_reduce_chunk_kw( + [proxy_chunk], index=(out_idx,), shape=(np.nan,) + ) + out_chunks.append(reduce_op.new_chunk([proxy_chunk], **params)) + + return out_chunks + + +def _gen_dataframe_chunks(splits, out_shape, left_or_right, df): + out_chunks = [] + if splits.all_axes_can_split(): + # no shuffle for all axes + kw = { + "index_shuffle_size": -1 if splits[0].isdummy() else None, + "column_shuffle_size": -1 if splits[1].isdummy() else None, + } + for out_idx in itertools.product(*(range(s) for s in out_shape)): + row_idx = splits.get_axis_idx(0, left_or_right, out_idx[0]) + col_idx = splits.get_axis_idx(1, left_or_right, out_idx[1]) + index_min_max = splits.get_axis_split(0, left_or_right, out_idx[0]) + column_min_max = splits.get_axis_split(1, left_or_right, out_idx[1]) + chunk = df.cix[row_idx, col_idx] + if _need_align_map( + chunk, + index_min_max, + column_min_max, + splits[0].isdummy(), + splits[1].isdummy(), + ): + if splits[1].isdummy(): + dtypes = chunk.dtypes + else: + dtypes = filter_dtypes(chunk.dtypes, column_min_max) + chunk_kw = { + "index_value": chunk.index_value if splits[0].isdummy() else None, + "columns_value": chunk.columns_value + if splits[1].isdummy() + else None, + "dtypes": chunk.dtypes if splits[1].isdummy() else None, + } + align_op = DataFrameIndexAlign( + stage=OperandStage.map, + index_min_max=index_min_max, + column_min_max=column_min_max, + dtypes=dtypes, + sparse=chunk.issparse(), + output_types=[OutputType.dataframe], + **kw + ) + params = align_op.build_map_chunk_kw( + [chunk], shape=(np.nan, np.nan), index=out_idx, **chunk_kw + ) + out_chunk = align_op.new_chunk([chunk], **params) + else: + out_chunk = chunk + out_chunks.append(out_chunk) + elif splits.one_axis_can_split(): + # one axis needs shuffle + shuffle_axis = 0 if splits[0] is None else 1 + align_axis = 1 - shuffle_axis + + for align_axis_idx in range(out_shape[align_axis]): + if align_axis == 0: + kw = { + "index_min_max": splits.get_axis_split( + align_axis, left_or_right, align_axis_idx + ), + "index_shuffle_size": -1 if splits[0].isdummy() else None, + "column_shuffle_size": out_shape[shuffle_axis], + } + input_idx = splits.get_axis_idx( + align_axis, left_or_right, align_axis_idx + ) + else: + kw = { + "column_min_max": splits.get_axis_split( + align_axis, left_or_right, align_axis_idx + ), + "index_shuffle_size": out_shape[shuffle_axis], + "column_shuffle_size": -1 if splits[1].isdummy() else None, + } + input_idx = splits.get_axis_idx( + align_axis, left_or_right, align_axis_idx + ) + input_chunks = [c for c in df.chunks if c.index[align_axis] == input_idx] + map_chunks = [] + for j, input_chunk in enumerate(input_chunks): + chunk_kw = dict() + if align_axis == 0: + chunk_kw["index_value"] = ( + 
input_chunk.index_value if splits[0].isdummy() else None + ) + else: + chunk_kw["columns_value"] = ( + input_chunk.columns_value if splits[1].isdummy() else None + ) + chunk_kw["dtypes"] = input_chunk.dtypes + map_op = DataFrameIndexAlign( + stage=OperandStage.map, + sparse=input_chunk.issparse(), + output_types=[OutputType.dataframe], + **kw + ) + idx = [None, None] + idx[align_axis] = align_axis_idx + idx[shuffle_axis] = j + params = map_op.build_map_chunk_kw( + [input_chunk], shape=(np.nan, np.nan), index=tuple(idx), **chunk_kw + ) + map_chunks.append(map_op.new_chunk([input_chunk], **params)) + proxy_chunk = DataFrameShuffleProxy( + sparse=df.issparse(), output_types=[OutputType.dataframe] + ).new_chunk(map_chunks, shape=()) + for j in range(out_shape[shuffle_axis]): + chunk_kw = dict() + if align_axis == 0: + chunk_kw["index_value"] = ( + proxy_chunk.inputs[0].inputs[0].index_value + if splits[0].isdummy() + else None + ) + else: + chunk_kw["columns_value"] = ( + proxy_chunk.inputs[0].inputs[0].columns_value + if splits[1].isdummy() + else None + ) + chunk_kw["dtypes"] = proxy_chunk.inputs[0].inputs[0].dtypes + reduce_idx = ( + (align_axis_idx, j) if align_axis == 0 else (j, align_axis_idx) + ) + reduce_op = DataFrameIndexAlign( + stage=OperandStage.reduce, + n_reducers=out_shape[shuffle_axis], + i=j, + sparse=proxy_chunk.issparse(), + output_types=[OutputType.dataframe], + ) + params = reduce_op.build_reduce_chunk_kw( + [proxy_chunk], shape=(np.nan, np.nan), index=reduce_idx, **chunk_kw + ) + out_chunks.append(reduce_op.new_chunk([proxy_chunk], **params)) + out_chunks.sort(key=lambda c: c.index) + else: + # all axes need shuffle + assert splits.no_axis_can_split() + + # gen map chunks + map_chunks = [] + for chunk in df.chunks: + map_op = DataFrameIndexAlign( + stage=OperandStage.map, + sparse=chunk.issparse(), + index_shuffle_size=out_shape[0], + column_shuffle_size=out_shape[1], + output_types=[OutputType.dataframe], + ) + params = map_op.build_map_chunk_kw( + [chunk], shape=(np.nan, np.nan), index=chunk.index + ) + map_chunks.append(map_op.new_chunk([chunk], **params)) + + proxy_chunk = DataFrameShuffleProxy( + output_types=[OutputType.dataframe] + ).new_chunk(map_chunks, shape=()) + + # gen reduce chunks + out_indices = list(itertools.product(*(range(s) for s in out_shape))) + for out_idx in out_indices: + reduce_op = DataFrameIndexAlign( + stage=OperandStage.reduce, + n_reducers=len(out_indices), + i=out_idx, + sparse=proxy_chunk.issparse(), + output_types=[OutputType.dataframe], + ) + params = reduce_op.build_reduce_chunk_kw( + [proxy_chunk], index=out_idx, shape=(np.nan, np.nan) + ) + out_chunks.append(reduce_op.new_chunk([proxy_chunk], **params)) + + return out_chunks + + +def align_dataframe_dataframe(left, right, axis=None): + left_index_chunks = [c.index_value for c in left.cix[:, 0]] + right_index_chunks = [c.index_value for c in right.cix[:, 0]] + left_columns_chunks = [c.columns_value for c in left.cix[0, :]] + right_columns_chunks = [c.columns_value for c in right.cix[0, :]] + + axis = validate_axis(axis) if axis is not None else None + if axis is None or axis == 0: + index_splits, index_chunk_shape = _calc_axis_splits( + left.index_value, right.index_value, left_index_chunks, right_index_chunks + ) + else: + index_splits, index_chunk_shape = None, None + + if axis is None or axis == 1: + columns_splits, column_chunk_shape = _calc_axis_splits( + left.columns_value, + right.columns_value, + left_columns_chunks, + right_columns_chunks, + ) + else: + columns_splits, 
column_chunk_shape = None, None + + splits = _MinMaxSplitInfo(index_splits, columns_splits) + out_left_chunk_shape = ( + len(index_chunk_shape or list(itertools.chain(*index_splits._left_split))) + if index_splits is not None + else left.chunk_shape[0], + len(column_chunk_shape or list(itertools.chain(*columns_splits._left_split))) + if columns_splits is not None + else left.chunk_shape[1], + ) + if axis is None: + out_right_chunk_shape = out_left_chunk_shape + else: + out_right_chunk_shape = ( + len(index_chunk_shape or list(itertools.chain(*index_splits._right_split))) + if index_splits is not None + else right.chunk_shape[0], + len( + column_chunk_shape + or list(itertools.chain(*columns_splits._right_split)) + ) + if columns_splits is not None + else right.chunk_shape[1], + ) + left_chunks = _gen_dataframe_chunks(splits, out_left_chunk_shape, 0, left) + right_chunks = _gen_dataframe_chunks(splits, out_right_chunk_shape, 1, right) + + index_nsplits = columns_nsplits = None + if axis is None or axis == 0: + if _is_index_identical(left_index_chunks, right_index_chunks): + index_nsplits = left.nsplits[0] + else: + index_nsplits = [np.nan for _ in range(out_left_chunk_shape[0])] + if axis is None or axis == 1: + if _is_index_identical(left_columns_chunks, right_columns_chunks): + columns_nsplits = left.nsplits[1] + else: + columns_nsplits = [np.nan for _ in range(out_left_chunk_shape[1])] + + nsplits = [index_nsplits, columns_nsplits] + + out_chunk_shapes = (out_left_chunk_shape, out_right_chunk_shape) + return nsplits, out_chunk_shapes, left_chunks, right_chunks + + +def align_dataframe_series(left, right, axis="columns"): + axis = validate_axis(axis) + if axis == 1: + left_columns_chunks = [c.columns_value for c in left.cix[0, :]] + right_index_chunks = [c.index_value for c in right.chunks] + index_splits, chunk_shape = _calc_axis_splits( + left.columns_value, + right.index_value, + left_columns_chunks, + right_index_chunks, + ) + dummy_splits, dummy_nsplits = ( + _build_dummy_axis_split(left.chunk_shape[0]), + left.nsplits[0], + ) + out_chunk_shape = ( + len(dummy_nsplits), + len(chunk_shape or list(itertools.chain(*index_splits._left_split))), + ) + left_chunks = _gen_dataframe_chunks( + _MinMaxSplitInfo(dummy_splits, index_splits), out_chunk_shape, 0, left + ) + right_chunks = _gen_series_chunks( + _MinMaxSplitInfo(index_splits, None), (out_chunk_shape[1],), 1, right + ) + if _is_index_identical(left_columns_chunks, right_index_chunks): + index_nsplits = left.nsplits[1] + else: + index_nsplits = [np.nan for _ in range(out_chunk_shape[1])] + nsplits = [dummy_nsplits, index_nsplits] + else: + left_index_chunks = [c.index_value for c in left.cix[:, 0]] + right_index_chunks = [c.index_value for c in right.chunks] + index_splits, index_chunk_shape = _calc_axis_splits( + left.index_value, right.index_value, left_index_chunks, right_index_chunks + ) + + dummy_splits, dummy_nsplits = ( + _build_dummy_axis_split(left.chunk_shape[1]), + left.nsplits[1], + ) + out_chunk_shape = ( + len(index_chunk_shape or list(itertools.chain(*index_splits._left_split))), + len(dummy_nsplits), + ) + left_chunks = _gen_dataframe_chunks( + _MinMaxSplitInfo(index_splits, dummy_splits), out_chunk_shape, 0, left + ) + right_chunks = _gen_series_chunks( + _MinMaxSplitInfo(index_splits, None), (out_chunk_shape[0],), 1, right + ) + if _is_index_identical(left_index_chunks, right_index_chunks): + index_nsplits = left.nsplits[0] + else: + index_nsplits = [np.nan for _ in range(out_chunk_shape[0])] + nsplits = 
[index_nsplits, dummy_nsplits] + + return nsplits, out_chunk_shape, left_chunks, right_chunks + + +def align_series_series(left, right): + if is_index_value_identical(left, right): + # index identical, skip align + return left.nsplits, left.chunk_shape, left.chunks, right.chunks + + left_index_chunks = [c.index_value for c in left.chunks] + right_index_chunks = [c.index_value for c in right.chunks] + + index_splits, index_chunk_shape = _calc_axis_splits( + left.index_value, right.index_value, left_index_chunks, right_index_chunks + ) + + out_chunk_shape = ( + len(index_chunk_shape or list(itertools.chain(*index_splits._left_split))), + ) + splits = _MinMaxSplitInfo(index_splits, None) + + left_chunks = _gen_series_chunks(splits, out_chunk_shape, 0, left) + right_chunks = _gen_series_chunks(splits, out_chunk_shape, 1, right) + index_nsplits = [np.nan for _ in range(out_chunk_shape[0])] + nsplits = [index_nsplits] + return nsplits, out_chunk_shape, left_chunks, right_chunks diff --git a/python/xorbits/_mars/dataframe/arithmetic/__init__.py b/python/xorbits/_mars/dataframe/arithmetic/__init__.py new file mode 100644 index 000000000..5e80fa457 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/__init__.py @@ -0,0 +1,352 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
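+
+# This module wires arithmetic, comparison and bitwise operators onto Mars
+# DataFrame, Series and Index types, and wraps pandas' operator magic methods
+# so that expressions mixing pandas and Mars objects dispatch to the Mars
+# implementations.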
+ +import functools + +import pandas as pd + +try: + from pandas.core.arraylike import OpsMixin as PdOpsMixin +except ImportError: # pragma: no cover + PdOpsMixin = None + +from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE, is_build_mode +from ..ufunc.tensor import register_tensor_ufunc +from ..utils import wrap_notimplemented_exception +from .abs import DataFrameAbs, abs_ +from .add import DataFrameAdd, add, radd +from .arccos import DataFrameArccos +from .arccosh import DataFrameArccosh +from .arcsin import DataFrameArcsin +from .arcsinh import DataFrameArcsinh +from .arctan import DataFrameArctan +from .arctanh import DataFrameArctanh +from .around import DataFrameAround, around +from .bitwise_and import DataFrameAnd, bitand, rbitand +from .bitwise_or import DataFrameOr, bitor, rbitor +from .bitwise_xor import DataFrameXor, bitxor, rbitxor +from .ceil import DataFrameCeil +from .cos import DataFrameCos +from .cosh import DataFrameCosh +from .degrees import DataFrameDegrees +from .dot import dot, rdot +from .equal import DataFrameEqual, eq +from .exp import DataFrameExp +from .exp2 import DataFrameExp2 +from .expm1 import DataFrameExpm1 +from .floor import DataFrameFloor +from .floordiv import DataFrameFloorDiv, floordiv, rfloordiv +from .greater import DataFrameGreater, gt +from .greater_equal import DataFrameGreaterEqual, ge +from .invert import DataFrameNot, invert +from .is_ufuncs import DataFrameIsFinite, DataFrameIsInf, DataFrameIsNan +from .less import DataFrameLess, lt +from .less_equal import DataFrameLessEqual, le +from .log import DataFrameLog +from .log2 import DataFrameLog2 +from .log10 import DataFrameLog10 +from .mod import DataFrameMod, mod, rmod +from .multiply import DataFrameMul, mul, rmul +from .negative import DataFrameNegative, negative +from .not_equal import DataFrameNotEqual, ne +from .power import DataFramePower, power, rpower +from .radians import DataFrameRadians +from .sin import DataFrameSin +from .sinh import DataFrameSinh +from .sqrt import DataFrameSqrt +from .subtract import DataFrameSubtract, rsubtract, subtract +from .tan import DataFrameTan +from .tanh import DataFrameTanh +from .truediv import DataFrameTrueDiv, rtruediv, truediv + + +def _wrap_eq(): + @functools.wraps(eq) + def call(df, other, **kw): + if is_build_mode(): + return df._equals(other) + return _wrap_comparison(eq)(df, other, **kw) + + return call + + +def _wrap_comparison(func): + @functools.wraps(func) + def call(df, other, **kw): + if isinstance(df, DATAFRAME_TYPE) and isinstance(other, DATAFRAME_TYPE): + # index and columns should be identical + for index_type in ["index_value", "columns_value"]: + left, right = getattr(df, index_type), getattr(other, index_type) + if left.has_value() and right.has_value(): + # if df and other's index or columns has value + index_eq = left.to_pandas().equals(right.to_pandas()) + else: + index_eq = left.key == right.key + if not index_eq: + raise ValueError( + "Can only compare identically-labeled DataFrame object" + ) + return wrap_notimplemented_exception(func)(df, other, **kw) + + return call + + +_reverse_magic_names = { + "eq": "eq", + "ne": "ne", + "lt": "ge", + "le": "gt", + "gt": "le", + "ge": "lt", +} + + +def _wrap_pandas_magics(cls, magic_name: str): + magic_func_name = f"__{magic_name}__" + magic_rfunc_name = _reverse_magic_names.get(magic_name, f"__r{magic_name}__") + try: + raw_method = getattr(cls, magic_func_name) + except AttributeError: + return + + @functools.wraps(raw_method) + def wrapped(self, other): + if not 
isinstance(other, (DATAFRAME_TYPE, SERIES_TYPE, INDEX_TYPE)): + return raw_method(self, other) + + try: + val = getattr(other, magic_rfunc_name)(self) + except AttributeError: # pragma: no cover + return raw_method(self, other) + + if val is NotImplemented: # pragma: no cover + return raw_method(self, other) + return val + + setattr(cls, magic_func_name, wrapped) + + +def _install(): + def _register_method(cls, name, func, wrapper=None): + if wrapper is None: + + @functools.wraps(func) + def wrapper(df, *args, **kwargs): + return func(df, *args, **kwargs) + + try: + if issubclass(cls, DATAFRAME_TYPE): + wrapper.__doc__ = func.__frame_doc__ + elif issubclass(cls, SERIES_TYPE): + wrapper.__doc__ = func.__series_doc__ + else: + wrapper = func + except AttributeError: + wrapper = func + + wrapper.__name__ = func.__name__ + setattr(cls, name, wrapper) + + def _register_bin_method(cls, name, func): + def call_df_fill(df, other, axis="columns", level=None, fill_value=None): + return func(df, other, axis=axis, level=level, fill_value=fill_value) + + def call_df_no_fill(df, other, axis="columns", level=None): + return func(df, other, axis=axis, level=level) + + def call_series_fill(df, other, level=None, fill_value=None, axis=0): + return func(df, other, axis=axis, level=level, fill_value=fill_value) + + def call_series_no_fill(df, other, level=None, axis=0): + return func(df, other, axis=axis, level=level) + + if issubclass(cls, DATAFRAME_TYPE): + call = ( + call_df_fill + if "fill_value" in func.__code__.co_varnames + else call_df_no_fill + ) + elif issubclass(cls, SERIES_TYPE): + call = ( + call_series_fill + if "fill_value" in func.__code__.co_varnames + else call_series_no_fill + ) + else: + call = None + return _register_method(cls, name, func, wrapper=call) + + # register mars tensor ufuncs + ufunc_ops = [ + # unary + DataFrameAbs, + DataFrameLog, + DataFrameLog2, + DataFrameLog10, + DataFrameSin, + DataFrameCos, + DataFrameTan, + DataFrameSinh, + DataFrameCosh, + DataFrameTanh, + DataFrameArcsin, + DataFrameArccos, + DataFrameArctan, + DataFrameArcsinh, + DataFrameArccosh, + DataFrameArctanh, + DataFrameRadians, + DataFrameDegrees, + DataFrameCeil, + DataFrameFloor, + DataFrameAround, + DataFrameExp, + DataFrameExp2, + DataFrameExpm1, + DataFrameSqrt, + DataFrameNot, + DataFrameIsNan, + DataFrameIsInf, + DataFrameIsFinite, + DataFrameNegative, + # binary + DataFrameAdd, + DataFrameEqual, + DataFrameFloorDiv, + DataFrameGreater, + DataFrameGreaterEqual, + DataFrameLess, + DataFrameLessEqual, + DataFrameAnd, + DataFrameOr, + DataFrameXor, + DataFrameMod, + DataFrameMul, + DataFrameNotEqual, + DataFramePower, + DataFrameSubtract, + DataFrameTrueDiv, + ] + for ufunc_op in ufunc_ops: + register_tensor_ufunc(ufunc_op) + + for entity in DATAFRAME_TYPE + SERIES_TYPE: + setattr(entity, "__abs__", abs_) + setattr(entity, "abs", abs_) + _register_method(entity, "round", around) + setattr(entity, "__invert__", invert) + + setattr(entity, "__add__", wrap_notimplemented_exception(add)) + setattr(entity, "__radd__", wrap_notimplemented_exception(radd)) + _register_bin_method(entity, "add", add) + _register_bin_method(entity, "radd", radd) + + setattr(entity, "__sub__", wrap_notimplemented_exception(subtract)) + setattr(entity, "__rsub__", wrap_notimplemented_exception(rsubtract)) + _register_bin_method(entity, "sub", subtract) + _register_bin_method(entity, "rsub", rsubtract) + + setattr(entity, "__mul__", wrap_notimplemented_exception(mul)) + setattr(entity, "__rmul__", 
wrap_notimplemented_exception(rmul)) + _register_bin_method(entity, "mul", mul) + _register_bin_method(entity, "multiply", mul) + _register_bin_method(entity, "rmul", rmul) + + setattr(entity, "__floordiv__", wrap_notimplemented_exception(floordiv)) + setattr(entity, "__rfloordiv__", wrap_notimplemented_exception(rfloordiv)) + setattr(entity, "__truediv__", wrap_notimplemented_exception(truediv)) + setattr(entity, "__rtruediv__", wrap_notimplemented_exception(rtruediv)) + setattr(entity, "__div__", wrap_notimplemented_exception(truediv)) + setattr(entity, "__rdiv__", wrap_notimplemented_exception(rtruediv)) + _register_bin_method(entity, "floordiv", floordiv) + _register_bin_method(entity, "rfloordiv", rfloordiv) + _register_bin_method(entity, "truediv", truediv) + _register_bin_method(entity, "rtruediv", rtruediv) + _register_bin_method(entity, "div", truediv) + _register_bin_method(entity, "rdiv", rtruediv) + + setattr(entity, "__mod__", wrap_notimplemented_exception(mod)) + setattr(entity, "__rmod__", wrap_notimplemented_exception(rmod)) + _register_bin_method(entity, "mod", mod) + _register_bin_method(entity, "rmod", rmod) + + setattr(entity, "__pow__", wrap_notimplemented_exception(power)) + setattr(entity, "__rpow__", wrap_notimplemented_exception(rpower)) + _register_bin_method(entity, "pow", power) + _register_bin_method(entity, "rpow", rpower) + + setattr(entity, "__eq__", _wrap_eq()) + setattr(entity, "__ne__", _wrap_comparison(ne)) + setattr(entity, "__lt__", _wrap_comparison(lt)) + setattr(entity, "__gt__", _wrap_comparison(gt)) + setattr(entity, "__ge__", _wrap_comparison(ge)) + setattr(entity, "__le__", _wrap_comparison(le)) + _register_bin_method(entity, "eq", eq) + _register_bin_method(entity, "ne", ne) + _register_bin_method(entity, "lt", lt) + _register_bin_method(entity, "gt", gt) + _register_bin_method(entity, "ge", ge) + _register_bin_method(entity, "le", le) + + setattr(entity, "__matmul__", dot) + setattr(entity, "__rmatmul__", rdot) + _register_method(entity, "dot", dot) + + setattr(entity, "__and__", wrap_notimplemented_exception(bitand)) + setattr(entity, "__rand__", wrap_notimplemented_exception(rbitand)) + + setattr(entity, "__or__", wrap_notimplemented_exception(bitor)) + setattr(entity, "__ror__", wrap_notimplemented_exception(rbitor)) + + setattr(entity, "__xor__", wrap_notimplemented_exception(bitxor)) + setattr(entity, "__rxor__", wrap_notimplemented_exception(rbitxor)) + + setattr(entity, "__neg__", wrap_notimplemented_exception(negative)) + + for entity in INDEX_TYPE: + setattr(entity, "__eq__", _wrap_eq()) + + if PdOpsMixin is not None and not hasattr( + pd, "_mars_df_arith_wrapped" + ): # pragma: no branch + # wrap pandas magic functions to intercept reverse operands + for magic_name in [ + "add", + "sub", + "mul", + "div", + "truediv", + "floordiv", + "mod", + "pow", + "and", + "or", + "xor", + "eq", + "ne", + "lt", + "le", + "gt", + "ge", + ]: + _wrap_pandas_magics(PdOpsMixin, magic_name) + + for pd_cls in (pd.DataFrame, pd.Series): + _wrap_pandas_magics(pd_cls, "matmul") + + pd._mars_df_arith_wrapped = True + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/arithmetic/abs.py b/python/xorbits/_mars/dataframe/arithmetic/abs.py new file mode 100644 index 000000000..8b706adae --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/abs.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameAbs(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ABS + _func_name = "abs" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorAbsolute + + return TensorAbsolute + + +def abs_(df): + op = DataFrameAbs() + return op(df) diff --git a/python/xorbits/_mars/dataframe/arithmetic/add.py b/python/xorbits/_mars/dataframe/arithmetic/add.py new file mode 100644 index 000000000..9c661f402 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/add.py @@ -0,0 +1,60 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameAdd(DataFrameBinopUfunc): + _op_type_ = OperandDef.ADD + + _func_name = "add" + _rfunc_name = "radd" + + @classproperty + def _operator(self): + return operator.add + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorAdd + + return TensorAdd + + +_add_example = """ +>>> a.add(b, fill_value=0).execute() +a 2.0 +b 1.0 +c 1.0 +d 1.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Addition", equiv="+", series_example=_add_example) +def add(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameAdd(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +@bin_arithmetic_doc("Addition", equiv="+", series_example=_add_example) +def radd(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameAdd(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/arccos.py b/python/xorbits/_mars/dataframe/arithmetic/arccos.py new file mode 100644 index 000000000..3cd366e39 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arccos.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
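
Editor's note: for reference, the eager pandas behaviour that `add`/`radd` reproduce chunk by chunk can be seen directly with the same data as `_add_example`; this illustration uses pandas only and is not part of the module.

import numpy as np
import pandas as pd

a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"])
b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"])

# indices are unioned; a value missing on one side is treated as fill_value,
# while positions missing in both inputs (index "e") stay NaN
print(a.add(b, fill_value=0))
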
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArccos(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCCOS + _func_name = "arccos" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArccos + + return TensorArccos diff --git a/python/xorbits/_mars/dataframe/arithmetic/arccosh.py b/python/xorbits/_mars/dataframe/arithmetic/arccosh.py new file mode 100644 index 000000000..f2612b3d4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arccosh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArccosh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCCOSH + _func_name = "arccosh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArccosh + + return TensorArccosh diff --git a/python/xorbits/_mars/dataframe/arithmetic/arcsin.py b/python/xorbits/_mars/dataframe/arithmetic/arcsin.py new file mode 100644 index 000000000..8da1008c4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arcsin.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArcsin(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCSIN + _func_name = "arcsin" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArcsin + + return TensorArcsin diff --git a/python/xorbits/_mars/dataframe/arithmetic/arcsinh.py b/python/xorbits/_mars/dataframe/arithmetic/arcsinh.py new file mode 100644 index 000000000..4918323b1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arcsinh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
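
Editor's note: these unary operands all follow the same recipe — `_func_name` names the pandas/numpy function to apply and `tensor_op_type` points at the matching Mars tensor operand. Since pandas objects expose no `arcsin` method, execution falls back to the numpy ufunc (see `DataFrameUnaryOpMixin.execute` in `core.py` later in this diff). An eager sketch of that per-chunk computation:

import numpy as np
import pandas as pd

s = pd.Series([0.0, 0.5, 1.0])
print(hasattr(s, "arcsin"))   # False, so the numpy fallback branch applies
print(np.arcsin(s))           # element-wise arcsine, still a pandas Series
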
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArcsinh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCSINH + _func_name = "arcsinh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArcsinh + + return TensorArcsinh diff --git a/python/xorbits/_mars/dataframe/arithmetic/arctan.py b/python/xorbits/_mars/dataframe/arithmetic/arctan.py new file mode 100644 index 000000000..f3a45aba6 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arctan.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArctan(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCTAN + _func_name = "arctan" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArctan + + return TensorArctan diff --git a/python/xorbits/_mars/dataframe/arithmetic/arctanh.py b/python/xorbits/_mars/dataframe/arithmetic/arctanh.py new file mode 100644 index 000000000..e4f9698d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arctanh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArctanh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCTANH + _func_name = "arctanh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArctanh + + return TensorArctanh diff --git a/python/xorbits/_mars/dataframe/arithmetic/around.py b/python/xorbits/_mars/dataframe/arithmetic/around.py new file mode 100644 index 000000000..d41f1a409 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/around.py @@ -0,0 +1,167 @@ +# Copyright 2022-2023 XProbe Inc. 
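
Editor's note: `classproperty` is imported from `...utils`; it is assumed here to behave like the minimal descriptor below (a property readable on the class itself), which is how `tensor_op_type` and `_operator` are consumed. Sketch only, not the actual helper.

class classproperty:
    def __init__(self, fget):
        self.fget = fget

    def __get__(self, obj, owner=None):
        # resolve against the class, whether accessed on the class or an instance
        return self.fget(owner if owner is not None else type(obj))

class Demo:
    @classproperty
    def answer(cls):
        return 42

print(Demo.answer)     # 42, no instance needed
print(Demo().answer)   # 42
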
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import Int32Field +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameAround(DataFrameUnaryUfunc): + _op_type_ = OperandDef.AROUND + _func_name = "around" + + _decimals = Int32Field("decimals") + + def __init__(self, decimals=None, output_types=None, **kw): + super().__init__(_decimals=decimals, output_types=output_types, **kw) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorAround + + return TensorAround + + @property + def decimals(self): + return self._decimals + + @classmethod + def execute(cls, ctx, op): + df = ctx[op.inputs[0].key] + func_name = getattr(cls, "_func_name") + if hasattr(df, func_name): + ctx[op.outputs[0].key] = getattr(df, func_name)(decimals=op.decimals) + else: + ctx[op.outputs[0].key] = getattr(np, func_name)(df, decimals=op.decimals) + + +def around(df, decimals=0, *args, **kwargs): + if len(args) > 0: + raise TypeError( + f"round() takes 0 positional arguments but {len(args)} was given" + ) + op = DataFrameAround(decimals=decimals, **kwargs) + return op(df) + + +around.__frame_doc__ = """ +Round a DataFrame to a variable number of decimal places. + +Parameters +---------- +decimals : int, dict, Series + Number of decimal places to round each column to. If an int is + given, round each column to the same number of places. + Otherwise dict and Series round to variable numbers of places. + Column names should be in the keys if `decimals` is a + dict-like, or in the index if `decimals` is a Series. Any + columns not included in `decimals` will be left as is. Elements + of `decimals` which are not columns of the input will be + ignored. +*args + Additional keywords have no effect but might be accepted for + compatibility with numpy. +**kwargs + Additional keywords have no effect but might be accepted for + compatibility with numpy. + +Returns +------- +DataFrame + A DataFrame with the affected columns rounded to the specified + number of decimal places. + +See Also +-------- +numpy.around : Round a numpy array to the given number of decimals. +Series.round : Round a Series to the given number of decimals. + +Examples +-------- +>>> import mars.dataframe as md +>>> df = md.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], +... 
columns=['dogs', 'cats']) +>>> df.execute() + dogs cats +0 0.21 0.32 +1 0.01 0.67 +2 0.66 0.03 +3 0.21 0.18 + +By providing an integer each column is rounded to the same number +of decimal places + +>>> df.round(1).execute() + dogs cats +0 0.2 0.3 +1 0.0 0.7 +2 0.7 0.0 +3 0.2 0.2 + +With a dict, the number of places for specific columns can be +specified with the column names as key and the number of decimal +places as value + +>>> df.round({'dogs': 1, 'cats': 0}).execute() + dogs cats +0 0.2 0.0 +1 0.0 1.0 +2 0.7 0.0 +3 0.2 0.0 + +Using a Series, the number of places for specific columns can be +specified with the column names as index and the number of +decimal places as value + +>>> decimals = md.Series([0, 1], index=['cats', 'dogs']) +>>> df.round(decimals).execute() + dogs cats +0 0.2 0.0 +1 0.0 1.0 +2 0.7 0.0 +3 0.2 0.0 +""" +around.__series_doc__ = """ +Round each value in a Series to the given number of decimals. + +Parameters +---------- +decimals : int, default 0 + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal point. + +Returns +------- +Series + Rounded values of the Series. + +See Also +-------- +numpy.around : Round values of an np.array. +DataFrame.round : Round values of a DataFrame. + +Examples +-------- +>>> import mars.tensor as mt +>>> import mars.dataframe as md +>>> s = md.Series([0.1, 1.3, 2.7]) +>>> s.round().execute() +0 0.0 +1 1.0 +2 3.0 +dtype: float64 +""" diff --git a/python/xorbits/_mars/dataframe/arithmetic/bitwise_and.py b/python/xorbits/_mars/dataframe/arithmetic/bitwise_and.py new file mode 100644 index 000000000..71970ab6b --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/bitwise_and.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc + + +class DataFrameAnd(DataFrameBinopUfunc): + _op_type_ = OperandDef.AND + + _bit_func_name = "__and__" + _bit_rfunc_name = "__rand__" + + @classproperty + def _operator(self): + return operator.and_ + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorBitand + + return TensorBitand + + +def bitand(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameAnd(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +def rbitand(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameAnd(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/bitwise_or.py b/python/xorbits/_mars/dataframe/arithmetic/bitwise_or.py new file mode 100644 index 000000000..eb12d4964 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/bitwise_or.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. 
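
Editor's note on `DataFrameAround.execute` above: pandas objects expose `round` but not `around`, so the `hasattr` check fails and the numpy branch is taken. The eager per-chunk equivalent, as an illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({"dogs": [0.21, 0.01], "cats": [0.32, 0.67]})
print(hasattr(df, "around"))        # False -> numpy fallback branch
print(np.around(df, decimals=1))    # matches df.round(1)
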
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import TreeReductionBuilder, classproperty +from .core import DataFrameArithmeticTreeMixin, DataFrameBinopUfunc + + +class DataFrameOr(DataFrameBinopUfunc): + _op_type_ = OperandDef.OR + + _bit_func_name = "__or__" + _bit_rfunc_name = "__ror__" + + @classproperty + def _operator(self): + return operator.or_ + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorBitor + + return TensorBitor + + +class DataFrameTreeOr(DataFrameArithmeticTreeMixin, DataFrameOr): + _op_type_ = OperandDef.TREE_OR + + +def bitor(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameOr(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +def rbitor(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameOr(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) + + +def tree_dataframe_or( + *args, index=None, combine_size=None, axis="columns", level=None, fill_value=None +): + class MultiplyBuilder(TreeReductionBuilder): + def _build_reduction(self, inputs, final=False): + op = DataFrameTreeOr( + axis=axis, + level=level, + fill_value=fill_value, + output_types=inputs[0].op.output_types, + ) + params = inputs[0].params.copy() + params["index"] = index + return op.new_chunk(inputs, **params) + + return MultiplyBuilder(combine_size).build(args) diff --git a/python/xorbits/_mars/dataframe/arithmetic/bitwise_xor.py b/python/xorbits/_mars/dataframe/arithmetic/bitwise_xor.py new file mode 100644 index 000000000..14230dfb6 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/bitwise_xor.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... 
import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc + + +class DataFrameXor(DataFrameBinopUfunc): + _op_type_ = OperandDef.XOR + + _bit_func_name = "__xor__" + _bit_rfunc_name = "__rxor__" + + @classproperty + def _operator(self): + return operator.xor + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorBitxor + + return TensorBitxor + + +def bitxor(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameXor(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +def rbitxor(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameXor(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/ceil.py b/python/xorbits/_mars/dataframe/arithmetic/ceil.py new file mode 100644 index 000000000..f277b1b8f --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/ceil.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameCeil(DataFrameUnaryUfunc): + _op_type_ = OperandDef.CEIL + _func_name = "ceil" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorCeil + + return TensorCeil diff --git a/python/xorbits/_mars/dataframe/arithmetic/core.py b/python/xorbits/_mars/dataframe/arithmetic/core.py new file mode 100644 index 000000000..7fb06ecca --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/core.py @@ -0,0 +1,832 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
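
Editor's note: `DataFrameTreeOr` above combines its inputs with a `TreeReductionBuilder`, i.e. a tree reduction whose fan-in is bounded by `combine_size` rather than one long sequential chain. A generic, self-contained sketch of that idea (not the Mars API):

from functools import reduce
import operator

def tree_reduce(items, op, combine_size=4):
    # repeatedly combine groups of at most `combine_size` inputs until one value remains
    while len(items) > 1:
        items = [
            reduce(op, items[i:i + combine_size])
            for i in range(0, len(items), combine_size)
        ]
    return items[0]

print(tree_reduce([1, 2, 4, 8, 16, 32, 64], operator.or_))   # 127
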
+ +import copy +import itertools +from functools import reduce + +import numpy as np +import pandas as pd + +from ...core import CHUNK_TYPE, ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField +from ...tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, Chunk, ChunkData +from ...utils import classproperty, get_dtype +from ..align import ( + align_dataframe_dataframe, + align_dataframe_series, + align_series_series, +) +from ..core import ( + DATAFRAME_CHUNK_TYPE, + DATAFRAME_TYPE, + SERIES_CHUNK_TYPE, + SERIES_TYPE, + is_chunk_meta_lazy, +) +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..ufunc.tensor import TensorUfuncMixin +from ..utils import ( + build_empty_df, + infer_dtype, + infer_dtypes, + infer_index_value, + parse_index, +) + + +class DataFrameBinOpMixin(DataFrameOperandMixin): + @classmethod + def _tile_both_dataframes(cls, op): + # if both of the inputs are DataFrames, axis is just ignored + left, right = op.lhs, op.rhs + df = op.outputs[0] + + nsplits, out_shapes, left_chunks, right_chunks = align_dataframe_dataframe( + left, right + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shapes[0])) + + out_chunks = [] + for idx, left_chunk, right_chunk in zip( + out_chunk_indexes, left_chunks, right_chunks + ): + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + [left_chunk, right_chunk], + shape=(nsplits[0][idx[0]], nsplits[1][idx[1]]), + index=idx, + ) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_both_series(cls, op): + left, right = op.lhs, op.rhs + df = op.outputs[0] + + nsplits, out_shape, left_chunks, right_chunks = align_series_series(left, right) + + out_chunks = [] + for idx, left_chunk, right_chunk in zip( + range(out_shape[0]), left_chunks, right_chunks + ): + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + [left_chunk, right_chunk], shape=(nsplits[0][idx],), index=(idx,) + ) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + + @classmethod + def _tile_dataframe_series(cls, op): + left, right = op.lhs, op.rhs + df = op.outputs[0] + + nsplits, out_shape, left_chunks, right_chunks = align_dataframe_series( + left, right, axis=op.axis + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape)) + + out_chunks = [] + for out_idx, df_chunk in zip(out_chunk_indexes, left_chunks): + if op.axis == "columns" or op.axis == 1: + series_chunk = right_chunks[out_idx[1]] + kw = { + "shape": (nsplits[0][out_idx[0]], nsplits[1][out_idx[1]]), + "index_value": df_chunk.index_value, + "dtypes_value": df_chunk.dtypes_value, + } + else: + series_chunk = right_chunks[out_idx[0]] + kw = { + "shape": (nsplits[0][out_idx[0]], nsplits[1][out_idx[1]]), + "columns_value": df_chunk.columns_value, + "dtypes_value": df_chunk.dtypes_value, + } + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([df_chunk, series_chunk], index=out_idx, **kw) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + 
index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_series_dataframe(cls, op): + left, right = op.lhs, op.rhs + df = op.outputs[0] + + nsplits, out_shape, right_chunks, left_chunks = align_dataframe_series( + right, left, axis=op.axis + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape)) + + out_chunks = [] + for out_idx, df_chunk in zip(out_chunk_indexes, right_chunks): + if op.axis == "columns" or op.axis == 1: + series_chunk = left_chunks[out_idx[1]] + kw = { + "shape": (df_chunk.shape[0], np.nan), + "index_value": df_chunk.index_value, + "dtypes_value": df_chunk.dtypes_value, + } + else: + series_chunk = left_chunks[out_idx[0]] + kw = { + "shape": (df_chunk.shape[0], np.nan), + "index_value": df_chunk.index_value, + "dtypes_value": df_chunk.dtypes_value, + } + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([series_chunk, df_chunk], index=out_idx, **kw) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_scalar(cls, op): + tileable = op.rhs if pd.api.types.is_scalar(op.lhs) else op.lhs + df = op.outputs[0] + out_chunks = [] + lazy_chunk_meta = is_chunk_meta_lazy(tileable.chunks[0]) + for chunk in tileable.chunks: + out_op = op.copy().reset_key() + if chunk.ndim == 2: + if lazy_chunk_meta: + out_chunk = out_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + ) + out_chunk._set_tileable_meta( + tileable_key=df.key, + nsplits=tileable.nsplits, + index_value=df.index_value, + columns_value=df.columns_value, + dtypes=df.dtypes, + ) + else: + out_chunk = out_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + dtypes=chunk.dtypes, + index_value=chunk.index_value, + columns_value=getattr(chunk, "columns_value"), + ) + else: + if lazy_chunk_meta: + out_chunk = out_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + dtype=chunk.dtype, + name=getattr(chunk, "name"), + ) + out_chunk._set_tileable_meta( + tileable_key=df.key, + nsplits=tileable.nsplits, + index_value=df.index_value, + ) + else: + out_chunk = out_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + dtype=chunk.dtype, + index_value=chunk.index_value, + name=getattr(chunk, "name"), + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = df.params.copy() + params["chunks"] = out_chunks + params["nsplits"] = tileable.nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _tile_with_tensor(cls, op): + out = op.outputs[0] + axis = op.axis + if axis is None: + axis = 0 + + rhs_is_tensor = isinstance(op.rhs, TENSOR_TYPE) + tensor, other = (op.rhs, op.lhs) if rhs_is_tensor else (op.lhs, op.rhs) + if tensor.shape == other.shape: + tensor = yield from recursive_tile(tensor.rechunk(other.nsplits)) + else: + # shape differs only when dataframe add 1-d tensor, we need rechunk on columns axis. 
+ if axis in ["columns", 1] and other.ndim == 1: + # force axis == 0 if it's Series other than DataFrame + axis = 0 + rechunk_size = ( + other.nsplits[1] if axis == "columns" or axis == 1 else other.nsplits[0] + ) + if tensor.ndim > 0: + tensor = yield from recursive_tile(tensor.rechunk((rechunk_size,))) + + out_chunks = [] + for out_index in itertools.product(*(map(range, other.chunk_shape))): + tensor_chunk = tensor.cix[out_index[: tensor.ndim]] + other_chunk = other.cix[out_index] + out_op = op.copy().reset_key() + inputs = ( + [other_chunk, tensor_chunk] + if rhs_is_tensor + else [tensor_chunk, other_chunk] + ) + if isinstance(other_chunk, DATAFRAME_CHUNK_TYPE): + cum_splits = [0] + np.cumsum(other.nsplits[1]).tolist() + start = cum_splits[out_index[1]] + end = cum_splits[out_index[1] + 1] + chunk_dtypes = out.dtypes.iloc[start:end] + out_chunk = out_op.new_chunk( + inputs, + shape=other_chunk.shape, + index=other_chunk.index, + dtypes=chunk_dtypes, + index_value=other_chunk.index_value, + columns_value=other_chunk.columns_value, + ) + else: + out_chunk = out_op.new_chunk( + inputs, + shape=other_chunk.shape, + index=other_chunk.index, + dtype=out.dtype, + index_value=other_chunk.index_value, + name=other_chunk.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + if isinstance(other, SERIES_TYPE): + return new_op.new_seriess( + op.inputs, + other.shape, + nsplits=other.nsplits, + dtype=out.dtype, + name=other.name, + index_value=other.index_value, + chunks=out_chunks, + ) + else: + return new_op.new_dataframes( + op.inputs, + other.shape, + nsplits=other.nsplits, + dtypes=out.dtypes, + index_value=other.index_value, + columns_value=other.columns_value, + chunks=out_chunks, + ) + + @classmethod + def tile(cls, op): + if len(op.inputs) < 2: + return cls._tile_scalar(op) + elif isinstance(op.inputs[0], DATAFRAME_TYPE) and isinstance( + op.inputs[1], DATAFRAME_TYPE + ): + return cls._tile_both_dataframes(op) + elif isinstance(op.inputs[0], SERIES_TYPE) and isinstance( + op.inputs[1], SERIES_TYPE + ): + return cls._tile_both_series(op) + elif isinstance(op.inputs[0], DATAFRAME_TYPE) and isinstance( + op.inputs[1], SERIES_TYPE + ): + return cls._tile_dataframe_series(op) + elif isinstance(op.inputs[0], SERIES_TYPE) and isinstance( + op.inputs[1], DATAFRAME_TYPE + ): + return cls._tile_series_dataframe(op) + elif isinstance(op.inputs[0], TENSOR_TYPE) or isinstance( + op.inputs[1], TENSOR_TYPE + ): + return (yield from cls._tile_with_tensor(op)) + + @classmethod + def execute(cls, ctx, op): + if getattr(cls, "_func_name", None) is not None: + if len(op.inputs) == 2: + df, other = ctx[op.inputs[0].key], ctx[op.inputs[1].key] + if isinstance(op.inputs[0], SERIES_CHUNK_TYPE) and isinstance( + op.inputs[1], DATAFRAME_CHUNK_TYPE + ): + df, other = other, df + func_name = getattr(cls, "_rfunc_name") + else: + func_name = getattr(cls, "_func_name") + elif pd.api.types.is_scalar(op.lhs) or isinstance(op.lhs, np.ndarray): + df = ctx[op.rhs.key] + other = op.lhs + func_name = getattr(cls, "_rfunc_name") + else: + df = ctx[op.lhs.key] + other = op.rhs + func_name = getattr(cls, "_func_name") + if df.ndim == 2: + kw = dict(axis=op.axis) + else: + kw = dict() + if op.fill_value is not None: + # comparison function like eq does not have `fill_value` + kw["fill_value"] = op.fill_value + if op.level is not None: + # logical function like and may don't have `level` (for Series type) + kw["level"] = op.level + if hasattr(other, "ndim") and other.ndim == 0: + other = other.item() + 
ctx[op.outputs[0].key] = getattr(df, func_name)(other, **kw) + else: + inputs_iter = iter(op.inputs) + if not pd.api.types.is_scalar(op.lhs): + lhs = ctx[next(inputs_iter).key] + else: + lhs = op.lhs + if not pd.api.types.is_scalar(op.rhs): + rhs = ctx[next(inputs_iter).key] + else: + rhs = op.rhs + ctx[op.outputs[0].key] = cls._operator( + lhs, rhs + ) # pylint: disable=too-many-function-args + + @classproperty + def _operator(self): + raise NotImplementedError + + @classmethod + def _calc_properties(cls, x1, x2=None, axis="columns"): + is_chunk = isinstance(x1, CHUNK_TYPE) + + if isinstance(x1, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE)) and ( + x2 is None + or pd.api.types.is_scalar(x2) + or isinstance(x2, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)) + ): + if not is_chunk: + if pd.api.types.is_scalar(x2): + dtypes = cls._operator(build_empty_df(x1.dtypes), x2).dtypes + elif x1.dtypes is not None and isinstance(x2, TENSOR_TYPE): + dtypes = pd.Series( + [infer_dtype(dt, x2.dtype, cls._operator) for dt in x1.dtypes], + index=x1.dtypes.index, + ) + else: # pragma: no cover + dtypes = x1.dtypes + return { + "shape": x1.shape, + "dtypes": dtypes, + "columns_value": x1.columns_value, + "index_value": x1.index_value, + } + else: + return {"shape": x1.shape} + + if isinstance(x1, (SERIES_TYPE, SERIES_CHUNK_TYPE)) and ( + x2 is None + or pd.api.types.is_scalar(x2) + or isinstance(x2, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)) + ): + x2_dtype = x2.dtype if hasattr(x2, "dtype") else type(x2) + x2_dtype = get_dtype(x2_dtype) + if hasattr(cls, "return_dtype"): + dtype = cls.return_dtype + else: + dtype = infer_dtype(x1.dtype, x2_dtype, cls._operator) + ret = {"shape": x1.shape, "dtype": dtype} + if pd.api.types.is_scalar(x2) or ( + hasattr(x2, "ndim") and (x2.ndim == 0 or x2.ndim == 1) + ): + ret["name"] = x1.name + if not is_chunk: + ret["index_value"] = x1.index_value + return ret + + if isinstance(x1, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE)) and isinstance( + x2, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE) + ): + index_shape, column_shape, dtypes, columns, index = ( + np.nan, + np.nan, + None, + None, + None, + ) + + if ( + x1.columns_value is not None + and x2.columns_value is not None + and x1.columns_value.key == x2.columns_value.key + ): + dtypes = pd.Series( + [ + infer_dtype(dt1, dt2, cls._operator) + for dt1, dt2 in zip(x1.dtypes, x2.dtypes) + ], + index=x1.dtypes.index, + ) + columns = copy.copy(x1.columns_value) + column_shape = len(dtypes) + elif x1.dtypes is not None and x2.dtypes is not None: + dtypes = infer_dtypes(x1.dtypes, x2.dtypes, cls._operator) + columns = parse_index(dtypes.index, store_data=True) + column_shape = len(dtypes) + if x1.index_value is not None and x2.index_value is not None: + if x1.index_value.key == x2.index_value.key: + index = copy.copy(x1.index_value) + index_shape = x1.shape[0] + else: + index = infer_index_value(x1.index_value, x2.index_value) + if index.key == x1.index_value.key == x2.index_value.key and ( + not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0]) + ): + index_shape = ( + x1.shape[0] if not np.isnan(x1.shape[0]) else x2.shape[0] + ) + + return { + "shape": (index_shape, column_shape), + "dtypes": dtypes, + "columns_value": columns, + "index_value": index, + } + + if isinstance(x1, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE)) and isinstance( + x2, (SERIES_TYPE, SERIES_CHUNK_TYPE) + ): + if axis == "columns" or axis == 1: + index_shape = x1.shape[0] + index = x1.index_value + column_shape, dtypes, columns = np.nan, None, None + if x1.columns_value is not None and 
x1.index_value is not None: + if x1.columns_value.key == x2.index_value.key: + dtypes = pd.Series( + [ + infer_dtype(dt, x2.dtype, cls._operator) + for dt in x1.dtypes + ], + index=x1.dtypes.index, + ) + columns = copy.copy(x1.columns_value) + column_shape = len(dtypes) + else: # pragma: no cover + dtypes = x1.dtypes # FIXME + columns = infer_index_value(x1.columns_value, x2.index_value) + column_shape = np.nan + else: + assert axis == "index" or axis == 0 + column_shape = x1.shape[1] + columns = x1.columns_value + dtypes = x1.dtypes + index_shape, index = np.nan, None + if x1.index_value is not None and x1.index_value is not None: + if x1.index_value.key == x2.index_value.key: + dtypes = pd.Series( + [ + infer_dtype(dt, x2.dtype, cls._operator) + for dt in x1.dtypes + ], + index=x1.dtypes.index, + ) + index = copy.copy(x1.index_value) + index_shape = x1.shape[0] + else: + if x1.dtypes is not None: + dtypes = pd.Series( + [ + infer_dtype(dt, x2.dtype, cls._operator) + for dt in x1.dtypes + ], + index=x1.dtypes.index, + ) + index = infer_index_value(x1.index_value, x2.index_value) + index_shape = np.nan + return { + "shape": (index_shape, column_shape), + "dtypes": dtypes, + "columns_value": columns, + "index_value": index, + } + + if isinstance(x1, (SERIES_TYPE, SERIES_CHUNK_TYPE)) and isinstance( + x2, (SERIES_TYPE, SERIES_CHUNK_TYPE) + ): + index_shape, dtype, index = np.nan, None, None + + dtype = infer_dtype(x1.dtype, x2.dtype, cls._operator) + if x1.index_value is not None and x2.index_value is not None: + if x1.index_value.key == x2.index_value.key: + index = copy.copy(x1.index_value) + index_shape = x1.shape[0] + else: + index = infer_index_value(x1.index_value, x2.index_value) + if index.key == x1.index_value.key == x2.index_value.key and ( + not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0]) + ): + index_shape = ( + x1.shape[0] if not np.isnan(x1.shape[0]) else x2.shape[0] + ) + + ret = {"shape": (index_shape,), "dtype": dtype, "index_value": index} + if x1.name == x2.name: + ret["name"] = x1.name + return ret + + raise NotImplementedError("Unknown combination of parameters") + + def _new_chunks(self, inputs, kws=None, **kw): + property_inputs = [ + inp + for inp in inputs + if isinstance( + inp, (DATAFRAME_CHUNK_TYPE, SERIES_CHUNK_TYPE, TENSOR_CHUNK_TYPE) + ) + ] + # use first two to infer(for tree operand) + property_inputs = property_inputs[:2] + if len(property_inputs) == 1: + properties = self._calc_properties(*property_inputs) + elif any(inp.ndim == 2 for inp in property_inputs): + df1, df2 = ( + property_inputs + if isinstance(property_inputs[0], DATAFRAME_CHUNK_TYPE) + else reversed(property_inputs) + ) + properties = self._calc_properties(df1, df2, axis=self.axis) + else: + if property_inputs[0].ndim < property_inputs[1].ndim or isinstance( + property_inputs[0], (TENSOR_TYPE, TENSOR_CHUNK_TYPE) + ): + property_inputs = reversed(property_inputs) + properties = self._calc_properties(*property_inputs) + + inputs = [inp for inp in inputs if isinstance(inp, (Chunk, ChunkData))] + + shape = properties.pop("shape") + if "shape" in kw: + shape = kw.pop("shape") + + for prop, value in properties.items(): + if kw.get(prop, None) is None: + kw[prop] = value + + return super()._new_chunks(inputs, shape=shape, kws=kws, **kw) + + def _check_inputs(self, x1, x2): + if isinstance(x1, TENSOR_TYPE) or isinstance(x2, TENSOR_TYPE): + tensor, other = (x1, x2) if isinstance(x1, TENSOR_TYPE) else (x2, x1) + if isinstance(other, DATAFRAME_TYPE): + if self.axis == "index" or self.axis == 0: + 
other_shape = tuple(reversed(other.shape)) + else: + other_shape = other.shape + if tensor.ndim == 2 and tensor.shape != other_shape: + raise ValueError( + f"Unable to coerce to DataFrame, shape must be {other_shape}: " + f"given {tensor.shape}" + ) + elif tensor.ndim == 1 and tensor.shape[0] != other_shape[1]: + raise ValueError( + f"Unable to coerce to Series, length must be {other_shape[1]}: " + f"given {tensor.shape[0]}" + ) + elif tensor.ndim > 2: + raise ValueError( + "Unable to coerce to Series/DataFrame, dim must be <= 2" + ) + if isinstance(other, SERIES_TYPE): + if tensor.ndim == 1 and (tensor.shape[0] != other.shape[0]): + raise ValueError( + f"Unable to coerce to Series, length must be {other.shape[0]}: " + f"given {tensor.shape[0]}" + ) + elif tensor.ndim > 1: + raise ValueError("Unable to coerce to Series, dim must be 1") + + def _call(self, x1, x2): + self._check_inputs(x1, x2) + if isinstance(x1, DATAFRAME_TYPE) or isinstance(x2, DATAFRAME_TYPE): + df1, df2 = (x1, x2) if isinstance(x1, DATAFRAME_TYPE) else (x2, x1) + kw = self._calc_properties(df1, df2, axis=self.axis) + if not pd.api.types.is_scalar(df2): + return self.new_dataframe([x1, x2], **kw) + else: + return self.new_dataframe([df1], **kw) + if isinstance(x1, SERIES_TYPE) or isinstance(x2, SERIES_TYPE): + s1, s2 = (x1, x2) if isinstance(x1, SERIES_TYPE) else (x2, x1) + kw = self._calc_properties(s1, s2) + if not pd.api.types.is_scalar(s2): + return self.new_series([x1, x2], **kw) + else: + return self.new_series([s1], **kw) + raise NotImplementedError( + "Only support add dataframe, series or scalar for now" + ) + + def __call__(self, x1, x2): + x1 = self._process_input(x1) + x2 = self._process_input(x2) + if isinstance(x1, SERIES_TYPE) and isinstance(x2, DATAFRAME_TYPE): + # reject invoking series's op on dataframe + raise NotImplementedError + return self._call(x1, x2) + + def rcall(self, x1, x2): + x1 = self._process_input(x1) + x2 = self._process_input(x2) + if isinstance(x1, SERIES_TYPE) and isinstance(x2, DATAFRAME_TYPE): + # reject invoking series's op on dataframe + raise NotImplementedError + return self._call(x2, x1) + + +class DataFrameBinOp(DataFrameOperand, DataFrameBinOpMixin): + axis = AnyField("axis", default=None) + level = AnyField("level", default=None) + fill_value = AnyField("fill_value", default=None) + lhs = AnyField("lhs") + rhs = AnyField("rhs") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if len(self._inputs) == 2: + self.lhs = self._inputs[0] + self.rhs = self._inputs[1] + else: + if isinstance(self.lhs, ENTITY_TYPE): + self.lhs = self._inputs[0] + elif pd.api.types.is_scalar(self.lhs): + self.rhs = self._inputs[0] + + +class DataFrameUnaryOpMixin(DataFrameOperandMixin): + __slots__ = () + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + + out_chunks = [] + index_dtypes_cache = dict() + for in_chunk in in_df.chunks: + out_op = op.copy().reset_key() + if out_df.ndim == 2: + try: + dtypes = index_dtypes_cache[in_chunk.index[1]] + except KeyError: + dtypes = out_df.dtypes[in_chunk.columns_value.to_pandas()] + index_dtypes_cache[in_chunk.index[1]] = dtypes + + out_chunk = out_op.new_chunk( + [in_chunk], + shape=in_chunk.shape, + dtypes=dtypes, + index=in_chunk.index, + index_value=in_chunk.index_value, + columns_value=in_chunk.columns_value, + ) + else: + out_chunk = out_op.new_chunk( + [in_chunk], + shape=in_chunk.shape, + 
index=in_chunk.index, + dtype=in_chunk.dtype, + index_value=in_chunk.index_value, + name=in_chunk.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + kw = out_df.params + kw["nsplits"] = in_df.nsplits + kw["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[kw]) + + @classmethod + def execute(cls, ctx, op): + df = ctx[op.inputs[0].key] + func_name = getattr(cls, "_func_name") + if hasattr(df, func_name): + ctx[op.outputs[0].key] = getattr(df, func_name)() + else: + ctx[op.outputs[0].key] = getattr(np, func_name)(df) + + +class DataFrameUnaryOp(DataFrameOperand, DataFrameUnaryOpMixin): + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + @classmethod + def _get_output_dtype(cls, df): + if df.ndim == 2: + return df.dtypes + else: + return df.dtype + + def __call__(self, df): + self.output_types = df.op.output_types + if df.ndim == 2: + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=self._get_output_dtype(df), + columns_value=df.columns_value, + index_value=df.index_value, + ) + else: + series = df + return self.new_series( + [series], + shape=series.shape, + name=series.name, + index_value=series.index_value, + dtype=self._get_output_dtype(series), + ) + + +class DataFrameArithmeticTreeMixin: + @classmethod + def execute(cls, ctx, op): + inputs = [ctx[c.key] for c in op.inputs] + ctx[op.outputs[0].key] = reduce(op._operator, inputs) + + def _set_inputs(self, inputs): + inputs = self._get_inputs_data(inputs) + setattr(self, "_inputs", inputs) + + +class DataFrameUnaryUfunc(DataFrameUnaryOp, TensorUfuncMixin): + pass + + +class DataFrameBinopUfunc(DataFrameBinOp, TensorUfuncMixin): + pass diff --git a/python/xorbits/_mars/dataframe/arithmetic/cos.py b/python/xorbits/_mars/dataframe/arithmetic/cos.py new file mode 100644 index 000000000..0e66d4532 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/cos.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameCos(DataFrameUnaryUfunc): + _op_type_ = OperandDef.COS + _func_name = "cos" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorCos + + return TensorCos diff --git a/python/xorbits/_mars/dataframe/arithmetic/cosh.py b/python/xorbits/_mars/dataframe/arithmetic/cosh.py new file mode 100644 index 000000000..0df784e29 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/cosh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
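
Editor's note: `DataFrameUnaryOpMixin.tile` above is the embarrassingly parallel case — the operand is copied onto every input chunk and the chunk layout (`nsplits`) is passed through untouched. An eager sketch with plain pandas partitions standing in for chunks:

import numpy as np
import pandas as pd

chunks = [pd.Series([0.0, 0.5]), pd.Series([1.0, 1.5])]   # two row chunks
out_chunks = [np.cos(c) for c in chunks]                  # one output chunk per input chunk
print(pd.concat(out_chunks, ignore_index=True))
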
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameCosh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.COSH + _func_name = "cosh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorCosh + + return TensorCosh diff --git a/python/xorbits/_mars/dataframe/arithmetic/degrees.py b/python/xorbits/_mars/dataframe/arithmetic/degrees.py new file mode 100644 index 000000000..0ce1c5bf2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/degrees.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameDegrees(DataFrameUnaryUfunc): + _op_type_ = OperandDef.DEGREES + _func_name = "degrees" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorDegrees + + return TensorDegrees diff --git a/python/xorbits/_mars/dataframe/arithmetic/docstring.py b/python/xorbits/_mars/dataframe/arithmetic/docstring.py new file mode 100644 index 000000000..43434a3ce --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/docstring.py @@ -0,0 +1,442 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +_flex_doc_FRAME = """ +Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). +Equivalent to ``{equiv}``, but with support to substitute a fill_value +for missing data in one of the inputs. With reverse version, `{reverse}`. +Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to +arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + +Parameters +---------- +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}} + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). For Series input, axis to match Series index on. 
+level : int or label + Broadcast across a level, matching Index values on the + passed MultiIndex level. +fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed for + successful DataFrame alignment, with this value before computation. + If data in both corresponding DataFrame locations is missing + the result will be missing. + +Returns +------- +DataFrame + Result of the arithmetic operation. + +See Also +-------- +DataFrame.add : Add DataFrames. +DataFrame.sub : Subtract DataFrames. +DataFrame.mul : Multiply DataFrames. +DataFrame.div : Divide DataFrames (float division). +DataFrame.truediv : Divide DataFrames (float division). +DataFrame.floordiv : Divide DataFrames (integer division). +DataFrame.mod : Calculate modulo (remainder after division). +DataFrame.pow : Calculate exponential power. + +Notes +----- +Mismatched indices will be unioned together. + +Examples +-------- +>>> import mars.dataframe as md +>>> df = md.DataFrame({{'angles': [0, 3, 4], +... 'degrees': [360, 180, 360]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> df.execute() + angles degrees +circle 0 360 +triangle 3 180 +rectangle 4 360 + +Add a scalar with operator version which return the same +results. + +>>> (df + 1).execute() + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +>>> df.add(1).execute() + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +Divide by constant with reverse version. + +>>> df.div(10).execute() + angles degrees +circle 0.0 36.0 +triangle 0.3 18.0 +rectangle 0.4 36.0 + +>>> df.rdiv(10).execute() + angles degrees +circle inf 0.027778 +triangle 3.333333 0.055556 +rectangle 2.500000 0.027778 + +Subtract a list and Series by axis with operator version. + +>>> (df - [1, 2]).execute() + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub([1, 2], axis='columns').execute() + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub(md.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']), +... axis='index').execute() + angles degrees +circle -1 359 +triangle 2 179 +rectangle 3 359 + +Multiply a DataFrame of different shape with operator version. + +>>> other = md.DataFrame({{'angles': [0, 3, 4]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> other.execute() + angles +circle 0 +triangle 3 +rectangle 4 + +>>> (df * other).execute() + angles degrees +circle 0 NaN +triangle 9 NaN +rectangle 16 NaN + +>>> df.mul(other, fill_value=0).execute() + angles degrees +circle 0 0.0 +triangle 9 0.0 +rectangle 16 0.0 + +Divide by a MultiIndex by level. + +>>> df_multindex = md.DataFrame({{'angles': [0, 3, 4, 4, 5, 6], +... 'degrees': [360, 180, 360, 360, 540, 720]}}, +... index=[['A', 'A', 'A', 'B', 'B', 'B'], +... ['circle', 'triangle', 'rectangle', +... 'square', 'pentagon', 'hexagon']]) +>>> df_multindex.execute() + angles degrees +A circle 0 360 + triangle 3 180 + rectangle 4 360 +B square 4 360 + pentagon 5 540 + hexagon 6 720 + +>>> df.div(df_multindex, level=1, fill_value=0).execute() + angles degrees +A circle NaN 1.0 + triangle 1.0 1.0 + rectangle 1.0 1.0 +B square 0.0 0.0 + pentagon 0.0 0.0 + hexagon 0.0 0.0 +""" + +_flex_doc_SERIES = """ +Return {desc} of series and other, element-wise (binary operator `{op_name}`). + +Equivalent to ``series {equiv} other``, but with support to substitute a fill_value for +missing data in one of the inputs. 
+ +Parameters +---------- +other : Series or scalar value +fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result will be missing. +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + +Returns +------- +Series + The result of the operation. + +See Also +-------- +Series.{reverse} + +Examples +-------- +>>> import numpy as np +>>> import mars.dataframe as md +>>> a = md.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a.execute() +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 + +>>> b = md.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b.execute() +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +""" + +_flex_comp_doc_FRAME = """ +Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). +Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison +operators. + +Equivalent to `dataframe {equiv} other` with support to choose axis (rows or columns) +and level for comparison. + +Parameters +---------- +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). +level : int or label + Broadcast across a level, matching Index values on the passed + MultiIndex level. + +Returns +------- +DataFrame of bool + Result of the comparison. + +See Also +-------- +DataFrame.eq : Compare DataFrames for equality elementwise. +DataFrame.ne : Compare DataFrames for inequality elementwise. +DataFrame.le : Compare DataFrames for less than inequality + or equality elementwise. +DataFrame.lt : Compare DataFrames for strictly less than + inequality elementwise. +DataFrame.ge : Compare DataFrames for greater than inequality + or equality elementwise. +DataFrame.gt : Compare DataFrames for strictly greater than + inequality elementwise. + +Notes +----- +Mismatched indices will be unioned together. +`NaN` values are considered different (i.e. `NaN` != `NaN`). + +Examples +-------- +>>> df = pd.DataFrame({{'cost': [250, 150, 100], +... 'revenue': [100, 250, 300]}}, +... 
index=['A', 'B', 'C']) +>>> df.execute() + cost revenue +A 250 100 +B 150 250 +C 100 300 + +Comparison with a scalar, using either the operator or method: + +>>> (df == 100).execute() + cost revenue +A False True +B False False +C True False + +>>> df.eq(100).execute() + cost revenue +A False True +B False False +C True False + +When `other` is a :class:`Series`, the columns of a DataFrame are aligned +with the index of `other` and broadcast: + +>>> (df != pd.Series([100, 250], index=["cost", "revenue"])).execute() + cost revenue +A True True +B True False +C False True + +Use the method to control the broadcast axis: + +>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index').execute() + cost revenue +A True False +B True True +C True True +D True True + +When comparing to an arbitrary sequence, the number of columns must +match the number elements in `other`: + +>>> (df == [250, 100]).execute() + cost revenue +A True True +B False False +C False False + +Use the method to control the axis: + +>>> df.eq([250, 250, 100], axis='index').execute() + cost revenue +A True False +B False True +C True False + +Compare to a DataFrame of different shape. + +>>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}}, +... index=['A', 'B', 'C', 'D']) +>>> other.execute() + revenue +A 300 +B 250 +C 100 +D 150 + +>>> df.gt(other).execute() + cost revenue +A False False +B False False +C False True +D False False + +Compare to a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220], +... 'revenue': [100, 250, 300, 200, 175, 225]}}, +... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'], +... ['A', 'B', 'C', 'A', 'B', 'C']]) +>>> df_multindex.execute() + cost revenue +Q1 A 250 100 + B 150 250 + C 100 300 +Q2 A 150 200 + B 300 175 + C 220 225 + +>>> df.le(df_multindex, level=1).execute() + cost revenue +Q1 A True True + B True True + C True True +Q2 A False True + B True False + C True False +""" + + +_flex_comp_doc_SERIES = """ +Return {desc} of series and other, element-wise (binary operator `{op_name}`). + +Equivalent to ``series {equiv} other``, but with support to substitute a fill_value for +missing data in one of the inputs. + +Parameters +---------- +other : Series or scalar value +fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result will be missing. +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + +Returns +------- +Series + The result of the operation. 
+ +Examples +-------- +>>> import numpy as np +>>> import mars.dataframe as md +>>> a = md.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a.execute() +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 + +>>> b = md.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b.execute() +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +""" + + +def bin_arithmetic_doc( + desc, op_name=None, equiv=None, reverse=None, series_example=None +): + def wrapper(fun): + nonlocal op_name, reverse + op_name = op_name or fun.__name__ + if reverse is None: + reverse = op_name[1:] if op_name.startswith("r") else "r" + op_name + fun.__frame_doc__ = _flex_doc_FRAME.format( + desc=desc, op_name=op_name, equiv=equiv, reverse=reverse + ) + fun.__series_doc__ = _flex_doc_SERIES.format( + desc=desc, op_name=op_name, equiv=equiv, reverse=reverse + ) + if series_example is not None: # pragma: no branch + fun.__series_doc__ += "\n" + series_example.strip() + return fun + + return wrapper + + +def bin_compare_doc(desc, op_name=None, equiv=None, series_example=None): + def wrapper(fun): + nonlocal op_name + op_name = op_name or fun.__name__ + fun.__frame_doc__ = _flex_comp_doc_FRAME.format( + desc=desc, op_name=op_name, equiv=equiv + ) + fun.__series_doc__ = _flex_comp_doc_SERIES.format( + desc=desc, op_name=op_name, equiv=equiv + ) + if series_example is not None: # pragma: no branch + fun.__series_doc__ += "\n" + series_example.strip() + return fun + + return wrapper diff --git a/python/xorbits/_mars/dataframe/arithmetic/dot.py b/python/xorbits/_mars/dataframe/arithmetic/dot.py new file mode 100644 index 000000000..57473536a --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/dot.py @@ -0,0 +1,306 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import AnyField, KeyField +from ...tensor import tensor as astensor +from ...tensor.core import TENSOR_TYPE +from ...tensor.utils import decide_unify_split, validate_axis +from ..core import DATAFRAME_TYPE, SERIES_TYPE, IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameDot(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DOT + + lhs = KeyField("lhs") + rhs = AnyField("rhs") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.lhs = self._inputs[0] + self.rhs = self._inputs[1] + + def __call__(self, lhs, rhs): + lhs = self._process_input(lhs) + rhs = self._process_input(rhs) + if not isinstance(rhs, (DATAFRAME_TYPE, SERIES_TYPE)): + rhs = astensor(rhs) + test_rhs = rhs + else: + test_rhs = rhs.to_tensor() + + test_ret = lhs.to_tensor().dot(test_rhs) + if test_ret.ndim == 0: + if isinstance(lhs, SERIES_TYPE) and isinstance(rhs, TENSOR_TYPE): + # return tensor + return test_ret + return self.new_scalar([lhs, rhs], dtype=test_ret.dtype) + elif test_ret.ndim == 1: + if lhs.ndim == 1: + if hasattr(rhs, "columns_value"): + index_value = rhs.columns_value + else: + # tensor + length = -1 if np.isnan(rhs.shape[1]) else rhs.shape[1] + pd_index = pd.RangeIndex(length) + index_value = parse_index(pd_index, store_data=True) + else: + assert rhs.ndim == 1 + index_value = lhs.index_value + return self.new_series( + [lhs, rhs], + shape=test_ret.shape, + dtype=test_ret.dtype, + index_value=index_value, + ) + else: + if isinstance(rhs, TENSOR_TYPE): + dtypes = pd.Series( + np.repeat(test_ret.dtype, test_ret.shape[1]), + index=pd.RangeIndex(test_ret.shape[1]), + ) + columns_value = parse_index(dtypes.index, store_data=True) + else: + dtypes = pd.Series( + np.repeat(test_ret.dtype, test_ret.shape[1]), + index=rhs.columns_value.to_pandas(), + ) + columns_value = rhs.columns_value + return self.new_dataframe( + [lhs, rhs], + shape=test_ret.shape, + index_value=lhs.index_value, + columns_value=columns_value, + dtypes=dtypes, + ) + + @classmethod + def _align(cls, lhs, rhs): + if isinstance(rhs, TENSOR_TYPE): + # no need to align when rhs is a tensor + return lhs, rhs + + is_lhs_range_index = False + if isinstance(lhs, DATAFRAME_TYPE) and isinstance( + lhs.columns_value.value, IndexValue.RangeIndex + ): + is_lhs_range_index = True + if isinstance(lhs, SERIES_TYPE) and isinstance( + lhs.index_value.value, IndexValue.RangeIndex + ): + is_lhs_range_index = True + + is_rhs_range_index = False + if isinstance(rhs.index_value.value, IndexValue.RangeIndex): + is_rhs_range_index = True + + if not is_lhs_range_index or not is_rhs_range_index: + # TODO: e.g. 
use rhs.loc[lhs.columns_value.to_pandas()]
+        # when lhs is a DataFrame and lhs.columns is not a RangeIndex,
+        # and likewise when lhs is a Series
+        raise NotImplementedError
+
+        return lhs, rhs
+
+    @classmethod
+    def tile(cls, op):
+        from ..datasource.from_tensor import dataframe_from_tensor, series_from_tensor
+
+        lhs, rhs = op.lhs, op.rhs
+        lhs, rhs = cls._align(lhs, rhs)
+        out = op.outputs[0]
+
+        # wait until chunk sizes along the contracted axes are known
+        if any(np.isnan(ns) for ns in lhs.nsplits[-1]):
+            yield
+        if any(np.isnan(ns) for ns in rhs.nsplits[0]):
+            yield
+
+        nsplit = decide_unify_split(lhs.nsplits[-1], rhs.nsplits[0])
+        lhs_axis = validate_axis(lhs.ndim, -1)
+        lhs = yield from recursive_tile(lhs.rechunk({lhs_axis: nsplit}))
+        rhs = yield from recursive_tile(rhs.rechunk({0: nsplit}))
+
+        # delegate computation to tensor
+        lhs_tensor = lhs if isinstance(lhs, TENSOR_TYPE) else lhs.to_tensor()
+        rhs_tensor = rhs if isinstance(rhs, TENSOR_TYPE) else rhs.to_tensor()
+        ret = lhs_tensor.dot(rhs_tensor)
+
+        if isinstance(out, TENSOR_TYPE):
+            pass
+        elif ret.ndim == 1:
+            index = None
+            if isinstance(lhs, DATAFRAME_TYPE):
+                index = lhs.index
+            elif isinstance(rhs, DATAFRAME_TYPE):
+                index = rhs.dtypes.index
+            ret = series_from_tensor(ret, index=index)
+        elif ret.ndim == 2:
+            index = lhs.index
+            columns = None
+            if isinstance(rhs, DATAFRAME_TYPE):
+                columns = rhs.dtypes.index
+            ret = dataframe_from_tensor(ret, index=index, columns=columns)
+
+        tiled = yield from recursive_tile(ret)
+        return [tiled]
+
+
+def dot(df_or_series, other):
+    op = DataFrameDot(lhs=df_or_series, rhs=other)
+    return op(df_or_series, other)
+
+
+def rdot(df_or_series, other):
+    op = DataFrameDot(lhs=other, rhs=df_or_series)
+    return op(other, df_or_series)
+
+
+dot.__frame_doc__ = """
+Compute the matrix multiplication between the DataFrame and other.
+
+This method computes the matrix product between the DataFrame and the
+values of another Series, DataFrame or a numpy array.
+
+It can also be called using ``self @ other`` in Python >= 3.5.
+
+Parameters
+----------
+other : Series, DataFrame or array-like
+    The other object to compute the matrix product with.
+
+Returns
+-------
+Series or DataFrame
+    If other is a Series, return the matrix product between self and
+    other as a Series. If other is a DataFrame or a numpy.array, return
+    the matrix product of self and other in a DataFrame or a np.array.
+
+See Also
+--------
+Series.dot: Similar method for Series.
+
+Notes
+-----
+The dimensions of DataFrame and other must be compatible in order to
+compute the matrix multiplication. In addition, the column names of
+DataFrame and the index of other must contain the same values, as they
+will be aligned prior to the multiplication.
+
+The dot method for Series computes the inner product, instead of the
+matrix product here.
+
+Examples
+--------
+Here we multiply a DataFrame with a Series.
+
+>>> import mars.tensor as mt
+>>> import mars.dataframe as md
+>>> df = md.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
+>>> s = md.Series([1, 1, 2, 1])
+>>> df.dot(s).execute()
+0   -4
+1    5
+dtype: int64
+
+Here we multiply a DataFrame with another DataFrame.
+
+>>> other = md.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
+>>> df.dot(other).execute()
+   0  1
+0  1  4
+1  2  2
+
+Note that the dot method gives the same result as @
+
+>>> (df @ other).execute()
+   0  1
+0  1  4
+1  2  2
+
+The dot method also works if other is an np.array.
+
+>>> arr = mt.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
+>>> df.dot(arr).execute()
+   0  1
+0  1  4
+1  2  2
+
+Note how shuffling of the objects does not change the result.
+
+>>> s2 = s.reindex([1, 0, 2, 3])
+>>> df.dot(s2).execute()
+0   -4
+1    5
+dtype: int64
+"""
+dot.__series_doc__ = """
+Compute the dot product between the Series and the columns of other.
+
+This method computes the dot product between the Series and another
+one, or the Series and each column of a DataFrame, or the Series and
+each column of an array.
+
+It can also be called using `self @ other` in Python >= 3.5.
+
+Parameters
+----------
+other : Series, DataFrame or array-like
+    The other object to compute the dot product with its columns.
+
+Returns
+-------
+scalar, Series or numpy.ndarray
+    Return the dot product of the Series and other if other is a
+    Series, the Series of the dot product of the Series and each
+    column of other if other is a DataFrame, or the numpy.ndarray of
+    the dot product between the Series and each column of the numpy
+    array.
+
+See Also
+--------
+DataFrame.dot: Compute the matrix product with the DataFrame.
+Series.mul: Multiplication of series and other, element-wise.
+
+Notes
+-----
+The Series and other have to share the same index if other is a Series
+or a DataFrame.
+
+Examples
+--------
+>>> import mars.tensor as mt
+>>> import mars.dataframe as md
+>>> s = md.Series([0, 1, 2, 3])
+>>> other = md.Series([-1, 2, -3, 4])
+>>> s.dot(other).execute()
+8
+>>> (s @ other).execute()
+8
+>>> df = md.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]])
+>>> s.dot(df).execute()
+0    24
+1    14
+dtype: int64
+>>> arr = mt.array([[0, 1], [-2, 3], [4, -5], [6, 7]])
+>>> s.dot(arr).execute()
+array([24, 14])
+"""
diff --git a/python/xorbits/_mars/dataframe/arithmetic/equal.py b/python/xorbits/_mars/dataframe/arithmetic/equal.py
new file mode 100644
index 000000000..2e8237230
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/arithmetic/equal.py
@@ -0,0 +1,56 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ... import opcodes as OperandDef
+from ...utils import classproperty
+from .core import DataFrameBinopUfunc
+from .docstring import bin_compare_doc
+
+
+class DataFrameEqual(DataFrameBinopUfunc):
+    _op_type_ = OperandDef.EQ
+
+    _func_name = "eq"
+    _rfunc_name = "eq"
+
+    return_dtype = np.dtype(bool)
+
+    @classproperty
+    def _operator(self):
+        return lambda lhs, rhs: lhs.eq(rhs)
+
+    @classproperty
+    def tensor_op_type(self):
+        from ...tensor.arithmetic import TensorEqual
+
+        return TensorEqual
+
+
+_eq_example = """
+>>> a.eq(b, fill_value=0).execute()
+a     True
+b    False
+c    False
+d    False
+e    False
+dtype: bool
+"""
+
+
+@bin_compare_doc("Equal to", equiv="==", series_example=_eq_example)
+def eq(df, other, axis="columns", level=None):
+    op = DataFrameEqual(axis=axis, level=level, lhs=df, rhs=other)
+    return op(df, other)
diff --git a/python/xorbits/_mars/dataframe/arithmetic/exp.py b/python/xorbits/_mars/dataframe/arithmetic/exp.py
new file mode 100644
index 000000000..2cbe3d544
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/arithmetic/exp.py
@@ -0,0 +1,28 @@
+# Copyright 2022-2023 XProbe Inc.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameExp(DataFrameUnaryUfunc): + _op_type_ = OperandDef.EXP + _func_name = "exp" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorExp + + return TensorExp diff --git a/python/xorbits/_mars/dataframe/arithmetic/exp2.py b/python/xorbits/_mars/dataframe/arithmetic/exp2.py new file mode 100644 index 000000000..e83e0302a --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/exp2.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameExp2(DataFrameUnaryUfunc): + _op_type_ = OperandDef.EXP2 + _func_name = "exp2" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorExp2 + + return TensorExp2 diff --git a/python/xorbits/_mars/dataframe/arithmetic/expm1.py b/python/xorbits/_mars/dataframe/arithmetic/expm1.py new file mode 100644 index 000000000..e64b24b6a --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/expm1.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameExpm1(DataFrameUnaryUfunc): + _op_type_ = OperandDef.EXPM1 + _func_name = "expm1" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorExpm1 + + return TensorExpm1 diff --git a/python/xorbits/_mars/dataframe/arithmetic/floor.py b/python/xorbits/_mars/dataframe/arithmetic/floor.py new file mode 100644 index 000000000..719c1c4cd --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/floor.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameFloor(DataFrameUnaryUfunc): + _op_type_ = OperandDef.FLOOR + _func_name = "floor" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorFloor + + return TensorFloor diff --git a/python/xorbits/_mars/dataframe/arithmetic/floordiv.py b/python/xorbits/_mars/dataframe/arithmetic/floordiv.py new file mode 100644 index 000000000..15eb654ea --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/floordiv.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameFloorDiv(DataFrameBinopUfunc): + _op_type_ = OperandDef.FLOORDIV + + _func_name = "floordiv" + _rfunc_name = "rfloordiv" + + @classproperty + def _operator(self): + return operator.floordiv + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorFloorDiv + + return TensorFloorDiv + + +_floordiv_example = """ +>>> a.floordiv(b, fill_value=0).execute() +a 1.0 +b NaN +c NaN +d 0.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Integer division", equiv="//", series_example=_floordiv_example) +def floordiv(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameFloorDiv( + axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other + ) + return op(df, other) + + +@bin_arithmetic_doc("Integer division", equiv="//", series_example=_floordiv_example) +def rfloordiv(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameFloorDiv( + axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df + ) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/greater.py b/python/xorbits/_mars/dataframe/arithmetic/greater.py new file mode 100644 index 000000000..5f75f1de9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/greater.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_compare_doc + + +class DataFrameGreater(DataFrameBinopUfunc): + _op_type_ = OperandDef.GT + + _func_name = "gt" + _rfunc_name = "lt" + + return_dtype = np.dtype(bool) + + @classproperty + def _operator(self): + return lambda lhs, rhs: lhs.gt(rhs) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorGreaterThan + + return TensorGreaterThan + + +_gt_example = """ +>>> a.gt(b, fill_value=0).execute() +a True +b False +c False +d False +e True +f False +dtype: bool +""" + + +@bin_compare_doc("Greater than", equiv=">", series_example=_gt_example) +def gt(df, other, axis="columns", level=None): + op = DataFrameGreater(axis=axis, level=level, lhs=df, rhs=other) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/greater_equal.py b/python/xorbits/_mars/dataframe/arithmetic/greater_equal.py new file mode 100644 index 000000000..7c0d30b05 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/greater_equal.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_compare_doc + + +class DataFrameGreaterEqual(DataFrameBinopUfunc): + _op_type_ = OperandDef.GE + + _func_name = "ge" + _rfunc_name = "le" + + return_dtype = np.dtype(bool) + + @classproperty + def _operator(self): + return lambda lhs, rhs: lhs.ge(rhs) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorGreaterEqual + + return TensorGreaterEqual + + +_ge_example = """ +>>> a.ge(b, fill_value=0).execute() +a True +b True +c False +d False +e True +f False +dtype: bool +""" + + +@bin_compare_doc("Greater than or equal to", equiv=">=", series_example=_ge_example) +def ge(df, other, axis="columns", level=None): + op = DataFrameGreaterEqual(axis=axis, level=level, lhs=df, rhs=other) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/invert.py b/python/xorbits/_mars/dataframe/arithmetic/invert.py new file mode 100644 index 000000000..202f3e525 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/invert.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameNot(DataFrameUnaryUfunc): + _op_type_ = OperandDef.INVERT + _func_name = "__invert__" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorNot + + return TensorNot + + +def invert(df): + op = DataFrameNot() + return op(df) diff --git a/python/xorbits/_mars/dataframe/arithmetic/is_ufuncs.py b/python/xorbits/_mars/dataframe/arithmetic/is_ufuncs.py new file mode 100644 index 000000000..757b1f31e --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/is_ufuncs.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameIsUFuncMixin: + @classmethod + def _get_output_dtype(cls, df): + if df.ndim == 2: + return pd.Series(np.dtype(bool), index=df.dtypes.index) + else: + return np.dtype(bool) + + +class DataFrameIsNan(DataFrameIsUFuncMixin, DataFrameUnaryUfunc): + _op_type_ = OperandDef.ISNAN + _func_name = "isnan" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorIsNan + + return TensorIsNan + + +class DataFrameIsInf(DataFrameIsUFuncMixin, DataFrameUnaryUfunc): + _op_type_ = OperandDef.ISINF + _func_name = "isinf" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorIsInf + + return TensorIsInf + + +class DataFrameIsFinite(DataFrameIsUFuncMixin, DataFrameUnaryUfunc): + _op_type_ = OperandDef.ISFINITE + _func_name = "isfinite" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorIsFinite + + return TensorIsFinite diff --git a/python/xorbits/_mars/dataframe/arithmetic/less.py b/python/xorbits/_mars/dataframe/arithmetic/less.py new file mode 100644 index 000000000..88e5b2d02 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/less.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_compare_doc + + +class DataFrameLess(DataFrameBinopUfunc): + _op_type_ = OperandDef.LT + + _func_name = "lt" + _rfunc_name = "gt" + + return_dtype = np.dtype(bool) + + @classproperty + def _operator(self): + return lambda lhs, rhs: lhs.lt(rhs) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorLessThan + + return TensorLessThan + + +_lt_example = """ +>>> a.lt(b, fill_value=0).execute() +a False +b False +c True +d False +e False +f True +dtype: bool +""" + + +@bin_compare_doc("Less than", equiv="<", series_example=_lt_example) +def lt(df, other, axis="columns", level=None): + op = DataFrameLess(axis=axis, level=level, lhs=df, rhs=other) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/less_equal.py b/python/xorbits/_mars/dataframe/arithmetic/less_equal.py new file mode 100644 index 000000000..78db91f6c --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/less_equal.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_compare_doc + + +class DataFrameLessEqual(DataFrameBinopUfunc): + _op_type_ = OperandDef.LE + + _func_name = "le" + _rfunc_name = "ge" + + return_dtype = np.dtype(bool) + + @classproperty + def _operator(self): + return lambda lhs, rhs: lhs.le(rhs) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorLessEqual + + return TensorLessEqual + + +_le_example = """ +>>> a.le(b, fill_value=0).execute() +a False +b True +c True +d False +e False +f True +dtype: bool +""" + + +@bin_compare_doc("Less than or equal to", equiv="<=", series_example=_le_example) +def le(df, other, axis="columns", level=None): + op = DataFrameLessEqual(axis=axis, level=level, lhs=df, rhs=other) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/log.py b/python/xorbits/_mars/dataframe/arithmetic/log.py new file mode 100644 index 000000000..df3d96a08 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/log.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameLog(DataFrameUnaryUfunc): + _op_type_ = OperandDef.LOG + _func_name = "log" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorLog + + return TensorLog diff --git a/python/xorbits/_mars/dataframe/arithmetic/log10.py b/python/xorbits/_mars/dataframe/arithmetic/log10.py new file mode 100644 index 000000000..4f2d49423 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/log10.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameLog10(DataFrameUnaryUfunc): + _op_type_ = OperandDef.LOG10 + _func_name = "log10" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorLog10 + + return TensorLog10 diff --git a/python/xorbits/_mars/dataframe/arithmetic/log2.py b/python/xorbits/_mars/dataframe/arithmetic/log2.py new file mode 100644 index 000000000..6fa3a42de --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/log2.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameLog2(DataFrameUnaryUfunc): + _op_type_ = OperandDef.LOG2 + _func_name = "log2" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorLog2 + + return TensorLog2 diff --git a/python/xorbits/_mars/dataframe/arithmetic/mod.py b/python/xorbits/_mars/dataframe/arithmetic/mod.py new file mode 100644 index 000000000..6cfc12592 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/mod.py @@ -0,0 +1,60 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameMod(DataFrameBinopUfunc): + _op_type_ = OperandDef.MOD + + _func_name = "mod" + _rfunc_name = "rmod" + + @classproperty + def _operator(self): + return operator.mod + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorMod + + return TensorMod + + +_mod_example = """ +>>> a.mod(b, fill_value=0).execute() +a 0.0 +b NaN +c NaN +d 0.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Modulo", equiv="%", series_example=_mod_example) +def mod(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameMod(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +@bin_arithmetic_doc("Modulo", equiv="%", series_example=_mod_example) +def rmod(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameMod(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/multiply.py b/python/xorbits/_mars/dataframe/arithmetic/multiply.py new file mode 100644 index 000000000..b6ba3492f --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/multiply.py @@ -0,0 +1,60 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameMul(DataFrameBinopUfunc): + _op_type_ = OperandDef.MUL + + _func_name = "mul" + _rfunc_name = "rmul" + + @classproperty + def _operator(self): + return operator.mul + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorMultiply + + return TensorMultiply + + +_mul_example = """ +>>> a.multiply(b, fill_value=0).execute() +a 1.0 +b 0.0 +c 0.0 +d 0.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Multiplication", equiv="*", series_example=_mul_example) +def mul(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameMul(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +@bin_arithmetic_doc("Multiplication", equiv="*", series_example=_mul_example) +def rmul(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameMul(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/negative.py b/python/xorbits/_mars/dataframe/arithmetic/negative.py new file mode 100644 index 000000000..c5312250b --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/negative.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameNegative(DataFrameUnaryUfunc): + _op_type_ = OperandDef.NEGATIVE + _func_name = "negative" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorNegative + + return TensorNegative + + +def negative(df): + op = DataFrameNegative() + return op(df) diff --git a/python/xorbits/_mars/dataframe/arithmetic/not_equal.py b/python/xorbits/_mars/dataframe/arithmetic/not_equal.py new file mode 100644 index 000000000..10571b55e --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/not_equal.py @@ -0,0 +1,56 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_compare_doc + + +class DataFrameNotEqual(DataFrameBinopUfunc): + _op_type_ = OperandDef.NE + + _func_name = "ne" + _rfunc_name = "ne" + + return_dtype = np.dtype(bool) + + @classproperty + def _operator(self): + return lambda lhs, rhs: lhs.ne(rhs) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorNotEqual + + return TensorNotEqual + + +_ne_example = """ +>>> a.ne(b, fill_value=0).execute() +a False +b True +c True +d True +e True +dtype: bool +""" + + +@bin_compare_doc("Not equal to", equiv="!=", series_example=_ne_example) +def ne(df, other, axis="columns", level=None): + op = DataFrameNotEqual(axis=axis, level=level, lhs=df, rhs=other) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/power.py b/python/xorbits/_mars/dataframe/arithmetic/power.py new file mode 100644 index 000000000..acb9726c2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/power.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFramePower(DataFrameBinopUfunc): + _op_type_ = OperandDef.POW + + _func_name = "pow" + _rfunc_name = "rpow" + + @classproperty + def _operator(self): + return operator.pow + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorPower + + return TensorPower + + +_pow_example = """ +>>> a.pow(b, fill_value=0).execute() +a 1.0 +b 1.0 +c 1.0 +d 0.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc( + "Exponential power", op_name="pow", equiv="**", series_example=_pow_example +) +def power(df, other, axis="columns", level=None, fill_value=None): + op = DataFramePower( + axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other + ) + return op(df, other) + + +@bin_arithmetic_doc( + "Exponential power", op_name="rpow", equiv="**", series_example=_pow_example +) +def rpower(df, other, axis="columns", level=None, fill_value=None): + op = DataFramePower( + axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df + ) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/radians.py b/python/xorbits/_mars/dataframe/arithmetic/radians.py new file mode 100644 index 000000000..ac870a95b --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/radians.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameRadians(DataFrameUnaryUfunc): + _op_type_ = OperandDef.RADIANS + _func_name = "radians" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorRadians + + return TensorRadians diff --git a/python/xorbits/_mars/dataframe/arithmetic/sin.py b/python/xorbits/_mars/dataframe/arithmetic/sin.py new file mode 100644 index 000000000..d969f6a7c --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/sin.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameSin(DataFrameUnaryUfunc): + _op_type_ = OperandDef.SIN + _func_name = "sin" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorSin + + return TensorSin diff --git a/python/xorbits/_mars/dataframe/arithmetic/sinh.py b/python/xorbits/_mars/dataframe/arithmetic/sinh.py new file mode 100644 index 000000000..3e1f98c2c --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/sinh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameSinh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.SINH + _func_name = "sinh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorSinh + + return TensorSinh diff --git a/python/xorbits/_mars/dataframe/arithmetic/sqrt.py b/python/xorbits/_mars/dataframe/arithmetic/sqrt.py new file mode 100644 index 000000000..8bc063cb1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/sqrt.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameSqrt(DataFrameUnaryUfunc): + _op_type_ = OperandDef.SQRT + _func_name = "sqrt" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorSqrt + + return TensorSqrt diff --git a/python/xorbits/_mars/dataframe/arithmetic/subtract.py b/python/xorbits/_mars/dataframe/arithmetic/subtract.py new file mode 100644 index 000000000..8765908a7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/subtract.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... 
import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameSubtract(DataFrameBinopUfunc): + _op_type_ = OperandDef.SUB + + _func_name = "sub" + _rfunc_name = "rsub" + + @classproperty + def _operator(self): + return operator.sub + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorSubtract + + return TensorSubtract + + +_sub_example = """ +>>> a.subtract(b, fill_value=0).execute() +a 0.0 +b 1.0 +c 1.0 +d -1.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Subtraction", equiv="-", series_example=_sub_example) +def subtract(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameSubtract( + axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other + ) + return op(df, other) + + +@bin_arithmetic_doc("Subtraction", equiv="-", series_example=_sub_example) +def rsubtract(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameSubtract( + axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df + ) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/tan.py b/python/xorbits/_mars/dataframe/arithmetic/tan.py new file mode 100644 index 000000000..f737c0731 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tan.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameTan(DataFrameUnaryUfunc): + _op_type_ = OperandDef.TAN + _func_name = "tan" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorTan + + return TensorTan diff --git a/python/xorbits/_mars/dataframe/arithmetic/tanh.py b/python/xorbits/_mars/dataframe/arithmetic/tanh.py new file mode 100644 index 000000000..990801507 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tanh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameTanh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.TANH + _func_name = "tanh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorTanh + + return TensorTanh diff --git a/python/xorbits/_mars/dataframe/arithmetic/tests/__init__.py b/python/xorbits/_mars/dataframe/arithmetic/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic.py b/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic.py new file mode 100644 index 000000000..ed6f681c1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic.py @@ -0,0 +1,1551 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import itertools +import operator +from dataclasses import dataclass +from typing import Callable + +import numpy as np +import pandas as pd +import pytest + +from .... import tensor as mt +from ....core import OperandType, OutputType, tile +from ....core.operand import OperandStage +from ....utils import dataslots +from ...align import DataFrameIndexAlign, DataFrameShuffleProxy +from ...core import IndexValue +from ...datasource.dataframe import DataFrameDataSource, from_pandas +from ...datasource.from_tensor import dataframe_from_tensor +from ...datasource.series import SeriesDataSource +from ...datasource.series import from_pandas as from_pandas_series +from ...utils import ( + build_split_idx_to_origin_idx, + filter_index_value, + hash_dtypes, + split_monotonic_index_min_max, +) +from .. 
import ( + DataFrameAbs, + DataFrameAdd, + DataFrameAnd, + DataFrameEqual, + DataFrameFloorDiv, + DataFrameGreater, + DataFrameGreaterEqual, + DataFrameLess, + DataFrameLessEqual, + DataFrameMod, + DataFrameMul, + DataFrameNot, + DataFrameNotEqual, + DataFrameOr, + DataFramePower, + DataFrameSubtract, + DataFrameTrueDiv, + DataFrameXor, +) + + +def comp_func(name, reverse_name): + def inner(lhs, rhs): + try: + return getattr(lhs, name)(rhs) + except AttributeError: + return getattr(rhs, reverse_name)(lhs) + + return inner + + +@dataslots +@dataclass +class FunctionOptions: + func: Callable + op: OperandType + func_name: str + rfunc_name: str + + +binary_functions = dict( + add=FunctionOptions( + func=operator.add, op=DataFrameAdd, func_name="add", rfunc_name="radd" + ), + subtract=FunctionOptions( + func=operator.sub, op=DataFrameSubtract, func_name="sub", rfunc_name="rsub" + ), + multiply=FunctionOptions( + func=operator.mul, op=DataFrameMul, func_name="mul", rfunc_name="rmul" + ), + floordiv=FunctionOptions( + func=operator.floordiv, + op=DataFrameFloorDiv, + func_name="floordiv", + rfunc_name="rfloordiv", + ), + truediv=FunctionOptions( + func=operator.truediv, + op=DataFrameTrueDiv, + func_name="truediv", + rfunc_name="rtruediv", + ), + mod=FunctionOptions( + func=operator.mod, op=DataFrameMod, func_name="mod", rfunc_name="rmod" + ), + power=FunctionOptions( + func=operator.pow, op=DataFramePower, func_name="pow", rfunc_name="rpow" + ), + equal=FunctionOptions( + func=comp_func("eq", "eq"), op=DataFrameEqual, func_name="eq", rfunc_name="eq" + ), + not_equal=FunctionOptions( + func=comp_func("ne", "ne"), + op=DataFrameNotEqual, + func_name="ne", + rfunc_name="ne", + ), + greater=FunctionOptions( + func=comp_func("gt", "lt"), op=DataFrameGreater, func_name="gt", rfunc_name="lt" + ), + less=FunctionOptions( + func=comp_func("lt", "gt"), op=DataFrameLess, func_name="lt", rfunc_name="gt" + ), + greater_equal=FunctionOptions( + func=comp_func("ge", "le"), + op=DataFrameGreaterEqual, + func_name="ge", + rfunc_name="le", + ), + less_equal=FunctionOptions( + func=comp_func("le", "ge"), + op=DataFrameLessEqual, + func_name="le", + rfunc_name="ge", + ), + logical_and=FunctionOptions( + func=operator.and_, op=DataFrameAnd, func_name="__and__", rfunc_name="and" + ), + logical_or=FunctionOptions( + func=operator.or_, op=DataFrameOr, func_name="__or__", rfunc_name="__ror__" + ), + logical_xor=FunctionOptions( + func=operator.xor, op=DataFrameXor, func_name="__xor__", rfunc_name="__rxor__" + ), +) + + +def to_boolean_if_needed(func_name, value, split_value=0.5): + if func_name in ["__and__", "__or__", "__xor__"]: + return value > split_value + else: + return value + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_without_shuffle(func_name, func_opts): + # all the axes are monotonic + # data1 with index split into [0...4], [5...9], + # columns [3...7], [8...12] + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(3, 13) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + # data2 with index split into [6...11], [2, 5], + # columns [4...9], [10, 13] + data2 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(11, 1, -1), columns=np.arange(4, 14) + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + 
df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 11 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + # test df3's index and columns after tiling + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 11 # columns is recorded, so we can get it + + data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)] + data1_columns_min_max = [[3, True, 7, True], [8, True, 12, True]] + data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)] + data2_columns_min_max = [(4, True, 9, True), (10, True, 13, True)] + + left_index_splits, right_index_splits = split_monotonic_index_min_max( + data1_index_min_max, True, data2_index_min_max, False + ) + left_columns_splits, right_columns_splits = split_monotonic_index_min_max( + data1_columns_min_max, True, data2_columns_min_max, True + ) + + left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits) + right_index_idx_to_original_idx = build_split_idx_to_origin_idx( + right_index_splits, False + ) + left_columns_idx_to_original_idx = build_split_idx_to_origin_idx( + left_columns_splits + ) + right_columns_idx_to_original_idx = build_split_idx_to_origin_idx( + right_columns_splits + ) + + assert df3.chunk_shape == (7, 7) + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test shape + idx = c.index + # test the left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.map + left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]] + left_col_idx, left_col_inner_idx = left_columns_idx_to_original_idx[idx[1]] + expect_df1_input = df1.cix[left_row_idx, left_col_idx].data + assert c.inputs[0].inputs[0] is expect_df1_input + left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx] + assert c.inputs[0].op.index_min == left_index_min_max[0] + assert c.inputs[0].op.index_min_close == left_index_min_max[1] + assert c.inputs[0].op.index_max == left_index_min_max[2] + assert c.inputs[0].op.index_max_close == left_index_min_max[3] + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + left_column_min_max = left_columns_splits[left_col_idx][left_col_inner_idx] + assert c.inputs[0].op.column_min == left_column_min_max[0] + assert c.inputs[0].op.column_min_close == left_column_min_max[1] + assert c.inputs[0].op.column_max == left_column_min_max[2] + assert c.inputs[0].op.column_max_close == left_column_min_max[3] + expect_left_columns = filter_index_value( + expect_df1_input.columns_value, left_column_min_max, store_data=True + ) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), expect_left_columns.to_pandas() + ) + pd.testing.assert_index_equal( + c.inputs[0].dtypes.index, expect_left_columns.to_pandas() + ) + # test the right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + 
assert c.inputs[1].op.stage == OperandStage.map + right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]] + right_col_idx, right_col_inner_idx = right_columns_idx_to_original_idx[idx[1]] + expect_df2_input = df2.cix[right_row_idx, right_col_idx].data + assert c.inputs[1].inputs[0] is expect_df2_input + right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx] + assert c.inputs[1].op.index_min == right_index_min_max[0] + assert c.inputs[1].op.index_min_close == right_index_min_max[1] + assert c.inputs[1].op.index_max == right_index_min_max[2] + assert c.inputs[1].op.index_max_close == right_index_min_max[3] + assert isinstance(c.inputs[1].index_value.to_pandas(), type(data2.index)) + right_column_min_max = right_columns_splits[right_col_idx][right_col_inner_idx] + assert c.inputs[1].op.column_min == right_column_min_max[0] + assert c.inputs[1].op.column_min_close == right_column_min_max[1] + assert c.inputs[1].op.column_max == right_column_min_max[2] + assert c.inputs[1].op.column_max_close == right_column_min_max[3] + expect_right_columns = filter_index_value( + expect_df2_input.columns_value, left_column_min_max, store_data=True + ) + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), expect_right_columns.to_pandas() + ) + pd.testing.assert_index_equal( + c.inputs[1].dtypes.index, expect_right_columns.to_pandas() + ) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_series_with_align_map(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(3, 13) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + s1 = df1[3] + + df2 = func_opts.func(df1, s1) + df1, df2, s1 = tile(df1, df2, s1) + + assert df2.shape == (df1.shape[0], np.nan) + assert df2.index_value.key == df1.index_value.key + + data1_columns_min_max = [[3, True, 7, True], [8, True, 12, True]] + data2_index_min_max = [(0, True, 4, True), (5, True, 9, True)] + + left_columns_splits, right_index_splits = split_monotonic_index_min_max( + data1_columns_min_max, True, data2_index_min_max, True + ) + + left_columns_idx_to_original_idx = build_split_idx_to_origin_idx( + left_columns_splits + ) + right_index_idx_to_original_idx = build_split_idx_to_origin_idx(right_index_splits) + + assert df2.chunk_shape == (2, 7) + for c in df2.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test shape + idx = c.index + # test the left side (dataframe) + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.map + left_col_idx, left_col_inner_idx = left_columns_idx_to_original_idx[idx[1]] + expect_df1_input = df1.cix[idx[0], left_col_idx].data + assert c.inputs[0].inputs[0] is expect_df1_input + left_column_min_max = left_columns_splits[left_col_idx][left_col_inner_idx] + assert c.inputs[0].op.column_min == left_column_min_max[0] + assert c.inputs[0].op.column_min_close == left_column_min_max[1] + assert c.inputs[0].op.column_max == left_column_min_max[2] + assert c.inputs[0].op.column_max_close == left_column_min_max[3] + expect_left_columns = filter_index_value( + expect_df1_input.columns_value, left_column_min_max, store_data=True + ) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), expect_left_columns.to_pandas() + ) + pd.testing.assert_index_equal( + c.inputs[0].dtypes.index, expect_left_columns.to_pandas() + ) + + # test the right 
side (series) + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.map + right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[1]] + expect_s1_input = s1.cix[(right_row_idx,)].data + assert c.inputs[1].inputs[0] is expect_s1_input + right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx] + assert c.inputs[1].op.index_min == right_index_min_max[0] + assert c.inputs[1].op.index_min_close == right_index_min_max[1] + assert c.inputs[1].op.index_max == right_index_min_max[2] + assert c.inputs[1].op.index_max_close == right_index_min_max[3] + assert isinstance(c.inputs[1].index_value.to_pandas(), type(data1[3].index)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_series_identical(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(10) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + s1 = from_pandas_series(data1[3], chunk_size=5) + + df2 = func_opts.func(df1, s1) + df1, df2, s1 = tile(df1, df2, s1) + + assert df2.shape == (10, 10) + assert df2.index_value.key == df1.index_value.key + assert df2.columns_value.key == df1.columns_value.key + assert df2.columns_value.key == s1.index_value.key + + assert df2.chunk_shape == (2, 2) + for c in df2.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + assert c.shape == (5, 5) + assert c.index_value.key == df1.cix[c.index].index_value.key + assert c.index_value.key == df2.cix[c.index].index_value.key + assert c.columns_value.key == df1.cix[c.index].columns_value.key + assert c.columns_value.key == df2.cix[c.index].columns_value.key + pd.testing.assert_index_equal( + c.columns_value.to_pandas(), df1.cix[c.index].columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c.columns_value.to_pandas(), df2.cix[c.index].columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c.dtypes.index, df1.cix[c.index].columns_value.to_pandas() + ) + + # test the left side + assert isinstance(c.inputs[0].op, DataFrameDataSource) + assert c.inputs[0] is df1.cix[c.index].data + # test the right side + assert isinstance(c.inputs[1].op, SeriesDataSource) + assert c.inputs[1] is s1.cix[(c.index[1],)].data + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_series_with_shuffle(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 9, 3, 2, 1, 5, 8, 6, 7, 10], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + s1 = from_pandas_series(data1[10], chunk_size=6) + + df2 = func_opts.func(df1, s1) + + # test df2's index and columns + assert df2.shape == (df1.shape[0], np.nan) + assert df2.index_value.key == df1.index_value.key + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df2.columns_value.key != df1.columns_value.key + + df1, df2, s1 = tile(df1, df2, s1) + + assert df2.chunk_shape == (2, 2) + for c in df2.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + idx = c.index + # test the left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]] + for ic in 
c.inputs[0].inputs[0].inputs + ] + ) + pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), c.inputs[0].dtypes.index + ) + pd.testing.assert_index_equal( + c.inputs[0].index_value.to_pandas(), c.index_value.to_pandas() + ) + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + for j, ci, ic in zip( + itertools.count(0), c.inputs[0].inputs[0].inputs, df1.cix[idx[0], :] + ): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (idx[0], j) + assert ci.op.column_shuffle_size + shuffle_segments = ci.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ci.inputs[0] is ic.data + + # test the right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + assert c.inputs[1].op.output_types[0] == OutputType.series + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) + for j, ci, ic in zip( + itertools.count(0), c.inputs[1].inputs[0].inputs, s1.chunks + ): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (j,) + assert ci.op.index_shuffle_size + assert ci.inputs[0] is ic.data + + # make sure shuffle proxies' key are different + proxy_keys = set() + for i in range(df2.chunk_shape[0]): + cs = [c for c in df2.chunks if c.index[0] == i] + lps = {c.inputs[0].inputs[0].op.key for c in cs} + assert len(lps) == 1 + proxy_keys.add(lps.pop()) + rps = {c.inputs[1].inputs[0].op.key for c in cs} + assert len(rps) == 1 + proxy_keys.add(rps.pop()) + assert len(proxy_keys) == df2.chunk_shape[0] + 1 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_series_and_series_with_align_map(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(3, 13) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + + s1 = df1.iloc[4] + s2 = df1[3] + + s3 = func_opts.func(s1, s2) + + s1, s2, s3 = tile(s1, s2, s3) + + assert s3.shape == (np.nan,) + + s1_index_min_max = [[3, True, 7, True], [8, True, 12, True]] + s2_index_min_max = [(0, True, 4, True), (5, True, 9, True)] + + left_index_splits, right_index_splits = split_monotonic_index_min_max( + s1_index_min_max, True, s2_index_min_max, True + ) + + left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits) + right_index_idx_to_original_idx = build_split_idx_to_origin_idx(right_index_splits) + + assert s3.chunk_shape == (7,) + for c in s3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test shape + idx = c.index + # test the left side (series) + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.map + left_col_idx, left_col_inner_idx = left_index_idx_to_original_idx[idx[0]] + expect_s1_input = s1.cix[(left_col_idx,)].data + assert c.inputs[0].inputs[0] is expect_s1_input + left_index_min_max = left_index_splits[left_col_idx][left_col_inner_idx] + assert c.inputs[0].op.index_min == left_index_min_max[0] + assert c.inputs[0].op.index_min_close == left_index_min_max[1] + assert c.inputs[0].op.index_max == left_index_min_max[2] + assert 
c.inputs[0].op.index_max_close == left_index_min_max[3] + assert isinstance( + c.inputs[0].index_value.to_pandas(), type(data1.iloc[4].index) + ) + expect_left_index = filter_index_value( + expect_s1_input.index_value, left_index_min_max, store_data=True + ) + pd.testing.assert_index_equal( + c.inputs[0].index_value.to_pandas(), expect_left_index.to_pandas() + ) + + # test the right side (series) + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.map + right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]] + expect_s2_input = s2.cix[(right_row_idx,)].data + assert c.inputs[1].inputs[0] is expect_s2_input + right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx] + assert c.inputs[1].op.index_min == right_index_min_max[0] + assert c.inputs[1].op.index_min_close == right_index_min_max[1] + assert c.inputs[1].op.index_max == right_index_min_max[2] + assert c.inputs[1].op.index_max_close == right_index_min_max[3] + assert isinstance(c.inputs[1].index_value.to_pandas(), type(data1[3].index)) + expect_right_index = filter_index_value( + expect_s2_input.index_value, right_index_min_max, store_data=True + ) + pd.testing.assert_index_equal( + c.inputs[1].index_value.to_pandas(), expect_right_index.to_pandas() + ) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_series_and_series_identical(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(10) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + s1 = from_pandas_series(data1[1], chunk_size=5) + s2 = from_pandas_series(data1[3], chunk_size=5) + + s3 = func_opts.func(s1, s2) + + s1, s2, s3 = tile(s1, s2, s3) + + assert s3.shape == (10,) + assert s3.index_value.key == s1.index_value.key + assert s3.index_value.key == s2.index_value.key + + assert s3.chunk_shape == (2,) + for c in s3.chunks: + assert isinstance(c.op, func_opts.op) + assert c.op.output_types[0] == OutputType.series + assert len(c.inputs) == 2 + assert c.shape == (5,) + assert c.index_value.key == s1.cix[c.index].index_value.key + assert c.index_value.key == s2.cix[c.index].index_value.key + + # test the left side + assert isinstance(c.inputs[0].op, SeriesDataSource) + assert c.inputs[0] is s1.cix[c.index].data + # test the right side + assert isinstance(c.inputs[1].op, SeriesDataSource) + assert c.inputs[1] is s2.cix[c.index].data + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_series_and_series_with_shuffle(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 9, 3, 2, 1, 5, 8, 6, 7, 10], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + s1 = from_pandas_series(data1.iloc[4], chunk_size=5) + s2 = from_pandas_series(data1[10], chunk_size=6) + + s3 = func_opts.func(s1, s2) + + # test s3's index + assert s3.shape == (np.nan,) + assert s3.index_value.key != s1.index_value.key + assert s3.index_value.key != s2.index_value.key + pd.testing.assert_index_equal( + s3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + + s1, s2, s3 = tile(s1, s2, s3) + + assert s3.chunk_shape == (2,) + for c in s3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test the left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + assert c.inputs[0].op.output_types[0] == 
OutputType.series + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + for j, ci, ic in zip( + itertools.count(0), c.inputs[0].inputs[0].inputs, s1.chunks + ): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (j,) + assert ci.op.index_shuffle_size + assert ci.inputs[0] is ic.data + + # test the right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + assert c.inputs[1].op.output_types[0] == OutputType.series + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) + for j, ci, ic in zip( + itertools.count(0), c.inputs[1].inputs[0].inputs, s2.chunks + ): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (j,) + assert ci.op.index_shuffle_size + assert ci.inputs[0] is ic.data + + # make sure shuffle proxies' key are different + proxy_keys = set() + for c in s3.chunks: + proxy_keys.add(c.inputs[0].inputs[0].op.key) + proxy_keys.add(c.inputs[1].inputs[0].op.key) + assert len(proxy_keys) == 2 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_identical_index_and_columns(func_name, func_opts): + data1 = pd.DataFrame(np.random.rand(10, 10), columns=np.arange(3, 13)) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame(np.random.rand(10, 10), columns=np.arange(3, 13)) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=5) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.RangeIndex) + pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.RangeIndex(0, 10)) + assert df3.index_value.key == df1.index_value.key + assert df3.index_value.key == df2.index_value.key + assert df3.shape == (10, 10) # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + assert df3.chunk_shape == (2, 2) + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + assert c.shape == (5, 5) + assert c.index_value.key == df1.cix[c.index].index_value.key + assert c.index_value.key == df2.cix[c.index].index_value.key + assert c.columns_value.key == df1.cix[c.index].columns_value.key + assert c.columns_value.key == df2.cix[c.index].columns_value.key + pd.testing.assert_index_equal( + c.columns_value.to_pandas(), df1.cix[c.index].columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c.columns_value.to_pandas(), df2.cix[c.index].columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c.dtypes.index, df1.cix[c.index].columns_value.to_pandas() + ) + + # test the left side + assert c.inputs[0] is df1.cix[c.index].data + # test the right side + assert c.inputs[1] is df2.cix[c.index].data + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_one_shuffle(func_name, func_opts): + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + 
index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 12 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)] + data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)] + + left_index_splits, right_index_splits = split_monotonic_index_min_max( + data1_index_min_max, True, data2_index_min_max, False + ) + + left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits) + right_index_idx_to_original_idx = build_split_idx_to_origin_idx( + right_index_splits, False + ) + + assert df3.chunk_shape == (7, 2) + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + idx = c.index + # test the left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]] + for ic in c.inputs[0].inputs[0].inputs + ] + ) + pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), c.inputs[0].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]] + left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx] + ics = [ic for ic in df1.chunks if ic.index[0] == left_row_idx] + for j, ci, ic in zip(itertools.count(0), c.inputs[0].inputs[0].inputs, ics): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (idx[0], j) + assert ci.op.index_min == left_index_min_max[0] + assert ci.op.index_min_close == left_index_min_max[1] + assert ci.op.index_max == left_index_min_max[2] + assert ci.op.index_max_close == left_index_min_max[3] + assert isinstance(ci.index_value.to_pandas(), type(data1.index)) + assert ci.op.column_shuffle_size + shuffle_segments = ci.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ci.inputs[0] is ic.data + # test the right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]] + for ic in c.inputs[1].inputs[0].inputs + ] + ) + pd.testing.assert_series_equal(c.inputs[1].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), c.inputs[1].dtypes.index + ) + assert isinstance(c.inputs[1].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) 
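+ # trace the reduce chunk back through the shuffle proxy to df2's original row chunk, then check each map chunk's index range and hashed column segments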
+ right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]] + right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx] + ics = [ic for ic in df2.chunks if ic.index[0] == right_row_idx] + for j, ci, ic in zip(itertools.count(0), c.inputs[1].inputs[0].inputs, ics): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (idx[0], j) + assert ci.op.index_min == right_index_min_max[0] + assert ci.op.index_min_close == right_index_min_max[1] + assert ci.op.index_max == right_index_min_max[2] + assert ci.op.index_max_close == right_index_min_max[3] + assert ci.op.column_shuffle_size + shuffle_segments = ci.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ci.inputs[0] is ic.data + + # make sure shuffle proxies' key are different + proxy_keys = set() + for i in range(df3.chunk_shape[0]): + cs = [c for c in df3.chunks if c.index[0] == i] + lps = {c.inputs[0].inputs[0].op.key for c in cs} + assert len(lps) == 1 + proxy_keys.add(lps.pop()) + rps = {c.inputs[1].inputs[0].op.key for c in cs} + assert len(rps) == 1 + proxy_keys.add(rps.pop()) + assert len(proxy_keys) == 2 * df3.chunk_shape[0] + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_all_shuffle(func_name, func_opts): + # no axis is monotonic + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3], + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 12 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + assert df3.chunk_shape == (2, 2) + proxy_keys = set() + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]] + for ic in c.inputs[0].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), c.inputs[0].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[0].inputs[0].op.key) + for ic, ci in zip(c.inputs[0].inputs[0].inputs, df1.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + 
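+ # both axes are shuffled here, so each map chunk hashes its rows and columns into 2 buckets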
assert ic.op.index_shuffle_size == 2 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_shuffle_size == 2 + assert ic.columns_value is not None + shuffle_segments = ic.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ci.data.dtypes, 2) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ic.inputs[0] is ci.data + # test right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]] + for ic in c.inputs[1].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[1].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), c.inputs[1].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[1].inputs[0].op.key) + for ic, ci in zip(c.inputs[1].inputs[0].inputs, df2.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + assert ic.op.index_shuffle_size == 2 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_shuffle_size == 2 + assert ic.columns_value is not None + shuffle_segments = ic.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ci.data.dtypes, 2) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ic.inputs[0] is ci.data + + assert len(proxy_keys) == 2 + + data4 = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + data4 = to_boolean_if_needed(func_opts.func_name, data4) + df4 = from_pandas(data4, chunk_size=3) + + data5 = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + data5 = to_boolean_if_needed(func_opts.func_name, data5) + df5 = from_pandas(data5, chunk_size=3) + + df6 = func_opts.func(df4, df5) + + # test df6's index and columns + pd.testing.assert_index_equal( + df6.columns_value.to_pandas(), func_opts.func(data4, data5).columns + ) + assert isinstance(df6.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df6.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df6.index_value.key != df4.index_value.key + assert df6.index_value.key != df5.index_value.key + assert df6.shape[1] == 20 # columns is recorded, so we can get it + + df4, df5, df6 = tile(df4, df5, df6) + + assert df6.chunk_shape == (4, 4) + proxy_keys = set() + for c in df6.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 4)[c.index[1]] + for ic in c.inputs[0].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), c.inputs[0].dtypes.index + ) + assert 
isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[0].inputs[0].op.key) + for ic, ci in zip(c.inputs[0].inputs[0].inputs, df4.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + assert ic.op.index_shuffle_size == 4 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_shuffle_size == 4 + assert ic.columns_value is not None + shuffle_segments = ic.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ci.data.dtypes, 4) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ic.inputs[0] is ci.data + # test right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 4)[c.index[1]] + for ic in c.inputs[1].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[1].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), c.inputs[1].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[1].inputs[0].op.key) + for ic, ci in zip(c.inputs[1].inputs[0].inputs, df5.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + assert ic.op.index_shuffle_size == 4 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_shuffle_size == 4 + assert ic.columns_value is not None + shuffle_segments = ic.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ci.data.dtypes, 4) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ic.inputs[0] is ci.data + + assert len(proxy_keys) == 2 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_without_shuffle_and_with_one_chunk(func_name, func_opts): + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=(5, 10)) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=(6, 10)) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 12 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)] + 
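+ # data2's index descends from 11 to 2, giving row-chunk ranges [6, 11] and [2, 5] (listed ascending below)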
data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)] + + left_index_splits, right_index_splits = split_monotonic_index_min_max( + data1_index_min_max, True, data2_index_min_max, False + ) + + left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits) + right_index_idx_to_original_idx = build_split_idx_to_origin_idx( + right_index_splits, False + ) + + assert df3.chunk_shape == (7, 1) + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test shape + idx = c.index + # test the left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.map + left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]] + expect_df1_input = df1.cix[left_row_idx, 0].data + assert c.inputs[0].inputs[0] is expect_df1_input + left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx] + assert c.inputs[0].op.index_min == left_index_min_max[0] + assert c.inputs[0].op.index_min_close == left_index_min_max[1] + assert c.inputs[0].op.index_max == left_index_min_max[2] + assert c.inputs[0].op.index_max_close == left_index_min_max[3] + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert c.inputs[0].op.column_min == expect_df1_input.columns_value.min_val + assert ( + c.inputs[0].op.column_min_close + == expect_df1_input.columns_value.min_val_close + ) + assert c.inputs[0].op.column_max == expect_df1_input.columns_value.max_val + assert ( + c.inputs[0].op.column_max_close + == expect_df1_input.columns_value.max_val_close + ) + expect_left_columns = expect_df1_input.columns_value + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), expect_left_columns.to_pandas() + ) + pd.testing.assert_index_equal( + c.inputs[0].dtypes.index, expect_left_columns.to_pandas() + ) + # test the right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.map + right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]] + expect_df2_input = df2.cix[right_row_idx, 0].data + assert c.inputs[1].inputs[0] is expect_df2_input + right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx] + assert c.inputs[1].op.index_min == right_index_min_max[0] + assert c.inputs[1].op.index_min_close == right_index_min_max[1] + assert c.inputs[1].op.index_max == right_index_min_max[2] + assert c.inputs[1].op.index_max_close == right_index_min_max[3] + assert isinstance(c.inputs[1].index_value.to_pandas(), type(data2.index)) + assert c.inputs[1].op.column_min == expect_df2_input.columns_value.min_val + assert ( + c.inputs[1].op.column_min_close + == expect_df2_input.columns_value.min_val_close + ) + assert c.inputs[1].op.column_max == expect_df2_input.columns_value.max_val + assert ( + c.inputs[1].op.column_max_close + == expect_df2_input.columns_value.max_val_close + ) + expect_right_columns = expect_df2_input.columns_value + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), expect_right_columns.to_pandas() + ) + pd.testing.assert_index_equal( + c.inputs[1].dtypes.index, expect_right_columns.to_pandas() + ) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_both_one_chunk(func_name, func_opts): + # no axis is monotonic, but 1 chunk for all axes + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = 
to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=10) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3], + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=10) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 12 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + assert df3.chunk_shape == (1, 1) + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test the left side + assert c.inputs[0] is df1.chunks[0].data + # test the right side + assert c.inputs[1] is df2.chunks[0].data + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_shuffle_and_one_chunk(func_name, func_opts): + # no axis is monotonic + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=(5, 10)) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3], + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=(6, 10)) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 12 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + assert df3.chunk_shape == (2, 1) + proxy_keys = set() + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + ic.inputs[0].op.data.dtypes + for ic in c.inputs[0].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), c.inputs[0].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[0].inputs[0].op.key) + for ic, ci in zip(c.inputs[0].inputs[0].inputs, df1.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + assert ic.op.index_shuffle_size == 2 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_min == ci.columns_value.min_val + assert ic.op.column_min_close == ci.columns_value.min_val_close + assert ic.op.column_max == 
ci.columns_value.max_val + assert ic.op.column_max_close == ci.columns_value.max_val_close + assert ic.op.column_shuffle_size is None + assert ic.columns_value is not None + assert ic.inputs[0] is ci.data + # test right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + ic.inputs[0].op.data.dtypes + for ic in c.inputs[1].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[1].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), c.inputs[1].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[1].inputs[0].op.key) + for ic, ci in zip(c.inputs[1].inputs[0].inputs, df2.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + assert ic.op.index_shuffle_size == 2 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_shuffle_size is None + assert ic.op.column_min == ci.columns_value.min_val + assert ic.op.column_min_close == ci.columns_value.min_val_close + assert ic.op.column_max == ci.columns_value.max_val + assert ic.op.column_max_close == ci.columns_value.max_val_close + assert ic.op.column_shuffle_size is None + assert ic.columns_value is not None + assert ic.inputs[0] is ci.data + + assert len(proxy_keys) == 2 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_on_same_dataframe(func_name, func_opts): + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + data = to_boolean_if_needed(func_opts.func_name, data) + df = from_pandas(data, chunk_size=3) + df2 = func_opts.func(df, df) + + # test df2's index and columns + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), func_opts.func(data, data).columns + ) + assert isinstance(df2.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df2.index_value.key == df.index_value.key + assert df2.columns_value.key == df.columns_value.key + assert df2.shape[1] == 10 + + df, df2 = tile(df, df2) + + assert df2.chunk_shape == df.chunk_shape + for c in df2.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test the left side + assert c.inputs[0] is df.cix[c.index].data + # test the right side + assert c.inputs[1] is df.cix[c.index].data + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_scalar(func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # bitwise logical operators doesn\'t support floating point scalars + return + + data = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(3, 13) + ) + df = from_pandas(data, chunk_size=5) + # test operator with scalar + result = func_opts.func(df, 1) + result2 = getattr(df, func_opts.func_name)(1) + + # test reverse operator with scalar + result3 = getattr(df, func_opts.rfunc_name)(1) + result4 = func_opts.func(df, 1) + result5 = func_opts.func(1, df) + + expected = func_opts.func(data, 2) + pd.testing.assert_series_equal(result.dtypes, expected.dtypes) + + pd.testing.assert_index_equal(result.columns_value.to_pandas(), data.columns) + assert 
isinstance(result.index_value.value, IndexValue.Int64Index) + + pd.testing.assert_index_equal(result2.columns_value.to_pandas(), data.columns) + assert isinstance(result2.index_value.value, IndexValue.Int64Index) + + pd.testing.assert_index_equal(result3.columns_value.to_pandas(), data.columns) + assert isinstance(result3.index_value.value, IndexValue.Int64Index) + + pd.testing.assert_index_equal(result4.columns_value.to_pandas(), data.columns) + assert isinstance(result4.index_value.value, IndexValue.Int64Index) + + pd.testing.assert_index_equal(result5.columns_value.to_pandas(), data.columns) + assert isinstance(result5.index_value.value, IndexValue.Int64Index) + + if "builtin_function_or_method" not in str(type(func_opts.func)): + # skip NotImplemented test for comparison function + return + + # test NotImplemented, use other's rfunc instead + class TestRFunc: + pass + + setattr(TestRFunc, f"__{func_opts.rfunc_name}__", lambda *_: 1) + other = TestRFunc() + ret = func_opts.func(df, other) + assert ret == 1 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_series_and_scalar(func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # bitwise logical operators doesn\'t support floating point scalars + return + + data = pd.Series(range(10), index=[1, 3, 4, 2, 9, 10, 33, 23, 999, 123]) + s1 = from_pandas_series(data, chunk_size=3) + r = getattr(s1, func_opts.func_name)(456) + s1, r = tile(s1, r) + + assert r.index_value.key == s1.index_value.key + assert r.chunk_shape == s1.chunk_shape + assert r.dtype == getattr(data, func_opts.func_name)(456).dtype + + for cr in r.chunks: + cs = s1.cix[cr.index] + assert cr.index_value.key == cs.index_value.key + assert isinstance(cr.op, func_opts.op) + assert len(cr.inputs) == 1 + assert isinstance(cr.inputs[0].op, SeriesDataSource) + assert cr.op.rhs == 456 + + if "builtin_function_or_method" not in str(type(func_opts.func)): + # skip rfunc test for comparison function + return + + s1 = from_pandas_series(data, chunk_size=3) + r = getattr(s1, func_opts.rfunc_name)(789) + s1, r = tile(s1, r) + + assert r.index_value.key == s1.index_value.key + assert r.chunk_shape == s1.chunk_shape + + for cr in r.chunks: + cs = s1.cix[cr.index] + assert cr.index_value.key == cs.index_value.key + assert isinstance(cr.op, func_opts.op) + assert len(cr.inputs) == 1 + assert isinstance(cr.inputs[0].op, SeriesDataSource) + assert cr.op.lhs == 789 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_check_inputs(func_name, func_opts): + data = pd.DataFrame(np.random.rand(10, 3)) + data = to_boolean_if_needed(func_opts.func_name, data) + df = from_pandas(data) + + with pytest.raises(ValueError): + _ = df + np.random.rand(5, 3) + + with pytest.raises(ValueError): + _ = df + np.random.rand(10) + + with pytest.raises(ValueError): + _ = df + np.random.rand(10, 3, 2) + + data = pd.Series(np.random.rand(10)) + series = from_pandas_series(data) + + with pytest.raises(ValueError): + _ = series + np.random.rand(5, 3) + + with pytest.raises(ValueError): + _ = series + np.random.rand(5) + + +def test_abs(): + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + df1 = from_pandas(data1, chunk_size=(5, 10)) + + df2 = df1.abs() + + # test df2's index and columns + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df1.columns_value.to_pandas() + ) + assert isinstance(df2.index_value.value, 
IndexValue.Int64Index) + assert df2.shape == (10, 10) + + df1, df2 = tile(df1, df2) + + assert df2.chunk_shape == (2, 1) + for c2, c1 in zip(df2.chunks, df1.chunks): + assert isinstance(c2.op, DataFrameAbs) + assert len(c2.inputs) == 1 + # compare with input chunks + assert c2.index == c1.index + pd.testing.assert_index_equal( + c2.columns_value.to_pandas(), c1.columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c2.index_value.to_pandas(), c1.index_value.to_pandas() + ) + + +def test_not(): + data1 = pd.DataFrame( + np.random.rand(10, 10) > 0.5, + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + df1 = from_pandas(data1, chunk_size=(5, 10)) + + df2 = ~df1 + + # test df2's index and columns + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df1.columns_value.to_pandas() + ) + assert isinstance(df2.index_value.value, IndexValue.Int64Index) + assert df2.shape == (10, 10) + + df1, df2 = tile(df1, df2) + + assert df2.chunk_shape == (2, 1) + for c2, c1 in zip(df2.chunks, df1.chunks): + assert isinstance(c2.op, DataFrameNot) + assert len(c2.inputs) == 1 + # compare with input chunks + assert c2.index == c1.index + pd.testing.assert_index_equal( + c2.columns_value.to_pandas(), c1.columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c2.index_value.to_pandas(), c1.index_value.to_pandas() + ) + + +def test_arithmetic_lazy_chunk_meta(): + df = dataframe_from_tensor(mt.random.rand(10, 3, chunk_size=3)) + df2 = df + 1 + df2 = tile(df2) + + chunk = df2.chunks[0].data + assert chunk._FIELDS["_dtypes"].get(chunk) is None + pd.testing.assert_series_equal(chunk.dtypes, df.dtypes) + assert chunk._FIELDS["_dtypes"].get(chunk) is not None + assert chunk._FIELDS["_index_value"].get(chunk) is None + pd.testing.assert_index_equal(chunk.index_value.to_pandas(), pd.RangeIndex(3)) + assert chunk._FIELDS["_index_value"].get(chunk) is not None + assert chunk._FIELDS["_columns_value"].get(chunk) is None + pd.testing.assert_index_equal(chunk.columns_value.to_pandas(), pd.RangeIndex(3)) + assert chunk._FIELDS["_columns_value"].get(chunk) is not None + + +def test_datetime_arithmetic(): + data1 = ( + pd.Series([pd.Timedelta(days=d) for d in range(10)]) + datetime.datetime.now() + ) + s1 = from_pandas_series(data1) + + assert (s1 + pd.Timedelta(days=10)).dtype == (data1 + pd.Timedelta(days=10)).dtype + assert (s1 + datetime.timedelta(days=10)).dtype == ( + data1 + datetime.timedelta(days=10) + ).dtype + assert (s1 - pd.Timestamp.now()).dtype == (data1 - pd.Timestamp.now()).dtype + assert (s1 - datetime.datetime.now()).dtype == ( + data1 - datetime.datetime.now() + ).dtype diff --git a/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic_execution.py b/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic_execution.py new file mode 100644 index 000000000..d4b45c2be --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic_execution.py @@ -0,0 +1,917 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +from dataclasses import dataclass +from functools import partial +from typing import Callable, Union + +import numpy as np +import pandas as pd +import pytest + +from .... import tensor as mt +from ....tensor.datasource import array as from_array +from ....utils import dataslots +from ... import to_datetime +from ...datasource.dataframe import from_pandas +from ...datasource.series import from_pandas as from_pandas_series +from ..tests.test_arithmetic import comp_func + + +@dataslots +@dataclass +class FunctionOptions: + func: Callable + func_name: str + rfunc_name: str + + +binary_functions = dict( + add=FunctionOptions(func=operator.add, func_name="add", rfunc_name="radd"), + equal=FunctionOptions(func=comp_func("eq", "eq"), func_name="eq", rfunc_name="eq"), + logical_and=FunctionOptions( + func=operator.and_, func_name="__and__", rfunc_name="__rand__" + ), +) + + +def sort_dataframe( + df: Union[pd.DataFrame, pd.Series], index: bool = True, columns: bool = True +): + if index: + df.sort_index(inplace=True) + if columns and isinstance(df, pd.DataFrame): + df.sort_index(axis=1, inplace=True) + return df + + +def to_boolean_if_needed(func_name, value, split_value=0.5): + if func_name in ["__and__", "__or__", "__xor__"]: + return value > split_value + else: + return value + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_without_shuffle_execution(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. + return + + # all the axes are monotonic + # data1 with index split into [0...4], [5...9], + # columns [3...7], [8...12] + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(3, 13) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + # data2 with index split into [6...11], [2, 5], + # columns [4...9], [10, 13] + data2 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(11, 1, -1), columns=np.arange(4, 14) + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_one_shuffle_execution(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. 
+ return + + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # only 1 axis is monotonic + # data1 with columns split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + columns=np.arange(10), + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + # data2 with columns split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + columns=np.arange(11, 1, -1), + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_all_shuffle_execution(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. + return + + # no axis is monotonic + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3], + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_both_with_one_chunk(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. 
+ return + + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=10) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=10) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(expected, result) + + # only 1 axis is monotonic + # data1 with columns split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + columns=np.arange(10), + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=10) + # data2 with columns split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + columns=np.arange(11, 1, -1), + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=10) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_without_shuffle_and_with_one_chunk(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. 
+ return + + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=(5, 10)) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=(6, 10)) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # only 1 axis is monotonic + # data1 with columns split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + columns=np.arange(10), + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=(10, 5)) + # data2 with columns split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + columns=np.arange(11, 1, -1), + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=(10, 6)) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_shuffle_and_with_one_chunk(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # pandas fails to compute some expected values due to `na`. 
+ return + + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + df1 = from_pandas(data1, chunk_size=(10, 5)) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + df2 = from_pandas(data2, chunk_size=(10, 6)) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # only 1 axis is monotonic + # data1 with columns split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + columns=np.arange(10), + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=(5, 10)) + # data2 with columns split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + columns=np.arange(11, 1, -1), + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=(6, 10)) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_same_index(setup, func_name, func_opts): + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(0, 2, size=(10,)), + columns=["c" + str(i) for i in range(10)], + ) + data = to_boolean_if_needed(func_opts.func_name, data) + df = from_pandas(data, chunk_size=3) + df2 = func_opts.func(df, df) + + expected = func_opts.func(data, data) + result = df2.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + series = from_pandas_series(data.iloc[0], chunk_size=3) + df3 = func_opts.func(df, series) + + expected = func_opts.func(data, data.iloc[0]) + result = df3.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + series = from_pandas_series(data.iloc[:, 0], chunk_size=3) + df4 = getattr(df, func_opts.func_name)(series, axis=0) + + if func_opts.func_name not in ["__and__", "__or__", "__xor__"]: + expected = getattr(data, func_opts.func_name)(data.iloc[:, 0], axis=0) + result = df4.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_chained(setup, func_name, func_opts): + data1 = pd.DataFrame(np.random.rand(10, 10)) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame(np.random.rand(10, 10)) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + data4 = pd.DataFrame(np.random.rand(10, 10)) + data4 = to_boolean_if_needed(func_opts.func_name, data4) + df4 = from_pandas(data4, chunk_size=6) + + df5 = func_opts.func(df3, df4) + + result = df5.execute().fetch() + expected = func_opts.func(func_opts.func(data1, data2), data4) + + pd.testing.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_rfunc(setup, func_name, func_opts): + data1 = pd.DataFrame(np.random.rand(10, 10)) + data1 = 
to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame(np.random.rand(10, 10)) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + df3 = getattr(df1, func_opts.rfunc_name)(df2) + result = df3.execute().fetch() + expected = func_opts.func(data2, data1) + pd.testing.assert_frame_equal(expected, result) + + data3 = pd.DataFrame(np.random.rand(10, 10)) + data3 = to_boolean_if_needed(func_opts.func_name, data3) + df4 = from_pandas(data3, chunk_size=5) + df5 = getattr(df4, func_opts.rfunc_name)(1) + # todo check dtypes when pandas reverts its behavior on broadcasting + check_dtypes = func_opts.func_name not in ("__and__", "__or__", "__xor__") + result = df5.execute().fetch(extra_config=dict(check_dtypes=check_dtypes)) + expected2 = func_opts.func(1, data3) + pd.testing.assert_frame_equal(expected2, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_multi_forms(setup, func_name, func_opts): + # test multiple forms + # such as self+other, func_opts.add(other), add(self,other) + data1 = pd.DataFrame(np.random.rand(10, 10)) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame(np.random.rand(10, 10)) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + expected = func_opts.func(data1, data2) + result = func_opts.func(df1, df2).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + result = func_opts.func(df1, df2).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + result = getattr(df1, func_opts.func_name)(df2).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + result = getattr(df1, func_opts.rfunc_name)(df2).execute().fetch() + pd.testing.assert_frame_equal(func_opts.func(data2, data1), result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_scalar(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators don't support floating point scalars + return + + # test dataframe and scalar + pdf = pd.DataFrame(np.random.rand(10, 10)) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf, chunk_size=2) + expected = func_opts.func(pdf, 1) + result = func_opts.func(df, 1).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + result2 = func_opts.func(df, 1).execute().fetch() + pd.testing.assert_frame_equal(expected, result2) + result3 = getattr(df, func_opts.func_name)(1).execute().fetch() + pd.testing.assert_frame_equal(expected, result3) + + # test scalar and dataframe + result4 = func_opts.func(df, 1).execute().fetch() + pd.testing.assert_frame_equal(expected, result4) + + expected2 = func_opts.func(1, pdf) + result5 = func_opts.func(1, df).execute().fetch() + pd.testing.assert_frame_equal(expected2, result5) + + result6 = getattr(df, func_opts.rfunc_name)(1).execute().fetch() + pd.testing.assert_frame_equal(expected2, result6) + + # test pandas series and dataframe + pdf2 = pd.DataFrame(np.random.rand(10, 10)) + expected = func_opts.func(pdf2, pdf) + result = func_opts.func(pdf2, df).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_shuffle_on_string_index(setup, func_name, func_opts): + if 
func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. + return + + # no axis is monotonic, and the index values are strings. + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[str(x) for x in [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[str(x) for x in [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]], + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_series(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # pandas fails to compute some expected values due to `na`. + return + + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + + s1 = from_pandas_series(data2[1], chunk_size=(6,)) + + # operate on single-column dataframe and series + df1 = from_pandas(data1[[1]], chunk_size=(5, 5)) + r1 = getattr(df1, func_opts.func_name)(s1, axis="index") + + expected = getattr(data1[[1]], func_opts.func_name)(data2[1], axis="index") + result = r1.execute().fetch() + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # operate on dataframe and series without shuffle + df2 = from_pandas(data1, chunk_size=(5, 5)) + r2 = getattr(df2, func_opts.func_name)(s1, axis="index") + + expected = getattr(data1, func_opts.func_name)(data2[1], axis="index") + result = r2.execute().fetch() + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # operate on dataframe and series with shuffle + df3 = from_pandas(data1, chunk_size=(5, 5)) + r3 = getattr(df3, func_opts.func_name)(s1, axis="columns") + + expected = getattr(data1, func_opts.func_name)(data2[1], axis="columns") + result = r3.execute().fetch() + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test both one chunk, axis=0 + pdf = pd.DataFrame({"ca": [1, 3, 2], "cb": [360, 180, 2]}, index=[1, 2, 3]) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf) + series = pd.Series([0, 1, 2], index=[1, 2, 3]) + mars_series = from_pandas_series(series) + result = getattr(df, func_opts.func_name)(mars_series, axis=0).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=0) + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test different number of chunks, axis=0 + pdf = pd.DataFrame({"ca": [1, 3, 2], "cb": [360, 180, 2]}, index=[1, 2, 3]) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf, chunk_size=1) + series = pd.Series([0, 1, 2], index=[1, 2, 3]) + mars_series = from_pandas_series(series) + result = getattr(df, 
func_opts.func_name)(mars_series, axis=0).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=0) + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test with row shuffle, axis=0 + pdf = pd.DataFrame({"ca": [1, 3, 2], "cb": [360, 180, 2]}, index=[2, 1, 3]) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf, chunk_size=1) + series = pd.Series([0, 1, 2], index=[3, 1, 2]) + mars_series = from_pandas_series(series) + result = getattr(df, func_opts.func_name)(mars_series, axis=0).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=0).reindex([3, 1, 2]) + # modify the order of rows + result = result.reindex(index=[3, 1, 2]) + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test both one chunk, axis=1 + pdf = pd.DataFrame( + {1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]}, index=["ra", "rb", "rc"] + ) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf) + series = pd.Series([0, 1, 2], index=[1, 2, 3]) + mars_series = from_pandas_series(series) + result = getattr(df, func_opts.func_name)(mars_series, axis=1).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=1) + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test different number of chunks, axis=1 + pdf = pd.DataFrame( + {1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]}, index=["ra", "rb", "rc"] + ) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf, chunk_size=1) + series = pd.Series([0, 1, 2], index=[1, 2, 3]) + mars_series = from_pandas_series(series) + result = getattr(df, func_opts.func_name)(mars_series, axis=1).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=1) + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test with row shuffle, axis=1 + pdf = pd.DataFrame( + {1: [1, 3, 2], 3: [1, 2, 3], 2: [360, 180, 2]}, index=["ra", "rb", "rc"] + ) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf, chunk_size=1) + series = pd.Series([0, 1, 2], index=[3, 1, 2]) + mars_series = from_pandas_series(series) + result = getattr(df, func_opts.func_name)(mars_series, axis=1).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=1) + # modify the order of columns + result = result[[1, 2, 3]] + pd.testing.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_series(setup, func_name, func_opts): + # only one chunk + s1 = pd.Series(np.arange(10) + 1) + s1 = to_boolean_if_needed(func_opts.func_name, s1) + s2 = pd.Series(np.arange(10) + 1) + s2 = to_boolean_if_needed(func_opts.func_name, s2) + r = func_opts.func( + from_pandas_series(s1, chunk_size=10), from_pandas_series(s2, chunk_size=10) + ) + result = r.execute().fetch() + expected = func_opts.func(s1, s2) + pd.testing.assert_series_equal(expected, result) + + # same index + s1 = pd.Series(np.arange(10) + 1) + s1 = to_boolean_if_needed(func_opts.func_name, s1) + s2 = pd.Series(np.arange(10) + 1) + s2 = to_boolean_if_needed(func_opts.func_name, s2) + r = func_opts.func( + from_pandas_series(s1, chunk_size=4), from_pandas_series(s2, chunk_size=6) + ) + result = r.execute().fetch() + expected = func_opts.func(s1, s2) + pd.testing.assert_series_equal(expected, result) + + # no shuffle + s1 = pd.Series(np.arange(10) + 1, index=range(10)) + s1 = 
to_boolean_if_needed(func_opts.func_name, s1) + s2 = pd.Series(np.arange(10) + 1, index=range(10, 0, -1)) + s2 = to_boolean_if_needed(func_opts.func_name, s2) + r = func_opts.func( + from_pandas_series(s1, chunk_size=4), from_pandas_series(s2, chunk_size=6) + ) + result = r.execute().fetch() + expected = func_opts.func(s1, s2) + pd.testing.assert_series_equal(expected, result) + + # shuffle + data = (np.arange(10) + 1).astype(np.int64, copy=False) + s1 = pd.Series(data, index=np.random.permutation(range(10))) + s1 = to_boolean_if_needed(func_opts.func_name, s1) + s2 = pd.Series(data, index=np.random.permutation(range(10, 0, -1))) + s2 = to_boolean_if_needed(func_opts.func_name, s2) + r = func_opts.func( + from_pandas_series(s1, chunk_size=4), from_pandas_series(s2, chunk_size=6) + ) + result = r.execute().fetch() + expected = func_opts.func(s1, s2) + pd.testing.assert_series_equal(sort_dataframe(expected), sort_dataframe(result)) + + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # bitwise logical operators don't support floating point scalars + return + + # operate with scalar + s1 = pd.Series(np.arange(10) + 1, index=np.random.permutation(range(10))) + s1 = to_boolean_if_needed(func_opts.func_name, s1) + r = func_opts.func(from_pandas_series(s1, chunk_size=4), 4) + result = r.execute().fetch() + expected = func_opts.func(s1, 4) + pd.testing.assert_series_equal(expected, result) + + # reverse with scalar + s1 = pd.Series(np.arange(10) + 1, index=np.random.permutation(range(10))) + s1 = to_boolean_if_needed(func_opts.func_name, s1) + r = func_opts.func(4, from_pandas_series(s1, chunk_size=4)) + result = r.execute().fetch() + expected = func_opts.func(4, s1) + pd.testing.assert_series_equal(expected, result) + + +@pytest.mark.skipif( + pd.__version__ < "1.2.0", reason="skip due to the incompatibilities." +) +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_plain_value(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # skip tests for bitwise logical operators on plain value. 
+ return + + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=6) + s1 = df1[2] + + r = getattr(df1, func_opts.func_name)([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], axis=0) + result = r.execute().fetch() + expected = getattr(data1, func_opts.func_name)( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], axis=0 + ) + pd.testing.assert_frame_equal(expected, result) + + r = getattr(df1, func_opts.func_name)((1, 2, 3, 4, 5, 6, 7, 8, 9, 10), axis=0) + result = r.execute().fetch() + expected = getattr(data1, func_opts.func_name)( + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), axis=0 + ) + pd.testing.assert_frame_equal(expected, result) + + r = getattr(s1, func_opts.func_name)([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + result = r.execute().fetch() + expected = getattr(data1[2], func_opts.func_name)([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + pd.testing.assert_series_equal(expected, result) + + r = getattr(s1, func_opts.func_name)((1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) + result = r.execute().fetch() + expected = getattr(data1[2], func_opts.func_name)((1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) + pd.testing.assert_series_equal(expected, result) + + # specify index, not the default range index + data1 = pd.DataFrame( + np.random.rand(10, 7), index=np.arange(5, 15), columns=[4, 1, 3, 2, 5, 6, 7] + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=6) + s1 = df1[2] + + r = getattr(df1, func_opts.func_name)( + np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), axis=0 + ) + result = r.execute().fetch() + expected = getattr(data1, func_opts.func_name)( + np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), axis=0 + ) + pd.testing.assert_frame_equal(expected, result) + + r = getattr(df1, func_opts.func_name)( + from_array(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])), axis=0 + ) + result = r.execute().fetch() + expected = getattr(data1, func_opts.func_name)( + np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), axis=0 + ) + pd.testing.assert_frame_equal(expected, result) + + r = getattr(s1, func_opts.func_name)(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])) + result = r.execute().fetch() + expected = getattr(data1[2], func_opts.func_name)( + np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + ) + pd.testing.assert_series_equal(expected, result) + + r = getattr(s1, func_opts.func_name)( + from_array(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])) + ) + result = r.execute().fetch() + expected = getattr(data1[2], func_opts.func_name)( + np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + ) + pd.testing.assert_series_equal(expected, result) + + +def test_abs(setup): + data1 = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10))) + df1 = from_pandas(data1, chunk_size=5) + + result = df1.abs().execute().fetch() + expected = data1.abs() + pd.testing.assert_frame_equal(expected, result) + + result = abs(df1).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + +def test_not(setup): + data1 = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)) > 0) + df1 = from_pandas(data1, chunk_size=5) + + result = ~df1.execute().fetch() + expected = ~data1 + pd.testing.assert_frame_equal(expected, result) + + +def test_negative(setup): + data1 = pd.DataFrame(np.random.randint(low=0, high=100, size=(10, 10))) + df1 = from_pandas(data1, chunk_size=5) + + result = -df1.execute().fetch() + expected = -data1 + pd.testing.assert_frame_equal(expected, result) + + +def test_ufunc(setup): + df_raw = pd.DataFrame( 
+ np.random.uniform(size=(10, 10)), index=pd.RangeIndex(9, -1, -1) + ) + df = from_pandas(df_raw, chunk_size=5) + + series_raw = pd.Series(np.random.uniform(size=10), index=pd.RangeIndex(9, -1, -1)) + series = from_pandas_series(series_raw, chunk_size=5) + + ufuncs = [ + [np.abs, mt.abs], + [np.log, mt.log], + [np.log2, mt.log2], + [np.log10, mt.log10], + [np.sin, mt.sin], + [np.cos, mt.cos], + [np.tan, mt.tan], + [np.sinh, mt.sinh], + [np.cosh, mt.cosh], + [np.tanh, mt.tanh], + [np.arcsin, mt.arcsin], + [np.arccos, mt.arccos], + [np.arctan, mt.arctan], + [np.arcsinh, mt.arcsinh], + [np.arccosh, mt.arccosh], + [np.arctanh, mt.arctanh], + [np.radians, mt.radians], + [np.degrees, mt.degrees], + [np.ceil, mt.ceil], + [np.floor, mt.floor], + [partial(np.around, decimals=2), partial(mt.around, decimals=2)], + [np.exp, mt.exp], + [np.exp2, mt.exp2], + [np.expm1, mt.expm1], + [np.sqrt, mt.sqrt], + [np.isnan, mt.isnan], + [np.isfinite, mt.isfinite], + [np.isinf, mt.isinf], + [np.negative, mt.negative], + ] + + for raw, data in [(df_raw, df), (series_raw, series)]: + for npf, mtf in ufuncs: + r = mtf(data) + + result = r.execute().fetch() + expected = npf(raw) + + if isinstance(raw, pd.DataFrame): + pd.testing.assert_frame_equal(result, expected) + else: + pd.testing.assert_series_equal(result, expected) + + # test numpy ufunc + r = npf(data) + + result = r.execute().fetch() + + if isinstance(raw, pd.DataFrame): + pd.testing.assert_frame_equal(result, expected) + else: + pd.testing.assert_series_equal(result, expected) + + +def test_date_time_bin(setup): + rs = np.random.RandomState(0) + df_raw = pd.DataFrame( + { + "a": rs.randint(1000, size=10), + "b": rs.rand(10), + "c": [pd.Timestamp(rs.randint(1604000000, 1604481373)) for _ in range(10)], + }, + index=pd.RangeIndex(9, -1, -1), + ) + df = from_pandas(df_raw, chunk_size=5) + r = (df["c"] > to_datetime("2000-01-01")) & (df["c"] < to_datetime("2021-01-01")) + + result = r.execute().fetch() + expected = (df_raw["c"] > pd.to_datetime("2000-01-01")) & ( + df_raw["c"] < pd.to_datetime("2021-01-01") + ) + pd.testing.assert_series_equal(result, expected) + + +def test_series_and_tensor(setup): + rs = np.random.RandomState(0) + s_raw = pd.Series(rs.rand(10)) < 0.5 + a_raw = rs.rand(10) < 0.5 + + series = from_pandas_series(s_raw, chunk_size=5) + t = mt.tensor(a_raw, chunk_size=5) + + r = t | series + result = r.execute().fetch() + expected = a_raw | s_raw + pd.testing.assert_series_equal(result, expected) diff --git a/python/xorbits/_mars/dataframe/arithmetic/tests/test_comparison.py b/python/xorbits/_mars/dataframe/arithmetic/tests/test_comparison.py new file mode 100644 index 000000000..c92491bcd --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tests/test_comparison.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
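+
+# The tests below cover the element-wise comparison operators (eq, ne, lt,
+# gt, le, ge) on DataFrame and Series. Unlike the arithmetic operators,
+# comparisons require identically-labeled operands: a DataFrame whose index
+# or columns differ from the other operand is expected to raise ValueError.
+# Comparisons against a datetime scalar and between period-typed Series are
+# exercised as well.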
+ +import operator +from datetime import datetime + +import numpy as np +import pandas as pd +import pytest + +from ....core import enter_mode +from ...initializer import DataFrame, Series + + +def test_comp(setup): + raw_df1 = pd.DataFrame(np.random.rand(4, 3)) + raw_df2 = pd.DataFrame(np.random.rand(4, 3)) + df1 = DataFrame(raw_df1) + df2 = DataFrame(raw_df2) + + with enter_mode(build=True): + assert not df1.data == df2.data + assert df1.data == df1.data + + for op in [ + operator.eq, + operator.ne, + operator.lt, + operator.gt, + operator.le, + operator.ge, + ]: + eq_df = op(df1, df2) + pd.testing.assert_index_equal( + eq_df.index_value.to_pandas(), df1.index_value.to_pandas() + ) + eq_df = op(raw_df1, df2) + pd.testing.assert_index_equal( + eq_df.index_value.to_pandas(), df1.index_value.to_pandas() + ) + + # index not identical + df3 = DataFrame(pd.DataFrame(np.random.rand(4, 3), index=[1, 2, 3, 4])) + with pytest.raises(ValueError): + op(df1, df3) + + # columns not identical + df4 = DataFrame(pd.DataFrame(np.random.rand(4, 3), columns=["a", "b", "c"])) + with pytest.raises(ValueError): + op(df1, df4) + + # test datetime + df = DataFrame(pd.DataFrame(pd.date_range("20130101", periods=6))) + for op in [ + operator.eq, + operator.ne, + operator.lt, + operator.gt, + operator.le, + operator.ge, + ]: + r_df = op(df, datetime(2013, 1, 2)) + pd.testing.assert_index_equal( + r_df.index_value.to_pandas(), df.index_value.to_pandas() + ) + + # test period type + raw = pd.period_range("2000-01-01", periods=10, freq="D") + raw_series = pd.Series(raw) + series = Series(raw, chunk_size=5) + r = series >= series[1] + pd.testing.assert_series_equal(r.to_pandas(), raw_series >= raw_series[1]) diff --git a/python/xorbits/_mars/dataframe/arithmetic/tests/test_dot.py b/python/xorbits/_mars/dataframe/arithmetic/tests/test_dot.py new file mode 100644 index 000000000..7993efcf4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tests/test_dot.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ... 
import DataFrame, Series + + +def test_dot_execution(setup): + df1_raw = pd.DataFrame(np.random.rand(4, 7)) + df2_raw = pd.DataFrame(np.random.rand(7, 5), columns=list("efghi")) + s1_raw = pd.Series(np.random.rand(7)) + s2_raw = pd.Series(np.random.rand(7)) + + df1 = DataFrame(df1_raw, chunk_size=(3, 2)) + df2 = DataFrame(df2_raw, chunk_size=(3, 4)) + + # df.dot(df) + r = df1.dot(df2) + result = r.execute().fetch() + expected = df1_raw.dot(df2_raw) + pd.testing.assert_frame_equal(result, expected) + + # test @ + r = df1 @ df2 + result = r.execute().fetch() + expected = df1_raw @ df2_raw + pd.testing.assert_frame_equal(result, expected) + + # test reversed @ + r = df1_raw @ df2 + result = r.execute().fetch() + expected = df1_raw @ df2_raw + pd.testing.assert_frame_equal(result, expected) + + series1 = Series(s1_raw, chunk_size=5) + + # df.dot(series) + r = df1.dot(series1) + result = r.execute().fetch() + expected = df1_raw.dot(s1_raw) + pd.testing.assert_series_equal(result, expected) + + # df.dot(2d_array) + r = df1.dot(df2_raw.to_numpy()) + result = r.execute().fetch() + expected = df1_raw.dot(df2_raw.to_numpy()) + pd.testing.assert_frame_equal(result, expected) + + # df.dot(1d_array) + r = df1.dot(s1_raw.to_numpy()) + result = r.execute().fetch() + expected = df1_raw.dot(s1_raw.to_numpy()) + pd.testing.assert_series_equal(result, expected) + + series2 = Series(s2_raw, chunk_size=4) + + # series.dot(series) + r = series1.dot(series2) + result = r.execute().fetch() + expected = s1_raw.dot(s2_raw) + assert pytest.approx(result) == expected + + # series.dot(df) + r = series1.dot(df2) + result = r.execute().fetch() + expected = s1_raw.dot(df2_raw) + pd.testing.assert_series_equal(result, expected) + + # series.dot(2d_array) + r = series1.dot(df2_raw.to_numpy()) + result = r.execute().fetch() + expected = s1_raw.dot(df2_raw.to_numpy()) + np.testing.assert_almost_equal(result, expected) + + # series.dot(1d_array) + r = series1.dot(s2_raw.to_numpy()) + result = r.execute().fetch() + expected = s1_raw.dot(s2_raw.to_numpy()) + assert pytest.approx(result) == expected diff --git a/python/xorbits/_mars/dataframe/arithmetic/truediv.py b/python/xorbits/_mars/dataframe/arithmetic/truediv.py new file mode 100644 index 000000000..7bf2ecc09 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/truediv.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... 
import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameTrueDiv(DataFrameBinopUfunc): + _op_type_ = OperandDef.DIV + + _func_name = "truediv" + _rfunc_name = "rtruediv" + + @classproperty + def _operator(self): + return operator.truediv + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorTrueDiv + + return TensorTrueDiv + + +_truediv_example = """ +>>> a.truediv(b, fill_value=0).execute() +a 1.0 +b inf +c inf +d 0.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Floating division", equiv="/", series_example=_truediv_example) +def truediv(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameTrueDiv( + axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other + ) + return op(df, other) + + +@bin_arithmetic_doc("Floating division", equiv="/", series_example=_truediv_example) +def rtruediv(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameTrueDiv( + axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df + ) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arrays.py b/python/xorbits/_mars/dataframe/arrays.py new file mode 100644 index 000000000..e0d13cc7e --- /dev/null +++ b/python/xorbits/_mars/dataframe/arrays.py @@ -0,0 +1,864 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
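+
+# This module implements pandas extension dtypes/arrays backed by pyarrow:
+# ArrowStringDtype/ArrowStringArray for string data and
+# ArrowListDtype/ArrowListArray for list data. When pyarrow is unavailable,
+# or ``options.dataframe.arrow_array.pandas_only`` is set, the arrays fall
+# back to a plain numpy ndarray (outside kernel mode) so that dtype
+# inference still works.
+#
+# A minimal usage sketch (illustrative only; assumes pyarrow is installed):
+#
+#     >>> import pandas as pd
+#     >>> s = pd.Series(["a", "b", None], dtype=ArrowStringDtype())
+#     >>> s.dtype.name
+#     'Arrow[string]'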
+ +import itertools +import operator +import re +from copy import copy as copy_obj +from numbers import Integral +from typing import Sequence, Type + +import numpy as np +import pandas as pd +from pandas._libs import lib +from pandas.api.extensions import ( + ExtensionArray, + ExtensionDtype, + register_extension_dtype, +) +from pandas.api.indexers import check_array_indexer +from pandas.api.types import ( + is_array_like, + is_list_like, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.arrays import StringArray as StringArrayBase +from pandas.compat import set_function_name +from pandas.core import ops +from pandas.core.algorithms import take + +try: + from pandas._libs.arrays import NDArrayBacked +except ImportError: + NDArrayBacked = None +try: + import pyarrow as pa + + pa_null = pa.NULL +except ImportError: # pragma: no cover + pa = None + pa_null = None +try: + import pyarrow.compute as pc +except ImportError: # pragma: no cover + pc = None + +from ..config import options +from ..core import is_kernel_mode +from ..utils import pd_release_version, tokenize + +_use_bool_any_all = pd_release_version[:2] >= (1, 3) +_use_extension_index = pd_release_version[:2] >= (1, 4) +_object_engine_for_string_array = pd_release_version[:2] >= (1, 5) + +if _object_engine_for_string_array: + StringArrayBase = type(StringArrayBase)( + "StringArrayBase", StringArrayBase.__bases__, dict(StringArrayBase.__dict__) + ) + + +class ArrowDtype(ExtensionDtype): + @property + def arrow_type(self): # pragma: no cover + raise NotImplementedError + + def __from_arrow__(self, array): + return self.construct_array_type()(array) + + +@register_extension_dtype +class ArrowStringDtype(ArrowDtype): + """ + Extension dtype for arrow string data. + + .. warning:: + + ArrowStringDtype is considered experimental. The implementation and + parts of the API may change without warning. + + In particular, ArrowStringDtype.na_value may change to no longer be + ``numpy.nan``. 
+ + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> import mars.dataframe as md + >>> md.ArrowStringDtype() + ArrowStringDtype + """ + + type = str + kind = "U" + name = "Arrow[string]" + na_value = pa_null + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls}' from '{string}'") + + @classmethod + def construct_array_type(cls) -> "Type[ArrowStringArray]": + return ArrowStringArray + + @property + def arrow_type(self): + return pa.string() + + +@register_extension_dtype +class ArrowStringDtypeAlias(ArrowStringDtype): + name = "arrow_string" # register an alias name for compatibility + + +class ArrowListDtypeType(type): + """ + the type of ArrowListDtype, this metaclass determines subclass ability + """ + + pass + + +class ArrowListDtype(ArrowDtype): + _metadata = ("_value_type",) + + def __init__(self, dtype): + if isinstance(dtype, type(self)): + dtype = dtype.value_type + if pa and isinstance(dtype, pa.DataType): + dtype = dtype.to_pandas_dtype() + + dtype = pandas_dtype(dtype) + if is_string_dtype(dtype) and not isinstance(dtype, ArrowStringDtype): + # convert string dtype to arrow string dtype + dtype = ArrowStringDtype() + + self._value_type = dtype + + @property + def value_type(self): + return self._value_type + + @property + def kind(self): + return "O" + + @property + def type(self): + return ArrowListDtypeType + + @property + def name(self): + return f"Arrow[List[{self.value_type.name}]]" + + @property + def arrow_type(self): + if isinstance(self._value_type, ArrowDtype): + arrow_subdtype = self._value_type.arrow_type + else: + arrow_subdtype = pa.from_numpy_dtype(self._value_type) + return pa.list_(arrow_subdtype) + + def __repr__(self) -> str: + return self.name + + @classmethod + def construct_array_type(cls) -> "Type[ArrowListArray]": + return ArrowListArray + + @classmethod + def construct_from_string(cls, string): + msg = f"Cannot construct a 'ArrowListDtype' from '{string}'" + xpr = re.compile(r"Arrow\[List\[(?P<value_type>[^,]*)\]\]$") + m = xpr.match(string) + if m: + value_type = m.groupdict()["value_type"] + return ArrowListDtype(value_type) + else: + raise TypeError(msg) + + @classmethod + def is_dtype(cls, dtype) -> bool: + dtype = getattr(dtype, "dtype", dtype) + if isinstance(dtype, str): + try: + cls.construct_from_string(dtype) + except TypeError: + return False + else: + return True + else: + return isinstance(dtype, cls) + + def __hash__(self): + return super().__hash__() + + def __eq__(self, other): + if not isinstance(other, ArrowListDtype): + return False + + value_type = self._value_type + other_value_type = other._value_type + + try: + return value_type == other_value_type + except TypeError: + # cannot compare numpy dtype and extension dtype + return other_value_type == value_type + + +class ArrowArray(ExtensionArray): + _arrow_type = None + + def __init__(self, values, dtype: ArrowDtype = None, copy=False): + pandas_only = self._pandas_only() + + if pa is not None and not pandas_only: + self._init_by_arrow(values, dtype=dtype, copy=copy) + elif not is_kernel_mode(): + # not in kernel mode, allow to use numpy handle data + # just for infer dtypes purpose + self._init_by_numpy(values, dtype=dtype, copy=copy) + else: + raise ImportError("Cannot create ArrowArray when `pyarrow` not installed") + + # for test purpose + self._force_use_pandas = pandas_only + + def _init_by_arrow(self, values, dtype: ArrowDtype = None, copy=False): + 
if isinstance(values, (pd.Index, pd.Series)): + # for pandas Index and Series, + # convert to PandasArray + values = values.array + + if isinstance(values, type(self)): + arrow_array = values._arrow_array + elif isinstance(values, ExtensionArray): + # if come from pandas object like index, + # convert to pandas StringArray first, + # validation will be done in construct + arrow_array = pa.chunked_array([pa.array(values, from_pandas=True)]) + elif isinstance(values, pa.ChunkedArray): + arrow_array = values + elif isinstance(values, pa.Array): + arrow_array = pa.chunked_array([values]) + elif len(values) == 0: # pragma: no cover + arrow_array = pa.chunked_array([pa.array([], type=dtype.arrow_type)]) + else: + arrow_array = pa.chunked_array([pa.array(values, type=dtype.arrow_type)]) + + if copy: + arrow_array = copy_obj(arrow_array) + + self._use_arrow = True + self._arrow_array = arrow_array + + if NDArrayBacked is not None and isinstance(self, NDArrayBacked): + NDArrayBacked.__init__(self, np.array([]), dtype) + else: + self._dtype = dtype + + def _init_by_numpy(self, values, dtype: ArrowDtype = None, copy=False): + self._use_arrow = False + + ndarray = np.array(values, copy=copy) + if NDArrayBacked is not None and isinstance(self, NDArrayBacked): + NDArrayBacked.__init__(self, ndarray, dtype) + else: + self._dtype = dtype + self._ndarray = np.array(values, copy=copy) + + @classmethod + def _pandas_only(cls): + return options.dataframe.arrow_array.pandas_only + + def __repr__(self): + return f"{type(self).__name__}({repr(self._array)})" + + @property + def _array(self): + return self._arrow_array if self._use_arrow else self._ndarray + + @property + def dtype(self) -> "Type[ArrowDtype]": + return self._dtype + + @property + def nbytes(self) -> int: + if self._use_arrow: + return sum( + x.size + for chunk in self._arrow_array.chunks + for x in chunk.buffers() + if x is not None + ) + else: + return self._ndarray.nbytes + + @property + def shape(self): + if self._use_arrow: + return (self._arrow_array.length(),) + else: + return self._ndarray.shape + + def memory_usage(self, deep=True) -> int: + if self._use_arrow: + return self.nbytes + else: + return pd.Series(self._ndarray).memory_usage(index=False, deep=deep) + + @classmethod + def _to_arrow_array(cls, scalars): + return pa.array(scalars) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + if pa is None or cls._pandas_only(): + # pyarrow not installed, just return numpy + ret = np.empty(len(scalars), dtype=object) + ret[:] = scalars + return cls(ret) + + if pa_null is not None and isinstance(scalars, type(pa_null)): + scalars = [] + elif not hasattr(scalars, "dtype"): + ret = np.empty(len(scalars), dtype=object) + for i, s in enumerate(scalars): + ret[i] = s + scalars = ret + elif isinstance(scalars, cls): + if copy: + scalars = scalars.copy() + return scalars + arrow_array = pa.chunked_array([cls._to_arrow_array(scalars)]) + return cls(arrow_array, dtype=dtype, copy=copy) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + @staticmethod + def _can_process_slice_via_arrow(slc): + if not isinstance(slc, slice): + return False + if slc.step is not None and slc.step != 1: + return False + if slc.start is not None and not isinstance( + slc.start, Integral + ): # pragma: no cover + return False + if slc.stop is not None and not isinstance( + slc.stop, Integral + ): # pragma: no cover + return False + return True + + 
def _values_for_factorize(self): + arr = self.to_numpy() + mask = self.isna() + arr[mask] = -1 + return arr, -1 + + def _values_for_argsort(self): + return self.to_numpy() + + @classmethod + def _from_factorized(cls, values, original): + return cls(values) + + @staticmethod + def _process_pos(pos, length, is_start): + if pos is None: + return 0 if is_start else length + return pos + length if pos < 0 else pos + + @classmethod + def _post_scalar_getitem(cls, lst): + return lst.to_pandas()[0] + + def __getitem__(self, item): + cls = type(self) + + if pa is None or self._force_use_pandas: + # pyarrow not installed + result = self._ndarray[item] + if pd.api.types.is_scalar(item): + return result + else: + return type(self)(result) + + has_take = hasattr(self._arrow_array, "take") + if not self._force_use_pandas and has_take: + if pd.api.types.is_scalar(item): + item = item + len(self) if item < 0 else item + return self._post_scalar_getitem(self._arrow_array.take([item])) + elif self._can_process_slice_via_arrow(item): + length = len(self) + start, stop = item.start, item.stop + start = self._process_pos(start, length, True) + stop = self._process_pos(stop, length, False) + return cls( + self._arrow_array.slice(offset=start, length=stop - start), + dtype=self._dtype, + ) + elif hasattr(item, "dtype") and np.issubdtype(item.dtype, np.bool_): + return cls( + self._arrow_array.filter(pa.array(item, from_pandas=True)), + dtype=self._dtype, + ) + elif hasattr(item, "dtype"): + length = len(self) + item = np.where(item < 0, item + length, item) + return cls(self._arrow_array.take(item), dtype=self._dtype) + + array = np.asarray(self._arrow_array.to_pandas()) + return cls(array[item], dtype=self._dtype) + + @classmethod + def _concat_same_type(cls, to_concat: Sequence["ArrowArray"]) -> "ArrowArray": + if pa is None or cls._pandas_only(): + # pyarrow not installed + return cls(np.concatenate([x._array for x in to_concat])) + + chunks = list( + itertools.chain.from_iterable(x._arrow_array.chunks for x in to_concat) + ) + if len(chunks) == 0: + chunks = [pa.array([], type=to_concat[0].dtype.arrow_type)] + return cls(pa.chunked_array(chunks)) + + def __len__(self): + return len(self._array) + + def __array__(self, dtype=None): + return self.to_numpy(dtype=dtype) + + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + if self._use_arrow: + array = np.asarray(self._arrow_array.to_pandas()) + else: + array = self._ndarray + if copy or na_value is not lib.no_default: + array = array.copy() + if na_value is not lib.no_default: + array[self.isna()] = na_value + return array + + @classmethod + def _array_fillna(cls, array, value): + return array.fillna(value) + + def fillna(self, value=None, method=None, limit=None): + cls = type(self) + + if pa is None or self._force_use_pandas: + # pyarrow not installed + return cls( + pd.Series(self.to_numpy()).fillna( + value=value, method=method, limit=limit + ) + ) + + chunks = [] + for chunk_array in self._arrow_array.chunks: + array = chunk_array.to_pandas() + if method is None: + result_array = self._array_fillna(array, value) + else: + result_array = array.fillna(value=value, method=method, limit=limit) + chunks.append(pa.array(result_array, from_pandas=True)) + return cls(pa.chunked_array(chunks), dtype=self._dtype) + + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + if isinstance(dtype, ArrowStringDtype): + if copy: + return self.copy() + return self + + if pa is None or self._force_use_pandas: + # pyarrow not installed + if 
isinstance(dtype, ArrowDtype): + dtype = dtype.type + return type(self)(pd.Series(self.to_numpy()).astype(dtype, copy=copy)) + + # try to slice 1 record to get the result dtype + test_array = self._arrow_array.slice(0, 1).to_pandas() + test_result_array = test_array.astype(dtype).array + if _use_extension_index: + test_result_type = type(test_array.astype(dtype).values) + if test_result_type is np.ndarray: + test_result_type = np.array + else: + test_result_type = type(test_result_array) + + result_array = test_result_type( + np.full( + self.shape, + test_result_array.dtype.na_value, + dtype=np.asarray(test_result_array).dtype, + ) + ) + + start = 0 + # use chunks to do astype + for chunk_array in self._arrow_array.chunks: + result_array[start : start + len(chunk_array)] = ( + chunk_array.to_pandas().astype(dtype).array + ) + start += len(chunk_array) + return result_array + + def isna(self): + if ( + not self._force_use_pandas + and self._use_arrow + and hasattr(self._arrow_array, "is_null") + ): + return self._arrow_array.is_null().to_pandas().to_numpy() + elif self._use_arrow: + return pd.isna(self._arrow_array.to_pandas()).to_numpy() + else: + return pd.isna(self._ndarray) + + def take(self, indices, allow_fill=False, fill_value=None): + if ( + allow_fill is False or (allow_fill and fill_value is self.dtype.na_value) + ) and len(self) > 0: + return type(self)(self[indices], dtype=self._dtype) + + if self._use_arrow: + array = self._arrow_array.to_pandas().to_numpy() + else: + array = self._ndarray + + replace = False + if allow_fill and (fill_value is None or fill_value == self._dtype.na_value): + fill_value = self.dtype.na_value + replace = True + + result = take(array, indices, fill_value=fill_value, allow_fill=allow_fill) + del array + if replace and pa is not None: + # pyarrow cannot recognize pa.NULL + result[result == self.dtype.na_value] = None + return type(self)(result, dtype=self._dtype) + + def copy(self): + if self._use_arrow: + return type(self)(copy_obj(self._arrow_array)) + else: + return type(self)(self._ndarray.copy()) + + def unique(self): + if self._force_use_pandas or not self._use_arrow or not hasattr(pc, "unique"): + return type(self)(np.unique(self.to_numpy()), dtype=self._dtype) + return type(self)(pc.unique(self._arrow_array), dtype=self._dtype) + + def value_counts(self, dropna=False): + if self._use_arrow: + series = self._arrow_array.to_pandas() + else: + series = pd.Series(self._ndarray) + return type(self)(series.value_counts(dropna=dropna), dtype=self._dtype) + + if _use_bool_any_all: + + def any(self, axis=0, out=None): + return self.to_numpy().astype(bool).any(axis=axis, out=out) + + def all(self, axis=0, out=None): + return self.to_numpy().astype(bool).all(axis=axis, out=out) + + else: + + def any(self, axis=0, out=None): + return self.to_numpy().any(axis=axis, out=out) + + def all(self, axis=0, out=None): + return self.to_numpy().all(axis=axis, out=out) + + def __mars_tokenize__(self): + if self._use_arrow: + return tokenize( + [ + memoryview(x) + for chunk in self._arrow_array.chunks + for x in chunk.buffers() + if x is not None + ] + ) + else: + return self._ndarray + + +class ArrowStringArray(ArrowArray, StringArrayBase): + def __init__(self, values, dtype=None, copy=False): + if dtype is not None: + assert isinstance(dtype, ArrowStringDtype) + ArrowArray.__init__(self, values, ArrowStringDtype(), copy=copy) + + @classmethod + def from_scalars(cls, values): + if pa is None or cls._pandas_only(): + return cls._from_sequence(values) + else: + 
arrow_array = pa.chunked_array([cls._to_arrow_array(values)]) + return cls(arrow_array) + + @classmethod + def _to_arrow_array(cls, scalars): + return pa.array(scalars).cast(pa.string()) + + def __setitem__(self, key, value): + if isinstance(value, (pd.Index, pd.Series)): + value = value.to_numpy() + if isinstance(value, type(self)): + value = value.to_numpy() + + key = check_array_indexer(self, key) + scalar_key = is_scalar(key) + scalar_value = is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") + + # validate new items + if scalar_value: + if pd.isna(value): + value = None + elif not isinstance(value, str): + raise ValueError( + f"Cannot set non-string value '{value}' into a ArrowStringArray." + ) + else: + if not is_array_like(value): + value = np.asarray(value, dtype=object) + if len(value) and not lib.is_string_array(value, skipna=True): + raise ValueError("Must provide strings.") + + if self._use_arrow: + string_array = np.asarray(self._arrow_array.to_pandas()) + string_array[key] = value + self._arrow_array = pa.chunked_array([pa.array(string_array)]) + else: + self._ndarray[key] = value + + # Override parent because we have different return types. + @classmethod + def _create_arithmetic_method(cls, op): + # Note: this handles both arithmetic and comparison methods. + def method(self, other): + is_arithmetic = True if op.__name__ in ops.ARITHMETIC_BINOPS else False + pandas_only = cls._pandas_only() + + is_other_array = False + if not is_scalar(other): + is_other_array = True + other = np.asarray(other) + + self_is_na = self.isna() + other_is_na = pd.isna(other) + mask = self_is_na | other_is_na + + if pa is None or pandas_only: + if is_arithmetic: + ret = np.empty(self.shape, dtype=object) + else: + ret = np.zeros(self.shape, dtype=bool) + valid = ~mask + arr = ( + self._arrow_array.to_pandas().to_numpy() + if self._use_arrow + else self._ndarray + ) + o = other[valid] if is_other_array else other + ret[valid] = op(arr[valid], o) + if is_arithmetic: + return ArrowStringArray(ret) + else: + return pd.arrays.BooleanArray(ret, mask) + + chunks = [] + mask_chunks = [] + start = 0 + for chunk_array in self._arrow_array.chunks: + chunk_array = np.asarray(chunk_array.to_pandas()) + end = start + len(chunk_array) + chunk_mask = mask[start:end] + chunk_valid = ~chunk_mask + + if is_arithmetic: + result = np.empty(chunk_array.shape, dtype=object) + else: + result = np.zeros(chunk_array.shape, dtype=bool) + + chunk_other = other + if is_other_array: + chunk_other = other[start:end] + chunk_other = chunk_other[chunk_valid] + + # calculate only for both not None + result[chunk_valid] = op(chunk_array[chunk_valid], chunk_other) + + if is_arithmetic: + chunks.append(pa.array(result, type=pa.string(), from_pandas=True)) + else: + chunks.append(result) + mask_chunks.append(chunk_mask) + + if is_arithmetic: + return ArrowStringArray(pa.chunked_array(chunks)) + else: + return pd.arrays.BooleanArray( + np.concatenate(chunks), np.concatenate(mask_chunks) + ) + + return set_function_name(method, f"__{op.__name__}__", cls) + + def shift(self, periods: int = 1, fill_value: object = None) -> "ArrowStringArray": + return ExtensionArray.shift(self, periods=periods, fill_value=fill_value) + + @classmethod + def _add_arithmetic_ops(cls): + cls.__add__ = cls._create_arithmetic_method(operator.add) + cls.__radd__ = cls._create_arithmetic_method(ops.radd) + + cls.__mul__ = cls._create_arithmetic_method(operator.mul) + cls.__rmul__ = 
cls._create_arithmetic_method(ops.rmul) + + @classmethod + def _add_comparison_ops(cls): + cls.__eq__ = cls._create_comparison_method(operator.eq) + cls.__ne__ = cls._create_comparison_method(operator.ne) + cls.__lt__ = cls._create_comparison_method(operator.lt) + cls.__gt__ = cls._create_comparison_method(operator.gt) + cls.__le__ = cls._create_comparison_method(operator.le) + cls.__ge__ = cls._create_comparison_method(operator.ge) + + _create_comparison_method = _create_arithmetic_method + + +ArrowStringArray._add_arithmetic_ops() +ArrowStringArray._add_comparison_ops() + + +class ArrowListArray(ArrowArray): + def __init__(self, values, dtype: ArrowListDtype = None, copy=False): + if dtype is None: + if isinstance(values, type(self)): + dtype = values.dtype + elif pa is not None: + if isinstance(values, pa.Array): + dtype = ArrowListDtype(values.type.value_type) + elif isinstance(values, pa.ChunkedArray): + dtype = ArrowListDtype(values.type.value_type) + else: + values = pa.array(values) + if values.type == pa.null(): + dtype = ArrowListDtype(pa.string()) + else: + dtype = ArrowListDtype(values.type.value_type) + else: + value_type = np.asarray(values[0]).dtype + dtype = ArrowListDtype(value_type) + + super().__init__(values, dtype=dtype, copy=copy) + + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + if self._use_arrow: + s = self._arrow_array.to_pandas() + else: + s = pd.Series(self._ndarray) + s = s.map(lambda x: x.tolist() if hasattr(x, "tolist") else x) + if copy or na_value is not lib.no_default: + s = s.copy() + if na_value is not lib.no_default: + s[self.isna()] = na_value + return np.asarray(s) + + @classmethod + def _post_scalar_getitem(cls, lst): + return lst[0].as_py() + + def __setitem__(self, key, value): + if isinstance(value, (pd.Index, pd.Series)): + value = value.to_numpy() + + key = check_array_indexer(self, key) + scalar_key = is_scalar(key) + + # validate new items + if scalar_key: + if pd.isna(value): + value = None + elif not is_list_like(value): + raise ValueError("Must provide list.") + + if self._use_arrow: + array = np.asarray(self._arrow_array.to_pandas()) + array[key] = value + self._arrow_array = pa.chunked_array( + [pa.array(array, type=self.dtype.arrow_type)] + ) + else: + self._ndarray[key] = value + + @classmethod + def _array_fillna(cls, series, value): + # cannot fillna directly, because value is a list-like object + return series.apply(lambda x: x if is_list_like(x) or not pd.isna(x) else value) + + def astype(self, dtype, copy=True): + msg = f"cannot astype from {self.dtype} to {dtype}" + dtype = pandas_dtype(dtype) + if isinstance(dtype, ArrowListDtype): + if self.dtype == dtype: + if copy: + return self.copy() + return self + else: + if self._use_arrow: + try: + arrow_array = self._arrow_array.cast(dtype.arrow_type) + return ArrowListArray(arrow_array) + except (NotImplementedError, pa.ArrowInvalid): + raise TypeError(msg) + else: + + def f(x): + return pd.Series(x).astype(dtype.value_type.type).tolist() + + try: + arr = pd.Series(self._ndarray) + ret = arr.map(f).to_numpy() + return ArrowStringArray(ret) + except ValueError: + raise TypeError(msg) + + try: + return super().astype(dtype, copy=copy) + except ValueError: + raise TypeError(msg) diff --git a/python/xorbits/_mars/dataframe/base/__init__.py b/python/xorbits/_mars/dataframe/base/__init__.py new file mode 100644 index 000000000..fb7119733 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/__init__.py @@ -0,0 +1,151 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .apply import df_apply, series_apply +from .astype import astype, index_astype +from .cartesian_chunk import cartesian_chunk +from .check_monotonic import ( + check_monotonic, + is_monotonic, + is_monotonic_decreasing, + is_monotonic_increasing, +) +from .cut import cut +from .describe import describe +from .diff import df_diff, series_diff +from .drop import df_drop, df_pop, index_drop, series_drop +from .drop_duplicates import ( + df_drop_duplicates, + index_drop_duplicates, + series_drop_duplicates, +) +from .duplicated import df_duplicated, index_duplicated, series_duplicated +from .eval import df_eval, df_query +from .explode import df_explode, series_explode +from .isin import df_isin, series_isin +from .map import index_map, series_map +from .map_chunk import map_chunk +from .melt import melt +from .memory_usage import df_memory_usage, index_memory_usage, series_memory_usage +from .pct_change import pct_change +from .qcut import qcut +from .rebalance import rebalance +from .rechunk import rechunk +from .select_dtypes import select_dtypes +from .shift import shift, tshift +from .stack import stack +from .to_cpu import to_cpu +from .to_gpu import to_gpu +from .transform import df_transform, series_transform +from .transpose import transpose +from .value_counts import value_counts + + +def _install(): + from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE + from .accessor import CachedAccessor, DatetimeAccessor, StringAccessor + from .datetimes import _datetime_method_to_handlers + from .standardize_range_index import ChunkStandardizeRangeIndex + from .string_ import _string_method_to_handlers + + for t in DATAFRAME_TYPE: + setattr(t, "to_gpu", to_gpu) + setattr(t, "to_cpu", to_cpu) + setattr(t, "rechunk", rechunk) + setattr(t, "describe", describe) + setattr(t, "apply", df_apply) + setattr(t, "transform", df_transform) + setattr(t, "isin", df_isin) + setattr(t, "shift", shift) + setattr(t, "tshift", tshift) + setattr(t, "diff", df_diff) + setattr(t, "astype", astype) + setattr(t, "drop", df_drop) + setattr(t, "pop", df_pop) + setattr( + t, "__delitem__", lambda df, items: df_drop(df, items, axis=1, inplace=True) + ) + setattr(t, "drop_duplicates", df_drop_duplicates) + setattr(t, "duplicated", df_duplicated) + setattr(t, "melt", melt) + setattr(t, "memory_usage", df_memory_usage) + setattr(t, "select_dtypes", select_dtypes) + setattr(t, "map_chunk", map_chunk) + setattr(t, "cartesian_chunk", cartesian_chunk) + setattr(t, "rebalance", rebalance) + setattr(t, "stack", stack) + setattr(t, "explode", df_explode) + setattr(t, "eval", df_eval) + setattr(t, "query", df_query) + setattr(t, "pct_change", pct_change) + setattr(t, "transpose", transpose) + + for t in SERIES_TYPE: + setattr(t, "to_gpu", to_gpu) + setattr(t, "to_cpu", to_cpu) + setattr(t, "rechunk", rechunk) + setattr(t, "map", series_map) + setattr(t, "describe", describe) + setattr(t, "apply", series_apply) + setattr(t, "transform", series_transform) + setattr(t, "isin", 
series_isin) + setattr(t, "shift", shift) + setattr(t, "tshift", tshift) + setattr(t, "diff", series_diff) + setattr(t, "value_counts", value_counts) + setattr(t, "astype", astype) + setattr(t, "drop", series_drop) + setattr(t, "drop_duplicates", series_drop_duplicates) + setattr(t, "duplicated", series_duplicated) + setattr(t, "memory_usage", series_memory_usage) + setattr(t, "map_chunk", map_chunk) + setattr(t, "cartesian_chunk", cartesian_chunk) + setattr(t, "rebalance", rebalance) + setattr(t, "explode", series_explode) + setattr(t, "check_monotonic", check_monotonic) + setattr(t, "is_monotonic", property(fget=is_monotonic)) + setattr(t, "is_monotonic_increasing", property(fget=is_monotonic_increasing)) + setattr(t, "is_monotonic_decreasing", property(fget=is_monotonic_decreasing)) + setattr(t, "pct_change", pct_change) + + for t in INDEX_TYPE: + setattr(t, "map", index_map) + setattr(t, "rechunk", rechunk) + setattr(t, "rebalance", rebalance) + setattr(t, "drop", index_drop) + setattr(t, "drop_duplicates", index_drop_duplicates) + setattr(t, "duplicated", index_duplicated) + setattr(t, "memory_usage", index_memory_usage) + setattr(t, "astype", index_astype) + setattr(t, "value_counts", value_counts) + setattr(t, "check_monotonic", check_monotonic) + setattr(t, "is_monotonic", property(fget=is_monotonic)) + setattr(t, "is_monotonic_increasing", property(fget=is_monotonic_increasing)) + setattr(t, "is_monotonic_decreasing", property(fget=is_monotonic_decreasing)) + + for method in _string_method_to_handlers: + if not hasattr(StringAccessor, method): + StringAccessor._register(method) + + for method in _datetime_method_to_handlers: + if not hasattr(DatetimeAccessor, method): + DatetimeAccessor._register(method) + + for series in SERIES_TYPE: + series.str = CachedAccessor("str", StringAccessor) + series.dt = CachedAccessor("dt", DatetimeAccessor) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/base/_duplicate.py b/python/xorbits/_mars/dataframe/base/_duplicate.py new file mode 100644 index 000000000..32869ddf3 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/_duplicate.py @@ -0,0 +1,412 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
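+
+# ``DuplicateOperand`` defined below is the shared map-reduce base for the
+# ``drop_duplicates``/``duplicated`` operands. Its ``tile`` method picks one of
+# three strategies: ``tree`` (combine chunks through a reduction tree),
+# ``subset_tree`` (deduplicate a small projection of the ``subset`` columns
+# first, then map the surviving row indexes back onto the full chunks) and
+# ``shuffle`` (hash-shuffle rows across reducers, then put the survivors back
+# on their original chunks). ``auto`` chooses ``subset_tree`` when the subset
+# columns are estimated to fit within ``options.chunk_store_limit`` and falls
+# back to ``tree`` otherwise.
+#
+# A minimal usage sketch, assuming the ``method`` keyword exposed by the
+# ``drop_duplicates`` wrapper built on top of this operand:
+#
+#   >>> import mars.dataframe as md
+#   >>> df = md.DataFrame({'a': [1, 1, 2], 'b': [3, 3, 4]})
+#   >>> df.drop_duplicates(subset=['a'], method='auto').execute()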
+ +import inspect + +import numpy as np +import pandas as pd +from pandas.api.types import is_list_like + +from ...config import options +from ...core import OutputType, recursive_tile +from ...core.operand import MapReduceOperand, OperandStage +from ...serialization.serializables import AnyField, Int32Field, KeyField, StringField +from ...utils import ceildiv, has_unknown_shape, lazy_import +from ..initializer import DataFrame as asdataframe +from ..operands import DataFrameOperandMixin, DataFrameShuffleProxy + +cudf = lazy_import("cudf") + + +class DuplicateOperand(MapReduceOperand, DataFrameOperandMixin): + _input = KeyField("input") + _subset = AnyField("subset") + _keep = AnyField("keep") + _method = StringField("method") + + # subset chunk, used for method 'subset_tree' + _subset_chunk = KeyField("subset_chunk") + # shuffle phase, used in shuffle method + _shuffle_size = Int32Field("shuffle_size") + + @property + def input(self): + return self._input + + @property + def subset(self): + return self._subset + + @property + def keep(self): + return self._keep + + @property + def method(self): + return self._method + + @property + def subset_chunk(self): + return self._subset_chunk + + @property + def shuffle_size(self): + return self._shuffle_size + + @classmethod + def _get_shape(cls, input_shape, op): # pragma: no cover + raise NotImplementedError + + @classmethod + def _gen_tileable_params( + cls, op: "DuplicateOperand", input_params + ): # pragma: no cover + raise NotImplementedError + + @classmethod + def _gen_chunk_params(cls, op: "DuplicateOperand", input_chunk): # pragma: no cover + raise NotImplementedError + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if self._subset_chunk is not None: + self._subset_chunk = self._inputs[1] + + @classmethod + def _tile_one_chunk(cls, op: "DuplicateOperand"): + inp = op.input + out = op.outputs[0] + + chunk_op = op.copy().reset_key() + chunk_op._method = None + in_chunk = inp.chunks[0] + chunk_params = cls._gen_chunk_params(chunk_op, in_chunk) + chunk = chunk_op.new_chunk([in_chunk], kws=[chunk_params]) + + params = out.params + params["chunks"] = [chunk] + params["nsplits"] = tuple((s,) for s in chunk.shape) + new_op = op.copy() + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def _get_map_output_types(cls, input_chunk, method: str): + raise NotImplementedError + + @classmethod + def _gen_map_chunks(cls, op: "DuplicateOperand", inp, method, **kw): + chunks = inp.chunks + map_chunks = [] + for c in chunks: + chunk_op = op.copy().reset_key() + chunk_op._output_types = cls._get_map_output_types(c, method) + chunk_op._method = method + chunk_op.stage = OperandStage.map + for k, v in kw.items(): + setattr(chunk_op, k, v) + chunk_params = cls._gen_chunk_params(chunk_op, c) + map_chunks.append(chunk_op.new_chunk([c], kws=[chunk_params])) + return map_chunks + + @classmethod + def _gen_tree_chunks(cls, op: "DuplicateOperand", inp, method): + from ..merge import DataFrameConcat + + out = op.outputs[0] + combine_size = options.combine_size + new_chunks = cls._gen_map_chunks(op, inp, method) + while len(new_chunks) > 1: + out_chunk_size = ceildiv(len(new_chunks), combine_size) + out_chunks = [] + for i in range(out_chunk_size): + in_chunks = new_chunks[i * combine_size : (i + 1) * combine_size] + s = sum(c.shape[0] for c in in_chunks) + if in_chunks[0].ndim == 2: + kw = dict( + dtypes=in_chunks[0].dtypes, + index_value=in_chunks[0].index_value, + 
columns_value=in_chunks[0].columns_value, + shape=(s, in_chunks[0].shape[1]), + index=(i, 0), + ) + else: + kw = dict( + dtype=in_chunks[0].dtype, + index_value=in_chunks[0].index_value, + name=in_chunks[0].name, + shape=(s,), + index=(i,), + ) + concat_chunk = DataFrameConcat( + output_types=in_chunks[0].op.output_types + ).new_chunk(in_chunks, **kw) + chunk_op = op.copy().reset_key() + chunk_op._method = method + chunk_op.stage = ( + OperandStage.combine if out_chunk_size > 1 else OperandStage.agg + ) + if out_chunk_size > 1 and method == "tree": + # for tree, chunks except last one should be dataframes, + chunk_op._output_types = ( + concat_chunk.op.output_types + if out_chunk_size > 1 + else out.op.output_types + ) + elif method == "subset_tree": + # `subset_tree` will tile chunks that are always dataframes + chunk_op._output_types = [OutputType.dataframe] + params = cls._gen_chunk_params(chunk_op, concat_chunk) + if out.ndim == 1 and out_chunk_size == 1: + params["name"] = out.name + out_chunks.append(chunk_op.new_chunk([concat_chunk], kws=[params])) + new_chunks = out_chunks + + return new_chunks + + @classmethod + def _tile_tree(cls, op: "DuplicateOperand", inp): + out = op.outputs[0] + + params = out.params + params["chunks"] = chunks = cls._gen_tree_chunks(op, inp, "tree") + params["nsplits"] = tuple((s,) for s in chunks[0].shape) + new_op = op.copy() + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def _tile_subset_tree(cls, op: "DuplicateOperand", inp): + # subset is available for DataFrame only + inp = asdataframe(inp) + out = op.outputs[0] + subset = op.subset + if subset is None: + subset = inp.dtypes.index.tolist() + # select subset first + subset_df = yield from recursive_tile(inp[subset]) + # tree aggregate subset + subset_chunk = cls._gen_tree_chunks(op, subset_df, "subset_tree")[0] + + out_chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op._method = "subset_tree" + chunk_op._subset_chunk = subset_chunk + chunk_params = cls._gen_chunk_params(chunk_op, c) + out_chunks.append(chunk_op.new_chunk([c, subset_chunk], kws=[chunk_params])) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + splits = tuple(c.shape[0] for c in out_chunks) + if out.ndim == 2: + params["nsplits"] = (splits, inp.nsplits[1]) + else: + params["nsplits"] = (splits,) + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def _tile_shuffle(cls, op: "DuplicateOperand", inp): + out = op.outputs[0] + + map_chunks = cls._gen_map_chunks( + op, inp, "shuffle", _shuffle_size=inp.chunk_shape[0] + ) + proxy_chunk = DataFrameShuffleProxy( + output_types=map_chunks[0].op.output_types + ).new_chunk(map_chunks, shape=()) + reduce_chunks = [] + for i in range(len(map_chunks)): + reduce_op = op.copy().reset_key() + reduce_op._method = "shuffle" + reduce_op.stage = OperandStage.reduce + reduce_op.reducer_phase = "drop_duplicates" + reduce_op.n_reducers = len(map_chunks) + reduce_op.reducer_ordinal = i + reduce_op._shuffle_size = inp.chunk_shape[0] + reduce_op._output_types = op.output_types + reduce_chunk_params = map_chunks[0].params + reduce_chunk_params["index"] = (i,) + reduce_chunk_params["index"][1:] + reduce_chunk_params["is_mapper"] = True + reduce_chunks.append( + reduce_op.new_chunk([proxy_chunk], kws=[reduce_chunk_params]) + ) + + put_back_proxy_chunk = DataFrameShuffleProxy( + output_types=map_chunks[0].op.output_types + ).new_chunk(reduce_chunks, shape=()) + put_back_chunks = [] + for i in range(len(map_chunks)): + 
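+ # put-back pass: a second shuffle redistributes the deduplicated rows so
+ # that output chunk ``i`` corresponds to input chunk ``i`` again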
put_back_op = op.copy().reset_key() + put_back_op._method = "shuffle" + put_back_op.stage = OperandStage.reduce + put_back_op.reducer_phase = "put_back" + put_back_op.reducer_index = (i,) + put_back_op.n_reducers = len(map_chunks) + put_back_op.reducer_ordinal = i + if out.ndim == 2: + put_back_chunk_params = map_chunks[i].params + else: + put_back_chunk_params = out.params.copy() + map_chunk_params = map_chunks[i].params + put_back_chunk_params["index_value"] = map_chunk_params["index_value"] + put_back_chunk_params["index"] = map_chunk_params["index"][:1] + if out.ndim == 1: + put_back_chunk_params["index"] = (i,) + else: + put_back_chunk_params["index"] = (i,) + put_back_chunk_params["index"][ + 1: + ] + put_back_chunk_params["shape"] = cls._get_shape( + map_chunks[i].op.input.shape, op + ) + put_back_chunks.append( + put_back_op.new_chunk( + [put_back_proxy_chunk], kws=[put_back_chunk_params] + ) + ) + + new_op = op.copy() + params = out.params + params["chunks"] = put_back_chunks + split = tuple(c.shape[0] for c in put_back_chunks) + if out.ndim == 2: + params["nsplits"] = (split, inp.nsplits[1]) + else: + params["nsplits"] = (split,) + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def tile(cls, op: "DuplicateOperand"): + inp = op.input + + if len(inp.chunks) == 1: + # one chunk + return cls._tile_one_chunk(op) + + if inp.ndim == 2 and inp.chunk_shape[1] > 1: + if has_unknown_shape(inp): + yield + inp = yield from recursive_tile(inp.rechunk({1: inp.shape[1]})) + + default_tile = cls._tile_tree + + if op.method == "auto": + # if method == 'auto', pick appropriate method + if np.isnan(inp.shape[0]) or op.subset is None: + # if any unknown shape exist, + # choose merge method + return default_tile(op, inp) + + # check subset data to see if it's small enough + subset_dtypes = inp.dtypes[op.subset] + memory_usage = 0.0 + for s_dtype in subset_dtypes: + if s_dtype.kind == "O" or not hasattr(s_dtype, "itemsize"): + # object, just use default tile + return default_tile(op, inp) + else: + memory_usage += s_dtype.itemsize * inp.shape[0] + if memory_usage <= options.chunk_store_limit: + # if subset is small enough, use method 'subset_tree' + r = yield from cls._tile_subset_tree(op, inp) + return r + else: + return default_tile(op, inp) + elif op.method == "subset_tree": + r = yield from cls._tile_subset_tree(op, inp) + return r + elif op.method == "tree": + return cls._tile_tree(op, inp) + else: + assert op.method == "shuffle" + ret = cls._tile_shuffle(op, inp) + if inspect.isgenerator(ret): + return (yield from ret) + else: + return ret + + @classmethod + def _drop_duplicates(cls, inp, op, subset=None, keep=None, ignore_index=None): + if ignore_index is None: + ignore_index = op.ignore_index + if subset is None: + subset = op.subset + if keep is None: + keep = op.keep + if inp.ndim == 2: + try: + return inp.drop_duplicates( + subset=subset, keep=keep, ignore_index=ignore_index + ) + except TypeError: + # no ignore_index for pandas < 1.0 + ret = inp.drop_duplicates(subset=subset, keep=keep) + if ignore_index: + ret.reset_index(drop=True, inplace=True) + return ret + else: + return inp.drop_duplicates(keep=keep) + + @classmethod + def _get_xdf(cls, x): + if cudf is None: + return pd + elif isinstance(x, (pd.Index, pd.Series, pd.DataFrame)): # pragma: no cover + return pd + else: # pragma: no cover + return cudf + + @classmethod + def _execute_subset_tree_map(cls, ctx, op): + out = op.outputs[0] + idx = out.index[0] + inp = ctx[op.input.key] + xdf = cls._get_xdf(inp) + + # 
index would be (chunk_index, i) + index = xdf.MultiIndex.from_arrays( + [np.full(inp.shape[0], idx), np.arange(inp.shape[0])], + names=["_chunk_index_", "_i_"], + ) + inp = inp.set_index(index) + ctx[out.key] = cls._drop_duplicates(inp, op, ignore_index=False) + + @classmethod + def _execute_subset_tree_combine(cls, ctx, op): + inp = ctx[op.input.key] + ctx[op.outputs[0].key] = cls._drop_duplicates(inp, op, ignore_index=False) + + @classmethod + def _execute_subset_tree_agg(cls, ctx, op): + inp = ctx[op.input.key] + ret = cls._drop_duplicates(inp, op, ignore_index=False) + ret = ret.index.to_frame() + ret.reset_index(drop=True, inplace=True) + ctx[op.outputs[0].key] = ret + + +def validate_subset(df, subset): + if subset is None: + return subset + if not is_list_like(subset): + subset = [subset] + else: + subset = list(subset) + + for s in subset: + if s not in df.dtypes: + raise KeyError(pd.Index([s])) + + return subset diff --git a/python/xorbits/_mars/dataframe/base/accessor.py b/python/xorbits/_mars/dataframe/base/accessor.py new file mode 100644 index 000000000..fb9b88c39 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/accessor.py @@ -0,0 +1,276 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import wraps +from typing import Iterable + +import pandas as pd +from pandas.api.types import ( + is_datetime64_dtype, + is_datetime64tz_dtype, + is_period_dtype, + is_timedelta64_dtype, +) + +from ...utils import adapt_mars_docstring +from .datetimes import SeriesDatetimeMethod, _datetime_method_to_handlers +from .string_ import SeriesStringMethod, _string_method_to_handlers + + +class StringAccessor: + """ + Vectorized string functions for Series and Index. + NAs stay NA unless handled otherwise by a particular method. + Patterned after Python's string methods, with some inspiration from + R's stringr package. + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(["A_Str_Series"]) + >>> s.execute() + 0 A_Str_Series + dtype: object + >>> s.str.split("_").execute() + 0 [A, Str, Series] + dtype: object + >>> s.str.replace("_", "").execute() + 0 AStrSeries + dtype: object + """ + + def __init__(self, series): + self._series = series + + @classmethod + def _gen_func(cls, method): + @wraps(getattr(pd.Series.str, method)) + def _inner(self, *args, **kwargs): + op = SeriesStringMethod( + method=method, method_args=args, method_kwargs=kwargs + ) + return op(self._series) + + _inner.__doc__ = adapt_mars_docstring(getattr(pd.Series.str, method).__doc__) + return _inner + + def __getitem__(self, item): + return self._gen_func("__getitem__")(self, item) + + def __dir__(self) -> Iterable[str]: + s = set(super().__dir__()) + s.update(_string_method_to_handlers.keys()) + return list(s) + + @classmethod + def _register(cls, method): + setattr(cls, method, cls._gen_func(method)) + + def split(self, pat=None, n=-1, expand=False): + r""" + Split strings around given separator/delimiter. 
+ + Splits the string in the Series/Index from the beginning, + at the specified delimiter string. Equivalent to :meth:`str.split`. + + Parameters + ---------- + pat : str, optional + String or regular expression to split on. + If not specified, split on whitespace. + n : int, default -1 (all) + Limit number of splits in output. + ``None``, 0 and -1 will be interpreted as return all splits. + expand : bool, default False + Expand the splitted strings into separate columns. + + * If ``True``, return DataFrame/MultiIndex expanding dimensionality. + * If ``False``, return Series/Index, containing lists of strings. + + Returns + ------- + Series, Index, DataFrame or MultiIndex + Type matches caller unless ``expand=True`` (see Notes). + + See Also + -------- + Series.str.split : Split strings around given separator/delimiter. + Series.str.rsplit : Splits string around given separator/delimiter, + starting from the right. + Series.str.join : Join lists contained as elements in the Series/Index + with passed delimiter. + str.split : Standard library version for split. + str.rsplit : Standard library version for rsplit. + + Notes + ----- + The handling of the `n` keyword depends on the number of found splits: + + - If found splits > `n`, make first `n` splits only + - If found splits <= `n`, make all splits + - If for a certain row the number of found splits < `n`, + append `None` for padding up to `n` if ``expand=True`` + + If using ``expand=True``, Series and Index callers return DataFrame and + MultiIndex objects, respectively. + + Examples + -------- + >>> import numpy as np + >>> import mars.dataframe as md + >>> s = md.Series(["this is a regular sentence", + >>> "https://docs.python.org/3/tutorial/index.html", + >>> np.nan]) + >>> s.execute() + 0 this is a regular sentence + 1 https://docs.python.org/3/tutorial/index.html + 2 NaN + dtype: object + + In the default setting, the string is split by whitespace. + + >>> s.str.split().execute() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + Without the `n` parameter, the outputs of `rsplit` and `split` + are identical. + + >>> s.str.rsplit().execute() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `n` parameter can be used to limit the number of splits on the + delimiter. The outputs of `split` and `rsplit` are different. + + >>> s.str.split(n=2).execute() + 0 [this, is, a regular sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + >>> s.str.rsplit(n=2).execute() + 0 [this is a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `pat` parameter can be used to split by other characters. + + >>> s.str.split(pat = "/").execute() + 0 [this is a regular sentence] + 1 [https:, , docs.python.org, 3, tutorial, index... + 2 NaN + dtype: object + + When using ``expand=True``, the split elements will expand out into + separate columns. If NaN is present, it is propagated throughout + the columns during the split. + + >>> s.str.split(expand=True).execute() + 0 1 2 3 + 0 this is a regular + 1 https://docs.python.org/3/tutorial/index.html None None None + 2 NaN NaN NaN NaN \ + 4 + 0 sentence + 1 None + 2 NaN + + For slightly more complex use cases like splitting the html document name + from a url, a combination of parameter settings can be used. 
+ + >>> s.str.rsplit("/", n=1, expand=True).execute() + 0 1 + 0 this is a regular sentence None + 1 https://docs.python.org/3/tutorial index.html + 2 NaN NaN + + Remember to escape special characters when explicitly using regular + expressions. + + >>> s = pd.Series(["1+1=2"]) + >>> s.str.split(r"\+|=", expand=True).execute() + 0 1 2 + 0 1 1 2 + """ + return self._gen_func("split")(self, pat=pat, n=n, expand=expand) + + def rsplit(self, pat=None, n=-1, expand=False): + return self._gen_func("rsplit")(self, pat=pat, n=n, expand=expand) + + def cat(self, others=None, sep=None, na_rep=None, join="left"): + return self._gen_func("cat")( + self, others=others, sep=sep, na_rep=na_rep, join=join + ) + + rsplit.__doc__ = adapt_mars_docstring(pd.Series.str.rsplit.__doc__) + cat.__doc__ = adapt_mars_docstring(pd.Series.str.cat.__doc__) + + +class DatetimeAccessor: + def __init__(self, series): + if ( + not is_datetime64_dtype(series.dtype) + and not is_datetime64tz_dtype(series.dtype) + and not is_timedelta64_dtype(series.dtype) + and not is_period_dtype(series.dtype) + ): + raise AttributeError("Can only use .dt accessor with datetimelike values") + self._series = series + + @classmethod + def _gen_func(cls, method, is_property): + @wraps(getattr(pd.Series.dt, method)) + def _inner(self, *args, **kwargs): + op = SeriesDatetimeMethod( + method=method, + is_property=is_property, + method_args=args, + method_kwargs=kwargs, + ) + return op(self._series) + + _inner.__doc__ = adapt_mars_docstring(getattr(pd.Series.dt, method).__doc__) + return _inner + + @classmethod + def _register(cls, method): + is_property = not callable(getattr(pd.Series.dt, method)) + func = cls._gen_func(method, is_property) + if is_property: + func = property(func) + setattr(cls, method, func) + + def __dir__(self) -> Iterable[str]: + s = set(super().__dir__()) + s.update(_datetime_method_to_handlers.keys()) + return list(s) + + +class CachedAccessor: + def __init__(self, name: str, accessor) -> None: + self._name = name + self._accessor = accessor + + def __get__(self, obj, cls): + if obj is None: + # we're accessing the attribute of the class, i.e., Dataset.geo + return self._accessor + if self._name not in obj._accessors: + obj._accessors[self._name] = self._accessor(obj) + return obj._accessors[self._name] diff --git a/python/xorbits/_mars/dataframe/base/apply.py b/python/xorbits/_mars/dataframe/base/apply.py new file mode 100644 index 000000000..d8db860f7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/apply.py @@ -0,0 +1,942 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...config import options +from ...core import OutputType, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...core.operand import OperatorLogicKeyGeneratorMixin +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FunctionField, + StringField, + TupleField, +) +from ...utils import enter_current_session, get_func_token, quiet_stdio, tokenize +from ..arrays import ArrowArray +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_df, + build_empty_df, + build_empty_series, + build_series, + clean_up_func, + make_dtype, + make_dtypes, + parse_index, + restore_func, + validate_axis, + validate_output_types, +) + + +class ApplyOperandLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin): + def _get_logic_key_token_values(self): + token_values = super()._get_logic_key_token_values() + [ + self._axis, + self._convert_dtype, + self._raw, + self._result_type, + self._elementwise, + ] + if self.func: + return token_values + [get_func_token(self.func)] + else: # pragma: no cover + return token_values + + +class ApplyOperand( + DataFrameOperand, DataFrameOperandMixin, ApplyOperandLogicKeyGeneratorMixin +): + _op_type_ = opcodes.APPLY + + _func = FunctionField("func") + _axis = AnyField("axis") + _convert_dtype = BoolField("convert_dtype") + _raw = BoolField("raw") + _result_type = StringField("result_type") + _elementwise = BoolField("elementwise") + _logic_key = StringField("logic_key") + _func_key = AnyField("func_key") + _need_clean_up_func = BoolField("need_clean_up_func") + _args = TupleField("args") + _kwds = DictField("kwds") + + def __init__( + self, + func=None, + axis=None, + convert_dtype=None, + raw=None, + result_type=None, + args=None, + kwds=None, + output_type=None, + elementwise=None, + logic_key=None, + func_key=None, + need_clean_up_func=False, + **kw, + ): + if output_type: + kw["_output_types"] = [output_type] + super().__init__( + _func=func, + _axis=axis, + _convert_dtype=convert_dtype, + _raw=raw, + _result_type=result_type, + _args=args, + _kwds=kwds, + _elementwise=elementwise, + _logic_key=logic_key, + _func_key=func_key, + _need_clean_up_func=need_clean_up_func, + **kw, + ) + + def _update_key(self): + values = [v for v in self._values_ if v is not self.func] + [ + get_func_token(self.func) + ] + self._obj_set("_key", tokenize(type(self).__name__, *values)) + return self + + @property + def func(self): + return self._func + + @func.setter + def func(self, func): + self._func = func + + @property + def axis(self): + return self._axis + + @property + def convert_dtype(self): + return self._convert_dtype + + @property + def raw(self): + return self._raw + + @property + def result_type(self): + return self._result_type + + @property + def elementwise(self): + return self._elementwise + + @property + def logic_key(self): + return self._logic_key + + @logic_key.setter + def logic_key(self, logic_key): + self._logic_key = logic_key + + @property + def func_key(self): + return self._func_key + + @func_key.setter + def func_key(self, func_key): + self._func_key = func_key + + @property + def need_clean_up_func(self): + return self._need_clean_up_func + + @need_clean_up_func.setter + def need_clean_up_func(self, need_clean_up_func: bool): + self._need_clean_up_func = need_clean_up_func + + @property + def args(self): + return getattr(self, "_args", None) or () + + @property + def kwds(self): + return getattr(self, "_kwds", None) or dict() + + @classmethod + 
@redirect_custom_log + @enter_current_session + def execute(cls, ctx, op): + restore_func(ctx, op) + input_data = ctx[op.inputs[0].key] + out = op.outputs[0] + if len(input_data) == 0: + if op.output_types[0] == OutputType.dataframe: + ctx[out.key] = build_empty_df(out.dtypes) + else: + ctx[out.key] = build_empty_series(out.dtype, name=out.name) + return + + if isinstance(input_data, pd.DataFrame): + result = input_data.apply( + op.func, + axis=op.axis, + raw=op.raw, + result_type=op.result_type, + args=op.args, + **op.kwds, + ) + else: + try: + result = input_data.apply( + op.func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds + ) + except TypeError: + if isinstance(input_data.values, ArrowArray): + input_data = pd.Series( + input_data.to_numpy(), + name=input_data.name, + index=input_data.index, + ) + result = input_data.apply( + op.func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds + ) + else: # pragma: no cover + raise + ctx[out.key] = result + + @classmethod + def _tile_df(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + axis = op.axis + elementwise = op.elementwise + + if not elementwise and in_df.chunk_shape[axis] > 1: + chunk_size = ( + in_df.shape[axis], + max(1, options.chunk_store_limit // in_df.shape[axis]), + ) + if axis == 1: + chunk_size = chunk_size[::-1] + in_df = yield from recursive_tile(in_df.rechunk(chunk_size)) + + chunks = [] + if op.output_types and op.output_types[0] == OutputType.df_or_series: + for c in in_df.chunks: + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + chunks.append(new_op.new_chunk([c], collapse_axis=axis, index=c.index)) + new_nsplits = None + elif out_df.ndim == 2: + for c in in_df.chunks: + if elementwise: + new_shape = c.shape + new_index_value, new_columns_value = c.index_value, c.columns_value + else: + new_shape = [np.nan, np.nan] + new_shape[1 - axis] = c.shape[1 - axis] + if axis == 0: + new_index_value = out_df.index_value + new_columns_value = c.columns_value + else: + new_index_value = c.index_value + new_columns_value = out_df.columns_value + + if op.axis == 0: + new_dtypes = out_df.dtypes[c.dtypes.keys()] + else: + new_dtypes = out_df.dtypes + + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + chunks.append( + new_op.new_chunk( + [c], + shape=tuple(new_shape), + index=c.index, + dtypes=new_dtypes, + index_value=new_index_value, + columns_value=new_columns_value, + ) + ) + + new_nsplits = list(in_df.nsplits) + if not elementwise: + new_nsplits[axis] = (np.nan,) * len(new_nsplits[axis]) + else: + for c in in_df.chunks: + shape_len = c.shape[1 - axis] + new_index_value = c.index_value if axis == 1 else c.columns_value + new_index = (c.index[1 - axis],) + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + chunks.append( + new_op.new_chunk( + [c], + shape=(shape_len,), + index=new_index, + dtype=out_df.dtype, + index_value=new_index_value, + ) + ) + new_nsplits = (in_df.nsplits[1 - axis],) + + new_op = op.copy() + kw = out_df.params.copy() + if isinstance(new_nsplits, list): + new_nsplits = tuple(new_nsplits) + kw.update(dict(chunks=chunks, nsplits=new_nsplits)) + return new_op.new_tileables(op.inputs, **kw) + + @classmethod + def _tile_series(cls, op): + in_series = op.inputs[0] + out_series = op.outputs[0] + output_type = op.output_types[0] if op.output_types else None + + chunks = [] + for c in in_series.chunks: + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + if output_type == OutputType.df_or_series: + 
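+ # dtype/shape were not inferred (``df_or_series`` output), so emit a
+ # chunk without metadata and resolve DataFrame vs. Series at execution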
chunks.append(new_op.new_chunk([c], collapse_axis=None, index=c.index)) + continue + kw = c.params.copy() + if out_series.ndim == 1: + kw["dtype"] = out_series.dtype + else: + kw["index"] = (c.index[0], 0) + kw["shape"] = (c.shape[0], out_series.shape[1]) + kw["dtypes"] = out_series.dtypes + kw["columns_value"] = out_series.columns_value + chunks.append(new_op.new_chunk([c], **kw)) + + new_op = op.copy() + kw = out_series.params.copy() + if output_type == OutputType.df_or_series: + kw.update(dict(chunks=chunks, nsplits=None)) + else: + kw.update(dict(chunks=chunks, nsplits=in_series.nsplits)) + if output_type != OutputType.df_or_series and out_series.ndim == 2: + kw["nsplits"] = (in_series.nsplits[0], (out_series.shape[1],)) + kw["columns_value"] = out_series.columns_value + return new_op.new_tileables(op.inputs, **kw) + + @classmethod + def tile(cls, op): + clean_up_func(op) + if op.inputs[0].ndim == 2: + return (yield from cls._tile_df(op)) + else: + return cls._tile_series(op) + + def _infer_df_func_returns(self, df, dtypes, dtype=None, name=None, index=None): + if isinstance(self._func, np.ufunc): + output_type = OutputType.dataframe + new_dtypes = None + index_value = "inherit" + new_elementwise = True + else: + if self.output_types is not None and ( + dtypes is not None or dtype is not None + ): + ret_dtypes = dtypes if dtypes is not None else (name, dtype) + ret_index_value = parse_index(index) if index is not None else None + self._elementwise = False + return ret_dtypes, ret_index_value + + output_type = new_dtypes = index_value = None + new_elementwise = False + + try: + empty_df = build_df(df, size=2) + with np.errstate(all="ignore"), quiet_stdio(): + infer_df = empty_df.apply( + self._func, + axis=self._axis, + raw=self._raw, + result_type=self._result_type, + args=self.args, + **self.kwds, + ) + if index_value is None: + if infer_df.index is empty_df.index: + index_value = "inherit" + else: + index_value = parse_index(pd.RangeIndex(-1)) + + if isinstance(infer_df, pd.DataFrame): + output_type = output_type or OutputType.dataframe + new_dtypes = new_dtypes or infer_df.dtypes + else: + output_type = output_type or OutputType.series + new_dtypes = (name or infer_df.name, dtype or infer_df.dtype) + new_elementwise = False if new_elementwise is None else new_elementwise + except: # noqa: E722 # nosec + pass + + self.output_types = ( + [output_type] if not self.output_types else self.output_types + ) + dtypes = new_dtypes if dtypes is None else dtypes + index_value = index_value if index is None else parse_index(index) + self._elementwise = ( + new_elementwise if self._elementwise is None else self._elementwise + ) + return dtypes, index_value + + def _call_df_or_series(self, df): + return self.new_df_or_series([df]) + + def _call_dataframe(self, df, dtypes=None, dtype=None, name=None, index=None): + # for backward compatibility + dtype = dtype if dtype is not None else dtypes + dtypes, index_value = self._infer_df_func_returns( + df, dtypes, dtype=dtype, name=name, index=index + ) + if index_value is None: + index_value = parse_index(None, (df.key, df.index_value.key)) + for arg, desc in zip((self.output_types, dtypes), ("output_types", "dtypes")): + if arg is None: + raise TypeError( + f"Cannot determine {desc} by calculating with enumerate data, " + "please specify it as arguments" + ) + + if index_value == "inherit": + index_value = df.index_value + + if self._elementwise: + shape = df.shape + elif self.output_types[0] == OutputType.dataframe: + shape = [np.nan, np.nan] + 
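+ # apply collapses the dimension along ``axis`` to an unknown length (NaN);
+ # the opposite dimension keeps the input DataFrame's extent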
shape[1 - self.axis] = df.shape[1 - self.axis] + shape = tuple(shape) + else: + shape = (df.shape[1 - self.axis],) + + if self.output_types[0] == OutputType.dataframe: + if self.axis == 0: + return self.new_dataframe( + [df], + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=parse_index(dtypes.index, store_data=True), + ) + else: + return self.new_dataframe( + [df], + shape=shape, + dtypes=dtypes, + index_value=df.index_value, + columns_value=parse_index(dtypes.index, store_data=True), + ) + else: + name, dtype = dtypes + return self.new_series( + [df], shape=shape, name=name, dtype=dtype, index_value=index_value + ) + + def _call_series(self, series, dtypes=None, dtype=None, name=None, index=None): + # for backward compatibility + dtype = dtype if dtype is not None else dtypes + if self._convert_dtype: + if self.output_types is not None and ( + dtypes is not None or dtype is not None + ): + infer_series = test_series = None + else: + test_series = build_series(series, size=2, name=series.name) + try: + with np.errstate(all="ignore"), quiet_stdio(): + infer_series = test_series.apply( + self._func, args=self.args, **self.kwds + ) + except: # noqa: E722 # nosec # pylint: disable=bare-except + infer_series = None + + output_type = self._output_types[0] + + if index is not None: + index_value = parse_index(index) + elif infer_series is not None: + if infer_series.index is test_series.index: + index_value = series.index_value + else: # pragma: no cover + index_value = parse_index(infer_series.index) + else: + index_value = parse_index(None, series) + + if output_type == OutputType.dataframe: + if dtypes is None: + if infer_series is not None and infer_series.ndim == 2: + dtypes = infer_series.dtypes + else: + raise TypeError( + "Cannot determine dtypes, " + "please specify `dtypes` as argument" + ) + columns_value = parse_index(dtypes.index, store_data=True) + + return self.new_dataframe( + [series], + shape=(series.shape[0], len(dtypes)), + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + ) + else: + if ( + dtype is None + and infer_series is not None + and infer_series.ndim == 1 + ): + dtype = infer_series.dtype + else: + dtype = dtype if dtype is not None else np.dtype(object) + if infer_series is not None and infer_series.ndim == 1: + name = name or infer_series.name + return self.new_series( + [series], + dtype=dtype, + shape=series.shape, + index_value=index_value, + name=name, + ) + else: + dtype = dtype if dtype is not None else np.dtype("object") + return self.new_series( + [series], + dtype=dtype, + shape=series.shape, + index_value=series.index_value, + name=name, + ) + + def __call__(self, df_or_series, dtypes=None, dtype=None, name=None, index=None): + axis = getattr(self, "axis", None) or 0 + dtypes = make_dtypes(dtypes) + dtype = make_dtype(dtype) + self._axis = validate_axis(axis, df_or_series) + + if self.output_types and self.output_types[0] == OutputType.df_or_series: + return self._call_df_or_series(df_or_series) + + if df_or_series.op.output_types[0] == OutputType.dataframe: + return self._call_dataframe( + df_or_series, dtypes=dtypes, dtype=dtype, name=name, index=index + ) + else: + return self._call_series( + df_or_series, dtypes=dtypes, dtype=dtype, name=name, index=index + ) + + +def df_apply( + df, + func, + axis=0, + raw=False, + result_type=None, + args=(), + dtypes=None, + dtype=None, + name=None, + output_type=None, + index=None, + elementwise=None, + skip_infer=False, + **kwds, +): + """ + Apply a function along 
an axis of the DataFrame. + + Objects passed to the function are Series objects whose index is + either the DataFrame's index (``axis=0``) or the DataFrame's columns + (``axis=1``). By default (``result_type=None``), the final return type + is inferred from the return type of the applied function. Otherwise, + it depends on the `result_type` argument. + + Parameters + ---------- + func : function + Function to apply to each column or row. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis along which the function is applied: + + * 0 or 'index': apply function to each column. + * 1 or 'columns': apply function to each row. + + raw : bool, default False + Determines if row or column is passed as a Series or ndarray object: + + * ``False`` : passes each row or column as a Series to the + function. + * ``True`` : the passed function will receive ndarray objects + instead. + If you are just applying a NumPy reduction function this will + achieve much better performance. + + result_type : {'expand', 'reduce', 'broadcast', None}, default None + These only act when ``axis=1`` (columns): + + * 'expand' : list-like results will be turned into columns. + * 'reduce' : returns a Series if possible rather than expanding + list-like results. This is the opposite of 'expand'. + * 'broadcast' : results will be broadcast to the original shape + of the DataFrame, the original index and columns will be + retained. + + The default behaviour (None) depends on the return value of the + applied function: list-like results will be returned as a Series + of those. However if the apply function returns a Series these + are expanded to columns. + + output_type : {'dataframe', 'series'}, default None + Specify type of returned object. See `Notes` for more details. + + dtypes : Series, default None + Specify dtypes of returned DataFrames. See `Notes` for more details. + + dtype : numpy.dtype, default None + Specify dtype of returned Series. See `Notes` for more details. + + name : str, default None + Specify name of returned Series. See `Notes` for more details. + + index : Index, default None + Specify index of returned object. See `Notes` for more details. + + elementwise : bool, default False + Specify whether ``func`` is an elementwise function: + + * ``False`` : The function is not elementwise. Mars will try + concatenating chunks in rows (when ``axis=0``) or in columns + (when ``axis=1``) and then apply ``func`` onto the concatenated + chunk. The concatenation step can cause extra latency. + * ``True`` : The function is elementwise. Mars will apply + ``func`` to original chunks. This will not introduce extra + concatenation step and reduce overhead. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + args : tuple + Positional arguments to pass to `func` in addition to the + array/series. + + **kwds + Additional keyword arguments to pass as keywords arguments to + `func`. + + Returns + ------- + Series or DataFrame + Result of applying ``func`` along the given axis of the + DataFrame. + + See Also + -------- + DataFrame.applymap: For elementwise operations. + DataFrame.aggregate: Only perform aggregating type operations. + DataFrame.transform: Only perform transforming type operations. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock DataFrame, and the apply call may + fail. When this happens, you need to specify the type of apply call + (DataFrame or Series) in output_type. 
+ + * For DataFrame output, you need to specify a list or a pandas Series + as ``dtypes`` of output DataFrame. ``index`` of output can also be + specified. + * For Series output, you need to specify ``dtype`` and ``name`` of + output Series. + + Examples + -------- + >>> import numpy as np + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame([[4, 9]] * 3, columns=['A', 'B']) + >>> df.execute() + A B + 0 4 9 + 1 4 9 + 2 4 9 + + Using a reducing function on either axis + + >>> df.apply(np.sum, axis=0).execute() + A 12 + B 27 + dtype: int64 + + >>> df.apply(np.sum, axis=1).execute() + 0 13 + 1 13 + 2 13 + dtype: int64 + + Returning a list-like will result in a Series + + >>> df.apply(lambda x: [1, 2], axis=1).execute() + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + dtype: object + + Passing ``result_type='expand'`` will expand list-like results + to columns of a Dataframe + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand').execute() + 0 1 + 0 1 2 + 1 1 2 + 2 1 2 + + Returning a Series inside the function is similar to passing + ``result_type='expand'``. The resulting column names + will be the Series index. + + >>> df.apply(lambda x: md.Series([1, 2], index=['foo', 'bar']), axis=1).execute() + foo bar + 0 1 2 + 1 1 2 + 2 1 2 + + Passing ``result_type='broadcast'`` will ensure the same shape + result, whether list-like or scalar is returned by the function, + and broadcast it along the axis. The resulting column names will + be the originals. + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast').execute() + A B + 0 1 2 + 1 1 2 + 2 1 2 + """ + if isinstance(func, (list, dict)): + return df.aggregate(func, axis) + + output_types = kwds.pop("output_types", None) + object_type = kwds.pop("object_type", None) + output_types = validate_output_types( + output_type=output_type, output_types=output_types, object_type=object_type + ) + output_type = output_types[0] if output_types else None + if skip_infer and output_type is None: + output_type = OutputType.df_or_series + + # calling member function + if isinstance(func, str): + func = getattr(df, func) + sig = inspect.getfullargspec(func) + if "axis" in sig.args: + kwds["axis"] = axis + return func(*args, **kwds) + + op = ApplyOperand( + func=func, + axis=axis, + raw=raw, + result_type=result_type, + args=args, + kwds=kwds, + output_type=output_type, + elementwise=elementwise, + ) + return op(df, dtypes=dtypes, dtype=dtype, name=name, index=index) + + +def series_apply( + series, + func, + convert_dtype=True, + output_type=None, + args=(), + dtypes=None, + dtype=None, + name=None, + index=None, + skip_infer=False, + **kwds, +): + """ + Invoke function on values of Series. + + Can be ufunc (a NumPy function that applies to the entire Series) + or a Python function that only works on single values. + + Parameters + ---------- + func : function + Python function or NumPy ufunc to apply. + + convert_dtype : bool, default True + Try to find better dtype for elementwise function results. If + False, leave as dtype=object. + + output_type : {'dataframe', 'series'}, default None + Specify type of returned object. See `Notes` for more details. + + dtypes : Series, default None + Specify dtypes of returned DataFrames. See `Notes` for more details. + + dtype : numpy.dtype, default None + Specify dtype of returned Series. See `Notes` for more details. + + name : str, default None + Specify name of returned Series. See `Notes` for more details. + + index : Index, default None + Specify index of returned object. 
See `Notes` for more details. + + args : tuple + Positional arguments passed to func after the series value. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + **kwds + Additional keyword arguments passed to func. + + Returns + ------- + Series or DataFrame + If func returns a Series object the result will be a DataFrame. + + See Also + -------- + Series.map: For element-wise operations. + Series.agg: Only perform aggregating type operations. + Series.transform: Only perform transforming type operations. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock Series, and the apply call may fail. + When this happens, you need to specify the type of apply call + (DataFrame or Series) in output_type. + + * For DataFrame output, you need to specify a list or a pandas Series + as ``dtypes`` of output DataFrame. ``index`` of output can also be + specified. + * For Series output, you need to specify ``dtype`` and ``name`` of + output Series. + + Examples + -------- + Create a series with typical summer temperatures for each city. + + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> s = md.Series([20, 21, 12], + ... index=['London', 'New York', 'Helsinki']) + >>> s.execute() + London 20 + New York 21 + Helsinki 12 + dtype: int64 + + Square the values by defining a function and passing it as an + argument to ``apply()``. + + >>> def square(x): + ... return x ** 2 + >>> s.apply(square).execute() + London 400 + New York 441 + Helsinki 144 + dtype: int64 + + Square the values by passing an anonymous function as an + argument to ``apply()``. + + >>> s.apply(lambda x: x ** 2).execute() + London 400 + New York 441 + Helsinki 144 + dtype: int64 + + Define a custom function that needs additional positional + arguments and pass these additional arguments using the + ``args`` keyword. + + >>> def subtract_custom_value(x, custom_value): + ... return x - custom_value + + >>> s.apply(subtract_custom_value, args=(5,)).execute() + London 15 + New York 16 + Helsinki 7 + dtype: int64 + + Define a custom function that takes keyword arguments + and pass these arguments to ``apply``. + + >>> def add_custom_values(x, **kwargs): + ... for month in kwargs: + ... x += kwargs[month] + ... 
return x + + >>> s.apply(add_custom_values, june=30, july=20, august=25).execute() + London 95 + New York 96 + Helsinki 87 + dtype: int64 + """ + if isinstance(func, (list, dict)): + return series.aggregate(func) + + # calling member function + if isinstance(func, str): + func_body = getattr(series, func, None) + if func_body is not None: + return func_body(*args, **kwds) + func_str = func + func = getattr(np, func_str, None) + if func is None: + raise AttributeError( + f"'{func_str!r}' is not a valid function " + f"for '{type(series).__name__}' object" + ) + + if skip_infer and output_type is None: + output_type = OutputType.df_or_series + + output_types = kwds.pop("output_types", None) + object_type = kwds.pop("object_type", None) + output_types = validate_output_types( + output_type=output_type, output_types=output_types, object_type=object_type + ) + output_type = output_types[0] if output_types else OutputType.series + + op = ApplyOperand( + func=func, + convert_dtype=convert_dtype, + args=args, + kwds=kwds, + output_type=output_type, + ) + return op(series, dtypes=dtypes, dtype=dtype, name=name, index=index) diff --git a/python/xorbits/_mars/dataframe/base/astype.py b/python/xorbits/_mars/dataframe/base/astype.py new file mode 100644 index 000000000..cd73be813 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/astype.py @@ -0,0 +1,416 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +from pandas.api.types import CategoricalDtype + +from ... 
import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import AnyField, ListField, StringField +from ...tensor.base import sort +from ...utils import pd_release_version +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, build_empty_series, parse_index + +_need_astype_contiguous = pd_release_version == (1, 3, 0) + + +class DataFrameAstype(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.ASTYPE + + _dtype_values = AnyField("dtype_values") + _errors = StringField("errors") + _category_cols = ListField("category_cols") + + def __init__( + self, + dtype_values=None, + errors=None, + category_cols=None, + output_types=None, + **kw + ): + super().__init__( + _dtype_values=dtype_values, + _errors=errors, + _category_cols=category_cols, + _output_types=output_types, + **kw + ) + + @property + def dtype_values(self): + return self._dtype_values + + @property + def errors(self): + return self._errors + + @property + def category_cols(self): + return self._category_cols + + @classmethod + def _tile_one_chunk(cls, op): + c = op.inputs[0].chunks[0] + chunk_op = op.copy().reset_key() + chunk_params = op.outputs[0].params.copy() + chunk_params["index"] = c.index + out_chunks = [chunk_op.new_chunk([c], **chunk_params)] + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, + nsplits=op.inputs[0].nsplits, + chunks=out_chunks, + **op.outputs[0].params.copy() + ) + + @classmethod + def _tile_series_index(cls, op): + in_series = op.inputs[0] + out = op.outputs[0] + + unique_chunk = None + if op.dtype_values == "category" and isinstance(op.dtype_values, str): + unique_chunk = (yield from recursive_tile(sort(in_series.unique()))).chunks[ + 0 + ] + + chunks = [] + for c in in_series.chunks: + chunk_op = op.copy().reset_key() + params = c.params.copy() + params["dtype"] = out.dtype + if unique_chunk is not None: + chunk_op._category_cols = [in_series.name] + new_chunk = chunk_op.new_chunk([c, unique_chunk], **params) + else: + new_chunk = chunk_op.new_chunk([c], **params) + chunks.append(new_chunk) + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, nsplits=in_series.nsplits, chunks=chunks, **out.params.copy() + ) + + @classmethod + def _tile_dataframe(cls, op): + in_df = op.inputs[0] + out = op.outputs[0] + cum_nsplits = np.cumsum((0,) + in_df.nsplits[1]) + out_chunks = [] + + if op.dtype_values == "category": + # all columns need unique values + for c in in_df.chunks: + chunk_op = op.copy().reset_key() + params = c.params.copy() + dtypes = out.dtypes[ + cum_nsplits[c.index[1]] : cum_nsplits[c.index[1] + 1] + ] + params["dtypes"] = dtypes + chunk_op._category_cols = list(c.columns_value.to_pandas()) + unique_chunks = [] + for col in c.columns_value.to_pandas(): + unique = yield from recursive_tile(sort(in_df[col].unique())) + unique_chunks.append(unique.chunks[0]) + new_chunk = chunk_op.new_chunk([c] + unique_chunks, **params) + out_chunks.append(new_chunk) + elif ( + isinstance(op.dtype_values, dict) and "category" in op.dtype_values.values() + ): + # some columns' types are category + category_cols = [ + c + for c, v in op.dtype_values.items() + if isinstance(v, str) and v == "category" + ] + unique_chunks = dict() + for col in category_cols: + unique = yield from recursive_tile(sort(in_df[col].unique())) + unique_chunks[col] = unique.chunks[0] + for c in in_df.chunks: + chunk_op = op.copy().reset_key() + params = 
c.params.copy() + dtypes = out.dtypes[ + cum_nsplits[c.index[1]] : cum_nsplits[c.index[1] + 1] + ] + params["dtypes"] = dtypes + chunk_category_cols = [] + chunk_unique_chunks = [] + for col in c.columns_value.to_pandas(): + if col in category_cols: + chunk_category_cols.append(col) + chunk_unique_chunks.append(unique_chunks[col]) + chunk_op._category_cols = chunk_category_cols + new_chunk = chunk_op.new_chunk([c] + chunk_unique_chunks, **params) + out_chunks.append(new_chunk) + else: + for c in in_df.chunks: + chunk_op = op.copy().reset_key() + params = c.params.copy() + dtypes = out.dtypes[ + cum_nsplits[c.index[1]] : cum_nsplits[c.index[1] + 1] + ] + params["dtypes"] = dtypes + new_chunk = chunk_op.new_chunk([c], **params) + out_chunks.append(new_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, nsplits=in_df.nsplits, chunks=out_chunks, **out.params.copy() + ) + + @classmethod + def tile(cls, op): + if len(op.inputs[0].chunks) == 1: + return cls._tile_one_chunk(op) + elif isinstance(op.inputs[0], DATAFRAME_TYPE): + return (yield from cls._tile_dataframe(op)) + else: + return (yield from cls._tile_series_index(op)) + + @classmethod + def execute(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + if not isinstance(op.dtype_values, dict): + if op.category_cols is not None: + uniques = [ctx[c.key] for c in op.inputs[1:]] + dtype = dict( + (col, CategoricalDtype(unique_values)) + for col, unique_values in zip(op.category_cols, uniques) + ) + ctx[op.outputs[0].key] = in_data.astype(dtype, errors=op.errors) + + elif isinstance(in_data, pd.Index): + ctx[op.outputs[0].key] = in_data.astype(op.dtype_values) + else: + if _need_astype_contiguous and not in_data.values.flags.contiguous: + # astype changes the data order in pandas==1.3.0, see pandas#42396 + in_data = in_data.copy() + ctx[op.outputs[0].key] = in_data.astype( + op.dtype_values, errors=op.errors + ) + else: + selected_dtype = dict( + (k, v) for k, v in op.dtype_values.items() if k in in_data.columns + ) + if op.category_cols is not None: + uniques = [ctx[c.key] for c in op.inputs[1:]] + for col, unique_values in zip(op.category_cols, uniques): + selected_dtype[col] = CategoricalDtype(unique_values) + ctx[op.outputs[0].key] = in_data.astype(selected_dtype, errors=op.errors) + + def __call__(self, df): + if isinstance(df, DATAFRAME_TYPE): + empty_df = build_empty_df(df.dtypes) + new_df = empty_df.astype(self.dtype_values, errors=self.errors) + dtypes = [] + for dt, new_dt in zip(df.dtypes, new_df.dtypes): + if new_dt != dt and isinstance(new_dt, CategoricalDtype): + dtypes.append(CategoricalDtype()) + else: + dtypes.append(new_dt) + dtypes = pd.Series(dtypes, index=new_df.dtypes.index) + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + else: + empty_series = build_empty_series(df.dtype) + new_series = empty_series.astype(self.dtype_values, errors=self.errors) + if new_series.dtype != df.dtype: + dtype = ( + CategoricalDtype() + if isinstance(new_series.dtype, CategoricalDtype) + else new_series.dtype + ) + else: # pragma: no cover + dtype = df.dtype + + if isinstance(df, SERIES_TYPE): + return self.new_series( + [df], + shape=df.shape, + dtype=dtype, + name=df.name, + index_value=df.index_value, + ) + else: + new_index = df.index_value.to_pandas().astype(self.dtype_values) + new_index_value = parse_index( + new_index, store_data=df.index_value.has_value() + ) + return self.new_index( + [df], + shape=df.shape, + dtype=dtype, 
+ name=df.name, + index_value=new_index_value, + ) + + +def astype(df, dtype, copy=True, errors="raise"): + """ + Cast a pandas object to a specified dtype ``dtype``. + + Parameters + ---------- + dtype : data type, or dict of column name -> data type + Use a numpy.dtype or Python type to cast entire pandas object to + the same type. Alternatively, use {col: dtype, ...}, where col is a + column label and dtype is a numpy.dtype or Python type to cast one + or more of the DataFrame's columns to column-specific types. + copy : bool, default True + Return a copy when ``copy=True`` (be very careful setting + ``copy=False`` as changes to values then may propagate to other + pandas objects). + errors : {'raise', 'ignore'}, default 'raise' + Control raising of exceptions on invalid data for provided dtype. + + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original object. + + Returns + ------- + casted : same type as caller + + See Also + -------- + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to a numeric type. + numpy.ndarray.astype : Cast a numpy array to a specified type. + + Examples + -------- + Create a DataFrame: + + >>> import mars.dataframe as md + >>> df = md.DataFrame(pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})) + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + Cast all columns to int32: + + >>> df.astype('int32').dtypes + col1 int32 + col2 int32 + dtype: object + + Cast col1 to int32 using a dictionary: + + >>> df.astype({'col1': 'int32'}).dtypes + col1 int32 + col2 int64 + dtype: object + + Create a series: + + >>> ser = md.Series(pd.Series([1, 2], dtype='int32')) + >>> ser.execute() + 0 1 + 1 2 + dtype: int32 + >>> ser.astype('int64').execute() + 0 1 + 1 2 + dtype: int64 + + Convert to categorical type: + + >>> ser.astype('category').execute() + 0 1 + 1 2 + dtype: category + Categories (2, int64): [1, 2] + + Convert to ordered categorical type with custom ordering: + + >>> cat_dtype = pd.api.types.CategoricalDtype( + ... categories=[2, 1], ordered=True) + >>> ser.astype(cat_dtype).execute() + 0 1 + 1 2 + dtype: category + Categories (2, int64): [2 < 1] + + Note that using ``copy=False`` and changing data on a new + pandas object may propagate changes: + + >>> s1 = md.Series(pd.Series([1, 2])) + >>> s2 = s1.astype('int64', copy=False) + >>> s1.execute() # note that s1[0] has changed too + 0 1 + 1 2 + dtype: int64 + """ + if isinstance(dtype, dict): + keys = list(dtype.keys()) + if isinstance(df, SERIES_TYPE): + if len(keys) != 1 or keys[0] != df.name: + raise KeyError( + "Only the Series name can be used for the key in Series dtype mappings." + ) + else: + dtype = list(dtype.values())[0] + else: + for k in keys: + columns = df.columns_value.to_pandas() + if k not in columns: + raise KeyError( + "Only a column name can be used for the key in a dtype mappings argument." + ) + op = DataFrameAstype(dtype_values=dtype, errors=errors) + r = op(df) + if not copy: + df.data = r.data + return df + else: + return r + + +def index_astype(ix, dtype, copy=True): + """ + Create an Index with values cast to dtypes. + + The class of a new Index is determined by dtype. When conversion is + impossible, a ValueError exception is raised. + + Parameters + ---------- + dtype : numpy dtype or pandas type + Note that any signed integer `dtype` is treated as ``'int64'``, + and any unsigned integer `dtype` is treated as ``'uint64'``, + regardless of the size. 
+ copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + Returns + ------- + Index + Index with values cast to specified dtype. + """ + return astype(ix, dtype, copy=copy) diff --git a/python/xorbits/_mars/dataframe/base/bloom_filter.py b/python/xorbits/_mars/dataframe/base/bloom_filter.py new file mode 100644 index 000000000..b3fccc8c4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/bloom_filter.py @@ -0,0 +1,283 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, List, Union + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...core.context import Context +from ...lib.bloom_filter import BloomFilter +from ...serialization.serializables import ( + AnyField, + Float64Field, + Int64Field, + StringField, +) +from ...typing import TileableType +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameBloomFilter(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_BLOOM_FILTER + + left_on = AnyField("left_on") + right_on = AnyField("right_on") + on = AnyField("on") + # for build + max_elements = Int64Field("max_elements") + error_rate = Float64Field("error_rate") + combine_size = Int64Field("combine_size") + # chunk + execution_stage = StringField("execution_stage", default=None) + + def __init__(self, execution_stage=None, **kwargs): + if execution_stage in ["build", "union"]: + output_types = [OutputType.object] + else: + output_types = [OutputType.dataframe] + kwargs["_output_types"] = output_types + super().__init__(execution_stage=execution_stage, **kwargs) + + def __call__(self, df1: TileableType, df2: TileableType): + return self.new_tileable([df1, df2], **df1.params) + + @classmethod + def tile(cls, op: "DataFrameBloomFilter"): + df1, df2 = op.inputs + # use df2's chunks to build bloom filter + chunks = [] + for c in df2.chunks: + build_op = DataFrameBloomFilter( + on=op.right_on, + max_elements=op.max_elements, + error_rate=op.error_rate, + execution_stage="build", + ) + chunks.append(build_op.new_chunk(inputs=[c])) + + # union all chunk filters + combine_size = op.combine_size + while len(chunks) > combine_size: + new_chunks = [] + for i in range(0, len(chunks), combine_size): + chks = chunks[i : i + combine_size] + if len(chks) == 1: + chk = chks[0] + else: + union_op = DataFrameBloomFilter(execution_stage="union") + for j, c in enumerate(chks): + c._index = (j, 0) + chk = union_op.new_chunk(chks) + new_chunks.append(chk) + chunks = new_chunks + if len(chunks) > 1: + union_op = DataFrameBloomFilter(execution_stage="union") + filter_chunk = union_op.new_chunk(chunks) + else: + filter_chunk = chunks[0] + + filter_chunk.is_broadcaster = True + 
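The union step above combines the per-chunk filters in groups of `combine_size` rather than all at once. An illustrative plain-Python sketch of that combine pattern (not Mars code; `tree_union` is a made-up helper and sets stand in for Bloom filters):

def tree_union(filters, combine_size, union):
    # repeatedly union groups of at most `combine_size` filters until one remains
    while len(filters) > 1:
        filters = [
            union(filters[i : i + combine_size])
            for i in range(0, len(filters), combine_size)
        ]
    return filters[0]

merged = tree_union([{1}, {2}, {3}, {4}, {5}], 2, lambda group: set().union(*group))
assert merged == {1, 2, 3, 4, 5}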
# filter df1 + out_chunks = [] + for chunk in df1.chunks: + filter_op = DataFrameBloomFilter(on=op.left_on, execution_stage="filter") + params = chunk.params.copy() + params["shape"] = (np.nan, chunk.shape[1]) + params["index_value"] = parse_index(pd.RangeIndex(-1)) + out_chunks.append(filter_op.new_chunk([chunk, filter_chunk], **params)) + + new_op = op.copy() + df1_params = df1.params.copy() + df1_params["chunks"] = out_chunks + df1_params["nsplits"] = ((np.nan,) * len(out_chunks), df1.nsplits[1]) + return new_op.new_dataframes(op.inputs, **df1_params) + + @classmethod + def _get_value(cls, value: Any): + # value could be an element or a series, as BloomFilter + # doesn't accept series, convert to list here + if isinstance(value, pd.Series): + return value.tolist() + else: + return value + + @classmethod + def _filter_on_index(cls, on: Union[str, List, None], data: pd.DataFrame): + if on is None: + return True + elif isinstance(on, str): + return on not in data.columns + else: + assert isinstance(on, list) + return any(c not in data.columns for c in on) + + @classmethod + def _build_index_filter(cls, in_data: pd.DataFrame, op: "DataFrameBloomFilter"): + if isinstance(in_data.index, pd.MultiIndex): + index = in_data.index.get_level_values(op.on) + else: + index = in_data.index + bloom_filter = BloomFilter( + max_elements=op.max_elements, error_rate=op.error_rate + ) + index.map(lambda v: bloom_filter.add(cls._get_value(v))) + return bloom_filter + + @classmethod + def _build_series_filter(cls, in_data: pd.Series, op: "DataFrameBloomFilter"): + try: + bloom_filter = BloomFilter( + max_elements=op.max_elements, error_rate=op.error_rate + ) + in_data[op.on].map(lambda v: bloom_filter.add(cls._get_value(v))) + except TypeError: + # has unhashable data, convert to str + in_data = in_data.astype(str) + bloom_filter = BloomFilter( + max_elements=op.max_elements, error_rate=op.error_rate + ) + in_data[op.on].map(lambda v: bloom_filter.add(cls._get_value(v))) + return bloom_filter + + @classmethod + def _build_dataframe_filter(cls, in_data: pd.DataFrame, op: "DataFrameBloomFilter"): + try: + bloom_filter = BloomFilter( + max_elements=op.max_elements, error_rate=op.error_rate + ) + in_data[op.on].apply(lambda v: bloom_filter.add(cls._get_value(v)), axis=1) + except TypeError: + # has unhashable data, convert to str + in_data = in_data.astype(cls._convert_to_hashable_dtypes(in_data.dtypes)) + bloom_filter = BloomFilter( + max_elements=op.max_elements, error_rate=op.error_rate + ) + in_data[op.on].apply(lambda v: bloom_filter.add(cls._get_value(v)), axis=1) + return bloom_filter + + @classmethod + def _convert_to_hashable_dtypes(cls, dtypes: pd.Series): + dtypes = dict( + (name, dtype) if np.issubdtype(dtype, int) else (name, str) + for name, dtype in dtypes.iteritems() + ) + return dtypes + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "DataFrameBloomFilter"): + if op.execution_stage == "build": + on = op.on + in_data = ctx[op.inputs[0].key] + if cls._filter_on_index(on, in_data): + bloom_filter = cls._build_index_filter(in_data, op) + elif isinstance(on, str): + bloom_filter = cls._build_series_filter(in_data, op) + else: + bloom_filter = cls._build_dataframe_filter(in_data, op) + ctx[op.outputs[0].key] = bloom_filter + elif op.execution_stage == "union": + # union bloom filters + filters = [ctx[inp.key] for inp in op.inputs] + out = filters[0] + for f in filters[1:]: + out.union(f) + ctx[op.outputs[0].key] = out + elif op.execution_stage == "filter": + on = op.on + in_data = 
ctx[op.inputs[0].key] + bloom_filter = ctx[op.inputs[1].key] + if cls._filter_on_index(on, in_data): + if isinstance(in_data.index, pd.MultiIndex): + idx = in_data.index.names.index(on) + ctx[op.outputs[0].key] = in_data[ + in_data.index.map(lambda x: x[idx] in bloom_filter) + ] + else: + ctx[op.outputs[0].key] = in_data[ + in_data.index.map(lambda x: x in bloom_filter) + ] + else: + row_func = lambda row: cls._get_value(row) in bloom_filter + if isinstance(on, str): + # series + try: + filtered = in_data[in_data[on].map(row_func)] + except TypeError: + converted_data = in_data.astype(str) + filtered = in_data[converted_data[on].map(row_func)] + ctx[op.outputs[0].key] = filtered + else: + # dataframe + try: + filtered = in_data[in_data[on].apply(row_func, axis=1)] + except TypeError: + converted_data = in_data.astype( + cls._convert_to_hashable_dtypes(in_data.dtypes) + ) + filtered = in_data[converted_data[on].apply(row_func, axis=1)] + ctx[op.outputs[0].key] = filtered + + else: # pragma: no cover + raise ValueError(f"Unknown execution stage: {op.execution_stage}") + + +def filter_by_bloom_filter( + df1: TileableType, + df2: TileableType, + left_on: Union[str, List], + right_on: Union[str, List], + max_elements: int = 10000, + error_rate: float = 0.1, + combine_size: int = None, +): + """ + Use bloom filter to filter DataFrame. + + Parameters + ---------- + df1: DataFrame. + DataFrame to be filtered. + df2: DataFrame. + Dataframe to build filter. + left_on: str or list. + Column(s) selected on df1. + right_on: str or list. + Column(s) selected on df2. + max_elements: int + How many elements you expect the filter to hold. + error_rate: float + error_rate defines accuracy. + combine_size: int + Combine size. + + Returns + ------- + DataFrame + Filtered df1. + """ + if combine_size is None: + combine_size = options.combine_size + op = DataFrameBloomFilter( + left_on=left_on, + right_on=right_on, + max_elements=max_elements, + error_rate=error_rate, + combine_size=combine_size, + ) + return op(df1, df2) diff --git a/python/xorbits/_mars/dataframe/base/cartesian_chunk.py b/python/xorbits/_mars/dataframe/base/cartesian_chunk.py new file mode 100644 index 000000000..fb7171d63 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/cartesian_chunk.py @@ -0,0 +1,277 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
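`filter_by_bloom_filter` above is an approximate pre-filter: rows of `df1` whose key may occur in `df2` are kept, and false positives are possible, so it narrows a later join rather than replacing it. A plain-pandas sketch of the semantics (not Mars code; a Python set stands in for the Bloom filter, which would additionally admit occasional false positives):

import pandas as pd

df1 = pd.DataFrame({"key": [1, 2, 3, 4], "v": list("abcd")})
df2 = pd.DataFrame({"key": [2, 4, 6]})

maybe_in_df2 = set(df2["key"])                        # stand-in for the filter built from df2
filtered = df1[df1["key"].map(lambda k: k in maybe_in_df2)]
print(filtered)                                       # keeps the rows with key 2 and 4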
import opcodes +from ...core import recursive_tile +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import ( + DictField, + FunctionField, + KeyField, + TupleField, +) +from ...utils import enter_current_session, has_unknown_shape, quiet_stdio +from ..operands import DataFrameOperand, DataFrameOperandMixin, OutputType +from ..utils import ( + build_df, + build_empty_df, + build_series, + parse_index, + validate_output_types, +) + + +class DataFrameCartesianChunk(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.CARTESIAN_CHUNK + + _left = KeyField("left") + _right = KeyField("right") + _func = FunctionField("func") + _args = TupleField("args") + _kwargs = DictField("kwargs") + + def __init__( + self, + left=None, + right=None, + func=None, + args=None, + kwargs=None, + output_types=None, + **kw + ): + super().__init__( + _left=left, + _right=right, + _func=func, + _args=args, + _kwargs=kwargs, + _output_types=output_types, + **kw + ) + if self.memory_scale is None: + self.memory_scale = 2.0 + + @property + def left(self): + return self._left + + @property + def right(self): + return self._right + + @property + def func(self): + return self._func + + @property + def args(self): + return self._args + + @property + def kwargs(self): + return self._kwargs + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._left = self._inputs[0] + self._right = self._inputs[1] + + @staticmethod + def _build_test_obj(obj): + return ( + build_df(obj, size=2) + if obj.ndim == 2 + else build_series(obj, size=2, name=obj.name) + ) + + def __call__(self, left, right, index=None, dtypes=None): + test_left = self._build_test_obj(left) + test_right = self._build_test_obj(right) + output_type = self._output_types[0] if self._output_types else None + + if output_type == OutputType.df_or_series: + return self.new_df_or_series([left, right]) + + # try run to infer meta + try: + with np.errstate(all="ignore"), quiet_stdio(): + obj = self._func(test_left, test_right, *self._args, **self._kwargs) + except: # noqa: E722 # nosec # pylint: disable=bare-except + if output_type == OutputType.series: + obj = pd.Series([], dtype=np.dtype(object)) + elif output_type == OutputType.dataframe and dtypes is not None: + obj = build_empty_df(dtypes) + else: + raise TypeError( + "Cannot determine `output_type`, " + "you have to specify it as `dataframe` or `series`, " + "for dataframe, `dtypes` is required as well " + "if output_type='dataframe'" + ) + + if getattr(obj, "ndim", 0) == 1 or output_type == OutputType.series: + shape = self._kwargs.pop("shape", (np.nan,)) + if index is None: + index = obj.index + index_value = parse_index( + index, left, right, self._func, self._args, self._kwargs + ) + return self.new_series( + [left, right], + dtype=obj.dtype, + shape=shape, + index_value=index_value, + name=obj.name, + ) + else: + dtypes = dtypes if dtypes is not None else obj.dtypes + # dataframe + shape = (np.nan, len(dtypes)) + columns_value = parse_index(dtypes.index, store_data=True) + if index is None: + index = obj.index + index_value = parse_index( + index, left, right, self._func, self._args, self._kwargs + ) + return self.new_dataframe( + [left, right], + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op: "DataFrameCartesianChunk"): + left = op.left + right = op.right + out = op.outputs[0] + out_type = op.output_types[0] + + if left.ndim == 2 and left.chunk_shape[1] > 1: + if 
has_unknown_shape(left): + yield + # if left is a DataFrame, make sure 1 chunk on axis columns + left = yield from recursive_tile(left.rechunk({1: left.shape[1]})) + if right.ndim == 2 and right.chunk_shape[1] > 1: + if has_unknown_shape(right): + yield + # if right is a DataFrame, make sure 1 chunk on axis columns + right = yield from recursive_tile(right.rechunk({1: right.shape[1]})) + + out_chunks = [] + if out_type == OutputType.dataframe: + nsplits = [[], [out.shape[1]]] + elif out_type == OutputType.series: + nsplits = [[]] + else: + # DataFrameOrSeries + nsplits = None + i = 0 + for left_chunk in left.chunks: + for right_chunk in right.chunks: + chunk_op = op.copy().reset_key() + chunk_op.tileable_op_key = op.key + if out_type == OutputType.df_or_series: + out_chunks.append( + chunk_op.new_chunk( + [left_chunk, right_chunk], index=(i, 0), collapse_axis=1 + ) + ) + elif out_type == OutputType.dataframe: + shape = (np.nan, out.shape[1]) + index_value = parse_index( + out.index_value.to_pandas(), + left_chunk, + right_chunk, + op.func, + op.args, + op.kwargs, + ) + out_chunk = chunk_op.new_chunk( + [left_chunk, right_chunk], + shape=shape, + index_value=index_value, + columns_value=out.columns_value, + dtypes=out.dtypes, + index=(i, 0), + ) + out_chunks.append(out_chunk) + nsplits[0].append(out_chunk.shape[0]) + else: + shape = (np.nan,) + index_value = parse_index( + out.index_value.to_pandas(), + left_chunk, + right_chunk, + op.func, + op.args, + op.kwargs, + ) + out_chunk = chunk_op.new_chunk( + [left_chunk, right_chunk], + shape=shape, + index_value=index_value, + dtype=out.dtype, + name=out.name, + index=(i,), + ) + out_chunks.append(out_chunk) + nsplits[0].append(out_chunk.shape[0]) + i += 1 + + params = out.params + params["nsplits"] = tuple(tuple(ns) for ns in nsplits) if nsplits else nsplits + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op: "DataFrameCartesianChunk"): + left, right = ctx[op.left.key], ctx[op.right.key] + ctx[op.outputs[0].key] = op.func(left, right, *op.args, **(op.kwargs or dict())) + + +def cartesian_chunk(left, right, func, skip_infer=False, args=(), **kwargs): + output_type = kwargs.pop("output_type", None) + output_types = kwargs.pop("output_types", None) + object_type = kwargs.pop("object_type", None) + output_types = validate_output_types( + output_type=output_type, output_types=output_types, object_type=object_type + ) + output_type = output_types[0] if output_types else None + if output_type: + output_types = [output_type] + elif skip_infer: + output_types = [OutputType.df_or_series] + index = kwargs.pop("index", None) + dtypes = kwargs.pop("dtypes", None) + memory_scale = kwargs.pop("memory_scale", None) + + op = DataFrameCartesianChunk( + left=left, + right=right, + func=func, + args=args, + kwargs=kwargs, + output_types=output_types, + memory_scale=memory_scale, + ) + return op(left, right, index=index, dtypes=dtypes) diff --git a/python/xorbits/_mars/dataframe/base/check_monotonic.py b/python/xorbits/_mars/dataframe/base/check_monotonic.py new file mode 100644 index 000000000..2d76daf96 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/check_monotonic.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
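For reference, the `cartesian_chunk` operator defined just above applies the user function to every (left chunk, right chunk) pair and stacks the per-pair results. A plain-pandas sketch of that behaviour (not Mars code, ignoring index and meta handling; the cross merge assumes pandas >= 1.2):

import pandas as pd

left_chunks = [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [3]})]
right_chunks = [pd.DataFrame({"b": [10]}), pd.DataFrame({"b": [20, 30]})]

def func(lhs, rhs):
    # illustrative pair-wise computation: cross product of the two pieces
    return lhs.merge(rhs, how="cross")

result = pd.concat(
    [func(l, r) for l in left_chunks for r in right_chunks], ignore_index=True
)
print(result)    # 3 left rows x 3 right rows -> 9 rows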
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import OutputType +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField +from ...tensor.core import TensorOrder +from ...tensor.merge import TensorConcatenate +from ..operands import DataFrameOperand, DataFrameOperandMixin + + +class DataFrameCheckMonotonic(DataFrameOperand, DataFrameOperandMixin): + _op_code_ = opcodes.CHECK_MONOTONIC + + # 0 - increasing, 1 - decreasing + _decreasing = BoolField("decreasing") + _strict = BoolField("strict") + + def __init__(self, decreasing=None, strict=None, output_types=None, **kw): + super().__init__( + _decreasing=decreasing, _strict=strict, _output_types=output_types, **kw + ) + + @property + def decreasing(self): + return self._decreasing + + @property + def strict(self): + return self._strict + + def __call__(self, df_obj): + self._output_types = [OutputType.scalar] + return self.new_tileable([df_obj], shape=(), dtype=np.dtype(bool)) + + @classmethod + def tile(cls, op: "DataFrameCheckMonotonic"): + map_chunks = [] + for c in op.inputs[0].chunks: + new_op = DataFrameCheckMonotonic( + decreasing=op.decreasing, + strict=op.strict, + stage=OperandStage.map, + output_types=[OutputType.series], + order=TensorOrder.C_ORDER, + ) + map_chunks.append(new_op.new_chunk([c], shape=(2,), dtype=np.dtype(bool))) + + concat_op = TensorConcatenate(axis=0, dtype=np.dtype(bool)) + concat_r_chunk = concat_op.new_chunk( + map_chunks, + shape=(len(map_chunks),), + index=(0, 0), + order=TensorOrder.C_ORDER, + ) + + new_op = DataFrameCheckMonotonic( + decreasing=op.decreasing, + strict=op.strict, + stage=OperandStage.reduce, + output_types=[OutputType.scalar], + order=TensorOrder.C_ORDER, + ) + r_chunk = new_op.new_chunk( + [concat_r_chunk], shape=(), order=TensorOrder.C_ORDER, dtype=np.dtype(bool) + ) + + new_op = op.copy().reset_key() + params = op.outputs[0].params + params["chunks"] = [r_chunk] + params["nsplits"] = () + return new_op.new_tileables(op.inputs, **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameCheckMonotonic"): + in_data = ctx[op.inputs[0].key] + if op.stage == OperandStage.map: + is_mono = ( + in_data.is_monotonic_increasing + if not op.decreasing + else in_data.is_monotonic_decreasing + ) + if op.strict and is_mono: + is_mono = in_data.is_unique + + if isinstance(in_data, pd.Index): + edge_array = np.array([in_data[0], in_data[-1]]) + else: + edge_array = np.array([in_data.iloc[0], in_data.iloc[-1]]) + + ctx[op.outputs[0].key] = ( + np.array([is_mono]), + edge_array, + ) + else: + in_series = pd.Series(in_data[1]) + is_edge_mono = ( + in_series.is_monotonic_increasing + if not op.decreasing + else in_series.is_monotonic_decreasing + ) + if op.strict and is_edge_mono: + is_edge_mono = in_series.is_unique + ctx[op.outputs[0].key] = in_data[0].all() and is_edge_mono + + +def check_monotonic(series_or_index, decreasing=False, strict=False): + """ + Check if values in the object are monotonic increasing + or decreasing. 
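The check above is two-staged: each chunk reports whether it is locally monotonic together with its first and last values, and the reduce step only has to verify that those chunk edges are themselves monotonic (and unique when `strict`). A small pandas sketch of that idea (plain pandas, not Mars code; `chunks` stands in for the per-chunk data):

import pandas as pd

chunks = [pd.Series([1, 2, 3]), pd.Series([3, 5, 8]), pd.Series([9, 9, 10])]

local_ok = [c.is_monotonic_increasing for c in chunks]
edges = pd.Series([v for c in chunks for v in (c.iloc[0], c.iloc[-1])])

assert all(local_ok) and edges.is_monotonic_increasing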
+ + Parameters + ---------- + decreasing : bool + If True, check if values are monotonic decreasing, + otherwise check if values are monotonic increasing + strict : bool + If True, values need to be unique to get a positive + result + + Returns + ------- + Scalar + """ + op = DataFrameCheckMonotonic(decreasing=decreasing, strict=strict) + return op(series_or_index) + + +def is_monotonic(series_or_index): + """ + Return boolean scalar if values in the object are + monotonic_increasing. + + Returns + ------- + Scalar + """ + return check_monotonic(series_or_index, decreasing=False, strict=False) + + +is_monotonic_increasing = is_monotonic + + +def is_monotonic_decreasing(series_or_index): + """ + Return boolean scalar if values in the object are + monotonic_decreasing. + + Returns + ------- + Scalar + """ + return check_monotonic(series_or_index, decreasing=True, strict=False) diff --git a/python/xorbits/_mars/dataframe/base/core.py b/python/xorbits/_mars/dataframe/base/core.py new file mode 100644 index 000000000..6a5f38c41 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/core.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...serialization.serializables import KeyField +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin + + +class DataFrameDeviceConversionBase(DataFrameOperand, DataFrameOperandMixin): + _input = KeyField("input") + + @property + def input(self): + return self._input + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = inputs[0] + + def __call__(self, obj): + if isinstance(obj, DATAFRAME_TYPE): + return self.new_dataframe( + [obj], + shape=obj.shape, + dtypes=obj.dtypes, + index_value=obj.index_value, + columns_value=obj.columns_value, + ) + else: + assert isinstance(obj, SERIES_TYPE) + return self.new_series( + [obj], + shape=obj.shape, + dtype=obj.dtype, + index_value=obj.index_value, + name=obj.name, + ) + + @classmethod + def tile(cls, op): + # Isolate ops on cpu from subsequent ops on gpu + yield + out_chunks = [] + for c in op.input.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk([c], **c.params) + out_chunks.append(out_chunk) + + new_op = op.copy().reset_key() + out = op.outputs[0] + return new_op.new_tileables( + op.inputs, chunks=out_chunks, nsplits=op.inputs[0].nsplits, **out.params + ) diff --git a/python/xorbits/_mars/dataframe/base/cut.py b/python/xorbits/_mars/dataframe/base/cut.py new file mode 100644 index 000000000..cb8405d35 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/cut.py @@ -0,0 +1,607 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +from numbers import Integral + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, ExecutableTuple, OutputType, recursive_tile +from ...core.context import get_context +from ...serialization.serializables import ( + AnyField, + BoolField, + Int32Field, + KeyField, + StringField, +) +from ...tensor import tensor as astensor +from ...tensor.core import TENSOR_TYPE, TensorOrder +from ...utils import has_unknown_shape +from ..core import INDEX_TYPE, SERIES_TYPE +from ..datasource.index import from_pandas as asindex +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameCut(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.CUT + + _input = KeyField("input") + _bins = AnyField("bins") + _right = BoolField("right") + _labels = AnyField("labels") + _retbins = BoolField("retbins") + _precision = Int32Field("precision") + _include_lowest = BoolField("include_lowest") + _duplicates = StringField("duplicates") + _ordered = BoolField("ordered") + + def __init__( + self, + bins=None, + right=None, + labels=None, + retbins=None, + precision=None, + include_lowest=None, + duplicates=None, + ordered=None, + **kw + ): + super().__init__( + _bins=bins, + _right=right, + _labels=labels, + _retbins=retbins, + _precision=precision, + _include_lowest=include_lowest, + _duplicates=duplicates, + _ordered=ordered, + **kw + ) + + @property + def input(self): + return self._input + + @property + def bins(self): + return self._bins + + @property + def right(self): + return self._right + + @property + def labels(self): + return self._labels + + @property + def retbins(self): + return self._retbins + + @property + def precision(self): + return self._precision + + @property + def include_lowest(self): + return self._include_lowest + + @property + def duplicates(self): + return self._duplicates + + @property + def ordered(self): + return self._ordered + + @property + def output_limit(self): + return 1 if not self._retbins else 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + if isinstance(self._bins, ENTITY_TYPE): + self._bins = next(inputs_iter) + if isinstance(self._labels, ENTITY_TYPE): + self._labels = next(inputs_iter) + + def __call__(self, x): + if isinstance(x, pd.Series): + x = asseries(x) + elif not isinstance(x, ENTITY_TYPE): + x = astensor(x) + if x.ndim != 1: + raise ValueError("Input array must be 1 dimensional") + if x.size == 0: + raise ValueError("Cannot cut empty array") + + inputs = [x] + if self._labels is not None and not isinstance( + self._labels, (bool, ENTITY_TYPE) + ): + self._labels = np.asarray(self._labels) + + # infer dtype + x_empty = ( + pd.Series([1], dtype=x.dtype) + if isinstance(x, SERIES_TYPE) + else np.asarray([1], dtype=x.dtype) + ) + if isinstance(self._bins, INDEX_TYPE): + bins = self._bins.index_value.to_pandas() + inputs.append(self._bins) + bins_unknown = True + 
elif isinstance(self._bins, ENTITY_TYPE): + bins = np.asarray([2], dtype=self._bins.dtype) + inputs.append(self._bins) + bins_unknown = True + else: + bins = self._bins + bins_unknown = isinstance(self._bins, Integral) + if isinstance(self._labels, ENTITY_TYPE): + bins_unknown = True + labels = None + inputs.append(self._labels) + else: + if self._labels is False or not bins_unknown: + labels = self._labels + else: + labels = None + ret = pd.cut( + x_empty, + bins, + right=self._right, + labels=labels, + retbins=True, + include_lowest=self._include_lowest, + duplicates=self._duplicates, + ) + + kws = [] + output_types = [] + if bins_unknown and isinstance(ret[0].dtype, pd.CategoricalDtype): + # inaccurate dtype, just create an empty one + out_dtype = pd.CategoricalDtype() + else: + out_dtype = ret[0].dtype + if isinstance(ret[0], pd.Series): + output_types.append(OutputType.series) + kws.append( + { + "dtype": out_dtype, + "shape": x.shape, + "index_value": x.index_value, + "name": x.name, + } + ) + elif isinstance(ret[0], np.ndarray): + output_types.append(OutputType.tensor) + kws.append( + {"dtype": out_dtype, "shape": x.shape, "order": TensorOrder.C_ORDER} + ) + else: + assert isinstance(ret[0], pd.Categorical) + output_types.append(OutputType.categorical) + kws.append( + { + "dtype": out_dtype, + "shape": x.shape, + "categories_value": parse_index( + out_dtype.categories, store_data=True + ), + } + ) + + if self._retbins: + if isinstance(self._bins, (pd.IntervalIndex, INDEX_TYPE)): + output_types.append(OutputType.index) + kws.append( + { + "dtype": self._bins.dtype, + "shape": self._bins.shape, + "index_value": self._bins.index_value + if isinstance(self._bins, INDEX_TYPE) + else parse_index(self._bins, store_data=False), + "name": self._bins.name, + } + ) + else: + output_types.append(OutputType.tensor) + kws.append( + { + "dtype": ret[1].dtype, + "shape": ret[1].shape if ret[1].size > 0 else (np.nan,), + "order": TensorOrder.C_ORDER, + } + ) + + self.output_types = output_types + return ExecutableTuple(self.new_tileables(inputs, kws=kws)) + + @classmethod + def tile(cls, op): + if isinstance(op.bins, ENTITY_TYPE): + # check op.bins chunk shapes + if has_unknown_shape(op.bins): + yield + bins = yield from recursive_tile(op.bins.rechunk(op.bins.shape)) + else: + bins = op.bins + + if isinstance(op.labels, ENTITY_TYPE): + # check op.labels chunk shapes + if has_unknown_shape(op.labels): + yield + labels = yield from recursive_tile(op.labels.rechunk(op.labels.shape)) + else: + labels = op.labels + + if isinstance(op.bins, Integral): + input_min, input_max = yield from recursive_tile( + op.input.min(), op.input.max() + ) + input_min_chunk = input_min.chunks[0] + input_max_chunk = input_max.chunks[0] + + # let input min and max execute first + min_max_chunks = [input_min_chunk, input_max_chunk] + yield min_max_chunks + [c for inp in op.inputs for c in inp.chunks] + + ctx = get_context() + keys = [input_min_chunk.key, input_max_chunk.key] + # get min and max of x + min_val, max_val = ctx.get_chunks_result(keys) + # calculate bins + if np.isinf(min_val) or np.isinf(max_val): + raise ValueError( + "cannot specify integer `bins` when input data contains infinity" + ) + elif min_val == max_val: # adjust end points before binning + min_val -= 0.001 * abs(min_val) if min_val != 0 else 0.001 + max_val += 0.001 * abs(max_val) if max_val != 0 else 0.001 + bins = np.linspace(min_val, max_val, bins + 1, endpoint=True) + else: # adjust end points before binning + bins = np.linspace(min_val, max_val, 
bins + 1, endpoint=True) + adj = (max_val - min_val) * 0.001 # 0.1% of the range + if op.right: + bins[0] -= adj + else: + bins[-1] += adj + + outs = op.outputs + + out_chunks = [] + for c in op.input.chunks: + chunk_op = op.copy().reset_key() + chunk_inputs = [c] + chunk_op._bins = bins + # do not return bins always for chunk + chunk_op._retbins = False + if isinstance(bins, ENTITY_TYPE): + chunk_inputs.append(bins.chunks[0]) + chunk_op._labels = labels + if isinstance(labels, ENTITY_TYPE): + chunk_inputs.append(labels.chunks[0]) + + chunk_kws = [] + if isinstance(outs[0], SERIES_TYPE): + chunk_kws.append( + { + "dtype": outs[0].dtype, + "shape": c.shape, + "index_value": c.index_value, + "name": c.name, + "index": c.index, + } + ) + elif isinstance(outs[0], TENSOR_TYPE): + chunk_kws.append( + { + "dtype": outs[0].dtype, + "shape": c.shape, + "order": TensorOrder.C_ORDER, + "index": c.index, + } + ) + else: + chunk_kws.append( + { + "dtype": outs[0].dtype, + "shape": c.shape, + "categories_value": outs[0].categories_value, + "index": c.index, + } + ) + + out_chunks.append(chunk_op.new_chunk(chunk_inputs, kws=chunk_kws)) + + kws = [] + out_kw = outs[0].params + out_kw["chunks"] = out_chunks + out_kw["nsplits"] = op.input.nsplits + kws.append(out_kw) + if len(outs) == 2: + bins_kw = outs[1].params + bins_kw["chunks"] = bins_chunks = [] + if isinstance(bins, ENTITY_TYPE): + bins_chunks.append(bins.chunks[0]) + else: + if op.duplicates == "drop": + if isinstance(bins, (np.ndarray, list, tuple)): + bins = np.unique(bins) + else: + bins = bins.unique() + bins = bins.astype(outs[1].dtype, copy=False) + convert = ( + astensor if not isinstance(bins, pd.IntervalIndex) else asindex + ) + converted = yield from recursive_tile( + convert(bins, chunk_size=len(bins)) + ) + bins_chunks.append(converted.chunks[0]) + bins_kw["nsplits"] = ((len(bins),),) + kws.append(bins_kw) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=kws) + + @classmethod + def execute(cls, ctx, op): + x = ctx[op.input.key] + bins = ctx[op.bins.key] if isinstance(op.bins, ENTITY_TYPE) else op.bins + labels = ctx[op.labels.key] if isinstance(op.labels, ENTITY_TYPE) else op.labels + + if pd.__version__ >= "1.1.0": + cut = partial( + pd.cut, + right=op.right, + retbins=op.retbins, + precision=op.precision, + include_lowest=op.include_lowest, + duplicates=op.duplicates, + ordered=op.ordered, + ) + else: + cut = partial( + pd.cut, + right=op.right, + retbins=op.retbins, + precision=op.precision, + include_lowest=op.include_lowest, + duplicates=op.duplicates, + ) + try: + ret = cut(x, bins, labels=labels) + except ValueError: + # fail due to buffer source array is read-only + ret = cut(x.copy(), bins, labels=labels) + if op.retbins: # pragma: no cover + ctx[op.outputs[0].key] = ret[0] + ctx[op.outputs[1].key] = ret[1] + else: + ctx[op.outputs[0].key] = ret + + +def cut( + x, + bins, + right: bool = True, + labels=None, + retbins: bool = False, + precision: int = 3, + include_lowest: bool = False, + duplicates: str = "raise", + ordered: bool = True, +): + """ + Bin values into discrete intervals. + + Use `cut` when you need to segment and sort data values into bins. This + function is also useful for going from a continuous variable to a + categorical variable. For example, `cut` could convert ages to groups of + age ranges. Supports binning into an equal number of bins, or a + pre-specified array of bins. + + Parameters + ---------- + x : array-like + The input array to be binned. Must be 1-dimensional. 
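When `bins` is an integer, the tile logic above converts it into explicit edges the same way pandas does: equal-width edges over [min, max], with the outermost edge nudged by 0.1% of the range so the extreme value still lands inside a bin. A numpy sketch of that computation (not Mars code):

import numpy as np

values = np.array([1, 7, 5, 4, 6, 3])
nbins, right = 3, True

mn, mx = values.min(), values.max()
edges = np.linspace(mn, mx, nbins + 1, endpoint=True)
adj = (mx - mn) * 0.001                 # 0.1% of the range
if right:
    edges[0] -= adj                     # min still falls into the first, left-open bin
else:
    edges[-1] += adj
print(edges)                            # approximately [0.994, 3., 5., 7.]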
+ bins : int, sequence of scalars, or IntervalIndex + The criteria to bin by. + + * int : Defines the number of equal-width bins in the range of `x`. The + range of `x` is extended by .1% on each side to include the minimum + and maximum values of `x`. + * sequence of scalars : Defines the bin edges allowing for non-uniform + width. No extension of the range of `x` is done. + * IntervalIndex : Defines the exact bins to be used. Note that + IntervalIndex for `bins` must be non-overlapping. + + right : bool, default True + Indicates whether `bins` includes the rightmost edge or not. If + ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` + indicate (1,2], (2,3], (3,4]. This argument is ignored when + `bins` is an IntervalIndex. + labels : array or False, default None + Specifies the labels for the returned bins. Must be the same length as + the resulting bins. If False, returns only integer indicators of the + bins. This affects the type of the output container (see below). + This argument is ignored when `bins` is an IntervalIndex. If True, + raises an error. + retbins : bool, default False + Whether to return the bins or not. Useful when bins is provided + as a scalar. + precision : int, default 3 + The precision at which to store and display the bins labels. + include_lowest : bool, default False + Whether the first interval should be left-inclusive or not. + duplicates : {default 'raise', 'drop'}, optional + If bin edges are not unique, raise ValueError or drop non-uniques. + ordered : bool, default True + Whether the labels are ordered or not. Applies to returned types + Categorical and Series (with Categorical dtype). If True, the resulting + categorical will be ordered. If False, the resulting categorical will be + unordered (labels must be provided). + + Returns + ------- + out : Categorical, Series, or Tensor + An array-like object representing the respective bin for each value + of `x`. The type depends on the value of `labels`. + + * True (default) : returns a Series for Series `x` or a + Categorical for all other inputs. The values stored within + are Interval dtype. + + * sequence of scalars : returns a Series for Series `x` or a + Categorical for all other inputs. The values stored within + are whatever the type in the sequence is. + + * False : returns a tensor of integers. + + bins : Tensor or IntervalIndex. + The computed or specified bins. Only returned when `retbins=True`. + For scalar or sequence `bins`, this is a tensor with the computed + bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For + an IntervalIndex `bins`, this is equal to `bins`. + + See Also + -------- + qcut : Discretize variable into equal-sized buckets based on rank + or based on sample quantiles. + Categorical : Array type for storing data that come from a + fixed set of values. + Series : One-dimensional array with axis labels (including time series). + IntervalIndex : Immutable Index implementing an ordered, sliceable set. + + Notes + ----- + Any NA values will be NA in the result. Out of bounds values will be NA in + the resulting Series or Categorical object. + + Examples + -------- + Discretize into three equal-sized bins. + + >>> import mars.tensor as mt + >>> import mars.dataframe as md + + >>> md.cut(mt.array([1, 7, 5, 4, 6, 3]), 3).execute() + ... # doctest: +ELLIPSIS + [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... 
+ + >>> md.cut(mt.array([1, 7, 5, 4, 6, 3]), 3, retbins=True).execute() + ... # doctest: +ELLIPSIS + ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... + array([0.994, 3. , 5. , 7. ])) + + Discovers the same bins, but assign them specific labels. Notice that + the returned Categorical's categories are `labels` and is ordered. + + >>> md.cut(mt.array([1, 7, 5, 4, 6, 3]), + ... 3, labels=["bad", "medium", "good"]).execute() + [bad, good, medium, medium, good, bad] + Categories (3, object): [bad < medium < good] + + ordered=False will result in unordered categories when labels are passed. This parameter + can be used to allow non-unique labels: + + >>> md.cut(np.array([1, 7, 5, 4, 6, 3]), 3, + ... labels=["B", "A", "B"], ordered=False).execute() + ['B', 'B', 'A', 'A', 'B', 'B'] + Categories (2, object): ['A', 'B'] + + ``labels=False`` implies you just want the bins back. + + >>> md.cut([0, 1, 1, 2], bins=4, labels=False).execute() + array([0, 1, 1, 3]) + + Passing a Series as an input returns a Series with categorical dtype: + + >>> s = md.Series(mt.array([2, 4, 6, 8, 10]), + ... index=['a', 'b', 'c', 'd', 'e']) + >>> md.cut(s, 3).execute() + ... # doctest: +ELLIPSIS + a (1.992, 4.667] + b (1.992, 4.667] + c (4.667, 7.333] + d (7.333, 10.0] + e (7.333, 10.0] + dtype: category + Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... + + Passing a Series as an input returns a Series with mapping value. + It is used to map numerically to intervals based on bins. + + >>> s = md.Series(mt.array([2, 4, 6, 8, 10]), + ... index=['a', 'b', 'c', 'd', 'e']) + >>> md.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False).execute() + ... # doctest: +ELLIPSIS + (a 0.0 + b 1.0 + c 2.0 + d 3.0 + e NaN + dtype: float64, array([0, 2, 4, 6, 8, 10])) + + Use `drop` optional when bins is not unique + + >>> md.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, + ... right=False, duplicates='drop').execute() + ... # doctest: +ELLIPSIS + (a 0.0 + b 1.0 + c 2.0 + d 3.0 + e NaN + dtype: float64, array([0, 2, 4, 6, 10])) + + Passing an IntervalIndex for `bins` results in those categories exactly. + Notice that values not covered by the IntervalIndex are set to NaN. 0 + is to the left of the first bin (which is closed on the right), and 1.5 + falls between two bins. + + >>> bins = md.Index(pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])) + >>> md.cut([0, 0.5, 1.5, 2.5, 4.5], bins).execute() + [NaN, (0, 1], NaN, (2, 3], (4, 5]] + Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] + """ + + if isinstance(bins, Integral) and bins < 1: + raise ValueError("`bins` should be a positive integer") + + op = DataFrameCut( + bins=bins, + right=right, + labels=labels, + retbins=retbins, + precision=precision, + include_lowest=include_lowest, + duplicates=duplicates, + ordered=ordered, + ) + ret = op(x) + if not retbins: + return ret[0] + else: + return ret diff --git a/python/xorbits/_mars/dataframe/base/datetimes.py b/python/xorbits/_mars/dataframe/base/datetimes.py new file mode 100644 index 000000000..8d7375eae --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/datetimes.py @@ -0,0 +1,154 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import ( + BoolField, + DictField, + KeyField, + StringField, + TupleField, +) +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_series + + +class SeriesDatetimeMethod(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATETIME_METHOD + + _input = KeyField("input") + _method = StringField("method") + _method_args = TupleField("method_args") + _method_kwargs = DictField("method_kwargs") + _is_property = BoolField("is_property") + + def __init__( + self, + method=None, + method_args=None, + method_kwargs=None, + is_property=None, + output_types=None, + **kw + ): + super().__init__( + _method=method, + _method_args=method_args, + _method_kwargs=method_kwargs, + _is_property=is_property, + _output_types=output_types, + **kw + ) + if not self.output_types: + self.output_types = [OutputType.series] + + @property + def input(self): + return self._input + + @property + def method(self): + return self._method + + @property + def method_args(self): + return self._method_args + + @property + def method_kwargs(self): + return self._method_kwargs + + @property + def is_property(self): + return self._is_property + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, inp): + return _datetime_method_to_handlers[self._method].call(self, inp) + + @classmethod + def tile(cls, op): + return _datetime_method_to_handlers[op.method].tile(op) + + @classmethod + def execute(cls, ctx, op): + return _datetime_method_to_handlers[op.method].execute(ctx, op) + + +class SeriesDatetimeMethodBaseHandler: + @classmethod + def call(cls, op, inp): + empty_series = build_empty_series(inp.dtype) + if op.is_property: + test_obj = getattr(empty_series.dt, op.method) + else: + test_obj = getattr(empty_series.dt, op.method)( + *op.method_args, **op.method_kwargs + ) + dtype = test_obj.dtype + return op.new_series( + [inp], + shape=inp.shape, + dtype=dtype, + index_value=inp.index_value, + name=inp.name, + ) + + @classmethod + def tile(cls, op): + out = op.outputs[0] + + out_chunks = [] + for series_chunk in op.input.chunks: + chunk_op = op.copy().reset_key() + out_chunks.append( + chunk_op.new_chunk( + [series_chunk], + shape=series_chunk.shape, + dtype=out.dtype, + index=series_chunk.index, + index_value=series_chunk.index_value, + name=series_chunk.name, + ) + ) + + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = op.input.nsplits + new_op = op.copy() + return new_op.new_tileables([op.input], kws=[params]) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.input.key] + try: + out = getattr(inp.dt, op.method) + except ValueError: + # fail due to buffer read-only + out = getattr(inp.copy().dt, op.method) + if not op.is_property: + out = out(*op.method_args, **op.method_kwargs) + ctx[op.outputs[0].key] = out + + +_datetime_method_to_handlers = {} +for method in dir(pd.Series.dt): + if not method.startswith("_"): + 
_datetime_method_to_handlers[method] = SeriesDatetimeMethodBaseHandler diff --git a/python/xorbits/_mars/dataframe/base/describe.py b/python/xorbits/_mars/dataframe/base/describe.py new file mode 100644 index 000000000..769026bfe --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/describe.py @@ -0,0 +1,266 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ... import tensor as mt +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, FieldTypes, KeyField, ListField +from ...utils import has_unknown_shape, lazy_import +from ..core import SERIES_TYPE +from ..initializer import DataFrame, Series +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, parse_index + +cudf = lazy_import("cudf") + + +class DataFrameDescribe(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DESCRIBE + + _input = KeyField("input") + _percentiles = ListField("percentiles", FieldTypes.float64) + _include = AnyField("include") + _exclude = AnyField("exclude") + + def __init__( + self, percentiles=None, include=None, exclude=None, output_types=None, **kw + ): + super().__init__( + _percentiles=percentiles, + _include=include, + _exclude=exclude, + _output_types=output_types, + **kw + ) + + @property + def input(self): + return self._input + + @property + def percentiles(self): + return self._percentiles + + @property + def include(self): + return self._include + + @property + def exclude(self): + return self._exclude + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.stage != OperandStage.agg: + self._input = self._inputs[0] + + def __call__(self, df_or_series): + if isinstance(df_or_series, SERIES_TYPE): + if not np.issubdtype(df_or_series.dtype, np.number): + raise NotImplementedError("non-numeric type is not supported for now") + test_series = pd.Series([], dtype=df_or_series.dtype).describe( + percentiles=self._percentiles, + include=self._include, + exclude=self._exclude, + ) + return self.new_series( + [df_or_series], + shape=(len(test_series),), + dtype=test_series.dtype, + index_value=parse_index(test_series.index, store_data=True), + ) + else: + test_inp_df = build_empty_df(df_or_series.dtypes) + test_df = test_inp_df.describe( + percentiles=self._percentiles, + include=self._include, + exclude=self._exclude, + ) + if len(self.percentiles) == 0: + # specify percentiles=False + # Note: unlike pandas that False is illegal value for percentiles, + # Mars DataFrame allows user to specify percentiles=False + # to skip computation about percentiles + test_df.drop(["50%"], axis=0, inplace=True) + for dtype in test_df.dtypes: + if not np.issubdtype(dtype, np.number): + raise NotImplementedError( + "non-numeric type is not supported for now" + ) + return self.new_dataframe( + [df_or_series], + shape=test_df.shape, + 
dtypes=test_df.dtypes, + index_value=parse_index(test_df.index, store_data=True), + columns_value=parse_index(test_df.columns, store_data=True), + ) + + @classmethod + def tile(cls, op): + inp = op.input + + if len(inp.chunks) == 1: + return cls._tile_one_chunk(op) + + if isinstance(inp, SERIES_TYPE): + result = yield from cls._tile_series(op) + else: + result = yield from cls._tile_dataframe(op) + return result + + @classmethod + def _tile_one_chunk(cls, op): + out = op.outputs[0] + + chunk_op = op.copy().reset_key() + chunk_params = out.params.copy() + chunk_params["index"] = (0,) * out.ndim + out_chunk = chunk_op.new_chunk([op.input.chunks[0]], kws=[chunk_params]) + + new_op = op.copy() + params = out.params.copy() + params["chunks"] = [out_chunk] + params["nsplits"] = tuple((s,) for s in out.shape) + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _tile_series(cls, op): + series = Series(op.input) + out = op.outputs[0] + index = out.index_value.to_pandas() + # ['count', 'mean', 'std', 'min', {percentiles}, 'max'] + names = index.tolist() + + values = [None] * 6 + for i, agg in enumerate(names[:4]): + values[i] = mt.atleast_1d(getattr(series, agg)()) + values[-1] = mt.atleast_1d(getattr(series, names[-1])()) + values[4] = series.quantile(op.percentiles).to_tensor() + + t = mt.concatenate(values).rechunk(len(names)) + ret = Series(t, index=index, name=series.name) + ret = yield from recursive_tile(ret) + return [ret] + + @classmethod + def _tile_dataframe(cls, op): + df = DataFrame(op.input) + out = op.outputs[0] + dtypes = out.dtypes + columns = dtypes.index.tolist() + + if df.chunk_shape[1] > 1: + df = df.rechunk({1: df.shape[1]}) + + # check dtypes if selected all fields + # to reduce graph scale + if df.dtypes.index.tolist() != columns: + df = df[columns] + + # calculate percentiles + percentiles = None + if len(op.percentiles) > 0: + if has_unknown_shape(*op.inputs): + yield + percentiles = yield from recursive_tile(df.quantile(op.percentiles)) + + # perform aggregation together + aggregation = yield from recursive_tile( + df.agg(["count", "mean", "std", "min", "max"]) + ) + + chunk_op = DataFrameDescribe( + output_types=op.output_types, + stage=OperandStage.agg, + percentiles=op.percentiles, + ) + chunk_params = out.params.copy() + chunk_params["index"] = (0, 0) + in_chunks = aggregation.chunks + if percentiles is not None: + in_chunks += percentiles.chunks + out_chunk = chunk_op.new_chunk(in_chunks, kws=[chunk_params]) + + new_op = op.copy() + params = out.params.copy() + params["chunks"] = [out_chunk] + params["nsplits"] = tuple((s,) for s in out.shape) + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op): + out = op.outputs[0] + if op.stage is None: # 1 chunk + df_or_series = ctx[op.input.key] + + ctx[out.key] = df_or_series.describe( + percentiles=op.percentiles, include=op.include, exclude=op.exclude + ) + else: + assert op.stage == OperandStage.agg + + inputs = [ctx[inp.key] for inp in op.inputs] + xdf = ( + pd + if isinstance(inputs[0], (pd.DataFrame, pd.Series, pd.Index)) + or cudf is None + else cudf + ) + + if len(inputs) == 1: + df = inputs[0] + else: + assert len(inputs) > 1 + aggregations = inputs[0] + percentiles = xdf.concat(inputs[1:], axis=0) + df = xdf.concat( + [aggregations.iloc[:-1], percentiles, aggregations.iloc[-1:]], + axis=0, + ) + # ['count', 'mean', 'std', 'min', {percentiles}, 'max'] + df.index = out.index_value.to_pandas() + ctx[out.key] = df + + +def describe(df_or_series, 
percentiles=None, include=None, exclude=None): + if percentiles is False: + percentiles = [] + elif percentiles is None: + percentiles = [0.25, 0.5, 0.75] + else: + percentiles = list(percentiles) + if percentiles is not None: + for p in percentiles: + if p < 0 or p > 1: + raise ValueError( + "percentiles should all be in the interval [0, 1]. " + "Try [{0:.3f}] instead.".format(p / 100) + ) + # median should always be included + if 0.5 not in percentiles: + percentiles.append(0.5) + percentiles = np.asarray(percentiles) + + # sort and check for duplicates + unique_pcts = np.unique(percentiles) + if len(unique_pcts) < len(percentiles): + raise ValueError("percentiles cannot contain duplicates") + percentiles = unique_pcts.tolist() + + op = DataFrameDescribe(percentiles=percentiles, include=include, exclude=exclude) + return op(df_or_series) diff --git a/python/xorbits/_mars/dataframe/base/diff.py b/python/xorbits/_mars/dataframe/base/diff.py new file mode 100644 index 000000000..a63994812 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/diff.py @@ -0,0 +1,304 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, Int8Field, Int64Field +from ..core import DATAFRAME_TYPE, OutputType +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, build_empty_series, validate_axis +from .shift import DataFrameShift + + +class DataFrameDiff(DataFrameOperandMixin, DataFrameOperand): + _op_type_ = opcodes.DIFF + + _periods = Int64Field("periods") + _axis = Int8Field("axis") + + _bool_columns = AnyField("bool_columns") + + @property + def periods(self): + return self._periods + + @property + def axis(self): + return self._axis + + @property + def bool_columns(self): + return self._bool_columns + + def __init__(self, periods=None, axis=None, bool_columns=None, **kw): + super().__init__(_periods=periods, _axis=axis, _bool_columns=bool_columns, **kw) + + def __call__(self, df_or_series): + params = df_or_series.params.copy() + + if isinstance(df_or_series, DATAFRAME_TYPE): + self.output_types = [OutputType.dataframe] + mock_obj = build_empty_df(df_or_series.dtypes) + params["dtypes"] = mock_obj.diff().dtypes + else: + self.output_types = [OutputType.series] + mock_obj = build_empty_series(df_or_series.dtype, name=df_or_series.name) + params["dtype"] = mock_obj.diff().dtype + + return self.new_tileable([df_or_series], **params) + + @classmethod + def tile(cls, op): + in_obj = op.inputs[0] + out_obj = op.outputs[0] + axis = op.axis or 0 + + if in_obj.chunk_shape[axis] > 1: + shifted = yield from recursive_tile( + DataFrameShift(periods=op.periods, axis=axis)(in_obj) + ) + shift_chunks = shifted.chunks + else: + shift_chunks = itertools.repeat(None) + + chunks = [] + bool_columns_dict = dict() + for in_chunk, shift_chunk in zip(in_obj.chunks, shift_chunks): + params = 
in_chunk.params.copy() + if in_chunk.ndim == 2: + params["dtypes"] = out_obj.dtypes[in_chunk.dtypes.index] + try: + bool_columns = bool_columns_dict[in_chunk.index[1]] + except KeyError: + bool_columns = bool_columns_dict[in_chunk.index[1]] = [ + col + for col, dt in in_chunk.dtypes.items() + if dt == np.dtype(bool) + ] + else: + params["dtype"] = out_obj.dtype + bool_columns = in_chunk.dtype == np.dtype(bool) + + new_op = op.copy().reset_key() + new_op._bool_columns = bool_columns + + if shift_chunk is None: + chunks.append(new_op.new_chunk([in_chunk], **params)) + else: + chunks.append(new_op.new_chunk([in_chunk, shift_chunk], **params)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + [in_obj], chunks=chunks, nsplits=in_obj.nsplits, **out_obj.params + ) + + @classmethod + def execute(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + if len(op.inputs) == 1: + if in_data.ndim == 2: + try: + ctx[op.outputs[0].key] = in_data.diff( + periods=op.periods, axis=op.axis + ) + except ValueError: + ctx[op.outputs[0].key] = in_data.copy().diff( + periods=op.periods, axis=op.axis + ) + else: + ctx[op.outputs[0].key] = in_data.diff(periods=op.periods) + else: + in_shift = ctx[op.inputs[1].key] + result = in_data - in_shift + if op.bool_columns: + if in_data.ndim == 2: + result.replace( + {c: {1: True, -1: True, 0: False} for c in op.bool_columns}, + inplace=True, + ) + else: + result.replace({1: True, -1: True, 0: False}, inplace=True) + ctx[op.outputs[0].key] = result + + +def df_diff(df, periods=1, axis=0): + """ + First discrete difference of element. + Calculates the difference of a DataFrame element compared with another + element in the DataFrame (default is the element in the same column + of the previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, accepts negative + values. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Take difference over rows (0) or columns (1). + + Returns + ------- + DataFrame + + See Also + -------- + Series.diff : First discrete difference for a Series. + DataFrame.pct_change : Percent change over given number of periods. + DataFrame.shift : Shift index by desired number of periods with an + optional time freq. + + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. + + Examples + -------- + Difference with previous row + + >>> import mars.dataframe as md + >>> df = md.DataFrame({'a': [1, 2, 3, 4, 5, 6], + ... 'b': [1, 1, 2, 3, 5, 8], + ... 'c': [1, 4, 9, 16, 25, 36]}) + >>> df.execute() + a b c + 0 1 1 1 + 1 2 1 4 + 2 3 2 9 + 3 4 3 16 + 4 5 5 25 + 5 6 8 36 + + >>> df.diff().execute() + a b c + 0 NaN NaN NaN + 1 1.0 0.0 3.0 + 2 1.0 1.0 5.0 + 3 1.0 1.0 7.0 + 4 1.0 2.0 9.0 + 5 1.0 3.0 11.0 + + Difference with previous column + + >>> df.diff(axis=1).execute() + a b c + 0 NaN 0.0 0.0 + 1 NaN -1.0 3.0 + 2 NaN -1.0 7.0 + 3 NaN -1.0 13.0 + 4 NaN 0.0 20.0 + 5 NaN 2.0 28.0 + + Difference with 3rd previous row + + >>> df.diff(periods=3).execute() + a b c + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 NaN NaN NaN + 3 3.0 2.0 15.0 + 4 3.0 4.0 21.0 + 5 3.0 6.0 27.0 + + Difference with following row + + >>> df.diff(periods=-1).execute() + a b c + 0 -1.0 0.0 -3.0 + 1 -1.0 -1.0 -5.0 + 2 -1.0 -1.0 -7.0 + 3 -1.0 -2.0 -9.0 + 4 -1.0 -3.0 -11.0 + 5 NaN NaN NaN + """ + axis = validate_axis(axis, df) + op = DataFrameDiff(periods=periods, axis=axis) + return op(df) + + +def series_diff(series, periods=1): + """ + First discrete difference of element. 
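The boolean handling in DataFrameDiff.execute above leans on the fact that, for booleans, "the difference is non-zero" is the same predicate as xor, which is what the docstrings promise. A small pandas check of that equivalence (plain pandas, not Mars code; `prev` stands in for the values produced by the shift step):

import pandas as pd

cur = pd.Series([True, True, False, True])
prev = pd.Series([False, True, True, False])   # stand-in for the shifted values

sub = (cur.astype("int64") - prev.astype("int64")).replace({1: True, -1: True, 0: False})
xor = cur ^ prev
assert list(sub) == list(xor)                  # non-zero difference <=> values differ <=> xor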
+ Calculates the difference of a Series element compared with another + element in the Series (default is element in previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, accepts negative + values. + + Returns + ------- + Series + First differences of the Series. + + See Also + -------- + Series.pct_change : + Percent change over given number of periods. + Series.shift : + Shift index by desired number of periods with an optional time freq. + DataFrame.diff : + First discrete difference of object. + + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. + + Examples + -------- + + Difference with previous row + + >>> import mars.dataframe as md + >>> s = md.Series([1, 1, 2, 3, 5, 8]) + >>> s.diff().execute() + 0 NaN + 1 0.0 + 2 1.0 + 3 1.0 + 4 2.0 + 5 3.0 + dtype: float64 + + Difference with 3rd previous row + + >>> s.diff(periods=3).execute() + 0 NaN + 1 NaN + 2 NaN + 3 2.0 + 4 4.0 + 5 6.0 + dtype: float64 + + Difference with following row + + >>> s.diff(periods=-1).execute() + 0 0.0 + 1 -1.0 + 2 -1.0 + 3 -2.0 + 4 -3.0 + 5 NaN + dtype: float64 + """ + op = DataFrameDiff(periods=periods) + return op(series) diff --git a/python/xorbits/_mars/dataframe/base/drop.py b/python/xorbits/_mars/dataframe/base/drop.py new file mode 100644 index 000000000..d7cbaaedd --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/drop.py @@ -0,0 +1,545 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from collections import OrderedDict + +import numpy as np + +from ... 
import opcodes +from ...core import CHUNK_TYPE, Chunk, Entity, OutputType, recursive_tile +from ...serialization.serializables import AnyField, StringField +from ..core import DATAFRAME_TYPE, INDEX_CHUNK_TYPE, SERIES_TYPE, IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, validate_axis + + +class DataFrameDrop(DataFrameOperandMixin, DataFrameOperand): + _op_type_ = opcodes.DATAFRAME_DROP + + _index = AnyField("index") + _columns = AnyField("columns") + _level = AnyField("level") + _errors = StringField("errors") + + def __init__(self, index=None, columns=None, level=None, errors=None, **kw): + super().__init__( + _index=index, _columns=columns, _level=level, _errors=errors, **kw + ) + + @property + def index(self): + return self._index + + @property + def columns(self): + return self._columns + + @property + def level(self): + return self._level + + @property + def errors(self): + return self._errors + + def _filter_dtypes(self, dtypes, ignore_errors=False): + if self._columns: + return dtypes.drop( + index=self._columns, + level=self._level, + errors="ignore" if ignore_errors else self._errors, + ) + else: + return dtypes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs[1:]) + if len(self._inputs) > 1: + self._index = next(inputs_iter) + + def __call__(self, df_or_series): + params = df_or_series.params.copy() + shape_list = list(df_or_series.shape) + + if self._index is not None: + if isinstance(df_or_series.index_value.value, IndexValue.RangeIndex): + params["index_value"] = parse_index( + None, (df_or_series.key, df_or_series.index_value.key) + ) + shape_list[0] = np.nan + + if isinstance(df_or_series, DATAFRAME_TYPE): + new_dtypes = self._filter_dtypes(df_or_series.dtypes) + params["columns_value"] = parse_index(new_dtypes.index, store_data=True) + params["dtypes"] = new_dtypes + shape_list[1] = len(new_dtypes) + self.output_types = [OutputType.dataframe] + elif isinstance(df_or_series, SERIES_TYPE): + self.output_types = [OutputType.series] + else: + self.output_types = [OutputType.index] + + params["shape"] = tuple(shape_list) + + inputs = [df_or_series] + if isinstance(self._index, Entity): + inputs.append(self._index) + return self.new_tileable(inputs, **params) + + @classmethod + def tile(cls, op: "DataFrameDrop"): + inp = op.inputs[0] + out = op.outputs[0] + if len(op.inputs) > 1: + rechunked = yield from recursive_tile( + op.index.rechunk({0: (op.index.shape[0],)}) + ) + index_chunk = rechunked.chunks[0] + else: + index_chunk = op.index + + col_to_args = OrderedDict() + chunks = [] + for c in inp.chunks: + params = c.params.copy() + if isinstance(inp, DATAFRAME_TYPE): + new_dtypes, new_col_id = col_to_args.get(c.index[1], (None, None)) + + if new_dtypes is None: + new_col_id = len(col_to_args) + new_dtypes = op._filter_dtypes(c.dtypes, ignore_errors=True) + if len(new_dtypes) == 0: + continue + col_to_args[c.index[1]] = (new_dtypes, new_col_id) + + params.update( + dict( + dtypes=new_dtypes, + index=(c.index[0], new_col_id), + index_value=c.index_value, + columns_value=parse_index(new_dtypes.index, store_data=True), + ) + ) + if op.index is not None: + params.update( + dict( + shape=(np.nan, len(new_dtypes)), + index_value=parse_index(None, (c.key, c.index_value.key)), + ) + ) + else: + params["shape"] = (c.shape[0], len(new_dtypes)) + elif op.index is not None: + params.update( + dict( + shape=(np.nan,), + index_value=parse_index(None, (c.key, c.index_value.key)), + ) + 
) + + chunk_inputs = [c] + if isinstance(index_chunk, Chunk): + chunk_inputs.append(index_chunk) + + new_op = op.copy().reset_key() + new_op._index = index_chunk + chunks.append(new_op.new_chunk(chunk_inputs, **params)) + + new_op = op.copy().reset_key() + params = out.params.copy() + if op.index is not None: + nsplits_list = [(np.nan,) * inp.chunk_shape[0]] + else: + nsplits_list = [inp.nsplits[0]] + if isinstance(inp, DATAFRAME_TYPE): + nsplits_list.append(tuple(len(dt) for dt, _ in col_to_args.values())) + params.update(dict(chunks=chunks, nsplits=tuple(nsplits_list))) + return new_op.new_tileables(op.inputs, **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameDrop"): + inp = op.inputs[0] + if isinstance(op.index, CHUNK_TYPE): + index_val = ctx[op.index.key] + else: + index_val = op.index + + if isinstance(inp, INDEX_CHUNK_TYPE): + ctx[op.outputs[0].key] = ctx[inp.key].drop(index_val, errors="ignore") + else: + ctx[op.outputs[0].key] = ctx[inp.key].drop( + index=index_val, columns=op.columns, level=op.level, errors="ignore" + ) + + +def _drop( + df_or_series, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", +): + axis = validate_axis(axis, df_or_series) + if labels is not None: + if axis == 0: + index = labels + else: + columns = labels + + if index is not None and errors == "raise": + warnings.warn("Errors will not raise for non-existing indices") + if isinstance(columns, Entity): + raise NotImplementedError("Columns cannot be Mars objects") + + op = DataFrameDrop(index=index, columns=columns, level=level, errors=errors) + df = op(df_or_series) + if inplace: + df_or_series.data = df.data + else: + return df + + +def df_drop( + df, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", +): + """ + Drop specified labels from rows or columns. + + Remove rows or columns by specifying label names and corresponding + axis, or by specifying directly index or column names. When using a + multi-index, labels on different levels can be removed by specifying + the level. + + Parameters + ---------- + labels : single label or list-like + Index or column labels to drop. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Whether to drop labels from the index (0 or 'index') or + columns (1 or 'columns'). + index : single label or list-like + Alternative to specifying axis (``labels, axis=0`` + is equivalent to ``index=labels``). + columns : single label or list-like + Alternative to specifying axis (``labels, axis=1`` + is equivalent to ``columns=labels``). + level : int or level name, optional + For MultiIndex, level from which the labels will be removed. + inplace : bool, default False + If True, do operation inplace and return None. + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are + dropped. Note that errors for missing indices will not raise. + + Returns + ------- + DataFrame + DataFrame without the removed index or column labels. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis. + + See Also + -------- + DataFrame.loc : Label-location based indexer for selection by label. + DataFrame.dropna : Return DataFrame with labels on given axis omitted + where (all or any) data are missing. + DataFrame.drop_duplicates : Return DataFrame with duplicate rows + removed, optionally only considering certain columns. + Series.drop : Return Series with specified index labels removed. 
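Notes
-----
As stated for ``errors`` above, labels that are missing from the index are
skipped (with a warning) rather than raising ``KeyError``. A minimal sketch of
that behaviour, assuming only the API documented here and an active mars
session:

>>> import mars.dataframe as md
>>> md.DataFrame({'A': [1, 2, 3]}).drop(index=[5]).execute()
   A
0  1
1  2
2  3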
+ + Examples + -------- + >>> import numpy as np + >>> import pandas as pd + >>> import mars.dataframe as md + >>> df = md.DataFrame(np.arange(12).reshape(3, 4), + ... columns=['A', 'B', 'C', 'D']) + >>> df.execute() + A B C D + 0 0 1 2 3 + 1 4 5 6 7 + 2 8 9 10 11 + + Drop columns + + >>> df.drop(['B', 'C'], axis=1).execute() + A D + 0 0 3 + 1 4 7 + 2 8 11 + + >>> df.drop(columns=['B', 'C']).execute() + A D + 0 0 3 + 1 4 7 + 2 8 11 + + Drop a row by index + + >>> df.drop([0, 1]).execute() + A B C D + 2 8 9 10 11 + + Drop columns and/or rows of MultiIndex DataFrame + + >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> df = md.DataFrame(index=midx, columns=['big', 'small'], + ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], + ... [250, 150], [1.5, 0.8], [320, 250], + ... [1, 0.8], [0.3, 0.2]]) + >>> df.execute() + big small + lama speed 45.0 30.0 + weight 200.0 100.0 + length 1.5 1.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + length 1.5 0.8 + falcon speed 320.0 250.0 + weight 1.0 0.8 + length 0.3 0.2 + + >>> df.drop(index='cow', columns='small').execute() + big + lama speed 45.0 + weight 200.0 + length 1.5 + falcon speed 320.0 + weight 1.0 + length 0.3 + + >>> df.drop(index='length', level=1).execute() + big small + lama speed 45.0 30.0 + weight 200.0 100.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + falcon speed 320.0 250.0 + weight 1.0 0.8 + """ + return _drop( + df, + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) + + +def df_pop(df, item): + """ + Return item and drop from frame. Raise KeyError if not found. + + Parameters + ---------- + item : str + Label of column to be popped. + + Returns + ------- + Series + + Examples + -------- + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan)], + ... columns=('name', 'class', 'max_speed')) + >>> df.execute() + name class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + >>> df.pop('class').execute() + 0 bird + 1 bird + 2 mammal + 3 mammal + Name: class, dtype: object + + >>> df.execute() + name max_speed + 0 falcon 389.0 + 1 parrot 24.0 + 2 lion 80.5 + 3 monkey NaN + """ + series = df.data[item] + df_drop(df, item, axis=1, inplace=True) + return series + + +def series_drop( + series, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", +): + """ + Return Series with specified index labels removed. + + Remove elements of a Series based on specifying the index labels. + When using a multi-index, labels on different levels can be removed + by specifying the level. + + Parameters + ---------- + labels : single label or list-like + Index labels to drop. + axis : 0, default 0 + Redundant for application on Series. + index : single label or list-like + Redundant for application on Series, but 'index' can be used instead + of 'labels'. + + .. versionadded:: 0.21.0 + columns : single label or list-like + No change is made to the Series; use 'index' or 'labels' instead. + + .. versionadded:: 0.21.0 + level : int or level name, optional + For MultiIndex, level for which the labels will be removed. + inplace : bool, default False + If True, do operation inplace and return None. 
+ errors : {'ignore', 'raise'}, default 'raise' + Note that this argument is kept only for compatibility, and errors + will not raise even if ``errors=='raise'``. + + Returns + ------- + Series + Series with specified index labels removed. + + Raises + ------ + KeyError + If none of the labels are found in the index. + + See Also + -------- + Series.reindex : Return only specified index labels of Series. + Series.dropna : Return series without null values. + Series.drop_duplicates : Return Series with duplicate values removed. + DataFrame.drop : Drop specified labels from rows or columns. + + Examples + -------- + >>> import numpy as np + >>> import pandas as pd + >>> import mars.dataframe as md + >>> s = md.Series(data=np.arange(3), index=['A', 'B', 'C']) + >>> s.execute() + A 0 + B 1 + C 2 + dtype: int64 + + Drop labels B en C + + >>> s.drop(labels=['B', 'C']).execute() + A 0 + dtype: int64 + + Drop 2nd level label in MultiIndex Series + + >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> s = md.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], + ... index=midx) + >>> s.execute() + lama speed 45.0 + weight 200.0 + length 1.2 + cow speed 30.0 + weight 250.0 + length 1.5 + falcon speed 320.0 + weight 1.0 + length 0.3 + dtype: float64 + + >>> s.drop(labels='weight', level=1).execute() + lama speed 45.0 + length 1.2 + cow speed 30.0 + length 1.5 + falcon speed 320.0 + length 0.3 + dtype: float64 + """ + return _drop( + series, + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) + + +def index_drop(index, labels, errors="raise"): + """ + Make new Index with passed list of labels deleted. + + Parameters + ---------- + labels : array-like + errors : {'ignore', 'raise'}, default 'raise' + Note that this argument is kept only for compatibility, and errors + will not raise even if ``errors=='raise'``. + + Returns + ------- + dropped : Index + + Raises + ------ + KeyError + If not all of the labels are found in the selected axis + """ + return _drop(index, labels=labels, errors=errors) diff --git a/python/xorbits/_mars/dataframe/base/drop_duplicates.py b/python/xorbits/_mars/dataframe/base/drop_duplicates.py new file mode 100644 index 000000000..596fd8c8a --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/drop_duplicates.py @@ -0,0 +1,421 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField +from ...utils import calc_nsplits, lazy_import +from ..operands import OutputType +from ..utils import ( + gen_unknown_index_value, + hash_dataframe_on, + parse_index, + standardize_range_index, +) +from ._duplicate import DuplicateOperand, validate_subset + +cudf = lazy_import("cudf") + + +class DataFrameDropDuplicates(DuplicateOperand): + _op_type_ = opcodes.DROP_DUPLICATES + + _ignore_index = BoolField("ignore_index") + + def __init__( + self, + subset=None, + keep=None, + ignore_index=None, + output_types=None, + method=None, + subset_chunk=None, + shuffle_size=None, + **kw + ): + super().__init__( + _subset=subset, + _keep=keep, + _ignore_index=ignore_index, + _output_types=output_types, + _method=method, + _subset_chunk=subset_chunk, + _shuffle_size=shuffle_size, + **kw + ) + + @property + def ignore_index(self): + return self._ignore_index + + @classmethod + def _get_shape(cls, input_shape, op): + shape = (np.nan,) + input_shape[1:] + if op.output_types[0] == OutputType.dataframe and len(shape) == 1: + shape += (3,) + return shape + + @classmethod + def _gen_tileable_params(cls, op: "DataFrameDropDuplicates", input_params): + params = input_params.copy() + if op.ignore_index: + params["index_value"] = parse_index(pd.RangeIndex(-1)) + else: + params["index_value"] = gen_unknown_index_value( + input_params["index_value"], op.keep, op.subset, type(op).__name__ + ) + params["shape"] = cls._get_shape(input_params["shape"], op) + return params + + def __call__(self, inp, inplace=False): + self._output_types = inp.op.output_types + params = self._gen_tileable_params(self, inp.params) + + ret = self.new_tileable([inp], kws=[params]) + if inplace: + inp.data = ret.data + return ret + + @classmethod + def _gen_chunk_params(cls, op: "DataFrameDropDuplicates", input_chunk): + input_params = input_chunk.params + inp = op.inputs[0] + chunk_params = input_params.copy() + chunk_params["index"] = input_chunk.index[:1] + (0,) * (inp.ndim - 1) + chunk_params["shape"] = cls._get_shape(input_params["shape"], op) + chunk_params["index_value"] = gen_unknown_index_value( + input_params["index_value"], input_chunk + ) + if inp.ndim == 2: + chunk_params["columns_value"] = inp.columns_value + chunk_params["dtypes"] = inp.dtypes + else: + chunk_params["name"] = inp.name + chunk_params["dtype"] = inp.dtype + return chunk_params + + @classmethod + def _get_map_output_types(cls, input_chunk, method: str): + if method == "subset_tree": + return [OutputType.dataframe] + else: + return input_chunk.op.output_types + + @classmethod + def _tile_shuffle(cls, op: "DataFrameDropDuplicates", inp): + tiled = super()._tile_shuffle(op, inp)[0] + put_back_chunks = tiled.chunks + if op.ignore_index: + yield put_back_chunks + put_back_chunks = standardize_range_index(put_back_chunks) + new_op = op.copy() + params = tiled.params + params["nsplits"] = calc_nsplits({c.index: c.shape for c in put_back_chunks}) + params["chunks"] = put_back_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _execute_chunk(cls, ctx, op): + inp = ctx[op.input.key] + ctx[op.outputs[0].key] = cls._drop_duplicates(inp, op) + + @classmethod + def _execute_subset_tree_post(cls, ctx, op): + inp = ctx[op.input.key] + out = op.outputs[0] + idx = op.outputs[0].index[0] + subset = ctx[op.subset_chunk.key] + selected = subset[subset["_chunk_index_"] == idx]["_i_"] + ret = inp.iloc[selected] + if op.ignore_index: + 
prev_size = (subset["_chunk_index_"] < out.index[0]).sum() + ret.index = pd.RangeIndex(prev_size, prev_size + len(ret)) + ctx[op.outputs[0].key] = ret + + @classmethod + def _execute_shuffle_map(cls, ctx, op): + out = op.outputs[0] + shuffle_size = op.shuffle_size + subset = op.subset + + inp = ctx[op.input.key] + dropped = cls._drop_duplicates(inp, op) + if dropped.ndim == 1: + dropped = dropped.to_frame() + subset = dropped.columns.tolist() + else: + if subset is None: + subset = dropped.columns.tolist() + dropped["_chunk_index_"] = out.index[0] + dropped["_i_"] = np.arange(dropped.shape[0]) + hashed = hash_dataframe_on(dropped, subset, shuffle_size) + for i, data in enumerate(hashed): + reducer_idx = (i,) + out.index[1:] + ctx[out.key, reducer_idx] = dropped.iloc[data] + + @classmethod + def _execute_shuffle_reduce(cls, ctx, op: "DataFrameDropDuplicates"): + out = op.outputs[0] + inputs = list(op.iter_mapper_data(ctx)) + + xdf = cls._get_xdf(inputs[0]) + inp = xdf.concat(inputs) + dropped = cls._drop_duplicates( + inp, + op, + subset=[c for c in inp.columns if c not in ("_chunk_index_", "_i_")], + keep=op.keep, + ignore_index=op.ignore_index, + ) + for i in range(op.shuffle_size): + filtered = dropped[dropped["_chunk_index_"] == i] + del filtered["_chunk_index_"] + ctx[out.key, (i,)] = filtered + + @classmethod + def _execute_shuffle_put_back(cls, ctx, op: "DataFrameDropDuplicates"): + out = op.outputs[0] + inputs = list(op.iter_mapper_data(ctx)) + + xdf = cls._get_xdf(inputs[0]) + inp = xdf.concat(inputs) + inp.sort_values("_i_", inplace=True) + del inp["_i_"] + + if out.op.output_types[0] == OutputType.index: + assert inp.shape[1] == 1 + ret = xdf.Index(inp.iloc[:, 0]) + elif out.op.output_types[0] == OutputType.series: + assert inp.shape[1] == 1 + ret = inp.iloc[:, 0] + ret.name = out.name + else: + ret = inp + + if op.ignore_index: + ret.reset_index(drop=True, inplace=True) + ctx[out.key] = ret + + @classmethod + def execute(cls, ctx, op): + if op.method is None: + # one chunk + cls._execute_chunk(ctx, op) + elif op.method == "tree": + # tree + cls._execute_chunk(ctx, op) + elif op.method == "subset_tree": + # subset tree + if op.stage == OperandStage.map: + cls._execute_subset_tree_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_subset_tree_combine(ctx, op) + elif op.stage == OperandStage.agg: + cls._execute_subset_tree_agg(ctx, op) + else: + # post + cls._execute_subset_tree_post(ctx, op) + else: + assert op.method == "shuffle" + if op.stage == OperandStage.map: + cls._execute_shuffle_map(ctx, op) + elif op.reducer_phase == "drop_duplicates": + cls._execute_shuffle_reduce(ctx, op) + else: + assert op.reducer_phase == "put_back" + cls._execute_shuffle_put_back(ctx, op) + + +def df_drop_duplicates( + df, subset=None, keep="first", inplace=False, ignore_index=False, method="auto" +): + """ + Return DataFrame with duplicate rows removed. + + Considering certain columns is optional. Indexes, including time indexes + are ignored. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to keep. + - ``first`` : Drop duplicates except for the first occurrence. + - ``last`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. + inplace : bool, default False + Whether to drop duplicates in place or to return a copy. 
+ ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + Returns + ------- + DataFrame + DataFrame with duplicates removed or None if ``inplace=True``. + """ + if method not in ("auto", "tree", "subset_tree", "shuffle", None): + raise ValueError( + "method could only be one of " + "'auto', 'tree', 'subset_tree', 'shuffle' or None" + ) + subset = validate_subset(df, subset) + op = DataFrameDropDuplicates( + subset=subset, keep=keep, ignore_index=ignore_index, method=method + ) + return op(df, inplace=inplace) + + +def series_drop_duplicates(series, keep="first", inplace=False, method="auto"): + """ + Return Series with duplicate values removed. + + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + Method to handle dropping duplicates: + + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + + inplace : bool, default ``False`` + If ``True``, performs operation inplace and returns None. + + Returns + ------- + Series + Series with duplicates dropped. + + See Also + -------- + Index.drop_duplicates : Equivalent method on Index. + DataFrame.drop_duplicates : Equivalent method on DataFrame. + Series.duplicated : Related method on Series, indicating duplicate + Series values. + + Examples + -------- + Generate a Series with duplicated entries. + + >>> import mars.dataframe as md + >>> s = md.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], + ... name='animal') + >>> s.execute() + 0 lama + 1 cow + 2 lama + 3 beetle + 4 lama + 5 hippo + Name: animal, dtype: object + + With the 'keep' parameter, the selection behaviour of duplicated values + can be changed. The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. + + >>> s.drop_duplicates().execute() + 0 lama + 1 cow + 3 beetle + 5 hippo + Name: animal, dtype: object + + The value 'last' for parameter 'keep' keeps the last occurrence for + each set of duplicated entries. + + >>> s.drop_duplicates(keep='last').execute() + 1 cow + 3 beetle + 4 lama + 5 hippo + Name: animal, dtype: object + + The value ``False`` for parameter 'keep' discards all sets of + duplicated entries. Setting the value of 'inplace' to ``True`` performs + the operation inplace and returns ``None``. + + >>> s.drop_duplicates(keep=False, inplace=True) + >>> s.execute() + 1 cow + 3 beetle + 5 hippo + Name: animal, dtype: object + """ + if method not in ("auto", "tree", "shuffle", None): + raise ValueError( + "method could only be one of 'auto', 'tree', 'shuffle' or None" + ) + op = DataFrameDropDuplicates(keep=keep, method=method) + return op(series, inplace=inplace) + + +def index_drop_duplicates(index, keep="first", method="auto"): + """ + Return Index with duplicate values removed. + + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + + Returns + ------- + deduplicated : Index + + See Also + -------- + Series.drop_duplicates : Equivalent method on Series. + DataFrame.drop_duplicates : Equivalent method on DataFrame. + Index.duplicated : Related method on Index, indicating duplicate + Index values. + + Examples + -------- + Generate an pandas.Index with duplicate values. 
+ + >>> import mars.dataframe as md + + >>> idx = md.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) + + The `keep` parameter controls which duplicate values are removed. + The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. + + >>> idx.drop_duplicates(keep='first').execute() + Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') + + The value 'last' keeps the last occurrence for each set of duplicated + entries. + + >>> idx.drop_duplicates(keep='last').execute() + Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') + + The value ``False`` discards all sets of duplicated entries. + + >>> idx.drop_duplicates(keep=False).execute() + Index(['cow', 'beetle', 'hippo'], dtype='object') + """ + if method not in ("auto", "tree", "shuffle", None): + raise ValueError( + "method could only be one of 'auto', 'tree', 'shuffle' or None" + ) + op = DataFrameDropDuplicates(keep=keep, method=method) + return op(index) diff --git a/python/xorbits/_mars/dataframe/base/duplicated.py b/python/xorbits/_mars/dataframe/base/duplicated.py new file mode 100644 index 000000000..bb4db862f --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/duplicated.py @@ -0,0 +1,533 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes +from ...core import OutputType +from ...core.operand import OperandStage +from ..utils import gen_unknown_index_value, hash_dataframe_on +from ._duplicate import DuplicateOperand, validate_subset + + +class DataFrameDuplicated(DuplicateOperand): + _op_type_ = opcodes.DUPLICATED + + def __init__( + self, + subset=None, + keep=None, + output_types=None, + method=None, + subset_chunk=None, + shuffle_size=None, + **kw + ): + super().__init__( + _subset=subset, + _keep=keep, + _output_types=output_types, + _method=method, + _subset_chunk=subset_chunk, + _shuffle_size=shuffle_size, + **kw + ) + + @classmethod + def _get_shape(cls, input_shape, op): + return (input_shape[0],) + + @classmethod + def _gen_tileable_params(cls, op: "DataFrameDuplicated", input_params): + # duplicated() always returns a Series + return { + "shape": cls._get_shape(input_params["shape"], op), + "index_value": input_params["index_value"], + "dtype": np.dtype(bool), + "name": input_params.get("name"), + } + + def __call__(self, inp, inplace=False): + self._output_types = [OutputType.series] + params = self._gen_tileable_params(self, inp.params) + + return self.new_tileable([inp], kws=[params]) + + @classmethod + def _get_map_output_types(cls, input_chunk, method: str): + if method in ("tree", "subset_tree"): + return [OutputType.dataframe] + else: + return input_chunk.op.output_types + + @classmethod + def _gen_chunk_params_default(cls, op: "DataFrameDuplicated", input_chunk): + return { + "shape": cls._get_shape(input_chunk.shape, op), + "index_value": input_chunk.index_value, + "dtype": np.dtype(bool), + "name": input_chunk.name if input_chunk.ndim == 1 else None, + "index": (input_chunk.index[0],), + } + + @classmethod + def _get_intermediate_shape(cls, input_shape): + if len(input_shape) > 1: + s = input_shape[1:] + else: + s = (2,) + return (np.nan,) + s + + @classmethod + def _gen_intermediate_chunk_params(cls, op: "DataFrameDuplicated", input_chunk): + inp = op.input + chunk_params = dict() + chunk_params["shape"] = shape = cls._get_intermediate_shape(input_chunk.shape) + chunk_params["index"] = input_chunk.index[:1] + (0,) * (len(shape) - 1) + chunk_params["index_value"] = gen_unknown_index_value( + input_chunk.index_value, input_chunk + ) + if inp.ndim == 2 and len(shape) == 2: + chunk_params["columns_value"] = input_chunk.columns_value + chunk_params["dtypes"] = input_chunk.dtypes + return chunk_params + + @classmethod + def _gen_chunk_params(cls, op: "DataFrameDuplicated", input_chunk): + is_terminal_chunk = False + if op.method is None: + # one chunk + is_terminal_chunk = True + elif op.method == "subset_tree" and op.stage is None: + is_terminal_chunk = True + elif op.method == "tree" and op.stage == OperandStage.agg: + is_terminal_chunk = True + elif op.method == "shuffle" and op.reducer_phase == "put_back": + is_terminal_chunk = True + + if is_terminal_chunk: + return cls._gen_chunk_params_default(op, input_chunk) + else: + return cls._gen_intermediate_chunk_params(op, input_chunk) + + @classmethod + def _duplicated(cls, inp, op, subset=None, keep=None): + if keep is None: + keep = op.keep + if inp.ndim == 2: + if subset is None: + subset = op.subset + return inp.duplicated(subset=subset, keep=keep) + else: + return inp.duplicated(keep=keep) + + @classmethod + def _execute_chunk(cls, ctx, op): + inp = ctx[op.input.key] + ctx[op.outputs[0].key] = cls._duplicated(inp, op) + + @classmethod + def _execute_tree_map(cls, ctx, op): + inp = ctx[op.input.key] + xdf = cls._get_xdf(inp) + if op.subset 
is not None: + result = inp[op.subset].copy() + else: + result = inp.copy() + duplicated = cls._duplicated(inp, op) + if not duplicated.name: + duplicated.name = "_duplicated_" + result.iloc[duplicated.values] = None + result = xdf.concat([result, duplicated], axis=1) + ctx[op.outputs[0].key] = result + + @classmethod + def _execute_tree_combine(cls, ctx, op): + inp = ctx[op.input.key] + result = inp.copy() + duplicated_filter = ~inp.iloc[:, -1] + duplicates = inp.loc[duplicated_filter] + dup_on_duplicated = cls._duplicated(duplicates, op) + result.iloc[duplicated_filter.to_numpy().nonzero()[0], -1] = dup_on_duplicated + duplicated = result.iloc[:, -1] + result.iloc[duplicated.values, :-1] = None + ctx[op.outputs[0].key] = result + + @classmethod + def _execute_tree_agg(cls, ctx, op): + inp = ctx[op.input.key] + result = inp.iloc[:, -1].copy() + duplicates = inp[~inp.iloc[:, -1]] + dup_on_duplicated = cls._duplicated(duplicates, op) + result[~inp.iloc[:, -1]] = dup_on_duplicated + expect_name = op.outputs[0].name + if result.name != expect_name: + result.name = expect_name + result = result.astype(bool) + ctx[op.outputs[0].key] = result + + @classmethod + def _execute_subset_tree_post(cls, ctx, op): + inp = ctx[op.input.key] + idx = op.outputs[0].index[0] + subset = ctx[op.subset_chunk.key] + selected = subset[subset["_chunk_index_"] == idx]["_i_"] + + xdf = cls._get_xdf(inp) + duplicated = np.ones(len(inp), dtype=bool) + duplicated[selected] = False + + ctx[op.outputs[0].key] = xdf.Series(duplicated, index=inp.index) + + @classmethod + def _execute_shuffle_map(cls, ctx, op): + out = op.outputs[0] + shuffle_size = op.shuffle_size + subset = op.subset + + inp = ctx[op.input.key] + if subset is not None: + result = inp[subset].copy() + else: + result = inp.copy() + if result.ndim == 1: + name = result.name + result = result.to_frame() + if name is None: + result.columns = ["_duplicated_"] + subset = result.columns.tolist() + else: + if subset is None: + subset = result.columns.tolist() + if len(subset) == 1: + result.columns = subset = ["_duplicated_"] + result["_chunk_index_"] = out.index[0] + result["_i_"] = np.arange(result.shape[0]) + hashed = hash_dataframe_on(result, subset, shuffle_size) + for i, data in enumerate(hashed): + reducer_idx = (i,) + out.index[1:] + ctx[out.key, reducer_idx] = result.iloc[data] + + @classmethod + def _execute_shuffle_reduce(cls, ctx, op: "DataFrameDuplicated"): + out = op.outputs[0] + inputs = list(op.iter_mapper_data(ctx)) + + xdf = cls._get_xdf(inputs[0]) + inp = xdf.concat(inputs) + subset = [c for c in inp.columns if c not in ("_chunk_index_", "_i_")] + duplicated = cls._duplicated(inp, op, subset=subset) + result = xdf.concat([duplicated, inp[["_chunk_index_", "_i_"]]], axis=1) + for i in range(op.shuffle_size): + filtered = result[result["_chunk_index_"] == i] + del filtered["_chunk_index_"] + if len(subset) > 1 or subset[0] == "_duplicated_": + filtered.columns = ["_duplicated_"] + filtered.columns[1:].tolist() + else: + filtered.columns = [subset[0]] + filtered.columns[1:].tolist() + ctx[out.key, (i,)] = filtered + + @classmethod + def _execute_shuffle_put_back(cls, ctx, op: "DataFrameDuplicated"): + inputs = list(op.iter_mapper_data(ctx)) + + xdf = cls._get_xdf(inputs[0]) + inp = xdf.concat(inputs) + inp.sort_values("_i_", inplace=True) + del inp["_i_"] + duplicated = inp.iloc[:, 0] + if duplicated.name == "_duplicated_": + duplicated.name = None + ctx[op.outputs[0].key] = duplicated + + @classmethod + def execute(cls, ctx, op: 
"DataFrameDuplicated"): + if op.method is None: + # one chunk + cls._execute_chunk(ctx, op) + elif op.method == "tree": + # tree + if op.stage == OperandStage.map: + cls._execute_tree_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_tree_combine(ctx, op) + else: + assert op.stage == OperandStage.agg + cls._execute_tree_agg(ctx, op) + elif op.method == "subset_tree": + # subset tree + if op.stage == OperandStage.map: + cls._execute_subset_tree_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_subset_tree_combine(ctx, op) + elif op.stage == OperandStage.agg: + cls._execute_subset_tree_agg(ctx, op) + else: + # post + cls._execute_subset_tree_post(ctx, op) + else: + assert op.method == "shuffle" + if op.stage == OperandStage.map: + cls._execute_shuffle_map(ctx, op) + elif op.reducer_phase == "drop_duplicates": + cls._execute_shuffle_reduce(ctx, op) + else: + assert op.reducer_phase == "put_back" + cls._execute_shuffle_put_back(ctx, op) + + +def df_duplicated(df, subset=None, keep="first", method="auto"): + """ + Return boolean Series denoting duplicate rows. + + Considering certain columns is optional. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to mark. + + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + Series + Boolean series for each duplicated rows. + + See Also + -------- + Index.duplicated : Equivalent method on index. + Series.duplicated : Equivalent method on Series. + Series.drop_duplicates : Remove duplicate values from Series. + DataFrame.drop_duplicates : Remove duplicate values from DataFrame. + + Examples + -------- + Consider dataset containing ramen rating. + + >>> import mars.dataframe as md + + >>> df = md.DataFrame({ + ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], + ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], + ... 'rating': [4, 4, 3.5, 15, 5] + ... }) + >>> df.execute() + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + By default, for each set of duplicated values, the first occurrence + is set on False and all others on True. + + >>> df.duplicated().execute() + 0 False + 1 True + 2 False + 3 False + 4 False + dtype: bool + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True. + + >>> df.duplicated(keep='last').execute() + 0 True + 1 False + 2 False + 3 False + 4 False + dtype: bool + + By setting ``keep`` on False, all duplicates are True. + + >>> df.duplicated(keep=False).execute() + 0 True + 1 True + 2 False + 3 False + 4 False + dtype: bool + + To find duplicates on specific column(s), use ``subset``. 
+ + >>> df.duplicated(subset=['brand']).execute() + 0 False + 1 True + 2 False + 3 True + 4 True + dtype: bool + """ + + if method not in ("auto", "tree", "subset_tree", "shuffle", None): + raise ValueError( + "method could only be one of " + "'auto', 'tree', 'subset_tree', 'shuffle' or None" + ) + subset = validate_subset(df, subset) + op = DataFrameDuplicated(subset=subset, keep=keep, method=method) + return op(df) + + +def series_duplicated(series, keep="first", method="auto"): + """ + Indicate duplicate Series values. + + Duplicated values are indicated as ``True`` values in the resulting + Series. Either all duplicates, all except the first or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + Method to handle dropping duplicates: + + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. + + Returns + ------- + Series + Series indicating whether each value has occurred in the + preceding values. + + See Also + -------- + Index.duplicated : Equivalent method on pandas.Index. + DataFrame.duplicated : Equivalent method on pandas.DataFrame. + Series.drop_duplicates : Remove duplicate values from Series. + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set on False and all others on True: + + >>> import mars.dataframe as md + + >>> animals = md.Series(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> animals.duplicated().execute() + 0 False + 1 False + 2 True + 3 False + 4 True + dtype: bool + + which is equivalent to + + >>> animals.duplicated(keep='first').execute() + 0 False + 1 False + 2 True + 3 False + 4 True + dtype: bool + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> animals.duplicated(keep='last').execute() + 0 True + 1 False + 2 True + 3 False + 4 False + dtype: bool + + By setting keep on ``False``, all duplicates are True: + + >>> animals.duplicated(keep=False).execute() + 0 True + 1 False + 2 True + 3 False + 4 True + dtype: bool + """ + if method not in ("auto", "tree", "shuffle", None): + raise ValueError( + "method could only be one of 'auto', 'tree', 'shuffle' or None" + ) + op = DataFrameDuplicated(keep=keep, method=method) + return op(series) + + +def index_duplicated(index, keep="first"): + """ + Indicate duplicate index values. + + Duplicated values are indicated as ``True`` values in the resulting + array. Either all duplicates, all except the first, or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + The value or values in a set of duplicates to mark as missing. + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. + + Returns + ------- + Tensor + + See Also + -------- + Series.duplicated : Equivalent method on pandas.Series. + DataFrame.duplicated : Equivalent method on pandas.DataFrame. + Index.drop_duplicates : Remove duplicate values from Index. 
+ + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set to False and all others to True: + + >>> import mars.dataframe as md + + >>> idx = md.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> idx.duplicated().execute() + array([False, False, True, False, True]) + + which is equivalent to + + >>> idx.duplicated(keep='first').execute() + array([False, False, True, False, True]) + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> idx.duplicated(keep='last').execute() + array([ True, False, True, False, False]) + + By setting keep on ``False``, all duplicates are True: + + >>> idx.duplicated(keep=False).execute() + array([ True, False, True, False, True]) + """ + return index.to_series().duplicated(keep=keep).to_tensor() diff --git a/python/xorbits/_mars/dataframe/base/eval.py b/python/xorbits/_mars/dataframe/base/eval.py new file mode 100644 index 000000000..dc7172d1c --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/eval.py @@ -0,0 +1,836 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ast +import binascii +import operator +import sys +import textwrap +import tokenize +from collections import OrderedDict +from functools import reduce +from io import StringIO + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import ENTITY_TYPE, OutputType, get_output_types, recursive_tile +from ...serialization.serializables import BoolField, DictField, StringField +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +LOCAL_TAG = "_local_var_" +BACKTICK_TAG = "_backtick_var_" + + +def _tokenize_str(reader): + token_generator = tokenize.generate_tokens(reader) + + def _iter_backtick_string(gen, line, back_start): + for _, tokval, start, _, _ in gen: + if tokval == "`": + return ( + BACKTICK_TAG + + binascii.b2a_hex( + line[back_start[1] + 1 : start[1]].encode() + ).decode() + ) + else: + raise SyntaxError(f"backtick quote at {back_start} does not match") + + for toknum, tokval, start, _, line in token_generator: + if toknum == tokenize.OP: + if tokval == "@": + tokval = LOCAL_TAG + if tokval == "&": + toknum = tokenize.NAME + tokval = "and" + elif tokval == "|": + toknum = tokenize.NAME + tokval = "or" + elif tokval == "`": + yield tokenize.NAME, _iter_backtick_string(token_generator, line, start) + continue + yield toknum, tokval + + +class CollectionVisitor(ast.NodeVisitor): + _op_handlers = { + ast.Add: operator.add, + ast.Sub: operator.sub, + ast.Mult: operator.mul, + ast.Div: operator.truediv, + ast.FloorDiv: operator.floordiv, + ast.mod: operator.mod, + ast.Pow: operator.pow, + ast.Eq: operator.eq, + ast.NotEq: operator.ne, + ast.Lt: operator.lt, + ast.LtE: operator.le, + ast.Gt: operator.gt, + ast.GtE: operator.ge, + ast.In: lambda x, y: y.isin(x), + ast.NotIn: lambda x, y: ~y.isin(x), + ast.UAdd: operator.pos, + ast.USub: operator.neg, + ast.Invert: operator.invert, + ast.And: operator.and_, + ast.Or: operator.or_, + } + + def __init__(self, resolvers, target, env): + self.env = env + self.target = target + self.resolvers = resolvers + + self.referenced_vars = set() + self.assigned = False + self.entity_subscribe = False + + def _preparse(self, expr): + reader = StringIO(expr).readline + return tokenize.untokenize(list(_tokenize_str(reader))) + + def eval(self, expr, rewrite=True): + if rewrite: + expr = self._preparse(expr) + node = ast.fix_missing_locations(ast.parse(expr)) + return self.visit(node) + + def get_named_object(self, obj_name): + for resolver in self.resolvers: + try: + return resolver[obj_name] + except (IndexError, KeyError): + continue + if obj_name in self.env: + self.referenced_vars.add(obj_name) + return self.env[obj_name] + raise KeyError(f"name {obj_name} is not defined") + + def visit(self, node): + if isinstance(node, ENTITY_TYPE): + return node + node_name = node.__class__.__name__ + method = "visit_" + node_name + try: + visitor = getattr(self, method) + except AttributeError: + raise SyntaxError( + "Query string contains unsupported syntax: {}".format(node_name) + ) + return visitor(node) + + def visit_Module(self, node): + if self.target is None and len(node.body) != 1: + raise SyntaxError("Only a single expression is allowed") + result = None + for expr in node.body: + result = self.visit(expr) + return result + + def visit_Expr(self, node): + return self.visit(node.value) + + def visit_BinOp(self, node): + left = self.visit(node.left) + right = self.visit(node.right) + return self._op_handlers[type(node.op)](left, right) + + def visit_Call(self, node): + func = self.visit(node.func) + args = [self.visit(n) for n in node.args] + kwargs = OrderedDict([(kw.arg, self.visit(kw.value)) for kw in node.keywords]) + return func(*args, **kwargs) + + def visit_Compare(self, node): + ops = node.ops + comps = 
node.comparators + + if len(comps) == 1: + binop = ast.BinOp(op=ops[0], left=node.left, right=comps[0]) + return self.visit(binop) + + left = node.left + values = [] + for op, comp in zip(ops, comps): + new_node = ast.Compare(comparators=[comp], left=left, ops=[op]) + left = comp + values.append(new_node) + return self.visit(ast.BoolOp(op=ast.And(), values=values)) + + def visit_BoolOp(self, node): + def func(lhs, rhs): + binop = ast.BinOp(op=node.op, left=lhs, right=rhs) + return self.visit(binop) + + return reduce(func, node.values) + + def visit_UnaryOp(self, node): + op = self.visit(node.operand) + return self._op_handlers[type(node.op)](op) + + def visit_Name(self, node): + if node.id.startswith(LOCAL_TAG): + local_name = node.id.replace(LOCAL_TAG, "") + self.referenced_vars.add(local_name) + return self.env[local_name] + if node.id.startswith(BACKTICK_TAG): + local_name = binascii.a2b_hex( + node.id.replace(BACKTICK_TAG, "").encode() + ).decode() + return self.get_named_object(local_name) + return self.get_named_object(node.id) + + def visit_NameConstant(self, node): # pragma: no cover + return node.value + + def visit_Num(self, node): # pragma: no cover + return node.n + + def visit_Str(self, node): # pragma: no cover + return node.s + + def visit_Constant(self, node): + return node.value + + def visit_List(self, node): + return [self.visit(e) for e in node.elts] + + def visit_Assign(self, node): + if self.target is None: + raise ValueError("Target not specified for assignment") + if isinstance(node.targets[0], ast.Tuple): + raise ValueError("Does not support assigning to multiple objects") + + target = node.targets[0].id + value = self.visit(node.value) + self.target[target] = value + self.assigned = True + + visit_Tuple = visit_List + + def visit_Attribute(self, node): + attr = node.attr + value = node.value + + ctx = node.ctx + if isinstance(ctx, ast.Load): + resolved = self.visit(value) + return getattr(resolved, attr) + + raise ValueError("Invalid Attribute context {0}".format(ctx.__name__)) + + def visit_Subscript(self, node): + value = self.visit(node.value) + sub = self.visit(node.slice) + if isinstance(value, ENTITY_TYPE): + self.entity_subscribe = True + return value[sub] + + def visit_Index(self, node): + return self.visit(node.value) + + def visit_Slice(self, node): + lower = node.lower + if lower is not None: + lower = self.visit(lower) + upper = node.upper + if upper is not None: + upper = self.visit(upper) + step = node.step + if step is not None: + step = self.visit(step) + + return slice(lower, upper, step) + + +class DataFrameEval(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.DATAFRAME_EVAL + + _expr = StringField("expr") + _parser = StringField("parser") + _engine = StringField("engine") + _variables = DictField("variables") + _self_target = BoolField("self_target") + _is_query = BoolField("is_query") + + def __init__( + self, + expr=None, + parser=None, + engine=None, + variables=None, + self_target=None, + is_query=None, + **kw, + ): + super().__init__( + _expr=expr, + _parser=parser, + _engine=engine, + _variables=variables, + _self_target=self_target, + _is_query=is_query, + **kw, + ) + + @property + def expr(self): + return self._expr + + @property + def parser(self): + return self._parser + + @property + def engine(self): + return self._engine + + @property + def variables(self): + return self._variables + + @property + def self_target(self): + return self._self_target + + @property + def is_query(self): + return self._is_query + + def 
__call__(self, df, output_type, shape, dtypes): + self._output_types = [output_type] + params = df.params + new_index_value = ( + df.index_value if not np.isnan(shape[0]) else parse_index(pd.RangeIndex(-1)) + ) + if output_type == OutputType.dataframe: + params.update( + dict( + dtypes=dtypes, + shape=shape, + columns_value=parse_index(dtypes.index, store_data=True), + index_value=new_index_value, + ) + ) + else: + name, dtype = dtypes + params = dict( + name=name, + dtype=dtype, + shape=shape, + index_value=new_index_value, + ) + return self.new_tileable([df], **params) + + def convert_to_query(self, df, output_type, shape, dtypes): + new_op = self.copy().reset_key() + new_op._is_query = True + new_op._self_target = False + return new_op(df, output_type, shape, dtypes) + + @classmethod + def tile(cls, op: "DataFrameEval"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + if in_df.ndim == 2: + if in_df.chunk_shape[1] > 1: + in_df = yield from recursive_tile(in_df.rechunk({1: in_df.shape[1]})) + + chunks = [] + for c in in_df.chunks: + if out_df.ndim == 2: + new_shape = ( + np.nan if np.isnan(out_df.shape[0]) else c.shape[0], + out_df.shape[1], + ) + params = dict( + dtypes=out_df.dtypes, + shape=new_shape, + columns_value=parse_index(out_df.dtypes.index, store_data=True), + index_value=c.index_value, + index=c.index, + ) + else: + new_shape = (np.nan if np.isnan(out_df.shape[0]) else c.shape[0],) + params = dict( + name=out_df.name, + dtype=out_df.dtype, + shape=new_shape, + index_value=c.index_value, + index=(c.index[0],), + ) + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + params = out_df.params + + new_nsplits = [in_df.nsplits[0], (out_df.shape[-1],)] + if np.isnan(out_df.shape[0]): + new_nsplits[0] = (np.nan,) * len(in_df.nsplits[0]) + if out_df.ndim == 1: + new_nsplits = new_nsplits[:1] + + params.update( + dict( + chunks=chunks, + nsplits=tuple(new_nsplits), + ) + ) + return new_op.new_tileables([in_df], **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameEval"): + in_data = ctx[op.inputs[0].key] + + if op.self_target: + in_data = in_data.copy() + + if op.is_query: + val = in_data.query( + op.expr, parser=op.parser, engine=op.engine, local_dict=op.variables + ) + else: + val = in_data.eval( + op.expr, parser=op.parser, engine=op.engine, local_dict=op.variables + ) + ctx[op.outputs[0].key] = val + + +def mars_eval( + expr, + parser="mars", + engine=None, + local_dict=None, + global_dict=None, + resolvers=(), + level=0, + target=None, + inplace=False, +): + """ + + Evaluate a Python expression as a string using various backends. + + The following arithmetic operations are supported: ``+``, ``-``, ``*``, + ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following + boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. :class:`~pandas.Series` and + :class:`~pandas.DataFrame` objects are supported and behave as they would + with plain ol' Python evaluation. + + Parameters + ---------- + expr : str + The expression to evaluate. This string cannot contain any Python + `statements + `__, + only Python `expressions + `__. + local_dict : dict or None, optional + A dictionary of local variables, taken from locals() by default. 
+ global_dict : dict or None, optional + A dictionary of global variables, taken from globals() by default. + resolvers : list of dict-like or None, optional + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. For example, this is used in the + :meth:`~DataFrame.query` method to inject the + ``DataFrame.index`` and ``DataFrame.columns`` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. + level : int, optional + The number of prior stack frames to traverse and add to the current + scope. Most users will **not** need to change this parameter. + target : object, optional, default None + This is the target object for assignment. It is used when there is + variable assignment in the expression. If so, then `target` must + support item assignment with string keys, and if a copy is being + returned, it must also support `.copy()`. + inplace : bool, default False + If `target` is provided, and the expression mutates `target`, whether + to modify `target` inplace. Otherwise, return a copy of `target` with + the mutation. + + Returns + ------- + ndarray, numeric scalar, DataFrame, Series + + Raises + ------ + ValueError + There are many instances where such an error can be raised: + + - `target=None`, but the expression is multiline. + - The expression is multiline, but not all them have item assignment. + An example of such an arrangement is this: + + a = b + 1 + a + 2 + + Here, there are expressions on different lines, making it multiline, + but the last line has no variable assigned to the output of `a + 2`. + - `inplace=True`, but the expression is missing item assignment. + - Item assignment is provided, but the `target` does not support + string item assignment. + - Item assignment is provided and `inplace=False`, but the `target` + does not support the `.copy()` method + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.eval : Evaluate a string describing operations on + DataFrame columns. + + Notes + ----- + The ``dtype`` of any objects involved in an arithmetic ``%`` operation are + recursively cast to ``float64``. + + See the :ref:`enhancing performance ` documentation for + more details. 
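The tokenizer in this module also rewrites the ``@`` prefix (see ``LOCAL_TAG``
above), so local Python variables can be referenced inside the expression.
A minimal sketch, assuming an interactive mars session where ``factor`` is
visible in the calling frame:

>>> import mars.dataframe as md
>>> df = md.DataFrame({"age": [10, 20]})
>>> factor = 3
>>> md.eval("df.age * @factor").execute()
0    30
1    60
Name: age, dtype: int64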
+ + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) + >>> df.execute() + animal age + 0 dog 10 + 1 pig 20 + + We can add a new column using ``pd.eval``: + + >>> md.eval("double_age = df.age * 2", target=df).execute() + animal age double_age + 0 dog 10 20 + 1 pig 20 40 + """ + if not isinstance(expr, str): + raise TypeError("expr must be a string") + + expr = textwrap.dedent(expr) + + try: + frame = sys._getframe(level + 1) + local_dict = local_dict or dict() + local_dict.update(frame.f_locals) + global_dict = global_dict or dict() + global_dict.update(frame.f_globals) + finally: + del frame + + env = dict() + env.update(global_dict) + env.update(local_dict) + + ref_frames = set(resolvers) | set([target] if target is not None else []) + self_target = len(resolvers) > 0 and resolvers[0] is target + + if target is not None and not inplace: + target = target.copy() + + visitor = CollectionVisitor(resolvers, target, env) + result = visitor.eval(expr) + result = result if result is not None else target + has_var_frame = any( + isinstance(env[k], ENTITY_TYPE) for k in visitor.referenced_vars + ) + if len(ref_frames) != 1 or visitor.entity_subscribe or has_var_frame: + if parser != "mars": + raise NotImplementedError("Does not support parser names other than mars") + if engine is not None: + raise NotImplementedError("Does not support specifying engine names") + return result + else: + parser = "pandas" if parser == "mars" else parser + referenced_env = {k: env[k] for k in visitor.referenced_vars} + op = DataFrameEval( + expr, + parser=parser, + engine=engine, + variables=referenced_env, + self_target=visitor.assigned and self_target, + is_query=False, + ) + output_type = get_output_types(result)[0] + dtypes = result.dtypes if result.ndim == 2 else (result.name, result.dtype) + return op(resolvers[0], output_type, result.shape, dtypes) + + +def df_eval(df, expr, inplace=False, **kwargs): + """ + Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. This allows + `eval` to run arbitrary code, which can make you vulnerable to code + injection if you pass user input to this function. + + Parameters + ---------- + expr : str + The expression string to evaluate. + inplace : bool, default False + If the expression contains an assignment, whether to perform the + operation inplace and mutate the existing DataFrame. Otherwise, + a new DataFrame is returned. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. + + Returns + ------- + ndarray, scalar, or pandas object + The result of the evaluation. + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.assign : Can evaluate an expression or function to create new + values for a column. + eval : Evaluate a Python expression as a string using various + backends. + + Notes + ----- + For more details see the API documentation for :func:`~eval`. + For detailed examples see :ref:`enhancing performance with eval + `. 
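A non-assigning expression evaluates to an ordinary mars Series, so it can be
used directly as a boolean mask; a minimal sketch (assuming an active mars
session):

>>> import mars.dataframe as md
>>> df = md.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
>>> df[df.eval('A > B')].execute()
   A  B
4  5  2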
+ + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df.execute() + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B').execute() + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B').execute() + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df.execute() + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Use ``inplace=True`` to modify the original DataFrame. + + >>> df.eval('C = A + B', inplace=True) + >>> df.execute() + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval(''' + ... C = A + B + ... D = A - B + ... ''').execute() + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + """ + level = kwargs.pop("level", None) or 0 + kwargs["inplace"] = inplace + val = mars_eval(expr, resolvers=(df,), target=df, level=level + 1, **kwargs) + if not inplace: + return val + + +def df_query(df, expr, inplace=False, **kwargs): + """ + Query the columns of a DataFrame with a boolean expression. + + Parameters + ---------- + expr : str + The query string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that contain spaces or operators by + surrounding them in backticks. This way you can also escape + names that start with a digit, or those that are a Python keyword. + Basically when it is not valid Python identifier. See notes down + for more details. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + + inplace : bool + Whether the query should modify the data in place or return + a modified copy. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. + + Returns + ------- + DataFrame + DataFrame resulting from the provided query expression. + + See Also + -------- + eval : Evaluate a string describing operations on + DataFrame columns. + DataFrame.eval : Evaluate a string describing operations on + DataFrame columns. + + Notes + ----- + The result of the evaluation of this expression is first passed to + :attr:`DataFrame.loc` and if that fails because of a + multidimensional key (e.g., a DataFrame) then the result will be passed + to :meth:`DataFrame.__getitem__`. + + This method uses the top-level :func:`eval` function to + evaluate the passed query. + + The :meth:`~pandas.DataFrame.query` method uses a slightly + modified Python syntax by default. For example, the ``&`` and ``|`` + (bitwise) operators have the precedence of their boolean cousins, + :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, + however the semantics are different. + + You can change the semantics of the expression by passing the keyword + argument ``parser='python'``. This enforces the same semantics as + evaluation in Python space. Likewise, you can pass ``engine='python'`` + to evaluate an expression using Python itself as a backend. This is not + recommended as it is inefficient compared to using ``numexpr`` as the + engine. 
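A small plain-pandas sketch of the parser note above (the column names are illustrative): with the default parser, `and`/`or` act like the element-wise `&`/`|`, while `parser='python'` keeps ordinary Python semantics, so the bitwise operators are written out explicitly.

import pandas as pd

df = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)})

# Default parser: the boolean keywords behave like their bitwise cousins.
r1 = df.query("A > 2 and B > 4")
# Python parser: spell the element-wise operators out.
r2 = df.query("(A > 2) & (B > 4)", parser="python")
assert r1.equals(r2)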
+ + The :attr:`DataFrame.index` and + :attr:`DataFrame.columns` attributes of the + :class:`~pandas.DataFrame` instance are placed in the query namespace + by default, which allows you to treat both the index and columns of the + frame as a column in the frame. + The identifier ``index`` is used for the frame index; you can also + use the name of the index to identify it in a query. Please note that + Python keywords may not be used as identifiers. + + For further details and examples see the ``query`` documentation in + :ref:`indexing `. + + *Backtick quoted variables* + + Backtick quoted variables are parsed as literal Python code and + are converted internally to a Python valid identifier. + This can lead to the following problems. + + During parsing a number of disallowed characters inside the backtick + quoted string are replaced by strings that are allowed as a Python identifier. + These characters include all operators in Python, the space character, the + question mark, the exclamation mark, the dollar sign, and the euro sign. + For other characters that fall outside the ASCII range (U+0001..U+007F) + and those that are not further specified in PEP 3131, + the query parser will raise an error. + This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). + + In a special case, quotes that make a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. + + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html) + in combination with the source code in :mod:`pandas.core.computation.parsing`. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 'C C': range(10, 5, -1)}) + >>> df.execute() + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + >>> df.query('A > B').execute() + A B C C + 4 5 2 6 + + The previous expression is equivalent to + + >>> df[df.A > df.B].execute() + A B C C + 4 5 2 6 + + For columns with spaces in their name, you can use backtick quoting. + + >>> df.query('B == `C C`').execute() + A B C C + 0 1 10 10 + + The previous expression is equivalent to + + >>> df[df.B == df['C C']].execute() + A B C C + 0 1 10 10 + """ + level = kwargs.pop("level", None) or 0 + predicate = mars_eval(expr, resolvers=(df,), level=level + 1, **kwargs) + result = df[predicate] + + if isinstance(predicate.op, DataFrameEval): + output_type = get_output_types(result)[0] + dtypes = result.dtypes if result.ndim == 2 else (result.name, result.dtype) + result = predicate.op.convert_to_query(df, output_type, result.shape, dtypes) + + if inplace: + df.data = result.data + else: + return result diff --git a/python/xorbits/_mars/dataframe/base/explode.py b/python/xorbits/_mars/dataframe/base/explode.py new file mode 100644 index 000000000..e8bacf35f --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/explode.py @@ -0,0 +1,213 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import OutputType, recursive_tile +from ...serialization.serializables import AnyField, BoolField +from ...utils import calc_nsplits +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, standardize_range_index + + +class DataFrameExplode(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.EXPLODE + + _column = AnyField("column") + _ignore_index = BoolField("ignore_field") + + def __init__(self, column=None, ignore_index=None, output_types=None, **kw): + super().__init__( + _column=column, _ignore_index=ignore_index, _output_types=output_types, **kw + ) + + @property + def column(self): + return self._column + + @property + def ignore_index(self): + return self._ignore_index + + def _rewrite_params(self, in_obj): + params = in_obj.params.copy() + new_shape = list(in_obj.shape) + new_shape[0] = np.nan + params["shape"] = tuple(new_shape) + + if self.ignore_index: + params["index_value"] = parse_index( + pd.RangeIndex(-1), (in_obj.key, in_obj.index_value.key) + ) + else: + params["index_value"] = parse_index( + None, (in_obj.key, in_obj.index_value.key) + ) + return params + + def __call__(self, df_or_series): + return self.new_tileable([df_or_series], **self._rewrite_params(df_or_series)) + + @classmethod + def tile(cls, op: "DataFrameExplode"): + in_obj = op.inputs[0] + + if in_obj.ndim == 2 and in_obj.chunk_shape[1] > 1: + # make sure data's second dimension has only 1 chunk + in_obj = yield from recursive_tile(in_obj.rechunk({1: in_obj.shape[1]})) + + chunks = [] + for chunk in in_obj.chunks: + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([chunk], **op._rewrite_params(chunk))) + + if op.ignore_index: + yield chunks + chunks = standardize_range_index(chunks) + + new_op = op.copy().reset_key() + out_params = op.outputs[0].params.copy() + out_params["chunks"] = chunks + out_params["nsplits"] = calc_nsplits({c.index: c.shape for c in chunks}) + return new_op.new_tileable([in_obj], kws=[out_params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameExplode"): + in_data = ctx[op.inputs[0].key] + if in_data.ndim == 2: + ctx[op.outputs[0].key] = in_data.explode(op.column) + else: + ctx[op.outputs[0].key] = in_data.explode() + + +def df_explode(df, column, ignore_index=False): + """ + Transform each element of a list-like to a row, replicating index values. + + Parameters + ---------- + column : str or tuple + Column to explode. + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + DataFrame + Exploded lists to rows of the subset columns; + index will be duplicated for these rows. + + Raises + ------ + ValueError : + if columns of the frame are not unique. + + See Also + -------- + DataFrame.unstack : Pivot a level of the (necessarily hierarchical) + index labels. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. + Series.explode : Explode a DataFrame from list-like columns to long format. 
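The tile method earlier in this file explodes each row chunk independently; a minimal plain-pandas sketch of why that is sound (the data is illustrative): splitting a frame into row blocks, exploding each block, and concatenating the pieces gives the same result as exploding the whole frame.

import pandas as pd

df = pd.DataFrame({"A": [[1, 2, 3], "foo", [], [3, 4]], "B": 1})

# Explode two row blocks separately, as the per-chunk execute does ...
parts = [df.iloc[:2].explode("A"), df.iloc[2:].explode("A")]
# ... and the concatenated result matches exploding the whole frame.
assert pd.concat(parts).equals(df.explode("A"))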
+ + Notes + ----- + This routine will explode list-likes including lists, tuples, + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) + >>> df.execute() + A B + 0 [1, 2, 3] 1 + 1 foo 1 + 2 [] 1 + 3 [3, 4] 1 + + >>> df.explode('A').execute() + A B + 0 1 1 + 0 2 1 + 0 3 1 + 1 foo 1 + 2 NaN 1 + 3 3 1 + 3 4 1 + """ + op = DataFrameExplode( + column=column, ignore_index=ignore_index, output_types=[OutputType.dataframe] + ) + return op(df) + + +def series_explode(series, ignore_index=False): + """ + Transform each element of a list-like to a row. + + Parameters + ---------- + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + Series + Exploded lists to rows; index will be duplicated for these rows. + + See Also + -------- + Series.str.split : Split string values on specified separator. + Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex + to produce DataFrame. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. + DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + + Notes + ----- + This routine will explode list-likes including lists, tuples, + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> s = md.Series([[1, 2, 3], 'foo', [], [3, 4]]) + >>> s.execute() + 0 [1, 2, 3] + 1 foo + 2 [] + 3 [3, 4] + dtype: object + + >>> s.explode().execute() + 0 1 + 0 2 + 0 3 + 1 foo + 2 NaN + 3 3 + 3 4 + dtype: object + """ + op = DataFrameExplode(ignore_index=ignore_index, output_types=[OutputType.series]) + return op(series) diff --git a/python/xorbits/_mars/dataframe/base/get_dummies.py b/python/xorbits/_mars/dataframe/base/get_dummies.py new file mode 100644 index 000000000..b81da2cb8 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/get_dummies.py @@ -0,0 +1,360 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
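A short plain-pandas sketch of the `ignore_index` option documented above (the keyword needs a recent pandas; the data is illustrative): exploded rows are relabeled 0..n-1 instead of repeating the source labels, which is why the tiled version re-numbers chunks with `standardize_range_index`.

import pandas as pd

df = pd.DataFrame({"A": [[1, 2, 3], "foo", [], [3, 4]], "B": 1})

# Without ignore_index the source labels repeat; with it the result is 0..n-1.
print(df.explode("A").index.tolist())                     # [0, 0, 0, 1, 2, 3, 3]
print(df.explode("A", ignore_index=True).index.tolist())  # [0, 1, 2, 3, 4, 5, 6]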
+ +import numpy as np +import pandas as pd + +from ...core import OutputType, recursive_tile +from ...serialization.serializables import AnyField, BoolField, ListField, StringField +from ..core import SERIES_TYPE +from ..datasource.dataframe import from_pandas as from_pandas_df +from ..datasource.series import from_pandas as from_pandas_series +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..reduction.unique import unique +from ..utils import gen_unknown_index_value + +_encoding_dtype_kind = ["O", "S", "U"] + + +class DataFrameGetDummies(DataFrameOperand, DataFrameOperandMixin): + prefix = AnyField("prefix") + prefix_sep = StringField("prefix_sep") + dummy_na = BoolField("dummy_na") + columns = ListField("columns") + sparse = BoolField("sparse") + drop_first = BoolField("drop_first") + dtype = AnyField("dtype") + + def __init__( + self, + prefix=None, + prefix_sep=None, + dummy_na=None, + columns=None, + sparse=None, + drop_first=None, + dtype=None, + **kws, + ): + super().__init__( + prefix=prefix, + prefix_sep=prefix_sep, + dummy_na=dummy_na, + columns=columns, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + **kws, + ) + self.output_types = [OutputType.dataframe] + + @classmethod + def tile(cls, op): + inp = op.inputs[0] + out = op.outputs[0] + if len(inp.chunks) == 1: + chunk_op = op.copy().reset_key() + chunk_param = out.params + chunk_param["index"] = (0, 0) + chunk = chunk_op.new_chunk(inp.chunks, kws=[chunk_param]) + new_op = op.copy().reset_key() + param = out.params + param["chunks"] = [chunk] + param["nsplits"] = ((np.nan,), (np.nan,)) + return new_op.new_dataframe(op.inputs, kws=[param]) + elif isinstance(inp, SERIES_TYPE): + unique_inp = yield from recursive_tile(unique(inp)) + chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_param = out.params + chunk_param["index_value"] = gen_unknown_index_value(c.index_value) + chunk_param["index"] = (c.index[0], 0) + chunk = chunk_op.new_chunk([c] + unique_inp.chunks, kws=[chunk_param]) + chunks.append(chunk) + + new_op = op.copy().reset_key() + param = out.params + param["chunks"] = chunks + param["nsplits"] = (tuple([np.nan] * inp.chunk_shape[0]), (np.nan,)) + return new_op.new_dataframe(op.inputs, kws=[param]) + else: + if op.columns: + encoding_columns = op.columns + else: + encoding_columns = [] + for idx, dtype in enumerate(inp.dtypes.values): + if dtype.kind in _encoding_dtype_kind: + column = inp.dtypes.index[idx] + encoding_columns.append(column) + # reindex, make encoding columns in the end of dataframe, to keep pace with pandas.get_dummies + total_columns = list(inp.columns.to_pandas().array) + for col in encoding_columns: + total_columns.remove(col) + total_columns.extend(encoding_columns) + inp = yield from recursive_tile(inp[total_columns]) + + unique_chunks = dict() + for col in encoding_columns: + unique_chunks[col] = yield from recursive_tile(unique(inp[col])) + + chunks = [] + prefix = op.prefix + column_to_prefix = dict() + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op.columns = [] + if isinstance(chunk_op.prefix, list): + chunk_op.prefix = [] + chunk_param = c.params + chunk_param["shape"] = (np.nan, np.nan) + chunk_columns = c.dtypes.index + inp_chunk = [c] + for chunk_column in chunk_columns: + if chunk_column in encoding_columns: + chunk_op.columns.append(chunk_column) + inp_chunk.extend(unique_chunks[chunk_column].chunks) + if isinstance(prefix, list): + if chunk_column in 
column_to_prefix.keys(): + chunk_op.prefix.append(column_to_prefix[chunk_column]) + else: + column_to_prefix[chunk_column] = prefix[0] + chunk_op.prefix.append(prefix[0]) + prefix = prefix[1:] + chunk = chunk_op.new_chunk(inp_chunk, kws=[chunk_param]) + chunks.append(chunk) + + new_op = op.copy() + kw = out.params.copy() + kw["chunks"] = chunks + kw["nsplits"] = ( + tuple([np.nan] * inp.chunk_shape[0]), + tuple([np.nan] * inp.chunk_shape[1]), + ) + return new_op.new_dataframe(op.inputs, kws=[kw]) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.inputs[0].key] + result_length = inp.shape[0] + unique_inputs = [] + for unique_input in op.inputs[1:]: + unique_inputs.append(ctx[unique_input.key].tolist()) + + if unique_inputs: + if isinstance(inp, pd.Series): + extra_series = pd.Series(unique_inputs[0]) + inp = pd.concat([inp, extra_series]) + else: + # make all unique_input's length the same, then get a dataframe + max_length = len(max(unique_inputs, key=len)) + unique_inputs = [ + unique_list + [unique_list[0]] * (max_length - len(unique_list)) + for unique_list in unique_inputs + ] + extra_dataframe = pd.DataFrame(dict(zip(op.columns, unique_inputs))) + + # add the columns that need not to encode, to concat extra_dataframe and inp + total_columns = list(inp.columns.array) + for col in op.columns: + total_columns.remove(col) + remain_columns = total_columns + not_encode_columns = [] + if len(remain_columns) > 0: + for col in remain_columns: + not_encode_columns.append([inp[col].iloc[0]] * max_length) + not_encode_dataframe = pd.DataFrame( + dict(zip(remain_columns, not_encode_columns)) + ) + + extra_dataframe = pd.concat( + [not_encode_dataframe, extra_dataframe], axis=1 + ) + inp = pd.concat([inp, extra_dataframe], axis=0) + + result = pd.get_dummies( + inp, + op.prefix, + op.prefix_sep, + op.dummy_na, + op.columns, + op.sparse, + op.drop_first, + op.dtype, + ) + ctx[op.outputs[0].key] = result.iloc[:result_length] + + def __call__(self, data): + if isinstance(data, (list, tuple)): + data = asseries(data) + elif isinstance(data, pd.Series): + data = from_pandas_series(data) + elif isinstance(data, pd.DataFrame): + data = from_pandas_df(data) + + if self.prefix is not None: + if isinstance(self.prefix, list): + if self.columns is not None: + encoding_col_num = len(self.columns) + else: + encoding_col_num = 0 + for dtype in data.dtypes.values: + if dtype.kind in _encoding_dtype_kind: + encoding_col_num += 1 + prefix_num = len(self.prefix) + if prefix_num != encoding_col_num: + raise ValueError( + f"Length of 'prefix' ({prefix_num}) did not match " + + f"the length of the columns being encoded ({encoding_col_num})" + ) + elif isinstance(self.prefix, dict): + if self.columns is not None: + encoding_col_num = len(self.columns) + prefix_num = len(self.prefix) + if prefix_num != encoding_col_num: + raise ValueError( + f"Length of 'prefix' ({prefix_num}) did not match " + + f"the length of the columns being encoded ({encoding_col_num})" + ) + columns = self.prefix.keys() + for columns_columnname, prefix_columnname in zip( + columns, list(self.columns) + ): + if columns_columnname != prefix_columnname: + raise KeyError(f"{columns_columnname}") + else: + self.columns = list(self.prefix.keys()) + # Convert prefix from dict to list, to simplify tile work + self.prefix = list(self.prefix.values()) + + return self.new_dataframe( + [data], + shape=(np.nan, np.nan), + dtypes=None, + index_value=data.index_value, + columns_value=None, + ) + + +def get_dummies( + data, + prefix=None, + 
prefix_sep="_", + dummy_na=False, + columns=None, + sparse=False, + drop_first=False, + dtype=None, +): + """ + Convert categorical variable into dummy/indicator variables. + + Parameters + ---------- + data : array-like, Series, or DataFrame + Data of which to get dummy indicators. + prefix : str, list of str, or dict of str, default None + String to append DataFrame column names. + Pass a list with length equal to the number of columns + when calling get_dummies on a DataFrame. Alternatively, `prefix` + can be a dictionary mapping column names to prefixes. + prefix_sep : str, default '_' + If appending prefix, separator/delimiter to use. Or pass a + list or dictionary as with `prefix`. + dummy_na : bool, default False + Add a column to indicate NaNs, if False NaNs are ignored. + columns : list-like, default None + Column names in the DataFrame to be encoded. + If `columns` is None then all the columns with + `object` or `category` dtype will be converted. + sparse : bool, default False + Whether the dummy-encoded columns should be backed by + a :class:`SparseArray` (True) or a regular NumPy array (False). + drop_first : bool, default False + Whether to get k-1 dummies out of k categorical levels by removing the + first level. + dtype : dtype, default np.uint8 + Data type for new columns. Only a single dtype is allowed. + + Returns + ------- + DataFrame + Dummy-coded data. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(list('abca')) + + >>> md.get_dummies(s).execute() + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + + >>> s1 = ['a', 'b', np.nan] + + >>> md.get_dummies(s1).execute() + a b + 0 1 0 + 1 0 1 + 2 0 0 + + >>> md.get_dummies(s1, dummy_na=True).execute() + a b NaN + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + + >>> df = md.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], + ... 'C': [1, 2, 3]}) + + >>> md.get_dummies(df, prefix=['col1', 'col2']).execute() + C col1_a col1_b col2_a col2_b col2_c + 0 1 1 0 0 1 0 + 1 2 0 1 1 0 0 + 2 3 1 0 0 0 1 + + >>> md.get_dummies(pd.Series(list('abcaa'))).execute() + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + 4 1 0 0 + + >>> md.get_dummies(pd.Series(list('abcaa')), drop_first=True).execute() + b c + 0 0 0 + 1 1 0 + 2 0 1 + 3 0 0 + 4 0 0 + + >>> md.get_dummies(pd.Series(list('abc')), dtype=float).execute() + a b c + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + """ + if columns is not None and not isinstance(columns, list): + raise TypeError("Input must be a list-like for parameter `columns`") + + op = DataFrameGetDummies( + prefix, prefix_sep, dummy_na, columns, sparse, drop_first, dtype + ) + + return op(data) diff --git a/python/xorbits/_mars/dataframe/base/isin.py b/python/xorbits/_mars/dataframe/base/isin.py new file mode 100644 index 000000000..d6737dff6 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/isin.py @@ -0,0 +1,401 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
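A minimal plain-pandas sketch of the chunk-level trick used by `DataFrameGetDummies.execute` above (the column and category names are illustrative): each chunk is padded with the globally collected unique values so every chunk emits the same dummy columns, and the padding rows are sliced off afterwards.

import pandas as pd

chunk = pd.DataFrame({"A": ["a", "b"]})  # one row chunk of a larger frame
all_uniques = ["a", "b", "c"]            # uniques gathered across all chunks

# Pad the chunk with every category so its dummy columns line up with the
# other chunks, then drop the padding rows again.
padded = pd.concat([chunk, pd.DataFrame({"A": all_uniques})], axis=0)
dummies = pd.get_dummies(padded).iloc[: len(chunk)]
print(dummies.columns.tolist())  # ['A_a', 'A_b', 'A_c'] even though 'c' is absent here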
+ +import itertools + +import numpy as np +import pandas as pd +from pandas.api.types import is_list_like + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE +from ...serialization.serializables import AnyField, KeyField +from ...tensor.core import TENSOR_TYPE +from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE, OutputType +from ..operands import DataFrameOperand, DataFrameOperandMixin +from .drop_duplicates import DataFrameDropDuplicates + + +class DataFrameIsin(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.ISIN + + input = KeyField("input") + values = AnyField("values") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self.input = next(inputs_iter) + if len(self._inputs) > 1: + if isinstance(self.values, dict): + new_values = dict() + for k, v in self.values.items(): + if isinstance(v, ENTITY_TYPE): + new_values[k] = next(inputs_iter) + else: + new_values[k] = v + self.values = new_values + else: + self.values = self._inputs[1] + + def __call__(self, elements): + inputs = [elements] + if isinstance(self.values, ENTITY_TYPE): + inputs.append(self.values) + elif isinstance(self.values, dict): + for v in self.values.values(): + if isinstance(v, ENTITY_TYPE): + inputs.append(v) + + if elements.ndim == 1: + return self.new_series( + inputs, + shape=elements.shape, + dtype=np.dtype("bool"), + index_value=elements.index_value, + name=elements.name, + ) + else: + dtypes = pd.Series( + [np.dtype(bool) for _ in elements.dtypes], index=elements.dtypes.index + ) + return self.new_dataframe( + inputs, + shape=elements.shape, + index_value=elements.index_value, + columns_value=elements.columns_value, + dtypes=dtypes, + ) + + @classmethod + def _tile_entity_values(cls, op): + from ...core.context import get_context + from ...tensor.base.unique import TensorUnique + from ..arithmetic.bitwise_or import tree_dataframe_or + from ..utils import auto_merge_chunks + + in_elements = op.input + out_elements = op.outputs[0] + # values contains mars objects + chunks_list = [] + in_chunks = in_elements.chunks + if any(len(t.chunks) > 4 for t in op.inputs): + # yield and merge value chunks to reduce graph nodes + yield_chunks = [c for c in in_chunks] + unique_values = [] + for value in op.inputs[1:]: + if len(value.chunks) >= len(in_chunks) * 2: + # when value chunks is much more than in_chunks, + # we call drop_duplicates to reduce the amount of data. 
+ if isinstance(value, TENSOR_TYPE): + chunks = [ + TensorUnique( + return_index=False, + return_inverse=False, + return_counts=False, + ).new_chunk( + [c], index=c.index, shape=(np.nan,), dtype=c.dtype + ) + for c in value.chunks + ] + unique_values.append( + TensorUnique( + return_index=False, + return_inverse=False, + return_counts=False, + ).new_tensor( + [value], + chunks=chunks, + nsplits=((np.nan,) * len(chunks),), + shape=(np.nan,), + dtype=value.dtype, + ) + ) + yield_chunks += chunks + else: + # is series + chunks = [ + DataFrameDropDuplicates( + keep="first", + ignore_index=False, + method="tree", + output_types=[OutputType.series], + ).new_chunk( + [c], + index=c.index, + index_value=c.index_value, + name=c.name, + dtype=c.dtype, + shape=(np.nan,), + ) + for c in value.chunks + ] + unique_values.append( + DataFrameDropDuplicates( + keep="first", + ignore_index=False, + method="tree", + output_types=[OutputType.series], + ).new_series( + [value], + chunks=chunks, + nsplits=((np.nan,) * len(chunks),), + index_value=value.index_value, + dtype=value.dtype, + shape=(np.nan,), + ) + ) + yield_chunks += chunks + else: + yield_chunks += value.chunks + unique_values.append(value) + yield yield_chunks + in_elements = auto_merge_chunks(get_context(), op.input) + in_chunks = in_elements.chunks + for value in unique_values: + if isinstance(value, SERIES_TYPE): + merged = auto_merge_chunks(get_context(), value) + chunks_list.append(merged.chunks) + elif isinstance(value, ENTITY_TYPE): + chunks_list.append(value.chunks) + else: + for value in op.inputs[1:]: + if isinstance(value, ENTITY_TYPE): + chunks_list.append(value.chunks) + + out_chunks = [] + for in_chunk in in_chunks: + isin_chunks = [] + for value_chunks in itertools.product(*chunks_list): + input_chunks = [in_chunk] + list(value_chunks) + isin_chunks.append(cls._new_chunk(op, in_chunk, input_chunks)) + out_chunk = tree_dataframe_or(*isin_chunks, index=in_chunk.index) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_elements.params + params["nsplits"] = in_elements.nsplits + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def tile(cls, op): + in_elements = op.input + out_elements = op.outputs[0] + + if len(op.inputs) > 1: + return (yield from cls._tile_entity_values(op)) + + out_chunks = [] + for chunk in in_elements.chunks: + out_chunk = cls._new_chunk(op, chunk, [chunk]) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_elements.params + params["nsplits"] = in_elements.nsplits + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _new_chunk(cls, op, chunk, input_chunks): + out_elements = op.outputs[0] + chunk_op = op.copy().reset_key() + if out_elements.ndim == 1: + out_chunk = chunk_op.new_chunk( + input_chunks, + shape=chunk.shape, + dtype=out_elements.dtype, + index_value=chunk.index_value, + name=out_elements.name, + index=chunk.index, + ) + else: + chunk_dtypes = pd.Series( + [np.dtype(bool) for _ in chunk.dtypes], index=chunk.dtypes.index + ) + out_chunk = chunk_op.new_chunk( + input_chunks, + shape=chunk.shape, + index_value=chunk.index_value, + columns_value=chunk.columns_value, + dtypes=chunk_dtypes, + index=chunk.index, + ) + return out_chunk + + @classmethod + def execute(cls, ctx, op): + inputs_iter = iter(op.inputs) + elements = ctx[next(inputs_iter).key] + + if isinstance(op.values, dict): + values = dict() + for k, v in op.values.items(): + if isinstance(v, 
ENTITY_TYPE): + values[k] = ctx[next(inputs_iter).key] + else: + values[k] = v + else: + if isinstance(op.values, ENTITY_TYPE): + values = ctx[next(inputs_iter).key] + else: + values = op.values + + try: + ctx[op.outputs[0].key] = elements.isin(values) + except ValueError: + # buffer read-only + ctx[op.outputs[0].key] = elements.copy().isin(values.copy()) + + +def series_isin(elements, values): + """ + Whether elements in Series are contained in `values`. + + Return a boolean Series showing whether each element in the Series + matches an element in the passed sequence of `values` exactly. + + Parameters + ---------- + values : set or list-like + The sequence of values to test. Passing in a single string will + raise a ``TypeError``. Instead, turn a single string into a + list of one element. + + Returns + ------- + Series + Series of booleans indicating if each element is in values. + + Raises + ------ + TypeError + * If `values` is a string + + See Also + -------- + DataFrame.isin : Equivalent method on DataFrame. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(['lama', 'cow', 'lama', 'beetle', 'lama', + ... 'hippo'], name='animal') + >>> s.isin(['cow', 'lama']).execute() + 0 True + 1 True + 2 True + 3 False + 4 True + 5 False + Name: animal, dtype: bool + + Passing a single string as ``s.isin('lama')`` will raise an error. Use + a list of one element instead: + + >>> s.isin(['lama']).execute() + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False + Name: animal, dtype: bool + """ + if is_list_like(values): + values = list(values) + elif not isinstance(values, (SERIES_TYPE, TENSOR_TYPE, INDEX_TYPE)): + raise TypeError( + "only list-like objects are allowed to be passed to isin(), " + f"you passed a [{type(values)}]" + ) + op = DataFrameIsin(values=values) + return op(elements) + + +def df_isin(df, values): + """ + Whether each element in the DataFrame is contained in values. + + Parameters + ---------- + values : iterable, Series, DataFrame or dict + The result will only be true at a location if all the + labels match. If `values` is a Series, that's the index. If + `values` is a dict, the keys must be the column names, + which must match. If `values` is a DataFrame, + then both the index and column labels must match. + + Returns + ------- + DataFrame + DataFrame of booleans showing whether each element in the DataFrame + is contained in values. + + See Also + -------- + DataFrame.eq: Equality test for DataFrame. + Series.isin: Equivalent method on Series. + Series.str.contains: Test if pattern or regex is contained within a + string of a Series or Index. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, + ... index=['falcon', 'dog']) + >>> df.execute() + num_legs num_wings + falcon 2 2 + dog 4 0 + + When ``values`` is a list check whether every value in the DataFrame + is present in the list (which animals have 0 or 2 legs or wings) + + >>> df.isin([0, 2]).execute() + num_legs num_wings + falcon True True + dog False True + + When ``values`` is a dict, we can pass values to check for each + column separately: + + >>> df.isin({'num_wings': [0, 3]}).execute() + num_legs num_wings + falcon False False + dog False True + + When ``values`` is a Series or DataFrame the index and column must + match. Note that 'falcon' does not match based on the number of legs + in df2. + + >>> other = md.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]}, + ... 
index=['spider', 'falcon']) + >>> df.isin(other).execute() + num_legs num_wings + falcon True True + dog False False + """ + if is_list_like(values) and not isinstance(values, dict): + values = list(values) + elif not isinstance( + values, (SERIES_TYPE, DATAFRAME_TYPE, TENSOR_TYPE, INDEX_TYPE, dict) + ): + raise TypeError( + "only list-like objects or dict are allowed to be passed to isin(), " + f"you passed a [{type(values)}]" + ) + op = DataFrameIsin(values=values) + return op(df) diff --git a/python/xorbits/_mars/dataframe/base/map.py b/python/xorbits/_mars/dataframe/base/map.py new file mode 100644 index 000000000..f1b583f53 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/map.py @@ -0,0 +1,308 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from collections.abc import MutableMapping + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import AnyField, KeyField, StringField +from ...utils import enter_current_session, has_unknown_shape, quiet_stdio +from ..core import SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_series + + +class DataFrameMap(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.MAP + + _input = KeyField("input") + _arg = AnyField("arg") + _na_action = StringField("na_action") + + def __init__( + self, arg=None, na_action=None, output_types=None, memory_scale=None, **kw + ): + super().__init__( + _arg=arg, + _na_action=na_action, + _output_types=output_types, + _memory_scale=memory_scale, + **kw + ) + if not self.output_types: + self.output_types = [OutputType.series] + + @property + def input(self): + return self._input + + @property + def arg(self): + return self._arg + + @property + def na_action(self): + return self._na_action + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(inputs) == 2: + self._arg = self._inputs[1] + + def __call__(self, series, dtype, skip_infer=False): + if dtype is None and not skip_infer: + inferred_dtype = None + if callable(self._arg): + # arg is a function, try to inspect the signature + sig = inspect.signature(self._arg) + return_type = sig.return_annotation + if return_type is not inspect._empty: + inferred_dtype = np.dtype(return_type) + else: + try: + with quiet_stdio(): + # try to infer dtype by calling the function + inferred_dtype = ( + build_series(series) + .map(self._arg, na_action=self._na_action) + .dtype + ) + except: # noqa: E722 # nosec + pass + else: + if isinstance(self._arg, MutableMapping): + inferred_dtype = pd.Series(self._arg).dtype + else: + inferred_dtype = self._arg.dtype + if inferred_dtype is not None and np.issubdtype(inferred_dtype, np.number): + if np.issubdtype(inferred_dtype, np.inexact): + # for the inexact e.g. 
float + # we can make the decision, + # but for int, due to the nan which may occur, + # we cannot infer the dtype + dtype = inferred_dtype + else: + dtype = inferred_dtype + + if dtype is None: + if not skip_infer: + raise ValueError( + "cannot infer dtype, it needs to be specified manually for `map`" + ) + else: + dtype = np.int64 if dtype is int else dtype + dtype = np.dtype(dtype) + + inputs = [series] + if isinstance(self._arg, SERIES_TYPE): + inputs.append(self._arg) + + if isinstance(series, SERIES_TYPE): + return self.new_series( + inputs, + shape=series.shape, + dtype=dtype, + index_value=series.index_value, + name=series.name, + ) + else: + return self.new_index( + inputs, + shape=series.shape, + dtype=dtype, + index_value=series.index_value, + name=series.name, + ) + + @classmethod + def tile(cls, op): + in_series = op.input + out_series = op.outputs[0] + + arg = op.arg + if len(op.inputs) == 2: + # make sure arg has known shape when it's a md.Series + if has_unknown_shape(op.arg): + yield + arg = yield from recursive_tile(op.arg.rechunk(op.arg.shape)) + + out_chunks = [] + for chunk in in_series.chunks: + chunk_op = op.copy().reset_key() + chunk_op.tileable_op_key = op.key + chunk_inputs = [chunk] + if len(op.inputs) == 2: + chunk_inputs.append(arg.chunks[0]) + out_chunk = chunk_op.new_chunk( + chunk_inputs, + shape=chunk.shape, + dtype=out_series.dtype, + index_value=chunk.index_value, + name=out_series.name, + index=chunk.index, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_series.params + params["chunks"] = out_chunks + params["nsplits"] = in_series.nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op): + series = ctx[op.inputs[0].key] + out = op.outputs[0] + if len(op.inputs) == 2: + arg = ctx[op.inputs[1].key] + else: + arg = op.arg + + ret = series.map(arg, na_action=op.na_action) + if ret.dtype != out.dtype: + ret = ret.astype(out.dtype) + ctx[out.key] = ret + + +def series_map( + series, arg, na_action=None, dtype=None, memory_scale=None, skip_infer=False +): + """ + Map values of Series according to input correspondence. + + Used for substituting each value in a Series with another value, + that may be derived from a function, a ``dict`` or + a :class:`Series`. + + Parameters + ---------- + arg : function, collections.abc.Mapping subclass or Series + Mapping correspondence. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NaN values, without passing them to the + mapping correspondence. + dtype : np.dtype, default None + Specify return type of the function. Must be specified when + we cannot decide the return type of the function. + memory_scale : float + Specify the scale of memory uses in the function versus + input size. + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified + + Returns + ------- + Series + Same index as caller. + + See Also + -------- + Series.apply : For applying more complex functions on a Series. + DataFrame.apply : Apply a function row-/column-wise. + DataFrame.applymap : Apply a function elementwise on a whole DataFrame. + + Notes + ----- + When ``arg`` is a dictionary, values in Series that are not in the + dictionary (as keys) are converted to ``NaN``. However, if the + dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e. + provides a method for default values), then this default is used + rather than ``NaN``. 
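A tiny plain-pandas sketch of the `__missing__` note above (the class name and values are illustrative): keys absent from a plain dict map to NaN, but a dict subclass that defines `__missing__` supplies its default instead.

import pandas as pd

class WithDefault(dict):
    def __missing__(self, key):
        return "unknown"

s = pd.Series(["cat", "dog", "rabbit"])
print(s.map({"cat": "kitten"}).tolist())               # ['kitten', nan, nan]
print(s.map(WithDefault({"cat": "kitten"})).tolist())  # ['kitten', 'unknown', 'unknown']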
+ + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> s = md.Series(['cat', 'dog', mt.nan, 'rabbit']) + >>> s.execute() + 0 cat + 1 dog + 2 NaN + 3 rabbit + dtype: object + + ``map`` accepts a ``dict`` or a ``Series``. Values that are not found + in the ``dict`` are converted to ``NaN``, unless the dict has a default + value (e.g. ``defaultdict``): + + >>> s.map({'cat': 'kitten', 'dog': 'puppy'}).execute() + 0 kitten + 1 puppy + 2 NaN + 3 NaN + dtype: object + + It also accepts a function: + + >>> s.map('I am a {}'.format).execute() + 0 I am a cat + 1 I am a dog + 2 I am a nan + 3 I am a rabbit + dtype: object + + To avoid applying the function to missing values (and keep them as + ``NaN``) ``na_action='ignore'`` can be used: + + >>> s.map('I am a {}'.format, na_action='ignore').execute() + 0 I am a cat + 1 I am a dog + 2 NaN + 3 I am a rabbit + dtype: object + """ + op = DataFrameMap(arg=arg, na_action=na_action, memory_scale=memory_scale) + return op(series, dtype=dtype, skip_infer=skip_infer) + + +def index_map( + idx, mapper, na_action=None, dtype=None, memory_scale=None, skip_infer=False +): + """ + Map values using input correspondence (a dict, Series, or function). + + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping correspondence. + dtype : np.dtype, default None + Specify return type of the function. Must be specified when + we cannot decide the return type of the function. + memory_scale : float + Specify the scale of memory uses in the function versus + input size. + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified + + + Returns + ------- + applied : Union[Index, MultiIndex], inferred + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + """ + op = DataFrameMap(arg=mapper, na_action=na_action, memory_scale=memory_scale) + return op(idx, dtype=dtype, skip_infer=skip_infer) diff --git a/python/xorbits/_mars/dataframe/base/map_chunk.py b/python/xorbits/_mars/dataframe/base/map_chunk.py new file mode 100644 index 000000000..eff7185da --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/map_chunk.py @@ -0,0 +1,433 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import CHUNK_TYPE, ENTITY_TYPE, get_output_types, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FunctionField, + KeyField, + StringField, + TupleField, +) +from ...utils import ( + enter_current_session, + find_objects, + has_unknown_shape, + quiet_stdio, + replace_objects, +) +from ..operands import DataFrameOperand, DataFrameOperandMixin, OutputType +from ..utils import ( + build_df, + build_empty_df, + build_empty_series, + build_series, + clean_up_func, + parse_index, + restore_func, + validate_output_types, +) + + +class DataFrameMapChunk(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.MAP_CHUNK + + _input = KeyField("input") + _func = FunctionField("func") + _args = TupleField("args") + _kwargs = DictField("kwargs") + _with_chunk_index = BoolField("with_chunk_index") + _logic_key = StringField("logic_key") + _func_key = AnyField("func_key") + _need_clean_up_func = BoolField("need_clean_up_func") + + def __init__( + self, + input=None, + func=None, + args=None, + kwargs=None, + output_types=None, + with_chunk_index=None, + logic_key=None, + func_key=None, + need_clean_up_func=False, + **kw, + ): + super().__init__( + _input=input, + _func=func, + _args=args, + _kwargs=kwargs, + _output_types=output_types, + _with_chunk_index=with_chunk_index, + _logic_key=logic_key, + _func_key=func_key, + _need_clean_up_func=need_clean_up_func, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def func(self): + return self._func + + @func.setter + def func(self, func): + self._func = func + + @property + def logic_key(self): + return self._logic_key + + @logic_key.setter + def logic_key(self, logic_key): + self._logic_key = logic_key + + @property + def func_key(self): + return self._func_key + + @func_key.setter + def func_key(self, func_key): + self._func_key = func_key + + @property + def need_clean_up_func(self): + return self._need_clean_up_func + + @need_clean_up_func.setter + def need_clean_up_func(self, need_clean_up_func: bool): + self._need_clean_up_func = need_clean_up_func + + @property + def args(self): + return self._args + + @property + def kwargs(self): + return self._kwargs + + @property + def with_chunk_index(self): + return self._with_chunk_index + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + old_inputs = find_objects(self._args, ENTITY_TYPE) + find_objects( + self._kwargs, ENTITY_TYPE + ) + mapping = {o: n for o, n in zip(old_inputs, self._inputs[1:])} + self._args = replace_objects(self._args, mapping) + self._kwargs = replace_objects(self._kwargs, mapping) + self._input = self._inputs[0] + + def _infer_attrs_by_call(self, df_or_series): + test_obj = ( + build_df(df_or_series, size=2) + if df_or_series.ndim == 2 + else build_series(df_or_series, size=2, name=df_or_series.name) + ) + kwargs = self.kwargs or dict() + if self.with_chunk_index: + kwargs["chunk_index"] = (0,) * df_or_series.ndim + with np.errstate(all="ignore"), quiet_stdio(): + obj = self._func(test_obj, *self._args, **kwargs) + + if obj.ndim == 2: + output_type = OutputType.dataframe + dtypes = obj.dtypes + if obj.shape == test_obj.shape: + shape = (df_or_series.shape[0], len(dtypes)) + else: # pragma: no cover + shape = (np.nan, len(dtypes)) + else: + output_type = OutputType.series + dtypes = pd.Series([obj.dtype], name=obj.name) + if obj.shape == test_obj.shape: + shape = df_or_series.shape + else: + shape = 
(np.nan,) + + index_value = parse_index( + obj.index, df_or_series, self._func, self._args, self._kwargs + ) + return { + "output_type": output_type, + "index_value": index_value, + "shape": shape, + "dtypes": dtypes, + } + + def __call__(self, df_or_series, index=None, dtypes=None): + output_type = ( + self.output_types[0] + if self.output_types + else get_output_types(df_or_series)[0] + ) + shape = self._kwargs.pop("shape", None) + + if output_type == OutputType.df_or_series: + return self.new_df_or_series([df_or_series]) + elif dtypes is not None: + index = index if index is not None else pd.RangeIndex(-1) + index_value = parse_index( + index, df_or_series, self._func, self._args, self._kwargs + ) + if shape is None: # pragma: no branch + shape = ( + (np.nan,) + if output_type == OutputType.series + else (np.nan, len(dtypes)) + ) + else: + # try run to infer meta + try: + attrs = self._infer_attrs_by_call(df_or_series) + output_type = attrs["output_type"] + index_value = attrs["index_value"] + shape = attrs["shape"] + dtypes = attrs["dtypes"] + except: # noqa: E722 # nosec + raise TypeError( + "Cannot determine `output_type`, " + "you have to specify it as `dataframe` or `series`, " + "for dataframe, `dtypes` is required as well " + "if output_type='dataframe'" + ) + + inputs = ( + [df_or_series] + + find_objects(self.args, ENTITY_TYPE) + + find_objects(self.kwargs, ENTITY_TYPE) + ) + if output_type == OutputType.series: + return self.new_series( + inputs, + dtype=dtypes.iloc[0], + shape=shape, + index_value=index_value, + name=dtypes.name, + ) + else: + # dataframe + columns_value = parse_index(dtypes.index, store_data=True) + return self.new_dataframe( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op: "DataFrameMapChunk"): + clean_up_func(op) + inp = op.input + out = op.outputs[0] + out_type = op.output_types[0] + + if inp.ndim == 2 and inp.chunk_shape[1] > 1: + if has_unknown_shape(inp): + yield + # if input is a DataFrame, make sure 1 chunk on axis columns + inp = yield from recursive_tile(inp.rechunk({1: inp.shape[1]})) + arg_input_chunks = [] + for other_inp in op.inputs[1:]: + other_inp = yield from recursive_tile(other_inp.rechunk(other_inp.shape)) + arg_input_chunks.append(other_inp.chunks[0]) + + out_chunks = [] + if out_type == OutputType.dataframe: + nsplits = [[], [out.shape[1]]] + pd_out_index = out.index_value.to_pandas() + elif out_type == OutputType.series: + nsplits = [[]] + pd_out_index = out.index_value.to_pandas() + else: + # DataFrameOrSeries + nsplits = None + pd_out_index = None + for chunk in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op.tileable_op_key = op.key + if out_type == OutputType.df_or_series: + if inp.ndim == 2: + collapse_axis = 1 + else: + collapse_axis = None + out_chunks.append( + chunk_op.new_chunk( + [chunk], index=chunk.index, collapse_axis=collapse_axis + ) + ) + elif out_type == OutputType.dataframe: + if np.isnan(out.shape[0]): + shape = (np.nan, out.shape[1]) + else: + shape = (chunk.shape[0], out.shape[1]) + index_value = parse_index(pd_out_index, chunk, op.key) + out_chunk = chunk_op.new_chunk( + [chunk] + arg_input_chunks, + shape=shape, + dtypes=out.dtypes, + index_value=index_value, + columns_value=out.columns_value, + index=(chunk.index[0], 0), + ) + out_chunks.append(out_chunk) + nsplits[0].append(out_chunk.shape[0]) + else: + if np.isnan(out.shape[0]): + shape = (np.nan,) + else: + shape = (chunk.shape[0],) + index_value = 
parse_index(pd_out_index, chunk, op.key) + out_chunk = chunk_op.new_chunk( + [chunk] + arg_input_chunks, + shape=shape, + index_value=index_value, + name=out.name, + dtype=out.dtype, + index=(chunk.index[0],), + ) + out_chunks.append(out_chunk) + nsplits[0].append(out_chunk.shape[0]) + + params = out.params + params["nsplits"] = tuple(tuple(ns) for ns in nsplits) if nsplits else nsplits + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op: "DataFrameMapChunk"): + restore_func(ctx, op) + inp = ctx[op.input.key] + out = op.outputs[0] + if len(inp) == 0: + if op.output_types[0] == OutputType.dataframe: + ctx[out.key] = build_empty_df(out.dtypes) + elif op.output_types[0] == OutputType.series: + ctx[out.key] = build_empty_series(out.dtype, name=out.name) + else: + raise ValueError(f"Chunk can not be empty except for dataframe/series.") + return + + kwargs = op.kwargs or dict() + if op.with_chunk_index: + kwargs["chunk_index"] = out.index + args = op.args or tuple() + chunks = find_objects(args, CHUNK_TYPE) + find_objects(kwargs, CHUNK_TYPE) + mapping = {chunk: ctx[chunk.key] for chunk in chunks} + args = replace_objects(args, mapping) + kwargs = replace_objects(kwargs, mapping) + ctx[out.key] = op.func(inp, *args, **kwargs) + + +def map_chunk(df_or_series, func, args=(), kwargs=None, skip_infer=False, **kw): + """ + Apply function to each chunk. + + Parameters + ---------- + func : function + Function to apply to each chunk. + args : tuple + Positional arguments to pass to func in addition to the array/series. + kwargs: Dict + Additional keyword arguments to pass as keywords arguments to func. + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + Returns + ------- + Series or DataFrame + Result of applying ``func`` to each chunk of the DataFrame or Series. + + See Also + -------- + DataFrame.apply : Perform any type of operations. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame([[4, 9]] * 3, columns=['A', 'B']) + >>> df.execute() + A B + 0 4 9 + 1 4 9 + 2 4 9 + + Output type including Series or DataFrame will be auto inferred. + + >>> df.map_chunk(lambda c: c['A'] + c['B']).execute() + 0 13 + 1 13 + 2 13 + dtype: int64 + + You can specify ``output_type`` by yourself if auto infer failed. + + >>> import pandas as pd + >>> import numpy as np + >>> df['c'] = ['s1', 's2', 's3'] + >>> df.map_chunk(lambda c: pd.concat([c['A'], c['c'].str.slice(1).astype(int)], axis=1)).execute() + Traceback (most recent call last): + TypeError: Cannot determine `output_type`, you have to specify it as `dataframe` or `series`... 
+ >>> df.map_chunk(lambda c: pd.concat([c['A'], c['c'].str.slice(1).astype(int)], axis=1), + >>> output_type='dataframe', dtypes=pd.Series([np.dtype(object), np.dtype(int)])).execute() + A c + 0 4 1 + 1 4 2 + 2 4 3 + """ + output_type = kw.pop("output_type", None) + output_types = kw.pop("output_types", None) + object_type = kw.pop("object_type", None) + output_types = validate_output_types( + output_type=output_type, output_types=output_types, object_type=object_type + ) + output_type = output_types[0] if output_types else None + if output_type: + output_types = [output_type] + elif skip_infer: + output_types = [OutputType.df_or_series] + index = kw.pop("index", None) + dtypes = kw.pop("dtypes", None) + with_chunk_index = kw.pop("with_chunk_index", False) + if kw: # pragma: no cover + raise TypeError(f"Unknown kwargs: {kw}") + + op = DataFrameMapChunk( + input=df_or_series, + func=func, + args=args, + kwargs=kwargs or {}, + output_types=output_types, + with_chunk_index=with_chunk_index, + ) + return op(df_or_series, index=index, dtypes=dtypes) diff --git a/python/xorbits/_mars/dataframe/base/melt.py b/python/xorbits/_mars/dataframe/base/melt.py new file mode 100644 index 000000000..21eba0d9d --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/melt.py @@ -0,0 +1,247 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, StringField +from ...utils import calc_nsplits +from ..operands import DataFrameOperand, DataFrameOperandMixin, OutputType +from ..utils import build_empty_df, parse_index, standardize_range_index + + +class DataFrameMelt(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.MELT + + _id_vars = AnyField("id_vars") + _value_vars = AnyField("value_vars") + _var_name = StringField("var_name") + _value_name = StringField("value_name") + _col_level = AnyField("col_level") + + def __init__( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name=None, + col_level=None, + **kw + ): + super().__init__( + _id_vars=id_vars, + _value_vars=value_vars, + _var_name=var_name, + _value_name=value_name, + _col_level=col_level, + **kw + ) + + @property + def id_vars(self): + return self._id_vars + + @property + def value_vars(self): + return self._value_vars + + @property + def var_name(self): + return self._var_name + + @property + def value_name(self): + return self._value_name + + @property + def col_level(self): + return self._col_level + + def __call__(self, df): + empty_result = build_empty_df(df.dtypes).melt( + id_vars=self.id_vars, + value_vars=self.value_vars, + var_name=self.var_name, + value_name=self.value_name, + col_level=self.col_level, + ) + self._output_types = [OutputType.dataframe] + return self.new_tileable( + [df], + shape=(np.nan, len(empty_result.columns)), + dtypes=empty_result.dtypes, + index_value=parse_index(pd.RangeIndex(-1), df.key, df.index_value.key), + columns_value=parse_index(empty_result.columns, store_data=True), + ) + + @classmethod + def tile(cls, op: "DataFrameMelt"): + inp = op.inputs[0] + out = op.outputs[0] + + inp = yield from recursive_tile(inp.rechunk({1: (inp.shape[1],)})) + + chunks = [] + for c in inp.chunks: + new_op = op.copy().reset_key() + chunks.append( + new_op.new_chunk( + [c], + index=c.index, + shape=(np.nan, out.shape[1]), + dtypes=out.dtypes, + index_value=parse_index( + pd.RangeIndex(-1), c.key, c.index_value.key + ), + columns_value=out.columns_value, + ) + ) + + yield chunks + chunks = standardize_range_index(chunks) + new_op = op.copy().reset_key() + return new_op.new_tileables( + [inp], + chunks=chunks, + nsplits=calc_nsplits({c.index: c.shape for c in chunks}), + **out.params + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameMelt"): + in_data = ctx[op.inputs[0].key] + ctx[op.outputs[0].key] = in_data.melt( + id_vars=op.id_vars, + value_vars=op.value_vars, + var_name=op.var_name, + value_name=op.value_name, + col_level=op.col_level, + ) + + +def melt( + frame, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, +): + """ + Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + .. versionadded:: 0.20.0 + + Parameters + ---------- + id_vars : tuple, list, or ndarray, optional + Column(s) to use as identifier variables. + value_vars : tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name : scalar + Name to use for the 'variable' column. 
If None it uses + ``frame.columns.name`` or 'variable'. + value_name : scalar, default 'value' + Name to use for the 'value' column. + col_level : int or str, optional + If columns are a MultiIndex then use this level to melt. + + Returns + ------- + DataFrame + Unpivoted DataFrame. + + See Also + -------- + melt + pivot_table + DataFrame.pivot + Series.explode + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, + ... 'B': {0: 1, 1: 3, 2: 5}, + ... 'C': {0: 2, 1: 4, 2: 6}}) + >>> df.execute() + A B C + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> df.melt(id_vars=['A'], value_vars=['B']).execute() + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> df.melt(id_vars=['A'], value_vars=['B', 'C']).execute() + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 3 a C 2 + 4 b C 4 + 5 c C 6 + + The names of 'variable' and 'value' columns can be customized: + + >>> df.melt(id_vars=['A'], value_vars=['B'], + ... var_name='myVarname', value_name='myValname').execute() + A myVarname myValname + 0 a B 1 + 1 b B 3 + 2 c B 5 + + If you have multi-index columns: + + >>> df = md.DataFrame({('A', 'D'): {0: 'a', 1: 'b', 2: 'c'}, + ... ('B', 'E'): {0: 1, 1: 3, 2: 5}, + ... ('C', 'F'): {0: 2, 1: 4, 2: 6}}) + >>> df.execute() + A B C + D E F + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> df.melt(col_level=0, id_vars=['A'], value_vars=['B']).execute() + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> df.melt(id_vars=[('A', 'D')], value_vars=[('B', 'E')]).execute() + (A, D) variable_0 variable_1 value + 0 a B E 1 + 1 b B E 3 + 2 c B E 5 + """ + op = DataFrameMelt( + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ) + return op(frame) diff --git a/python/xorbits/_mars/dataframe/base/memory_usage.py b/python/xorbits/_mars/dataframe/base/memory_usage.py new file mode 100644 index 000000000..75de25932 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/memory_usage.py @@ -0,0 +1,501 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +from functools import reduce + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...config import options +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField, Int64Field +from ...utils import ceildiv, lazy_import +from ..core import IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +cudf = lazy_import("cudf") + + +class DataFrameMemoryUsage(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.MEMORY_USAGE + + # raw arguments of memory_usage method + _index = BoolField("index") + _deep = BoolField("deep") + + # size of range index, when set, the value will be prepended to the result series + # if the input is a dataframe, or added to the result when the input is a series + _range_index_size = Int64Field("range_index_size") + + def __init__(self, index=None, deep=None, range_index_size=None, **kw): + super().__init__( + _index=index, _deep=deep, _range_index_size=range_index_size, **kw + ) + + @property + def index(self) -> bool: + return self._index + + @index.setter + def index(self, value: bool): + self._index = value + + @property + def deep(self) -> bool: + return self._deep + + @property + def range_index_size(self) -> int: + return self._range_index_size + + @range_index_size.setter + def range_index_size(self, value: int): + self._range_index_size = value + + def _adapt_index(self, input_index, index=0): + """ + When ``index=True`` is passed, an extra column will be prepended to the result series + Thus we need to update the index of initial chunk for returned dataframe chunks + """ + if not self.index or index != 0: + return input_index + idx_data = input_index.to_pandas().insert(0, "Index") + return parse_index(idx_data, store_data=True) + + def _adapt_nsplits(self, input_nsplit): + """ + When ``index=True`` is passed, the size of returned series is one element larger + than the number of columns, which affects ``nsplits``. 
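+
+ A worked example (derived directly from the logic below): if the input
+ ``nsplits`` are ``((10,), (2, 3))``, the returned value is ``((3, 3),)``
+ when ``index=True`` and ``((2, 3),)`` otherwise.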
+ """ + if not self.index: + return (input_nsplit[-1],) + nsplits_list = list(input_nsplit[-1]) + nsplits_list[0] += 1 + return (tuple(nsplits_list),) + + def __call__(self, df_or_series): + """ + Return output object of memory_usage() call + """ + if df_or_series.ndim == 1: + # the input data is a series, a Scalar will be returned + return self.new_scalar([df_or_series], dtype=np.dtype(np.int_)) + else: + # the input data is a DataFrame, a Scalar will be returned + # calculate shape of returning series given ``op.index`` + new_shape = ( + (df_or_series.shape[-1] + 1,) + if self.index + else (df_or_series.shape[-1],) + ) + return self.new_series( + [df_or_series], + index_value=self._adapt_index(df_or_series.columns_value), + shape=new_shape, + dtype=np.dtype(np.int_), + ) + + @classmethod + def _tile_single(cls, op: "DataFrameMemoryUsage"): + """ + Tile when input data has only one chunk on rows + """ + df_or_series = op.inputs[0] + output = op.outputs[0] + + chunks = [] + for c in df_or_series.chunks: + new_op = op.copy().reset_key() + if c.ndim == 1: + # Tile for series + chunks.append( + new_op.new_chunk([c], index=c.index, dtype=output.dtype, shape=()) + ) + else: + # tile for dataframes + # only calculate with index=True on the initial chunk + new_op.index = op.index and c.index[-1] == 0 + + # calculate shape of returning chunk given ``op.index`` + new_shape = ( + (c.shape[-1] + 1,) + if c.index[-1] == 0 and op.index + else (c.shape[-1],) + ) + chunks.append( + new_op.new_chunk( + [c], + shape=new_shape, + dtype=output.dtype, + index=(c.index[-1],), + index_value=op._adapt_index(c.columns_value, c.index[-1]), + ) + ) + + new_op = op.copy().reset_key() + # return objects with chunks and nsplits (if needed) + if df_or_series.ndim == 1: + return new_op.new_scalar([df_or_series], dtype=output.dtype, chunks=chunks) + else: + return new_op.new_series( + [df_or_series], + shape=output.shape, + dtype=output.dtype, + index_value=output.index_value, + chunks=chunks, + nsplits=op._adapt_nsplits(df_or_series.nsplits), + ) + + @classmethod + def _tile_dataframe(cls, op: "DataFrameMemoryUsage"): + """ + Tile dataframes using tree reduction + """ + df = op.inputs[0] + output = op.outputs[0] + is_range_index = isinstance(df.index_value.value, IndexValue.RangeIndex) + + # produce map chunks + # allocate matrix of chunks + chunks_to_reduce = np.empty(shape=df.chunk_shape, dtype=object) + for c in df.chunks: + new_op = op.copy().reset_key() + new_op.stage = OperandStage.map + + if op.index and is_range_index: + # when the index is ``pd.RangeIndex``, the size should be included + # after all computations are done + new_op.index = False + else: + # when the chunk is not the first chunk in the row, index size is not needed + new_op.index = op.index and c.index[-1] == 0 + + new_shape = ( + (c.shape[-1] + 1,) if c.index[-1] == 0 and op.index else (c.shape[-1],) + ) + + chunks_to_reduce[c.index] = new_op.new_chunk( + [c], + index=(c.index[-1],), + dtype=output.dtype, + shape=new_shape, + index_value=op._adapt_index(c.columns_value, c.index[-1]), + ) + + # reduce chunks using tree reduction + combine_size = options.combine_size + while chunks_to_reduce.shape[0] > 1: + # allocate matrix of chunks + new_chunks_to_reduce = np.empty( + ( + ceildiv(chunks_to_reduce.shape[0], combine_size), + chunks_to_reduce.shape[1], + ), + dtype=object, + ) + for idx in range(0, chunks_to_reduce.shape[0], combine_size): + for idx2 in range(chunks_to_reduce.shape[1]): + new_op = op.copy().reset_key() + new_op.stage = 
OperandStage.reduce + chunks = list(chunks_to_reduce[idx : idx + combine_size, idx2]) + + new_chunks_to_reduce[idx // combine_size, idx2] = new_op.new_chunk( + chunks, + index=(idx2,), + dtype=output.dtype, + shape=chunks[0].shape, + index_value=chunks[0].index_value, + ) + + chunks_to_reduce = new_chunks_to_reduce + + # handle RangeIndex at final outputs + if op.index and is_range_index: + chunks_to_reduce[ + 0, 0 + ].op.range_index_size = df.index_value.to_pandas().memory_usage() + + # return series with chunks and nsplits + new_op = op.copy().reset_key() + return new_op.new_series( + [df], + dtype=output.dtype, + shape=output.shape, + index_value=output.index_value, + chunks=list(chunks_to_reduce[0, :]), + nsplits=op._adapt_nsplits(df.nsplits), + ) + + @classmethod + def _tile_series(cls, op: "DataFrameMemoryUsage"): + """ + Tile series using tree reduction + """ + series = op.inputs[0] + output = op.outputs[0] + is_range_index = isinstance(series.index_value.value, IndexValue.RangeIndex) + + chunks_to_reduce = [] + for c in series.chunks: + new_op = op.copy().reset_key() + new_op.stage = OperandStage.map + + # when the index is ``pd.RangeIndex``, the size should be included + # after all computations are done + new_op.index = op.index and not is_range_index + + chunks_to_reduce.append( + new_op.new_chunk([c], index=c.index, dtype=output.dtype, shape=()) + ) + + # reduce chunks using tree reduction + combine_size = options.combine_size + while len(chunks_to_reduce) > 1: + new_chunks_to_reduce = [] + for idx in range(0, len(chunks_to_reduce), combine_size): + new_op = op.copy().reset_key() + new_op.stage = OperandStage.reduce + + new_chunks_to_reduce.append( + new_op.new_chunk( + chunks_to_reduce[idx : idx + combine_size], + shape=(), + index=(0,), + dtype=output.dtype, + ) + ) + + chunks_to_reduce = new_chunks_to_reduce + + # handle RangeIndex at final outputs + if op.index and is_range_index: + chunks_to_reduce[ + 0 + ].op.range_index_size = series.index_value.to_pandas().memory_usage() + + # return series with chunks + new_op = op.copy().reset_key() + return new_op.new_scalar([series], dtype=output.dtype, chunks=chunks_to_reduce) + + @classmethod + def tile(cls, op: "DataFrameMemoryUsage"): + df_or_series = op.inputs[0] + if ( + df_or_series.chunk_shape[0] == 1 + ): # only one chunk in row, no aggregation needed + return cls._tile_single(op) + elif df_or_series.ndim == 1: # series + return cls._tile_series(op) + else: # dataframe + return cls._tile_dataframe(op) + + @classmethod + def execute(cls, ctx, op: "DataFrameMemoryUsage"): + in_data = ctx[op.inputs[0].key] + # choose correct dataframe library + xdf = cudf if op.gpu else pd + + if op.stage == OperandStage.reduce: + result = reduce(operator.add, (ctx[c.key] for c in op.inputs)) + if op.range_index_size is not None: + if hasattr(in_data, "ndim"): + # dataframe input: prepend index size column + prepend_series = xdf.Series( + [op.range_index_size], index=["Index"], dtype=result.dtype + ) + result = xdf.concat([prepend_series, result]) + else: + # series input: add index size to the output + result += op.range_index_size + ctx[op.outputs[0].key] = result + elif isinstance(in_data, xdf.Index): + ctx[op.outputs[0].key] = in_data.memory_usage(deep=op.deep) + else: + ctx[op.outputs[0].key] = in_data.memory_usage(index=op.index, deep=op.deep) + + +def df_memory_usage(df, index=True, deep=False): + """ + Return the memory usage of each column in bytes. 
+ + The memory usage can optionally include the contribution of + the index and elements of `object` dtype. + + This value is displayed in `DataFrame.info` by default. This can be + suppressed by setting ``pandas.options.display.memory_usage`` to False. + + Parameters + ---------- + index : bool, default True + Specifies whether to include the memory usage of the DataFrame's + index in returned Series. If ``index=True``, the memory usage of + the index is the first item in the output. + deep : bool, default False + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned values. + + Returns + ------- + Series + A Series whose index is the original column names and whose values + is the memory usage of each column in bytes. + + See Also + -------- + numpy.ndarray.nbytes : Total bytes consumed by the elements of an + ndarray. + Series.memory_usage : Bytes consumed by a Series. + Categorical : Memory-efficient array for string values with + many repeated values. + DataFrame.info : Concise summary of a DataFrame. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] + >>> data = dict([(t, mt.ones(shape=5000).astype(t)) + ... for t in dtypes]) + >>> df = md.DataFrame(data) + >>> df.head().execute() + int64 float64 complex128 object bool + 0 1 1.0 1.000000+0.000000j 1 True + 1 1 1.0 1.000000+0.000000j 1 True + 2 1 1.0 1.000000+0.000000j 1 True + 3 1 1.0 1.000000+0.000000j 1 True + 4 1 1.0 1.000000+0.000000j 1 True + + >>> df.memory_usage().execute() + Index 128 + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + + >>> df.memory_usage(index=False).execute() + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + + The memory footprint of `object` dtype columns is ignored by default: + + >>> df.memory_usage(deep=True).execute() + Index 128 + int64 40000 + float64 40000 + complex128 80000 + object 160000 + bool 5000 + dtype: int64 + + Use a Categorical for efficient storage of an object-dtype column with + many repeated values. + + >>> df['object'].astype('category').memory_usage(deep=True).execute() + 5216 + """ + op = DataFrameMemoryUsage(index=index, deep=deep) + return op(df) + + +def series_memory_usage(series, index=True, deep=False): + """ + Return the memory usage of the Series. + + The memory usage can optionally include the contribution of + the index and of elements of `object` dtype. + + Parameters + ---------- + index : bool, default True + Specifies whether to include the memory usage of the Series index. + deep : bool, default False + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned value. + + Returns + ------- + int + Bytes of memory consumed. + + See Also + -------- + numpy.ndarray.nbytes : Total bytes consumed by the elements of the + array. + DataFrame.memory_usage : Bytes consumed by a DataFrame. 
+ + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(range(3)) + >>> s.memory_usage().execute() + 152 + + Not including the index gives the size of the rest of the data, which + is necessarily smaller: + + >>> s.memory_usage(index=False).execute() + 24 + + The memory footprint of `object` values is ignored by default: + + >>> s = md.Series(["a", "b"]) + >>> s.values.execute() + array(['a', 'b'], dtype=object) + + >>> s.memory_usage().execute() + 144 + + >>> s.memory_usage(deep=True).execute() + 260 + """ + op = DataFrameMemoryUsage(index=index, deep=deep) + return op(series) + + +def index_memory_usage(index, deep=False): + """ + Memory usage of the values. + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption. + + Returns + ------- + bytes used + + See Also + -------- + numpy.ndarray.nbytes + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False + """ + op = DataFrameMemoryUsage(index=False, deep=deep) + return op(index) diff --git a/python/xorbits/_mars/dataframe/base/pct_change.py b/python/xorbits/_mars/dataframe/base/pct_change.py new file mode 100644 index 000000000..ec960adb7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/pct_change.py @@ -0,0 +1,150 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import validate_axis + + +def pct_change( + df_or_series, periods=1, fill_method="pad", limit=None, freq=None, **kwargs +): + """ + Percentage change between the current and a prior element. + + Computes the percentage change from the immediately previous row by + default. This is useful in comparing the percentage of change in a time + series of elements. + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming percent change. + fill_method : str, default 'pad' + How to handle NAs before computing percent changes. + limit : int, default None + The number of consecutive NAs to fill before stopping. + freq : DateOffset, timedelta, or str, optional + Increment to use from time series API (e.g. 'M' or BDay()). + **kwargs + Additional keyword arguments are passed into + `DataFrame.shift` or `Series.shift`. + + Returns + ------- + chg : Series or DataFrame + The same type as the calling object. + + See Also + -------- + Series.diff : Compute the difference of two elements in a Series. + DataFrame.diff : Compute the difference of two elements in a DataFrame. + Series.shift : Shift the index by some number of periods. + DataFrame.shift : Shift the index by some number of periods. 
+ + Examples + -------- + **Series** + + >>> import mars.dataframe as md + + >>> s = md.Series([90, 91, 85]) + >>> s.execute() + 0 90 + 1 91 + 2 85 + dtype: int64 + + >>> s.pct_change().execute() + 0 NaN + 1 0.011111 + 2 -0.065934 + dtype: float64 + + >>> s.pct_change(periods=2).execute() + 0 NaN + 1 NaN + 2 -0.055556 + dtype: float64 + + See the percentage change in a Series where filling NAs with last + valid observation forward to next valid. + + >>> s = md.Series([90, 91, None, 85]) + >>> s.execute() + 0 90.0 + 1 91.0 + 2 NaN + 3 85.0 + dtype: float64 + + >>> s.pct_change(fill_method='ffill').execute() + 0 NaN + 1 0.011111 + 2 0.000000 + 3 -0.065934 + dtype: float64 + + **DataFrame** + + Percentage change in French franc, Deutsche Mark, and Italian lira from + 1980-01-01 to 1980-03-01. + + >>> df = md.DataFrame({ + ... 'FR': [4.0405, 4.0963, 4.3149], + ... 'GR': [1.7246, 1.7482, 1.8519], + ... 'IT': [804.74, 810.01, 860.13]}, + ... index=['1980-01-01', '1980-02-01', '1980-03-01']) + >>> df.execute() + FR GR IT + 1980-01-01 4.0405 1.7246 804.74 + 1980-02-01 4.0963 1.7482 810.01 + 1980-03-01 4.3149 1.8519 860.13 + + >>> df.pct_change().execute() + FR GR IT + 1980-01-01 NaN NaN NaN + 1980-02-01 0.013810 0.013684 0.006549 + 1980-03-01 0.053365 0.059318 0.061876 + + Percentage of change in GOOG and APPL stock volume. Shows computing + the percentage change between columns. + + >>> df = md.DataFrame({ + ... '2016': [1769950, 30586265], + ... '2015': [1500923, 40912316], + ... '2014': [1371819, 41403351]}, + ... index=['GOOG', 'APPL']) + >>> df.execute() + 2016 2015 2014 + GOOG 1769950 1500923 1371819 + APPL 30586265 40912316 41403351 + + >>> df.pct_change(axis='columns').execute() + 2016 2015 2014 + GOOG NaN -0.151997 -0.086016 + APPL NaN 0.337604 0.012002 + """ + + axis = validate_axis(kwargs.pop("axis", 0)) + if fill_method is None: + data = df_or_series + else: + data = df_or_series.fillna(method=fill_method, axis=axis, limit=limit) + + rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 + if freq is not None: + # Shift method is implemented differently when freq is not None + # We want to restore the original index + rs = rs.loc[~rs.index.duplicated()] + rs = rs.reindex_like(data) + return rs diff --git a/python/xorbits/_mars/dataframe/base/qcut.py b/python/xorbits/_mars/dataframe/base/qcut.py new file mode 100644 index 000000000..11435409e --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/qcut.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +from pandas.api.types import is_integer + +from ...core import ENTITY_TYPE +from ...tensor import tensor as astensor +from ...tensor.statistics.percentile import percentile +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..initializer import DataFrame, Series +from .cut import cut + + +def qcut(x, q, labels=None, retbins=False, precision=3, duplicate="raise"): + """ + Quantile-based discretization function. 
+ + Discretize variable into equal-sized buckets based on rank or based + on sample quantiles. For example 1000 values for 10 quantiles would + produce a Categorical object indicating quantile membership for each data point. + + Parameters + ---------- + x : 1d tensor or Series + q : int or list-like of float + Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately + array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. + labels : array or False, default None + Used as labels for the resulting bins. Must be of the same length as + the resulting bins. If False, return only integer indicators of the + bins. If True, raises an error. + retbins : bool, optional + Whether to return the (bins, labels) or not. Can be useful if bins + is given as a scalar. + precision : int, optional + The precision at which to store and display the bins labels. + duplicates : {default 'raise', 'drop'}, optional + If bin edges are not unique, raise ValueError or drop non-uniques. + + Returns + ------- + out : Categorical or Series or tensor of integers if labels is False + The return type (Categorical or Series) depends on the input: a Series + of type category if input is a Series else Categorical. Bins are + represented as categories when categorical data is returned. + bins : tensor of floats + Returned only if `retbins` is True. + + Notes + ----- + Out of bounds values will be NA in the resulting Categorical object + + Examples + -------- + >>> import mars.dataframe as md + >>> md.qcut(range(5), 4).execute() + ... # doctest: +ELLIPSIS + [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] + Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ... + + >>> md.qcut(range(5), 3, labels=["good", "medium", "bad"]).execute() + ... # doctest: +SKIP + [good, good, medium, bad, bad] + Categories (3, object): [good < medium < bad] + + >>> md.qcut(range(5), 4, labels=False).execute() + array([0, 0, 1, 2, 3]) + """ + if is_integer(q): + q = np.linspace(0, 1, q + 1) + + if isinstance(x, (DATAFRAME_TYPE, SERIES_TYPE, pd.DataFrame, pd.Series)): + x = DataFrame(x) if x.ndim == 2 else Series(x) + bins = x.quantile(q) + else: + x = astensor(x) + if isinstance(q, ENTITY_TYPE): + q = q * 100 + else: + q = [iq * 100 for iq in q] + bins = percentile(x, q) + + return cut( + x, + bins, + labels=labels, + retbins=retbins, + precision=precision, + include_lowest=True, + duplicates=duplicate, + ) diff --git a/python/xorbits/_mars/dataframe/base/rebalance.py b/python/xorbits/_mars/dataframe/base/rebalance.py new file mode 100644 index 000000000..2a2316a32 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/rebalance.py @@ -0,0 +1,112 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes +from ...serialization.serializables import Float64Field, Int64Field, KeyField +from ...tensor.base.rebalance import RebalanceMixin +from ..core import INDEX_TYPE +from ..initializer import DataFrame as asdataframe +from ..initializer import Index as asindex +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import validate_axis + + +class DataFrameRebalance(RebalanceMixin, DataFrameOperandMixin, DataFrameOperand): + _op_type_ = opcodes.REBALANCE + + _input = KeyField("input") + _factor = Float64Field("factor") + _axis = Int64Field("axis") + _num_partitions = Int64Field("num_partitions") + + def __init__( + self, + input=None, + factor=None, + axis=None, # pylint: disable=redefined-builtin + num_partitions=None, + output_types=None, + **kw + ): + super().__init__( + _input=input, + _factor=factor, + _axis=axis, + _num_partitions=num_partitions, + _output_types=output_types, + **kw + ) + + @property + def input(self): + return self._input + + @property + def factor(self): + return self._factor + + @property + def axis(self): + return self._axis + + @property + def num_partitions(self): + return self._num_partitions + + def _get_input_object(self): + in_obj = self.input + if isinstance(in_obj, INDEX_TYPE): + convert = asindex + else: + convert = asdataframe if in_obj.ndim == 2 else asseries + return convert(in_obj) + + +def rebalance( + df_or_series, factor=None, axis=0, num_partitions=None, reassign_worker=True +): + """ + Make Data more balanced across entire cluster. + + Parameters + ---------- + factor : float + Specified so that number of chunks after balance is + total CPU count of cluster * factor. + axis : int + The axis to rebalance. + num_partitions : int + Specified so the number of chunks are at most + num_partitions. + reassign_worker : bool + If True, workers will be reassigned. + + Returns + ------- + Series or DataFrame + Result of DataFrame or Series after rebalanced. + """ + axis = validate_axis(axis, df_or_series) + if num_partitions is None: + factor = factor if factor is not None else 1.2 + + op = DataFrameRebalance( + input=df_or_series, + factor=factor, + axis=axis, + num_partitions=num_partitions, + reassign_worker=reassign_worker, + ) + return op(df_or_series) diff --git a/python/xorbits/_mars/dataframe/base/rechunk.py b/python/xorbits/_mars/dataframe/base/rechunk.py new file mode 100644 index 000000000..a2f7c9642 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/rechunk.py @@ -0,0 +1,206 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import AnyField +from ...tensor.rechunk.core import chunk_size_type, gen_rechunk_infos, get_nsplits +from ...typing import TileableType +from ...utils import has_unknown_shape +from ..initializer import DataFrame as asdataframe +from ..initializer import Index as asindex +from ..initializer import Series as asseries +from ..operands import DATAFRAME_TYPE, DataFrameOperand, DataFrameOperandMixin +from ..utils import indexing_index_value, merge_index_value + + +class DataFrameRechunk(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.RECHUNK + + chunk_size = AnyField("chunk_size") + + def __call__(self, x): + if isinstance(x, DATAFRAME_TYPE): + return self.new_dataframe( + [x], + shape=x.shape, + dtypes=x.dtypes, + columns_value=x.columns_value, + index_value=x.index_value, + ) + else: + self.output_types = x.op.output_types + f = ( + self.new_series + if self.output_types[0] == OutputType.series + else self.new_index + ) + return f( + [x], + shape=x.shape, + dtype=x.dtype, + index_value=x.index_value, + name=x.name, + ) + + @classmethod + def tile(cls, op: "DataFrameRechunk"): + from ..indexing.iloc import ( + DataFrameIlocGetItem, + IndexIlocGetItem, + SeriesIlocGetItem, + ) + from ..merge.concat import DataFrameConcat + + if has_unknown_shape(*op.inputs): + yield + + out = op.outputs[0] + inp = op.inputs[0] + if inp.ndim == 2: + inp = asdataframe(inp) + elif inp.op.output_types[0] == OutputType.series: + inp = asseries(inp) + else: + inp = asindex(inp) + chunk_size = _get_chunk_size(inp, op.chunk_size) + if chunk_size == inp.nsplits: + return [inp] + + rechunk_infos = gen_rechunk_infos(inp, chunk_size) + out_chunks = [] + for rechunk_info in rechunk_infos: + chunk_index = rechunk_info.out_index + shape = rechunk_info.shape + inp_chunks = rechunk_info.input_chunks + inp_chunk_slices = rechunk_info.input_slices + inp_slice_chunks = [] + for inp_chunk, inp_chunk_slice in zip(inp_chunks, inp_chunk_slices): + if all(slc == slice(None) for slc in inp_chunk_slice): + inp_slice_chunks.append(inp_chunk) + else: + index_value = indexing_index_value( + inp_chunk.index_value, inp_chunk_slice[0], rechunk=True + ) + if inp_chunk.ndim == 1: + # Series or Index + slc_chunk_op_type = ( + SeriesIlocGetItem + if op.output_types[0] == OutputType.series + else IndexIlocGetItem + ) + slc_chunk = slc_chunk_op_type( + indexes=inp_chunk_slice, + output_types=op.output_types, + sparse=inp_chunk.op.sparse, + ).new_chunk( + [inp_chunk], + index_value=index_value, + dtype=inp_chunk.dtype, + name=inp_chunk.name, + index=inp_chunk.index, + ) + else: + # DataFrame + columns_value = indexing_index_value( + inp_chunk.columns_value, + inp_chunk_slice[1], + store_data=True, + rechunk=True, + ) + dtypes = inp_chunk.dtypes.iloc[inp_chunk_slice[1]] + slc_chunk = DataFrameIlocGetItem( + indexes=inp_chunk_slice, + output_types=[OutputType.dataframe], + sparse=inp_chunk.op.sparse, + ).new_chunk( + [inp_chunk], + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + index=inp_chunk.index, + ) + inp_slice_chunks.append(slc_chunk) + + chunk_shape = rechunk_info.input_chunk_shape + inp_chunks_arr = np.empty(chunk_shape, dtype=object) + inp_chunks_arr.ravel()[:] = inp_slice_chunks + params = dict(index=chunk_index, shape=shape) + if inp_chunks_arr.ndim == 1: + params["index_value"] = merge_index_value( + {i: c.index_value for i, c in enumerate(inp_chunks_arr)} + ) + params["dtype"] = 
inp_slice_chunks[0].dtype + params["name"] = inp_slice_chunks[0].name + else: + params["index_value"] = merge_index_value( + {i: c.index_value for i, c in enumerate(inp_chunks_arr[:, 0])} + ) + params["columns_value"] = merge_index_value( + {i: c.columns_value for i, c in enumerate(inp_chunks_arr[0])}, + store_data=True, + ) + params["dtypes"] = pd.concat([c.dtypes for c in inp_chunks_arr[0]]) + if len(inp_slice_chunks) == 1: + c = inp_slice_chunks[0] + cc = c.op.copy().new_chunk(c.op.inputs, kws=[params]) + out_chunks.append(cc) + else: + out_chunk = DataFrameConcat( + output_types=[out.op.output_types[0]] + ).new_chunk(inp_slice_chunks, kws=[params]) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["nsplits"] = chunk_size + params["chunks"] = out_chunks + df_or_series = new_op.new_tileable(op.inputs, kws=[params]) + + if op.reassign_worker: + for c in df_or_series.chunks: + c.op.reassign_worker = True + + return [df_or_series] + + +def _get_chunk_size( + a: TileableType, chunk_size: chunk_size_type +) -> Tuple[Tuple[int], ...]: + if isinstance(a, DATAFRAME_TYPE): + itemsize = max(getattr(dt, "itemsize", 8) for dt in a.dtypes) + else: + itemsize = a.dtype.itemsize + return get_nsplits(a, chunk_size, itemsize) + + +def rechunk(a: TileableType, chunk_size: chunk_size_type, reassign_worker=False): + if not any(pd.isna(s) for s in a.shape) and not a.is_coarse(): + if not has_unknown_shape(a): + # do client check only when no unknown shape, + # real nsplits will be recalculated inside `tile` + chunk_size = _get_chunk_size(a, chunk_size) + if chunk_size == a.nsplits: + return a + + op = DataFrameRechunk( + chunk_size=chunk_size, + reassign_worker=reassign_worker, + ) + return op(a) diff --git a/python/xorbits/_mars/dataframe/base/select_dtypes.py b/python/xorbits/_mars/dataframe/base/select_dtypes.py new file mode 100644 index 000000000..00b00c2ac --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/select_dtypes.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import build_empty_df + + +def select_dtypes(df, include=None, exclude=None): + """ + Return a subset of the DataFrame's columns based on the column dtypes. + + Parameters + ---------- + include, exclude : scalar or list-like + A selection of dtypes or strings to be included/excluded. At least + one of these parameters must be supplied. + + Returns + ------- + DataFrame + The subset of the frame including the dtypes in ``include`` and + excluding the dtypes in ``exclude``. + + Raises + ------ + ValueError + * If both of ``include`` and ``exclude`` are empty + * If ``include`` and ``exclude`` have overlapping elements + * If any kind of string dtype is passed in. + + See Also + -------- + DataFrame.dtypes: Return Series with the data type of each column. 
+ + Notes + ----- + * To select all *numeric* types, use ``np.number`` or ``'number'`` + * To select strings you must use the ``object`` dtype, but note that + this will return *all* object dtype columns + * See the `numpy dtype hierarchy + `__ + * To select datetimes, use ``np.datetime64``, ``'datetime'`` or + ``'datetime64'`` + * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or + ``'timedelta64'`` + * To select Pandas categorical dtypes, use ``'category'`` + * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in + 0.20.0) or ``'datetime64[ns, tz]'`` + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'a': [1, 2] * 3, + ... 'b': [True, False] * 3, + ... 'c': [1.0, 2.0] * 3}) + >>> df.execute() + a b c + 0 1 True 1.0 + 1 2 False 2.0 + 2 1 True 1.0 + 3 2 False 2.0 + 4 1 True 1.0 + 5 2 False 2.0 + + >>> df.select_dtypes(include='bool').execute() + b + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False + + >>> df.select_dtypes(include=['float64']).execute() + c + 0 1.0 + 1 2.0 + 2 1.0 + 3 2.0 + 4 1.0 + 5 2.0 + + >>> df.select_dtypes(exclude=['int64']).execute() + b c + 0 True 1.0 + 1 False 2.0 + 2 True 1.0 + 3 False 2.0 + 4 True 1.0 + 5 False 2.0 + """ + test_df = build_empty_df(df.dtypes) + test_df = test_df.select_dtypes(include=include, exclude=exclude) + return df[test_df.dtypes.index.tolist()] diff --git a/python/xorbits/_mars/dataframe/base/shift.py b/python/xorbits/_mars/dataframe/base/shift.py new file mode 100644 index 000000000..75a4d0a04 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/shift.py @@ -0,0 +1,510 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import AnyField, Int8Field, Int64Field, KeyField +from ...utils import has_unknown_shape, no_default, pd_release_version +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_series, parse_index, validate_axis + +_need_consolidate = pd.__version__ in ("1.1.0", "1.3.0", "1.3.1") +_enable_no_default = pd_release_version[:2] > (1, 1) +_with_column_freq_bug = (1, 2, 0) <= pd_release_version < (1, 4, 3) + + +class DataFrameShift(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.SHIFT + + _input = KeyField("input") + _periods = Int64Field("periods") + _freq = AnyField("freq") + _axis = Int8Field("axis") + _fill_value = AnyField("fill_value") + + def __init__(self, periods=None, freq=None, axis=None, fill_value=None, **kw): + super().__init__( + _periods=periods, _freq=freq, _axis=axis, _fill_value=fill_value, **kw + ) + + @property + def input(self): + return self._input + + @property + def periods(self): + return self._periods + + @property + def freq(self): + return self._freq + + @property + def axis(self): + return self._axis + + @property + def fill_value(self): + return self._fill_value + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def _call_dataframe(self, df): + test_df = build_df(df) + result_df = test_df.shift( + periods=self._periods, + freq=self._freq, + axis=self._axis, + fill_value=self._fill_value, + ) + + if self._freq is None: + # shift data + index_value = df.index_value + columns_value = df.columns_value + else: + # shift index + if self._axis == 0: + index_value = self._get_index_value( + df.index_value, self._periods, self._freq + ) + columns_value = df.columns_value + else: + columns_value = parse_index(result_df.dtypes.index, store_data=True) + index_value = df.index_value + + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=result_df.dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + def _call_series(self, series): + test_series = build_series(series) + result_series = test_series.shift( + periods=self._periods, + freq=self._freq, + axis=self._axis, + fill_value=self._fill_value, + ) + + index_value = series.index_value + if self._freq is not None: + # shift index + index_value = self._get_index_value(index_value, self._periods, self._freq) + + return self.new_series( + [series], + shape=series.shape, + index_value=index_value, + dtype=result_series.dtype, + name=series.name, + ) + + def __call__(self, df_or_series): + if df_or_series.op.output_types[0] == OutputType.dataframe: + self.output_types = [OutputType.dataframe] + return self._call_dataframe(df_or_series) + else: + assert df_or_series.op.output_types[0] == OutputType.series + self.output_types = [OutputType.series] + return self._call_series(df_or_series) + + @staticmethod + def _get_index_value(input_index_value, periods, freq): + if ( + not input_index_value.has_value() + and input_index_value.min_val is not None + and input_index_value.max_val is not None + and freq is not None + and input_index_value.is_monotonic_increasing_or_decreasing + ): + pd_index = pd.Index( + [input_index_value.min_val, input_index_value.max_val] + ).shift(periods=periods, freq=freq) + index_value = parse_index(pd_index) + index_value.value._min_val_close = input_index_value.min_val_close + index_value.value._max_val_close = input_index_value.max_val_close + return index_value + else: 
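+ # Fallback: the index bounds are unknown or no freq is given, so the
+ # shifted index cannot be precomputed; keep the existing pandas index.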
+ pd_index = input_index_value.to_pandas() + return parse_index(pd_index, periods, freq) + + @classmethod + def _tile_dataframe(cls, op): + from ..indexing.iloc import DataFrameIlocGetItem + from ..merge.concat import DataFrameConcat + + inp = op.input + out = op.outputs[0] + axis = op.axis + + out_chunks = [] + if op.freq is not None: + cum_nsplit = [0] + np.cumsum(inp.nsplits[axis]).tolist() + # shift index + for c in inp.chunks: + chunk_op = op.copy().reset_key() + i = c.index[axis] + start, end = cum_nsplit[i], cum_nsplit[i + 1] + if axis == 0: + index_value = cls._get_index_value( + c.index_value, op.periods, op.freq + ) + columns_value = c.columns_value + dtypes = c.dtypes + else: + dtypes = out.dtypes.iloc[start:end] + columns_value = parse_index(dtypes.index, store_data=True) + index_value = c.index_value + out_chunk = chunk_op.new_chunk( + [c], + index=c.index, + shape=c.shape, + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + ) + out_chunks.append(out_chunk) + else: + if np.isnan(np.sum(inp.nsplits[axis])): # pragma: no cover + yield + + # shift data + inc = op.periods > 0 + cum_nsplit = [0] + np.cumsum(inp.nsplits[axis]).tolist() + for j in range(inp.chunk_shape[1 - axis]): + for i in range(inp.chunk_shape[axis]): + index = [None, None] + index[axis] = i + index[1 - axis] = j + index = tuple(index) + + start, end = cum_nsplit[i], cum_nsplit[i + 1] + + c = inp.cix[index] + to_concats = [c] + left = abs(op.periods) + prev_i = i - 1 if inc else i + 1 + while left > 0 and 0 <= prev_i < inp.chunk_shape[axis]: + prev_index = [None, None] + prev_index[axis] = prev_i + prev_index[1 - axis] = j + prev_index = tuple(prev_index) + + prev_chunk = inp.cix[prev_index] + size = min(prev_chunk.shape[axis], left) + left -= size + prev_i = prev_i - 1 if inc else prev_i + 1 + + if size == prev_chunk.shape[axis]: + to_concat = prev_chunk + else: + slcs = [slice(None)] * 2 + slc = slice(-size, None) if inc else slice(size) + slcs[axis] = slc + slc_op = DataFrameIlocGetItem(indexes=slcs) + to_concat = slc_op.new_chunk([prev_chunk]) + + if inc: + to_concats.insert(0, to_concat) + else: + to_concats.append(to_concat) + + if len(to_concats) == 1: + to_shift_chunk = to_concats[0] + else: + concat_op = DataFrameConcat( + axis=axis, output_types=[OutputType.dataframe] + ) + to_shift_chunk = concat_op.new_chunk(to_concats) + + chunk_op = op.copy().reset_key() + if axis == 1: + dtypes = out.dtypes.iloc[start:end] + columns_value = parse_index(dtypes.index, store_data=True) + index_value = c.index_value + else: + dtypes = c.dtypes + columns_value = c.columns_value + index_value = cls._get_index_value( + c.index_value, op.periods, op.freq + ) + + out_chunk = chunk_op.new_chunk( + [to_shift_chunk], + index=index, + shape=c.shape, + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + ) + out_chunks.append(out_chunk) + + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = inp.nsplits + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _tile_series(cls, op): + from ..indexing.iloc import SeriesIlocGetItem + from ..merge import DataFrameConcat + + if has_unknown_shape(*op.inputs): + yield + + inp = op.input + out = op.outputs[0] + + out_chunks = [] + + for i, c in enumerate(inp.chunks): + chunk_op = op.copy().reset_key() + + if op.freq is not None: + # shift index + index_value = cls._get_index_value(c.index_value, op.periods, op.freq) + out_chunk = chunk_op.new_chunk( + [c], + shape=c.shape, + 
index_value=index_value, + name=c.name, + dtype=out.dtype, + index=c.index, + ) + else: + inc = op.periods > 0 + prev_i = i - 1 if inc else i + 1 + + to_concats = [c] + left = abs(op.periods) + while left > 0 and 0 <= prev_i < inp.chunk_shape[0]: + prev_chunk = inp.cix[prev_i,] + size = min(left, prev_chunk.shape[0]) + left -= size + prev_i = prev_i - 1 if inc else prev_i + 1 + + if size == prev_chunk.shape[0]: + to_concat = prev_chunk + else: + slc = slice(-size, None) if inc else slice(size) + slc_op = SeriesIlocGetItem(indexes=[slc]) + to_concat = slc_op.new_chunk([prev_chunk]) + + if inc: + to_concats.insert(0, to_concat) + else: + to_concats.append(to_concat) + + if len(to_concats) == 1: + to_concat = to_concats[0] + else: + concat_op = DataFrameConcat(output_types=[OutputType.series]) + to_concat = concat_op.new_chunk(to_concats) + + out_chunk = chunk_op.new_chunk( + [to_concat], + index=(i,), + shape=c.shape, + index_value=c.index_value, + dtype=out.dtype, + name=out.name, + ) + out_chunks.append(out_chunk) + + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = inp.nsplits + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def tile(cls, op): + if op.output_types[0] == OutputType.dataframe: + return (yield from cls._tile_dataframe(op)) + else: + return (yield from cls._tile_series(op)) + + @classmethod + def execute(cls, ctx, op): + axis = op.axis + periods = op.periods + + obj = ctx[op.input.key] + out = op.outputs[0] + + if ( + _need_consolidate + and isinstance(obj, (pd.Series, pd.DataFrame)) + and len(obj._data.blocks) > 1 + ): + # if #internal blocks > 1, shift will create wrong result in pandas 1.1.0 + # see https://github.com/pandas-dev/pandas/issues/35488 + # if shifting merged dataframe slices, shift will raise TypeError in pandas 1.3.0 + # see https://github.com/pandas-dev/pandas/issues/42401 + # thus we force to do consolidate + obj._data._consolidate_inplace() + + result = obj.shift( + periods=periods, freq=op.freq, axis=axis, fill_value=op.fill_value + ) + if result.shape != out.shape: + slc = [slice(None)] * obj.ndim + if periods > 0: + slc[axis] = slice(-out.shape[axis], None) + else: + slc[axis] = slice(out.shape[axis]) + + result = result.iloc[tuple(slc)] + assert result.shape == out.shape, (result.shape, out.shape) + + ctx[out.key] = result + + +def shift(df_or_series, periods=1, freq=None, axis=0, fill_value=None): + """ + Shift index by desired number of periods with an optional time `freq`. + + When `freq` is not passed, shift the index without realigning the data. + If `freq` is passed (in this case, the index must be date or datetime, + or it will raise a `NotImplementedError`), the index will be + increased using the periods and the `freq`. + + Parameters + ---------- + periods : int + Number of periods to shift. Can be positive or negative. + freq : DateOffset, tseries.offsets, timedelta, or str, optional + Offset to use from the tseries module or time rule (e.g. 'EOM'). + If `freq` is specified then the index values are shifted but the + data is not realigned. That is, use `freq` if you would like to + extend the index when shifting and preserve the original data. + axis : {0 or 'index', 1 or 'columns', None}, default None + Shift direction. + fill_value : object, optional + The scalar value to use for newly introduced missing values. + the default depends on the dtype of `self`. + For numeric data, ``np.nan`` is used. + For datetime, timedelta, or period data, etc. :attr:`NaT` is used. 
+ For extension dtypes, ``self.dtype.na_value`` is used. + + Returns + ------- + DataFrame or Series + Copy of input object, shifted. + + See Also + -------- + Index.shift : Shift values of Index. + DatetimeIndex.shift : Shift values of DatetimeIndex. + PeriodIndex.shift : Shift values of PeriodIndex. + tshift : Shift the time index, using the index's frequency if + available. + + Examples + -------- + >>> import mars.dataframe as md + + >>> df = md.DataFrame({'Col1': [10, 20, 15, 30, 45], + ... 'Col2': [13, 23, 18, 33, 48], + ... 'Col3': [17, 27, 22, 37, 52]}) + + >>> df.shift(periods=3).execute() + Col1 Col2 Col3 + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 NaN NaN NaN + 3 10.0 13.0 17.0 + 4 20.0 23.0 27.0 + + >>> df.shift(periods=1, axis='columns').execute() + Col1 Col2 Col3 + 0 NaN 10.0 13.0 + 1 NaN 20.0 23.0 + 2 NaN 15.0 18.0 + 3 NaN 30.0 33.0 + 4 NaN 45.0 48.0 + + >>> df.shift(periods=3, fill_value=0).execute() + Col1 Col2 Col3 + 0 0 0 0 + 1 0 0 0 + 2 0 0 0 + 3 10 13 17 + 4 20 23 27 + """ + axis = validate_axis(axis, df_or_series) + if periods == 0: + return df_or_series.copy() + if fill_value is no_default: # pragma: no cover + if not _enable_no_default or ( + _with_column_freq_bug and axis == 1 and freq is not None + ): + # pandas shift shows different behavior for axis=1 when freq is specified, + # see https://github.com/pandas-dev/pandas/issues/47039 for details. + fill_value = None + op = DataFrameShift(periods=periods, freq=freq, axis=axis, fill_value=fill_value) + return op(df_or_series) + + +def tshift(df_or_series, periods: int = 1, freq=None, axis=0): + """ + Shift the time index, using the index's frequency if available. + + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative. + freq : DateOffset, timedelta, or str, default None + Increment to use from the tseries module + or time rule expressed as a string (e.g. 'EOM'). + axis : {0 or ‘index’, 1 or ‘columns’, None}, default 0 + Corresponds to the axis that contains the Index. + + Returns + ------- + shifted : Series/DataFrame + + Notes + ----- + If freq is not specified then tries to use the freq or inferred_freq + attributes of the index. If neither of those attributes exist, a + ValueError is thrown + """ + axis = validate_axis(axis, df_or_series) + index = ( + df_or_series.index_value.to_pandas() + if axis == 0 + else df_or_series.columns_value.to_pandas() + ) + + if freq is None: + freq = getattr(index, "freq", None) + + if freq is None: # pragma: no cover + freq = getattr(index, "inferred_freq", None) + + if freq is None: + raise ValueError("Freq was not given and was not set in the index") + + return shift(df_or_series, periods=periods, freq=freq, axis=axis) diff --git a/python/xorbits/_mars/dataframe/base/stack.py b/python/xorbits/_mars/dataframe/base/stack.py new file mode 100644 index 000000000..7edda8a67 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/stack.py @@ -0,0 +1,312 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
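+
+# Implementation sketch (summary of the tile logic below): the input is first
+# rechunked to a single chunk along the column axis, then ``stack`` is applied
+# to each row chunk independently; output chunk sizes are unknown (NaN) when
+# ``dropna=True`` since dropped rows cannot be counted ahead of execution.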
+ +from typing import List, Union + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, BoolField, KeyField +from ...utils import has_unknown_shape +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, parse_index + + +class DataFrameStack(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.STACK + + _input_df = KeyField("input_df") + _level = AnyField("level") + _dropna = BoolField("dropna") + + def __init__(self, input_df=None, level=None, dropna=None, **kw): + super().__init__(_input_df=input_df, _level=level, _dropna=dropna, **kw) + + @property + def input_df(self): + return self._input_df + + @property + def level(self): + return self._level + + @property + def dropna(self): + return self._dropna + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input_df = self._inputs[0] + + @classmethod + def _calc_size(cls, size: int, level: Union[List, int], dtypes: pd.Series): + index = dtypes.index + + if not isinstance(index, pd.MultiIndex): + return size * len(index) + + if isinstance(level, int): + level = [level] + return size * np.prod([index.levshape[lev] for lev in level]).item() + + def __call__(self, input_df): + test_df = build_df(input_df) + test_df = test_df.stack(level=self._level, dropna=self._dropna) + if self._dropna: + size = np.nan + else: + size = self._calc_size(input_df.shape[0], self._level, input_df.dtypes) + if test_df.ndim == 1: + shape = (size,) + return self.new_series( + [input_df], + shape=shape, + dtype=test_df.dtype, + index_value=parse_index(test_df.index, input_df), + name=test_df.name, + ) + else: + shape = (size, test_df.shape[1]) + return self.new_dataframe( + [input_df], + shape=shape, + dtypes=test_df.dtypes, + index_value=parse_index(test_df.index, input_df), + columns_value=parse_index(test_df.columns, store_data=True), + ) + + @classmethod + def tile(cls, op: "DataFrameStack"): + input_df = op.input_df + out = op.outputs[0] + out_index = out.index_value.to_pandas() + + if input_df.chunk_shape[1] > 1: + # rechunk into 1 chunk on axis 1 + if has_unknown_shape(input_df): + yield + input_df = yield from recursive_tile( + input_df.rechunk({1: input_df.shape[1]}) + ) + + out_chunks = [] + for c in input_df.chunks: + chunk_op = op.copy().reset_key() + if op.dropna: + size = np.nan + else: + size = cls._calc_size(c.shape[0], op.level, c.dtypes) + if out.ndim == 1: + kw = { + "shape": (size,), + "index": (c.index[0],), + "dtype": out.dtype, + "index_value": parse_index(out_index, c), + "name": out.name, + } + else: + kw = { + "shape": (size, out.shape[1]), + "index": (c.index[0], 0), + "dtypes": out.dtypes, + "index_value": parse_index(out_index, c), + "columns_value": out.columns_value, + } + out_chunk = chunk_op.new_chunk([c], **kw) + out_chunks.append(out_chunk) + + params = out.params + if out.ndim == 1: + params["nsplits"] = (tuple(out_c.shape[0] for out_c in out_chunks),) + else: + params["nsplits"] = ( + tuple(out_c.shape[0] for out_c in out_chunks), + (out.shape[1],), + ) + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameStack"): + inp: pd.DataFrame = ctx[op.input_df.key] + ctx[op.outputs[0].key] = inp.stack(level=op.level, dropna=op.dropna) + + +def stack(df, level=-1, dropna=True): + """ + Stack the prescribed level(s) from columns to index. 
+ + Return a reshaped DataFrame or Series having a multi-level + index with one or more new inner-most levels compared to the current + DataFrame. The new inner-most levels are created by pivoting the + columns of the current dataframe: + + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index + level(s) is (are) taken from the prescribed level(s) and + the output is a DataFrame. + + Parameters + ---------- + level : int, str, list, default -1 + Level(s) to stack from the column axis onto the index + axis, defined as one index or label, or a list of indices + or labels. + dropna : bool, default True + Whether to drop rows in the resulting Frame/Series with + missing values. Stacking a column level onto the index + axis can create combinations of index and column values + that are missing from the original dataframe. See Examples + section. + + Returns + ------- + DataFrame or Series + Stacked dataframe or series. + + See Also + -------- + DataFrame.unstack : Unstack prescribed level(s) from index axis + onto column axis. + DataFrame.pivot : Reshape dataframe from long format to wide + format. + DataFrame.pivot_table : Create a spreadsheet-style pivot table + as a DataFrame. + + Notes + ----- + The function is named by analogy with a collection of books + being reorganized from being side by side on a horizontal + position (the columns of the dataframe) to being stacked + vertically on top of each other (in the index of the + dataframe). + + Examples + -------- + **Single level columns** + + >>> import mars.dataframe as md + >>> df_single_level_cols = md.DataFrame([[0, 1], [2, 3]], + ... index=['cat', 'dog'], + ... columns=['weight', 'height']) + + Stacking a dataframe with a single level column axis returns a Series: + + >>> df_single_level_cols.execute() + weight height + cat 0 1 + dog 2 3 + >>> df_single_level_cols.stack().execute() + cat weight 0 + height 1 + dog weight 2 + height 3 + dtype: int64 + + **Multi level columns: simple case** + + >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('weight', 'pounds')]) + >>> df_multi_level_cols1 = md.DataFrame([[1, 2], [2, 4]], + ... index=['cat', 'dog'], + ... columns=multicol1) + + Stacking a dataframe with a multi-level column axis: + + >>> df_multi_level_cols1.execute() + weight + kg pounds + cat 1 2 + dog 2 4 + >>> df_multi_level_cols1.stack().execute() + weight + cat kg 1 + pounds 2 + dog kg 2 + pounds 4 + + **Missing values** + + >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('height', 'm')]) + >>> df_multi_level_cols2 = md.DataFrame([[1.0, 2.0], [3.0, 4.0]], + ... index=['cat', 'dog'], + ... columns=multicol2) + + It is common to have missing values when stacking a dataframe + with multi-level columns, as the stacked dataframe typically + has more values than the original dataframe. 
Missing values + are filled with NaNs: + + >>> df_multi_level_cols2.execute() + weight height + kg m + cat 1.0 2.0 + dog 3.0 4.0 + >>> df_multi_level_cols2.stack().execute() + height weight + cat kg NaN 1.0 + m 2.0 NaN + dog kg NaN 3.0 + m 4.0 NaN + + **Prescribing the level(s) to be stacked** + + The first parameter controls which level or levels are stacked: + + >>> df_multi_level_cols2.stack(0).execute() + kg m + cat height NaN 2.0 + weight 1.0 NaN + dog height NaN 4.0 + weight 3.0 NaN + >>> df_multi_level_cols2.stack([0, 1]).execute() + cat height m 2.0 + weight kg 1.0 + dog height m 4.0 + weight kg 3.0 + dtype: float64 + + **Dropping missing values** + + >>> df_multi_level_cols3 = md.DataFrame([[None, 1.0], [2.0, 3.0]], + ... index=['cat', 'dog'], + ... columns=multicol2) + + Note that rows where all values are missing are dropped by + default but this behaviour can be controlled via the dropna + keyword parameter: + + >>> df_multi_level_cols3.execute() + weight height + kg m + cat NaN 1.0 + dog 2.0 3.0 + >>> df_multi_level_cols3.stack(dropna=False).execute() + height weight + cat kg NaN NaN + m 1.0 NaN + dog kg NaN 2.0 + m 3.0 NaN + >>> df_multi_level_cols3.stack(dropna=True).execute() + height weight + cat m 1.0 NaN + dog kg NaN 2.0 + m 3.0 NaN + """ + op = DataFrameStack(input_df=df, level=level, dropna=dropna) + return op(df) diff --git a/python/xorbits/_mars/dataframe/base/standardize_range_index.py b/python/xorbits/_mars/dataframe/base/standardize_range_index.py new file mode 100644 index 000000000..d253e0c76 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/standardize_range_index.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ... import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, Int32Field, ListField +from ...utils import lazy_import +from ..operands import DataFrameOperand, DataFrameOperandMixin + +cudf = lazy_import("cudf") + + +class ChunkStandardizeRangeIndex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.STANDARDIZE_RANGE_INDEX + + axis = Int32Field("axis") + prev_shapes = ListField("prev_shapes", FieldTypes.tuple) + + @classmethod + def execute(cls, ctx, op: "ChunkStandardizeRangeIndex"): + xdf = cudf if op.gpu else pd + in_data = ctx[op.inputs[0].key].copy() + index_start = sum([shape[op.axis] for shape in op.prev_shapes]) + if op.axis == 0: + in_data.index = xdf.RangeIndex(index_start, index_start + len(in_data)) + else: + in_data.columns = xdf.RangeIndex( + index_start, index_start + in_data.shape[1] + ) + ctx[op.outputs[0].key] = in_data diff --git a/python/xorbits/_mars/dataframe/base/string_.py b/python/xorbits/_mars/dataframe/base/string_.py new file mode 100644 index 000000000..f5dac2c11 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/string_.py @@ -0,0 +1,418 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType, recursive_tile +from ...serialization.serializables import DictField, KeyField, StringField, TupleField +from ...tensor import tensor as astensor +from ...tensor.core import TENSOR_TYPE +from ...utils import has_unknown_shape +from ..align import align_series_series +from ..core import SERIES_TYPE +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_series, infer_index_value, parse_index + + +class SeriesStringMethod(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.STRING_METHOD + + _input = KeyField("input") + _method = StringField("method") + _method_args = TupleField("method_args") + _method_kwargs = DictField("method_kwargs") + + def __init__( + self, method=None, method_args=None, method_kwargs=None, output_types=None, **kw + ): + super().__init__( + _method=method, + _method_args=method_args, + _method_kwargs=method_kwargs, + _output_types=output_types, + **kw + ) + if not self.output_types: + self.output_types = [OutputType.series] + + @property + def input(self): + return self._input + + @property + def method(self): + return self._method + + @property + def method_args(self): + return self._method_args + + @property + def method_kwargs(self): + return self._method_kwargs + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(self._inputs) == 2: + # for method cat + self._method_kwargs["others"] = self._inputs[1] + + def __call__(self, inp): + return _string_method_to_handlers[self._method].call(self, inp) + + @classmethod + def tile(cls, op): + tiled = _string_method_to_handlers[op.method].tile(op) + if inspect.isgenerator(tiled): + return (yield from tiled) + else: + return tiled + + @classmethod + def execute(cls, ctx, op): + return _string_method_to_handlers[op.method].execute(ctx, op) + + +class SeriesStringMethodBaseHandler: + @classmethod + def call(cls, op, inp): + empty_series = build_empty_series(inp.dtype) + dtype = getattr(empty_series.str, op.method)( + *op.method_args, **op.method_kwargs + ).dtype + return op.new_series( + [inp], + shape=inp.shape, + dtype=dtype, + index_value=inp.index_value, + name=inp.name, + ) + + @classmethod + def tile(cls, op): + out = op.outputs[0] + out_chunks = [] + for series_chunk in op.input.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [series_chunk], + shape=series_chunk.shape, + dtype=out.dtype, + index=series_chunk.index, + index_value=series_chunk.index_value, + name=series_chunk.name, + ) + out_chunks.append(out_chunk) + + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = op.input.nsplits + new_op = op.copy() + return new_op.new_tileables([op.input], kws=[params]) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.input.key] + 
ctx[op.outputs[0].key] = getattr(inp.str, op.method)(
+            *op.method_args, **op.method_kwargs
+        )
+
+
+class SeriesStringSplitHandler(SeriesStringMethodBaseHandler):
+    @classmethod
+    def call(cls, op, inp):
+        method_kwargs = op.method_kwargs
+        if method_kwargs.get("expand", False) is False:
+            return super().call(op, inp)
+        n = method_kwargs.get("n", -1)
+        # expand=True with n == -1 is not supported
+        if n == -1:  # pragma: no cover
+            raise NotImplementedError("`n` needs to be specified when expand=True")
+
+        op.output_types = [OutputType.dataframe]
+        columns = pd.RangeIndex(n + 1)
+        columns_value = parse_index(columns, store_data=True)
+        dtypes = pd.Series([inp.dtype] * len(columns), index=columns)
+        return op.new_dataframe(
+            [inp],
+            shape=(inp.shape[0], len(columns)),
+            dtypes=dtypes,
+            columns_value=columns_value,
+            index_value=inp.index_value,
+        )
+
+    @classmethod
+    def tile(cls, op):
+        out = op.outputs[0]
+
+        if out.op.output_types[0] == OutputType.series:
+            return super().tile(op)
+
+        out_chunks = []
+        columns = out.columns_value.to_pandas()
+        for series_chunk in op.input.chunks:
+            chunk_op = op.copy().reset_key()
+            out_chunk = chunk_op.new_chunk(
+                [series_chunk],
+                shape=(series_chunk.shape[0], len(columns)),
+                index=(series_chunk.index[0], 0),
+                dtypes=out.dtypes,
+                index_value=series_chunk.index_value,
+                columns_value=out.columns_value,
+            )
+            out_chunks.append(out_chunk)
+
+        params = out.params
+        params["chunks"] = out_chunks
+        params["nsplits"] = (op.input.nsplits[0], (len(columns),))
+        new_op = op.copy()
+        return new_op.new_tileables([op.input], kws=[params])
+
+    @classmethod
+    def execute(cls, ctx, op):
+        inp = ctx[op.input.key]
+        out = op.outputs[0]
+        result = getattr(inp.str, op.method)(*op.method_args, **op.method_kwargs)
+        if result.ndim == 2 and result.shape[1] < out.shape[1]:
+            for i in range(result.shape[1], out.shape[1]):
+                result[i] = None
+        ctx[op.outputs[0].key] = result
+
+
+class SeriesStringCatHandler(SeriesStringMethodBaseHandler):
+    CAT_TYPE_ERROR = (
+        "others must be Series, Index, DataFrame, "
+        "Tensor, np.ndarray or list-like "
+        "(either containing only strings or "
+        "containing only objects of "
+        "type Series/Index/Tensor/np.ndarray[1-dim])"
+    )
+    CAT_LEN_ERROR = (
+        "If `others` contains arrays or lists (or other list-likes without an index), "
+        "these must all be of the same length as the calling Series/Index."
+ ) + + @classmethod + def call(cls, op, inp): + method_kwargs = op.method_kwargs + others = method_kwargs.get("others") + + if others is None: + from ..reduction import build_str_concat_object + + return build_str_concat_object( + inp, + sep=op.method_kwargs.get("sep"), + na_rep=op.method_kwargs.get("na_rep"), + ) + elif isinstance(others, (tuple, list, np.ndarray, TENSOR_TYPE)): + others = astensor(others, dtype=object) + if others.ndim != 1: + raise TypeError(cls.CAT_TYPE_ERROR) + if ( + not np.isnan(inp.shape[0]) + and not np.isnan(others.shape[0]) + and inp.shape[0] != others.shape[0] + ): + raise ValueError(cls.CAT_LEN_ERROR) + inputs = [inp] + if isinstance(others, TENSOR_TYPE): + inputs.append(others) + return op.new_series( + inputs, + shape=inp.shape, + dtype=inp.dtype, + index_value=inp.index_value, + name=inp.name, + ) + elif isinstance(others, (pd.Series, SERIES_TYPE)): + others = asseries(others) + if op.method_kwargs.get("join") != "outer": # pragma: no cover + raise NotImplementedError("only outer join supported for now") + return op.new_series( + [inp, others], + shape=inp.shape, + dtype=inp.dtype, + index_value=infer_index_value(inp.index_value, others.index_value), + name=inp.name, + ) + elif isinstance(others, str) and op.method_kwargs.get("sep") is None: + raise ValueError("Did you mean to supply a `sep` keyword?") + else: + raise TypeError(cls.CAT_TYPE_ERROR) + + @classmethod + def tile(cls, op): + inp = op.input + out = op.outputs[0] + + # aggregation concat resulting in scalars is redirected + assert out.ndim != 0 + + if isinstance(op.inputs[1], TENSOR_TYPE): + if has_unknown_shape(*op.inputs): + yield + # rechunk others as input + others = yield from recursive_tile(op.inputs[1].rechunk(op.input.nsplits)) + out_chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op._method_kwargs = op.method_kwargs.copy() + out_chunk = chunk_op.new_chunk( + [c, others.cix[c.index]], + dtype=c.dtype, + index=c.index, + shape=c.shape, + index_value=c.index_value, + name=c.name, + ) + out_chunks.append(out_chunk) + new_op = op.copy() + params = out.params + params["nsplits"] = inp.nsplits + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + elif isinstance(op.inputs[1], SERIES_TYPE): + # both series + out_chunks = [] + nsplits, _, left_chunks, right_chunks = align_series_series(*op.inputs) + for left_chunk, right_chunk in zip(left_chunks, right_chunks): + chunk_op = op.copy().reset_key() + chunk_op._method_kwargs = op.method_kwargs.copy() + params = left_chunk.params + params["name"] = out.name + out_chunk = chunk_op.new_chunk([left_chunk, right_chunk], **params) + out_chunks.append(out_chunk) + new_op = op.copy() + params = out.params + params["nsplits"] = nsplits + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op): + inputs = [ctx[inp.key] for inp in op.inputs] + method_kwargs = op.method_kwargs + + # aggregation concat is redirected and `others` is always defined + assert len(inputs) > 1 + + method_kwargs["others"] = inputs[1] + ctx[op.outputs[0].key] = inputs[0].str.cat(**method_kwargs) + + +class SeriesStringExtractHandler(SeriesStringMethodBaseHandler): + @classmethod + def call(cls, op, inp): + empty_series = build_empty_series( + inp.dtype, index=inp.index_value.to_pandas()[:0] + ) + test_df = getattr(empty_series.str, op.method)( + *op.method_args, **op.method_kwargs + ) + if test_df.ndim == 1: + return op.new_series( + [inp], + 
shape=inp.shape, + dtype=test_df.dtype, + index_value=inp.index_value, + name=inp.name, + ) + else: + op.output_types = [OutputType.dataframe] + if op.method == "extractall": + index_value = parse_index(test_df.index, inp) + shape = (np.nan, test_df.shape[1]) + else: + index_value = inp.index_value + shape = (inp.shape[0], test_df.shape[1]) + return op.new_dataframe( + [inp], + shape=shape, + dtypes=test_df.dtypes, + index_value=index_value, + columns_value=parse_index(test_df.columns, store_data=True), + ) + + @classmethod + def tile(cls, op): + out = op.outputs[0] + out_chunks = [] + for series_chunk in op.input.chunks: + chunk_op = op.copy().reset_key() + if out.ndim == 1: + out_chunk = chunk_op.new_chunk( + [series_chunk], + shape=series_chunk.shape, + index=series_chunk.index, + dtype=out.dtype, + index_value=series_chunk.index_value, + name=out.name, + ) + else: + if op.method == "extract": + index_value = series_chunk.index_value + shape = (series_chunk.shape[0], out.shape[1]) + else: + index_value = parse_index( + out.index_value.to_pandas()[:0], series_chunk + ) + shape = (np.nan, out.shape[1]) + out_chunk = chunk_op.new_chunk( + [series_chunk], + shape=shape, + index=(series_chunk.index[0], 0), + dtypes=out.dtypes, + index_value=index_value, + columns_value=out.columns_value, + ) + out_chunks.append(out_chunk) + + out = op.outputs[0] + params = out.params + params["chunks"] = out_chunks + if out.ndim == 1: + params["nsplits"] = op.input.nsplits + elif op.method == "extract": + params["nsplits"] = (op.input.nsplits[0], (out.shape[1],)) + else: + params["nsplits"] = ((np.nan,) * len(op.input.nsplits[0]), (out.shape[1],)) + new_op = op.copy() + return new_op.new_tileables([op.input], kws=[params]) + + +_string_method_to_handlers = {} +_not_implements = ["get_dummies"] +# start to register handlers for string methods +# register special methods first +_string_method_to_handlers["split"] = SeriesStringSplitHandler +_string_method_to_handlers["rsplit"] = SeriesStringSplitHandler +_string_method_to_handlers["cat"] = SeriesStringCatHandler +_string_method_to_handlers["extract"] = SeriesStringExtractHandler +_string_method_to_handlers["extractall"] = SeriesStringExtractHandler +# then come to the normal methods +for method in dir(pd.Series.str): + if method.startswith("_") and method != "__getitem__": + continue + if method in _not_implements: + continue + if method in _string_method_to_handlers: + continue + _string_method_to_handlers[method] = SeriesStringMethodBaseHandler diff --git a/python/xorbits/_mars/dataframe/base/tests/__init__.py b/python/xorbits/_mars/dataframe/base/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/dataframe/base/tests/test_apply_execution.py b/python/xorbits/_mars/dataframe/base/tests/test_apply_execution.py new file mode 100644 index 000000000..7c2aeed8d --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/tests/test_apply_execution.py @@ -0,0 +1,284 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pandas as pd + +import pytest + +from .... import dataframe as md +from ....dataframe.core import DataFrame, DATAFRAME_OR_SERIES_TYPE +from ....dataframe.fetch.core import DataFrameFetch + + +def test_dataframe_apply_execution(setup): + df = pd.DataFrame({"col": [1, 2, 3, 4]}) + mdf = md.DataFrame(df) + + apply_func = lambda x: 20 if x[0] else 10 + with pytest.raises(TypeError): + mdf.apply(apply_func) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (4,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=1)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (1,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=0)) + + apply_func = lambda x: x + 1 + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert not ("dtype" in res.data_params) + assert res.data_params["shape"] == (4, 1) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=1)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert not ("dtype" in res.data_params) + assert res.data_params["shape"] == (4, 1) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=0)) + + apply_func = lambda x: sum(x) + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (4,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=1)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (1,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=0)) + + df = pd.DataFrame({"c1": [1, 2, 3, 4], "c2": [5, 6, 7, 8]}) + mdf = md.DataFrame(df) + apply_func = lambda x: sum(x) / len(x) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert 
res.data_params["dtype"] == "float64" + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (4,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=1)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert res.data_params["dtype"] == "float64" + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (2,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=0)) + + apply_func = lambda x: pd.Series([1, 2]) + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert res.data_params["shape"] == (2, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=0)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert res.data_params["shape"] == (4, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=1)) + + apply_func = lambda x: [1, 2] + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "dataframe" + assert res.data_params["shape"] == (2, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=0)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "series" + assert res.data_params["shape"] == (4,) + assert res.data_params["dtype"] == "object" + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=1)) + + apply_func = lambda x: pd.Series([1, 2, 3.0], index=["c1", "c2", "c3"]) + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "dataframe" + assert res.data_params["shape"] == (3, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=0)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "dataframe" + assert res.data_params["shape"] == (4, 3) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=1)) + + apply_func = lambda x: [1, 2, 3] + res = mdf.apply( + apply_func, output_type="df_or_series", axis=1, result_type="expand" + ).execute() + expected = df.apply(apply_func, axis=1, result_type="expand") + pd.testing.assert_frame_equal(res.fetch(), expected) + + res = mdf.apply( + apply_func, output_type="df_or_series", axis=1, result_type="reduce" + ).execute() + expected = df.apply(apply_func, axis=1, result_type="reduce") + pd.testing.assert_series_equal(res.fetch(), expected) + + apply_func = lambda x: [1, 2] + res = mdf.apply( + apply_func, output_type="df_or_series", axis=1, result_type="broadcast" + ).execute() + expected = df.apply(apply_func, axis=1, result_type="broadcast") + pd.testing.assert_frame_equal(res.fetch(), expected) + + +def test_apply_with_skip_infer(setup): + df = pd.DataFrame({"col1": [1, 2, 3, 4], "col2": list("abcd")}) + mdf = md.DataFrame(df, chunk_size=2) + + def apply_func(series): + if series[1] not in "abcd": + # make it fail when inferring + raise TypeError + else: + return 1 + + with pytest.raises(TypeError): + mdf.apply(apply_func, axis=1) + + res = mdf.apply(apply_func, axis=1, skip_infer=True).execute() + assert isinstance(res, DATAFRAME_OR_SERIES_TYPE) + pd.testing.assert_series_equal(res.fetch(), pd.Series([1] * 4)) + + s = pd.Series([1, 2, 3, 4]) + ms = 
md.Series(s, chunk_size=2) + + apply_func = lambda x: pd.Series([1, 2]) + res = ms.apply(apply_func, skip_infer=True).execute() + assert isinstance(res, DATAFRAME_OR_SERIES_TYPE) + pd.testing.assert_frame_equal(res.fetch(), pd.DataFrame([[1, 2]] * 4)) + + +def test_series_apply_execution(setup): + s = pd.Series([1, 2, 3, 4]) + ms = md.Series(s) + + apply_func = lambda x: x + 1 + res = ms.apply(apply_func, output_type="df_or_series").execute() + assert res.data_type == "series" + assert res.data_params["shape"] == (4,) + assert res.data_params["dtype"] == "int64" + pd.testing.assert_series_equal(res.fetch(), s.apply(apply_func)) + + apply_func = lambda x: [1, 2] + res = ms.apply(apply_func, output_type="df_or_series").execute() + assert res.data_type == "series" + assert res.data_params["shape"] == (4,) + assert res.data_params["dtype"] == "object" + pd.testing.assert_series_equal(res.fetch(), s.apply(apply_func)) + + apply_func = lambda x: pd.Series([1, 2, 3]) + res = ms.apply(apply_func, output_type="df_or_series").execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert res.data_params["shape"] == (4, 3) + pd.testing.assert_frame_equal(res.fetch(), s.apply(apply_func)) + + def subtract_custom_value(x, custom_value): + return x - custom_value + + apply_func = subtract_custom_value + res = ms.apply( + apply_func, args=(5,), convert_dtype=False, output_type="df_or_series" + ).execute() + assert res.data_params["dtype"] == "object" + pd.testing.assert_series_equal( + res.fetch(), s.apply(apply_func, args=(5,), convert_dtype=False) + ) + + res = ms.apply( + apply_func, args=(5,), convert_dtype=True, output_type="df_or_series" + ).execute() + assert res.dtype == "int64" + assert res.shape == (4,) + with pytest.raises(AttributeError): + _ = res.dtypes + pd.testing.assert_series_equal( + res.fetch(), s.apply(apply_func, args=(5,), convert_dtype=True) + ) + + +def test_apply_execution_with_multi_chunks(setup): + df = pd.DataFrame({"c1": [1, 2, 3, 4], "c2": [5, 6, 7, 8]}) + mdf = md.DataFrame(df, chunk_size=5) + apply_func = np.sqrt + + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert res.data_params["dtypes"]["c1"] == np.dtype("float") + assert not ("dtype" in res.data_params) + assert res.data_params["shape"] == (4, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=0)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert res.data_params["dtypes"]["c2"] == np.dtype("float") + assert not ("dtype" in res.data_params) + assert res.data_params["shape"] == (4, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=1)) + + s = pd.Series([1, 2, 3, 4]) + ms = md.Series(s, chunk_size=4) + + res = ms.apply(apply_func, output_type="df_or_series").execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert res.data_params["dtype"] == "float64" + pd.testing.assert_series_equal(res.fetch(), s.apply(apply_func)) + + +def test_apply_ensure_data(setup): + df = pd.DataFrame({"c1": [1, 2, 3, 4], "c2": [5, 6, 7, 8]}) + mdf = md.DataFrame(df, chunk_size=3) + apply_func = np.sqrt + + r = mdf.apply(apply_func, output_type="df_or_series") + res = r.ensure_data() + assert isinstance(res, DataFrame) + assert isinstance(res.op, DataFrameFetch) + pd.testing.assert_frame_equal(res.execute().fetch(), 
df.apply(apply_func)) + pd.testing.assert_frame_equal((res + 1).execute().fetch(), df.apply(apply_func) + 1) + pd.testing.assert_frame_equal((res * 3).execute().fetch(), df.apply(apply_func) * 3) + + r = res.groupby("c1").max() + expected = df.apply(apply_func).groupby("c1").max() + pd.testing.assert_frame_equal(r.execute().fetch(), expected) + + apply_func = np.mean + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).ensure_data() + expected = df.apply(apply_func, axis=1) + pd.testing.assert_series_equal(res.execute().fetch(), expected) + + res = res.to_frame(name="foo").groupby("foo")[["foo"]].max().execute() + expected = expected.to_frame(name="foo").groupby("foo")[["foo"]].max() + pd.testing.assert_frame_equal(res.fetch(), expected) diff --git a/python/xorbits/_mars/dataframe/base/tests/test_base.py b/python/xorbits/_mars/dataframe/base/tests/test_base.py new file mode 100644 index 000000000..aa6eae4fe --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/tests/test_base.py @@ -0,0 +1,1106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Union + +import numpy as np +import pandas as pd +import pytest + +from .... import opcodes +from ....config import option_context, options +from ....core import OutputType, Tileable, tile +from ....core.graph import ( + ChunkGraphBuilder, + TileableGraph, + TileableGraphBuilder, + TileContext, +) +from ....core.operand import OperandStage +from ....tensor.core import TENSOR_TYPE +from ... import cut +from ... import eval as mars_eval +from ... import get_dummies, to_numeric +from ...core import ( + CATEGORICAL_CHUNK_TYPE, + CATEGORICAL_TYPE, + DATAFRAME_TYPE, + INDEX_TYPE, + SERIES_CHUNK_TYPE, + SERIES_TYPE, + DataFrameData, + SeriesData, +) +from ...datasource.dataframe import from_pandas as from_pandas_df +from ...datasource.index import from_pandas as from_pandas_index +from ...datasource.series import from_pandas as from_pandas_series +from .. 
import astype, to_cpu, to_gpu + + +def _get_df_after_tile( + tileables: List[Tileable], +) -> List[Union[DataFrameData, SeriesData]]: + graph = TileableGraph(tileables) + next(TileableGraphBuilder(graph).build()) + context = TileContext() + chunk_graph_builder = ChunkGraphBuilder( + graph, fuse_enabled=False, tile_context=context + ) + chunk_graph_builder = chunk_graph_builder.build() + for _ in chunk_graph_builder: + pass + return [context[df] for df in tileables] + + +def test_to_gpu(): + # test dataframe + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + df = from_pandas_df(data) + cdf = to_gpu(df) + + assert df.index_value == cdf.index_value + assert df.columns_value == cdf.columns_value + assert cdf.op.gpu is True + pd.testing.assert_series_equal(df.dtypes, cdf.dtypes) + + df, cdf = _get_df_after_tile([df.data, cdf.data]) + + assert df.nsplits == cdf.nsplits + assert df.chunks[0].index_value == cdf.chunks[0].index_value + assert df.chunks[0].columns_value == cdf.chunks[0].columns_value + assert cdf.chunks[0].op.gpu is True + pd.testing.assert_series_equal(df.chunks[0].dtypes, cdf.chunks[0].dtypes) + + assert cdf is to_gpu(cdf) + + # test series + sdata = data.iloc[:, 0] + series = from_pandas_series(sdata) + cseries = to_gpu(series) + + assert series.index_value == cseries.index_value + assert cseries.op.gpu is True + + series, cseries = _get_df_after_tile([series.data, cseries.data]) + + assert series.nsplits == cseries.nsplits + assert series.chunks[0].index_value == cseries.chunks[0].index_value + assert cseries.chunks[0].op.gpu is True + + assert cseries is to_gpu(cseries) + + +def test_to_cpu(): + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + df = from_pandas_df(data) + cdf = to_gpu(df) + df2 = to_cpu(cdf) + + assert df.index_value == df2.index_value + assert df.columns_value == df2.columns_value + assert df2.op.gpu is False + pd.testing.assert_series_equal(df.dtypes, df2.dtypes) + + df, df2 = _get_df_after_tile([df.data, df2.data]) + + assert df.nsplits == df2.nsplits + assert df.chunks[0].index_value == df2.chunks[0].index_value + assert df.chunks[0].columns_value == df2.chunks[0].columns_value + assert df2.chunks[0].op.gpu is False + pd.testing.assert_series_equal(df.chunks[0].dtypes, df2.chunks[0].dtypes) + + assert df2 is to_cpu(df2) + + +def test_rechunk(): + from ...merge.concat import DataFrameConcat + + raw = pd.DataFrame(np.random.rand(10, 10)) + df = from_pandas_df(raw, chunk_size=3) + df2 = tile(df.rechunk(4)) + + assert df2.shape == (10, 10) + assert len(df2.chunks) == 9 + + assert df2.chunks[0].shape == (4, 4) + pd.testing.assert_index_equal( + df2.chunks[0].index_value.to_pandas(), pd.RangeIndex(4) + ) + pd.testing.assert_index_equal( + df2.chunks[0].columns_value.to_pandas(), pd.RangeIndex(4) + ) + pd.testing.assert_series_equal(df2.chunks[0].dtypes, raw.dtypes[:4]) + + assert df2.chunks[2].shape == (4, 2) + pd.testing.assert_index_equal( + df2.chunks[2].index_value.to_pandas(), pd.RangeIndex(4) + ) + pd.testing.assert_index_equal( + df2.chunks[2].columns_value.to_pandas(), pd.RangeIndex(8, 10) + ) + pd.testing.assert_series_equal(df2.chunks[2].dtypes, raw.dtypes[-2:]) + + assert df2.chunks[-1].shape == (2, 2) + pd.testing.assert_index_equal( + df2.chunks[-1].index_value.to_pandas(), pd.RangeIndex(8, 10) + ) + pd.testing.assert_index_equal( + 
df2.chunks[-1].columns_value.to_pandas(), pd.RangeIndex(8, 10) + ) + pd.testing.assert_series_equal(df2.chunks[-1].dtypes, raw.dtypes[-2:]) + + for c in df2.chunks: + assert c.shape[1] == len(c.dtypes) + assert len(c.columns_value.to_pandas()) == len(c.dtypes) + + columns = [np.random.bytes(10) for _ in range(10)] + index = np.random.randint(-100, 100, size=(4,)) + raw = pd.DataFrame(np.random.rand(4, 10), index=index, columns=columns) + df = from_pandas_df(raw, chunk_size=3) + df2 = tile(df.rechunk(6)) + + assert df2.shape == (4, 10) + assert len(df2.chunks) == 2 + + assert df2.chunks[0].shape == (4, 6) + pd.testing.assert_index_equal( + df2.chunks[0].index_value.to_pandas(), df.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.chunks[0].columns_value.to_pandas(), pd.Index(columns[:6]) + ) + pd.testing.assert_series_equal(df2.chunks[0].dtypes, raw.dtypes[:6]) + + assert df2.chunks[1].shape == (4, 4) + pd.testing.assert_index_equal( + df2.chunks[1].index_value.to_pandas(), df.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.chunks[1].columns_value.to_pandas(), pd.Index(columns[6:]) + ) + pd.testing.assert_series_equal(df2.chunks[1].dtypes, raw.dtypes[-4:]) + + for c in df2.chunks: + assert c.shape[1] == len(c.dtypes) + assert len(c.columns_value.to_pandas()) == len(c.dtypes) + + # test Series rechunk + series = from_pandas_series(pd.Series(np.random.rand(10)), chunk_size=3) + series2 = tile(series.rechunk(4)) + + assert series2.shape == (10,) + assert len(series2.chunks) == 3 + pd.testing.assert_index_equal(series2.index_value.to_pandas(), pd.RangeIndex(10)) + + assert series2.chunk_shape == (3,) + assert series2.nsplits == ((4, 4, 2),) + assert series2.chunks[0].shape == (4,) + pd.testing.assert_index_equal( + series2.chunks[0].index_value.to_pandas(), pd.RangeIndex(4) + ) + assert series2.chunks[1].shape == (4,) + pd.testing.assert_index_equal( + series2.chunks[1].index_value.to_pandas(), pd.RangeIndex(4, 8) + ) + assert series2.chunks[2].shape == (2,) + pd.testing.assert_index_equal( + series2.chunks[2].index_value.to_pandas(), pd.RangeIndex(8, 10) + ) + + series2 = tile(series.rechunk(1)) + + assert series2.shape == (10,) + assert len(series2.chunks) == 10 + pd.testing.assert_index_equal(series2.index_value.to_pandas(), pd.RangeIndex(10)) + assert not any(isinstance(c.op, DataFrameConcat) for c in series2.chunks) + + assert series2.chunk_shape == (10,) + assert series2.nsplits == ((1,) * 10,) + assert series2.chunks[0].shape == (1,) + pd.testing.assert_index_equal( + series2.chunks[0].index_value.to_pandas(), pd.RangeIndex(1) + ) + + # no need to rechunk + series2 = tile(series.rechunk(3)) + series = tile(series) + assert series2.chunk_shape == series.chunk_shape + assert series2.nsplits == series.nsplits + + +def test_dataframe_apply(): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + + old_chunk_store_limit = options.chunk_store_limit + try: + options.chunk_store_limit = 20 + + df = from_pandas_df(df_raw, chunk_size=5) + + def df_func_with_err(v): + assert len(v) > 2 + return v.sort_values() + + def df_series_func_with_err(v): + assert len(v) > 2 + return 0 + + with pytest.raises(TypeError): + df.apply(df_func_with_err) + + r = df.apply(df_func_with_err, output_type="dataframe", dtypes=df_raw.dtypes) + assert r.shape == (np.nan, df.shape[-1]) + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.dataframe + assert r.op.elementwise is False + + 
r = df.apply( + df_series_func_with_err, output_type="series", dtype=object, name="output" + ) + assert r.dtype == np.dtype("O") + assert r.shape == (df.shape[-1],) + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.series + assert r.op.elementwise is False + + r = df.apply("ffill") + assert r.op._op_type_ == opcodes.FILL_NA + + r = tile(df.apply(np.sqrt)) + assert all(v == np.dtype("float64") for v in r.dtypes) is True + assert r.shape == df.shape + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.dataframe + assert r.op.elementwise is True + + r = tile(df.apply(lambda x: pd.Series([1, 2]))) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (np.nan, df.shape[1]) + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (np.nan, 1) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(np.sum, axis="index")) + assert np.dtype("int64") == r.dtype + assert r.shape == (df.shape[1],) + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (20 // df.shape[0],) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(np.sum, axis="columns")) + assert np.dtype("int64") == r.dtype + assert r.shape == (df.shape[0],) + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (20 // df.shape[1],) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], np.nan) + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (20 // df.shape[1], np.nan) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(lambda x: [1, 2], axis=1, result_type="expand")) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], np.nan) + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (20 // df.shape[1], np.nan) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(lambda x: list(range(10)), axis=1, result_type="reduce")) + assert np.dtype("object") == r.dtype + assert r.shape == (df.shape[0],) + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (20 // df.shape[1],) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(lambda x: list(range(10)), axis=1, result_type="broadcast")) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], np.nan) + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (20 // df.shape[1], np.nan) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert 
r.op.elementwise is False + finally: + options.chunk_store_limit = old_chunk_store_limit + + raw = pd.DataFrame({"a": [np.array([1, 2, 3]), np.array([4, 5, 6])]}) + df = from_pandas_df(raw) + df2 = df.apply( + lambda x: x["a"].astype(pd.Series), + axis=1, + output_type="dataframe", + dtypes=pd.Series([np.dtype(float)] * 3), + ) + assert df2.ndim == 2 + + +def test_series_apply(): + idxes = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idxes) + + series = from_pandas_series(s_raw, chunk_size=5) + + r = tile(series.apply("add", args=(1,))) + assert r.op._op_type_ == opcodes.ADD + + r = tile(series.apply(np.sqrt)) + assert np.dtype("float64") == r.dtype + assert r.shape == series.shape + assert r.index_value is series.index_value + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (5,) + assert r.chunks[0].inputs[0].shape == (5,) + + r = tile(series.apply("sqrt")) + assert np.dtype("float64") == r.dtype + assert r.shape == series.shape + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (5,) + assert r.chunks[0].inputs[0].shape == (5,) + + r = tile(series.apply(lambda x: [x, x + 1], convert_dtype=False)) + assert np.dtype("object") == r.dtype + assert r.shape == series.shape + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (5,) + assert r.chunks[0].inputs[0].shape == (5,) + + s_raw2 = pd.Series([np.array([1, 2, 3]), np.array([4, 5, 6])]) + series = from_pandas_series(s_raw2) + + r = series.apply(np.sum) + assert r.dtype == np.dtype(object) + + r = series.apply(lambda x: pd.Series([1]), output_type="dataframe") + expected = s_raw2.apply(lambda x: pd.Series([1])) + pd.testing.assert_series_equal(r.dtypes, expected.dtypes) + + dtypes = pd.Series([np.dtype(float)] * 3) + r = series.apply(pd.Series, output_type="dataframe", dtypes=dtypes) + assert r.ndim == 2 + pd.testing.assert_series_equal(r.dtypes, dtypes) + assert r.shape == (2, 3) + + def apply_with_error(_): + raise ValueError + + r = series.apply(apply_with_error, output_type="dataframe", dtypes=dtypes) + assert r.ndim == 2 + + r = series.apply( + pd.Series, output_type="dataframe", dtypes=dtypes, index=pd.RangeIndex(2) + ) + assert r.ndim == 2 + pd.testing.assert_series_equal(r.dtypes, dtypes) + assert r.shape == (2, 3) + + with pytest.raises(AttributeError, match="abc"): + series.apply("abc") + + with pytest.raises(TypeError): + # dtypes not provided + series.apply(lambda x: x.tolist(), output_type="dataframe") + + +def test_transform(): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = from_pandas_df(df_raw, chunk_size=5) + + idxes = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idxes) + series = from_pandas_series(s_raw, chunk_size=5) + + def rename_fn(f, new_name): + f.__name__ = new_name + return f + + old_chunk_store_limit = options.chunk_store_limit + try: + options.chunk_store_limit = 20 + + # DATAFRAME CASES + + # test transform with infer failure + def transform_df_with_err(v): + assert len(v) > 2 + return v.sort_values() + + with pytest.raises(TypeError): + df.transform(transform_df_with_err) + + r = tile(df.transform(transform_df_with_err, dtypes=df_raw.dtypes)) + assert r.shape == df.shape + assert r.op._op_type_ == opcodes.TRANSFORM + assert 
r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (df.shape[0], 20 // df.shape[0]) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + # test transform scenarios on data frames + r = tile(df.transform(lambda x: list(range(len(x))))) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == df.shape + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (df.shape[0], 20 // df.shape[0]) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + r = tile(df.transform(lambda x: list(range(len(x))), axis=1)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == df.shape + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (20 // df.shape[1], df.shape[1]) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + r = tile(df.transform(["cumsum", "cummax", lambda x: x + 1])) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], df.shape[1] * 3) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (df.shape[0], 20 // df.shape[0] * 3) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + r = tile( + df.transform( + {"A": "cumsum", "D": ["cumsum", "cummax"], "F": lambda x: x + 1} + ) + ) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], 4) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (df.shape[0], 1) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + # test agg scenarios on series + r = tile(df.transform(lambda x: x.iloc[:-1], _call_agg=True)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (np.nan, df.shape[1]) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (np.nan, 1) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + r = tile(df.transform(lambda x: x.iloc[:-1], axis=1, _call_agg=True)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], np.nan) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (2, np.nan) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + fn_list = [ + rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1"), + lambda x: x.iloc[:-1].reset_index(drop=True), + ] + r = tile(df.transform(fn_list, _call_agg=True)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (np.nan, df.shape[1] * 2) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (np.nan, 2) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert 
r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + r = tile(df.transform(lambda x: x.sum(), _call_agg=True)) + assert r.dtype == np.dtype("int64") + assert r.shape == (df.shape[1],) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (20 // df.shape[0],) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + fn_dict = { + "A": rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1"), + "D": [ + rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1"), + lambda x: x.iloc[:-1].reset_index(drop=True), + ], + "F": lambda x: x.iloc[:-1].reset_index(drop=True), + } + r = tile(df.transform(fn_dict, _call_agg=True)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (np.nan, 4) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (np.nan, 1) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + # SERIES CASES + # test transform scenarios on series + r = tile(series.transform(lambda x: x + 1)) + assert np.dtype("int64") == r.dtype + assert r.shape == series.shape + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (5,) + assert r.chunks[0].inputs[0].shape == (5,) + finally: + options.chunk_store_limit = old_chunk_store_limit + + +def test_string_method(): + s = pd.Series(["a", "b", "c"], name="s") + series = from_pandas_series(s, chunk_size=2) + + with pytest.raises(AttributeError): + _ = series.str.non_exist + + r = series.str.contains("c") + assert r.dtype == np.bool_ + assert r.name == s.name + pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index) + assert r.shape == s.shape + + r = tile(r) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == np.bool_ + assert c.name == s.name + pd.testing.assert_index_equal( + c.index_value.to_pandas(), s.index[i * 2 : (i + 1) * 2] + ) + assert c.shape == (2,) if i == 0 else (1,) + + r = series.str.split(",", expand=True, n=1) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (3, 2) + pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index) + pd.testing.assert_index_equal(r.columns_value.to_pandas(), pd.RangeIndex(2)) + + r = tile(r) + for i, c in enumerate(r.chunks): + assert c.index == (i, 0) + pd.testing.assert_index_equal( + c.index_value.to_pandas(), s.index[i * 2 : (i + 1) * 2] + ) + pd.testing.assert_index_equal(c.columns_value.to_pandas(), pd.RangeIndex(2)) + assert c.shape == (2, 2) if i == 0 else (1, 2) + + with pytest.raises(TypeError): + _ = series.str.cat([["1", "2"]]) + + with pytest.raises(ValueError): + _ = series.str.cat(["1", "2"]) + + with pytest.raises(ValueError): + _ = series.str.cat(",") + + with pytest.raises(TypeError): + _ = series.str.cat({"1", "2", "3"}) + + r = series.str.cat(sep=",") + assert r.op.output_types[0] == OutputType.scalar + assert r.dtype == s.dtype + + r = tile(r) + assert len(r.chunks) == 1 + assert r.chunks[0].op.output_types[0] == OutputType.scalar + assert r.chunks[0].dtype == s.dtype + + r = series.str.extract(r"[ab](\d)", expand=False) + assert r.op.output_types[0] == OutputType.series + assert r.dtype == s.dtype + + r = tile(r) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == s.dtype + assert 
c.name == s.name + pd.testing.assert_index_equal( + c.index_value.to_pandas(), s.index[i * 2 : (i + 1) * 2] + ) + assert c.shape == (2,) if i == 0 else (1,) + + r = series.str.extract(r"[ab](\d)", expand=True) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (3, 1) + pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index) + pd.testing.assert_index_equal(r.columns_value.to_pandas(), pd.RangeIndex(1)) + + r = tile(r) + for i, c in enumerate(r.chunks): + assert c.index == (i, 0) + pd.testing.assert_index_equal( + c.index_value.to_pandas(), s.index[i * 2 : (i + 1) * 2] + ) + pd.testing.assert_index_equal(c.columns_value.to_pandas(), pd.RangeIndex(1)) + assert c.shape == (2, 1) if i == 0 else (1, 1) + + assert "lstrip" in dir(series.str) + + +def test_datetime_method(): + s = pd.Series( + [pd.Timestamp("2020-1-1"), pd.Timestamp("2020-2-1"), pd.Timestamp("2020-3-1")], + name="ss", + ) + series = from_pandas_series(s, chunk_size=2) + + r = series.dt.year + assert r.dtype == s.dt.year.dtype + pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index) + assert r.shape == s.shape + assert r.op.output_types[0] == OutputType.series + assert r.name == s.dt.year.name + + r = tile(r) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == s.dt.year.dtype + assert c.op.output_types[0] == OutputType.series + assert r.name == s.dt.year.name + pd.testing.assert_index_equal( + c.index_value.to_pandas(), s.index[i * 2 : (i + 1) * 2] + ) + assert c.shape == (2,) if i == 0 else (1,) + + with pytest.raises(AttributeError): + _ = from_pandas_series(pd.Series([1])).dt + with pytest.raises(AttributeError): + _ = series.dt.non_exist + + assert "ceil" in dir(series.dt) + + +def test_series_isin(): + # one chunk in multiple chunks + a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=10) + b = from_pandas_series(pd.Series([2, 1, 9, 3]), chunk_size=2) + + r = tile(a.isin(b)) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == np.dtype("bool") + assert c.shape == (10,) + assert len(c.op.inputs) == 2 + assert c.op.output_types[0] == OutputType.series + assert c.op.inputs[0].index == (i,) + assert c.op.inputs[0].shape == (10,) + assert c.op.inputs[1].index == (0,) + assert c.op.inputs[1].shape == (10,) + + # multiple chunk in one chunks + a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=5) + b = from_pandas_series(pd.Series([2, 1, 9, 3]), chunk_size=4) + + r = tile(a.isin(b)) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == np.dtype("bool") + assert c.shape == (5,) + assert len(c.op.inputs) == 2 + assert c.op.output_types[0] == OutputType.series + assert c.op.inputs[0].index == (i,) + assert c.op.inputs[0].shape == (5,) + assert c.op.inputs[1].index == (0,) + assert c.op.inputs[1].shape == (4,) + + # multiple chunk in multiple chunks + a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=5) + b = from_pandas_series(pd.Series([2, 1, 9, 3]), chunk_size=2) + + r = tile(a.isin(b)) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == np.dtype("bool") + assert c.shape == (5,) + assert len(c.op.inputs) == 2 + assert c.op.output_types[0] == OutputType.series + assert c.op.inputs[0].index == (i,) + assert c.op.inputs[0].shape == (5,) + assert c.op.inputs[1].index == (i,) + assert c.op.inputs[1].shape == (5,) + + with pytest.raises(TypeError): + _ = a.isin("sth") + + with pytest.raises(TypeError): + _ = 
a.to_frame().isin("sth") + + +def test_cut(): + s = from_pandas_series(pd.Series([1.0, 2.0, 3.0, 4.0]), chunk_size=2) + + with pytest.raises(ValueError): + _ = cut(s, -1) + + with pytest.raises(ValueError): + _ = cut([[1, 2], [3, 4]], 3) + + with pytest.raises(ValueError): + _ = cut([], 3) + + r, b = cut(s, [1.5, 2.5], retbins=True) + assert isinstance(r, SERIES_TYPE) + assert isinstance(b, TENSOR_TYPE) + + r = tile(r) + + assert len(r.chunks) == 2 + for c in r.chunks: + assert isinstance(c, SERIES_CHUNK_TYPE) + assert c.shape == (2,) + + r = cut(s.to_tensor(), [1.5, 2.5]) + assert isinstance(r, CATEGORICAL_TYPE) + assert len(r) == len(s) + assert "Categorical" in repr(r) + + r = tile(r) + + assert len(r.chunks) == 2 + for c in r.chunks: + assert isinstance(c, CATEGORICAL_CHUNK_TYPE) + assert c.shape == (2,) + assert c.ndim == 1 + + r = cut([0, 1, 1, 2], bins=4, labels=False) + assert isinstance(r, TENSOR_TYPE) + e = pd.cut([0, 1, 1, 2], bins=4, labels=False) + assert r.dtype == e.dtype + + +def test_transpose(): + s = pd.DataFrame({"a": [1, 2, 3], "b": ["5", "-6", "7"], "c": [1, 2, 3]}) + df = from_pandas_df(s, chunk_size=2) + + r = tile(df.transpose()) + assert len(r.chunks) == 4 + assert isinstance(r, DATAFRAME_TYPE) + + r = tile(df.T) + assert len(r.chunks) == 4 + assert isinstance(r, DATAFRAME_TYPE) + + +def test_to_numeric(): + raw = pd.DataFrame({"a": [1.0, 2, 3, -3]}) + df = from_pandas_df(raw, chunk_size=2) + + with pytest.raises(ValueError): + _ = to_numeric(df) + + with pytest.raises(ValueError): + _ = to_numeric([["1.0", 1]]) + + with pytest.raises(ValueError): + _ = to_numeric([]) + + s = from_pandas_series(pd.Series(["1.0", "2.0", 1, -2]), chunk_size=2) + r = tile(to_numeric(s)) + assert len(r.chunks) == 2 + assert isinstance(r, SERIES_TYPE) + + r = tile(to_numeric(["1.0", "2.0", 1, -2])) + assert isinstance(r, TENSOR_TYPE) + + +def test_astype(): + s = from_pandas_series(pd.Series([1, 2, 1, 2], name="a"), chunk_size=2) + with pytest.raises(KeyError): + astype(s, {"b": "str"}) + + df = from_pandas_df( + pd.DataFrame({"a": [1, 2, 1, 2], "b": ["a", "b", "a", "b"]}), chunk_size=2 + ) + + with pytest.raises(KeyError): + astype(df, {"c": "str", "a": "str"}) + + +def test_get_dummies(): + raw = pd.DataFrame( + { + "a": [1.1, 2.1, 3.1], + "b": ["5", "-6", "-7"], + "c": [1, 2, 3], + "d": ["2", "3", "4"], + } + ) + df = from_pandas_df(raw, chunk_size=2) + + with pytest.raises(TypeError): + _ = get_dummies(df, columns="a") + + with pytest.raises(ValueError): + _ = get_dummies(df, prefix=["col1"]) + + with pytest.raises(ValueError): + _ = get_dummies(df, columns=["a"], prefix={"a": "col1", "c": "col2"}) + + with pytest.raises(KeyError): + _ = get_dummies(df, columns=["a", "b"], prefix={"a": "col1", "c": "col2"}) + + r = get_dummies(df) + assert isinstance(r, DATAFRAME_TYPE) + + +def test_drop(): + # test dataframe drop + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 8)), columns=["c" + str(i + 1) for i in range(8)] + ) + + df = from_pandas_df(raw, chunk_size=8) + + with pytest.raises(KeyError): + df.drop(columns=["c9"]) + with pytest.raises(NotImplementedError): + df.drop(columns=from_pandas_series(pd.Series(["c9"]))) + + r = df.drop(columns=["c1"]) + pd.testing.assert_index_equal(r.index_value.to_pandas(), raw.index) + + tiled = tile(r) + start = 0 + for c in tiled.chunks: + raw_index = raw.index[start : start + c.shape[0]] + start += c.shape[0] + pd.testing.assert_index_equal(raw_index, c.index_value.to_pandas()) + + df = from_pandas_df(raw, 
chunk_size=3) + + columns = ["c2", "c4", "c5", "c6"] + index = [3, 6, 7] + r = df.drop(columns=columns, index=index) + assert isinstance(r, DATAFRAME_TYPE) + + # test series drop + raw = pd.Series(rs.randint(1000, size=(20,))) + series = from_pandas_series(raw, chunk_size=3) + + r = series.drop(index=index) + assert isinstance(r, SERIES_TYPE) + + # test index drop + ser = pd.Series(range(20)) + rs.shuffle(ser) + raw = pd.Index(ser) + + idx = from_pandas_index(raw) + + r = idx.drop(index) + assert isinstance(r, INDEX_TYPE) + + +def test_drop_duplicates(): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 7)), columns=["c" + str(i + 1) for i in range(7)] + ) + raw["c7"] = [f"s{j}" for j in range(20)] + + df = from_pandas_df(raw, chunk_size=10) + with pytest.raises(ValueError): + df.drop_duplicates(method="unknown") + with pytest.raises(KeyError): + df.drop_duplicates(subset="c8") + + # test auto method selection + assert tile(df.drop_duplicates()).chunks[0].op.method == "tree" + # subset size less than chunk_store_limit + assert ( + tile(df.drop_duplicates(subset=["c1", "c3"])).chunks[0].op.method + == "subset_tree" + ) + with option_context({"chunk_store_limit": 5}): + # subset size greater than chunk_store_limit + assert ( + tile(df.drop_duplicates(subset=["c1", "c3"])).chunks[0].op.method == "tree" + ) + assert tile(df.drop_duplicates(subset=["c1", "c7"])).chunks[0].op.method == "tree" + assert tile(df["c7"].drop_duplicates()).chunks[0].op.method == "tree" + + s = df["c7"] + with pytest.raises(ValueError): + s.drop_duplicates(method="unknown") + + +def test_memory_usage(): + dtypes = ["int64", "float64", "complex128", "object", "bool"] + data = dict([(t, np.ones(shape=500).astype(t)) for t in dtypes]) + raw = pd.DataFrame(data) + + df = from_pandas_df(raw, chunk_size=(500, 2)) + r = tile(df.memory_usage()) + + assert isinstance(r, SERIES_TYPE) + assert r.shape == (6,) + assert len(r.chunks) == 3 + assert r.chunks[0].op.stage is None + + df = from_pandas_df(raw, chunk_size=(100, 3)) + r = tile(df.memory_usage(index=True)) + + assert isinstance(r, SERIES_TYPE) + assert r.shape == (6,) + assert len(r.chunks) == 2 + assert r.chunks[0].op.stage == OperandStage.reduce + + r = tile(df.memory_usage(index=False)) + + assert isinstance(r, SERIES_TYPE) + assert r.shape == (5,) + assert len(r.chunks) == 2 + assert r.chunks[0].op.stage == OperandStage.reduce + + raw = pd.Series(np.ones(shape=500).astype("object"), name="s") + + series = from_pandas_series(raw) + r = tile(series.memory_usage()) + + assert isinstance(r, TENSOR_TYPE) + assert r.shape == () + assert len(r.chunks) == 1 + assert r.chunks[0].op.stage is None + + series = from_pandas_series(raw, chunk_size=100) + r = tile(series.memory_usage()) + + assert isinstance(r, TENSOR_TYPE) + assert r.shape == () + assert len(r.chunks) == 1 + assert r.chunks[0].op.stage == OperandStage.reduce + + +def test_shift(): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(10, 8)), + columns=["col" + str(i + 1) for i in range(8)], + index=pd.date_range("2021-1-1", periods=10), + ) + df = from_pandas_df(raw, chunk_size=5) + + df2 = df.shift(1) + df2 = tile(df2) + + for c in df2.chunks: + pd.testing.assert_index_equal(c.dtypes.index, c.columns_value.to_pandas()) + + df2 = df.shift(1, freq="D") + df2 = tile(df2) + + for c in df2.chunks: + pd.testing.assert_index_equal(c.dtypes.index, c.columns_value.to_pandas()) + + +def test_eval_query(): + rs = np.random.RandomState(0) + raw = pd.DataFrame({"a": 
rs.rand(100), "b": rs.rand(100), "c c": rs.rand(100)}) + df = from_pandas_df(raw, chunk_size=(10, 2)) + + with pytest.raises(NotImplementedError): + mars_eval("df.a * 2", engine="numexpr") + with pytest.raises(NotImplementedError): + mars_eval("df.a * 2", parser="pandas") + with pytest.raises(TypeError): + df.eval(df) + with pytest.raises(SyntaxError): + df.query( + """ + a + b + a + `c c` + """ + ) + with pytest.raises(SyntaxError): + df.eval( + """ + def a(): + return v + a() + """ + ) + with pytest.raises(SyntaxError): + df.eval("a + `c") + with pytest.raises(KeyError): + df.eval("a + c") + with pytest.raises(ValueError): + df.eval("p, q = a + c") + with pytest.raises(ValueError): + df.query("p = a + c") + + +def test_empty(): + # for DataFrame + assert from_pandas_df(pd.DataFrame()).empty == pd.DataFrame().empty + assert from_pandas_df(pd.DataFrame({})).empty == pd.DataFrame({}).empty + assert ( + from_pandas_df(pd.DataFrame({"a": []})).empty == pd.DataFrame({"a": []}).empty + ) + assert ( + from_pandas_df(pd.DataFrame({"a": [1]})).empty == pd.DataFrame({"a": [1]}).empty + ) + assert ( + from_pandas_df(pd.DataFrame({"a": [1], "b": [2]})).empty + == pd.DataFrame({"a": [1], "b": [2]}).empty + ) + assert ( + from_pandas_df(pd.DataFrame(np.empty(shape=(4, 0)))).empty + == pd.DataFrame(np.empty(shape=(4, 0))).empty + ) + + # for Series + assert from_pandas_series(pd.Series()).empty == pd.Series().empty + assert from_pandas_series(pd.Series({})).empty == pd.Series({}).empty + assert from_pandas_series(pd.Series({"a": []})).empty == pd.Series({"a": []}).empty + assert ( + from_pandas_series(pd.Series({"a": [1]})).empty == pd.Series({"a": [1]}).empty + ) + + # Maybe fail due to lazy evaluation + with pytest.raises(ValueError): + a = from_pandas_df(pd.DataFrame(np.random.rand(10, 2))) + assert a[a > 0].empty + with pytest.raises(ValueError): + a = from_pandas_series(pd.Series(np.random.rand(10))) + assert a[a > 0].empty diff --git a/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py b/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py new file mode 100644 index 000000000..ae94cfec2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py @@ -0,0 +1,2470 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +from collections import OrderedDict + +import mars +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from ....config import option_context, options +from ....dataframe import DataFrame, Series +from ....tensor import arange, tensor +from ....tensor.random import rand +from ....tests.core import require_cudf +from ....utils import lazy_import, no_default, pd_release_version +from ... import cut +from ... import eval as mars_eval +from ... 
import get_dummies, qcut +from ...core import DATAFRAME_OR_SERIES_TYPE +from ...datasource.dataframe import from_pandas as from_pandas_df +from ...datasource.index import from_pandas as from_pandas_index +from ...datasource.series import from_pandas as from_pandas_series +from .. import to_cpu, to_gpu +from ..bloom_filter import filter_by_bloom_filter +from ..rebalance import DataFrameRebalance +from ..shift import _enable_no_default, _with_column_freq_bug +from ..to_numeric import to_numeric + +pytestmark = pytest.mark.pd_compat + +cudf = lazy_import("cudf") + +_explode_with_ignore_index = pd_release_version[:2] >= (1, 1) +_interval_range_closed_arg = pd_release_version[:2] >= (1, 5) + + +@require_cudf +def test_to_gpu_execution(setup_gpu): + pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1)) + df = from_pandas_df(pdf, chunk_size=(13, 21)) + cdf = to_gpu(df) + + res = cdf.execute().fetch() + assert isinstance(res, cudf.DataFrame) + pd.testing.assert_frame_equal(res.to_pandas(), pdf) + + pseries = pdf.iloc[:, 0] + series = from_pandas_series(pseries) + cseries = series.to_gpu() + + res = cseries.execute().fetch() + assert isinstance(res, cudf.Series) + pd.testing.assert_series_equal(res.to_pandas(), pseries) + + +@require_cudf +def test_to_cpu_execution(setup_gpu): + pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1)) + df = from_pandas_df(pdf, chunk_size=(13, 21)) + cdf = to_gpu(df) + df2 = to_cpu(cdf) + + res = df2.execute().fetch() + assert isinstance(res, pd.DataFrame) + pd.testing.assert_frame_equal(res, pdf) + + pseries = pdf.iloc[:, 0] + series = from_pandas_series(pseries, chunk_size=(13, 21)) + cseries = to_gpu(series) + series2 = to_cpu(cseries) + + res = series2.execute().fetch() + assert isinstance(res, pd.Series) + pd.testing.assert_series_equal(res, pseries) + + +def test_rechunk_execution(setup): + ns = np.random.RandomState(0) + df = pd.DataFrame(ns.rand(100, 10), columns=["a" + str(i) for i in range(10)]) + + # test rechunk after sort + mdf = DataFrame(df, chunk_size=10) + result = mdf.sort_values("a0").rechunk(chunk_size=10).execute().fetch() + expected = df.sort_values("a0") + pd.testing.assert_frame_equal(result, expected) + + data = pd.DataFrame(np.random.rand(8, 10)) + df = from_pandas_df(pd.DataFrame(data), chunk_size=3) + df2 = df.rechunk((3, 4)) + res = df2.execute().fetch() + pd.testing.assert_frame_equal(data, res) + + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + df = from_pandas_df(data) + df2 = df.rechunk(5) + res = df2.execute().fetch() + pd.testing.assert_frame_equal(data, res) + + # test Series rechunk execution. 
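+ # rechunk only re-partitions the tiled chunks; fetching should reproduce the original pandas object for any valid chunk_size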
+ data = pd.Series(np.random.rand(10)) + series = from_pandas_series(data) + series2 = series.rechunk(3) + res = series2.execute().fetch() + pd.testing.assert_series_equal(data, res) + + series2 = series.rechunk(1) + res = series2.execute().fetch() + pd.testing.assert_series_equal(data, res) + + # test index rechunk execution + data = pd.Index(np.random.rand(10)) + index = from_pandas_index(data) + index2 = index.rechunk(3) + res = index2.execute().fetch() + pd.testing.assert_index_equal(data, res) + + index2 = index.rechunk(1) + res = index2.execute().fetch() + pd.testing.assert_index_equal(data, res) + + # test rechunk on mixed typed columns + data = pd.DataFrame({0: [1, 2], 1: [3, 4], "a": [5, 6]}) + df = from_pandas_df(data) + df = df.rechunk((2, 2)).rechunk({1: 3}) + res = df.execute().fetch() + pd.testing.assert_frame_equal(data, res) + + +def test_series_map_execution(setup): + raw = pd.Series(np.arange(10)) + s = from_pandas_series(raw, chunk_size=7) + + with pytest.raises(ValueError): + # cannot infer dtype, the inferred is int, + # but actually it is float + # just due to nan + s.map({5: 10}) + + r = s.map({5: 10}, dtype=float) + result = r.execute().fetch() + expected = raw.map({5: 10}) + pd.testing.assert_series_equal(result, expected) + + # use skip_infer when infer failed + r = s.map({5: 10}, skip_infer=True) + assert r.dtype is None + result = r.execute().fetch() + assert np.issubdtype(r.dtype, np.dtype("float")) + expected = raw.map({5: 10}) + pd.testing.assert_series_equal(result, expected) + + r = s.map({i: 10 + i for i in range(7)}, dtype=float) + result = r.execute().fetch() + expected = raw.map({i: 10 + i for i in range(7)}) + pd.testing.assert_series_equal(result, expected) + + r = s.map({5: 10}, dtype=float, na_action="ignore") + result = r.execute().fetch() + expected = raw.map({5: 10}, na_action="ignore") + pd.testing.assert_series_equal(result, expected) + + # dtype can be inferred + r = s.map({5: 10.0}) + result = r.execute().fetch() + expected = raw.map({5: 10.0}) + pd.testing.assert_series_equal(result, expected) + + r = s.map(lambda x: x + 1, dtype=int) + result = r.execute().fetch() + expected = raw.map(lambda x: x + 1) + pd.testing.assert_series_equal(result, expected) + + def f(x: int) -> float: + return x + 1.0 + + # dtype can be inferred for function + r = s.map(f) + result = r.execute().fetch() + expected = raw.map(lambda x: x + 1.0) + pd.testing.assert_series_equal(result, expected) + + def f(x: int): + return x + 1.0 + + # dtype can be inferred for function + r = s.map(f) + result = r.execute().fetch() + expected = raw.map(lambda x: x + 1.0) + pd.testing.assert_series_equal(result, expected) + + # test arg is a md.Series + raw2 = pd.Series([10], index=[5]) + s2 = from_pandas_series(raw2) + + r = s.map(s2, dtype=float) + result = r.execute().fetch() + expected = raw.map(raw2) + pd.testing.assert_series_equal(result, expected) + + # test arg is a md.Series, and dtype can be inferred + raw2 = pd.Series([10.0], index=[5]) + s2 = from_pandas_series(raw2) + + r = s.map(s2) + result = r.execute().fetch() + expected = raw.map(raw2) + pd.testing.assert_series_equal(result, expected) + + # test str + raw = pd.Series(["a", "b", "c", "d"]) + s = from_pandas_series(raw, chunk_size=2) + + r = s.map({"c": "e"}) + result = r.execute().fetch() + expected = raw.map({"c": "e"}) + pd.testing.assert_series_equal(result, expected) + + # test map index + raw = pd.Index(np.random.rand(7)) + idx = from_pandas_index(pd.Index(raw), chunk_size=2) + r = idx.map(f) + result = 
r.execute().fetch() + expected = raw.map(lambda x: x + 1.0) + pd.testing.assert_index_equal(result, expected) + + +def test_describe_execution(setup): + s_raw = pd.Series(np.random.rand(10)) + + # test one chunk + series = from_pandas_series(s_raw, chunk_size=10) + + r = series.describe() + result = r.execute().fetch() + expected = s_raw.describe() + pd.testing.assert_series_equal(result, expected) + + r = series.describe(percentiles=[]) + result = r.execute().fetch() + expected = s_raw.describe(percentiles=[]) + pd.testing.assert_series_equal(result, expected) + + # test multi chunks + series = from_pandas_series(s_raw, chunk_size=3) + + r = series.describe() + result = r.execute().fetch() + expected = s_raw.describe() + pd.testing.assert_series_equal(result, expected) + + r = series.describe(percentiles=[]) + result = r.execute().fetch() + expected = s_raw.describe(percentiles=[]) + pd.testing.assert_series_equal(result, expected) + + rs = np.random.RandomState(5) + df_raw = pd.DataFrame(rs.rand(10, 4), columns=list("abcd")) + df_raw["e"] = rs.randint(100, size=10) + + # test one chunk + df = from_pandas_df(df_raw, chunk_size=10) + + r = df.describe() + result = r.execute().fetch() + expected = df_raw.describe() + pd.testing.assert_frame_equal(result, expected) + + r = series.describe(percentiles=[], include=np.float64) + result = r.execute().fetch() + expected = s_raw.describe(percentiles=[], include=np.float64) + pd.testing.assert_series_equal(result, expected) + + # test multi chunks + df = from_pandas_df(df_raw, chunk_size=3) + + r = df.describe() + result = r.execute().fetch() + expected = df_raw.describe() + pd.testing.assert_frame_equal(result, expected) + + r = df.describe(percentiles=[], include=np.float64) + result = r.execute().fetch() + expected = df_raw.describe(percentiles=[], include=np.float64) + pd.testing.assert_frame_equal(result, expected) + + # test skip percentiles + r = df.describe(percentiles=False, include=np.float64) + result = r.execute().fetch() + expected = df_raw.describe(percentiles=[], include=np.float64) + expected.drop(["50%"], axis=0, inplace=True) + pd.testing.assert_frame_equal(result, expected) + + with pytest.raises(ValueError): + df.describe(percentiles=[1.1]) + + with pytest.raises(ValueError): + # duplicated values + df.describe(percentiles=[0.3, 0.5, 0.3]) + + # test input dataframe which has unknown shape + df = from_pandas_df(df_raw, chunk_size=3) + df2 = df[df["a"] < 0.5] + r = df2.describe() + + result = r.execute().fetch() + expected = df_raw[df_raw["a"] < 0.5].describe() + pd.testing.assert_frame_equal(result, expected) + + +def test_data_frame_apply_execute(setup): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + + old_chunk_store_limit = options.chunk_store_limit + try: + options.chunk_store_limit = 20 + + df = from_pandas_df(df_raw, chunk_size=5) + + r = df.apply("ffill") + result = r.execute().fetch() + expected = df_raw.apply("ffill") + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(["sum", "max"]) + result = r.execute().fetch() + expected = df_raw.apply(["sum", "max"]) + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(["sum", "max"], axis=1) + result = r.execute().fetch() + expected = df_raw.apply(["sum", "max"], axis=1) + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(np.sqrt) + result = r.execute().fetch() + expected = df_raw.apply(np.sqrt) + pd.testing.assert_frame_equal(result, expected) + + r = 
df.apply(lambda x: pd.Series([1, 2])) + result = r.execute().fetch() + expected = df_raw.apply(lambda x: pd.Series([1, 2])) + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(np.sum, axis="index") + result = r.execute().fetch() + expected = df_raw.apply(np.sum, axis="index") + pd.testing.assert_series_equal(result, expected) + + r = df.apply(np.sum, axis="columns") + result = r.execute().fetch() + expected = df_raw.apply(np.sum, axis="columns") + pd.testing.assert_series_equal(result, expected) + + r = df.apply(lambda x: [1, 2], axis=1) + result = r.execute().fetch() + expected = df_raw.apply(lambda x: [1, 2], axis=1) + pd.testing.assert_series_equal(result, expected) + + r = df.apply(lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1) + result = r.execute().fetch() + expected = df_raw.apply( + lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1 + ) + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(lambda x: [1, 2], axis=1, result_type="expand") + result = r.execute().fetch() + expected = df_raw.apply(lambda x: [1, 2], axis=1, result_type="expand") + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(lambda x: list(range(10)), axis=1, result_type="reduce") + result = r.execute().fetch() + expected = df_raw.apply(lambda x: list(range(10)), axis=1, result_type="reduce") + pd.testing.assert_series_equal(result, expected) + + r = df.apply(lambda x: list(range(10)), axis=1, result_type="broadcast") + result = r.execute().fetch() + expected = df_raw.apply( + lambda x: list(range(10)), axis=1, result_type="broadcast" + ) + pd.testing.assert_frame_equal(result, expected) + finally: + options.chunk_store_limit = old_chunk_store_limit + + +def test_data_frame_apply_closure_execute(setup): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = from_pandas_df(df_raw, chunk_size=5) + + x = pd.Series([i for i in range(10**4)]) + y = pd.Series([i for i in range(10**4)]) + + def closure(z): + return pd.concat([x, y], ignore_index=True) + + r = df.apply(closure, axis=1) + result = r.execute().fetch() + expected = df_raw.apply(closure, axis=1) + pd.testing.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("multiplier", [1, 3, 4]) +def test_data_frame_apply_callable_execute(setup, multiplier): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = from_pandas_df(df_raw, chunk_size=5) + + class callable_df: + __slots__ = "x", "__dict__" + + def __init__(self, multiplier: int = 1): + self.x = pd.Series([i for i in range(10**multiplier)]) + self.y = pd.Series([i for i in range(10**multiplier)]) + + def __call__(self, pdf): + return pd.concat([self.x, self.y], ignore_index=True) + + cdf_large = callable_df(multiplier=multiplier) + r = df.apply(cdf_large, axis=1) + result = r.execute().fetch() + expected = df_raw.apply(cdf_large, axis=1) + pd.testing.assert_frame_equal(result, expected) + + +def test_series_apply_execute(setup): + idxes = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idxes) + + series = from_pandas_series(s_raw, chunk_size=5) + + r = series.apply("add", args=(1,)) + result = r.execute().fetch() + expected = s_raw.apply("add", args=(1,)) + pd.testing.assert_series_equal(result, expected) + + r = series.apply(["sum", "max"]) + result = r.execute().fetch() + expected = s_raw.apply(["sum", "max"]) + 
pd.testing.assert_series_equal(result, expected) + + r = series.apply(np.sqrt) + result = r.execute().fetch() + expected = s_raw.apply(np.sqrt) + pd.testing.assert_series_equal(result, expected) + + r = series.apply("sqrt") + result = r.execute().fetch() + expected = s_raw.apply("sqrt") + pd.testing.assert_series_equal(result, expected) + + r = series.apply(lambda x: [x, x + 1], convert_dtype=False) + result = r.execute().fetch() + expected = s_raw.apply(lambda x: [x, x + 1], convert_dtype=False) + pd.testing.assert_series_equal(result, expected) + + s_raw2 = pd.Series([np.array([1, 2, 3]), np.array([4, 5, 6])]) + series = from_pandas_series(s_raw2) + + dtypes = pd.Series([np.dtype(float)] * 3) + r = series.apply(pd.Series, output_type="dataframe", dtypes=dtypes) + result = r.execute().fetch() + expected = s_raw2.apply(pd.Series) + pd.testing.assert_frame_equal(result, expected) + + +def test_series_apply_closure_execute(setup): + idxes = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idxes) + + series = from_pandas_series(s_raw, chunk_size=5) + + x, y = 1, 2 + + def closure(z): + return [z + x, z + y] + + r = series.apply(closure, convert_dtype=False) + result = r.execute().fetch() + expected = s_raw.apply(closure, convert_dtype=False) + pd.testing.assert_series_equal(result, expected) + + class callable_series: + __slots__ = "x", "__dict__" + + def __init__(self): + self.x = 1 + self.y = 2 + + def __call__(self, z): + return [z + self.x, z + self.y] + + cs = callable_series() + r = series.apply(cs, convert_dtype=False) + result = r.execute().fetch() + expected = s_raw.apply(cs, convert_dtype=False) + pd.testing.assert_series_equal(result, expected) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_apply_with_arrow_dtype_execution(setup): + df1 = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]}) + df = from_pandas_df(df1) + df["b"] = df["b"].astype("Arrow[string]") + + r = df.apply(lambda row: str(row[0]) + row[1], axis=1) + result = r.execute().fetch() + expected = df1.apply(lambda row: str(row[0]) + row[1], axis=1) + pd.testing.assert_series_equal(result, expected) + + s1 = df1["b"] + s = from_pandas_series(s1) + s = s.astype("arrow_string") + + r = s.apply(lambda x: x + "_suffix") + result = r.execute().fetch() + expected = s1.apply(lambda x: x + "_suffix") + pd.testing.assert_series_equal(result, expected) + + +def test_transform_execute(setup): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + + idx_vals = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idx_vals) + + def rename_fn(f, new_name): + f.__name__ = new_name + return f + + old_chunk_store_limit = options.chunk_store_limit + try: + options.chunk_store_limit = 20 + + # DATAFRAME CASES + df = from_pandas_df(df_raw, chunk_size=5) + + # test transform scenarios on data frames + def f(s): + if s[2] > 0: + return s + else: + return pd.Series([s[2]] * len(s)) + + with pytest.raises(TypeError): + df.transform(f) + r = df.transform(f, skip_infer=True) + result = r.execute().fetch() + expected = df_raw.transform(f) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(lambda x: list(range(len(x)))) + result = r.execute().fetch() + expected = df_raw.transform(lambda x: list(range(len(x)))) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(lambda x: list(range(len(x))), axis=1) + result = 
r.execute().fetch() + expected = df_raw.transform(lambda x: list(range(len(x))), axis=1) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(["cumsum", "cummax", lambda x: x + 1]) + result = r.execute().fetch() + expected = df_raw.transform(["cumsum", "cummax", lambda x: x + 1]) + pd.testing.assert_frame_equal(result, expected) + + fn_dict = OrderedDict( + [ + ("A", "cumsum"), + ("D", ["cumsum", "cummax"]), + ("F", lambda x: x + 1), + ] + ) + r = df.transform(fn_dict) + result = r.execute().fetch() + expected = df_raw.transform(fn_dict) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(lambda x: x.iloc[:-1], _call_agg=True) + result = r.execute().fetch() + expected = df_raw.agg(lambda x: x.iloc[:-1]) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(lambda x: x.iloc[:-1], axis=1, _call_agg=True) + result = r.execute().fetch() + expected = df_raw.agg(lambda x: x.iloc[:-1], axis=1) + pd.testing.assert_frame_equal(result, expected) + + fn_list = [ + rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1"), + lambda x: x.iloc[:-1].reset_index(drop=True), + ] + r = df.transform(fn_list, _call_agg=True) + result = r.execute().fetch() + expected = df_raw.agg(fn_list) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(lambda x: x.sum(), _call_agg=True) + result = r.execute().fetch() + expected = df_raw.agg(lambda x: x.sum()) + pd.testing.assert_series_equal(result, expected) + + fn_dict = OrderedDict( + [ + ("A", rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1")), + ( + "D", + [ + rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1"), + lambda x: x.iloc[:-1].reset_index(drop=True), + ], + ), + ("F", lambda x: x.iloc[:-1].reset_index(drop=True)), + ] + ) + r = df.transform(fn_dict, _call_agg=True) + result = r.execute().fetch() + expected = df_raw.agg(fn_dict) + pd.testing.assert_frame_equal(result, expected) + + # SERIES CASES + series = from_pandas_series(s_raw, chunk_size=5) + + # test transform scenarios on series + r = series.transform(lambda x: x + 1) + result = r.execute().fetch() + expected = s_raw.transform(lambda x: x + 1) + pd.testing.assert_series_equal(result, expected) + + r = series.transform(["cumsum", lambda x: x + 1]) + result = r.execute().fetch() + expected = s_raw.transform(["cumsum", lambda x: x + 1]) + pd.testing.assert_frame_equal(result, expected) + + # test transform on string dtype + df_raw = pd.DataFrame({"col1": ["str"] * 10, "col2": ["string"] * 10}) + df = from_pandas_df(df_raw, chunk_size=3) + + r = df["col1"].transform(lambda x: x + "_suffix") + result = r.execute().fetch() + expected = df_raw["col1"].transform(lambda x: x + "_suffix") + pd.testing.assert_series_equal(result, expected) + + r = df.transform(lambda x: x + "_suffix") + result = r.execute().fetch() + expected = df_raw.transform(lambda x: x + "_suffix") + pd.testing.assert_frame_equal(result, expected) + + r = df["col2"].transform(lambda x: x + "_suffix", dtype=np.dtype("str")) + result = r.execute().fetch() + expected = df_raw["col2"].transform(lambda x: x + "_suffix") + pd.testing.assert_series_equal(result, expected) + finally: + options.chunk_store_limit = old_chunk_store_limit + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_transform_with_arrow_dtype_execution(setup): + raw = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]}) + df = from_pandas_df(raw) + df["b"] = df["b"].astype("Arrow[string]") + + r = df.transform({"b": lambda x: x + "_suffix"}) + result = 
r.execute().fetch() + result["b"] = result["b"].to_numpy() + expected = raw.transform({"b": lambda x: x + "_suffix"}) + pd.testing.assert_frame_equal(result, expected) + + s1 = raw["b"] + s = from_pandas_series(s1) + s = s.astype("arrow_string") + + r = s.transform(lambda x: x + "_suffix") + result = r.execute().fetch() + result = pd.Series(result.to_numpy(), name=result.name, index=result.index) + expected = s1.transform(lambda x: x + "_suffix") + pd.testing.assert_series_equal(result, expected) + + +def test_string_method_execution(setup): + s = pd.Series(["s1,s2", "ef,", "dd", np.nan]) + s2 = pd.concat([s, s, s]) + + series = from_pandas_series(s, chunk_size=2) + series2 = from_pandas_series(s2, chunk_size=2) + + # test getitem + r = series.str[:3] + result = r.execute().fetch() + expected = s.str[:3] + pd.testing.assert_series_equal(result, expected) + + # test split, expand=False + r = series.str.split(",", n=2) + result = r.execute().fetch() + expected = s.str.split(",", n=2) + pd.testing.assert_series_equal(result, expected) + + # test split, expand=True + r = series.str.split(",", expand=True, n=1) + result = r.execute().fetch() + expected = s.str.split(",", expand=True, n=1) + pd.testing.assert_frame_equal(result, expected) + + # test rsplit + r = series.str.rsplit(",", expand=True, n=1) + result = r.execute().fetch() + expected = s.str.rsplit(",", expand=True, n=1) + pd.testing.assert_frame_equal(result, expected) + + # test cat all data + r = series2.str.cat(sep="/", na_rep="e") + result = r.execute().fetch() + expected = s2.str.cat(sep="/", na_rep="e") + assert result == expected + + # test cat list + r = series.str.cat(["a", "b", np.nan, "c"]) + result = r.execute().fetch() + expected = s.str.cat(["a", "b", np.nan, "c"]) + pd.testing.assert_series_equal(result, expected) + + # test cat series + r = series.str.cat(series.str.capitalize(), join="outer") + result = r.execute().fetch() + expected = s.str.cat(s.str.capitalize(), join="outer") + pd.testing.assert_series_equal(result, expected) + + # test extractall + r = series.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)") + result = r.execute().fetch() + expected = s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)") + pd.testing.assert_frame_equal(result, expected) + + # test extract, expand=False + r = series.str.extract(r"[ab](\d)", expand=False) + result = r.execute().fetch() + expected = s.str.extract(r"[ab](\d)", expand=False) + pd.testing.assert_series_equal(result, expected) + + # test extract, expand=True + r = series.str.extract(r"[ab](\d)", expand=True) + result = r.execute().fetch() + expected = s.str.extract(r"[ab](\d)", expand=True) + pd.testing.assert_frame_equal(result, expected) + + +def test_datetime_method_execution(setup): + # test datetime + s = pd.Series([pd.Timestamp("2020-1-1"), pd.Timestamp("2020-2-1"), np.nan]) + series = from_pandas_series(s, chunk_size=2) + + r = series.dt.year + result = r.execute().fetch() + expected = s.dt.year + pd.testing.assert_series_equal(result, expected) + + r = series.dt.strftime("%m-%d-%Y") + result = r.execute().fetch() + expected = s.dt.strftime("%m-%d-%Y") + pd.testing.assert_series_equal(result, expected) + + # test timedelta + s = pd.Series([pd.Timedelta("1 days"), pd.Timedelta("3 days"), np.nan]) + series = from_pandas_series(s, chunk_size=2) + + r = series.dt.days + result = r.execute().fetch() + expected = s.dt.days + pd.testing.assert_series_equal(result, expected) + + +def test_isin_execution(setup): + # one chunk in multiple chunks + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = 
pd.Series([2, 1, 9, 3]) + sa = from_pandas_series(a, chunk_size=10) + sb = from_pandas_series(b, chunk_size=2) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + # multiple chunk in one chunks + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = pd.Series([2, 1, 9, 3]) + sa = from_pandas_series(a, chunk_size=2) + sb = from_pandas_series(b, chunk_size=4) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + # multiple chunk in multiple chunks + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = pd.Series([2, 1, 9, 3] * 2) + sa = from_pandas_series(a, chunk_size=2) + sb = from_pandas_series(b, chunk_size=2) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = pd.Series([2, 1, 9, 3] * 3) + sa = from_pandas_series(a, chunk_size=5) + sb = from_pandas_series(b, chunk_size=2) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = pd.Series([2, 1, 9, 3]) + sa = from_pandas_series(a, chunk_size=2) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = np.array([2, 1, 9, 3] * 5) + sa = from_pandas_series(a, chunk_size=5) + sb = tensor(b, chunk_size=4) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = {2, 1, 9, 3} # set + sa = from_pandas_series(a, chunk_size=2) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + rs = np.random.RandomState(0) + raw = pd.DataFrame(rs.randint(1000, size=(10, 3))) + df = from_pandas_df(raw, chunk_size=(5, 2)) + + # set + b = {2, 1, raw[1][0]} + r = df.isin(b) + result = r.execute().fetch() + expected = raw.isin(b) + pd.testing.assert_frame_equal(result, expected) + + # mars object + b = tensor([2, 1, raw[1][0]] * 2, chunk_size=2) + r = df.isin(b) + result = r.execute().fetch() + expected = raw.isin([2, 1, raw[1][0]]) + pd.testing.assert_frame_equal(result, expected) + + # mars object and trigger iterative tiling + raw = pd.DataFrame(rs.randint(1000, size=(10, 3))) + df = from_pandas_df(raw, chunk_size=(5, 2)) + + b = from_pandas_series(pd.Series([raw[1][0]] + list(range(9))), chunk_size=2) + r = df.isin(b) + result = r.execute().fetch() + expected = raw.isin([2, 1, raw[1][0]]) + pd.testing.assert_frame_equal(result, expected) + + # dict + b = {1: tensor([2, 1, raw[1][0]], chunk_size=2), 2: [3, 10]} + r = df.isin(b) + result = r.execute().fetch() + expected = raw.isin({1: [2, 1, raw[1][0]], 2: [3, 10]}) + pd.testing.assert_frame_equal(result, expected) + + +def test_cut_execution(setup): + session = setup + + rs = np.random.RandomState(0) + raw = rs.random(15) * 1000 + s = pd.Series(raw, index=[f"i{i}" for i in range(15)]) + bins = [10, 100, 500] + if _interval_range_closed_arg: + ii = pd.interval_range(10, 500, 3, closed="right") + else: + ii = pd.interval_range(10, 500, 3) + labels = ["a", "b"] + + t = tensor(raw, chunk_size=4) + series = from_pandas_series(s, chunk_size=4) + iii = from_pandas_index(ii, chunk_size=2) + + # cut on Series + r = cut(series, bins) + result = r.execute().fetch() + 
pd.testing.assert_series_equal(result, pd.cut(s, bins)) + + r, b = cut(series, bins, retbins=True) + r_result = r.execute().fetch() + b_result = b.execute().fetch() + r_expected, b_expected = pd.cut(s, bins, retbins=True) + pd.testing.assert_series_equal(r_result, r_expected) + np.testing.assert_array_equal(b_result, b_expected) + + # cut on tensor + r = cut(t, bins) + # result and expected is array whose dtype is CategoricalDtype + result = r.execute().fetch() + expected = pd.cut(raw, bins) + assert len(result) == len(expected) + for r, e in zip(result, expected): + np.testing.assert_equal(r, e) + + # one chunk + r = cut(s, tensor(bins, chunk_size=2), right=False, include_lowest=True) + result = r.execute().fetch() + pd.testing.assert_series_equal( + result, pd.cut(s, bins, right=False, include_lowest=True) + ) + + # test labels + r = cut(t, bins, labels=labels) + # result and expected is array whose dtype is CategoricalDtype + result = r.execute().fetch() + expected = pd.cut(raw, bins, labels=labels) + assert len(result) == len(expected) + for r, e in zip(result, expected): + np.testing.assert_equal(r, e) + + r = cut(t, bins, labels=False) + # result and expected is array whose dtype is CategoricalDtype + result = r.execute().fetch() + expected = pd.cut(raw, bins, labels=False) + np.testing.assert_array_equal(result, expected) + + # test labels which is tensor + labels_t = tensor(["a", "b"], chunk_size=1) + r = cut(raw, bins, labels=labels_t, include_lowest=True) + # result and expected is array whose dtype is CategoricalDtype + result = r.execute().fetch() + expected = pd.cut(raw, bins, labels=labels, include_lowest=True) + assert len(result) == len(expected) + for r, e in zip(result, expected): + np.testing.assert_equal(r, e) + + # test labels=False + r, b = cut(raw, ii, labels=False, retbins=True) + # result and expected is array whose dtype is CategoricalDtype + r_result, b_result = session.fetch(*session.execute(r, b)) + r_expected, b_expected = pd.cut(raw, ii, labels=False, retbins=True) + for r, e in zip(r_result, r_expected): + np.testing.assert_equal(r, e) + pd.testing.assert_index_equal(b_result, b_expected) + + # test bins which is md.IntervalIndex + r, b = cut(series, iii, labels=tensor(labels, chunk_size=1), retbins=True) + r_result = r.execute().fetch() + b_result = b.execute().fetch() + r_expected, b_expected = pd.cut(s, ii, labels=labels, retbins=True) + pd.testing.assert_series_equal(r_result, r_expected) + pd.testing.assert_index_equal(b_result, b_expected) + + # test duplicates + bins2 = [0, 2, 4, 6, 10, 10] + r, b = cut(s, bins2, labels=False, retbins=True, right=False, duplicates="drop") + r_result = r.execute().fetch() + b_result = b.execute().fetch() + r_expected, b_expected = pd.cut( + s, bins2, labels=False, retbins=True, right=False, duplicates="drop" + ) + pd.testing.assert_series_equal(r_result, r_expected) + np.testing.assert_array_equal(b_result, b_expected) + + # test ordered + if pd.__version__ >= "1.1.0": + bins3 = [10, 100, 500] + r = cut(s, bins3, labels=labels, ordered=False) + r_result = r.execute().fetch() + r_expected = pd.cut(s, bins3, labels=labels, ordered=False) + pd.testing.assert_series_equal(r_result, r_expected) + + # test integer bins + r = cut(series, 3) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, pd.cut(s, 3)) + + r, b = cut(series, 3, right=False, retbins=True) + r_result, b_result = session.fetch(*session.execute(r, b)) + r_expected, b_expected = pd.cut(s, 3, right=False, retbins=True) + 
pd.testing.assert_series_equal(r_result, r_expected) + np.testing.assert_array_equal(b_result, b_expected) + + # test min max same + s2 = pd.Series([1.1] * 15) + r = cut(s2, 3) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, pd.cut(s2, 3)) + + # test inf exist + s3 = s2.copy() + s3[-1] = np.inf + with pytest.raises(ValueError): + cut(s3, 3).execute() + + +def test_transpose_execution(setup): + raw = pd.DataFrame( + {"a": ["1", "2", "3"], "b": ["5", "-6", "7"], "c": ["1", "2", "3"]} + ) + + # test 1 chunk + df = from_pandas_df(raw) + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + # test multi chunks + df = from_pandas_df(raw, chunk_size=2) + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + df = from_pandas_df(raw, chunk_size=2) + result = df.T.execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + # dtypes are varied + raw = pd.DataFrame({"a": [1.1, 2.2, 3.3], "b": [5, -6, 7], "c": [1, 2, 3]}) + + df = from_pandas_df(raw, chunk_size=2) + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + raw = pd.DataFrame({"a": [1.1, 2.2, 3.3], "b": ["5", "-6", "7"]}) + + df = from_pandas_df(raw, chunk_size=2) + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + # Transposing from results of other operands + raw = pd.DataFrame(np.arange(0, 100).reshape(10, 10)) + df = DataFrame(arange(0, 100, chunk_size=5).reshape(10, 10)) + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + df = DataFrame(rand(100, 100, chunk_size=10)) + raw = df.to_pandas() + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + +def test_get_dummies_execution(setup): + raw = pd.DataFrame( + { + "a": [1.1, 2.1, 3.1], + "b": ["5", "-6", "-7"], + "c": [1, 2, 3], + "d": ["2", "3", "4"], + } + ) + # test 1 chunk + df = from_pandas_df(raw) + r = get_dummies(df) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + # test multi chunks + df = from_pandas_df(raw, chunk_size=2) + r = get_dummies(df) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + # test prefix and prefix_sep + df = from_pandas_df(raw, chunk_size=2) + r = get_dummies(df, prefix=["col1", "col2"], prefix_sep="_") + pd.testing.assert_frame_equal( + r.execute().fetch(), + pd.get_dummies(raw, prefix=["col1", "col2"], prefix_sep="_"), + ) + + r = get_dummies(df, prefix={"b": "col1", "d": "col2"}, prefix_sep="_") + pd.testing.assert_frame_equal( + r.execute().fetch(), + pd.get_dummies(raw, prefix={"b": "col1", "d": "col2"}, prefix_sep="_"), + ) + + # test dummy_na + raw = pd.Series(["a", "b", "c", np.nan]) + df = from_pandas_series(raw) + r = get_dummies(df, dummy_na=False) + pd.testing.assert_frame_equal( + r.execute().fetch(), pd.get_dummies(raw, dummy_na=False) + ) + + # test columns + raw = pd.DataFrame( + { + "a": [1.1, 2.1, 3.1], + "b": ["5", "-6", "-7"], + "c": [1, 2, 3], + "d": ["2", "3", "4"], + } + ) + df = from_pandas_df(raw, chunk_size=2) + r = get_dummies(df, columns=["c"]) + pd.testing.assert_frame_equal( + r.execute().fetch(), pd.get_dummies(raw, columns=["c"]) + ) + + r = get_dummies(df, columns=["c", "d"], prefix=["col1", "col2"]) + pd.testing.assert_frame_equal( + r.execute().fetch(), + pd.get_dummies(raw, columns=["c", "d"], prefix=["col1", "col2"]), + ) + + # 
test drop_first + df = from_pandas_df(raw, chunk_size=2) + r = get_dummies(df, drop_first=True) + pd.testing.assert_frame_equal( + r.execute().fetch(), pd.get_dummies(raw, drop_first=True) + ) + + # test dtype + df = from_pandas_df(raw, chunk_size=2) + r = get_dummies(df, dtype=float) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw, dtype=float)) + + # test series + raw = pd.Series([3, 4, 1, 2]) + series = from_pandas_series(raw, chunk_size=2) + r = get_dummies(series) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + # test other variable + raw = [3, 4, 1, 2] + r = get_dummies(raw) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + raw = pd.Series([3, 4, 2, 1]) + r = get_dummies(raw) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + raw = pd.DataFrame( + { + "a": [1.1, 2.1, 3.1], + "b": ["5", "-6", "-7"], + "c": [1, 2, 3], + "d": ["2", "3", "4"], + } + ) + r = get_dummies(raw) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + +def test_to_numeric_execution(setup): + rs = np.random.RandomState(0) + s = pd.Series(rs.randint(5, size=100)) + s[rs.randint(100)] = np.nan + + # test 1 chunk + series = from_pandas_series(s) + + r = to_numeric(series) + pd.testing.assert_series_equal(r.execute().fetch(), pd.to_numeric(s)) + + # test multi chunks + series = from_pandas_series(s, chunk_size=20) + + r = to_numeric(series) + pd.testing.assert_series_equal(r.execute().fetch(), pd.to_numeric(s)) + + # test object dtype + s = pd.Series(["1.0", 2, -3, "2.0"]) + series = from_pandas_series(s) + + r = to_numeric(series) + pd.testing.assert_series_equal(r.execute().fetch(), pd.to_numeric(s)) + + # test errors and downcast + s = pd.Series(["appple", 2, -3, "2.0"]) + series = from_pandas_series(s) + + r = to_numeric(series, errors="ignore", downcast="signed") + pd.testing.assert_series_equal( + r.execute().fetch(), pd.to_numeric(s, errors="ignore", downcast="signed") + ) + + # test list data + l = ["1.0", 2, -3, "2.0"] + + r = to_numeric(l) + np.testing.assert_array_equal(r.execute().fetch(), pd.to_numeric(l)) + + +def test_q_cut_execution(setup): + rs = np.random.RandomState(0) + raw = rs.random(15) * 1000 + s = pd.Series(raw, index=[f"i{i}" for i in range(15)]) + + series = from_pandas_series(s) + r = qcut(series, 3) + result = r.execute().fetch() + expected = pd.qcut(s, 3) + pd.testing.assert_series_equal(result, expected) + + r = qcut(s, 3) + result = r.execute().fetch() + expected = pd.qcut(s, 3) + pd.testing.assert_series_equal(result, expected) + + series = from_pandas_series(s) + r = qcut(series, [0.3, 0.5, 0.7]) + result = r.execute().fetch() + expected = pd.qcut(s, [0.3, 0.5, 0.7]) + pd.testing.assert_series_equal(result, expected) + + r = qcut(range(5), 3) + result = r.execute().fetch() + expected = pd.qcut(range(5), 3) + assert isinstance(result, type(expected)) + pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected)) + + r = qcut(range(5), [0.2, 0.5]) + result = r.execute().fetch() + expected = pd.qcut(range(5), [0.2, 0.5]) + assert isinstance(result, type(expected)) + pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected)) + + r = qcut(range(5), tensor([0.2, 0.5])) + result = r.execute().fetch() + expected = pd.qcut(range(5), [0.2, 0.5]) + assert isinstance(result, type(expected)) + pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected)) + + +def test_shift_execution(setup): + fill_value_default = no_default + if not 
_enable_no_default or _with_column_freq_bug: + fill_value_default = None + + # test dataframe + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(10, 8)), columns=["col" + str(i + 1) for i in range(8)] + ) + + df = from_pandas_df(raw, chunk_size=5) + + for periods in (2, -2, 6, -6): + for axis in (0, 1): + for fill_value in (fill_value_default, 0, 1.0): + r = df.shift(periods=periods, axis=axis, fill_value=fill_value) + + try: + result = r.execute().fetch() + expected = raw.shift( + periods=periods, axis=axis, fill_value=fill_value + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False) + except AssertionError as e: # pragma: no cover + raise AssertionError( + f"Failed when periods: {periods}, axis: {axis}, fill_value: {fill_value}" + ) from e + + raw2 = raw.copy() + raw2.index = pd.date_range("2020-1-1", periods=10) + raw2.columns = pd.date_range("2020-3-1", periods=8) + + df2 = from_pandas_df(raw2, chunk_size=5) + + # test freq not None + for periods in (2, -2): + for axis in (0, 1): + for fill_value in (fill_value_default, 0, 1.0): + r = df2.shift( + periods=periods, freq="D", axis=axis, fill_value=fill_value + ) + + try: + result = r.execute().fetch() + expected = raw2.shift( + periods=periods, freq="D", axis=axis, fill_value=fill_value + ) + pd.testing.assert_frame_equal(result, expected) + except AssertionError as e: # pragma: no cover + raise AssertionError( + f"Failed when periods: {periods}, axis: {axis}, fill_value: {fill_value}" + ) from e + + # test tshift + r = df2.tshift(periods=1) + result = r.execute().fetch() + expected = raw2.tshift(periods=1) + pd.testing.assert_frame_equal(result, expected) + + with pytest.raises(ValueError): + _ = df.tshift(periods=1) + + # test series + s = raw.iloc[:, 0] + + series = from_pandas_series(s, chunk_size=5) + for periods in (0, 2, -2, 6, -6): + for fill_value in (fill_value_default, 0, 1.0): + r = series.shift(periods=periods, fill_value=fill_value) + + try: + result = r.execute().fetch() + expected = s.shift(periods=periods, fill_value=fill_value) + pd.testing.assert_series_equal(result, expected) + except AssertionError as e: # pragma: no cover + raise AssertionError( + f"Failed when periods: {periods}, fill_value: {fill_value}" + ) from e + + s2 = raw2.iloc[:, 0] + + # test freq not None + series2 = from_pandas_series(s2, chunk_size=5) + for periods in (2, -2): + for fill_value in (fill_value_default, 0, 1.0): + r = series2.shift(periods=periods, freq="D", fill_value=fill_value) + + try: + result = r.execute().fetch() + expected = s2.shift(periods=periods, freq="D", fill_value=fill_value) + pd.testing.assert_series_equal(result, expected) + except AssertionError as e: # pragma: no cover + raise AssertionError( + f"Failed when periods: {periods}, fill_value: {fill_value}" + ) from e + + +def test_diff_execution(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(10, 8)), columns=["col" + str(i + 1) for i in range(8)] + ) + + raw1 = raw.copy() + raw1["col4"] = raw1["col4"] < 400 + + r = from_pandas_df(raw1, chunk_size=(10, 5)).diff(-1) + pd.testing.assert_frame_equal(r.execute().fetch(), raw1.diff(-1)) + + r = from_pandas_df(raw1, chunk_size=5).diff(-1) + pd.testing.assert_frame_equal(r.execute().fetch(), raw1.diff(-1)) + + r = from_pandas_df(raw, chunk_size=(5, 8)).diff(1, axis=1) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.diff(1, axis=1)) + + r = from_pandas_df(raw, chunk_size=5).diff(1, axis=1) + pd.testing.assert_frame_equal( + 
r.execute().fetch(), raw.diff(1, axis=1), check_dtype=False + ) + + # test series + s = raw.iloc[:, 0] + s1 = s.copy() < 400 + + r = from_pandas_series(s, chunk_size=10).diff(-1) + pd.testing.assert_series_equal(r.execute().fetch(), s.diff(-1)) + + r = from_pandas_series(s, chunk_size=5).diff(-1) + pd.testing.assert_series_equal(r.execute().fetch(), s.diff(-1)) + + r = from_pandas_series(s1, chunk_size=5).diff(1) + pd.testing.assert_series_equal(r.execute().fetch(), s1.diff(1)) + + +def test_value_counts_execution(setup): + rs = np.random.RandomState(0) + s = pd.Series(rs.randint(5, size=100), name="s") + s[rs.randint(100)] = np.nan + + # test 1 chunk + series = from_pandas_series(s, chunk_size=100) + + r = series.value_counts() + pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts()) + + r = series.value_counts(bins=5, normalize=True) + pd.testing.assert_series_equal( + r.execute().fetch(), s.value_counts(bins=5, normalize=True) + ) + + # test multi chunks + series = from_pandas_series(s, chunk_size=30) + + r = series.value_counts(method="tree") + pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts()) + + r = series.value_counts(method="tree", normalize=True) + pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts(normalize=True)) + + # test bins and normalize + r = series.value_counts(method="tree", bins=5, normalize=True) + pd.testing.assert_series_equal( + r.execute().fetch(), s.value_counts(bins=5, normalize=True) + ) + + +def test_astype(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 8)), columns=["c" + str(i + 1) for i in range(8)] + ) + # single chunk + df = from_pandas_df(raw) + r = df.astype("int32") + + result = r.execute().fetch() + expected = raw.astype("int32") + pd.testing.assert_frame_equal(expected, result) + + # multiply chunks + df = from_pandas_df(raw, chunk_size=6) + r = df.astype("int32") + + result = r.execute().fetch() + expected = raw.astype("int32") + pd.testing.assert_frame_equal(expected, result) + + # dict type + df = from_pandas_df(raw, chunk_size=5) + r = df.astype({"c1": "int32", "c2": "float", "c8": "str"}) + + result = r.execute().fetch() + expected = raw.astype({"c1": "int32", "c2": "float", "c8": "str"}) + pd.testing.assert_frame_equal(expected, result) + + # test arrow_string dtype + df = from_pandas_df(raw, chunk_size=8) + r = df.astype({"c1": "arrow_string"}) + + result = r.execute().fetch() + expected = raw.astype({"c1": "arrow_string"}) + pd.testing.assert_frame_equal(expected, result) + + # test series + s = pd.Series(rs.randint(5, size=20)) + series = from_pandas_series(s) + r = series.astype("int32") + + result = r.execute().fetch() + expected = s.astype("int32") + pd.testing.assert_series_equal(result, expected) + + series = from_pandas_series(s, chunk_size=6) + r = series.astype("arrow_string") + + result = r.execute().fetch() + expected = s.astype("arrow_string") + pd.testing.assert_series_equal(result, expected) + + # test index + raw = pd.Index(rs.randint(5, size=20)) + mix = from_pandas_index(raw) + r = mix.astype("int32") + + result = r.execute().fetch() + expected = raw.astype("int32") + pd.testing.assert_index_equal(result, expected) + + # multiply chunks + series = from_pandas_series(s, chunk_size=6) + r = series.astype("str") + + result = r.execute().fetch() + expected = s.astype("str") + pd.testing.assert_series_equal(result, expected) + + # test category + raw = pd.DataFrame( + rs.randint(3, size=(20, 8)), columns=["c" + str(i + 1) for i in 
range(8)] + ) + + df = from_pandas_df(raw) + r = df.astype("category") + + result = r.execute().fetch() + expected = raw.astype("category") + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw) + r = df.astype({"c1": "category", "c8": "int32", "c4": "str"}) + + result = r.execute().fetch() + expected = raw.astype({"c1": "category", "c8": "int32", "c4": "str"}) + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=5) + r = df.astype("category") + + result = r.execute().fetch() + expected = raw.astype("category") + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=3) + r = df.astype({"c1": "category", "c8": "int32", "c4": "str"}) + + result = r.execute().fetch() + expected = raw.astype({"c1": "category", "c8": "int32", "c4": "str"}) + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=6) + r = df.astype( + { + "c1": "category", + "c5": "float", + "c2": "int32", + "c7": pd.CategoricalDtype([1, 3, 4, 2]), + "c4": pd.CategoricalDtype([1, 3, 2]), + } + ) + result = r.execute().fetch() + expected = raw.astype( + { + "c1": "category", + "c5": "float", + "c2": "int32", + "c7": pd.CategoricalDtype([1, 3, 4, 2]), + "c4": pd.CategoricalDtype([1, 3, 2]), + } + ) + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=8) + r = df.astype({"c2": "category"}) + result = r.execute().fetch() + expected = raw.astype({"c2": "category"}) + pd.testing.assert_frame_equal(expected, result) + + # test series category + raw = pd.Series(np.random.choice(["a", "b", "c"], size=(10,))) + series = from_pandas_series(raw, chunk_size=4) + result = series.astype("category").execute().fetch() + expected = raw.astype("category") + pd.testing.assert_series_equal(expected, result) + + series = from_pandas_series(raw, chunk_size=3) + result = ( + series.astype(pd.CategoricalDtype(["a", "c", "b"]), copy=False) + .execute() + .fetch() + ) + expected = raw.astype(pd.CategoricalDtype(["a", "c", "b"]), copy=False) + pd.testing.assert_series_equal(expected, result) + + series = from_pandas_series(raw, chunk_size=6) + result = series.astype(pd.CategoricalDtype(["a", "c", "b", "d"])).execute().fetch() + expected = raw.astype(pd.CategoricalDtype(["a", "c", "b", "d"])) + pd.testing.assert_series_equal(expected, result) + + +def test_drop(setup): + # test dataframe drop + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 8)), columns=["c" + str(i + 1) for i in range(8)] + ) + + df = from_pandas_df(raw, chunk_size=3) + + columns = ["c2", "c4", "c5", "c6"] + index = [3, 6, 7] + r = df.drop(columns=columns, index=index) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.drop(columns=columns, index=index) + ) + + idx_series = from_pandas_series(pd.Series(index)) + r = df.drop(idx_series) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.drop(pd.Series(index))) + + df.drop(columns, axis=1, inplace=True) + pd.testing.assert_frame_equal(df.execute().fetch(), raw.drop(columns, axis=1)) + + del df["c3"] + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.drop(columns + ["c3"], axis=1) + ) + + ps = df.pop("c8") + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.drop(columns + ["c3", "c8"], axis=1) + ) + pd.testing.assert_series_equal(ps.execute().fetch(), raw["c8"]) + + # test series drop + raw = pd.Series(rs.randint(1000, size=(20,))) + + series = from_pandas_series(raw, chunk_size=3) + + r = series.drop(index=index) 
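+ # drop() builds the result lazily; execute().fetch() below materializes it for comparison with pandas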
+ pd.testing.assert_series_equal(r.execute().fetch(), raw.drop(index=index)) + + # test index drop + ser = pd.Series(range(20)) + rs.shuffle(ser) + raw = pd.Index(ser) + + idx = from_pandas_index(raw) + + r = idx.drop(index) + pd.testing.assert_index_equal(r.execute().fetch(), raw.drop(index)) + + +def test_melt(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 8)), columns=["c" + str(i + 1) for i in range(8)] + ) + + df = from_pandas_df(raw, chunk_size=3) + + r = df.melt(id_vars=["c1"], value_vars=["c2", "c4"]) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values(["c1", "variable"]).reset_index(drop=True), + raw.melt(id_vars=["c1"], value_vars=["c2", "c4"]) + .sort_values(["c1", "variable"]) + .reset_index(drop=True), + ) + + +def test_drop_duplicates(setup): + # test dataframe drop + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 5)), + columns=["c" + str(i + 1) for i in range(5)], + index=["i" + str(j) for j in range(20)], + ) + duplicate_lines = rs.randint(1000, size=5) + for i in [1, 3, 10, 11, 15]: + raw.iloc[i] = duplicate_lines + + with option_context({"combine_size": 2}): + # test dataframe + for chunk_size in [(8, 3), (20, 5)]: + df = from_pandas_df(raw, chunk_size=chunk_size) + if chunk_size[0] < len(raw): + methods = ["tree", "subset_tree", "shuffle"] + else: + # 1 chunk + methods = [None] + for method in methods: + for subset in [None, "c1", ["c1", "c2"]]: + for keep in ["first", "last", False]: + for ignore_index in [True, False]: + try: + r = df.drop_duplicates( + method=method, + subset=subset, + keep=keep, + ignore_index=ignore_index, + ) + result = r.execute().fetch() + try: + expected = raw.drop_duplicates( + subset=subset, + keep=keep, + ignore_index=ignore_index, + ) + except TypeError: + # ignore_index is supported in pandas 1.0 + expected = raw.drop_duplicates( + subset=subset, keep=keep + ) + if ignore_index: + expected.reset_index(drop=True, inplace=True) + + pd.testing.assert_frame_equal(result, expected) + except Exception as e: # pragma: no cover + raise AssertionError( + f"failed when method={method}, subset={subset}, " + f"keep={keep}, ignore_index={ignore_index}" + ) from e + + # test series and index + s = raw["c3"] + ind = pd.Index(s) + + for tp, obj in [("series", s), ("index", ind)]: + for chunk_size in [8, 20]: + to_m = from_pandas_series if tp == "series" else from_pandas_index + mobj = to_m(obj, chunk_size=chunk_size) + if chunk_size < len(obj): + methods = ["tree", "shuffle"] + else: + # 1 chunk + methods = [None] + for method in methods: + for keep in ["first", "last", False]: + try: + r = mobj.drop_duplicates(method=method, keep=keep) + result = r.execute().fetch() + expected = obj.drop_duplicates(keep=keep) + + cmp = ( + pd.testing.assert_series_equal + if tp == "series" + else pd.testing.assert_index_equal + ) + cmp(result, expected) + except Exception as e: # pragma: no cover + raise AssertionError( + f"failed when method={method}, keep={keep}" + ) from e + + # test inplace + series = from_pandas_series(s, chunk_size=11) + series.drop_duplicates(inplace=True) + result = series.execute().fetch() + expected = s.drop_duplicates() + pd.testing.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["tree", "shuffle"]) +def test_series_drop_duplicates(setup, method): + raw = pd.Series(np.random.randint(5, size=50)) + s = Series(raw, chunk_size=10) + r = s.drop_duplicates(method=method).execute() + result = r.execute().fetch() + 
pd.testing.assert_series_equal(result, raw.drop_duplicates()) + + +def test_duplicated(setup): + # test dataframe drop + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 5)), + columns=["c" + str(i + 1) for i in range(5)], + index=["i" + str(j) for j in range(20)], + ) + duplicate_lines = rs.randint(1000, size=5) + for i in [1, 3, 10, 11, 15]: + raw.iloc[i] = duplicate_lines + + with option_context({"combine_size": 2}): + # test dataframe + for chunk_size in [(8, 3), (20, 5)]: + df = from_pandas_df(raw, chunk_size=chunk_size) + if chunk_size[0] < len(raw): + methods = ["tree", "subset_tree", "shuffle"] + else: + # 1 chunk + methods = [None] + for method in methods: + for subset in [None, "c1", ["c1", "c2"]]: + for keep in ["first", "last", False]: + try: + r = df.duplicated(method=method, subset=subset, keep=keep) + result = r.execute().fetch() + expected = raw.duplicated(subset=subset, keep=keep) + pd.testing.assert_series_equal(result, expected) + except Exception as e: # pragma: no cover + raise AssertionError( + f"failed when method={method}, subset={subset}, " + f"keep={keep}" + ) from e + + # test series + s = raw["c3"] + + for tp, obj in [("series", s)]: + for chunk_size in [8, 20]: + to_m = from_pandas_series if tp == "series" else from_pandas_index + mobj = to_m(obj, chunk_size=chunk_size) + if chunk_size < len(obj): + methods = ["tree", "shuffle"] + else: + # 1 chunk + methods = [None] + for method in methods: + for keep in ["first", "last", False]: + try: + r = mobj.duplicated(method=method, keep=keep) + result = r.execute().fetch() + expected = obj.duplicated(keep=keep) + + cmp = ( + pd.testing.assert_series_equal + if tp == "series" + else pd.testing.assert_index_equal + ) + cmp(result, expected) + except Exception as e: # pragma: no cover + raise AssertionError( + f"failed when method={method}, keep={keep}" + ) from e + + +@pytest.mark.parametrize("method", ["tree", "shuffle"]) +def test_series_duplicated(setup, method): + raw = pd.Series(np.random.randint(5, size=50)) + s = Series(raw, chunk_size=10) + r = s.duplicated(method=method).execute() + result = r.execute().fetch() + pd.testing.assert_series_equal(result, raw.duplicated()) + + +def test_memory_usage_execution(setup): + dtypes = ["int64", "float64", "complex128", "object", "bool"] + data = dict([(t, np.ones(shape=500).astype(t)) for t in dtypes]) + raw = pd.DataFrame(data) + + df = from_pandas_df(raw, chunk_size=(500, 2)) + r = df.memory_usage(index=False) + pd.testing.assert_series_equal(r.execute().fetch(), raw.memory_usage(index=False)) + + df = from_pandas_df(raw, chunk_size=(500, 2)) + r = df.memory_usage(index=True) + pd.testing.assert_series_equal(r.execute().fetch(), raw.memory_usage(index=True)) + + df = from_pandas_df(raw, chunk_size=(100, 3)) + r = df.memory_usage(index=False) + pd.testing.assert_series_equal(r.execute().fetch(), raw.memory_usage(index=False)) + + r = df.memory_usage(index=True) + pd.testing.assert_series_equal(r.execute().fetch(), raw.memory_usage(index=True)) + + raw = pd.DataFrame(data, index=np.arange(500).astype("object")) + + df = from_pandas_df(raw, chunk_size=(100, 3)) + r = df.memory_usage(index=True) + pd.testing.assert_series_equal(r.execute().fetch(), raw.memory_usage(index=True)) + + raw = pd.Series(np.ones(shape=500).astype("object"), name="s") + + series = from_pandas_series(raw) + r = series.memory_usage(index=True) + assert r.execute().fetch() == raw.memory_usage(index=True) + + series = from_pandas_series(raw, chunk_size=100) + r = 
series.memory_usage(index=False) + assert r.execute().fetch() == raw.memory_usage(index=False) + + series = from_pandas_series(raw, chunk_size=100) + r = series.memory_usage(index=True) + assert r.execute().fetch() == raw.memory_usage(index=True) + + raw = pd.Series( + np.ones(shape=500).astype("object"), + index=np.arange(500).astype("object"), + name="s", + ) + + series = from_pandas_series(raw, chunk_size=100) + r = series.memory_usage(index=True) + assert r.execute().fetch() == raw.memory_usage(index=True) + + raw = pd.Index(np.arange(500), name="s") + + index = from_pandas_index(raw) + r = index.memory_usage() + assert r.execute().fetch() == raw.memory_usage() + + index = from_pandas_index(raw, chunk_size=100) + r = index.memory_usage() + assert r.execute().fetch() == raw.memory_usage() + + +def test_select_dtypes_execution(setup): + raw = pd.DataFrame({"a": np.random.rand(10), "b": np.random.randint(10, size=10)}) + + df = from_pandas_df(raw, chunk_size=5) + r = df.select_dtypes(include=["float64"]) + + result = r.execute().fetch() + expected = raw.select_dtypes(include=["float64"]) + pd.testing.assert_frame_equal(result, expected) + + +def test_map_chunk_execution(setup): + raw = pd.DataFrame(np.random.rand(10, 5), columns=[f"col{i}" for i in range(5)]) + + df = from_pandas_df(raw, chunk_size=(5, 3)) + + def f1(pdf): + return pdf + 1 + + r = df.map_chunk(f1) + + result = r.execute().fetch() + expected = raw + 1 + pd.testing.assert_frame_equal(result, expected) + + raw_s = raw["col1"] + series = from_pandas_series(raw_s, chunk_size=5) + + r = series.map_chunk(f1) + + result = r.execute().fetch() + expected = raw_s + 1 + pd.testing.assert_series_equal(result, expected) + + def f2(pdf): + return pdf.sum(axis=1) + + df = from_pandas_df(raw, chunk_size=5) + r = df.map_chunk(f2, output_type="series") + + result = r.execute().fetch() + expected = raw.sum(axis=1) + pd.testing.assert_series_equal(result, expected) + + raw = pd.DataFrame({"a": [f"s{i}" for i in range(10)], "b": np.arange(10)}) + + df = from_pandas_df(raw, chunk_size=5) + + def f3(pdf): + return pdf["a"].str.slice(1).astype(int) + pdf["b"] + + with pytest.raises(TypeError): + r = df.map_chunk(f3) + _ = r.execute().fetch() + + r = df.map_chunk(f3, output_type="series", dtypes=pd.Series([np.int64])) + result = r.execute(extra_config={"check_dtypes": False}).fetch() + expected = f3(raw) + pd.testing.assert_series_equal(result, expected) + + def f4(pdf): + ret = pd.DataFrame(columns=["a", "b"]) + ret["a"] = pdf["a"].str.slice(1).astype(int) + ret["b"] = pdf["b"] + return ret + + with pytest.raises(TypeError): + r = df.map_chunk(f4, output_type="dataframe") + _ = r.execute().fetch() + + r = df.map_chunk( + f4, + output_type="dataframe", + dtypes=pd.Series([np.dtype(int), raw["b"].dtype], index=["a", "b"]), + ) + result = r.execute().fetch() + expected = f4(raw) + pd.testing.assert_frame_equal(result, expected) + + raw2 = pd.DataFrame({"a": [np.array([1, 2, 3]), np.array([4, 5, 6])]}) + df2 = from_pandas_df(raw2) + dtypes = pd.Series([np.dtype(float)] * 3) + r = df2.map_chunk( + lambda x: x["a"].apply(pd.Series), output_type="dataframe", dtypes=dtypes + ) + assert r.shape == (np.nan, 3) + pd.testing.assert_series_equal(r.dtypes, dtypes) + result = r.execute().fetch() + expected = raw2.apply(lambda x: x["a"], axis=1, result_type="expand") + pd.testing.assert_frame_equal(result, expected) + + raw = pd.DataFrame(np.random.rand(10, 5), columns=[f"col{i}" for i in range(5)]) + + df = from_pandas_df(raw, chunk_size=(5, 3)) + + def f5(pdf, 
chunk_index): + return pdf + 1 + chunk_index[0] + + r = df.map_chunk(f5, with_chunk_index=True) + + result = r.execute().fetch() + expected = (raw + 1).add(np.arange(10) // 5, axis=0) + pd.testing.assert_frame_equal(result, expected) + + raw_s = raw["col1"] + series = from_pandas_series(raw_s, chunk_size=5) + + r = series.map_chunk(f5, with_chunk_index=True) + + result = r.execute().fetch() + expected = raw_s + 1 + np.arange(10) // 5 + pd.testing.assert_series_equal(result, expected) + + # test args or kwargs with mars objects + df = from_pandas_df(raw, chunk_size=5) + + def f6(df, mars_df): + return df + mars_df.sum() + + df_arg = from_pandas_df(raw, chunk_size=6) + r = df.map_chunk(f6, args=(df_arg,), output_type="dataframe", dtypes=df.dtypes) + expected = raw + raw.sum() + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, expected) + + df = from_pandas_df(raw, chunk_size=5) + df_arg = from_pandas_df(raw, chunk_size=6) + r = df.map_chunk( + f6, kwargs=dict(mars_df=df_arg), output_type="dataframe", dtypes=df.dtypes + ) + expected = raw + raw.sum() + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, expected) + + def f7(s): + return s.to_json() + + with pytest.raises(TypeError): + series.map_chunk(f7) + + +def test_map_chunk_with_df_or_series_output(setup): + raw = pd.DataFrame(np.random.rand(10, 5), columns=[f"col{i}" for i in range(5)]) + + df = from_pandas_df(raw, chunk_size=(5, 3)) + + def f1(pdf): + return pdf.iloc[2, :2] + + with pytest.raises(TypeError): + df.map_chunk(f1) + + for kwargs in [dict(output_type="df_or_series"), dict(skip_infer=True)]: + res = df.map_chunk(f1, **kwargs) + assert isinstance(res, DATAFRAME_OR_SERIES_TYPE) + res = res.execute() + assert res.data_type == "series" + assert res.dtype == np.dtype("float") + assert not ("dtypes" in res.data_params) + assert res.shape == (4,) + pd.testing.assert_series_equal( + res.fetch(), pd.concat([raw.iloc[2, :2], raw.iloc[7, :2]]) + ) + + def f2(pdf): + return pdf.iloc[[0, 2], :2] + + with pytest.raises(TypeError): + df.map_chunk(f2) + + res = df.map_chunk(f2, output_type="df_or_series") + assert isinstance(res, DATAFRAME_OR_SERIES_TYPE) + res = res.execute() + assert res.data_type == "dataframe" + pd.testing.assert_series_equal(res.dtypes, raw.dtypes[:2]) + assert not ("dtype" in res.data_params) + assert res.shape == (4, 2) + pd.testing.assert_frame_equal( + res.fetch(), + raw.iloc[[0, 2, 5, 7], :2], + ) + + +def test_map_chunk_closure_execute(setup): + raw = pd.DataFrame( + np.random.randint(10**3, size=(10, 5)), columns=[f"col{i}" for i in range(5)] + ) + + df = from_pandas_df(raw, chunk_size=5) + num = 1 + dic = {i: -i for i in range(10**3)} + + def f1(pdf): + return pdf + num + + r = df.map_chunk(f1) + + result = r.execute().fetch() + expected = raw + num + pd.testing.assert_frame_equal(result, expected) + + def f2(pdf): + ret = pd.DataFrame(columns=["col1", "col2"]) + ret["col1"] = pdf["col1"].apply(lambda x: dic.get(x, 0)) + ret["col2"] = pdf["col2"] + return ret + + r = df.map_chunk(f2, output_type="dataframe") + + result = r.execute().fetch() + expected = f2(raw) + pd.testing.assert_frame_equal(result, expected) + + class callable_df: + def __init__(self, multiplier: int = 1): + self.dic = {i: -i for i in range(10**multiplier)} + + def __call__(self, pdf): + ret = pd.DataFrame(columns=["col1", "col2"]) + ret["col1"] = pdf["col1"].apply(lambda x: self.dic.get(x, 0)) + ret["col2"] = pdf["col2"] + return ret + + cdf = callable_df(multiplier=4) + r = df.map_chunk(cdf, 
output_type="dataframe") + + result = r.execute().fetch() + expected = cdf(raw) + pd.testing.assert_frame_equal(result, expected) + + +@pytest.mark.ray_dag +def test_cartesian_chunk_execution(setup): + rs = np.random.RandomState(0) + raw1 = pd.DataFrame({"a": rs.randint(3, size=10), "b": rs.rand(10)}) + raw2 = pd.DataFrame( + {"c": rs.randint(3, size=10), "d": rs.rand(10), "e": rs.rand(10)} + ) + df1 = from_pandas_df(raw1, chunk_size=(5, 1)) + df2 = from_pandas_df(raw2, chunk_size=(5, 1)) + + def f(c1, c2): + c1, c2 = c1.copy(), c2.copy() + c1["x"] = 1 + c2["x"] = 1 + r = c1.merge(c2, on="x") + r = r[(r["b"] > r["d"]) & (r["b"] < r["e"])] + return r[["a", "c"]] + + rr = df1.cartesian_chunk(df2, f) + + result = rr.execute().fetch() + expected = f(raw1, raw2) + pd.testing.assert_frame_equal( + result.sort_values(by=["a", "c"]).reset_index(drop=True), + expected.sort_values(by=["a", "c"]).reset_index(drop=True), + ) + + def f2(c1, c2): + r = f(c1, c2) + return r["a"] + r["c"] + + rr = df1.cartesian_chunk(df2, f2) + + result = rr.execute().fetch() + expected = f2(raw1, raw2) + pd.testing.assert_series_equal( + result.sort_values().reset_index(drop=True), + expected.sort_values().reset_index(drop=True), + ) + + # size_res = setup.executor.execute_dataframe(rr, mock=True)[0][0] + # assert size_res > 0 + + def f3(c1, c2): + cr = pd.DataFrame() + cr["a"] = c1.str.slice(1).astype(np.int64) + cr["x"] = 1 + cr2 = pd.DataFrame() + cr2["b"] = c2.str.slice(1).astype(np.int64) + cr2["x"] = 1 + return cr.merge(cr2, on="x")[["a", "b"]] + + s_raw = pd.Series([f"s{i}" for i in range(10)]) + series = from_pandas_series(s_raw, chunk_size=5) + + rr = series.cartesian_chunk( + series, + f3, + output_type="dataframe", + dtypes=pd.Series([np.dtype(np.int64)] * 2, index=["a", "b"]), + ) + + result = rr.execute().fetch() + expected = f3(s_raw, s_raw) + pd.testing.assert_frame_equal( + result.sort_values(by=["a", "b"]).reset_index(drop=True), + expected.sort_values(by=["a", "b"]).reset_index(drop=True), + ) + + with pytest.raises(TypeError): + _ = series.cartesian_chunk(series, f3) + + def f4(c1, c2): + r = f3(c1, c2) + return r["a"] + r["b"] + + rr = series.cartesian_chunk( + series, f4, output_type="series", dtypes=np.dtype(np.int64) + ) + + result = rr.execute().fetch() + expected = f4(s_raw, s_raw) + pd.testing.assert_series_equal( + result.sort_values().reset_index(drop=True), + expected.sort_values().reset_index(drop=True), + ) + + +def test_cartesian_chunk_with_df_or_series(setup): + rs = np.random.RandomState(0) + raw1 = pd.DataFrame({"a": range(10), "b": rs.rand(10)}) + raw2 = pd.DataFrame( + {"c": rs.randint(3, size=10), "d": rs.rand(10), "e": rs.rand(10)} + ) + df1 = from_pandas_df(raw1, chunk_size=(5, 1)) + df2 = from_pandas_df(raw2, chunk_size=(5, 1)) + + def f1(c1, c2): + return c1.iloc[[2, 4], :] + + with pytest.raises(TypeError): + df1.cartesian_chunk(df2, f1) + + for kwargs in [dict(output_type="df_or_series"), dict(skip_infer=True)]: + res = df1.cartesian_chunk(df2, f1, **kwargs) + + assert isinstance(res, DATAFRAME_OR_SERIES_TYPE) + res = res.execute() + assert res.data_type == "dataframe" + assert not ("dtype" in res.data_params) + assert res.shape == (8, 2) + pd.testing.assert_series_equal(res.dtypes, raw1.dtypes) + pd.testing.assert_frame_equal( + res.fetch(), raw1.iloc[[2, 4] * 2 + [7, 9] * 2, :] + ) + + def f2(c1, c2): + return c1.iloc[2, :] + + with pytest.raises(TypeError): + df1.cartesian_chunk(df2, f2) + + res = df1.cartesian_chunk(df2, f2, output_type="df_or_series") + + assert 
isinstance(res, DATAFRAME_OR_SERIES_TYPE) + res = res.execute() + assert res.data_type == "series" + assert not ("dtypes" in res.data_params) + assert res.shape == (8,) + pd.testing.assert_series_equal( + res.fetch(), + pd.concat([raw1.iloc[2, :], raw1.iloc[2, :], raw1.iloc[7, :], raw1.iloc[7, :]]), + ) + + +def test_rebalance_execution(setup): + raw = pd.DataFrame(np.random.rand(10, 3), columns=list("abc")) + df = from_pandas_df(raw) + + def _expect_count(n): + def _tile_rebalance(op): + tileable = yield from op.tile(op) + assert len(tileable.chunks) == n + return tileable + + return _tile_rebalance + + r = df.rebalance(num_partitions=3) + extra_config = {"operand_tile_handlers": {DataFrameRebalance: _expect_count(3)}} + result = r.execute(extra_config=extra_config).fetch() + pd.testing.assert_frame_equal(result, raw) + + r = df.rebalance(factor=0.5) + extra_config = {"operand_tile_handlers": {DataFrameRebalance: _expect_count(1)}} + result = r.execute(extra_config=extra_config).fetch() + pd.testing.assert_frame_equal(result, raw) + + # test worker has two cores + r = df.rebalance() + extra_config = {"operand_tile_handlers": {DataFrameRebalance: _expect_count(2)}} + result = r.execute(extra_config=extra_config).fetch() + pd.testing.assert_frame_equal(result, raw) + + +def test_stack_execution(setup): + raw = pd.DataFrame( + np.random.rand(10, 3), columns=list("abc"), index=[f"s{i}" for i in range(10)] + ) + for loc in [(5, 1), (8, 2), (1, 0)]: + raw.iloc[loc] = np.nan + df = from_pandas_df(raw, chunk_size=(5, 2)) + + for dropna in (True, False): + r = df.stack(dropna=dropna) + result = r.execute().fetch() + expected = raw.stack(dropna=dropna) + pd.testing.assert_series_equal(result, expected) + + cols = pd.MultiIndex.from_tuples([("c1", "cc1"), ("c1", "cc2"), ("c2", "cc3")]) + raw2 = raw.copy() + raw2.columns = cols + df = from_pandas_df(raw2, chunk_size=(5, 2)) + + for level in [-1, 0, [0, 1]]: + for dropna in (True, False): + r = df.stack(level=level, dropna=dropna) + result = r.execute().fetch() + expected = raw2.stack(level=level, dropna=dropna) + assert_method = ( + pd.testing.assert_series_equal + if expected.ndim == 1 + else pd.testing.assert_frame_equal + ) + assert_method(result, expected) + + +@pytest.mark.parametrize( + "ignore_index", [False, True] if _explode_with_ignore_index else [False] +) +def test_explode_execution(setup, ignore_index): + explode_kw = {"ignore_index": True} if ignore_index else {} + + raw = pd.DataFrame( + { + "a": np.random.rand(10), + "b": [np.random.rand(random.randint(1, 10)) for _ in range(10)], + "c": np.random.rand(10), + "d": np.random.rand(10), + } + ) + df = from_pandas_df(raw, chunk_size=(4, 2)) + r = df.explode("b", ignore_index=ignore_index) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.explode("b", **explode_kw)) + + series = from_pandas_series(raw.b, chunk_size=4) + r = series.explode(ignore_index=ignore_index) + pd.testing.assert_series_equal(r.execute().fetch(), raw.b.explode(**explode_kw)) + + +def test_eval_query_execution(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame({"a": rs.rand(100), "b": rs.rand(100), "c c": rs.rand(100)}) + df = from_pandas_df(raw, chunk_size=(10, 2)) + + r = mars_eval('c = df.a * 2 + df["c c"]', target=df) + pd.testing.assert_frame_equal( + r.execute().fetch(), + pd.eval('c = raw.a * 2 + raw["c c"]', engine="python", target=raw), + ) + + r = df.eval("a + b") + pd.testing.assert_series_equal(r.execute().fetch(), raw.eval("a + b")) + + _val = 5.0 # noqa: F841 + _val_array = [1, 2, 3] # 
noqa: F841 + expr = """ + e = -a + b + 1 + f = b + `c c` + @_val + @_val_array[-1] + """ + r = df.eval(expr) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.eval(expr)) + + copied_df = df.copy() + copied_df.eval("c = a + b", inplace=True) + pd.testing.assert_frame_equal(copied_df.execute().fetch(), raw.eval("c = a + b")) + + expr = "a > b | a < `c c`" + r = df.query(expr) + pd.testing.assert_frame_equal( + r.execute(extra_config={"check_index_value": False}).fetch(), raw.query(expr) + ) + + expr = "a > b & ~(a < `c c`)" + r = df.query(expr) + pd.testing.assert_frame_equal( + r.execute(extra_config={"check_index_value": False}).fetch(), raw.query(expr) + ) + + expr = "a < b < `c c`" + r = df.query(expr) + pd.testing.assert_frame_equal( + r.execute(extra_config={"check_index_value": False}).fetch(), raw.query(expr) + ) + + expr = "a < 0.5 and a != 0.1 and b != 0.2" + r = df.query(expr) + pd.testing.assert_frame_equal( + r.execute(extra_config={"check_index_value": False}).fetch(), raw.query(expr) + ) + + expr = "(a < 0.5 or a > 0.7) and (b != 0.1 or `c c` > 0.2)" + r = df.query(expr) + pd.testing.assert_frame_equal( + r.execute(extra_config={"check_index_value": False}).fetch(), raw.query(expr) + ) + + copied_df = df.copy() + copied_df.query("a < b", inplace=True) + pd.testing.assert_frame_equal( + copied_df.execute(extra_config={"check_index_value": False}).fetch(), + raw.query("a < b"), + ) + + +def test_check_monotonic_execution(setup): + idx_value = pd.Index(list(range(1000))) + + idx_increase = from_pandas_index(idx_value, chunk_size=100) + assert idx_increase.is_monotonic_increasing.execute().fetch() is True + assert idx_increase.is_monotonic_decreasing.execute().fetch() is False + + idx_decrease = from_pandas_index(idx_value[::-1], chunk_size=100) + assert idx_decrease.is_monotonic_increasing.execute().fetch() is False + assert idx_decrease.is_monotonic_decreasing.execute().fetch() is True + + idx_mixed = from_pandas_index( + pd.Index(list(range(500)) + list(range(500))), chunk_size=100 + ) + assert idx_mixed.is_monotonic_increasing.execute().fetch() is False + assert idx_mixed.is_monotonic_decreasing.execute().fetch() is False + + ser_mixed = from_pandas_series( + pd.Series(list(range(500)) + list(range(499, 999))), chunk_size=100 + ) + assert ser_mixed.is_monotonic_increasing.execute().fetch() is True + assert ser_mixed.is_monotonic_decreasing.execute().fetch() is False + + +def test_pct_change_execution(setup): + # test dataframe + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(10, 8)), + columns=["col" + str(i + 1) for i in range(8)], + index=pd.date_range("2021-1-1", periods=10), + ) + + df = from_pandas_df(raw, chunk_size=5) + r = df.pct_change() + result = r.execute().fetch() + expected = raw.pct_change() + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=5) + r = df.pct_change(fill_method=None) + result = r.execute().fetch() + expected = raw.pct_change(fill_method=None) + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=5) + r = df.pct_change(freq="D") + result = r.execute().fetch() + expected = raw.pct_change(freq="D") + pd.testing.assert_frame_equal(expected, result) + + +def test_bloom_filter(setup): + rs = np.random.RandomState(0) + raw1 = pd.DataFrame( + {"col1": rs.randint(0, 100, size=(100,)), "col2": rs.random(100)} + ) + raw2 = pd.DataFrame( + {"col1": rs.randint(0, 10, size=(100,)), "col2": rs.random(100)} + ) + + df1 = from_pandas_df(raw1, 
chunk_size=10) + df2 = from_pandas_df(raw2, chunk_size=20) + + filtered = filter_by_bloom_filter(df1, df2, "col1", "col1") + r1, r2, filtered_r = mars.fetch(mars.execute(df1, df2, filtered)) + assert r1.shape[0] > filtered_r.shape[0] + assert len(filtered_r[filtered_r["col1"] > 10]) < 10 + + pd.testing.assert_frame_equal(r1, raw1) + pd.testing.assert_frame_equal(r2, raw2) + pd.testing.assert_frame_equal( + filtered_r[filtered_r["col1"] <= 10], raw1[raw1["col1"] <= 10] + ) diff --git a/python/xorbits/_mars/dataframe/base/to_cpu.py b/python/xorbits/_mars/dataframe/base/to_cpu.py new file mode 100644 index 000000000..4b2e79406 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/to_cpu.py @@ -0,0 +1,38 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from .core import DataFrameDeviceConversionBase + + +class DataFrameToCPU(DataFrameDeviceConversionBase): + _op_type_ = OperandDef.TO_CPU + + def __init__(self, dtypes=None, output_types=None, **kw): + super().__init__(_dtypes=dtypes, _output_types=output_types, **kw) + if self.gpu or self.gpu is None: + self.gpu = False + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = ctx[op.inputs[0].key].to_pandas() + + +def to_cpu(df_or_series): + if df_or_series.op.gpu is False: + # if op.gpu is None, means unknown + return df_or_series + + op = DataFrameToCPU() + return op(df_or_series) diff --git a/python/xorbits/_mars/dataframe/base/to_gpu.py b/python/xorbits/_mars/dataframe/base/to_gpu.py new file mode 100644 index 000000000..16e111f96 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/to_gpu.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from .core import DataFrameDeviceConversionBase + + +class DataFrameToGPU(DataFrameDeviceConversionBase): + _op_type_ = OperandDef.TO_GPU + + def __init__(self, dtypes=None, output_types=None, **kw): + super().__init__(_dtypes=dtypes, _output_types=output_types, **kw) + if not self.gpu: + self.gpu = True + + @classmethod + def execute(cls, ctx, op): + import cudf + + out_df = op.outputs[0] + if out_df.ndim == 2: + ctx[out_df.key] = cudf.DataFrame.from_pandas(ctx[op.inputs[0].key]) + else: + ctx[out_df.key] = cudf.Series.from_pandas(ctx[op.inputs[0].key]) + + +def to_gpu(df_or_series): + if df_or_series.op.gpu: + return df_or_series + + op = DataFrameToGPU() + return op(df_or_series) diff --git a/python/xorbits/_mars/dataframe/base/to_numeric.py b/python/xorbits/_mars/dataframe/base/to_numeric.py new file mode 100644 index 000000000..762913bd1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/to_numeric.py @@ -0,0 +1,220 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pandas as pd + +from ...core import ENTITY_TYPE, OutputType +from ...serialization.serializables import StringField +from ...tensor import tensor as astensor +from ...tensor.core import TENSOR_TYPE, TensorOrder +from ..core import SERIES_TYPE +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin + + +class DataFrameToNumeric(DataFrameOperand, DataFrameOperandMixin): + errors = StringField("errors") + downcast = StringField("downcast") + + def __init__(self, errors="raise", downcast=None, **kw): + super().__init__(errors=errors, downcast=downcast, **kw) + + def __call__(self, arg): + if isinstance(arg, pd.Series): + arg = asseries(arg) + elif not isinstance(arg, ENTITY_TYPE): + arg = astensor(arg) + if arg.ndim != 1: + raise ValueError("Input array must be 1 dimensional") + if arg.size == 0: + raise ValueError("Input array can not be empty") + + if isinstance(arg, asseries): + series = arg + self.output_types = [OutputType.series] + return self.new_series( + [series], + shape=series.shape, + name=series.name, + index_value=series.index_value, + dtype=series.dtype, + ) + else: + tensor = arg + self.output_types = [OutputType.tensor] + dtype = tensor.dtype + if dtype.kind == "U": + dtype = np.dtype(object) + return self.new_tileables([tensor], shape=tensor.shape, dtype=dtype)[0] + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + + out_chunks = [] + for in_chunk in in_df.chunks: + out_op = op.copy().reset_key() + chunk_kws = [] + if isinstance(out_df, SERIES_TYPE): + chunk_kws.append( + { + "dtype": out_df.dtype, + "shape": in_chunk.shape, + "index": in_chunk.index, + "index_value": in_chunk.index_value, + "name": in_chunk.name, + } + ) + elif isinstance(out_df, TENSOR_TYPE): + chunk_kws.append( + { + "dtype": out_df.dtype, + "shape": in_chunk.shape, + "order": TensorOrder.C_ORDER, + "index": in_chunk.index, + } + ) + 
out_chunks.append(out_op.new_chunk([in_chunk], kws=chunk_kws)) + + new_op = op.copy() + kw = out_df.params + kw["nsplits"] = in_df.nsplits + kw["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[kw]) + + @classmethod + def execute(cls, ctx, op): + input_data = ctx[op.inputs[0].key] + errors_ = op.errors + downcast_ = op.downcast + ctx[op.outputs[0].key] = pd.to_numeric( + input_data, errors=errors_, downcast=downcast_ + ) + + +def to_numeric(arg, errors="raise", downcast=None): + """ + Convert argument to a numeric type. + + The default return dtype is `float64` or `int64` + depending on the data supplied. Use the `downcast` parameter + to obtain other dtypes. + + Please note that precision loss may occur if really large numbers + are passed in. Due to the internal limitations of `ndarray`, if + numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) + or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are + passed in, it is very likely they will be converted to float so that + they can stored in an `ndarray`. These warnings apply similarly to + `Series` since it internally leverages `ndarray`. + + Parameters + ---------- + arg : scalar, list, tuple, 1-d array, or Series + Argument to be converted. + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaN. + - If 'ignore', then invalid parsing will return the input. + downcast : {'integer', 'signed', 'unsigned', 'float'}, default None + If not None, and if the data has been successfully cast to a + numerical dtype (or if the data was numeric to begin with), + downcast that resulting data to the smallest numerical dtype + possible according to the following rules: + + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) + - 'unsigned': smallest unsigned int dtype (min.: np.uint8) + - 'float': smallest float dtype (min.: np.float32) + + As this behaviour is separate from the core conversion to + numeric values, any errors raised during the downcasting + will be surfaced regardless of the value of the 'errors' input. + + In addition, downcasting will only occur if the size + of the resulting data's dtype is strictly larger than + the dtype it is to be cast to, so if none of the dtypes + checked satisfy that specification, no downcasting will be + performed on the data. + + Returns + ------- + ret + Numeric if parsing succeeded. + Return type depends on input. Series if Series, otherwise Tensor. + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + numpy.ndarray.astype : Cast a numpy array to a specified type. + DataFrame.convert_dtypes : Convert dtypes. 
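# Hedged sketch complementing the Series examples below: per the `__call__`
# above, a plain 1-d list (or tensor) input is passed through `astensor` and the
# result comes back as a Mars tensor rather than a Series. Assumes mars is
# importable as in the examples below and a default local session is available.
import mars.dataframe as md

t = md.to_numeric(["1", "2", "-3.5"])
print(t.execute().fetch())  # -> array([ 1. ,  2. , -3.5])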
+ + Examples + -------- + Take separate series and convert to numeric, coercing when told to + + >>> s = md.Series(['1.0', '2', -3]) + >>> md.to_numeric(s).execute() + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float64 + >>> md.to_numeric(s, downcast='float').execute() + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float32 + >>> md.to_numeric(s, downcast='signed').execute() + 0 1 + 1 2 + 2 -3 + dtype: int8 + >>> s = md.Series(['apple', '1.0', '2', -3]) + >>> md.to_numeric(s, errors='ignore').execute() + 0 apple + 1 1.0 + 2 2 + 3 -3 + dtype: object + >>> md.to_numeric(s, errors='coerce').execute() + 0 NaN + 1 1.0 + 2 2.0 + 3 -3.0 + dtype: float64 + + Downcasting of nullable integer and floating dtypes is supported: + + >>> s = md.Series([1, 2, 3], dtype="int64") + >>> md.to_numeric(s, downcast="integer").execute() + 0 1 + 1 2 + 2 3 + dtype: int8 + >>> s = md.Series([1.0, 2.1, 3.0], dtype="float64") + >>> md.to_numeric(s, downcast="float").execute() + 0 1.0 + 1 2.1 + 2 3.0 + dtype: float32 + """ + if errors not in ("ignore", "raise", "coerce"): + raise ValueError("invalid error value specified") + if downcast not in (None, "integer", "signed", "unsigned", "float"): + raise ValueError("invalid downcasting method provided") + + op = DataFrameToNumeric(errors=errors, downcast=downcast) + return op(arg) diff --git a/python/xorbits/_mars/dataframe/base/transform.py b/python/xorbits/_mars/dataframe/base/transform.py new file mode 100644 index 000000000..e72bca0cb --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/transform.py @@ -0,0 +1,533 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...config import options +from ...core import OutputType, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import AnyField, BoolField, DictField, TupleField +from ...utils import enter_current_session, pd_release_version, quiet_stdio +from ..core import DATAFRAME_CHUNK_TYPE, DATAFRAME_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_df, + build_series, + filter_dtypes_by_index, + make_dtypes, + parse_index, + validate_axis, +) + +_with_convert_dtype = pd_release_version < (1, 2, 0) + + +class TransformOperand(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.TRANSFORM + + _func = AnyField("func") + _axis = AnyField("axis") + _convert_dtype = BoolField("convert_dtype") + _args = TupleField("args") + _kwds = DictField("kwds") + + _call_agg = BoolField("call_agg") + + def __init__( + self, + func=None, + axis=None, + convert_dtype=None, + args=None, + kwds=None, + call_agg=None, + output_types=None, + memory_scale=None, + **kw + ): + super().__init__( + _func=func, + _axis=axis, + _convert_dtype=convert_dtype, + _args=args, + _kwds=kwds, + _call_agg=call_agg, + _output_types=output_types, + _memory_scale=memory_scale, + **kw + ) + + @property + def func(self): + return self._func + + @property + def convert_dtype(self): + return self._convert_dtype + + @property + def axis(self): + return self._axis + + @property + def args(self): + return getattr(self, "_args", None) or () + + @property + def kwds(self): + return getattr(self, "_kwds", None) or dict() + + @property + def call_agg(self): + return self._call_agg + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + out_chunk = op.outputs[0] + + if op.call_agg: + result = in_data.agg(op.func, axis=op.axis, *op.args, **op.kwds) + else: + result = in_data.transform(op.func, axis=op.axis, *op.args, **op.kwds) + + if isinstance(out_chunk, DATAFRAME_CHUNK_TYPE): + if out_chunk.dtypes is not None: + result.columns = out_chunk.dtypes.index + ctx[op.outputs[0].key] = result + + @classmethod + def tile(cls, op: "TransformOperand"): + in_df = op.inputs[0] + out_df = op.outputs[0] + axis = op.axis + + if isinstance(in_df, DATAFRAME_TYPE): + if in_df.chunk_shape[axis] > 1: + chunk_size = ( + in_df.shape[axis], + max(1, options.chunk_store_limit // in_df.shape[axis]), + ) + if axis == 1: + chunk_size = chunk_size[::-1] + in_df = yield from recursive_tile(in_df.rechunk(chunk_size)) + elif isinstance(op.func, str) or ( + isinstance(op.func, list) and any(isinstance(e, str) for e in op.func) + ): + # builtin cols handles whole columns, thus merge is needed + if in_df.chunk_shape[0] > 1: + in_df = yield from recursive_tile(in_df.rechunk((in_df.shape[axis],))) + + chunks = [] + axis_index_map = dict() + col_sizes = [] + for c in in_df.chunks: + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + params = c.params.copy() + + if out_df.ndim == 2: + if out_df.dtypes is None: + new_dtypes = None + new_shape = [c.shape[0], np.nan] + new_index = c.index + new_columns_value = None + if c.index[0] == 0: + col_sizes.append(np.nan) + elif isinstance(c, DATAFRAME_CHUNK_TYPE): + columns = c.columns_value.to_pandas() + new_dtypes = filter_dtypes_by_index(out_df.dtypes, columns) + + if len(new_dtypes) == 0: + continue + if c.index[0] == 0: + col_sizes.append(len(new_dtypes)) + + new_index = list(c.index) + try: + new_index[1 - op.axis] = 
axis_index_map[c.index[1 - op.axis]] + except KeyError: + new_index[1 - op.axis] = axis_index_map[ + c.index[1 - op.axis] + ] = len(axis_index_map) + + if isinstance(op.func, dict): + new_op._func = dict( + (k, v) for k, v in op.func.items() if k in new_dtypes + ) + + new_shape = list(c.shape) + new_shape[1] = len(new_dtypes) + + if op.call_agg: + new_shape[op.axis] = np.nan + params["index_value"] = parse_index( + None, c.key, c.index_value.key + ) + new_columns_value = parse_index(new_dtypes.index) + else: + new_dtypes = out_df.dtypes + new_index = c.index + (0,) + new_shape = [c.shape[0], len(new_dtypes)] + if op.call_agg: + new_shape[0] = np.nan + if c.index[0] == 0: + col_sizes.append(len(new_dtypes)) + new_columns_value = out_df.columns_value + params.update( + dict( + dtypes=new_dtypes, + shape=tuple(new_shape), + index=tuple(new_index), + columns_value=new_columns_value, + ) + ) + else: + params["dtype"] = out_df.dtype + if isinstance(in_df, DATAFRAME_TYPE): + params.pop("columns_value", None) + params["index_value"] = out_df.index_value + params["shape"] = (c.shape[1 - op.axis],) + params["index"] = (c.index[1 - op.axis],) + chunks.append(new_op.new_chunk([c], **params)) + + if out_df.ndim == 2: + new_nsplits = [in_df.nsplits[0], tuple(col_sizes)] + if op.call_agg: + new_nsplits[op.axis] = (np.nan,) + elif op.call_agg: + if isinstance(in_df, DATAFRAME_TYPE): + new_nsplits = (in_df.nsplits[1],) + else: + new_nsplits = ((np.nan,),) + else: + new_nsplits = in_df.nsplits + + new_op = op.copy() + kw = out_df.params.copy() + kw.update(dict(chunks=chunks, nsplits=tuple(new_nsplits))) + return new_op.new_tileables(op.inputs, **kw) + + def _infer_df_func_returns(self, df, dtypes): + if self.output_types[0] == OutputType.dataframe: + test_df = build_df(df, fill_value=1, size=2) + try: + with np.errstate(all="ignore"), quiet_stdio(): + if self.call_agg: + infer_df = test_df.agg( + self._func, axis=self._axis, *self.args, **self.kwds + ) + else: + infer_df = test_df.transform( + self._func, axis=self._axis, *self.args, **self.kwds + ) + except: # noqa: E722 + infer_df = None + else: + test_df = build_series(df, size=2, name=df.name) + try: + with np.errstate(all="ignore"), quiet_stdio(): + if self.call_agg: + infer_df = test_df.agg(self._func, args=self.args, **self.kwds) + else: + if not _with_convert_dtype: + infer_df = test_df.transform( + self._func, *self.args, **self.kwds + ) + else: # pragma: no cover + infer_df = test_df.transform( + self._func, + convert_dtype=self.convert_dtype, + args=self.args, + **self.kwds + ) + except: # noqa: E722 + infer_df = None + + if infer_df is None and dtypes is None: + raise TypeError( + "Failed to infer dtype, please specify dtypes as arguments." 
+ ) + + if infer_df is None: + is_df = self.output_types[0] == OutputType.dataframe + else: + is_df = isinstance(infer_df, pd.DataFrame) + + if is_df: + new_dtypes = make_dtypes(dtypes) if dtypes is not None else infer_df.dtypes + self.output_types = [OutputType.dataframe] + else: + new_dtypes = ( + dtypes if dtypes is not None else (infer_df.name, infer_df.dtype) + ) + self.output_types = [OutputType.series] + + return new_dtypes + + def __call__(self, df, dtypes=None, index=None, skip_infer=None): + axis = getattr(self, "axis", None) or 0 + self._axis = validate_axis(axis, df) + + if not skip_infer: + dtypes = self._infer_df_func_returns(df, dtypes) + + if self.output_types[0] == OutputType.dataframe: + new_shape = list(df.shape) + new_index_value = df.index_value + if len(new_shape) == 1: + new_shape.append(len(dtypes) if dtypes is not None else np.nan) + else: + new_shape[1] = len(dtypes) if dtypes is not None else np.nan + + if self.call_agg: + new_shape[self.axis] = np.nan + new_index_value = parse_index(None, (df.key, df.index_value.key)) + if dtypes is None: + columns_value = None + else: + columns_value = parse_index(dtypes.index, store_data=True) + return self.new_dataframe( + [df], + shape=tuple(new_shape), + dtypes=dtypes, + index_value=new_index_value, + columns_value=columns_value, + ) + else: + if dtypes is not None: + name, dtype = dtypes + else: + name, dtype = None, None + + if isinstance(df, DATAFRAME_TYPE): + new_shape = (df.shape[1 - axis],) + new_index_value = [df.columns_value, df.index_value][axis] + else: + new_shape = (np.nan,) if self.call_agg else df.shape + new_index_value = df.index_value + + return self.new_series( + [df], + shape=new_shape, + name=name, + dtype=dtype, + index_value=new_index_value, + ) + + +def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwargs): + """ + Call ``func`` on self producing a DataFrame with transformed values. + + Produced DataFrame will have same axis length as self. + + Parameters + ---------- + func : function, str, list or dict + Function to use for transforming the data. If a function, must either + work when passed a DataFrame or when passed to DataFrame.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.exp. 'sqrt']`` + - dict of axis labels -> functions, function names or list of such. + axis : {0 or 'index', 1 or 'columns'}, default 0 + If 0 or 'index': apply function to each column. + If 1 or 'columns': apply function to each row. + + dtypes : Series, default None + Specify dtypes of returned DataFrames. See `Notes` for more details. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + DataFrame + A DataFrame that must have the same length as self. + + Raises + ------ + ValueError : If the returned DataFrame has a different length than self. + + See Also + -------- + DataFrame.agg : Only perform aggregating type operations. + DataFrame.apply : Invoke function on a DataFrame. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock DataFrame and the apply call may + fail. When this happens, you need to specify a list or a pandas + Series as ``dtypes`` of output DataFrame. 
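# Hedged sketch of the fallback described in the Notes above: when dtype
# inference against the mock DataFrame fails, pass `dtypes` explicitly. The
# column names and the lambda are illustrative only; assumes mars is importable
# as below with a default local session.
import numpy as np
import pandas as pd
import mars.dataframe as md

mdf = md.DataFrame(pd.DataFrame({"a": ["x1", "x2"], "b": [10, 20]}))
r = mdf.transform(
    # fails on the all-1 mock frame used for inference, so dtypes are supplied
    lambda col: col.str.slice(1).astype(int) if col.name == "a" else col,
    dtypes=pd.Series([np.dtype(int), np.dtype(int)], index=["a", "b"]),
)
print(r.execute().fetch())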
+ + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': range(3), 'B': range(1, 4)}) + >>> df.execute() + A B + 0 0 1 + 1 1 2 + 2 2 3 + >>> df.transform(lambda x: x + 1).execute() + A B + 0 1 2 + 1 2 3 + 2 3 4 + + Even though the resulting DataFrame must have the same length as the + input DataFrame, it is possible to provide several input functions: + + >>> s = md.Series(range(3)) + >>> s.execute() + 0 0 + 1 1 + 2 2 + dtype: int64 + >>> s.transform([mt.sqrt, mt.exp]).execute() + sqrt exp + 0 0.000000 1.000000 + 1 1.000000 2.718282 + 2 1.414214 7.389056 + """ + op = TransformOperand( + func=func, + axis=axis, + args=args, + kwds=kwargs, + output_types=[OutputType.dataframe], + call_agg=kwargs.pop("_call_agg", False), + ) + return op(df, dtypes=dtypes, skip_infer=skip_infer) + + +def series_transform( + series, + func, + convert_dtype=True, + axis=0, + *args, + skip_infer=False, + dtype=None, + **kwargs +): + """ + Call ``func`` on self producing a Series with transformed values. + + Produced Series will have same axis length as self. + + Parameters + ---------- + func : function, str, list or dict + Function to use for transforming the data. If a function, must either + work when passed a Series or when passed to Series.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.exp. 'sqrt']`` + - dict of axis labels -> functions, function names or list of such. + axis : {0 or 'index'} + Parameter needed for compatibility with DataFrame. + + dtype : numpy.dtype, default None + Specify dtypes of returned DataFrames. See `Notes` for more details. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + Series + A Series that must have the same length as self. + + Raises + ------ + ValueError : If the returned Series has a different length than self. + + See Also + -------- + Series.agg : Only perform aggregating type operations. + Series.apply : Invoke function on a Series. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock Series, and the transform call may + fail. When this happens, you need to specify ``dtype`` of output + Series. 
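# Hedged sketch matching the Notes above: for a Series, the output dtype can be
# supplied via `dtype` when inference against the mock Series would fail. Names
# and values are illustrative only; assumes mars is importable as below with a
# default local session.
import numpy as np
import pandas as pd
import mars.dataframe as md

ms = md.Series(pd.Series(["a1", "a2", "a3"]))
r = ms.transform(lambda x: x.str.slice(1).astype(int), dtype=np.dtype(int))
print(r.execute().fetch())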
+ + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': range(3), 'B': range(1, 4)}) + >>> df.execute() + A B + 0 0 1 + 1 1 2 + 2 2 3 + >>> df.transform(lambda x: x + 1).execute() + A B + 0 1 2 + 1 2 3 + 2 3 4 + + Even though the resulting Series must have the same length as the + input Series, it is possible to provide several input functions: + + >>> s = md.Series(range(3)) + >>> s.execute() + 0 0 + 1 1 + 2 2 + dtype: int64 + >>> s.transform([mt.sqrt, mt.exp]).execute() + sqrt exp + 0 0.000000 1.000000 + 1 1.000000 2.718282 + 2 1.414214 7.389056 + """ + op = TransformOperand( + func=func, + axis=axis, + convert_dtype=convert_dtype, + args=args, + kwds=kwargs, + output_types=[OutputType.series], + call_agg=kwargs.pop("_call_agg", False), + ) + dtypes = (series.name, dtype) if dtype is not None else None + return op(series, dtypes=dtypes, skip_infer=skip_infer) diff --git a/python/xorbits/_mars/dataframe/base/transpose.py b/python/xorbits/_mars/dataframe/base/transpose.py new file mode 100644 index 000000000..229dc0dff --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/transpose.py @@ -0,0 +1,169 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ...core import OutputType +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameTranspose(DataFrameOperand, DataFrameOperandMixin): + _op_code_ = opcodes.TRANSPOSE + + def __init__(self, **kw): + super().__init__(**kw) + self.output_types = [OutputType.dataframe] + + def __call__(self, args): + arg = args[0] + new_shape = arg.shape[::-1] + columns_value = arg.index_value + index_value = parse_index(arg.dtypes.index) + return self.new_dataframe( + [arg], + shape=new_shape, + dtypes=None, + columns_value=columns_value, + index_value=index_value, + ) + + @classmethod + def tile(cls, op): + out_chunks = [] + for c in op.inputs[0].chunks: + chunk_op = op.copy().reset_key() + chunk_shape = tuple(s if np.isnan(s) else int(s) for s in c.shape[::-1]) + chunk_idx = c.index[::-1] + index_value = parse_index(c.dtypes.index) + columns_value = c.index_value + out_chunk = chunk_op.new_chunk( + [c], + shape=chunk_shape, + index=chunk_idx, + index_value=index_value, + columns_value=columns_value, + dtypes=None, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + nsplits = op.inputs[0].nsplits[::-1] + params = op.outputs[0].params + return new_op.new_dataframe( + op.inputs, chunks=out_chunks, nsplits=nsplits, **params + ) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.inputs[0].key] + ctx[op.outputs[0].key] = inp.transpose() + + +def transpose(*args): + """ + Transpose index and columns. + + Reflect the DataFrame over its main diagonal by writing rows as columns + and vice-versa. The property :attr:`.T` is an accessor to the method + :meth:`transpose`. 
+ + Parameters + ---------- + *args : tuple, optional + Accepted for compatibility with NumPy. + + Returns + ------- + DataFrame + The transposed DataFrame. + + See Also + -------- + numpy.transpose : Permute the dimensions of a given array. + + Notes + ----- + Transposing a DataFrame with mixed dtypes will result in a homogeneous + DataFrame with the `object` dtype. + + Examples + -------- + **Square DataFrame with homogeneous dtype** + + >>> import mars.dataframe as md + >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} + >>> df1 = md.DataFrame(data=d1).execute() + >>> df1 + col1 col2 + 0 1 3 + 1 2 4 + + >>> df1_transposed = df1.T.execute() # or df1.transpose().execute() + >>> df1_transposed + 0 1 + col1 1 2 + col2 3 4 + + When the dtype is homogeneous in the original DataFrame, we get a + transposed DataFrame with the same dtype: + + >>> df1.dtypes + col1 int64 + col2 int64 + dtype: object + + >>> df1_transposed.dtypes + 0 int64 + 1 int64 + dtype: object + + **Non-square DataFrame with mixed dtypes** + + >>> d2 = {'name': ['Alice', 'Bob'], + ... 'score': [9.5, 8], + ... 'employed': [False, True], + ... 'kids': [0, 0]} + >>> df2 = md.DataFrame(data=d2).execute() + >>> df2 + name score employed kids + 0 Alice 9.5 False 0 + 1 Bob 8.0 True 0 + + >>> df2_transposed = df2.T.execute() # or df2.transpose().execute() + >>> df2_transposed + 0 1 + name Alice Bob + score 9.5 8.0 + employed False True + kids 0 0 + + When the DataFrame has mixed dtypes, we get a transposed DataFrame with + the `object` dtype: + + >>> df2.dtypes + name object + score float64 + employed bool + kids int64 + dtype: object + + >>> df2_transposed.dtypes + 0 object + 1 object + dtype: object + """ + op = DataFrameTranspose() + return op(args) diff --git a/python/xorbits/_mars/dataframe/base/value_counts.py b/python/xorbits/_mars/dataframe/base/value_counts.py new file mode 100644 index 000000000..260257c36 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/value_counts.py @@ -0,0 +1,294 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField, Int64Field, KeyField, StringField +from ...utils import has_unknown_shape, pd_release_version +from ..core import Series +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_series, parse_index + +_keep_original_order = pd_release_version >= (1, 3, 0) + + +class DataFrameValueCounts(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.VALUE_COUNTS + + input = KeyField("input") + normalize = BoolField("normalize") + sort = BoolField("sort") + ascending = BoolField("ascending") + bins = Int64Field("bins") + dropna = BoolField("dropna") + method = StringField("method") + convert_index_to_interval = BoolField("convert_index_to_interval", default=None) + nrows = Int64Field("nrows", default=None) + + def __init__(self, **kw): + super().__init__(**kw) + self.output_types = [OutputType.series] + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.input = self._inputs[0] + + def __call__(self, inp): + test_series = build_series(inp).value_counts(normalize=self.normalize) + if self.bins is not None: + from .cut import cut + + # cut + try: + inp = cut(inp, self.bins, include_lowest=True) + except TypeError: # pragma: no cover + raise TypeError("bins argument only works with numeric data.") + + self.bins = None + self.convert_index_to_interval = True + return self.new_series( + [inp], + shape=(np.nan,), + index_value=parse_index(pd.CategoricalIndex([]), inp, store_data=False), + name=inp.name, + dtype=test_series.dtype, + ) + else: + return self.new_series( + [inp], + shape=(np.nan,), + index_value=parse_index(test_series.index, store_data=False), + name=inp.name, + dtype=test_series.dtype, + ) + + @classmethod + def tile(cls, op: "DataFrameValueCounts"): + inp = op.input + out = op.outputs[0] + + if len(inp.chunks) == 1: + chunk_op = op.copy().reset_key() + chunk_param = out.params + chunk_param["index"] = (0,) + chunk = chunk_op.new_chunk(inp.chunks, kws=[chunk_param]) + + new_op = op.copy() + param = out.params + param["chunks"] = [chunk] + param["nsplits"] = ((np.nan,),) + return new_op.new_seriess(op.inputs, kws=[param]) + + inp = Series(inp) + + if op.dropna: + inp = inp.dropna() + + inp = inp.groupby(inp, sort=not _keep_original_order).count(method=op.method) + + if op.normalize: + if op.convert_index_to_interval: + if has_unknown_shape(op.input): + yield + inp = inp.truediv(op.input.shape[0], axis=0) + else: + inp = inp.truediv(inp.sum(), axis=0) + + if op.sort: + inp = inp.sort_values( + ascending=op.ascending, + kind="mergesort" if _keep_original_order else "quicksort", + ) + + if op.nrows: + # set to sort_values + inp.op.nrows = op.nrows + elif op.nrows: + inp = inp.iloc[: op.nrows] + + ret = yield from recursive_tile(inp) + + chunks = [] + for c in ret.chunks: + chunk_op = DataFrameValueCounts( + convert_index_to_interval=op.convert_index_to_interval, + stage=OperandStage.map, + ) + chunk_params = c.params + if op.convert_index_to_interval: + # convert index to IntervalDtype + chunk_params["index_value"] = parse_index( + pd.IntervalIndex([]), c, store_data=False + ) + chunks.append(chunk_op.new_chunk([c], kws=[chunk_params])) + + new_op = op.copy() + params = out.params + params["chunks"] = chunks + params["nsplits"] = ret.nsplits + return new_op.new_seriess(out.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameValueCounts"): + if op.stage 
!= OperandStage.map: + in_data = ctx[op.input.key] + if op.convert_index_to_interval: + result = in_data.value_counts( + normalize=False, + sort=op.sort, + ascending=op.ascending, + bins=op.bins, + dropna=op.dropna, + ) + if op.normalize: + result /= in_data.shape[0] + else: + try: + result = in_data.value_counts( + normalize=op.normalize, + sort=op.sort, + ascending=op.ascending, + bins=op.bins, + dropna=op.dropna, + ) + except ValueError: + in_data = in_data.copy() + result = in_data.value_counts( + normalize=op.normalize, + sort=op.sort, + ascending=op.ascending, + bins=op.bins, + dropna=op.dropna, + ) + else: + result = ctx[op.input.key] + # set index name to None to keep consistency with pandas + result.index.name = None + if op.convert_index_to_interval: + # convert CategoricalDtype which generated in `cut` + # to IntervalDtype + result.index = result.index.astype("interval") + if op.nrows: + result = result.head(op.nrows) + ctx[op.outputs[0].key] = result + + +def value_counts( + series, + normalize=False, + sort=True, + ascending=False, + bins=None, + dropna=True, + method="auto", +): + """ + Return a Series containing counts of unique values. + + The resulting object will be in descending order so that the + first element is the most frequently-occurring element. + Excludes NA values by default. + + Parameters + ---------- + normalize : bool, default False + If True then the object returned will contain the relative + frequencies of the unique values. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + bins : int, optional + Rather than count values, group them into half-open bins, + a convenience for ``pd.cut``, only works with numeric data. + dropna : bool, default True + Don't include counts of NaN. + method : str, default 'auto' + 'auto', 'shuffle', or 'tree', 'tree' method provide + a better performance, while 'shuffle' is recommended + if aggregated result is very large, 'auto' will use + 'shuffle' method in distributed mode and use 'tree' + in local mode. + + Returns + ------- + Series + + See Also + -------- + Series.count: Number of non-NA elements in a Series. + DataFrame.count: Number of non-NA elements in a DataFrame. + + Examples + -------- + >>> import mars.dataframe as md + >>> import mars.tensor as mt + + >>> s = md.Series([3, 1, 2, 3, 4, mt.nan]) + >>> s.value_counts().execute() + 3.0 2 + 4.0 1 + 2.0 1 + 1.0 1 + dtype: int64 + + With `normalize` set to `True`, returns the relative frequency by + dividing all values by the sum of values. + + >>> s = md.Series([3, 1, 2, 3, 4, mt.nan]) + >>> s.value_counts(normalize=True).execute() + 3.0 0.4 + 4.0 0.2 + 2.0 0.2 + 1.0 0.2 + dtype: float64 + + **bins** + + Bins can be useful for going from a continuous variable to a + categorical variable; instead of counting unique + apparitions of values, divide the index in the specified + number of half-open bins. + + >>> s.value_counts(bins=3).execute() + (2.0, 3.0] 2 + (0.996, 2.0] 2 + (3.0, 4.0] 1 + dtype: int64 + + **dropna** + + With `dropna` set to `False` we can also see NaN index values. 
+ + >>> s.value_counts(dropna=False).execute() + 3.0 2 + NaN 1 + 4.0 1 + 2.0 1 + 1.0 1 + dtype: int64 + """ + op = DataFrameValueCounts( + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + method=method, + ) + return op(series) diff --git a/python/xorbits/_mars/dataframe/contrib/__init__.py b/python/xorbits/_mars/dataframe/contrib/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/__init__.py b/python/xorbits/_mars/dataframe/contrib/raydataset/__init__.py new file mode 100644 index 000000000..7bab2ce69 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .dataset import get_chunk_refs, to_ray_dataset +from .mldataset import ChunkRefBatch, to_ray_mldataset diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/dataset.py b/python/xorbits/_mars/dataframe/contrib/raydataset/dataset.py new file mode 100644 index 000000000..5e900bf53 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/dataset.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
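+
+# A minimal usage sketch for this module (the names below are illustrative
+# only; a Mars-on-Ray session is assumed to be initialized already):
+#
+#     import mars.dataframe as md
+#     mdf = md.DataFrame({"a": [1, 2, 3]}).execute()
+#     ray_ds = to_ray_dataset(mdf, num_shards=2)  # a ray.data.Dataset
+#     ray_ds.show()
+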
+import operator +from functools import reduce +from typing import Dict, List + +from ....utils import lazy_import +from .mldataset import _rechunk_if_needed + +ray = lazy_import("ray") +# Ray Datasets is available in early preview at ray.data with Ray 1.6+ +# (and ray.experimental.data in Ray 1.5) +ray_dataset = lazy_import("ray.data", rename="ray_dataset") + + +def to_ray_dataset(df, num_shards: int = None): + """Create a Ray Dataset from Mars DataFrame + + Args: + df (mars.dataframe.Dataframe): the Mars DataFrame + num_shards (int, optional): the number of shards that will be created + for the Ray Dataset. Defaults to None. + If num_shards is None, chunks will be grouped by nodes where they lie. + Otherwise, chunks will be grouped by their order in DataFrame. + + Returns: + a Ray Dataset + """ + df = _rechunk_if_needed(df, num_shards) + # chunk_addr_refs is fetched directly rather than in batches + # during `fetch` procedure, it'll be checked that df has been executed + # items in chunk_addr_refs are ordered by positions in df + # while adjacent chunks may belong to different addrs, i.e. + # chunk1 for addr1, + # chunk2 & chunk3 for addr2, + # chunk4 for addr1 + chunk_refs: List["ray.ObjectRef"] = get_chunk_refs(df) + dataset = ray_dataset.from_pandas_refs(chunk_refs) + # Hold mars dataframe to avoid mars dataframe and ray object gc. + dataset.dataframe = df + + def __getstate__(): + state = dataset.__dict__.copy() + state.pop("dataframe", None) + return state + + # `dataframe` is not serializable by ray. + dataset.__getstate__ = __getstate__ + return dataset + + +def get_chunk_refs(df): + fetched_infos: Dict[str, List] = df.fetch_infos(["object_refs"]) + object_refs = reduce(operator.concat, fetched_infos["object_refs"]) + return object_refs diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/mldataset.py b/python/xorbits/_mars/dataframe/contrib/raydataset/mldataset.py new file mode 100644 index 000000000..6384c3116 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/mldataset.py @@ -0,0 +1,137 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from typing import Dict, Iterable, List, Tuple + +import numpy as np +import pandas as pd + +from ....utils import lazy_import + +ray = lazy_import("ray") +parallel_it = lazy_import("ray.util.iter", rename="parallel_it") +ml_dataset = lazy_import("ray.util.data", rename="ml_dataset") + + +class ChunkRefBatch: + def __init__(self, shard_id: int, obj_refs: "ray.ObjectRef"): + """Iterable batch holding a list of ray.ObjectRefs. 
+ + Args: + shard_id (int): id of the shard + prefix (str): prefix name of the batch + obj_refs (List[ray.ObjectRefs]): list of ray.ObjectRefs + """ + self._shard_id = shard_id + self._obj_refs = obj_refs + + @property + def shard_id(self) -> int: + return self._shard_id + + def __iter__(self) -> Iterable[pd.DataFrame]: + """Returns the item_generator required from ParallelIteratorWorker.""" + for obj_ref in self._obj_refs: + yield ray.get(obj_ref) + + +def _group_chunk_refs( + chunk_addr_refs: List[Tuple[Tuple, "ray.ObjectRef"]], num_shards: int +): + """Group fetched ray.ObjectRefs into a dict for later use. + + Args: + chunk_addr_refs (List[Tuple[Tuple, ray.ObjectRef]]): a list of tuples of + band & ray.ObjectRef of each chunk. + num_shards (int): the number of shards that will be created for the MLDataset. + + Returns: + Dict[str, List[ray.ObjectRef]]: a dict that defines which group of ray.ObjectRefs will + be in an ChunkRefBatch. + """ + group_to_obj_refs = defaultdict(list) + if not num_shards: + for addr, obj_ref in chunk_addr_refs: + group_to_obj_refs[addr].append(obj_ref) + else: + splits = np.array_split([ref for _, ref in chunk_addr_refs], num_shards) + for idx, split in enumerate(splits): + group_to_obj_refs["group-" + str(idx)] = list(split) + return group_to_obj_refs + + +def _rechunk_if_needed(df, num_shards: int = None): + try: + if num_shards: + assert isinstance(num_shards, int) and num_shards > 0 + df = df.rebalance(axis=0, num_partitions=num_shards) + df = df.rechunk({1: df.shape[1]}) + df = df.reset_index(drop=True) + return df.execute() + except Exception as e: # pragma: no cover + raise Exception(f"rechunk failed df.shape {df.shape}") from e + + +def to_ray_mldataset(df, num_shards: int = None): + """Create a MLDataset from Mars DataFrame + + Args: + df (mars.dataframe.Dataframe): the Mars DataFrame + num_shards (int, optional): the number of shards that will be created + for the MLDataset. Defaults to None. + If num_shards is None, chunks will be grouped by nodes where they lie. + Otherwise, chunks will be grouped by their order in DataFrame. + + Returns: + a MLDataset + """ + df = _rechunk_if_needed(df, num_shards) + # chunk_addr_refs is fetched directly rather than in batches + # during `fetch` procedure, it'll be checked that df has been executed + # items in chunk_addr_refs are ordered by positions in df + # while adjacent chunks may belong to different addrs, i.e. + # chunk1 for addr1, + # chunk2 & chunk3 for addr2, + # chunk4 for addr1 + fetched_infos: Dict[str, List] = df.fetch_infos(fields=["bands", "object_refs"]) + chunk_addr_refs: List[Tuple[Tuple, "ray.ObjectRef"]] = [] + for bands, object_refs in zip(fetched_infos["bands"], fetched_infos["object_refs"]): + chunk_addr_ref = ( + (bands[0], object_refs[0]) if bands else ("ray_dag_0", object_refs[0]) + ) + chunk_addr_refs.append(chunk_addr_ref) + + group_to_obj_refs: Dict[str, List[ray.ObjectRef]] = _group_chunk_refs( + chunk_addr_refs, num_shards + ) + + record_batches = [] + for rank, obj_refs in enumerate(group_to_obj_refs.values()): + record_batches.append(ChunkRefBatch(shard_id=rank, obj_refs=obj_refs)) + worker_cls = ray.remote(num_cpus=0)(parallel_it.ParallelIteratorWorker) + actors = [worker_cls.remote(g, False) for g in record_batches] + it = parallel_it.from_actors(actors, "from_mars") + dataset = ml_dataset.from_parallel_iter(it, need_convert=False, batch_size=0) + # Hold mars dataframe to avoid mars dataframe and ray object gc. 
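+    # Without this back-reference the Mars DataFrame could be garbage
+    # collected, which would in turn release the Ray ObjectRefs that the
+    # MLDataset shards still point to.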
+ dataset.dataframe = df + + def __getstate__(): + state = dataset.__dict__.copy() + state.pop("dataframe", None) + return state + + # `dataframe` is not serializable by ray. + dataset.__getstate__ = __getstate__ + return dataset diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/tests/__init__.py b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_mldataset.py b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_mldataset.py new file mode 100644 index 000000000..14d612e0c --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_mldataset.py @@ -0,0 +1,150 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pandas as pd +import pytest + +from ..... 
import dataframe as md +from .....conftest import MARS_CI_BACKEND +from .....deploy.oscar.ray import new_cluster +from .....deploy.oscar.session import new_session +from .....tests.core import require_ray +from .....utils import lazy_import +from ....contrib import raydataset as mdd +from ....utils import ray_deprecate_ml_dataset + +ray = lazy_import("ray") +ml_dataset = lazy_import("ray.util.data", rename="ml_dataset") + +try: + import xgboost_ray +except ImportError: # pragma: no cover + xgboost_ray = None +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + + +@pytest.fixture +async def create_cluster(request): + client = await new_cluster( + supervisor_mem=256 * 1024**2, + worker_num=2, + worker_cpu=1, + worker_mem=256 * 1024**2, + backend=MARS_CI_BACKEND, + ) + async with client: + yield client + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.skipif( + ray_deprecate_ml_dataset in (True, None), + reason="Ray (>=2.0) has deprecated MLDataset.", +) +async def test_dataset_related_classes(ray_start_regular_shared): + from ..mldataset import ChunkRefBatch + + # in order to pass checks + value1 = np.random.rand(10, 10) + value2 = np.random.rand(10, 10) + df1 = pd.DataFrame(value1) + df2 = pd.DataFrame(value2) + if ray: + obj_ref1, obj_ref2 = ray.put(df1), ray.put(df2) + batch = ChunkRefBatch(shard_id=0, obj_refs=[obj_ref1, obj_ref2]) + assert batch.shard_id == 0 + # the first data in batch + batch = iter(batch) + pd.testing.assert_frame_equal(next(batch), df1) + pd.testing.assert_frame_equal(next(batch), df2) + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.parametrize("chunk_size_and_num_shards", [[5, 5], [5, 4], [None, None]]) +@pytest.mark.skipif( + ray_deprecate_ml_dataset in (True, None), + reason="Ray (>=2.0) has deprecated MLDataset.", +) +async def test_convert_to_ray_mldataset( + ray_start_regular_shared, create_cluster, chunk_size_and_num_shards +): + assert create_cluster.session + session = new_session(address=create_cluster.address, backend="ray") + with session: + value = np.random.rand(10, 10) + chunk_size, num_shards = chunk_size_and_num_shards + df: md.DataFrame = md.DataFrame(value, chunk_size=chunk_size) + df.execute() + + ds = mdd.to_ray_mldataset(df, num_shards=num_shards) + assert isinstance(ds, ml_dataset.MLDataset) + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.skipif(xgboost_ray is None, reason="xgboost_ray not installed") +@pytest.mark.skipif( + ray_deprecate_ml_dataset in (True, None), + reason="Ray (>=2.0) has deprecated MLDataset.", +) +async def test_mars_with_xgboost(ray_start_regular_shared, create_cluster): + from sklearn.datasets import load_breast_cancer + from xgboost_ray import RayDMatrix, RayParams, predict, train + + assert create_cluster.session + session = new_session(address=create_cluster.address, backend="ray") + with session: + train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True) + df: md.DataFrame = md.concat( + [md.DataFrame(train_x), md.DataFrame(train_y)], axis=1 + ) + df.execute() + + num_shards = 4 + ds = mdd.to_ray_mldataset(df, num_shards) + assert isinstance(ds, ml_dataset.MLDataset) + + import gc + + gc.collect() # Ensure MLDataset does hold mars dataframe to avoid gc. 
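+        # The explicit collection above is deliberate: if `ds` did not keep a
+        # reference to the Mars DataFrame, the backing ObjectRefs could be
+        # released here and the training below would fail.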
+ + # train + train_set = RayDMatrix(ds, "target") + evals_result = {} + bst = train( + { + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + }, + train_set, + evals_result=evals_result, + evals=[(train_set, "train")], + verbose_eval=False, + ray_params=RayParams( + num_actors=num_shards, cpus_per_actor=1 # Number of remote actors + ), + ) + bst.save_model("model.xgb") + assert os.path.exists("model.xgb") + os.remove("model.xgb") + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) + predict(bst, train_set, ray_params=RayParams(num_actors=2)) diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_raydataset.py b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_raydataset.py new file mode 100644 index 000000000..7d2b4b6b4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_raydataset.py @@ -0,0 +1,188 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from .....conftest import MARS_CI_BACKEND +from .....deploy.oscar.ray import new_cluster +from .....deploy.oscar.session import new_session +from .....tests.core import require_ray +from .....utils import lazy_import +from ....contrib import raydataset as mdd + +ray = lazy_import("ray") +# Ray Datasets is available in early preview at ray.data with Ray 1.6+ +# (and ray.experimental.data in Ray 1.5) +ray_dataset = lazy_import("ray.data", rename="ray_dataset") +xgboost_ray = lazy_import("xgboost_ray") +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + + +@pytest.fixture +async def create_cluster(request): + client = await new_cluster( + supervisor_mem=256 * 1024**2, + worker_num=2, + worker_cpu=1, + worker_mem=256 * 1024**2, + backend=MARS_CI_BACKEND, + ) + async with client: + yield client + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.parametrize("chunk_size_and_num_shards", [[3, 3], [3, 2], [None, None]]) +async def test_convert_to_ray_dataset( + ray_start_regular_shared, create_cluster, chunk_size_and_num_shards +): + assert create_cluster.session + session = new_session(address=create_cluster.address, default=True) + with session: + value = np.random.rand(10, 10) + chunk_size, num_shards = chunk_size_and_num_shards + # ray dataset needs str columns + df: md.DataFrame = md.DataFrame( + value, + chunk_size=chunk_size, + columns=[f"c{i}" for i in range(value.shape[1])], + ) + df.execute() + + ds = mdd.to_ray_dataset(df, num_shards=num_shards) + assert isinstance(ds, ray_dataset.Dataset) + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.skipif(xgboost_ray is None, reason="xgboost_ray not installed") +async def test_mars_with_xgboost(ray_start_regular_shared, create_cluster): + from sklearn.datasets import load_breast_cancer + from xgboost_ray import RayDMatrix, RayParams, train + + assert create_cluster.session + session = 
new_session(address=create_cluster.address, backend="ray") + with session: + train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True) + pd_df = pd.concat([train_x, train_y], axis=1) + df: md.DataFrame = md.DataFrame(pd_df) + df.execute() + + num_shards = 4 + ds = md.to_ray_dataset(df, num_shards=num_shards) + assert isinstance(ds, ray_dataset.Dataset) + + # train + train_set = RayDMatrix(ds, "target") + evals_result = {} + bst = train( + { + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + }, + train_set, + evals_result=evals_result, + evals=[(train_set, "train")], + verbose_eval=False, + ray_params=RayParams( + num_actors=num_shards, cpus_per_actor=1 # Number of remote actors + ), + ) + bst.save_model("model.xgb") + assert os.path.exists("model.xgb") + os.remove("model.xgb") + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.skipif(sklearn is None, reason="sklearn not installed") +@pytest.mark.skipif(xgboost_ray is None, reason="xgboost_ray not installed") +async def test_mars_with_xgboost_sklearn_clf(ray_start_regular_shared, create_cluster): + from sklearn.datasets import load_breast_cancer + from xgboost_ray import RayDMatrix, RayParams, RayXGBClassifier + + assert create_cluster.session + session = new_session(address=create_cluster.address, backend="ray") + with session: + train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True) + df: md.DataFrame = md.concat( + [md.DataFrame(train_x), md.DataFrame(train_y)], axis=1 + ) + df.execute() + columns = list(df.columns.to_pandas()) + print(f"Columns {columns}, pandas columns {train_x.columns}") + assert columns[:-1] == list(train_x.columns) + num_shards = 4 + ds = md.to_ray_dataset(df, num_shards) + assert isinstance(ds, ray_dataset.Dataset) + print(f"Columns {columns}, dataset columns {train_x.columns}") + assert columns == ds.schema().names + import gc + + gc.collect() # Ensure MLDataset does hold mars dataframe to avoid gc. + ray_params = RayParams(num_actors=2, cpus_per_actor=1) + clf = RayXGBClassifier( + ray_params=ray_params, + random_state=42, + use_label_encoder=False, + num_class=2, + ) + # train + clf.fit(RayDMatrix(ds, "target"), y=None, ray_params=ray_params) + clf.predict(RayDMatrix(ds, "target")) + # Enable it when https://github.com/ray-project/xgboost_ray/issues/177 got fixed + # pred = clf.predict(train_x) + # print("predicted values: ", pred) + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.skipif(sklearn is None, reason="sklearn not installed") +@pytest.mark.skipif(xgboost_ray is None, reason="xgboost_ray not installed") +async def test_mars_with_xgboost_sklearn_reg(ray_start_regular_shared, create_cluster): + from sklearn.datasets import make_regression + from xgboost_ray import RayDMatrix, RayParams, RayXGBRegressor + + assert create_cluster.session + session = new_session(address=create_cluster.address, backend="ray") + with session: + np_X, np_y = make_regression(n_samples=1_0000, n_features=10) + columns = [f"c{i}" for i in range(np_X.shape[1])] + X, y = md.DataFrame(np_X, columns=columns), md.DataFrame({"target": np_y}) + df: md.DataFrame = md.concat([md.DataFrame(X), md.DataFrame(y)], axis=1) + df.execute() + + num_shards = 4 + ds = md.to_ray_dataset(df, num_shards) + assert isinstance(ds, ray_dataset.Dataset) + + import gc + + gc.collect() # Ensure Dataset does hold mars dataframe to avoid gc. 
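+        # RayXGBRegressor exposes the scikit-learn estimator interface
+        # (fit/predict); here it consumes the shards of the Ray Dataset
+        # through RayDMatrix rather than an in-memory matrix.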
+ ray_params = RayParams(num_actors=2, cpus_per_actor=1) + reg = RayXGBRegressor(ray_params=ray_params, random_state=42) + # train + reg.fit(RayDMatrix(ds, "target"), y=None, ray_params=ray_params) + reg.predict(RayDMatrix(ds, "target")) + reg.predict(pd.DataFrame(np_X, columns=columns)) diff --git a/python/xorbits/_mars/dataframe/core.py b/python/xorbits/_mars/dataframe/core.py new file mode 100644 index 000000000..92cea24b3 --- /dev/null +++ b/python/xorbits/_mars/dataframe/core.py @@ -0,0 +1,3264 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import operator +import weakref +from collections.abc import Iterable +from io import StringIO +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import pandas as pd + +from ..core import ( + ENTITY_TYPE, + Chunk, + ChunkData, + HasShapeTileable, + HasShapeTileableData, + OutputType, + Tileable, + _ExecuteAndFetchMixin, + is_build_mode, + register_output_types, +) +from ..core.entity.utils import refresh_tileable_shape +from ..deploy.oscar.session import get_default_session +from ..lib.groupby_wrapper import GroupByWrapper +from ..serialization.serializables import ( + AnyField, + BoolField, + DataTypeField, + DictField, + FieldTypes, + Int8Field, + Int32Field, + IntervalArrayField, + ListField, + NDArrayField, + OneOfField, + ReferenceField, + Serializable, + SeriesField, + SliceField, + StringField, + TupleField, +) +from ..tensor import statistics +from ..utils import ( + calc_nsplits, + ceildiv, + estimate_pandas_size, + on_deserialize_shape, + on_serialize_numpy_type, + on_serialize_shape, + tokenize, +) +from .utils import ReprSeries, fetch_corner_data, merge_index_value, parse_index + + +class IndexValue(Serializable): + """ + Meta class for index, held by IndexData, SeriesData and DataFrameData + """ + + __slots__ = () + + class IndexBase(Serializable): + _key = StringField("key") # to identify if the index is the same + _is_monotonic_increasing = BoolField("is_monotonic_increasing") + _is_monotonic_decreasing = BoolField("is_monotonic_decreasing") + _is_unique = BoolField("is_unique") + _max_val = AnyField("max_val", on_serialize=on_serialize_numpy_type) + _max_val_close = BoolField("max_val_close") + _min_val = AnyField("min_val", on_serialize=on_serialize_numpy_type) + _min_val_close = BoolField("min_val_close") + + @property + def is_monotonic_increasing(self): + return self._is_monotonic_increasing + + @property + def is_monotonic_decreasing(self): + return self._is_monotonic_decreasing + + @property + def is_unique(self): + return self._is_unique + + @property + def min_val(self): + return self._min_val + + @property + def min_val_close(self): + return self._min_val_close + + @property + def max_val(self): + return self._max_val + + @property + def max_val_close(self): + return self._max_val_close + + @property + def key(self): + return self._key + + @property + def inferred_type(self): + return None + + def to_pandas(self): + kw = { + field.tag: 
getattr(self, attr, None) + for attr, field in self._FIELDS.items() + if attr not in super(type(self), self)._FIELDS + } + kw = {k: v for k, v in kw.items() if v is not None} + if kw.get("data") is None: + kw["data"] = [] + + pd_initializer = getattr(self, "_pd_initializer", None) + if pd_initializer is None: + pd_initializer = getattr(pd, type(self).__name__) + return pd_initializer(**kw) + + class Index(IndexBase): + _name = AnyField("name") + _data = NDArrayField("data") + _dtype = DataTypeField("dtype") + + class RangeIndex(IndexBase): + _name = AnyField("name") + _slice = SliceField("slice") + _dtype = DataTypeField("dtype") + + @property + def slice(self): + return self._slice + + @property + def dtype(self): + return getattr(self, "_dtype", np.dtype(np.intc)) + + def to_pandas(self): + slc = self._slice + return pd.RangeIndex( + slc.start, slc.stop, slc.step, name=getattr(self, "_name", None) + ) + + class CategoricalIndex(IndexBase): + _name = AnyField("name") + _data = NDArrayField("data") + _categories = AnyField("categories") + _ordered = BoolField("ordered") + + @property + def inferred_type(self): + return "categorical" + + class IntervalIndex(IndexBase): + _name = AnyField("name") + _data = IntervalArrayField("data") + _closed = StringField("closed") + + @property + def inferred_type(self): + return "interval" + + class DatetimeIndex(IndexBase): + _name = AnyField("name") + _data = NDArrayField("data") + _freq = AnyField("freq") + _start = AnyField("start") + _periods = AnyField("periods") + _end = AnyField("end") + _closed = AnyField("closed") + _tz = AnyField("tz") + _ambiguous = AnyField("ambiguous") + _dayfirst = BoolField("dayfirst") + _yearfirst = BoolField("yearfirst") + + @property + def inferred_type(self): + return "datetime64" + + @property + def freq(self): + return getattr(self, "_freq", None) + + class TimedeltaIndex(IndexBase): + _name = AnyField("name") + _data = NDArrayField("data") + _unit = AnyField("unit") + _freq = AnyField("freq") + _start = AnyField("start") + _periods = AnyField("periods") + _end = AnyField("end") + _closed = AnyField("closed") + + @property + def inferred_type(self): + return "timedelta64" + + class PeriodIndex(IndexBase): + _name = AnyField("name") + _data = NDArrayField("data") + _freq = AnyField("freq") + _start = AnyField("start") + _periods = AnyField("periods") + _end = AnyField("end") + _year = AnyField("year") + _month = AnyField("month") + _quarter = AnyField("quarter") + _day = AnyField("day") + _hour = AnyField("hour") + _minute = AnyField("minute") + _second = AnyField("second") + _tz = AnyField("tz") + _dtype = DataTypeField("dtype") + + @property + def inferred_type(self): + return "period" + + class Int64Index(IndexBase): + _pd_initializer = pd.Index + + _name = AnyField("name") + _data = NDArrayField("data") + _dtype = DataTypeField("dtype") + + @property + def inferred_type(self): + return "integer" + + class UInt64Index(IndexBase): + _pd_initializer = pd.Index + + _name = AnyField("name") + _data = NDArrayField("data") + _dtype = DataTypeField("dtype") + + @property + def inferred_type(self): + return "integer" + + class Float64Index(IndexBase): + _pd_initializer = pd.Index + + _name = AnyField("name") + _data = NDArrayField("data") + _dtype = DataTypeField("dtype") + + @property + def inferred_type(self): + return "floating" + + class MultiIndex(IndexBase): + _names = ListField("names", on_serialize=list) + _dtypes = ListField("dtypes", on_serialize=list) + _data = NDArrayField("data") + _sortorder = 
Int32Field("sortorder") + + @property + def inferred_type(self): + return "mixed" + + @property + def names(self) -> list: + return self._names + + def to_pandas(self): + data = getattr(self, "_data", None) + sortorder = getattr(self, "_sortorder", None) + + def _build_empty_array(dtype): + try: + return np.array([], dtype=dtype) + except TypeError: # pragma: no cover + return pd.array([], dtype=dtype) + + if data is None: + return pd.MultiIndex.from_arrays( + [_build_empty_array(dtype) for dtype in self._dtypes], + sortorder=sortorder, + names=self._names, + ) + return pd.MultiIndex.from_tuples( + [tuple(d) for d in data], sortorder=sortorder, names=self._names + ) + + _index_value = OneOfField( + "index_value", + index=Index, + range_index=RangeIndex, + categorical_index=CategoricalIndex, + interval_index=IntervalIndex, + datetime_index=DatetimeIndex, + timedelta_index=TimedeltaIndex, + period_index=PeriodIndex, + int64_index=Int64Index, + uint64_index=UInt64Index, + float64_index=Float64Index, + multi_index=MultiIndex, + ) + + def __mars_tokenize__(self): + # return object for tokenize + v = self._index_value + return v._key + + @property + def value(self): + return self._index_value + + @property + def key(self): + return self._index_value.key + + @property + def is_monotonic_increasing(self): + return self._index_value.is_monotonic_increasing + + @property + def is_monotonic_decreasing(self): + return self._index_value.is_monotonic_decreasing + + @property + def is_monotonic_increasing_or_decreasing(self): + return self.is_monotonic_increasing or self.is_monotonic_decreasing + + @property + def is_unique(self): + return self._index_value.is_unique + + @property + def min_val(self): + return self._index_value.min_val + + @property + def min_val_close(self): + return self._index_value.min_val_close + + @property + def max_val(self): + return self._index_value.max_val + + @property + def max_val_close(self): + return self._index_value.max_val_close + + @property + def min_max(self): + return ( + self._index_value.min_val, + self._index_value.min_val_close, + self._index_value.max_val, + self._index_value.max_val_close, + ) + + @property + def name(self): + return getattr(self._index_value, "_name", None) + + @property + def inferred_type(self): + return self._index_value.inferred_type + + def has_value(self): + if isinstance(self._index_value, self.RangeIndex): + if np.isnan(self._index_value.max_val): + return False + else: + return True + elif getattr(self._index_value, "_data", None) is not None: + return True + return False + + def to_pandas(self): + return self._index_value.to_pandas() + + +class DtypesValue(Serializable): + """ + Meta class for dtypes. 
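+
+    ``value`` holds a pandas Series mapping column labels to dtypes, and
+    ``key`` is a token (``tokenize(value)`` by default) used to cheaply
+    compare dtype collections.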
+ """ + + __slots__ = () + + _key = StringField("key") + _value = SeriesField("value") + + def __init__(self, key=None, value=None, **kw): + super().__init__(_key=key, _value=value, **kw) + if self._key is None: + self._key = tokenize(self._value) + + @property + def key(self): + return self._key + + @property + def value(self): + return self._value + + +def refresh_index_value(tileable: ENTITY_TYPE): + index_to_index_values = dict() + for chunk in tileable.chunks: + if chunk.ndim == 1: + index_to_index_values[chunk.index] = chunk.index_value + elif chunk.index[1] == 0: + index_to_index_values[chunk.index] = chunk.index_value + index_value = merge_index_value(index_to_index_values, store_data=False) + # keep key as original index_value's + if tileable.index_value is not None: + index_value._index_value._key = tileable.index_value.key + tileable._index_value = index_value + + +def refresh_dtypes(tileable: ENTITY_TYPE): + all_dtypes = [c.dtypes_value.value for c in tileable.chunks if c.index[0] == 0] + dtypes = pd.concat(all_dtypes) + tileable._dtypes = dtypes + columns_values = parse_index(dtypes.index, store_data=True) + tileable._columns_value = columns_values + tileable._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes) + + +_tileable_key_property = "_tileable_key" +_tileable_dtypes_property = "_tileable_dtypes" +_tileable_index_value_property = "_tileable_index_value" +_tileable_columns_value_property = "_tileable_columns_value" +_nsplits_property = "_tileable_nsplits" +_lazy_chunk_meta_properties = ( + _tileable_key_property, + _tileable_dtypes_property, + _tileable_index_value_property, + _tileable_columns_value_property, + _nsplits_property, +) + + +class LazyMetaChunkData(ChunkData): + __slots__ = _lazy_chunk_meta_properties + + def _set_tileable_meta( + self, + tileable_key: str = None, + nsplits: Tuple[Tuple[int, ...]] = None, + index_value: IndexValue = None, + columns_value: IndexValue = None, + dtypes: pd.Series = None, + ): + setattr(self, _tileable_key_property, tileable_key) + setattr(self, _nsplits_property, nsplits) + setattr(self, _tileable_index_value_property, index_value) + setattr(self, _tileable_columns_value_property, columns_value) + setattr(self, _tileable_dtypes_property, dtypes) + + +def is_chunk_meta_lazy(chunk: ChunkData) -> bool: + chunk = chunk.data if hasattr(chunk, "data") else chunk + return isinstance(chunk, LazyMetaChunkData) and hasattr( + chunk, _tileable_key_property + ) + + +@functools.lru_cache(maxsize=128) +def _get_cum_nsplit(nsplit: Tuple[int]) -> List[int]: + return [0] + np.cumsum(nsplit).tolist() + + +def _calc_axis_slice(nsplit: Tuple[int], index: int) -> slice: + if not isinstance(nsplit, tuple): + nsplit = tuple(nsplit) + cum_nsplit = _get_cum_nsplit(nsplit) + return slice(cum_nsplit[index], cum_nsplit[index + 1]) + + +class ChunkDtypesField(SeriesField): + _tileable_key_index_to_dtypes = dict() + + @staticmethod + def _gen_chunk_dtypes(instance: Chunk, index: int) -> Optional[pd.Series]: + # dtypes of tileable + try: + tileable_key = getattr(instance, _tileable_key_property) + except AttributeError: + return + cache = ChunkDtypesField._tileable_key_index_to_dtypes + try: + return cache[tileable_key, index] + except KeyError: + tileable_dtypes = getattr(instance, _tileable_dtypes_property) + # nsplits of tileable + nsplits = getattr(instance, _nsplits_property)[1] + # calc slice + slc = _calc_axis_slice(nsplits, index) + dtypes = tileable_dtypes.iloc[slc] + cache[tileable_key, index] = dtypes + return dtypes + + def 
__get__(self, instance, owner=None): + if not issubclass(owner, LazyMetaChunkData): # pragma: no cover + return super().__get__(instance, owner) + + try: + value = self.get(instance, owner) + if value is not None: + return value + except AttributeError: # pragma: no cover + pass + + if instance.index is None: + return super().__get__(instance, owner) + + # get dtypes lazily + index = instance.index[1] + dtypes = self._gen_chunk_dtypes(instance, index) + # cache dtypes + self.set(instance, dtypes) + return dtypes + + +class ChunkIndexValueField(ReferenceField): + _tileable_key_index_to_index_value = dict() + + @staticmethod + def _gen_chunk_index_value(instance: Chunk, index: int) -> Optional[IndexValue]: + # index_value of tileable + try: + tileable_key = getattr(instance, _tileable_key_property) + except AttributeError: + return + cache = ChunkIndexValueField._tileable_key_index_to_index_value + try: + return cache[tileable_key, index] + except KeyError: + tileable_index_value = getattr(instance, _tileable_index_value_property) + # nsplits of tileable + nsplit = getattr(instance, _nsplits_property)[0] + # calc slice + slc = _calc_axis_slice(nsplit, index) + pd_index = tileable_index_value.to_pandas() + if np.isnan(slc.stop - slc.start): + chunk_pd_index = pd_index[:0] + else: + chunk_pd_index = pd_index[slc] + index_value = parse_index( + chunk_pd_index, + key=f"{tileable_index_value.key}_index_{index}_{slc.start}_{slc.stop}", + ) + cache[tileable_key, index] = index_value + return index_value + + def __get__(self, instance, owner=None): + if not issubclass(owner, LazyMetaChunkData): # pragma: no cover + return super().__get__(instance, owner) + + try: + value = self.get(instance, owner) + if value is not None: + return value + except AttributeError: # pragma: no cover + pass + + if instance.index is None: + return super().__get__(instance, owner) + + # get index_value lazily + index = instance.index[0] + index_value = self._gen_chunk_index_value(instance, index) + # cache index_value + self.set(instance, index_value) + return index_value + + +class ChunkColumnsValueField(ReferenceField): + _tileable_key_index_to_index_value = dict() + + @staticmethod + def _gen_chunk_columns_value(instance: Chunk, index: int) -> Optional[IndexValue]: + # columns_value of tileable + try: + tileable_key = getattr(instance, _tileable_key_property) + except AttributeError: + return + cache = ChunkColumnsValueField._tileable_key_index_to_index_value + try: + return cache[tileable_key, index] + except KeyError: + tileable_columns_value = getattr(instance, _tileable_columns_value_property) + # nsplits of tileable + nsplit = getattr(instance, _nsplits_property)[1] + # calc slice + slc = _calc_axis_slice(nsplit, index) + pd_index = tileable_columns_value.to_pandas() + chunk_pd_index = ( + pd_index[:0] if np.isnan(slc.stop - slc.start) else pd_index[slc] + ) + columns_value = parse_index(chunk_pd_index, store_data=True) + cache[tileable_key, index] = columns_value + return columns_value + + def __get__(self, instance, owner=None): + if not issubclass(owner, LazyMetaChunkData): # pragma: no cover + return super().__get__(instance, owner) + + try: + value = self.get(instance, owner) + if value is not None: + return value + except AttributeError: # pragma: no cover + pass + + if instance.index is None: + return super().__get__(instance, owner) + + # get columns_value lazily + index = instance.index[1] + columns_value = self._gen_chunk_columns_value(instance, index) + # cache columns_value + self.set(instance, 
columns_value) + return columns_value + + +class IndexChunkData(ChunkData): + __slots__ = () + type_name = "Index" + + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + # optional field + _dtype = DataTypeField("dtype") + _name = AnyField("name") + _index_value = ReferenceField("index_value", IndexValue) + + def __init__( + self, + op=None, + shape=None, + index=None, + dtype=None, + name=None, + index_value=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _index=index, + _dtype=dtype, + _name=name, + _index_value=index_value, + **kw, + ) + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "shape": self.shape, + "dtype": self.dtype, + "index": self.index, + "index_value": self.index_value, + "name": self.name, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + name = params.pop("name", None) + if name is not None: + self._name = name + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + @classmethod + def get_params_from_data(cls, data: pd.Index) -> Dict[str, Any]: + return { + "shape": data.shape, + "dtype": data.dtype, + "index_value": parse_index(data, store_data=False), + "name": data.name, + } + + @property + def shape(self): + return getattr(self, "_shape", None) + + @property + def ndim(self): + return len(self.shape) + + @property + def dtype(self): + return self._dtype + + @property + def name(self): + return self._name + + @property + def index_value(self): + return self._index_value + + +class IndexChunk(Chunk): + __slots__ = () + _allow_data_type_ = (IndexChunkData,) + type_name = "Index" + + +def _on_deserialize_index_value(index_value): + if index_value is None: + return + try: + getattr(index_value, "value") + return index_value + except AttributeError: + return + + +class _ToPandasMixin(_ExecuteAndFetchMixin): + __slots__ = () + + def to_pandas(self, session=None, **kw): + return self._execute_and_fetch(session=session, **kw) + + +class _BatchedFetcher: + __slots__ = () + + def _iter(self, batch_size=None, session=None, **kw): + from .indexing.iloc import iloc + + if batch_size is not None: + size = self.shape[0] + n_batch = ceildiv(size, batch_size) + + if n_batch > 1: + for i in range(n_batch): + batch_data = iloc(self)[batch_size * i : batch_size * (i + 1)] + yield batch_data._fetch(session=session, **kw) + else: + yield self._fetch(session=session, **kw) + else: + # if batch_size is not specified, use first batch to estimate + # batch_size. 
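+            # Heuristic: fetch the first 1000 rows, estimate their in-memory
+            # size with estimate_pandas_size, then size the remaining batches
+            # so each one holds roughly `default_batch_bytes` (50 MB) of data.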
+ default_batch_bytes = 50 * 1024**2 + first_batch = 1000 + size = self.shape[0] + + if size >= first_batch: + batch_data = iloc(self)[:first_batch] + first_batch_data = batch_data._fetch(session=session, **kw) + yield first_batch_data + data_size = estimate_pandas_size(first_batch_data) + batch_size = int(default_batch_bytes / data_size * first_batch) + n_batch = ceildiv(size - 1000, batch_size) + for i in range(n_batch): + batch_data = iloc(self)[ + first_batch + + batch_size * i : first_batch + + batch_size * (i + 1) + ] + yield batch_data._fetch(session=session, **kw) + else: + yield self._fetch(session=session, **kw) + + def iterbatch(self, batch_size=None, session=None, **kw): + # trigger execution + self.execute(session=session, **kw) + return self._iter(batch_size=batch_size, session=session) + + def fetch(self, session=None, **kw): + from .indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem + + batch_size = kw.pop("batch_size", None) + if isinstance(self.op, (DataFrameIlocGetItem, SeriesIlocGetItem)): + # see GH#1871 + # already iloc, do not trigger batch fetch + return self._fetch(session=session, **kw) + else: + batches = list(self._iter(batch_size=batch_size, session=session, **kw)) + return pd.concat(batches) if len(batches) > 1 else batches[0] + + def fetch_infos(self, fields=None, session=None, **kw): + return self._fetch_infos(fields=fields, session=session, **kw) + + +class IndexData(HasShapeTileableData, _ToPandasMixin): + __slots__ = () + type_name = "Index" + + # optional field + _dtype = DataTypeField("dtype") + _name = AnyField("name") + _names = AnyField("names") + _index_value = ReferenceField( + "index_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + _chunks = ListField( + "chunks", + FieldTypes.reference(IndexChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [IndexChunk(it) for it in x] if x is not None else x, + ) + + def __init__( + self, + op=None, + shape=None, + nsplits=None, + dtype=None, + name=None, + names=None, + index_value=None, + chunks=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _nsplits=nsplits, + _dtype=dtype, + _name=name, + _names=names, + _index_value=index_value, + _chunks=chunks, + **kw, + ) + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new tileable object + return { + "shape": self.shape, + "dtype": self.dtype, + "name": self.name, + "index_value": self.index_value, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + name = params.pop("name", None) + if name is not None: + self._name = name + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + def refresh_params(self): + # refresh params when chunks updated + refresh_tileable_shape(self) + refresh_index_value(self) + if self._dtype is None: + self._dtype = self.chunks[0].dtype + if self._name is None: + self._name = self.chunks[0].name + + def _to_str(self, representation=False): + if is_build_mode() or len(self._executed_sessions) == 0: + # in build mode, or not executed, just return representation + if representation: + return f"Index IndexValue: + 
return self._index_value + + @property + def inferred_type(self): + return self._index_value.inferred_type + + def to_tensor(self, dtype=None, extract_multi_index=False): + from ..tensor.datasource.from_dataframe import from_index + + return from_index(self, dtype=dtype, extract_multi_index=extract_multi_index) + + +class Index(HasShapeTileable, _ToPandasMixin): + __slots__ = "_df_or_series", "_parent_key", "_axis" + _allow_data_type_ = (IndexData,) + type_name = "Index" + + def __new__(cls, data: Union[pd.Index, IndexData] = None, **_): + if data is not None and not isinstance(data, pd.Index): + # create corresponding Index class + # according to type of index_value + clz = globals()[type(data.index_value.value).__name__] + else: + clz = cls + return object.__new__(clz) + + def __len__(self): + return len(self._data) + + def __mars_tensor__(self, dtype=None, order="K"): + return self._data.__mars_tensor__(dtype=dtype, order=order) + + def _get_df_or_series(self): + obj = getattr(self, "_df_or_series", None) + if obj is not None: + return obj() + return None + + def _set_df_or_series(self, df_or_series, axis): + self._df_or_series = weakref.ref(df_or_series) + self._parent_key = df_or_series.key + self._axis = axis + + @property + def T(self): + """Return the transpose, which is by definition self.""" + return self + + @property + def name(self): + return self._data.name + + @name.setter + def name(self, value): + df_or_series = self._get_df_or_series() + if df_or_series is not None and df_or_series.key == self._parent_key: + df_or_series.rename_axis(value, axis=self._axis, inplace=True) + self.data = df_or_series.axes[self._axis].data + else: + self.rename(value, inplace=True) + + @property + def names(self): + return self._data.names + + @names.setter + def names(self, value): + df_or_series = self._get_df_or_series() + if df_or_series is not None: + df_or_series.rename_axis(value, axis=self._axis, inplace=True) + self.data = df_or_series.axes[self._axis].data + else: + self.rename(value, inplace=True) + + @property + def values(self): + return self.to_tensor() + + def to_frame(self, index: bool = True, name=None): + """ + Create a DataFrame with a column containing the Index. + + Parameters + ---------- + index : bool, default True + Set the index of the returned DataFrame as the original Index. + + name : object, default None + The passed name should substitute for the index name (if it has + one). + + Returns + ------- + DataFrame + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. + + Examples + -------- + >>> import mars.dataframe as md + >>> idx = md.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame().execute() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. To enforce a new Index: + + >>> idx.to_frame(index=False).execute() + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo').execute() + zoo + 0 Ant + 1 Bear + 2 Cow + """ + from . 
import dataframe_from_tensor + + if isinstance(self.index_value.value, IndexValue.MultiIndex): + old_names = self.index_value.value.names + + if ( + name is not None + and not isinstance(name, Iterable) + or isinstance(name, str) + ): + raise TypeError("'name' must be a list / sequence of column names.") + + name = list(name if name is not None else old_names) + if len(name) != len(old_names): + raise ValueError( + "'name' should have same length as number of levels on index." + ) + + columns = [ + old or new or idx for idx, (old, new) in enumerate(zip(old_names, name)) + ] + else: + columns = [name or self.name or 0] + index_ = self if index else None + return dataframe_from_tensor( + self._data._to_mars_tensor(self, extract_multi_index=True), + index=index_, + columns=columns, + ) + + def to_series(self, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys. + + Useful with map for returning an indexer based on an index. + + Parameters + ---------- + index : Index, optional + Index of resulting Series. If None, defaults to original index. + name : str, optional + Dame of resulting Series. If None, defaults to name of original + index. + + Returns + ------- + Series + The dtype will be based on the type of the Index values. + """ + from . import series_from_index + + return series_from_index(self, index=index, name=name) + + +class RangeIndex(Index): + __slots__ = () + + +class CategoricalIndex(Index): + __slots__ = () + + +class IntervalIndex(Index): + __slots__ = () + + +class DatetimeIndex(Index): + __slots__ = () + + +class TimedeltaIndex(Index): + __slots__ = () + + +class PeriodIndex(Index): + __slots__ = () + + +class Int64Index(Index): + __slots__ = () + + +class UInt64Index(Index): + __slots__ = () + + +class Float64Index(Index): + __slots__ = () + + +class MultiIndex(Index): + __slots__ = () + + +class BaseSeriesChunkData(LazyMetaChunkData): + __slots__ = () + + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + # optional field + _dtype = DataTypeField("dtype") + _name = AnyField("name") + _index_value = ChunkIndexValueField( + "index_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + + def __init__( + self, + op=None, + shape=None, + index=None, + dtype=None, + name=None, + index_value=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _index=index, + _dtype=dtype, + _name=name, + _index_value=index_value, + **kw, + ) + + def _get_params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "shape": self.shape, + "dtype": self.dtype, + "index": self.index, + "index_value": self.index_value, + "name": self.name, + } + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + name = params.pop("name", None) + if name is not None: + self._name = name + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + params = property(_get_params, _set_params) + + @classmethod + def get_params_from_data(cls, data: pd.Series) -> Dict[str, Any]: + return { + "shape": 
data.shape, + "dtype": data.dtype, + "index_value": parse_index(data.index, store_data=False), + "name": data.name, + } + + @property + def shape(self): + return getattr(self, "_shape", None) + + @property + def ndim(self): + return len(self.shape) + + @property + def dtype(self): + return self._dtype + + @property + def name(self): + return self._name + + @property + def index_value(self): + return self._index_value + + +class SeriesChunkData(BaseSeriesChunkData): + type_name = "Series" + + +class SeriesChunk(Chunk): + __slots__ = () + _allow_data_type_ = (SeriesChunkData,) + type_name = "Series" + + +class BaseSeriesData(HasShapeTileableData, _ToPandasMixin): + __slots__ = "_cache", "_accessors" + + # optional field + _dtype = DataTypeField("dtype") + _name = AnyField("name") + _index_value = ReferenceField( + "index_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + _chunks = ListField( + "chunks", + FieldTypes.reference(SeriesChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [SeriesChunk(it) for it in x] if x is not None else x, + ) + + def __init__( + self, + op=None, + shape=None, + nsplits=None, + dtype=None, + name=None, + index_value=None, + chunks=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _nsplits=nsplits, + _dtype=dtype, + _name=name, + _index_value=index_value, + _chunks=chunks, + **kw, + ) + self._accessors = dict() + + def _get_params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new tileable object + return { + "shape": self.shape, + "dtype": self.dtype, + "name": self.name, + "index_value": self.index_value, + } + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + name = params.pop("name", None) + if name is not None: + self._name = name + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + params = property(_get_params, _set_params) + + def refresh_params(self): + # refresh params when chunks updated + refresh_tileable_shape(self) + refresh_index_value(self) + if self._dtype is None: + self._dtype = self.chunks[0].dtype + if self._name is None: + self._name = self.chunks[0].name + + def _to_str(self, representation=False): + if is_build_mode() or len(self._executed_sessions) == 0: + # in build mode, or not executed, just return representation + if representation: + return ( + f"{self.type_name} " + ) + else: + return f"{self.type_name}(op={type(self._op).__name__})" + else: + corner_data = fetch_corner_data(self, session=self._executed_sessions[-1]) + + buf = StringIO() + max_rows = pd.get_option("display.max_rows") + corner_max_rows = ( + max_rows if self.shape[0] <= max_rows else corner_data.shape[0] - 1 + ) # make sure max_rows < corner_data + + with pd.option_context("display.max_rows", corner_max_rows): + if self.shape[0] <= max_rows: + corner_series = corner_data + else: + corner_series = ReprSeries(corner_data, self.shape) + buf.write(repr(corner_series) if representation else str(corner_series)) + + return buf.getvalue() + + def __str__(self): + return self._to_str(representation=False) + + def __repr__(self): + return self._to_str(representation=False) + + @property + def 
dtype(self): + return getattr(self, "_dtype", None) or getattr(self.op, "dtype", None) + + @property + def name(self): + return self._name + + @property + def index_value(self): + return self._index_value + + @property + def index(self): + from .datasource.index import from_tileable + + return from_tileable(self) + + @property + def axes(self): + return [self.index] + + @property + def empty(self): + shape = getattr(self, "_shape") + if np.any(np.isnan(shape)): + raise ValueError("Tileable object must be executed first") + return shape == (0,) + + def to_tensor(self, dtype=None): + from ..tensor.datasource.from_dataframe import from_series + + return from_series(self, dtype=dtype) + + @staticmethod + def from_tensor(in_tensor, index=None, name=None): + from .datasource.from_tensor import series_from_tensor + + return series_from_tensor(in_tensor, index=index, name=name) + + +class SeriesData(_BatchedFetcher, BaseSeriesData): + type_name = "Series" + + def __mars_tensor__(self, dtype=None, order="K"): + tensor = self.to_tensor() + dtype = dtype if dtype is not None else tensor.dtype + return tensor.astype(dtype=dtype, order=order, copy=False) + + def iteritems(self, batch_size=10000, session=None): + for batch_data in self.iterbatch(batch_size=batch_size, session=session): + yield from getattr(batch_data, "iteritems")() + + items = iteritems + + def to_dict(self, into=dict, batch_size=10000, session=None): + fetch_kwargs = dict(batch_size=batch_size) + return self.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_dict( + into=into + ) + + +class Series(HasShapeTileable, _ToPandasMixin): + __slots__ = ("_cache",) + _allow_data_type_ = (SeriesData,) + type_name = "Series" + + def to_tensor(self, dtype=None): + return self._data.to_tensor(dtype=dtype) + + def from_tensor(self, in_tensor, index=None, name=None): + return self._data.from_tensor(in_tensor, index=index, name=name) + + @property + def T(self): + """Return the transpose, which is by definition self.""" + return self + + @property + def ndim(self): + """ + Return an int representing the number of axes / array dimensions. + + Return 1 if Series. Otherwise return 2 if DataFrame. + + See Also + -------- + ndarray.ndim : Number of array dimensions. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s.ndim.execute() + 1 + + >>> df = md.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.ndim.execute() + 2 + """ + return super().ndim + + @property + def index(self): + """ + The index (axis labels) of the Series. + """ + idx = self._data.index + idx._set_df_or_series(self, 0) + return idx + + @index.setter + def index(self, new_index): + self.set_axis(new_index, axis=0, inplace=True) + + @property + def name(self): + return self._data.name + + @name.setter + def name(self, val): + from .indexing.rename import DataFrameRename + + op = DataFrameRename(new_name=val, output_types=[OutputType.series]) + new_series = op(self) + self.data = new_series.data + + @property + def dtype(self): + """ + Return the dtype object of the underlying data. + """ + return self._data.dtype + + def copy(self, deep=True): # pylint: disable=arguments-differ + """ + Make a copy of this object's indices and data. + + When ``deep=True`` (default), a new object will be created with a + copy of the calling object's data and indices. Modifications to + the data or indices of the copy will not be reflected in the + original object (see notes below). 
+ + When ``deep=False``, a new object will be created without copying + the calling object's data or index (only references to the data + and index are copied). Any changes to the data of the original + will be reflected in the shallow copy (and vice versa). + + Parameters + ---------- + deep : bool, default True + Make a deep copy, including a copy of the data and the indices. + With ``deep=False`` neither the indices nor the data are copied. + + Returns + ------- + copy : Series or DataFrame + Object type matches caller. + """ + if deep: + return super().copy() + else: + return super()._view() + + def __len__(self): + return len(self._data) + + def __mars_tensor__(self, dtype=None, order="K"): + return self._data.__mars_tensor__(dtype=dtype, order=order) + + def keys(self): + """ + Return alias for index. + + Returns + ------- + Index + Index of the Series. + """ + return self.index + + @property + def values(self): + return self.to_tensor() + + def iteritems(self, batch_size=10000, session=None): + """ + Lazily iterate over (index, value) tuples. + + This method returns an iterable tuple (index, value). This is + convenient if you want to create a lazy iterator. + + Returns + ------- + iterable + Iterable of tuples containing the (index, value) pairs from a + Series. + + See Also + -------- + DataFrame.items : Iterate over (column name, Series) pairs. + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(['A', 'B', 'C']) + >>> for index, value in s.items(): + ... print(f"Index : {index}, Value : {value}") + Index : 0, Value : A + Index : 1, Value : B + Index : 2, Value : C + """ + return self._data.iteritems(batch_size=batch_size, session=session) + + items = iteritems + + def to_dict(self, into=dict, batch_size=10000, session=None): + """ + Convert Series to {label -> value} dict or dict-like object. + + Parameters + ---------- + into : class, default dict + The collections.abc.Mapping subclass to use as the return + object. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + Returns + ------- + collections.abc.Mapping + Key-value representation of Series. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3, 4]) + >>> s.to_dict() + {0: 1, 1: 2, 2: 3, 3: 4} + >>> from collections import OrderedDict, defaultdict + >>> s.to_dict(OrderedDict) + OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)]) + >>> dd = defaultdict(list) + >>> s.to_dict(dd) + defaultdict(, {0: 1, 1: 2, 2: 3, 3: 4}) + """ + return self._data.to_dict(into=into, batch_size=batch_size, session=session) + + def to_frame(self, name=None): + """ + Convert Series to DataFrame. + + Parameters + ---------- + name : object, default None + The passed name should substitute for the series name (if it has + one). + + Returns + ------- + DataFrame + DataFrame representation of Series. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(["a", "b", "c"], name="vals") + >>> s.to_frame().execute() + vals + 0 a + 1 b + 2 c + """ + from . import dataframe_from_tensor + + name = name or self.name or 0 + return dataframe_from_tensor(self, columns=[name]) + + def between(self, left, right, inclusive="both"): + """ + Return boolean Series equivalent to left <= series <= right. 
+ This function returns a boolean vector containing `True` wherever the + corresponding Series element is between the boundary values `left` and + `right`. NA values are treated as `False`. + + Parameters + ---------- + left : scalar or list-like + Left boundary. + right : scalar or list-like + Right boundary. + inclusive : {"both", "neither", "left", "right"} + Include boundaries. Whether to set each bound as closed or open. + + Returns + ------- + Series + Series representing whether each element is between left and + right (inclusive). + + See Also + -------- + Series.gt : Greater than of series and other. + Series.lt : Less than of series and other. + + Notes + ----- + This function is equivalent to ``(left <= ser) & (ser <= right)`` + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([2, 0, 4, 8, np.nan]) + + Boundary values are included by default: + + >>> s.between(1, 4).execute() + 0 True + 1 False + 2 True + 3 False + 4 False + dtype: bool + + With `inclusive` set to ``"neither"`` boundary values are excluded: + + >>> s.between(1, 4, inclusive="neither").execute() + 0 True + 1 False + 2 False + 3 False + 4 False + dtype: bool + + `left` and `right` can be any scalar value: + + >>> s = md.Series(['Alice', 'Bob', 'Carol', 'Eve']) + >>> s.between('Anna', 'Daniel').execute() + 0 False + 1 True + 2 True + 3 False + dtype: bool + """ + if isinstance(inclusive, bool): # pragma: no cover + # for pandas < 1.3.0 + if inclusive: + inclusive = "both" + else: + inclusive = "neither" + if inclusive == "both": + lmask = self >= left + rmask = self <= right + elif inclusive == "left": + lmask = self >= left + rmask = self < right + elif inclusive == "right": + lmask = self > left + rmask = self <= right + elif inclusive == "neither": + lmask = self > left + rmask = self < right + else: + raise ValueError( + "Inclusive has to be either string of 'both'," + "'left', 'right', or 'neither'." + ) + + return lmask & rmask + + def median( + self, axis=None, skipna=True, out=None, overwrite_input=False, keepdims=False + ): + """ + Return the median of the values over the requested axis. + + Parameters + ---------- + axis : {index (0)} + Axis or axes along which the medians are computed. The default + is to compute the median along a flattened version of the tensor. + A sequence of axes is supported since version 1.9.0. + skipna : bool, optional, default True + Exclude NA/null values when computing the result. + out : Tensor, default None + Output tensor in which to place the result. It must + have the same shape and buffer length as the expected output, + but the type (of the output) will be cast if necessary. + overwrite_input : bool, default False + Just for compatibility with Numpy, would not take effect. + keepdims : bool, default False + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `arr`. + + Returns + ------- + median : scalar + Return the median of the values over the requested axis. + + See Also + -------- + tensor.mean, tensor.percentile + + Notes + ----- + Given a vector ``V`` of length ``N``, the median of ``V`` is the + middle value of a sorted copy of ``V``, ``V_sorted`` - i + e., ``V_sorted[(N-1)/2]``, when ``N`` is odd, and the average of the + two middle values of ``V_sorted`` when ``N`` is even. 
+ + Examples + -------- + >>> import mars.dataframe as md + >>> a = md.Series([10, 7, 4, 3, 2, 1]) + >>> a.median().execute() + 2.0 + >>> mt.median(a).execute() + 3.5 + >>> a = md.Series([10, 7, 4, None, 2, 1]) + >>> a.median().execute() + 4.0 + >>> a.median(skipna=False).execute() + nan + """ + if skipna: + return statistics.median( + self.dropna(), + axis=None, + out=None, + overwrite_input=False, + keepdims=False, + ) + else: + return statistics.median( + self, axis=None, out=None, overwrite_input=False, keepdims=False + ) + + +class BaseDataFrameChunkData(LazyMetaChunkData): + __slots__ = ("_dtypes_value",) + _no_copy_attrs_ = ChunkData._no_copy_attrs_ | {"_dtypes", "_columns_value"} + + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + # optional fields + _dtypes = ChunkDtypesField("dtypes") + _index_value = ChunkIndexValueField( + "index_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + _columns_value = ChunkColumnsValueField("columns_value", IndexValue) + + def __init__( + self, + op=None, + shape=None, + index=None, + dtypes=None, + index_value=None, + columns_value=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _index=index, + _dtypes=dtypes, + _index_value=index_value, + _columns_value=columns_value, + **kw, + ) + self._dtypes_value = None + + def __on_deserialize__(self): + super(BaseDataFrameChunkData, self).__on_deserialize__() + self._dtypes_value = None + + def __len__(self): + return self.shape[0] + + def _get_params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "shape": self.shape, + "dtypes": self.dtypes, + "dtypes_value": self.dtypes_value, + "index": self.index, + "index_value": self.index_value, + "columns_value": self.columns_value, + } + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + dtypes = params.pop("dtypes", None) + if dtypes is not None: + self._dtypes = dtypes + columns_value = params.pop("columns_value", None) + if columns_value is not None: + self._columns_value = columns_value + dtypes_value = params.pop("dtypes_value", None) + if dtypes_value is not None: + if dtypes is None: + self._dtypes = dtypes_value.value + if columns_value is None: + self._columns_value = parse_index(self._dtypes.index, store_data=True) + self._dtypes_value = dtypes_value + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + params = property(_get_params, _set_params) + + @classmethod + def get_params_from_data(cls, data: pd.DataFrame) -> Dict[str, Any]: + parse_index(data.index, store_data=False) + return { + "shape": data.shape, + "index_value": parse_index(data.index, store_data=False), + "dtypes_value": DtypesValue(key=tokenize(data.dtypes), value=data.dtypes), + } + + @property + def shape(self): + return getattr(self, "_shape", None) + + @property + def ndim(self): + return len(self.shape) + + @property + def dtypes(self): + dt = getattr(self, "_dtypes", None) + if dt is not None: + return dt + return getattr(self.op, "dtypes", None) + + @property + def dtypes_value(self): + if self._dtypes_value is not None: + return self._dtypes_value + # 
TODO(qinxuye): when creating Dataframe, + # dtypes_value instead of dtypes later must be passed into + dtypes = self.dtypes + if dtypes is not None: + self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes) + return self._dtypes_value + + @property + def index_value(self): + return self._index_value + + @property + def columns_value(self): + return self._columns_value + + +class DataFrameChunkData(BaseDataFrameChunkData): + type_name = "DataFrame" + + +class DataFrameChunk(Chunk): + __slots__ = () + _allow_data_type_ = (DataFrameChunkData,) + type_name = "DataFrame" + + def __len__(self): + return len(self._data) + + +class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin): + __slots__ = "_accessors", "_dtypes_value", "_dtypes_dict" + + # optional fields + _dtypes = SeriesField("dtypes") + _index_value = ReferenceField( + "index_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + _columns_value = ReferenceField("columns_value", IndexValue) + _chunks = ListField( + "chunks", + FieldTypes.reference(DataFrameChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [DataFrameChunk(it) for it in x] + if x is not None + else x, + ) + + def __init__( + self, + op=None, + shape=None, + nsplits=None, + dtypes=None, + index_value=None, + columns_value=None, + chunks=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _nsplits=nsplits, + _dtypes=dtypes, + _index_value=index_value, + _columns_value=columns_value, + _chunks=chunks, + **kw, + ) + self._accessors = dict() + self._dtypes_value = None + self._dtypes_dict = None + + def __on_deserialize__(self): + super().__on_deserialize__() + self._accessors = dict() + self._dtypes_value = None + self._dtypes_dict = None + + def _get_params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new tileable object + return { + "shape": self.shape, + "dtypes": self.dtypes, + "index_value": self.index_value, + "columns_value": self.columns_value, + "dtypes_value": self.dtypes_value, + } + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + dtypes = params.pop("dtypes", None) + if dtypes is not None: + self._dtypes = dtypes + columns_value = params.pop("columns_value", None) + if columns_value is not None: + self._columns_value = columns_value + dtypes_value = params.pop("dtypes_value", None) + if dtypes_value is not None: + if dtypes is None: + self._dtypes = dtypes_value.value + if columns_value is None: + self._columns_value = parse_index(self._dtypes.index, store_data=True) + self._dtypes_value = dtypes_value + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + params = property(_get_params, _set_params) + + def refresh_params(self): + # refresh params when chunks updated + refresh_tileable_shape(self) + refresh_index_value(self) + refresh_dtypes(self) + + @property + def dtypes(self): + dt = getattr(self, "_dtypes", None) + if dt is not None: + return dt + return getattr(self.op, "dtypes", None) + + @property + def dtypes_value(self): + if self._dtypes_value is not None: + return self._dtypes_value + # TODO(qinxuye): when creating Dataframe, + # dtypes_value instead of dtypes later must be passed into + dtypes = self.dtypes + if dtypes is not None: + 
self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes) + return self._dtypes_value + + @property + def index_value(self): + return self._index_value + + @property + def columns_value(self): + return self._columns_value + + @property + def empty(self): + shape = getattr(self, "_shape") + if np.any(np.isnan(shape)): + raise ValueError("Tileable object must be executed first") + return 0 in shape + + def to_tensor(self, dtype=None): + from ..tensor.datasource.from_dataframe import from_dataframe + + return from_dataframe(self, dtype=dtype) + + @staticmethod + def from_tensor(in_tensor, index=None, columns=None): + from .datasource.from_tensor import dataframe_from_tensor + + return dataframe_from_tensor(in_tensor, index=index, columns=columns) + + @staticmethod + def from_records(records, **kw): + from .datasource.from_records import from_records + + return from_records(records, **kw) + + @property + def index(self): + from .datasource.index import from_tileable + + return from_tileable(self) + + @property + def columns(self): + from .datasource.index import from_pandas as from_pandas_index + + return from_pandas_index(self.dtypes.index, store_data=True) + + @property + def axes(self): + return [self.index, self.columns] + + def _get_dtypes_dict(self): + if self._dtypes_dict is None: + self._dtypes_dict = d = dict() + for k, v in self.dtypes.items(): + try: + obj_list = d[k] + except KeyError: + obj_list = d[k] = [] + obj_list.append(v) + return self._dtypes_dict + + def _get_dtypes_by_columns(self, columns: list): + dtypes_dict = self._get_dtypes_dict() + return functools.reduce(operator.add, (dtypes_dict[c] for c in columns), []) + + def _get_columns_by_columns(self, columns: list): + dtypes_dict = self._get_dtypes_dict() + return functools.reduce( + operator.add, ([c] * len(dtypes_dict[c]) for c in columns), [] + ) + + +class DataFrameData(_BatchedFetcher, BaseDataFrameData): + type_name = "DataFrame" + + def _to_str(self, representation=False): + if is_build_mode() or len(self._executed_sessions) == 0: + # in build mode, or not executed, just return representation + if representation: + return ( + f"{self.type_name} " + ) + else: + return f"{self.type_name}(op={type(self._op).__name__})" + else: + corner_data = fetch_corner_data(self, session=self._executed_sessions[-1]) + + buf = StringIO() + max_rows = pd.get_option("display.max_rows") + + if self.shape[0] <= max_rows: + buf.write(repr(corner_data) if representation else str(corner_data)) + else: + # remember we cannot directly call repr(df), + # because the [... rows x ... 
columns] may show wrong rows + with pd.option_context( + "display.show_dimensions", + False, + "display.max_rows", + corner_data.shape[0] - 1, + ): + if representation: + s = repr(corner_data) + else: + s = str(corner_data) + buf.write(s) + if pd.get_option("display.show_dimensions"): + n_rows, n_cols = self.shape + buf.write(f"\n\n[{n_rows} rows x {n_cols} columns]") + + return buf.getvalue() + + def __str__(self): + return self._to_str(representation=False) + + def __repr__(self): + return self._to_str(representation=True) + + def __mars_tensor__(self, dtype=None, order="K"): + return self.to_tensor().astype(dtype=dtype, order=order, copy=False) + + def _repr_html_(self): + if len(self._executed_sessions) == 0: + # not executed before, fall back to normal repr + raise NotImplementedError + + corner_data = fetch_corner_data(self, session=self._executed_sessions[-1]) + + buf = StringIO() + max_rows = pd.get_option("display.max_rows") + if self.shape[0] <= max_rows: + buf.write(corner_data._repr_html_()) + else: + with pd.option_context( + "display.show_dimensions", + False, + "display.max_rows", + corner_data.shape[0] - 1, + ): + buf.write(corner_data._repr_html_().rstrip().rstrip("")) + if pd.get_option("display.show_dimensions"): + n_rows, n_cols = self.shape + buf.write(f"

<p>{n_rows} rows × {n_cols} columns</p>

\n") + buf.write("") + + return buf.getvalue() + + def items(self): + for col_name in self.dtypes.index: + yield col_name, self[col_name] + + iteritems = items + + def iterrows(self, batch_size=1000, session=None): + for batch_data in self.iterbatch(batch_size=batch_size, session=session): + yield from getattr(batch_data, "iterrows")() + + def itertuples(self, index=True, name="Pandas", batch_size=1000, session=None): + for batch_data in self.iterbatch(batch_size=batch_size, session=session): + yield from getattr(batch_data, "itertuples")(index=index, name=name) + + def _need_execution(self): + if self._dtypes is None: + return True + return False + + +class DataFrame(HasShapeTileable, _ToPandasMixin): + __slots__ = ("_cache",) + _allow_data_type_ = (DataFrameData,) + type_name = "DataFrame" + + def __len__(self): + return len(self._data) + + def to_tensor(self): + return self._data.to_tensor() + + def from_tensor(self, in_tensor, index=None, columns=None): + return self._data.from_tensor(in_tensor, index=index, columns=columns) + + def from_records(self, records, **kw): + return self._data.from_records(records, **kw) + + def __mars_tensor__(self, dtype=None, order="K"): + return self._data.__mars_tensor__(dtype=dtype, order=order) + + def __getattr__(self, key): + try: + return getattr(self._data, key) + except AttributeError: + if key in self.dtypes: + return self[key] + else: + raise + + def __dir__(self): + result = list(super().__dir__()) + return sorted( + result + + [k for k in self.dtypes.index if isinstance(k, str) and k.isidentifier()] + ) + + @property + def T(self): + return self.transpose() + + @property + def ndim(self): + """ + Return an int representing the number of axes / array dimensions. + + Return 1 if Series. Otherwise return 2 if DataFrame. + + See Also + -------- + ndarray.ndim : Number of array dimensions. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s.ndim.execute() + 1 + + >>> df = md.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.ndim.execute() + 2 + """ + return super().ndim + + @property + def index(self): + idx = self._data.index + idx._set_df_or_series(self, 0) + return idx + + @index.setter + def index(self, new_index): + self.set_axis(new_index, axis=0, inplace=True) + + @property + def columns(self): + col = self._data.columns + col._set_df_or_series(self, 1) + return col + + @columns.setter + def columns(self, new_columns): + self.set_axis(new_columns, axis=1, inplace=True) + + def keys(self): + """ + Get the 'info axis' (see Indexing for more). + + This is index for Series, columns for DataFrame. + + Returns + ------- + Index + Info axis. + """ + return self.columns + + @property + def values(self): + return self.to_tensor() + + @property + def dtypes(self): + """ + Return the dtypes in the DataFrame. + + This returns a Series with the data type of each column. + The result's index is the original DataFrame's columns. Columns + with mixed types are stored with the ``object`` dtype. See + :ref:`the User Guide ` for more. + + Returns + ------- + pandas.Series + The data type of each column. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'float': [1.0], + ... 'int': [1], + ... 'datetime': [md.Timestamp('20180310')], + ... 
'string': ['foo']}) + >>> df.dtypes + float float64 + int int64 + datetime datetime64[ns] + string object + dtype: object + """ + return self._data.dtypes + + def iterrows(self, batch_size=1000, session=None): + """ + Iterate over DataFrame rows as (index, Series) pairs. + + Yields + ------ + index : label or tuple of label + The index of the row. A tuple for a `MultiIndex`. + data : Series + The data of the row as a Series. + + it : generator + A generator that iterates over the rows of the frame. + + See Also + -------- + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + + 1. Because ``iterrows`` returns a Series for each row, + it does **not** preserve dtypes across the rows (dtypes are + preserved across columns for DataFrames). For example, + + >>> import mars.dataframe as md + >>> df = md.DataFrame([[1, 1.5]], columns=['int', 'float']) + >>> row = next(df.iterrows())[1] + >>> row + int 1.0 + float 1.5 + Name: 0, dtype: float64 + >>> print(row['int'].dtype) + float64 + >>> print(df['int'].dtype) + int64 + + To preserve dtypes while iterating over the rows, it is better + to use :meth:`itertuples` which returns namedtuples of the values + and which is generally faster than ``iterrows``. + + 2. You should **never modify** something you are iterating over. + This is not guaranteed to work in all cases. Depending on the + data types, the iterator returns a copy and not a view, and writing + to it will have no effect. + """ + return self._data.iterrows(batch_size=batch_size, session=session) + + def itertuples(self, index=True, name="Pandas", batch_size=1000, session=None): + """ + Iterate over DataFrame rows as namedtuples. + + Parameters + ---------- + index : bool, default True + If True, return the index as the first element of the tuple. + name : str or None, default "Pandas" + The name of the returned namedtuples or None to return regular + tuples. + + Returns + ------- + iterator + An object to iterate over namedtuples for each row in the + DataFrame with the first field possibly being the index and + following fields being the column values. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) + pairs. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + The column names will be renamed to positional names if they are + invalid Python identifiers, repeated, or start with an underscore. + On python versions < 3.7 regular tuples are returned for DataFrames + with a large number of columns (>254). + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, + ... index=['dog', 'hawk']) + >>> df.execute() + num_legs num_wings + dog 4 0 + hawk 2 2 + >>> for row in df.itertuples(): + ... print(row) + ... + Pandas(Index='dog', num_legs=4, num_wings=0) + Pandas(Index='hawk', num_legs=2, num_wings=2) + + By setting the `index` parameter to False we can remove the index + as the first element of the tuple: + + >>> for row in df.itertuples(index=False): + ... print(row) + ... + Pandas(num_legs=4, num_wings=0) + Pandas(num_legs=2, num_wings=2) + + With the `name` parameter set we set a custom name for the yielded + namedtuples: + + >>> for row in df.itertuples(name='Animal'): + ... print(row) + ... 
+ Animal(Index='dog', num_legs=4, num_wings=0) + Animal(Index='hawk', num_legs=2, num_wings=2) + """ + return self._data.itertuples( + batch_size=batch_size, session=session, index=index, name=name + ) + + def assign(self, **kwargs): + """ + Assign new columns to a DataFrame. + Returns a new object with all original columns in addition to new ones. + Existing columns that are re-assigned will be overwritten. + + Parameters + ---------- + **kwargs : dict of {str: callable or Series} + The column names are keywords. If the values are + callable, they are computed on the DataFrame and + assigned to the new columns. The callable must not + change input DataFrame (though pandas doesn't check it). + If the values are not callable, (e.g. a Series, scalar, or array), + they are simply assigned. + + Returns + ------- + DataFrame + A new DataFrame with the new columns in addition to + all the existing columns. + + Notes + ----- + Assigning multiple columns within the same ``assign`` is possible. + Later items in 'kwargs' may refer to newly created or modified + columns in 'df'; items are computed and assigned into 'df' in order. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'temp_c': [17.0, 25.0]}, + ... index=['Portland', 'Berkeley']) + >>> df.execute() + temp_c + Portland 17.0 + Berkeley 25.0 + + Where the value is a callable, evaluated on `df`: + + >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32).execute() + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + Alternatively, the same behavior can be achieved by directly + referencing an existing Series or sequence: + + >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32).execute() + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + You can create multiple columns within the same assign where one + of the columns depends on another one defined within the same assign: + + >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, + ... 
temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9).execute() + temp_c temp_f temp_k + Portland 17.0 62.6 290.15 + Berkeley 25.0 77.0 298.15 + """ + + def apply_if_callable(maybe_callable, obj, **kwargs): + if callable(maybe_callable): + return maybe_callable(obj, **kwargs) + + return maybe_callable + + data = self.copy() + + for k, v in kwargs.items(): + data[k] = apply_if_callable(v, data) + return data + + +class DataFrameGroupByChunkData(BaseDataFrameChunkData): + type_name = "DataFrameGroupBy" + + _key_dtypes = SeriesField("key_dtypes") + _selection = AnyField("selection") + + @property + def key_dtypes(self): + return self._key_dtypes + + @property + def selection(self): + return self._selection + + def _get_params(self) -> Dict[str, Any]: + p = super()._get_params() + p.update(dict(key_dtypes=self.key_dtypes, selection=self.selection)) + return p + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + key_dtypes = params.pop("key_dtypes", None) + if key_dtypes is not None: + self._key_dtypes = key_dtypes + selection = params.pop("selection", None) + if selection is not None: + self._selection = selection + super()._set_params(params) + + params = property(_get_params, _set_params) + + @classmethod + def get_params_from_data(cls, data: GroupByWrapper) -> Dict[str, Any]: + params = super().get_params_from_data(data.obj) + if data.selection: + dtypes = params["dtypes_value"].value[data.selection] + params["dtypes_value"] = DtypesValue(value=dtypes) + params["shape"] = data.shape + return params + + def __init__(self, key_dtypes=None, selection=None, **kw): + super().__init__(_key_dtypes=key_dtypes, _selection=selection, **kw) + + +class DataFrameGroupByChunk(Chunk): + __slots__ = () + _allow_data_type_ = (DataFrameGroupByChunkData,) + type_name = "DataFrameGroupBy" + + def __len__(self): + return len(self._data) + + +class SeriesGroupByChunkData(BaseSeriesChunkData): + type_name = "SeriesGroupBy" + + _key_dtypes = AnyField("key_dtypes") + + @property + def key_dtypes(self): + return self._key_dtypes + + def _get_params(self) -> Dict[str, Any]: + p = super()._get_params() + p["key_dtypes"] = self.key_dtypes + return p + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + key_dtypes = params.pop("key_dtypes", None) + if key_dtypes is not None: + self._key_dtypes = key_dtypes + super()._set_params(new_params) + + params = property(_get_params, _set_params) + + @classmethod + def get_params_from_data(cls, data: GroupByWrapper): + series_name = data.selection or data.obj.name + if hasattr(data.obj, "dtype"): + dtype = data.obj.dtype + else: + dtype = data.obj.dtypes[series_name] + + return { + "shape": (data.obj.shape[0],), + "dtype": dtype, + "index_value": parse_index(data.obj.index, store_data=False), + "name": series_name, + } + + def __init__(self, key_dtypes=None, **kw): + super().__init__(_key_dtypes=key_dtypes, **kw) + + +class SeriesGroupByChunk(Chunk): + __slots__ = () + _allow_data_type_ = (SeriesGroupByChunkData,) + type_name = "SeriesGroupBy" + + def __len__(self): + return len(self._data) + + +class DataFrameGroupByData(BaseDataFrameData): + type_name = "DataFrameGroupBy" + + _key_dtypes = SeriesField("key_dtypes") + _selection = AnyField("selection") + _chunks = ListField( + "chunks", + FieldTypes.reference(DataFrameGroupByChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [DataFrameGroupByChunk(it) for it in x] + if x is not None + else x, + ) + + @property 
+ def key_dtypes(self): + return self._key_dtypes + + @property + def selection(self): + return self._selection + + def _get_params(self) -> Dict[str, Any]: + p = super()._get_params() + p.update(dict(key_dtypes=self.key_dtypes, selection=self.selection)) + return p + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + key_dtypes = params.pop("key_dtypes", None) + if key_dtypes is not None: + self._key_dtypes = key_dtypes + selection = params.pop("selection", None) + if selection is not None: + self._selection = selection + super()._set_params(params) + + params = property(_get_params, _set_params) + + def __init__(self, key_dtypes=None, selection=None, **kw): + super().__init__(_key_dtypes=key_dtypes, _selection=selection, **kw) + + def _equal(self, o): + # FIXME We need to implemented a true `==` operator for DataFrameGroupby + if is_build_mode(): + return self is o + else: + return self == o + + +class SeriesGroupByData(BaseSeriesData): + type_name = "SeriesGroupBy" + + _key_dtypes = AnyField("key_dtypes") + _chunks = ListField( + "chunks", + FieldTypes.reference(SeriesGroupByChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [SeriesGroupByChunk(it) for it in x] + if x is not None + else x, + ) + + @property + def key_dtypes(self): + return self._key_dtypes + + def _get_params(self) -> Dict[str, Any]: + p = super()._get_params() + p["key_dtypes"] = self.key_dtypes + return p + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + key_dtypes = params.pop("key_dtypes", None) + if key_dtypes is not None: + self._key_dtypes = key_dtypes + super()._set_params(params) + + params = property(_get_params, _set_params) + + def __init__(self, key_dtypes=None, **kw): + super().__init__(_key_dtypes=key_dtypes, **kw) + + def _equal(self, o): + # FIXME We need to implemented a true `==` operator for DataFrameGroupby + if is_build_mode(): + return self is o + else: + return self == o + + +class GroupBy(Tileable, _ToPandasMixin): + __slots__ = () + + +class DataFrameGroupBy(GroupBy): + __slots__ = () + _allow_data_type_ = (DataFrameGroupByData,) + type_name = "DataFrameGroupBy" + + def __eq__(self, other): + return self._equal(other) + + def __hash__(self): + # NB: we have customized __eq__ explicitly, thus we need define __hash__ explicitly as well. + return super().__hash__() + + def __getattr__(self, item): + try: + return super().__getattr__(item) + except AttributeError: + if item in self.dtypes: + return self[item] + else: + raise + + def __dir__(self): + result = list(super().__dir__()) + return sorted( + result + + [k for k in self.dtypes.index if isinstance(k, str) and k.isidentifier()] + ) + + +class SeriesGroupBy(GroupBy): + __slots__ = () + _allow_data_type_ = (SeriesGroupByData,) + type_name = "SeriesGroupBy" + + def __eq__(self, other): + return self._equal(other) + + def __hash__(self): + # NB: we have customized __eq__ explicitly, thus we need define __hash__ explicitly as well. 
+ return super().__hash__() + + +class CategoricalChunkData(ChunkData): + __slots__ = () + type_name = "Categorical" + + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + # optional field + _dtype = DataTypeField("dtype") + _categories_value = ReferenceField( + "categories_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + + def __init__( + self, op=None, shape=None, index=None, dtype=None, categories_value=None, **kw + ): + super().__init__( + _op=op, + _shape=shape, + _index=index, + _dtype=dtype, + _categories_value=categories_value, + **kw, + ) + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "shape": self.shape, + "dtype": self.dtype, + "index": self.index, + "categories_value": self.categories_value, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + categories_value = params.pop("categories_value", None) + if categories_value is not None: + self._categories_value = categories_value + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + @classmethod + def get_params_from_data(cls, data: pd.Categorical) -> Dict[str, Any]: + return { + "shape": data.shape, + "dtype": data.dtype, + "categories_value": parse_index(data.categories, store_data=True), + } + + @property + def shape(self): + return getattr(self, "_shape", None) + + @property + def ndim(self): + return len(self.shape) + + @property + def dtype(self): + return self._dtype + + @property + def categories_value(self): + return self._categories_value + + +class CategoricalChunk(Chunk): + __slots__ = () + _allow_data_type_ = (CategoricalChunkData,) + type_name = "Categorical" + + +class CategoricalData(HasShapeTileableData, _ToPandasMixin): + __slots__ = ("_cache",) + type_name = "Categorical" + + # optional field + _dtype = DataTypeField("dtype") + _categories_value = ReferenceField( + "categories_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + _chunks = ListField( + "chunks", + FieldTypes.reference(CategoricalChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [CategoricalChunk(it) for it in x] + if x is not None + else x, + ) + + def __init__( + self, + op=None, + shape=None, + nsplits=None, + dtype=None, + categories_value=None, + chunks=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _nsplits=nsplits, + _dtype=dtype, + _categories_value=categories_value, + _chunks=chunks, + **kw, + ) + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new tileable object + return { + "shape": self.shape, + "dtype": self.dtype, + "categories_value": self.categories_value, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + categories_value = params.pop("categories_value", None) + if categories_value is not None: + self._categories_value = categories_value 
+ if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + def refresh_params(self): + # refresh params when chunks updated + refresh_tileable_shape(self) + if self._dtype is None: + self._dtype = self.chunks[0].dtype + if self._categories_value is None: + categories = [] + for chunk in self.chunks: + categories.extend(chunk.categories_value.to_pandas()) + self._categories_value = parse_index( + pd.Categorical(categories).categories, store_data=True + ) + + def _to_str(self, representation=False): + if is_build_mode() or len(self._executed_sessions) == 0: + # in build mode, or not executed, just return representation + if representation: + return f"{self.type_name} " + else: + return f"{self.type_name}(op={type(self.op).__name__})" + else: + data = self.fetch(session=self._executed_sessions[-1]) + return repr(data) if repr(data) else str(data) + + def __str__(self): + return self._to_str(representation=False) + + def __repr__(self): + return self._to_str(representation=True) + + def _equal(self, o): + # FIXME We need to implemented a true `==` operator for DataFrameGroupby + if is_build_mode(): + return self is o + else: # pragma: no cover + return self == o + + @property + def dtype(self): + return getattr(self, "_dtype", None) or self.op.dtype + + @property + def categories_value(self): + return self._categories_value + + def __eq__(self, other): + return self._equal(other) + + def __hash__(self): + # NB: we have customized __eq__ explicitly, thus we need define __hash__ explicitly as well. + return super().__hash__() + + +class Categorical(HasShapeTileable, _ToPandasMixin): + __slots__ = () + _allow_data_type_ = (CategoricalData,) + type_name = "Categorical" + + def __len__(self): + return len(self._data) + + def __eq__(self, other): + return self._equal(other) + + def __hash__(self): + # NB: we have customized __eq__ explicitly, thus we need define __hash__ explicitly as well. 
+ return super().__hash__() + + +class DataFrameOrSeriesChunkData(ChunkData): + __slots__ = () + type_name = "DataFrameOrSeries" + + _collapse_axis = Int8Field("collapse_axis") + _data_type = StringField("data_type") + _data_params = DictField("data_params") + + def __init__( + self, + op=None, + index=None, + collapse_axis=None, + data_type=None, + data_params=None, + **kw, + ): + self._collapse_axis = collapse_axis + self._index = index + self._data_type = data_type + self._data_params = data_params or dict() + super().__init__(_op=op, **kw) + + def __getattr__(self, item): + if item in self._data_params: + return self._data_params[item] + raise AttributeError(f"'{type(self)}' object has no attribute '{item}'") + + @property + def ndim(self) -> int: + return (self._data_type == "dataframe") + 1 + + @property + def params(self) -> Dict[str, Any]: + return { + "collapse_axis": self._collapse_axis, + "index": self._index, + "data_type": self._data_type, + "data_params": self._data_params, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + self._data_type = new_params.get("data_type") + if self._collapse_axis is not None and self._data_type == "series": + self._index = (self._index[1 - self._collapse_axis],) + if self._collapse_axis is None and self._data_type == "dataframe": + self._index = (self._index[0], 0) + data_params = new_params["data_params"] + if self._data_type == "dataframe": + data_params["dtypes"] = data_params["dtypes_value"].value + data_params["columns_value"] = parse_index( + data_params["dtypes_value"].value.index, store_data=True + ) + self._data_params = {k: v for k, v in data_params.items()} + + @classmethod + def get_params_from_data(cls, data: Any) -> Dict[str, Any]: + if isinstance(data, pd.DataFrame): + return { + "data_type": "dataframe", + "data_params": DataFrameChunkData.get_params_from_data(data), + } + else: + return { + "data_type": "series", + "data_params": SeriesChunkData.get_params_from_data(data), + } + + +class DataFrameOrSeriesChunk(Chunk): + __slots__ = () + _allow_data_type_ = (DataFrameOrSeriesChunkData,) + type_name = "DataFrameOrSeries" + + +class DataFrameOrSeriesData(HasShapeTileableData, _ToPandasMixin): + __slots__ = () + _chunks = ListField( + "chunks", + FieldTypes.reference(DataFrameOrSeriesChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [DataFrameOrSeriesChunk(it) for it in x] + if x is not None + else x, + ) + + _data_type = StringField("data_type") + _data_params = DictField("data_params") + + def __init__( + self, + op=None, + chunks=None, + data_type=None, + data_params=None, + **kw, + ): + self._data_type = data_type + self._data_params = data_params or dict() + super().__init__( + _op=op, + _chunks=chunks, + **kw, + ) + + def __getattr__(self, item): + if item in self._data_params: + return self._data_params[item] + raise AttributeError(f"'{type(self)}' object has no attribute '{item}'") + + @property + def shape(self): + return self._data_params.get("shape", None) + + @property + def nsplits(self): + return self._data_params.get("nsplits", None) + + @property + def data_type(self): + return self._data_type + + @property + def data_params(self): + return self._data_params + + @property + def params(self) -> Dict[str, Any]: + return {"data_type": self._data_type, "data_params": self._data_params} + + @params.setter + def params(self, new_params: Dict[str, Any]): + # After execution, create DataFrameFetch, and the data + # corresponding to the 
original key is still DataFrameOrSeries type, + # so when restoring DataFrameOrSeries type, + # there is no "data_type" field in params. + if "data_type" not in new_params: + if "dtype" in new_params: + self._data_type = "series" + else: + self._data_type = "dataframe" + self._data_params = new_params.copy() + else: + self._data_type = new_params.get("data_type") + self._data_params = { + k: v for k, v in new_params.get("data_params", {}).items() + } + + def refresh_params(self): + index_to_index_values = dict() + for chunk in self.chunks: + if chunk.ndim == 1: + index_to_index_values[chunk.index] = chunk.index_value + elif chunk.index[1] == 0: + index_to_index_values[chunk.index] = chunk.index_value + index_value = merge_index_value(index_to_index_values, store_data=False) + nsplits = calc_nsplits({c.index: c.shape for c in self.chunks}) + shape = tuple(sum(ns) for ns in nsplits) + + data_params = dict() + data_params["nsplits"] = nsplits + data_params["shape"] = shape + data_params["index_value"] = index_value + + self._data_type = self._chunks[0]._data_type + if self.data_type == "dataframe": + all_dtypes = [c.dtypes_value.value for c in self.chunks if c.index[0] == 0] + dtypes = pd.concat(all_dtypes) + data_params["dtypes"] = dtypes + columns_values = parse_index(dtypes.index, store_data=True) + data_params["columns_value"] = columns_values + data_params["dtypes_value"] = DtypesValue( + key=tokenize(dtypes), value=dtypes + ) + else: + data_params["dtype"] = self.chunks[0].dtype + data_params["name"] = self.chunks[0].name + self._data_params.update(data_params) + + def ensure_data(self): + from .fetch.core import DataFrameFetch + + self.execute() + default_sess = get_default_session() + self._detach_session(default_sess._session) + + fetch_tileable = default_sess._session._tileable_to_fetch[self] + new = DataFrameFetch( + output_types=[getattr(OutputType, self.data_type)] + ).new_tileable( + [], + _key=self.key, + chunks=fetch_tileable.chunks, + nsplits=fetch_tileable.nsplits, + **self.data_params, + ) + new._attach_session(default_sess._session) + return new + + +class DataFrameOrSeries(HasShapeTileable, _ToPandasMixin): + __slots__ = () + _allow_data_type_ = (DataFrameOrSeriesData,) + type_name = "DataFrameOrSeries" + + +INDEX_TYPE = (Index, IndexData) +INDEX_CHUNK_TYPE = (IndexChunk, IndexChunkData) +SERIES_TYPE = (Series, SeriesData) +SERIES_CHUNK_TYPE = (SeriesChunk, SeriesChunkData) +DATAFRAME_OR_SERIES_TYPE = (DataFrameOrSeries, DataFrameOrSeriesData) +DATAFRAME_OR_SERIES_CHUNK_TYPE = (DataFrameOrSeriesChunk, DataFrameOrSeriesChunkData) +DATAFRAME_TYPE = (DataFrame, DataFrameData) +DATAFRAME_CHUNK_TYPE = (DataFrameChunk, DataFrameChunkData) +DATAFRAME_GROUPBY_TYPE = (DataFrameGroupBy, DataFrameGroupByData) +DATAFRAME_GROUPBY_CHUNK_TYPE = (DataFrameGroupByChunk, DataFrameGroupByChunkData) +SERIES_GROUPBY_TYPE = (SeriesGroupBy, SeriesGroupByData) +SERIES_GROUPBY_CHUNK_TYPE = (SeriesGroupByChunk, SeriesGroupByChunkData) +GROUPBY_TYPE = (GroupBy,) + DATAFRAME_GROUPBY_TYPE + SERIES_GROUPBY_TYPE +GROUPBY_CHUNK_TYPE = DATAFRAME_GROUPBY_CHUNK_TYPE + SERIES_GROUPBY_CHUNK_TYPE +CATEGORICAL_TYPE = (Categorical, CategoricalData) +CATEGORICAL_CHUNK_TYPE = (CategoricalChunk, CategoricalChunkData) +TILEABLE_TYPE = ( + INDEX_TYPE + SERIES_TYPE + DATAFRAME_TYPE + GROUPBY_TYPE + CATEGORICAL_TYPE +) +CHUNK_TYPE = ( + INDEX_CHUNK_TYPE + + SERIES_CHUNK_TYPE + + DATAFRAME_CHUNK_TYPE + + GROUPBY_CHUNK_TYPE + + CATEGORICAL_CHUNK_TYPE +) + +register_output_types(OutputType.dataframe, DATAFRAME_TYPE, 
DATAFRAME_CHUNK_TYPE) +register_output_types(OutputType.series, SERIES_TYPE, SERIES_CHUNK_TYPE) +register_output_types( + OutputType.df_or_series, DATAFRAME_OR_SERIES_TYPE, DATAFRAME_OR_SERIES_CHUNK_TYPE +) +register_output_types(OutputType.index, INDEX_TYPE, INDEX_CHUNK_TYPE) +register_output_types(OutputType.categorical, CATEGORICAL_TYPE, CATEGORICAL_CHUNK_TYPE) +register_output_types( + OutputType.dataframe_groupby, DATAFRAME_GROUPBY_TYPE, DATAFRAME_GROUPBY_CHUNK_TYPE +) +register_output_types( + OutputType.series_groupby, SERIES_GROUPBY_TYPE, SERIES_GROUPBY_CHUNK_TYPE +) diff --git a/python/xorbits/_mars/dataframe/datasource/__init__.py b/python/xorbits/_mars/dataframe/datasource/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/datasource/core.py b/python/xorbits/_mars/dataframe/datasource/core.py new file mode 100644 index 000000000..005efc3e7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/core.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import uuid +from typing import List, Optional, Union + +import numpy as np + +from ...config import options +from ...core import recursive_tile +from ...core.context import Context, get_context +from ...oscar import ActorNotExist +from ...serialization.serializables import Int64Field, StringField +from ...typing import OperandType, TileableType +from ...utils import parse_readable_size +from ..core import IndexValue, OutputType +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import merge_index_value + + +class HeadOptimizedDataSource(DataFrameOperand, DataFrameOperandMixin): + __slots__ = () + # Data source op that optimized for head, + # First, it will try to trigger first_chunk.head() and raise TilesError, + # When iterative tiling is triggered, + # check if the first_chunk.head() meets requirements. 
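+    # In other words, this is a data source op optimized for `head`. An outer
+    # optimization rule (not shown here) may push a row limit into `nrows`, as in
+    # something like `md.read_csv("data.csv").head(5)` -- `read_csv` is only an
+    # illustration of such a source, not defined in this module. `_tile_head`
+    # below then yields the first chunk so it is executed eagerly; if that chunk
+    # already holds exactly `nrows` rows it becomes the whole result, otherwise
+    # `nrows` is cleared on the chunks and tiling falls back to `iloc[:nrows]`
+    # through `recursive_tile`.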
+ nrows = Int64Field("nrows", default=None) + + @property + def first_chunk(self): + return getattr(self, "_first_chunk", None) + + @classmethod + def _tile(cls, op): # pragma: no cover + raise NotImplementedError + + @classmethod + def _tile_head(cls, op: "HeadOptimizedDataSource"): + tileds = cls._tile(op) + chunks = tileds[0].chunks + + # execute first chunk + yield chunks[:1] + + chunk_shape = chunks[0].shape + if chunk_shape[0] == op.nrows: + # the first chunk has enough data + tileds[0]._nsplits = tuple((s,) for s in chunk_shape) + chunks[0]._shape = chunk_shape + tileds[0]._chunks = chunks[:1] + tileds[0]._shape = chunk_shape + else: + for chunk in tileds[0].chunks: + chunk.op.nrows = None + # otherwise + tiled = yield from recursive_tile(tileds[0].iloc[: op.nrows]) + tileds = [tiled] + return tileds + + @classmethod + def tile(cls, op: "HeadOptimizedDataSource"): + if op.nrows is not None: + return (yield from cls._tile_head(op)) + else: + return cls._tile(op) + + +class ColumnPruneSupportedDataSourceMixin(DataFrameOperandMixin): + __slots__ = () + + def get_columns(self): # pragma: no cover + raise NotImplementedError + + def set_pruned_columns(self, columns, *, keep_order=None): # pragma: no cover + raise NotImplementedError + + +class _IncrementalIndexRecorder: + _done: List[Optional[asyncio.Event]] + _chunk_sizes: List[Optional[int]] + + def __init__(self, n_chunk: int): + self._n_chunk = n_chunk + self._done = [asyncio.Event() for _ in range(n_chunk)] + self._chunk_sizes = [None] * n_chunk + self._waiters = set() + + def _can_destroy(self): + return all(e.is_set() for e in self._done) and not self._waiters + + def add_waiter(self, i: int): + self._waiters.add(i) + + async def wait(self, i: int): + if i == 0: + return 0, self._can_destroy() + self._waiters.add(i) + try: + await asyncio.gather(*(e.wait() for e in self._done[:i])) + finally: + self._waiters.remove(i) + # all chunk finished and no waiters + return sum(self._chunk_sizes[:i]), self._can_destroy() + + async def finish(self, i: int, size: int): + self._chunk_sizes[i] = size + self._done[i].set() + + +class IncrementalIndexDatasource(HeadOptimizedDataSource): + __slots__ = () + + incremental_index_recorder_name = StringField("incremental_index_recorder_name") + + +class IncrementalIndexDataSourceMixin(DataFrameOperandMixin): + __slots__ = () + + @classmethod + def post_tile(cls, op: OperandType, results: List[TileableType]): + if ( + op.incremental_index + and results is not None + and isinstance(results[0].index_value.value, IndexValue.RangeIndex) + ): + result = results[0] + chunks = [] + for chunk in result.chunks: + if not isinstance(chunk.op, cls): + # some chunks are merged, get the inputs + chunks.extend(chunk.inputs) + else: + chunks.append(chunk) + for chunk in chunks: + chunk.op.priority = -chunk.index[0] + n_chunk = len(chunks) + ctx = get_context() + if ctx: + name = str(uuid.uuid4()) + ctx.create_remote_object(name, _IncrementalIndexRecorder, n_chunk) + for chunk in chunks: + chunk.op.incremental_index_recorder_name = name + + @classmethod + def pre_execute(cls, ctx: Union[dict, Context], op: OperandType): + out = op.outputs[0] + if ( + op.incremental_index + and isinstance(out.index_value.value, IndexValue.RangeIndex) + and getattr(op, "incremental_index_recorder_name", None) + ): + index = out.index[0] + recorder_name = op.incremental_index_recorder_name + recorder = ctx.get_remote_object(recorder_name) + recorder.add_waiter(index) + + @classmethod + def post_execute(cls, ctx: Union[dict, Context], op: 
OperandType): + out = op.outputs[0] + result = ctx[out.key] + if ( + op.incremental_index + and isinstance(out.index_value.value, IndexValue.RangeIndex) + and getattr(op, "incremental_index_recorder_name", None) + ): + recorder_name = op.incremental_index_recorder_name + recorder = ctx.get_remote_object(recorder_name) + index = out.index[0] + recorder.finish(index, len(result)) + # wait for previous chunks to finish, then update index + size, can_destroy = recorder.wait(index) + result.index += size + if can_destroy: + try: + ctx.destroy_remote_object(recorder_name) + except ActorNotExist: + pass + + +def merge_small_files( + df: TileableType, + n_sample_file: int = 10, + merged_file_size: Union[int, float, str] = None, +) -> TileableType: + from ..merge import DataFrameConcat + + if len(df.chunks) < n_sample_file: + # if number of chunks is small(less than `n_sample_file`, + # skip this process + return df + + if merged_file_size is not None: + merged_file_size = parse_readable_size(merged_file_size)[0] + else: + # Estimated size is relatively large than the real one, + # so we double the merged size + merged_file_size = options.chunk_store_limit * 2 + # sample files whose size equals `n_sample_file` + sampled_chunks = np.random.choice(df.chunks, n_sample_file) + max_chunk_size = 0 + ctx = dict() + for sampled_chunk in sampled_chunks: + sampled_chunk.op.estimate_size(ctx, sampled_chunk.op) + size = ctx[sampled_chunk.key][0] + max_chunk_size = max(max_chunk_size, size) + to_merge_size = merged_file_size // max_chunk_size + if to_merge_size < 2: + return df + # merge files + n_new_chunks = np.ceil(len(df.chunks) / to_merge_size) + new_chunks = [] + new_nsplit = [] + for i, chunks in enumerate(np.array_split(df.chunks, n_new_chunks)): + chunk_size = sum(c.shape[0] for c in chunks) + kw = dict( + dtypes=chunks[0].dtypes, + index_value=merge_index_value({c.index: c.index_value for c in chunks}), + columns_value=chunks[0].columns_value, + shape=(chunk_size, chunks[0].shape[1]), + index=(i, 0), + ) + new_chunk = DataFrameConcat(output_types=[OutputType.dataframe]).new_chunk( + chunks.tolist(), **kw + ) + new_chunks.append(new_chunk) + new_nsplit.append(chunk_size) + new_op = df.op.copy() + params = df.params.copy() + params["chunks"] = new_chunks + params["nsplits"] = (tuple(new_nsplit), df.nsplits[1]) + return new_op.new_dataframe(df.op.inputs, kws=[params]) diff --git a/python/xorbits/_mars/dataframe/datasource/dataframe.py b/python/xorbits/_mars/dataframe/datasource/dataframe.py new file mode 100644 index 000000000..6a2e6c524 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/dataframe.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import DataFrameField, SeriesField +from ...tensor.utils import get_chunk_slices +from ...utils import estimate_pandas_size +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import decide_dataframe_chunk_sizes, is_cudf, parse_index + + +class DataFrameDataSource(DataFrameOperand, DataFrameOperandMixin): + """ + Represents data from pandas DataFrame + """ + + _op_type_ = OperandDef.DATAFRAME_DATA_SOURCE + + data = DataFrameField("data") + dtypes = SeriesField("dtypes") + + def __init__(self, data=None, dtypes=None, gpu=None, **kw): + if dtypes is None and data is not None: + dtypes = data.dtypes + if gpu is None and is_cudf(data): # pragma: no cover + gpu = True + super().__init__( + data=data, + dtypes=dtypes, + gpu=gpu, + _output_types=[OutputType.dataframe], + **kw + ) + + def __call__(self, shape, chunk_size=None): + return self.new_dataframe( + None, + shape, + dtypes=self.dtypes, + index_value=parse_index(self.data.index), + columns_value=parse_index(self.data.columns, store_data=True), + raw_chunk_size=chunk_size, + ) + + @classmethod + def tile(cls, op: "DataFrameDataSource"): + df = op.outputs[0] + raw_df = op.data + + # estimate column memory usage instead of calling df.memory_usage(deep=True) + memory_usage = pd.Series( + {c: estimate_pandas_size(s) for c, s in raw_df.items()} + ) + chunk_size = df.extra_params.raw_chunk_size or options.chunk_size + chunk_size = decide_dataframe_chunk_sizes(df.shape, chunk_size, memory_usage) + chunk_size_idxes = (range(len(size)) for size in chunk_size) + + out_chunks = [] + index_values = dict() + column_values = dict() + for chunk_shape, chunk_idx in zip( + itertools.product(*chunk_size), itertools.product(*chunk_size_idxes) + ): + chunk_op = op.copy().reset_key() + slc = get_chunk_slices(chunk_size, chunk_idx) + i_slc, j_slc = slc + if j_slc == slice(0, df.shape[1]): + # optimize full slice, it's way more faster + j_slc = slice(None) + chunk_op.data = raw_df.iloc[i_slc, j_slc] + chunk_op.dtypes = chunk_op.data.dtypes + i, j = chunk_idx + if i in index_values: + index_value = index_values[i] + else: + index_value = index_values[i] = parse_index(chunk_op.data.index) + if j in column_values: + column_value = column_values[j] + else: + column_value = column_values[j] = parse_index( + chunk_op.data.columns, store_data=True + ) + out_chunk = chunk_op.new_chunk( + None, + shape=chunk_shape, + index=chunk_idx, + index_value=index_value, + columns_value=column_value, + dtypes=chunk_op.data.dtypes, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + None, + df.shape, + dtypes=op.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + chunks=out_chunks, + nsplits=chunk_size, + **df.extra_params + ) + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = op.data + + +def from_pandas(data, chunk_size=None, gpu=None, sparse=False): + op = DataFrameDataSource(data=data, gpu=gpu, sparse=sparse) + return op(data.shape, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/dataframe/datasource/date_range.py b/python/xorbits/_mars/dataframe/datasource/date_range.py new file mode 100644 index 000000000..7e952ace8 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/date_range.py @@ -0,0 +1,601 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from datetime import date, datetime, time + +import numpy as np +import pandas as pd +from pandas import NaT, Timestamp +from pandas._libs.tslibs import timezones +from pandas.tseries.frequencies import to_offset +from pandas.tseries.offsets import Tick + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import AnyField, BoolField, Int64Field, StringField +from ...tensor.utils import decide_chunk_sizes +from ...utils import no_default, pd_release_version +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +try: + from pandas._libs.tslib import normalize_date +except ImportError: # pragma: no cover + + def normalize_date(dt): # from pandas/_libs/tslibs/conversion.pyx + if isinstance(dt, datetime): + if isinstance(dt, pd.Timestamp): + return dt.replace( + hour=0, minute=0, second=0, microsecond=0, nanosecond=0 + ) + else: + return dt.replace(hour=0, minute=0, second=0, microsecond=0) + elif isinstance(dt, date): + return datetime(dt.year, dt.month, dt.day) + else: + raise TypeError(f"Unrecognized type: {type(dt)}") + + +_date_range_use_inclusive = pd_release_version[:2] >= (1, 4) + + +# adapted from pandas.core.arrays.datetimes.generate_range +def generate_range_count( + start=None, end=None, periods=None, offset=None +): # pragma: no cover + offset = to_offset(offset) + + start = Timestamp(start) + start = start if start is not NaT else None + end = Timestamp(end) + end = end if end is not NaT else None + + if start and not offset.is_on_offset(start): + start = offset.rollforward(start) + + elif end and not offset.is_on_offset(end): + end = offset.rollback(end) + + if periods is None and end < start and offset.n >= 0: + end = None + periods = 0 + + if end is None: + end = start + (periods - 1) * offset + + if start is None: + start = end - (periods - 1) * offset + + cur = start + count = 0 + if offset.n >= 0: + while cur <= end: + count += 1 + + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + + # faster than cur + offset + try: + next_date = offset._apply(cur) + except AttributeError: + next_date = cur + offset + if next_date <= cur: + raise ValueError(f"Offset {offset} did not increment date") + cur = next_date + else: + while cur >= end: + count += 1 + + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + + # faster than cur + offset + try: + next_date = offset._apply(cur) + except AttributeError: + next_date = cur + offset + if next_date >= cur: + raise ValueError(f"Offset {offset} did not decrement date") + cur = next_date + return count + + +class DataFrameDateRange(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATE_RANGE + + start = AnyField("start") + end = AnyField("end") + periods = Int64Field("periods") + freq = 
AnyField("freq") + tz = AnyField("tz") + normalize = BoolField("normalize") + name = StringField("name") + inclusive = StringField("inclusive") + + def __init__( + self, + output_types=None, + **kw, + ): + super().__init__(_output_types=output_types, **kw) + if self.output_types is None: + self.output_types = [OutputType.index] + if getattr(self, "inclusive", None) is None: + self.inclusive = "both" + + def __call__(self, shape, chunk_size=None): + dtype = pd.Index([self.start]).dtype + index_value = parse_index( + pd.Index([], dtype=dtype), self.start, self.end, self.periods, self.tz + ) + # gen index value info + index_value.value._min_val = self.start + index_value.value._min_val_close = True + index_value.value._max_val = self.end + index_value.value._max_val_close = True + index_value.value._is_unique = True + index_value.value._is_monotonic_increasing = True + index_value.value._freq = self.freq + return self.new_index( + None, + shape=shape, + dtype=dtype, + index_value=index_value, + name=self.name, + raw_chunk_size=chunk_size, + freq=self.freq, + ) + + @classmethod + def tile(cls, op: "DataFrameDateRange"): + out = op.outputs[0] + start = op.start + end = op.end + freq = op.freq + periods = op.periods + inclusive = op.inclusive + + chunk_length = out.extra_params.raw_chunk_size or options.chunk_size + chunk_length = decide_chunk_sizes(out.shape, chunk_length, out.dtype.itemsize)[ + 0 + ] + + if inclusive in ("neither", "right"): + # if left not close, add one more for the first chunk + chunk_length = (chunk_length[0] + 1,) + chunk_length[1:] + + if freq is None: + if periods > 1: + freq = (end - op.start) / (periods - 1) + else: + freq = end - start + + out_chunks = [] + cum_nsplit = [0] + np.cumsum(chunk_length).tolist() + for i, chunk_size in enumerate(chunk_length): + chunk_op = op.copy().reset_key() + chunk_op.periods = chunk_size + + if i > 0 or inclusive not in ("neither", "right"): + # for chunks in the middle, all sides are inclusive + chunk_op.inclusive = "both" + elif 0 == i and inclusive == "neither": + chunk_op.inclusive = "right" + + chunk_i_start = cum_nsplit[i] + if chunk_i_start > 0: + chunk_start = chunk_op.start = start + freq * chunk_i_start + else: + chunk_start = chunk_op.start = start + chunk_end = chunk_op.end = chunk_start + (chunk_size - 1) * freq + + # gen chunk index_value + chunk_index_value = parse_index(out.index_value.to_pandas(), i, out) + chunk_index_value.value._min_val = chunk_start + chunk_index_value.value._min_val_close = True + chunk_index_value.value._max_val = chunk_end + chunk_index_value.value._max_val_close = True + chunk_index_value.value._is_unique = True + chunk_index_value.value._is_monotonic_increasing = True + + size = ( + chunk_size - 1 + if i == 0 and inclusive in ("neither", "right") + else chunk_size + ) + out_chunk = chunk_op.new_chunk( + None, + shape=(size,), + index=(i,), + dtype=out.dtype, + index_value=chunk_index_value, + name=out.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = (tuple(c.shape[0] for c in out_chunks),) + return new_op.new_indexes(None, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameDateRange"): + start, end, periods = op.start, op.end, op.periods + freq = op.freq + if freq is not None: + end = None + kw = dict( + start=start, + end=end, + periods=periods, + freq=freq, + tz=op.tz, + normalize=op.normalize, + name=op.name, + inclusive=op.inclusive, + ) + if not _date_range_use_inclusive: + 
closed = kw.pop("inclusive") + assert closed != "neither" + kw["closed"] = None if closed == "both" else closed + ctx[op.outputs[0].key] = pd.date_range(**kw) + + +_midnight = time(0, 0) + + +def _maybe_normalize_endpoints(start, end, normalize): # pragma: no cover + _normalized = True + + if start is not None: + if normalize: + start = normalize_date(start) + _normalized = True + else: + _normalized = _normalized and start.time() == _midnight + + if end is not None: + if normalize: + end = normalize_date(end) + _normalized = True + else: + _normalized = _normalized and end.time() == _midnight + + return start, end, _normalized + + +def _infer_tz_from_endpoints(start, end, tz): # pragma: no cover + """ + If a timezone is not explicitly given via `tz`, see if one can + be inferred from the `start` and `end` endpoints. If more than one + of these inputs provides a timezone, require that they all agree. + + Parameters + ---------- + start : Timestamp + end : Timestamp + tz : tzinfo or None + + Returns + ------- + tz : tzinfo or None + + Raises + ------ + TypeError : if start and end timezones do not agree + """ + try: + inferred_tz = timezones.infer_tzinfo(start, end) + except AssertionError: + # infer_tzinfo raises AssertionError if passed mismatched timezones + raise TypeError( + "Start and end cannot both be tz-aware with different timezones" + ) + + inferred_tz = timezones.maybe_get_tz(inferred_tz) + tz = timezones.maybe_get_tz(tz) + + if tz is not None and inferred_tz is not None: + if not timezones.tz_compare(inferred_tz, tz): + raise AssertionError("Inferred time zone not equal to passed time zone") + + elif inferred_tz is not None: + tz = inferred_tz + + return tz + + +def _maybe_localize_point( + ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent +): # pragma: no cover + """ + Localize a start or end Timestamp to the timezone of the corresponding + start or end Timestamp + + Parameters + ---------- + ts : start or end Timestamp to potentially localize + is_none : argument that should be None + is_not_none : argument that should not be None + freq : Tick, DateOffset, or None + tz : str, timezone object or None + ambiguous: str, localization behavior for ambiguous times + nonexistent: str, localization behavior for nonexistent times + + Returns + ------- + ts : Timestamp + """ + # Make sure start and end are timezone localized if: + # 1) freq = a Timedelta-like frequency (Tick) + # 2) freq = None i.e. generating a linspaced range + if is_none is None and is_not_none is not None: + # Note: We can't ambiguous='infer' a singular ambiguous time; however, + # we have historically defaulted ambiguous=False + ambiguous = ambiguous if ambiguous != "infer" else False + localize_args = {"ambiguous": ambiguous, "nonexistent": nonexistent, "tz": None} + if isinstance(freq, Tick) or freq is None: + localize_args["tz"] = tz + ts = ts.tz_localize(**localize_args) + return ts + + +def date_range( + start=None, + end=None, + periods=None, + freq=None, + tz=None, + normalize=False, + name=None, + closed=no_default, + inclusive=None, + chunk_size=None, + **kwargs, +): + """ + Return a fixed frequency DatetimeIndex. + + Parameters + ---------- + start : str or datetime-like, optional + Left bound for generating dates. + end : str or datetime-like, optional + Right bound for generating dates. + periods : int, optional + Number of periods to generate. + freq : str or DateOffset, default 'D' + Frequency strings can have multiples, e.g. '5H'. See + :ref:`here ` for a list of + frequency aliases. 
+ tz : str or tzinfo, optional + Time zone name for returning localized DatetimeIndex, for example + 'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is + timezone-naive. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + name : str, default None + Name of the resulting DatetimeIndex. + inclusive : {“both”, “neither”, “left”, “right”}, default “both” + Include boundaries; Whether to set each bound as closed or open. + **kwargs + For compatibility. Has no effect on the result. + + Returns + ------- + rng : DatetimeIndex + + See Also + -------- + DatetimeIndex : An immutable container for datetimes. + timedelta_range : Return a fixed frequency TimedeltaIndex. + period_range : Return a fixed frequency PeriodIndex. + interval_range : Return a fixed frequency IntervalIndex. + + Notes + ----- + Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, + exactly three must be specified. If ``freq`` is omitted, the resulting + ``DatetimeIndex`` will have ``periods`` linearly spaced elements between + ``start`` and ``end`` (closed on both sides). + + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + **Specifying the values** + + The next four examples generate the same `DatetimeIndex`, but vary + the combination of `start`, `end` and `periods`. + + Specify `start` and `end`, with the default daily frequency. + >>> import mars.dataframe as md + + >>> md.date_range(start='1/1/2018', end='1/08/2018').execute() + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq='D') + + Specify `start` and `periods`, the number of periods (days). + + >>> md.date_range(start='1/1/2018', periods=8).execute() + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq='D') + + Specify `end` and `periods`, the number of periods (days). + + >>> md.date_range(end='1/1/2018', periods=8).execute() + DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28', + '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq='D') + + Specify `start`, `end`, and `periods`; the frequency is generated + automatically (linearly spaced). + + >>> md.date_range(start='2018-04-24', end='2018-04-27', periods=3).execute() + DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00', + '2018-04-27 00:00:00'], + dtype='datetime64[ns]', freq=None) + + **Other Parameters** + + Changed the `freq` (frequency) to ``'M'`` (month end frequency). + + >>> md.date_range(start='1/1/2018', periods=5, freq='M').execute() + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', + '2018-05-31'], + dtype='datetime64[ns]', freq='M') + + Multiples are allowed + + >>> md.date_range(start='1/1/2018', periods=5, freq='3M').execute() + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq='3M') + + `freq` can also be specified as an Offset object. + + >>> md.date_range(start='1/1/2018', periods=5, freq=md.offsets.MonthEnd(3)).execute() + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq='3M') + + Specify `tz` to set the timezone. 
+ + >>> md.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo').execute() + DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00', + '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00', + '2018-01-05 00:00:00+09:00'], + dtype='datetime64[ns, Asia/Tokyo]', freq='D') + + `inclusive` controls whether to include `start` and `end` that are on the + boundary. The default, "both", includes boundary points on either end. + + >>> md.date_range(start='2017-01-01', end='2017-01-04', inclusive='both').execute() + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], + dtype='datetime64[ns]', freq='D') + + Use ``inclusive='left'`` to exclude `end` if it falls on the boundary. + + >>> md.date_range(start='2017-01-01', end='2017-01-04', closed='left').execute() + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], + dtype='datetime64[ns]', freq='D') + + Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, + and similarly inclusive='neither' will exclude both `start` and `end`. + + >>> md.date_range(start='2017-01-01', end='2017-01-04', closed='right').execute() + DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], + dtype='datetime64[ns]', freq='D') + + .. note:: + Pandas 1.4.0 or later is required to use ``inclusive='neither'``. + Otherwise an error may be raised. + """ + # validate periods + if isinstance(periods, (float, np.floating)): + periods = int(periods) + if periods is not None and not isinstance(periods, (int, np.integer)): + raise TypeError(f"periods must be a number, got {periods}") + + if freq is None and any(arg is None for arg in [periods, start, end]): + freq = "D" + if sum(arg is not None for arg in [start, end, periods, freq]) != 3: + raise ValueError( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) + freq = to_offset(freq) + + if _date_range_use_inclusive and closed is not no_default: + warnings.warn( + "Argument `closed` is deprecated in favor of `inclusive`.", FutureWarning + ) + elif closed is no_default: + closed = None + + if inclusive is None and closed is not no_default: + inclusive = closed + + if start is not None: + start = pd.Timestamp(start) + + if end is not None: + end = pd.Timestamp(end) + + if start is pd.NaT or end is pd.NaT: + raise ValueError("Neither `start` nor `end` can be NaT") + + start, end, _ = _maybe_normalize_endpoints(start, end, normalize) + tz = _infer_tz_from_endpoints(start, end, tz) + + if start is None and end is not None: + # start is None and end is not None + # adjust end first + end = pd.date_range(end=end, periods=1, freq=freq)[0] + if inclusive == "neither": + end -= freq + size = periods + start = end - (periods - 1) * freq + if inclusive in ("neither", "left"): + size -= 1 + elif inclusive == "right": + # when start is None, closed == 'left' would not take effect + # thus just ignore + inclusive = "both" + elif end is None: + # end is None + # adjust start first + start = pd.date_range(start=start, periods=1, freq=freq)[0] + size = periods + end = start + (periods - 1) * freq + if inclusive in ("neither", "right"): + size -= 1 + elif inclusive == "left": + # when end is None, closed == 'left' would not take effect + # thus just ignore + inclusive = "both" + else: + if periods is None: + periods = size = generate_range_count(start, end, periods, freq) + else: + size = periods + if inclusive in ("left", "right"): + size -= 1 + elif inclusive == "neither": + size -= 2 + + shape = (size,) + op = DataFrameDateRange( + 
start=start, + end=end, + periods=periods, + freq=freq, + tz=tz, + normalize=normalize, + name=name, + inclusive=inclusive, + **kwargs, + ) + return op(shape, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/dataframe/datasource/from_index.py b/python/xorbits/_mars/dataframe/datasource/from_index.py new file mode 100644 index 000000000..8c91b3c89 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/from_index.py @@ -0,0 +1,99 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, KeyField +from ..initializer import Index +from ..operands import DataFrameOperand, DataFrameOperandMixin + + +class SeriesFromIndex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.SERIES_FROM_INDEX + + input_ = KeyField("input_") + index = KeyField("index") + name = AnyField("name", default=None) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.input_ = self._inputs[0] + if len(self._inputs) > 1: + self.index = self._inputs[1] + + def __call__(self, index, new_index=None, name=None): + inputs = [index] + index_value = index.index_value + if new_index is not None: + inputs.append(new_index) + index_value = new_index.index_value + return self.new_series( + inputs, + shape=index.shape, + dtype=index.dtype, + index_value=index_value, + name=name, + ) + + @classmethod + def tile(cls, op: "SeriesFromIndex"): + inp = op.input_ + out = op.outputs[0] + index = op.index + + if index is not None: + index = yield from recursive_tile(op.index.rechunk({0: inp.nsplits[0]})) + + chunks = [] + for i, c in enumerate(inp.chunks): + chunk_op = op.copy().reset_key() + chunk_inputs = [c] + chunk_index_value = c.index_value + if index is not None: + index_chunk = index.chunks[i] + chunk_index_value = index_chunk.index_value + chunk_inputs.append(index_chunk) + chunk = chunk_op.new_chunk( + chunk_inputs, + shape=c.shape, + dtype=c.dtype, + index_value=chunk_index_value, + name=out.name, + index=c.index, + ) + chunks.append(chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = chunks + params["nsplits"] = inp.nsplits + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def execute(cls, ctx, op): + out = op.outputs[0] + inp = ctx[op.input_.key] + index = None + if op.index is not None: + index = ctx[op.index.key] + name = op.name or out.name + ctx[out.key] = inp.to_series(index=index, name=name) + + +def series_from_index(ind, index=None, name=None): + name = name or ind.name or 0 + if index is not None: + index = Index(index) + op = SeriesFromIndex(input_=ind, index=index, name=name) + return op(ind, new_index=index, name=name) diff --git a/python/xorbits/_mars/dataframe/datasource/from_records.py b/python/xorbits/_mars/dataframe/datasource/from_records.py new file mode 100644 index 000000000..990ef5165 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/from_records.py @@ -0,0 +1,164 @@ +# 
Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pandas as pd
+
+from ... import opcodes as OperandDef
+from ...core import OutputType
+from ...serialization.serializables import BoolField, Int32Field, ListField
+from ...tensor.core import TENSOR_TYPE
+from ..operands import DataFrameOperand, DataFrameOperandMixin
+from ..utils import parse_index
+
+
+class DataFrameFromRecords(DataFrameOperand, DataFrameOperandMixin):
+    _op_type_ = OperandDef.DATAFRAME_FROM_RECORDS
+
+    columns = ListField("columns", default=None)
+    exclude = ListField("exclude", default=None)
+    coerce_float = BoolField("coerce_float", default=False)
+    nrows = Int32Field("nrows", default=None)
+
+    def __init__(self, index=None, columns=None, **kw):
+        if index is not None or columns is not None:
+            raise NotImplementedError("Specifying index value is not supported for now")
+        super().__init__(columns=columns, _output_types=[OutputType.dataframe], **kw)
+
+    def __call__(self, data):
+        if self.nrows is None:
+            nrows = data.shape[0]
+        else:
+            nrows = self.nrows
+        index_value = parse_index(pd.RangeIndex(start=0, stop=nrows))
+        dtypes = pd.Series(dict((k, np.dtype(v)) for k, v in data.dtype.descr))
+        columns_value = parse_index(pd.Index(data.dtype.names), store_data=True)
+        return self.new_dataframe(
+            [data],
+            (data.shape[0], len(data.dtype.names)),
+            dtypes=dtypes,
+            index_value=index_value,
+            columns_value=columns_value,
+        )
+
+    @classmethod
+    def tile(cls, op):
+        df = op.outputs[0]
+        tensor = op.inputs[0]
+
+        nsplit_acc = np.cumsum(tensor.nsplits[0])
+        out_chunks = []
+        for chunk in tensor.chunks:
+            begin_index = nsplit_acc[chunk.index[0]] - chunk.shape[0]
+            end_index = nsplit_acc[chunk.index[0]]
+            chunk_index_value = parse_index(
+                pd.RangeIndex(start=begin_index, stop=end_index)
+            )
+
+            # Here the `new_chunk` is tricky:
+            #
+            # We can construct tensors that have identical chunks, for example from
+            # `mt.ones(...)`: after tiling, chunks of the same shape (but at different
+            # positions) in `mt.ones` are indeed the same chunk (they share one key)!
+            #
+            # Thus, when we construct a dataframe from such a tensor, we get dataframe
+            # chunks that only differ in `index_value`. However, the `index_value` field
+            # is not used to calculate the chunk key of the dataframe chunk, so
+            # `new_chunk` would generate the same key for chunks that are actually
+            # different (they have different `index_value`s).
+            #
+            # Here, we construct each new chunk with unique `_extra_params` to make
+            # `new_chunk` work as expected.
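+            # A hypothetical illustration of the collision: the two chunks of
+            # `mt.ones((4,), chunk_size=2)` are tiled into tensor chunks that share
+            # one key, so without the distinct `begin_index`/`end_index` recorded in
+            # `extra_params` below, the dataframe chunks built from them would
+            # collide as well.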
+ chunk_op = op.copy().reset_key() + chunk_op.extra_params["begin_index"] = begin_index + chunk_op.extra_params["end_index"] = end_index + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(chunk.shape[0], df.shape[1]), + index=(chunk.index[0], 0), + dtypes=df.dtypes, + index_value=chunk_index_value, + columns_value=df.columns_value, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + [tensor], + df.shape, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + chunks=out_chunks, + nsplits=[tensor.nsplits[0], [df.shape[1]]], + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + ctx[chunk.key] = pd.DataFrame.from_records( + ctx[op.inputs[0].key], + index=chunk.index_value.to_pandas(), + columns=chunk.columns_value.to_pandas(), + exclude=op.exclude, + coerce_float=op.coerce_float, + nrows=op.nrows, + ) + + +def from_records( + data, + index=None, + exclude=None, + columns=None, + coerce_float=False, + nrows=None, + gpu=None, + sparse=False, + **kw +): + if isinstance(data, np.ndarray): + from .dataframe import from_pandas + + return from_pandas( + pd.DataFrame.from_records( + data, + index=index, + exclude=exclude, + columns=columns, + coerce_float=coerce_float, + nrows=nrows, + ), + **kw + ) + elif isinstance(data, TENSOR_TYPE): + if data.dtype.names is None: + raise TypeError("Not a tensor with structured dtype {0}", data.dtype) + if data.ndim != 1: + raise ValueError( + "Not a tensor with non 1-D structured dtype {0}", data.shape + ) + + op = DataFrameFromRecords( + index=None, + exclude=exclude, + columns=columns, + coerce_float=coerce_float, + nrows=nrows, + gpu=gpu, + sparse=sparse, + **kw + ) + return op(data) + else: + raise TypeError("Not support create DataFrame from {0}", type(data)) diff --git a/python/xorbits/_mars/dataframe/datasource/from_tensor.py b/python/xorbits/_mars/dataframe/datasource/from_tensor.py new file mode 100644 index 000000000..49b5a58dd --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/from_tensor.py @@ -0,0 +1,754 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict +from typing import Any, Dict, List, Union + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...core.context import Context +from ...serialization.serializables import AnyField, KeyField +from ...tensor.core import Tensor +from ...tensor.datasource import tensor as astensor +from ...tensor.utils import unify_chunks +from ...typing import EntityType, TileableType +from ...utils import has_unknown_shape +from ..core import INDEX_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameFromTensor(DataFrameOperand, DataFrameOperandMixin): + """ + Represents data from mars tensor + """ + + _op_type_ = OperandDef.DATAFRAME_FROM_TENSOR + + input = AnyField("input") + index = AnyField("index") + columns = AnyField("columns") + + def __init__(self, *args, **kwargs): + kwargs["_output_types"] = [OutputType.dataframe] + super().__init__(*args, **kwargs) + + def _set_inputs(self, inputs: List[EntityType]): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + if self.input is not None: + if not isinstance(self.input, dict): + self.input = next(inputs_iter) + else: + # check each value for input + new_input = OrderedDict() + for k, v in self.input.items(): + if isinstance(v, ENTITY_TYPE): + new_input[k] = next(inputs_iter) + else: + new_input[k] = v + self.input = new_input + + if isinstance(self.index, ENTITY_TYPE): + self.index = next(inputs_iter) + + def __call__( + self, + input_tensor: Tensor, + index: Union[TileableType, pd.Index], + columns: pd.Index, + dtypes: pd.Series, + ): + if isinstance(input_tensor, dict): + return self._call_input_1d_tileables(input_tensor, index, columns, dtypes) + elif input_tensor is not None: + return self._call_input_tensor(input_tensor, index, columns, dtypes) + else: + return self._call_tensor_none(index, columns, dtypes) + + def _process_index( + self, index: Union[TileableType, pd.Index], inputs: List[EntityType] + ): + if not isinstance(index, pd.Index): + if isinstance(index, INDEX_TYPE): + index_value = index.index_value + inputs.append(index) + elif isinstance(index, ENTITY_TYPE): + index = astensor(index) + if index.ndim != 1: + raise ValueError(f"index should be 1-d, got {index.ndim}-d") + index_value = parse_index( + pd.Index([], dtype=index.dtype), index, type(self).__name__ + ) + inputs.append(index) + else: + index = pd.Index(index) + index_value = parse_index(index) + else: + index_value = parse_index(index) + return index_value + + def _call_input_1d_tileables( + self, + input_1d_tileables: Dict[Any, TileableType], + index: Union[TileableType, pd.Index], + columns: pd.Index, + dtypes: pd.Series, + ): + tileables = [] + shape = None + for tileable in input_1d_tileables.values(): + tileable_shape = astensor(tileable).shape + if len(tileable_shape) > 0: + if shape is None: + shape = tileable_shape + elif shape != tileable_shape: + raise ValueError("input 1-d tensors should have same shape") + + if isinstance(tileable, ENTITY_TYPE): + tileables.append(tileable) + + if index is not None: + tileable_size = tileables[0].shape[0] + if hasattr(index, "shape"): + index_size = index.shape[0] + else: + index_size = len(index) + if ( + not pd.isna(tileable_size) + and not pd.isna(index_size) + and tileable_size != index_size + ): + raise ValueError( + f"index {index} should have the same shape " + f"with tensor: {tileable_size}" + ) + index_value = self._process_index(index, tileables) + else: + self.index = index = pd.RangeIndex(0, tileables[0].shape[0]) + index_value = 
parse_index(index) + + if columns is not None: + if len(input_1d_tileables) != len(columns): + raise ValueError( + f"columns {columns} should have size {len(input_1d_tileables)}" + ) + if not isinstance(columns, pd.Index): + if isinstance(columns, ENTITY_TYPE): + raise NotImplementedError("The columns value cannot be a tileable") + columns = pd.Index(columns) + columns_value = parse_index(columns, store_data=True) + else: + columns_value = parse_index( + pd.RangeIndex(0, len(input_1d_tileables)), store_data=True + ) + + shape = (shape[0], len(input_1d_tileables)) + return self.new_dataframe( + tileables, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + def _call_input_tensor( + self, + input_tensor: Tensor, + index: Union[TileableType, pd.Index], + columns: pd.Index, + dtypes: pd.Series, + ): + if input_tensor.ndim not in {1, 2}: + raise ValueError("Must pass 1-d or 2-d input") + inputs = [input_tensor] + + if index is not None: + if input_tensor.shape[0] != len(index): + raise ValueError( + f"index {index} should have the same shape with tensor: {input_tensor.shape[0]}" + ) + index_value = self._process_index(index, inputs) + elif isinstance(input_tensor, SERIES_TYPE): + index_value = input_tensor.index_value + else: + stop = input_tensor.shape[0] + stop = -1 if np.isnan(stop) else stop + index = self.index = pd.RangeIndex(start=0, stop=stop) + index_value = parse_index(index) + + if columns is not None: + if not ( + input_tensor.ndim == 1 + and len(columns) == 1 + or input_tensor.shape[1] == len(columns) + ): + raise ValueError( + f"columns {columns} should have the same shape with tensor: {input_tensor.shape[1]}" + ) + if not isinstance(columns, pd.Index): + if isinstance(columns, ENTITY_TYPE): + raise NotImplementedError("The columns value cannot be a tileable") + columns = pd.Index(columns) + columns_value = parse_index(columns, store_data=True) + else: + if input_tensor.ndim == 1: + # convert to 1-d DataFrame + columns_value = parse_index( + pd.RangeIndex(start=0, stop=1), store_data=True + ) + else: + columns_value = parse_index( + pd.RangeIndex(start=0, stop=input_tensor.shape[1]), store_data=True + ) + + if input_tensor.ndim == 1: + shape = (input_tensor.shape[0], 1) + else: + shape = input_tensor.shape + + return self.new_dataframe( + inputs, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + def _call_tensor_none( + self, index: Union[TileableType, pd.Index], columns: pd.Index, dtypes: pd.Series + ): + inputs = [] + shape = [] + if index is not None: + index_value = self._process_index(index, inputs) + shape.append(index.shape[0]) + else: + index = self.index = pd.Index([], dtype=object) + index_value = parse_index(index) + shape.append(0) + + if columns is not None: + if not isinstance(columns, pd.Index): + if isinstance(columns, ENTITY_TYPE): + raise NotImplementedError("The columns value cannot be a tileable") + columns = pd.Index(columns) + columns_value = parse_index(columns, store_data=True) + shape.append(columns.shape[0]) + else: + columns_value = parse_index(pd.Index([], dtype=object), store_data=True) + shape.append(0) + + return self.new_dataframe( + inputs, + shape=tuple(shape), + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op: "DataFrameFromTensor"): + if isinstance(op.input, dict): + return (yield from cls._tile_input_1d_tileables(op)) + elif op.input is not None: + return (yield from cls._tile_input_tensor(op)) + 
else: + return cls._tile_tensor_none(op) + + @classmethod + def _tile_input_1d_tileables(cls, op: "DataFrameFromTensor"): + # make sure all tensor have known chunk shapes + if has_unknown_shape(*op.inputs): + yield + + out_df = op.outputs[0] + in_tensors = op.inputs + in_tensors = yield from unify_chunks(*in_tensors) + nsplit = in_tensors[0].nsplits[0] + + cum_sizes = [0] + np.cumsum(nsplit).tolist() + out_chunks = [] + for i in range(in_tensors[0].chunk_shape[0]): + chunk_op = op.copy().reset_key() + new_input = OrderedDict() + for k, v in op.input.items(): + if not isinstance(v, ENTITY_TYPE): + try: + new_input[k] = v[cum_sizes[i] : cum_sizes[i + 1]] + except TypeError: + # scalar + new_input[k] = v + else: + # do not need to do slice, + # will be done in set_inputs + new_input[k] = v + chunk_op.input = new_input + columns_value = out_df.columns_value + dtypes = out_df.dtypes + chunk_index = (i, 0) + if isinstance(op.index, INDEX_TYPE): + index_value = in_tensors[-1].chunks[i].index_value + elif isinstance(op.index, pd.Index): + chunk_op.index = pd_index = op.index[cum_sizes[i] : cum_sizes[i + 1]] + index_value = parse_index(pd_index, store_data=True) + else: + assert op.index is not None + index_chunk = in_tensors[-1].cix[i,] + index_value = parse_index( + pd.Index([], dtype=index_chunk.dtype), + index_chunk, + type(chunk_op).__name__, + ) + shape = (nsplit[i], len(out_df.dtypes)) + out_chunk = chunk_op.new_chunk( + [t.cix[(i,)] for t in in_tensors], + shape=shape, + index=chunk_index, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + out_chunks.append(out_chunk) + + nsplits = (nsplit, (len(out_df.dtypes),)) + new_op = op.copy() + return new_op.new_dataframes( + out_df.inputs, + out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def _tile_input_tensor(cls, op: "DataFrameFromTensor"): + out_df = op.outputs[0] + in_tensor = op.input + out_chunks = [] + if out_df.index_value.has_value() and has_unknown_shape(in_tensor): + yield + + nsplits = in_tensor.nsplits + + if op.index is not None and hasattr(op.index, "key"): + # rechunk index if it's a tensor + if has_unknown_shape(*op.inputs): + yield + index_tensor = yield from recursive_tile(op.index.rechunk([nsplits[0]])) + else: + index_tensor = None + + # nsplits + if in_tensor.ndim == 1: + out_nsplits = in_tensor.nsplits + ((1,),) + else: + out_nsplits = in_tensor.nsplits + + cum_nsplits = [[0] + np.cumsum(ns).tolist() for ns in out_nsplits] + for in_chunk in in_tensor.chunks: + out_op = op.copy().reset_key() + chunk_inputs = [in_chunk] + if in_chunk.ndim == 1: + i = in_chunk.index[0] + chunk_index = (i, 0) + chunk_shape = (in_chunk.shape[0], 1) + else: + i, j = in_chunk.index + chunk_index = in_chunk.index + chunk_shape = in_chunk.shape + + if op.columns is not None: + column_nsplit = cum_nsplits[1] + j = chunk_index[1] + out_op.columns = op.columns[column_nsplit[j] : column_nsplit[j + 1]] + + if isinstance(op.index, INDEX_TYPE): + index_chunk = index_tensor.chunks[i] + chunk_inputs.append(index_chunk) + elif isinstance(op.index, pd.Index): + index_nsplit = cum_nsplits[0] + if op.index.size > 0: + out_op.index = op.index[index_nsplit[i] : index_nsplit[i + 1]] + elif index_tensor is not None: + index_chunk = index_tensor.cix[i] + chunk_inputs.append(index_chunk) + + out_chunk = out_op.new_chunk( + chunk_inputs, shape=chunk_shape, index=chunk_index + ) + out_chunk._set_tileable_meta( + 
tileable_key=out_df.key, + nsplits=out_nsplits, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_df.params.copy() + params["chunks"] = out_chunks + params["nsplits"] = out_nsplits + return new_op.new_dataframes(out_df.inputs, kws=[params]) + + @classmethod + def _tile_tensor_none(cls, op: "DataFrameFromTensor"): + out_df = op.outputs[0] + + out_chunks = [] + assert isinstance(op.index, INDEX_TYPE) + # tile as index + for index_chunk in op.index.chunks: + index_value = index_chunk.index_value + + chunk_shape = (index_chunk.shape[0], out_df.shape[1]) + chunk_index = (index_chunk.index[0], 0) + + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [index_chunk], + shape=chunk_shape, + index=chunk_index, + index_value=index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_df.params.copy() + params["nsplits"] = (op.index.nsplits[0], (out_df.shape[1],)) + params["chunks"] = out_chunks + return new_op.new_dataframes(out_df.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "DataFrameFromTensor"): + chunk = op.outputs[0] + + if isinstance(op.input, dict): + d = OrderedDict() + for k, v in op.input.items(): + if hasattr(v, "key"): + d[k] = ctx[v.key] + else: + d[k] = v + if op.index is not None and hasattr(op.index, "key"): + index_data = ctx[op.index.key] + else: + index_data = op.index + ctx[chunk.key] = pd.DataFrame(d, index=index_data, columns=op.columns) + elif op.input is not None: + tensor_data = ctx[op.inputs[0].key] + if isinstance(tensor_data, pd.Series): + ctx[chunk.key] = tensor_data.to_frame(name=chunk.dtypes.index[0]) + else: + if op.index is not None and hasattr(op.index, "key"): + # index is a tensor + index_data = ctx[op.inputs[1].key] + else: + index_data = op.index + if isinstance(index_data, pd.RangeIndex) and len(index_data) == 0: + index_data = None + ctx[chunk.key] = pd.DataFrame( + tensor_data, + index=index_data, + columns=op.columns, + ) + else: + index_data = ctx[op.index.key] + ctx[chunk.key] = pd.DataFrame(index=index_data, columns=op.columns) + + +def dataframe_from_tensor( + tensor: Tensor, + index: Union[TileableType, pd.Index] = None, + columns: Union[pd.Index, list] = None, + gpu: bool = None, + sparse: bool = False, +): + if tensor is not None: + if tensor.ndim > 2 or tensor.ndim <= 0: + raise TypeError( + f"Not support create DataFrame from {tensor.ndim} dims tensor" + ) + try: + col_num = tensor.shape[1] + except IndexError: + col_num = 1 + gpu = tensor.op.gpu if gpu is None else gpu + dtypes = pd.Series([tensor.dtype] * col_num, index=columns) + if columns is None: + columns = dtypes.index + else: + gpu = None + if columns is not None: + dtypes = pd.Series([], index=columns) + else: + dtypes = pd.Series([], index=pd.Index([], dtype=object)) + if index is not None and not isinstance(index, ENTITY_TYPE): + index = pd.Index(index) + op = DataFrameFromTensor( + input=tensor, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + return op(tensor, index, columns, dtypes) + + +def dataframe_from_1d_tileables( + d: Dict[Any, TileableType], + index: Union[TileableType, pd.Index, list] = None, + columns: Union[pd.Index, list] = None, + gpu: bool = None, + sparse: bool = False, +): + data = dict() + for k, v in d.items(): + if isinstance(v, (list, tuple)) and any( + isinstance(sv, ENTITY_TYPE) for sv in 
v + ): + data[k] = astensor(v) + else: + data[k] = v + d = data + if columns is not None: + tileables = [d.get(c) for c in columns] + else: + columns = list(d.keys()) + tileables = list(d.values()) + + gpu = ( + next((t.op.gpu for t in tileables if hasattr(t, "op")), False) + if gpu is None + else gpu + ) + dtypes = pd.Series( + [t.dtype if hasattr(t, "dtype") else pd.Series(t).dtype for t in tileables], + index=columns, + ) + if index is not None and not isinstance(index, ENTITY_TYPE): + index = pd.Index(index) + op = DataFrameFromTensor( + input=d, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + return op(d, index, columns, dtypes) + + +class SeriesFromTensor(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.SERIES_FROM_TENSOR + + input = KeyField("input") + index = AnyField("index") + + def _set_inputs(self, inputs: List[EntityType]): + super()._set_inputs(inputs) + if self.input is not None: + self.input = self.inputs[0] + if self.index is not None and hasattr(self.index, "key"): + self.index = self.inputs[-1] + + @classmethod + def tile(cls, op: "SeriesFromTensor"): + if op.index is None or not hasattr(op.index, "key"): + # check all inputs to make sure no unknown chunk shape + if has_unknown_shape(*op.inputs): + yield + + if op.input is None: + return cls._tile_tensor_none(op) + + out_series = op.outputs[0] + in_tensor = op.inputs[0] + nsplits = in_tensor.nsplits + + index_tensor = series_index = None + if op.index is not None: + if hasattr(op.index, "key"): + index_tensor = yield from recursive_tile(op.index.rechunk([nsplits[0]])) + else: + series_index = op.index + + index_start = 0 + out_chunks = [] + for in_chunk in in_tensor.chunks: + new_op = op.copy().reset_key() + new_op.extra_params["index_start"] = index_start + chunk_inputs = [in_chunk] + if index_tensor is not None: + index_chunk = index_tensor.cix[in_chunk.index] + chunk_inputs.append(index_chunk) + if isinstance(op.index, INDEX_TYPE): + index_value = index_chunk.index_value + else: + index_value = parse_index( + pd.Index([], dtype=in_chunk.dtype), + index_chunk, + type(new_op).__name__, + ) + else: + chunk_pd_index = series_index[ + index_start : index_start + in_chunk.shape[0] + ] + index_value = parse_index(chunk_pd_index) + new_op.index = chunk_pd_index + index_start += in_chunk.shape[0] + out_chunk = new_op.new_chunk( + chunk_inputs, + shape=in_chunk.shape, + index=in_chunk.index, + index_value=index_value, + name=out_series.name, + dtype=out_series.dtype, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, + shape=out_series.shape, + dtype=out_series.dtype, + index_value=out_series.index_value, + name=out_series.name, + chunks=out_chunks, + nsplits=in_tensor.nsplits, + ) + + @classmethod + def _tile_tensor_none(cls, op: "SeriesFromTensor"): + out_series = op.outputs[0] + + out_chunks = [] + assert isinstance(op.index, INDEX_TYPE) + # tile as index + for index_chunk in op.index.chunks: + index_value = index_chunk.index_value + + chunk_shape = (index_chunk.shape[0],) + chunk_index = (index_chunk.index[0],) + + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [index_chunk], + shape=chunk_shape, + index=chunk_index, + index_value=index_value, + dtype=out_series.dtype, + name=out_series.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_series.params.copy() + params["nsplits"] = (op.index.nsplits[0],) + params["chunks"] = out_chunks + return new_op.new_tileables(out_series.inputs, kws=[params]) + 
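+    # Note: execute() below materializes a single chunk. It looks up the tensor
+    # chunk data (and the index chunk data, when the index is itself a tileable)
+    # from the execution context and builds the corresponding pandas Series; an
+    # empty RangeIndex is treated as "index not specified".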
+ @classmethod + def execute(cls, ctx: Union[dict, Context], op: "SeriesFromTensor"): + chunk = op.outputs[0] + if op.input is not None: + tensor_data = ctx[op.input.key] + else: + tensor_data = None + + if op.index is not None and hasattr(op.index, "key"): + index_data = ctx[op.index.key] + else: + index_data = op.index + if ( + tensor_data is not None + and isinstance(index_data, pd.RangeIndex) + and len(index_data) == 0 + ): + # index not specified + index_data = None + + ctx[chunk.key] = pd.Series( + tensor_data, index=index_data, name=chunk.name, dtype=chunk.dtype + ) + + def __call__( + self, + input_tensor: Tensor, + index: Union[TileableType, pd.Index], + dtype: np.dtype, + name: Any, + ): + inputs = [input_tensor] if input_tensor is not None else [] + if index is not None: + if not isinstance(index, pd.Index): + if isinstance(index, INDEX_TYPE): + self.index = index + index_value = index.index_value + inputs.append(index) + elif isinstance(index, ENTITY_TYPE): + self.index = index + index = astensor(index) + if index.ndim != 1: + raise ValueError(f"index should be 1-d, got {index.ndim}-d") + index_value = parse_index( + pd.Index([], dtype=index.dtype), index, type(self).__name__ + ) + inputs.append(index) + else: + self.index = index = pd.Index(index) + index_value = parse_index(index) + else: + self.index = index + index_value = parse_index(index) + elif input_tensor is not None: + if pd.isna(input_tensor.shape[0]): + pd_index = pd.RangeIndex(-1) + else: + pd_index = pd.RangeIndex(start=0, stop=input_tensor.shape[0]) + index_value = parse_index(pd_index) + self.index = pd_index + else: + self.index = index = pd.Index([], dtype=object) + index_value = parse_index(index) + + if input_tensor is not None: + shape = input_tensor.shape + elif index is not None: + shape = index.shape + else: + shape = (0,) + + return self.new_series( + inputs, shape=shape, dtype=dtype, index_value=index_value, name=name + ) + + +def series_from_tensor( + tensor: Tensor, + index: Union[TileableType, pd.Index, list] = None, + name: Any = None, + dtype: np.dtype = None, + gpu: bool = None, + sparse: bool = False, +): + if tensor is not None: + if tensor.ndim > 1 or tensor.ndim <= 0: + raise TypeError(f"Not support create Series from {tensor.ndim} dims tensor") + gpu = tensor.op.gpu if gpu is None else gpu + dtype = dtype or tensor.dtype + else: + gpu = None + dtype = dtype or np.dtype(float) + op = SeriesFromTensor(input=tensor, gpu=gpu, sparse=sparse) + return op(tensor, index, dtype, name) diff --git a/python/xorbits/_mars/dataframe/datasource/from_vineyard.py b/python/xorbits/_mars/dataframe/datasource/from_vineyard.py new file mode 100644 index 000000000..4714948d5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/from_vineyard.py @@ -0,0 +1,261 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType +from ...core.context import get_context +from ...serialization.serializables import Int32Field, StringField +from ...tensor.datasource.from_vineyard import resolve_vineyard_socket +from ...utils import calc_nsplits, has_unknown_shape, lazy_import +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +vineyard = lazy_import("vineyard") +vy_data_utils = lazy_import("vineyard.data.utils", rename="vy_data_utils") + + +class DataFrameFromVineyard(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_FROM_VINEYARD_CHUNK + + # generated columns for metadata + generated_columns = ["id", "worker_address", "dtypes", "shape", "index", "columns"] + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # ObjectID in vineyard + object_id = StringField("object_id") + + # a dummy attr to make sure ops have different keys + operator_index = Int32Field("operator_index") + + def __init__(self, vineyard_socket=None, object_id=None, **kw): + super().__init__( + vineyard_socket=vineyard_socket, + object_id=object_id, + _output_types=[OutputType.dataframe], + **kw + ) + + def check_inputs(self, inputs): + # no inputs + if inputs and len(inputs) > 0: + raise ValueError("DataFrame data source has no inputs") + + def _new_chunks(self, inputs, kws=None, **kw): + shape = kw.get("shape", None) + self.extra_params[ + "shape" + ] = shape # set shape to make the operand key different + return super()._new_chunks(inputs, kws=kws, **kw) + + def _new_tileables(self, inputs, kws=None, **kw): + shape = kw.get("shape", None) + self.extra_params[ + "shape" + ] = shape # set shape to make the operand key different + return super()._new_tileables(inputs, kws=kws, **kw) + + def __call__(self, shape, dtypes=None, index_value=None, columns_value=None): + return self.new_dataframe( + None, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op): + ctx = get_context() + workers = ctx.get_worker_addresses() + + out_chunks = [] + dtypes = pd.Series( + [np.dtype("O")] * len(cls.generated_columns), index=cls.generated_columns + ) + for index, worker in enumerate(workers): + chunk_op = op.copy().reset_key() + chunk_op.expect_worker = worker + chunk_op.operator_index = index + out_chunk = chunk_op.new_chunk( + [], + dtypes=dtypes, + shape=(1, len(cls.generated_columns)), + index=(index, 0), + index_value=parse_index(pd.RangeIndex(0, 1)), + columns_value=parse_index(pd.Index(cls.generated_columns)), + ) + out_chunks.append(out_chunk) + + new_op = op.copy().reset_key() + return new_op.new_dataframes( + op.inputs, + shape=(np.nan, np.nan), + dtypes=dtypes, + chunks=out_chunks, + nsplits=((np.nan,), (np.nan,)), + # use the same value as `read_csv` + index_value=parse_index(pd.RangeIndex(0, 1)), + columns_value=parse_index(pd.Index(cls.generated_columns)), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + meta = client.get_meta(vineyard.ObjectID(op.object_id)) + chunks, dtypes = [], None + for idx in range(meta["partitions_-size"]): + chunk_meta = meta["partitions_-%d" % idx] + columns = pd.Index(vy_data_utils.from_json(chunk_meta["columns_"])) + shape = (np.nan, len(columns)) + if not chunk_meta.islocal: + continue + if dtypes is None: + dtypes = [] + for idx in 
range(len(columns)): + column_meta = chunk_meta["__values_-value-%d" % idx] + dtype = vy_data_utils.normalize_dtype( + column_meta["value_type_"], + column_meta.get("value_type_meta_", None), + ) + dtypes.append(dtype) + dtypes = pd.Series(dtypes, index=columns) + chunk_index = ( + chunk_meta["partition_index_row_"], + chunk_meta["partition_index_column_"], + ) + # chunk: (chunk_id, worker_address, dtype, shape, index, columns) + chunks.append( + ( + repr(chunk_meta.id), + ctx.worker_address, + dtypes, + shape, + chunk_index, + columns, + ) + ) + + ctx[op.outputs[0].key] = pd.DataFrame(chunks, columns=cls.generated_columns) + + +class DataFrameFromVineyardChunk(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.TENSOR_FROM_VINEYARD_CHUNK + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # ObjectID of chunk in vineyard + object_id = StringField("object_id") + + def __init__(self, vineyard_socket=None, object_id=None, **kw): + super().__init__(vineyard_socket=vineyard_socket, object_id=object_id, **kw) + + def __call__(self, meta): + return self.new_dataframe([meta]) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + + ctx = get_context() + + in_chunk_keys = [chunk.key for chunk in op.inputs[0].chunks] + out_chunks = [] + chunk_map = dict() + dtypes, columns = None, None + for chunk, infos in zip( + op.inputs[0].chunks, ctx.get_chunks_result(in_chunk_keys) + ): + for _, info in infos.iterrows(): + chunk_op = op.copy().reset_key() + chunk_op.object_id = info["id"] + chunk_op.expect_worker = info["worker_address"] + dtypes = info["dtypes"] + columns = info["columns"] + shape = info["shape"] + chunk_index = info["index"] + chunk_map[chunk_index] = info["shape"] + out_chunk = chunk_op.new_chunk( + [chunk], + shape=shape, + index=chunk_index, + dtypes=dtypes, + index_value=parse_index(pd.RangeIndex(0, -1)), + columns_value=parse_index(pd.Index(columns)), + ) + out_chunks.append(out_chunk) + + nsplits = calc_nsplits(chunk_map) + shape = [np.sum(nsplit) for nsplit in nsplits] + new_op = op.copy().reset_key() + return new_op.new_dataframes( + op.inputs, + shape=shape, + dtypes=dtypes, + chunks=out_chunks, + nsplits=nsplits, + index_value=parse_index(pd.RangeIndex(0, -1)), + columns_value=parse_index(pd.Index(columns)), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + client = vineyard.connect(socket) + ctx[op.outputs[0].key] = client.get(vineyard.ObjectID(op.object_id)) + + +def from_vineyard(df, vineyard_socket=None): + if vineyard is not None and isinstance(df, vineyard.Object): # pragma: no cover + if "vineyard::GlobalDataFrame" not in df.typename: + raise TypeError( + "The input dataframe %r is not a vineyard' GlobalDataFrame" % df + ) + object_id = df.id + else: + object_id = df + if vineyard is not None and isinstance(object_id, vineyard.ObjectID): + object_id = repr(object_id) + metaop = DataFrameFromVineyard( + vineyard_socket=vineyard_socket, + object_id=object_id, + dtype=np.dtype("byte"), + gpu=None, + ) + meta = metaop( + shape=(np.nan,), + dtypes=pd.Series([]), + index_value=parse_index(pd.Index([])), + columns_value=parse_index(pd.Index([])), + ) + op = DataFrameFromVineyardChunk( + vineyard_socket=vineyard_socket, object_id=object_id, gpu=None + ) + return op(meta) diff --git a/python/xorbits/_mars/dataframe/datasource/index.py 
b/python/xorbits/_mars/dataframe/datasource/index.py new file mode 100644 index 000000000..37da1be45 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/index.py @@ -0,0 +1,260 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import BoolField, DataTypeField, IndexField +from ...tensor.utils import get_chunk_slices +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import decide_series_chunk_size, is_cudf, parse_index + + +class IndexDataSource(DataFrameOperand, DataFrameOperandMixin): + """ + Represent data from pandas Index + """ + + _op_type_ = OperandDef.INDEX_DATA_SOURCE + + data = IndexField("data") + dtype = DataTypeField("dtype") + store_data = BoolField("store_data") + + def __init__(self, data=None, dtype=None, gpu=None, store_data=None, **kw): + if dtype is None and data is not None: + dtype = data.dtype + if gpu is None and is_cudf(data): # pragma: no cover + gpu = True + super().__init__( + data=data, + dtype=dtype, + gpu=gpu, + store_data=store_data, + _output_types=[OutputType.index], + **kw + ) + + def __call__(self, shape=None, chunk_size=None, inp=None, name=None, names=None): + if inp is None: + # create from pandas Index + name = name if name is not None else self.data.name + names = names if names is not None else self.data.names + return self.new_index( + None, + shape=shape, + dtype=self.dtype, + index_value=parse_index(self.data, store_data=self.store_data), + name=name, + names=names, + raw_chunk_size=chunk_size, + ) + elif hasattr(inp, "index_value"): + # get index from Mars DataFrame, Series or Index + name = name if name is not None else inp.index_value.name + names = names if names is not None else [name] + if inp.index_value.has_value(): + self.data = data = inp.index_value.to_pandas() + return self.new_index( + None, + shape=(inp.shape[0],), + dtype=data.dtype, + index_value=parse_index(data, store_data=self.store_data), + name=name, + names=names, + raw_chunk_size=chunk_size, + ) + else: + if self.dtype is None: + self.dtype = inp.index_value.to_pandas().dtype + return self.new_index( + [inp], + shape=(inp.shape[0],), + dtype=self.dtype, + index_value=inp.index_value, + name=name, + names=names, + ) + else: + if inp.ndim != 1: + raise ValueError("Index data must be 1-dimensional") + # get index from tensor + dtype = inp.dtype if self.dtype is None else self.dtype + pd_index = pd.Index([], dtype=dtype) + if self.dtype is None: + self.dtype = pd_index.dtype + return self.new_index( + [inp], + shape=inp.shape, + dtype=self.dtype, + index_value=parse_index(pd_index, inp, store_data=self.store_data), + name=name, + names=names, + ) + + @classmethod + def _tile_from_pandas(cls, op): + index = op.outputs[0] + raw_index = op.data + + memory_usage = raw_index.memory_usage(deep=True) + chunk_size = 
index.extra_params.raw_chunk_size or options.chunk_size + chunk_size = decide_series_chunk_size(index.shape, chunk_size, memory_usage) + chunk_size_idxes = (range(len(size)) for size in chunk_size) + + out_chunks = [] + for chunk_index, chunk_shape in zip( + itertools.product(*chunk_size_idxes), itertools.product(*chunk_size) + ): + chunk_op = op.copy().reset_key() + slc = get_chunk_slices(chunk_size, chunk_index) + if is_cudf(raw_index): # pragma: no cover + chunk_op.data = chunk_data = raw_index[slc[0]] + else: + chunk_op.data = chunk_data = raw_index[slc] + out_chunk = chunk_op.new_chunk( + None, + shape=chunk_shape, + dtype=index.dtype, + index=chunk_index, + name=index.name, + index_value=parse_index(chunk_data, store_data=op.store_data), + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_indexes( + None, + index.shape, + dtype=index.dtype, + index_value=index.index_value, + name=index.name, + chunks=out_chunks, + nsplits=chunk_size, + ) + + @classmethod + def _tile_from_dataframe(cls, op): + inp = op.inputs[0] + out = op.outputs[0] + + out_chunks = [] + if inp.ndim == 1: + # series, index + for c in inp.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [c], + shape=c.shape, + dtype=out.dtype, + index=c.index, + index_value=c.index_value, + name=out.name, + ) + out_chunks.append(out_chunk) + nsplits = inp.nsplits + else: + # DataFrame + nsplit = inp.nsplits[1] + axis_1_index = np.argmin(nsplit).item() + for i in range(inp.chunk_shape[0]): + chunk_index = (i, axis_1_index) + c = inp.cix[chunk_index] + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [c], + shape=(c.shape[0],), + dtype=out.dtype, + index=(i,), + index_value=c.index_value, + name=out.name, + ) + out_chunks.append(out_chunk) + nsplits = (inp.nsplits[0],) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _tile_from_tensor(cls, op): + inp = op.inputs[0] + out = op.outputs[0] + out_chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + index_value = parse_index( + out.index_value.to_pandas(), c, store_data=op.store_data + ) + out_chunk = chunk_op.new_chunk( + [c], + shape=c.shape, + dtype=out.dtype, + index=c.index, + index_value=index_value, + name=out.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = inp.nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def tile(cls, op): + if not op.inputs: + # from pandas + return cls._tile_from_pandas(op) + elif hasattr(op.inputs[0], "index_value"): + # from DataFrame or Series + return cls._tile_from_dataframe(op) + else: + # from tensor + return cls._tile_from_tensor(op) + + @classmethod + def execute(cls, ctx, op): + if not op.inputs: + # from pandas + ctx[op.outputs[0].key] = op.data + else: + out = op.outputs[0] + inp = ctx[op.inputs[0].key] + dtype = out.dtype if out.dtype != object else None + if hasattr(inp, "index"): + # DataFrame, Series + ctx[out.key] = pd.Index(inp.index, dtype=dtype, name=out.name) + else: + ctx[out.key] = pd.Index(inp, dtype=dtype, name=out.name) + + +def from_pandas(data, chunk_size=None, gpu=None, sparse=False, store_data=False): + op = IndexDataSource( + data=data, gpu=gpu, sparse=sparse, dtype=data.dtype, store_data=store_data + ) + return op(shape=data.shape, chunk_size=chunk_size) + + +def 
from_tileable(tileable, dtype=None, name=None, names=None): + op = IndexDataSource(gpu=tileable.op.gpu, sparse=tileable.issparse(), dtype=dtype) + return op(inp=tileable, name=name, names=names) diff --git a/python/xorbits/_mars/dataframe/datasource/read_csv.py b/python/xorbits/_mars/dataframe/datasource/read_csv.py new file mode 100644 index 000000000..594a7c273 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/read_csv.py @@ -0,0 +1,760 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from io import BytesIO +from urllib.parse import urlparse + +import numpy as np +import pandas as pd + +try: + from pyarrow import NativeFile +except ImportError: # pragma: no cover + NativeFile = None + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...lib.filesystem import file_size, get_fs, glob, open_file +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + Int64Field, + ListField, + StringField, +) +from ...utils import FixedSizeFileObject, lazy_import, parse_readable_size +from ..arrays import ArrowStringDtype +from ..utils import build_empty_df, contain_arrow_dtype, parse_index, to_arrow_dtypes +from .core import ( + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDatasource, + IncrementalIndexDataSourceMixin, + merge_small_files, +) + +cudf = lazy_import("cudf") + + +def _find_delimiter(f, block_size=2**16): + delimiter = b"\n" + if f.tell() == 0: + return 0 + while True: + b = f.read(block_size) + if not b: + return f.tell() + elif delimiter in b: + return f.tell() - len(b) + b.index(delimiter) + 1 + + +def _find_hdfs_start_end(f, offset, size): + # As pyarrow doesn't support `readline` operation (https://github.com/apache/arrow/issues/3838), + # we need to find the start and end of file block manually. + + # Be careful with HdfsFile's seek, it doesn't allow seek beyond EOF. 
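+ # The two seek-and-scan passes below align the byte range to line boundaries: + # seek to `offset` (clamped to the file size) and advance just past the next + # newline to find `start`, then repeat from `offset + size` to find `end`, so + # each chunk parses only complete lines and adjacent chunks neither overlap + # nor skip records.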
+ loc = min(offset, f.size()) + f.seek(loc) + start = _find_delimiter(f) + loc = min(offset + size, f.size()) + f.seek(loc) + end = _find_delimiter(f) + return start, end + + +def _find_chunk_start_end(f, offset, size): + if NativeFile is not None and isinstance(f, NativeFile): + return _find_hdfs_start_end(f, offset, size) + f.seek(offset) + if f.tell() == 0: + start = 0 + else: + f.readline() + start = f.tell() + f.seek(offset + size) + f.readline() + end = f.tell() + return start, end + + +class DataFrameReadCSV( + IncrementalIndexDatasource, + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDataSourceMixin, +): + _op_type_ = OperandDef.READ_CSV + + path = AnyField("path") + names = ListField("names") + sep = StringField("sep") + header = AnyField("header") + index_col = Int32Field("index_col") + compression = StringField("compression") + usecols = AnyField("usecols") + offset = Int64Field("offset") + size = Int64Field("size") + incremental_index = BoolField("incremental_index") + use_arrow_dtype = BoolField("use_arrow_dtype") + keep_usecols_order = BoolField("keep_usecols_order", default=None) + storage_options = DictField("storage_options") + merge_small_files = BoolField("merge_small_files") + merge_small_file_options = DictField("merge_small_file_options") + + def get_columns(self): + return self.usecols + + def set_pruned_columns(self, columns, *, keep_order=None): + self.usecols = columns + self.keep_usecols_order = keep_order + + @classmethod + def _tile_compressed(cls, op): + # Compression does not support break into small parts + df = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_op.offset = 0 + chunk_op.size = file_size(op.path, storage_options=op.storage_options) + shape = df.shape + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(0, 0), + index_value=df.index_value, + columns_value=df.columns_value, + dtypes=df.dtypes, + ) + new_op = op.copy() + nsplits = ((np.nan,), (df.shape[1],)) + return new_op.new_dataframes( + None, + df.shape, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + chunks=[new_chunk], + nsplits=nsplits, + ) + + @classmethod + def _tile(cls, op: "DataFrameReadCSV"): + if op.compression: + return cls._tile_compressed(op) + + df = op.outputs[0] + chunk_bytes = df.extra_params.chunk_bytes + chunk_bytes = int(parse_readable_size(chunk_bytes)[0]) + + dtypes = df.dtypes + if ( + op.use_arrow_dtype is None + and not op.gpu + and options.dataframe.use_arrow_dtype + ): # pragma: no cover + # check if use_arrow_dtype set on the server side + dtypes = to_arrow_dtypes(df.dtypes) + + path_prefix = "" + if isinstance(op.path, (tuple, list)): + paths = op.path + elif get_fs(op.path, op.storage_options).isdir(op.path): + parsed_path = urlparse(op.path) + if parsed_path.scheme.lower() == "hdfs": + path_prefix = f"{parsed_path.scheme}://{parsed_path.netloc}" + paths = get_fs(op.path, op.storage_options).ls(op.path) + else: + paths = glob( + op.path.rstrip("/") + "/*", storage_options=op.storage_options + ) + else: + paths = glob(op.path, storage_options=op.storage_options) + + out_chunks = [] + index_num = 0 + for path in paths: + path = path_prefix + path + total_bytes = file_size(path, storage_options=op.storage_options) + offset = 0 + for _ in range(int(np.ceil(total_bytes * 1.0 / chunk_bytes))): + chunk_op = op.copy().reset_key() + chunk_op.path = path + chunk_op.offset = offset + chunk_op.size = min(chunk_bytes, total_bytes - offset) + shape = (np.nan, len(dtypes)) + index_value = 
parse_index(df.index_value.to_pandas(), path, index_num) + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(index_num, 0), + index_value=index_value, + columns_value=df.columns_value, + dtypes=dtypes, + ) + out_chunks.append(new_chunk) + index_num += 1 + offset += chunk_bytes + + new_op = op.copy() + nsplits = ((np.nan,) * len(out_chunks), (df.shape[1],)) + df = new_op.new_dataframe( + None, + df.shape, + dtypes=dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + if op.merge_small_files: + df = merge_small_files(df, **(op.merge_small_file_options or dict())) + return [df] + + @classmethod + def _pandas_read_csv(cls, f, op): + csv_kwargs = op.extra_params.copy() + out_df = op.outputs[0] + start, end = _find_chunk_start_end(f, op.offset, op.size) + f.seek(start) + b = FixedSizeFileObject(f, end - start) + if hasattr(out_df, "dtypes"): + dtypes = out_df.dtypes + else: + # Output will be a Series in some optimize rules. + dtypes = pd.Series([out_df.dtype], index=[out_df.name]) + if end == start: + # the last chunk may be empty + df = build_empty_df(dtypes) + if op.keep_usecols_order and not isinstance(op.usecols, list): + # convert to Series, if usecols is a scalar + df = df[op.usecols] + else: + if start == 0: + # The first chunk contains header + # As we specify names and dtype, we need to skip header rows + csv_kwargs["header"] = op.header + if op.usecols: + usecols = op.usecols if isinstance(op.usecols, list) else [op.usecols] + else: + usecols = op.usecols + if contain_arrow_dtype(dtypes): + # when keep_default_na is True which is default, + # will replace null value with np.nan, + # which will cause failure when converting to arrow string array + csv_kwargs["keep_default_na"] = False + csv_kwargs["dtype"] = cls._select_arrow_dtype(dtypes) + df = pd.read_csv( + b, + sep=op.sep, + names=op.names, + index_col=op.index_col, + usecols=usecols, + nrows=op.nrows, + **csv_kwargs, + ) + if op.keep_usecols_order: + df = df[op.usecols] + return df + + @classmethod + def _cudf_read_csv(cls, op): # pragma: no cover + if op.usecols: + usecols = op.usecols if isinstance(op.usecols, list) else [op.usecols] + else: + usecols = op.usecols + csv_kwargs = op.extra_params + if op.offset == 0: + df = cudf.read_csv( + op.path, + byte_range=(op.offset, op.size), + sep=op.sep, + usecols=usecols, + **csv_kwargs, + ) + else: + df = cudf.read_csv( + op.path, + byte_range=(op.offset, op.size), + sep=op.sep, + names=op.names, + usecols=usecols, + nrows=op.nrows, + **csv_kwargs, + ) + + if op.keep_usecols_order: + df = df[op.usecols] + return df + + @classmethod + def _contains_arrow_dtype(cls, dtypes): + return any(isinstance(dtype, ArrowStringDtype) for dtype in dtypes) + + @classmethod + def _select_arrow_dtype(cls, dtypes): + return dict( + (c, dtype) + for c, dtype in dtypes.items() + if isinstance(dtype, ArrowStringDtype) + ) + + @classmethod + def execute(cls, ctx, op): + xdf = cudf if op.gpu else pd + out_df = op.outputs[0] + csv_kwargs = op.extra_params.copy() + + with open_file( + op.path, compression=op.compression, storage_options=op.storage_options + ) as f: + if op.compression is not None: + # As we specify names and dtype, we need to skip header rows + csv_kwargs["header"] = op.header + dtypes = op.outputs[0].dtypes + if contain_arrow_dtype(dtypes): + # when keep_default_na is True which is default, + # will replace null value with np.nan, + # which will cause failure when converting to arrow string array + 
csv_kwargs["keep_default_na"] = False + csv_kwargs["dtype"] = cls._select_arrow_dtype(dtypes) + df = xdf.read_csv( + f, + sep=op.sep, + names=op.names, + index_col=op.index_col, + usecols=op.usecols, + nrows=op.nrows, + **csv_kwargs, + ) + if op.keep_usecols_order: + df = df[op.usecols] + else: + df = cls._cudf_read_csv(op) if op.gpu else cls._pandas_read_csv(f, op) + ctx[out_df.key] = df + + def estimate_size(cls, ctx, op): + phy_size = op.size * (op.memory_scale or 1) + ctx[op.outputs[0].key] = (phy_size, phy_size * 2) + + def __call__( + self, index_value=None, columns_value=None, dtypes=None, chunk_bytes=None + ): + self._output_types = [OutputType.dataframe] + shape = (np.nan, len(dtypes)) + return self.new_dataframe( + None, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + chunk_bytes=chunk_bytes, + ) + + +def read_csv( + path, + names=None, + sep: str = ",", + index_col=None, + compression=None, + header="infer", + dtype=None, + usecols=None, + nrows=None, + chunk_bytes="64M", + gpu=None, + head_bytes="100k", + head_lines=None, + incremental_index: bool = True, + use_arrow_dtype: bool = None, + storage_options: dict = None, + memory_scale: int = None, + merge_small_files: bool = True, + merge_small_file_options: dict = None, + **kwargs, +): + r""" + Read a comma-separated values (csv) file into DataFrame. + Also supports optionally iterating or breaking of the file + into chunks. + + Parameters + ---------- + path : str + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv, + you can also read from external resources using a URL like: + hdfs://localhost:8020/test.csv. + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + By file-like object, we refer to objects with a ``read()`` method, such as + a file handler (e.g. via builtin ``open`` function) or ``StringIO``. + sep : str, default ',' + Delimiter to use. If sep is None, the C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator by Python's builtin sniffer + tool, ``csv.Sniffer``. In addition, separators longer than 1 character and + different from ``'\s+'`` will be interpreted as regular expressions and + will also force the use of the Python parsing engine. Note that regex + delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``. + delimiter : str, default ``None`` + Alias for sep. + header : int, list of int, default 'infer' + Row number(s) to use as the column names, and the start of the + data. Default behavior is to infer the column names: if no names + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a multi-index on the columns + e.g. [0,1,3]. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so ``header=0`` denotes the first line of + data rather than the first line of the file. + names : array-like, optional + List of column names to use. 
If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. + index_col : int, str, sequence of int / str, or False, default ``None`` + Column(s) to use as the row labels of the ``DataFrame``, either given as + string name or column index. If a sequence of int / str is given, a + MultiIndex is used. + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g. when you have a malformed file with delimiters at + the end of each line. + usecols : list-like or callable, optional + Return a subset of the columns. If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in `names` or + inferred from the document header row(s). For example, a valid list-like + `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + To instantiate a DataFrame from ``data`` with element order preserved use + ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns + in ``['foo', 'bar']`` order or + ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` + for ``['bar', 'foo']`` order. + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to True. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster + parsing time and lower memory usage. + squeeze : bool, default False + If the parsed data only contains one column then return a Series. + prefix : str, optional + Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... + mangle_dupe_cols : bool, default True + Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than + 'X'...'X'. Passing in False will cause data to be overwritten if there + are duplicate names in the columns. + dtype : Type name or dict of column -> type, optional + Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32, + 'c': 'Int64'} + Use `str` or `object` together with suitable `na_values` settings + to preserve and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. + engine : {'c', 'python'}, optional + Parser engine to use. The C engine is faster while the python engine is + currently more feature-complete. + converters : dict, optional + Dict of functions for converting values in certain columns. Keys can either + be integers or column labels. + true_values : list, optional + Values to consider as True. + false_values : list, optional + Values to consider as False. + skipinitialspace : bool, default False + Skip spaces after delimiter. + skiprows : list-like, int or callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (int) + at the start of the file. + If callable, the callable function will be evaluated against the row + indices, returning True if the row should be skipped and False otherwise. + An example of a valid callable argument would be ``lambda x: x in [0, 2]``. + skipfooter : int, default 0 + Number of lines at bottom of file to skip (Unsupported with engine='c'). + nrows : int, optional + Number of rows of file to read. Useful for reading pieces of large files. 
+ na_values : scalar, str, list-like, or dict, optional + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. By default the following values are interpreted as + NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', + '1.#IND', '1.#QNAN', '', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', + 'nan', 'null'. + keep_default_na : bool, default True + Whether or not to include the default NaN values when parsing the data. + Depending on whether `na_values` is passed in, the behavior is as follows: + * If `keep_default_na` is True, and `na_values` are specified, `na_values` + is appended to the default NaN values used for parsing. + * If `keep_default_na` is True, and `na_values` are not specified, only + the default NaN values are used for parsing. + * If `keep_default_na` is False, and `na_values` are specified, only + the NaN values specified `na_values` are used for parsing. + * If `keep_default_na` is False, and `na_values` are not specified, no + strings will be parsed as NaN. + Note that if `na_filter` is passed in as False, the `keep_default_na` and + `na_values` parameters will be ignored. + na_filter : bool, default True + Detect missing value markers (empty strings and the value of na_values). In + data without any NAs, passing na_filter=False can improve the performance + of reading a large file. + verbose : bool, default False + Indicate number of NA values placed in non-numeric columns. + skip_blank_lines : bool, default True + If True, skip over blank lines rather than interpreting as NaN values. + parse_dates : bool or list of int or names or list of lists or dict, default False + The behavior is as follows: + * boolean. If True -> try parsing the index. + * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call + result 'foo' + If a column or index cannot be represented as an array of datetimes, + say because of an unparsable value or a mixture of timezones, the column + or index will be returned unaltered as an object data type. For + non-standard datetime parsing, use ``pd.to_datetime`` after + ``pd.read_csv``. To parse an index or column with a mixture of timezones, + specify ``date_parser`` to be a partially-applied + :func:`pandas.to_datetime` with ``utc=True``. See + :ref:`io.csv.mixed_timezones` for more. + Note: A fast-path exists for iso8601-formatted dates. + infer_datetime_format : bool, default False + If True and `parse_dates` is enabled, pandas will attempt to infer the + format of the datetime strings in the columns, and if it can be inferred, + switch to a faster method of parsing them. In some cases this can increase + the parsing speed by 5-10x. + keep_date_col : bool, default False + If True and `parse_dates` specifies combining multiple columns then + keep the original columns. + date_parser : function, optional + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses ``dateutil.parser.parser`` to do the + conversion. 
Pandas will try to call `date_parser` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by `parse_dates` into a single array + and pass that; and 3) call `date_parser` once for each row using one or + more strings (corresponding to the columns defined by `parse_dates`) as + arguments. + dayfirst : bool, default False + DD/MM format dates, international and European format. + cache_dates : bool, default True + If True, use a cache of unique, converted dates to apply the datetime + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + .. versionadded:: 0.25.0 + iterator : bool, default False + Return TextFileReader object for iteration or getting chunks with + ``get_chunk()``. + chunksize : int, optional + Return TextFileReader object for iteration. + See the `IO Tools docs + `_ + for more information on ``iterator`` and ``chunksize``. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer' and + `filepath_or_buffer` is path-like, then detect compression from the + following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + decompression). If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. + thousands : str, optional + Thousands separator. + decimal : str, default '.' + Character to recognize as decimal point (e.g. use ',' for European data). + lineterminator : str (length 1), optional + Character to break file into lines. Only valid with C parser. + quotechar : str (length 1), optional + The character used to denote the start and end of a quoted item. Quoted + items can include the delimiter and it will be ignored. + quoting : int or csv.QUOTE_* instance, default 0 + Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of + QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). + doublequote : bool, default ``True`` + When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive quotechar elements INSIDE a + field as a single ``quotechar`` element. + escapechar : str (length 1), optional + One-character string used to escape other characters. + comment : str, optional + Indicates remainder of line should not be parsed. If found at the beginning + of a line, the line will be ignored altogether. This parameter must be a + single character. Like empty lines (as long as ``skip_blank_lines=True``), + fully commented lines are ignored by the parameter `header` but not by + `skiprows`. For example, if ``comment='#'``, parsing + ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being + treated as the header. + encoding : str, optional + Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python + standard encodings + `_ . + dialect : str or csv.Dialect, optional + If provided, this parameter will override values (default or not) for the + following parameters: `delimiter`, `doublequote`, `escapechar`, + `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + override values, a ParserWarning will be issued. See csv.Dialect + documentation for more details. + error_bad_lines : bool, default True + Lines with too many fields (e.g. 
a csv line with too many commas) will by + default cause an exception to be raised, and no DataFrame will be returned. + If False, then these "bad lines" will dropped from the DataFrame that is + returned. + warn_bad_lines : bool, default True + If error_bad_lines is False, and warn_bad_lines is True, a warning for each + "bad line" will be output. + delim_whitespace : bool, default False + Specifies whether or not whitespace (e.g. ``' '`` or ``' '``) will be + used as the sep. Equivalent to setting ``sep='\s+'``. If this option + is set to True, nothing should be passed in for the ``delimiter`` + parameter. + low_memory : bool, default True + Internally process the file in chunks, resulting in lower memory use + while parsing, but possibly mixed type inference. To ensure no mixed + types either set False, or specify the type with the `dtype` parameter. + Note that the entire file is read into a single DataFrame regardless, + use the `chunksize` or `iterator` parameter to return the data in chunks. + (Only valid with C parser). + float_precision : str, optional + Specifies which converter the C engine should use for floating-point + values. The options are `None` for the ordinary converter, + `high` for the high-precision converter, and `round_trip` for the + round-trip converter. + chunk_bytes: int, float or str, optional + Number of chunk bytes. + gpu: bool, default False + If read into cudf DataFrame. + head_bytes: int, float or str, optional + Number of bytes to use in the head of file, mainly for data inference. + head_lines: int, optional + Number of lines to use in the head of file, mainly for data inference. + incremental_index: bool, default True + If index_col not specified, ensure range index incremental, + gain a slightly better performance if setting False. + use_arrow_dtype: bool, default None + If True, use arrow dtype to store columns. + storage_options: dict, optional + Options for storage connection. + merge_small_files: bool, default True + Merge small files whose size is small. + merge_small_file_options: dict + Options for merging small files + + Returns + ------- + DataFrame + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + + See Also + -------- + to_csv : Write DataFrame to a comma-separated values (csv) file. 
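As a rough illustration of the byte-range planning that ``DataFrameReadCSV._tile`` performs above (the sketch below is not part of this module, the file size is hypothetical, and newline alignment is left to ``_find_chunk_start_end`` at execution time):

import numpy as np

def plan_csv_chunks(total_bytes: int, chunk_bytes: int):
    # One chunk per chunk_bytes-sized byte range, as in DataFrameReadCSV._tile;
    # the final chunk simply receives whatever bytes remain.
    n_chunks = int(np.ceil(total_bytes / chunk_bytes))
    return [
        (offset, min(chunk_bytes, total_bytes - offset))
        for offset in range(0, n_chunks * chunk_bytes, chunk_bytes)
    ]

# A hypothetical 200 MiB CSV read with the default chunk_bytes="64M" yields
# four (offset, size) ranges; each range is later snapped to line boundaries
# so that no record is split across chunks.
print(plan_csv_chunks(200 * 1024**2, 64 * 1024**2))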
+ + Examples + -------- + >>> import mars.dataframe as md + >>> from mars.lib.filesystem.oss import build_oss_path + >>> md.read_csv('data.csv') # doctest: +SKIP + >>> # read from HDFS + >>> md.read_csv('hdfs://localhost:8020/test.csv') # doctest: +SKIP + >>> # read from OSS + >>> auth_path = build_oss_path(file_path, access_key_id, access_key_secret, end_point) + >>> md.read_csv(auth_path) + """ + # infer dtypes and columns + if isinstance(path, (list, tuple)): + file_path = path[0] + elif get_fs(path, storage_options).isdir(path): + parsed_path = urlparse(path) + if parsed_path.scheme.lower() == "hdfs": + path_prefix = f"{parsed_path.scheme}://{parsed_path.netloc}" + file_path = path_prefix + get_fs(path, storage_options).ls(path)[0] + else: + file_path = glob(path.rstrip("/") + "/*", storage_options)[0] + else: + file_path = glob(path, storage_options)[0] + + with open_file( + file_path, compression=compression, storage_options=storage_options + ) as f: + if head_lines is not None: + b = b"".join([f.readline() for _ in range(head_lines)]) + else: + head_bytes = int(parse_readable_size(head_bytes)[0]) + head_start, head_end = _find_chunk_start_end(f, 0, head_bytes) + f.seek(head_start) + b = f.read(head_end - head_start) + mini_df = pd.read_csv( + BytesIO(b), + sep=sep, + index_col=index_col, + dtype=dtype, + names=names, + header=header, + ) + if header == "infer" and names is not None: + # ignore header as we always specify names + header = None + else: + # replace header if we specify names or header + header = 0 + if names is None: + names = list(mini_df.columns) + if usecols: + usecols = usecols if isinstance(usecols, list) else [usecols] + col_index = sorted(mini_df.columns.get_indexer(usecols)) + mini_df = mini_df.iloc[:, col_index] + + if isinstance(mini_df.index, pd.RangeIndex): + index_value = parse_index(pd.RangeIndex(-1)) + else: + index_value = parse_index(mini_df.index) + columns_value = parse_index(mini_df.columns, store_data=True) + if index_col and not isinstance(index_col, int): + index_col = list(mini_df.columns).index(index_col) + op = DataFrameReadCSV( + path=path, + names=names, + sep=sep, + header=header, + index_col=index_col, + usecols=usecols, + compression=compression, + gpu=gpu, + incremental_index=incremental_index, + use_arrow_dtype=use_arrow_dtype, + storage_options=storage_options, + memory_scale=memory_scale, + merge_small_files=merge_small_files, + merge_small_file_options=merge_small_file_options, + **kwargs, + ) + chunk_bytes = chunk_bytes or options.chunk_store_limit + dtypes = mini_df.dtypes + if use_arrow_dtype is None: + use_arrow_dtype = options.dataframe.use_arrow_dtype + if not gpu and use_arrow_dtype: + dtypes = to_arrow_dtypes(dtypes, test_df=mini_df) + ret = op( + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + chunk_bytes=chunk_bytes, + ) + if nrows is not None: + return ret.head(nrows) + return ret diff --git a/python/xorbits/_mars/dataframe/datasource/read_parquet.py b/python/xorbits/_mars/dataframe/datasource/read_parquet.py new file mode 100644 index 000000000..86c3dc63a --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/read_parquet.py @@ -0,0 +1,695 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Dict +from urllib.parse import urlparse + +import numpy as np +import pandas as pd + +try: + import pyarrow as pa + import pyarrow.parquet as pq +except ImportError: + pa = None + +try: + import fastparquet +except ImportError: + fastparquet = None + +from ... import opcodes as OperandDef +from ...config import options +from ...lib.filesystem import FileSystem, file_size, get_fs, glob, open_file +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + Int64Field, + ListField, + StringField, +) +from ...utils import is_object_dtype, lazy_import +from ..arrays import ArrowStringDtype +from ..operands import OutputType +from ..utils import contain_arrow_dtype, parse_index, to_arrow_dtypes +from .core import ( + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDatasource, + IncrementalIndexDataSourceMixin, + merge_small_files, +) + +PARQUET_MEMORY_SCALE = 15 +STRING_FIELD_OVERHEAD = 50 +cudf = lazy_import("cudf") + + +def check_engine(engine): + if engine == "auto": + if pa is not None: + return "pyarrow" + elif fastparquet is not None: # pragma: no cover + return "fastparquet" + else: # pragma: no cover + raise RuntimeError("Please install either pyarrow or fastparquet.") + elif engine == "pyarrow": + if pa is None: # pragma: no cover + raise RuntimeError("Please install pyarrow first.") + return engine + elif engine == "fastparquet": + if fastparquet is None: # pragma: no cover + raise RuntimeError("Please install fastparquet first.") + return engine + else: # pragma: no cover + raise RuntimeError("Unsupported engine {} to read parquet.".format(engine)) + + +def get_engine(engine): + if engine == "pyarrow": + return ArrowEngine() + elif engine == "fastparquet": + return FastpaquetEngine() + else: # pragma: no cover + raise RuntimeError("Unsupported engine {}".format(engine)) + + +class ParquetEngine: + def get_row_num(self, f): + raise NotImplementedError + + def read_dtypes(self, f, **kwargs): + raise NotImplementedError + + def read_to_pandas( + self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs + ): + raise NotImplementedError + + def read_group_to_pandas( + self, f, group_index, columns=None, nrows=None, use_arrow_dtype=None, **kwargs + ): + raise NotImplementedError + + def read_partitioned_to_pandas( + self, + f, + partitions: Dict, + partition_keys: Dict, + columns=None, + nrows=None, + use_arrow_dtype=None, + **kwargs, + ): + raw_df = self.read_to_pandas( + f, columns=columns, nrows=nrows, use_arrow_dtype=use_arrow_dtype, **kwargs + ) + for col, value in partition_keys.items(): + dictionary = partitions[col] + raw_df[col] = pd.Series( + value, + dtype=pd.CategoricalDtype(categories=dictionary.tolist()), + index=raw_df.index, + ) + return raw_df + + def read_partitioned_dtypes(self, fs: FileSystem, directory, storage_options): + # As ParquetDataset will iterate all files, + # here we just find one file to infer dtypes + current_path = directory + partition_cols = [] + while fs.isdir(current_path): + _, dirs, files = next(fs.walk(current_path)) + dirs = [d for d in dirs if not 
d.startswith(".")] + files = [f for f in files if not f.startswith(".")] + if len(files) == 0: + # directory as partition + partition_cols.append(dirs[0].split("=", 1)[0]) + current_path = os.path.join(current_path, dirs[0]) + elif len(dirs) == 0: + # parquet files in deepest directory + current_path = os.path.join(current_path, files[0]) + else: # pragma: no cover + raise ValueError( + "Files and directories are mixed in an intermediate directory" + ) + + # current path is now a parquet file + with open_file(current_path, storage_options=storage_options) as f: + dtypes = self.read_dtypes(f) + for partition in partition_cols: + dtypes[partition] = pd.CategoricalDtype() + return dtypes + + +def _parse_prefix(path): + path_prefix = "" + if isinstance(path, str): + parsed_path = urlparse(path) + if parsed_path.scheme: + path_prefix = f"{parsed_path.scheme}://{parsed_path.netloc}" + return path_prefix + + +class ArrowEngine(ParquetEngine): + def get_row_num(self, f): + file = pq.ParquetFile(f) + return file.metadata.num_rows + + def read_dtypes(self, f, **kwargs): + file = pq.ParquetFile(f) + return file.schema_arrow.empty_table().to_pandas().dtypes + + @classmethod + def _table_to_pandas(cls, t, nrows=None, use_arrow_dtype=None): + if nrows is not None: + t = t.slice(0, nrows) + if use_arrow_dtype: + df = t.to_pandas(types_mapper={pa.string(): ArrowStringDtype()}.get) + else: + df = t.to_pandas() + return df + + def read_to_pandas( + self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs + ): + file = pq.ParquetFile(f) + t = file.read(columns=columns, **kwargs) + return self._table_to_pandas(t, nrows=nrows, use_arrow_dtype=use_arrow_dtype) + + def read_group_to_pandas( + self, f, group_index, columns=None, nrows=None, use_arrow_dtype=None, **kwargs + ): + file = pq.ParquetFile(f) + t = file.read_row_group(group_index, columns=columns, **kwargs) + return self._table_to_pandas(t, nrows=nrows, use_arrow_dtype=use_arrow_dtype) + + +class FastpaquetEngine(ParquetEngine): + def get_row_num(self, f): + file = fastparquet.ParquetFile(f) + return file.count() + + def read_dtypes(self, f, **kwargs): + file = fastparquet.ParquetFile(f) + dtypes_dict = file._dtypes() + return pd.Series(dict((c, dtypes_dict[c]) for c in file.columns)) + + def read_to_pandas( + self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs + ): + file = fastparquet.ParquetFile(f) + df = file.to_pandas(columns, **kwargs) + if nrows is not None: + df = df.head(nrows) + if use_arrow_dtype: + df = df.astype(to_arrow_dtypes(df.dtypes).to_dict()) + return df + + +class CudfEngine: + @classmethod + def read_to_cudf(cls, file, columns: list = None, nrows: int = None, **kwargs): + df = cudf.read_parquet(file, columns=columns, **kwargs) + if nrows is not None: + df = df.head(nrows) + return df + + def read_group_to_cudf( + self, file, group_index: int, columns: list = None, nrows: int = None, **kwargs + ): + return self.read_to_cudf( + file, columns=columns, nrows=nrows, row_groups=group_index, **kwargs + ) + + @classmethod + def read_partitioned_to_cudf( + cls, + file, + partitions: Dict, + partition_keys: Dict, + columns=None, + nrows=None, + **kwargs, + ): + # cudf will read entire partitions even if only one partition provided, + # so we just read with pyarrow and convert to cudf DataFrame + file = pq.ParquetFile(file) + t = file.read(columns=columns, **kwargs) + t = t.slice(0, nrows) if nrows is not None else t + t = pa.table(t.columns, names=t.column_names) + raw_df = cudf.DataFrame.from_arrow(t) + for col, value 
in partition_keys.items(): + dictionary = partitions[col].tolist() + codes = cudf.core.column.as_column( + dictionary.index(value), length=len(raw_df) + ) + raw_df[col] = cudf.core.column.build_categorical_column( + categories=dictionary, + codes=codes, + size=codes.size, + offset=codes.offset, + ordered=False, + ) + return raw_df + + +class DataFrameReadParquet( + IncrementalIndexDatasource, + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDataSourceMixin, +): + _op_type_ = OperandDef.READ_PARQUET + + path = AnyField("path") + engine = StringField("engine") + columns = ListField("columns") + use_arrow_dtype = BoolField("use_arrow_dtype") + groups_as_chunks = BoolField("groups_as_chunks") + group_index = Int32Field("group_index") + read_kwargs = DictField("read_kwargs") + incremental_index = BoolField("incremental_index") + storage_options = DictField("storage_options") + is_partitioned = BoolField("is_partitioned") + merge_small_files = BoolField("merge_small_files") + merge_small_file_options = DictField("merge_small_file_options") + # for chunk + partitions = DictField("partitions", default=None) + partition_keys = DictField("partition_keys", default=None) + num_group_rows = Int64Field("num_group_rows", default=None) + # as read meta may be too time-consuming when number of files is large, + # thus we only read first file to get row number and raw file size + first_chunk_row_num = Int64Field("first_chunk_row_num") + first_chunk_raw_bytes = Int64Field("first_chunk_raw_bytes") + + def get_columns(self): + return self.columns + + def set_pruned_columns(self, columns, *, keep_order=None): + self.columns = columns + + @classmethod + def _to_arrow_dtypes(cls, dtypes, op): + if ( + op.use_arrow_dtype is None + and not op.gpu + and options.dataframe.use_arrow_dtype + ): # pragma: no cover + # check if use_arrow_dtype set on the server side + dtypes = to_arrow_dtypes(dtypes) + return dtypes + + @classmethod + def _tile_partitioned(cls, op: "DataFrameReadParquet"): + out_df = op.outputs[0] + shape = (np.nan, out_df.shape[1]) + dtypes = cls._to_arrow_dtypes(out_df.dtypes, op) + dataset = pq.ParquetDataset(op.path, use_legacy_dataset=False) + + path_prefix = _parse_prefix(op.path) + + chunk_index = 0 + out_chunks = [] + first_chunk_row_num, first_chunk_raw_bytes = None, None + for i, fragment in enumerate(dataset.fragments): + chunk_op = op.copy().reset_key() + chunk_op.path = chunk_path = path_prefix + fragment.path + relpath = os.path.relpath(chunk_path, op.path) + partition_keys = dict( + tuple(s.split("=")) for s in relpath.split(os.sep)[:-1] + ) + chunk_op.partition_keys = partition_keys + chunk_op.partitions = dict( + zip( + dataset.partitioning.schema.names, dataset.partitioning.dictionaries + ) + ) + if i == 0: + first_row_group = fragment.row_groups[0] + first_chunk_raw_bytes = first_row_group.total_byte_size + first_chunk_row_num = first_row_group.num_rows + chunk_op.first_chunk_row_num = first_chunk_row_num + chunk_op.first_chunk_raw_bytes = first_chunk_raw_bytes + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(chunk_index, 0), + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=dtypes, + ) + out_chunks.append(new_chunk) + chunk_index += 1 + + new_op = op.copy() + nsplits = ((np.nan,) * len(out_chunks), (out_df.shape[1],)) + return new_op.new_dataframes( + None, + out_df.shape, + dtypes=dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def 
_tile_no_partitioned(cls, op: "DataFrameReadParquet"): + chunk_index = 0 + out_chunks = [] + out_df = op.outputs[0] + + dtypes = cls._to_arrow_dtypes(out_df.dtypes, op) + shape = (np.nan, out_df.shape[1]) + + path_prefix = "" + if isinstance(op.path, (tuple, list)): + paths = op.path + elif get_fs(op.path, op.storage_options).isdir(op.path): + parsed_path = urlparse(op.path) + if parsed_path.scheme.lower() == "hdfs": + path_prefix = f"{parsed_path.scheme}://{parsed_path.netloc}" + paths = get_fs(op.path, op.storage_options).ls(op.path) + else: + paths = glob(op.path, storage_options=op.storage_options) + + first_chunk_row_num, first_chunk_raw_bytes = None, None + for i, pth in enumerate(paths): + pth = path_prefix + pth + if i == 0: + with open_file(pth, storage_options=op.storage_options) as f: + first_chunk_row_num = get_engine(op.engine).get_row_num(f) + first_chunk_raw_bytes = file_size( + pth, storage_options=op.storage_options + ) + + if op.groups_as_chunks: + num_row_groups = pq.ParquetFile(pth).num_row_groups + for group_idx in range(num_row_groups): + chunk_op = op.copy().reset_key() + chunk_op.path = pth + chunk_op.group_index = group_idx + chunk_op.first_chunk_row_num = first_chunk_row_num + chunk_op.first_chunk_raw_bytes = first_chunk_raw_bytes + chunk_op.num_group_rows = num_row_groups + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(chunk_index, 0), + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=dtypes, + ) + out_chunks.append(new_chunk) + chunk_index += 1 + else: + chunk_op = op.copy().reset_key() + chunk_op.path = pth + chunk_op.first_chunk_row_num = first_chunk_row_num + chunk_op.first_chunk_raw_bytes = first_chunk_raw_bytes + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(chunk_index, 0), + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=dtypes, + ) + out_chunks.append(new_chunk) + chunk_index += 1 + + new_op = op.copy() + nsplits = ((np.nan,) * len(out_chunks), (out_df.shape[1],)) + return new_op.new_dataframes( + None, + out_df.shape, + dtypes=dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def _tile(cls, op: "DataFrameReadParquet"): + if op.is_partitioned: + tiled = cls._tile_partitioned(op) + else: + tiled = cls._tile_no_partitioned(op) + if op.merge_small_files: + tiled = [ + merge_small_files(tiled[0], **(op.merge_small_file_options or dict())) + ] + return tiled + + @classmethod + def _execute_partitioned(cls, ctx, op: "DataFrameReadParquet"): + out = op.outputs[0] + engine = get_engine(op.engine) + with open_file(op.path, storage_options=op.storage_options) as f: + ctx[out.key] = engine.read_partitioned_to_pandas( + f, + op.partitions, + op.partition_keys, + columns=op.columns, + nrows=op.nrows, + use_arrow_dtype=op.use_arrow_dtype, + **op.read_kwargs or dict(), + ) + + @classmethod + def _pandas_read_parquet(cls, ctx: dict, op: "DataFrameReadParquet"): + out = op.outputs[0] + path = op.path + + if op.partitions is not None: + return cls._execute_partitioned(ctx, op) + + engine = get_engine(op.engine) + with open_file(path, storage_options=op.storage_options) as f: + use_arrow_dtype = contain_arrow_dtype(out.dtypes) + if op.groups_as_chunks: + df = engine.read_group_to_pandas( + f, + op.group_index, + columns=op.columns, + nrows=op.nrows, + use_arrow_dtype=use_arrow_dtype, + **op.read_kwargs or dict(), + ) + else: + df = engine.read_to_pandas( + f, + columns=op.columns, + 
nrows=op.nrows, + use_arrow_dtype=use_arrow_dtype, + **op.read_kwargs or dict(), + ) + + ctx[out.key] = df + + @classmethod + def _cudf_read_parquet(cls, ctx: dict, op: "DataFrameReadParquet"): + out = op.outputs[0] + path = op.path + + engine = CudfEngine() + if os.path.exists(path): + file = op.path + close = lambda: None + else: # pragma: no cover + file = open_file(path, storage_options=op.storage_options) + close = file.close + + try: + if op.partitions is not None: + ctx[out.key] = engine.read_partitioned_to_cudf( + file, + op.partitions, + op.partition_keys, + columns=op.columns, + nrows=op.nrows, + **op.read_kwargs or dict(), + ) + else: + if op.groups_as_chunks: + df = engine.read_group_to_cudf( + file, + op.group_index, + columns=op.columns, + nrows=op.nrows, + **op.read_kwargs or dict(), + ) + else: + df = engine.read_to_cudf( + file, + columns=op.columns, + nrows=op.nrows, + **op.read_kwargs or dict(), + ) + ctx[out.key] = df + finally: + close() + + @classmethod + def execute(cls, ctx, op: "DataFrameReadParquet"): + if not op.gpu: + cls._pandas_read_parquet(ctx, op) + else: + cls._cudf_read_parquet(ctx, op) + + @classmethod + def estimate_size(cls, ctx, op: "DataFrameReadParquet"): + first_chunk_row_num = op.first_chunk_row_num + first_chunk_raw_bytes = op.first_chunk_raw_bytes + raw_bytes = file_size(op.path, storage_options=op.storage_options) + if op.num_group_rows: + raw_bytes = ( + np.ceil(np.divide(raw_bytes, op.num_group_rows)).astype(np.int64).item() + ) + + estimated_row_num = ( + np.ceil(first_chunk_row_num * (raw_bytes / first_chunk_raw_bytes)) + .astype(np.int64) + .item() + ) + phy_size = raw_bytes * (op.memory_scale or PARQUET_MEMORY_SCALE) + n_strings = len([dt for dt in op.outputs[0].dtypes if is_object_dtype(dt)]) + pd_size = phy_size + n_strings * estimated_row_num * STRING_FIELD_OVERHEAD + ctx[op.outputs[0].key] = (pd_size, pd_size + phy_size) + + def __call__(self, index_value=None, columns_value=None, dtypes=None): + self._output_types = [OutputType.dataframe] + shape = (np.nan, len(dtypes)) + return self.new_dataframe( + None, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + +def read_parquet( + path, + engine: str = "auto", + columns: list = None, + groups_as_chunks: bool = False, + use_arrow_dtype: bool = None, + incremental_index: bool = False, + storage_options: dict = None, + memory_scale: int = None, + merge_small_files: bool = True, + merge_small_file_options: dict = None, + gpu: bool = None, + **kwargs, +): + """ + Load a parquet object from the file path, returning a DataFrame. + + Parameters + ---------- + path : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. + For file URLs, a host is expected. A local file could be: + ``file://localhost/path/to/table.parquet``. + A file URL can also be a path to a directory that contains multiple + partitioned parquet files. Both pyarrow and fastparquet support + paths to directories as well as file URLs. A directory path could be: + ``file://localhost/path/to/tables``. + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' + Parquet library to use. The default behavior is to try 'pyarrow', + falling back to 'fastparquet' if 'pyarrow' is unavailable. + columns : list, default=None + If not None, only these columns will be read from the file. 
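For the partitioned directory layout described under ``path``, a minimal sketch (not part of this module, with hypothetical paths) of how hive-style partition keys are recovered from a fragment path, mirroring the ``relpath`` handling in ``DataFrameReadParquet._tile_partitioned`` above:

import os

def partition_keys(fragment_path: str, dataset_root: str) -> dict:
    # Every directory level except the file itself is expected to look like
    # "column=value", which is how _tile_partitioned fills chunk_op.partition_keys.
    relpath = os.path.relpath(fragment_path, dataset_root)
    return dict(tuple(part.split("=", 1)) for part in relpath.split(os.sep)[:-1])

# partition_keys("/data/sales/year=2023/month=01/part-0.parquet", "/data/sales")
# -> {"year": "2023", "month": "01"}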
+ groups_as_chunks : bool, default False + if True, each row group correspond to a chunk. + if False, each file correspond to a chunk. + Only available for 'pyarrow' engine. + incremental_index: bool, default False + If index_col not specified, ensure range index incremental, + gain a slightly better performance if setting False. + use_arrow_dtype: bool, default None + If True, use arrow dtype to store columns. + storage_options: dict, optional + Options for storage connection. + memory_scale: int, optional + Scale that real memory occupation divided with raw file size. + merge_small_files: bool, default True + Merge small files whose size is small. + merge_small_file_options: dict + Options for merging small files + **kwargs + Any additional kwargs are passed to the engine. + + Returns + ------- + Mars DataFrame + """ + + engine_type = check_engine(engine) + engine = get_engine(engine_type) + + single_path = path[0] if isinstance(path, list) else path + fs = get_fs(single_path, storage_options) + is_partitioned = False + if fs.isdir(single_path): + paths = fs.ls(path) + if all(fs.isdir(p) for p in paths): + # If all are directories, it is read as a partitioned dataset. + dtypes = engine.read_partitioned_dtypes(fs, path, storage_options) + is_partitioned = True + else: + with fs.open(paths[0], mode="rb") as f: + dtypes = engine.read_dtypes(f) + else: + if not isinstance(path, list): + file_path = glob(path, storage_options=storage_options)[0] + else: + file_path = path[0] + + with open_file(file_path, storage_options=storage_options) as f: + dtypes = engine.read_dtypes(f) + + if columns: + dtypes = dtypes[columns] + + if use_arrow_dtype is None: + use_arrow_dtype = options.dataframe.use_arrow_dtype + if use_arrow_dtype: + dtypes = to_arrow_dtypes(dtypes) + + index_value = parse_index(pd.RangeIndex(-1)) + columns_value = parse_index(dtypes.index, store_data=True) + op = DataFrameReadParquet( + path=path, + engine=engine_type, + columns=columns, + groups_as_chunks=groups_as_chunks, + use_arrow_dtype=use_arrow_dtype, + read_kwargs=kwargs, + incremental_index=incremental_index, + storage_options=storage_options, + is_partitioned=is_partitioned, + memory_scale=memory_scale, + merge_small_files=merge_small_files, + merge_small_file_options=merge_small_file_options, + gpu=gpu, + ) + return op(index_value=index_value, columns_value=columns_value, dtypes=dtypes) diff --git a/python/xorbits/_mars/dataframe/datasource/read_raydataset.py b/python/xorbits/_mars/dataframe/datasource/read_raydataset.py new file mode 100644 index 000000000..76191437c --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/read_raydataset.py @@ -0,0 +1,249 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import functools +import warnings + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import ( + AnyField, + BoolField, + Int64Field, + ListField, + ReferenceField, +) +from ..utils import lazy_import, parse_index, tokenize +from .core import ( + HeadOptimizedDataSource, + IncrementalIndexDatasource, + IncrementalIndexDataSourceMixin, +) + +ray = lazy_import("ray") +# Ray Datasets is available in early preview at ray.data with Ray 1.6+ +# (and ray.experimental.data in Ray 1.5) +ray_dataset = lazy_import("ray.data", rename="ray_dataset") +ray_exp_dataset = lazy_import("ray.experimental.data", rename="ray_exp_dataset") +real_ray_dataset = ray_dataset or ray_exp_dataset + + +class DataFrameReadRayDataset( + IncrementalIndexDatasource, IncrementalIndexDataSourceMixin +): + _op_type_ = OperandDef.READ_RAYDATASET + + refs = AnyField("refs", default=None) + columns = ListField("columns", default=None) + incremental_index = BoolField("incremental_index", default=None) + nrows = Int64Field("nrows", default=None) + + @classmethod + def _tile_partitioned(cls, op: "DataFrameReadRayDataset"): + out_df = op.outputs[0] + shape = (np.nan, out_df.shape[1]) + dtypes = out_df.dtypes + dataset = op.refs + + chunk_index = 0 + out_chunks = [] + for object_ref in dataset: + chunk_op = op.copy().reset_key() + chunk_op._refs = [object_ref] + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(chunk_index, 0), + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=dtypes, + ) + out_chunks.append(new_chunk) + chunk_index += 1 + + new_op = op.copy() + nsplits = ((np.nan,) * len(out_chunks), (out_df.shape[1],)) + return new_op.new_dataframes( + None, + out_df.shape, + dtypes=dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def _tile(cls, op): + return cls._tile_partitioned(op) + + @classmethod + def execute(cls, ctx, op: "DataFrameReadRayDataset"): + out = op.outputs[0] + ref = op.refs[0] + + df = ray.get(ref) + ctx[out.key] = df + + def __call__(self, index_value=None, columns_value=None, dtypes=None): + shape = (np.nan, len(dtypes)) + return self.new_dataframe( + None, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + +def read_ray_dataset(ds, columns=None, incremental_index=False, **kwargs): + assert isinstance(ds, real_ray_dataset.Dataset) + refs = ds.to_pandas_refs() + schema = ds.schema() + + import pyarrow as pa + + try: + from ray.data._internal.pandas_block import PandasBlockSchema + except ImportError: + try: + from ray.data.impl.pandas_block import PandasBlockSchema + except ImportError: # pragma: no cover + PandasBlockSchema = type(None) + + if isinstance(schema, PandasBlockSchema): + dtypes = pd.Series(schema.types, index=schema.names) + elif isinstance(schema, pa.Schema): + dtypes = schema.empty_table().to_pandas().dtypes + else: + raise NotImplementedError(f"Unsupported format of schema {schema}") + + index_value = parse_index(pd.RangeIndex(-1)) + columns_value = parse_index(dtypes.index, store_data=True) + op = DataFrameReadRayDataset( + refs=refs, columns=columns, incremental_index=incremental_index + ) + return op(index_value=index_value, columns_value=columns_value, dtypes=dtypes) + + +# keep it for back compatibility +@functools.wraps(read_ray_dataset) +def read_raydataset(*args, **kwargs): + warnings.warn( + "read_raydataset has been renamed to read_ray_dataset", + DeprecationWarning, + ) + return 
read_ray_dataset(*args, **kwargs) + + +class DataFrameReadMLDataset(HeadOptimizedDataSource): + _op_type_ = OperandDef.READ_MLDATASET + + mldataset = ReferenceField("mldataset", "ray.util.data.MLDataset", default=None) + columns = ListField("columns", default=None) + + def __init__(self, **kw): + super().__init__(_output_types=[OutputType.dataframe], **kw) + + def _update_key(self): + """We can't direct generate token for mldataset when we use + ray client, so we use all mldataset's actor_id to generate + token. + """ + datas = [] + for value in self._values_: + if isinstance(value, ray.util.data.MLDataset): + actor_sets = [ + ([str(actor) for actor in actor_set.actors], actor_set.transforms) + for actor_set in value.actor_sets + ] + datas.append(actor_sets) + continue + datas.append(value) + self._obj_set("_key", tokenize(type(self).__name__, *datas)) + return self + + def __call__(self, dtypes, nrows: int): + columns_value = parse_index(dtypes.index, store_data=True) + index_value = parse_index(pd.RangeIndex(nrows)) + return self.new_dataframe( + None, + (nrows, len(dtypes)), + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op: "DataFrameReadMLDataset"): + count_iter = op.mldataset.for_each(lambda df: len(df)) + nsplits = [sum(shard) for shard in count_iter.shards()] + nsplits_acc = np.cumsum(nsplits) + out_df = op.outputs[0] + out_chunks = [] + for shard_index in range(op.mldataset.num_shards()): + chunk_op = op.copy().reset_key() + # Make chunk key unique, otherwise all chunk will have same key. + # See `DataFrameFromRecords#tile` + chunk_op.extra_params["shard_index"] = shard_index + shape = (nsplits[shard_index], out_df.shape[1]) + begin_index = nsplits_acc[shard_index] - nsplits[shard_index] + end_index = nsplits_acc[shard_index] + index = parse_index(pd.RangeIndex(start=begin_index, stop=end_index)) + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(shard_index, 0), + index_value=index, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + out_chunks.append(new_chunk) + new_op = op.copy() + nsplits = ((np.nan,) * len(out_chunks), (out_df.shape[1],)) + return new_op.new_dataframes( + None, + out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + shard = op.mldataset.get_shard(chunk.index[0]) + pd_dfs = list(shard) + pd_df = pd.concat(pd_dfs).set_index(chunk.index_value.to_pandas()) + ctx[chunk.key] = pd_df + + +def read_ray_mldataset(mldataset, **kwargs): + import ray.util.data + + assert isinstance(mldataset, ray.util.data.MLDataset) + not_empty_dfs = mldataset.filter(lambda df: len(df) > 0).take(1) + if not not_empty_dfs: + raise ValueError( + f"MLDataset {mldataset} is empty, please provide an non-empty dataset." + ) + df_record: pd.DataFrame = not_empty_dfs[0] + columns = df_record.columns.names + nrows = sum(mldataset.for_each(lambda df: len(df)).gather_async()) + op = DataFrameReadMLDataset(mldataset=mldataset, columns=columns, nrows=nrows) + return op(df_record.dtypes, nrows) diff --git a/python/xorbits/_mars/dataframe/datasource/read_sql.py b/python/xorbits/_mars/dataframe/datasource/read_sql.py new file mode 100644 index 000000000..a239ef9cc --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/read_sql.py @@ -0,0 +1,935 @@ +# Copyright 2022-2023 XProbe Inc. 
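A minimal usage sketch for the Ray datasource above: read_ray_dataset wraps each pandas block reference of a Ray Dataset as one Mars chunk via to_pandas_refs(). The import path and the two toy blocks below are illustrative only; a working Ray installation and an executing Mars session are assumed, and the snippet is a sketch rather than part of this patch.

import pandas as pd
import ray

from xorbits._mars.dataframe.datasource.read_raydataset import read_ray_dataset

# two pandas blocks -> a Ray Dataset with two blocks
ds = ray.data.from_pandas(
    [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [3, 4]})]
)

# each block reference becomes one chunk of the lazy Mars DataFrame
mdf = read_ray_dataset(ds)
print(mdf.dtypes)  # dtypes are inferred from the dataset schema, no execution needed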
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import binascii +import datetime +import pickle +import uuid +from typing import List, Union + +import cloudpickle +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core.context import Context +from ...core.operand import OperatorLogicKeyGeneratorMixin +from ...serialization.serializables import ( + AnyField, + BoolField, + BytesField, + Float64Field, + Int64Field, + ListField, + StringField, +) +from ...tensor.utils import normalize_chunk_sizes +from ...typing import OperandType, TileableType +from ..arrays import ArrowStringDtype +from ..utils import create_sa_connection, parse_index, to_arrow_dtypes +from .core import ( + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDatasource, + IncrementalIndexDataSourceMixin, +) + + +class DataFrameReadSQLLogicKeyGenerator(OperatorLogicKeyGeneratorMixin): + def _get_logic_key_token_values(self): + fields_to_tokenize = [ + getattr(self, k, None) + for k in [ + "table_or_sql", + "schema", + "coerce_float", + "parse_dates", + "columns", + "method", + "incremental_index", + "use_arrow_dtype", + "partition_col", + ] + ] + return super()._get_logic_key_token_values() + fields_to_tokenize + + +class DataFrameReadSQL( + IncrementalIndexDatasource, + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDataSourceMixin, + DataFrameReadSQLLogicKeyGenerator, +): + _op_type_ = OperandDef.READ_SQL + + table_or_sql = AnyField("table_or_sql") + selectable = BytesField( + "selectable", on_serialize=pickle.dumps, on_deserialize=pickle.loads + ) + con = AnyField("con") + schema = StringField("schema") + index_col = AnyField("index_col") + coerce_float = BoolField("coerce_float") + parse_dates = AnyField("parse_dates") + columns = ListField("columns") + engine_kwargs = BytesField( + "engine_kwargs", + on_serialize=cloudpickle.dumps, + on_deserialize=cloudpickle.loads, + ) + row_memory_usage = Float64Field("row_memory_usage") + method = StringField("method") + incremental_index = BoolField("incremental_index") + use_arrow_dtype = BoolField("use_arrow_dtype") + chunk_size = AnyField("chunk_size") + # for chunks + offset = Int64Field("offset") + partition_col = StringField("partition_col") + num_partitions = Int64Field("num_partitions") + low_limit = AnyField("low_limit") + high_limit = AnyField("high_limit") + left_end = BoolField("left_end") + right_end = BoolField("right_end") + nrows = Int64Field("nrows", default=None) + + def get_columns(self): + return self.columns + + def set_pruned_columns(self, columns, *, keep_order=None): + self.columns = columns + + def _get_selectable(self, engine_or_conn, columns=None): + import sqlalchemy as sa + from sqlalchemy import sql + from sqlalchemy.exc import SQLAlchemyError + + # process table_name + if self.selectable is not None: + selectable = self.selectable + else: + if isinstance(self.table_or_sql, sa.Table): + selectable = self.table_or_sql + self.table_or_sql = selectable.name + else: + m = 
sa.MetaData() + try: + selectable = sa.Table( + self.table_or_sql, + m, + autoload=True, + autoload_with=engine_or_conn, + schema=self.schema, + ) + except SQLAlchemyError: + temp_name_1 = "t1_" + binascii.b2a_hex(uuid.uuid4().bytes).decode() + temp_name_2 = "t2_" + binascii.b2a_hex(uuid.uuid4().bytes).decode() + if columns: + selectable = ( + sql.text(self.table_or_sql) + .columns(*[sql.column(c) for c in columns]) + .alias(temp_name_2) + ) + else: + selectable = sql.select( + "*", + from_obj=sql.text( + f"({self.table_or_sql}) AS {temp_name_1}" + ), + ).alias(temp_name_2) + self.selectable = selectable + return selectable + + def _collect_info(self, engine_or_conn, selectable, columns, test_rows): + from sqlalchemy import sql + + # fetch test DataFrame + if columns: + query = sql.select( + [sql.column(c) for c in columns], from_obj=selectable + ).limit(test_rows) + else: + query = sql.select(selectable.columns, from_obj=selectable).limit(test_rows) + test_df = pd.read_sql( + query, + engine_or_conn, + index_col=self.index_col, + coerce_float=self.coerce_float, + parse_dates=self.parse_dates, + ) + if len(test_df) == 0: + self.row_memory_usage = None + else: + self.row_memory_usage = test_df.memory_usage( + deep=True, index=True + ).sum() / len(test_df) + + if self.method == "offset": + # fetch size + size = list( + engine_or_conn.execute( + sql.select([sql.func.count()]).select_from(selectable) + ) + )[0][0] + shape = (size, test_df.shape[1]) + else: + shape = (np.nan, test_df.shape[1]) + + return test_df, shape + + def __call__(self, test_rows, chunk_size): + import sqlalchemy as sa + from sqlalchemy.sql import elements + + with create_sa_connection(self.con, **(self.engine_kwargs or dict())) as con: + self.con = str(con.engine.url) + selectable = self._get_selectable(con) + + # process index_col + index_col = self.index_col + if index_col is not None: + if not isinstance(index_col, (list, tuple)): + index_col = (index_col,) + new_index_col = [] + for col in index_col: + if isinstance(col, (sa.Column, elements.Label)): + new_index_col.append(col.name) + elif isinstance(col, str): + new_index_col.append(col) + elif col is not None: + raise TypeError(f"unknown index_col type: {type(col)}") + self.index_col = new_index_col + + # process columns + columns = self.columns or [] + new_columns = [] + for col in columns: + if isinstance(col, str): + new_columns.append(col) + else: + new_columns.append(col.name) + self.columns = new_columns + + if self.columns: + collect_cols = self.columns + (self.index_col or []) + else: + collect_cols = [] + test_df, shape = self._collect_info( + con, selectable, collect_cols, test_rows + ) + + # reconstruct selectable using known column names + if not collect_cols: + self.columns = list(test_df.columns) + if self.selectable is not None: + self.selectable = None + self._get_selectable( + con, columns=self.columns + (self.index_col or []) + ) + + if self.method == "partition": + if not self.index_col or self.partition_col not in self.index_col: + part_frame = test_df + else: + part_frame = test_df.index.to_frame() + + if not issubclass( + part_frame[self.partition_col].dtype.type, + (np.number, np.datetime64), + ): + raise TypeError( + "Type of partition column should be numeric or datetime, " + f"now it is {test_df[self.partition_col].dtype}" + ) + + if isinstance(test_df.index, pd.RangeIndex): + index_value = parse_index( + pd.RangeIndex(shape[0] if not np.isnan(shape[0]) else -1), + str(selectable), + self.con, + ) + else: + index_value = 
parse_index(test_df.index) + + columns_value = parse_index(test_df.columns, store_data=True) + + dtypes = test_df.dtypes + use_arrow_dtype = self.use_arrow_dtype + if use_arrow_dtype is None: + use_arrow_dtype = options.dataframe.use_arrow_dtype + if use_arrow_dtype: + dtypes = to_arrow_dtypes(dtypes, test_df=test_df) + + return self.new_dataframe( + None, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + raw_chunk_size=chunk_size, + ) + + @classmethod + def _tile_offset(cls, op: "DataFrameReadSQL"): + df = op.outputs[0] + + if op.row_memory_usage is not None: + # Data selected + chunk_size = df.extra_params.raw_chunk_size or options.chunk_size + if chunk_size is None: + chunk_size = ( + int(options.chunk_store_limit / op.row_memory_usage), + df.shape[1], + ) + row_chunk_sizes = normalize_chunk_sizes(df.shape, chunk_size)[0] + else: + # No data selected + row_chunk_sizes = (0,) + offsets = np.cumsum((0,) + row_chunk_sizes).tolist() + + out_chunks = [] + for i, row_size in enumerate(row_chunk_sizes): + chunk_op = op.copy().reset_key() + chunk_op._row_memory_usage = None # no need for chunk + offset = chunk_op.offset = offsets[i] + if df.index_value.has_value(): + # range index + index_value = parse_index( + df.index_value.to_pandas()[offset : offsets[i + 1]] + ) + else: + index_value = parse_index( + df.index_value.to_pandas(), + op.table_or_sql or str(op.selectable), + op.con, + i, + row_size, + ) + out_chunk = chunk_op.new_chunk( + None, + shape=(row_size, df.shape[1]), + columns_value=df.columns_value, + index_value=index_value, + dtypes=df.dtypes, + index=(i, 0), + ) + out_chunks.append(out_chunk) + + nsplits = (row_chunk_sizes, (df.shape[1],)) + new_op = op.copy() + return new_op.new_dataframes( + None, chunks=out_chunks, nsplits=nsplits, **df.params + ) + + def _parse_datetime(self, val): + if isinstance(self.parse_dates, list): + return pd.to_datetime(val) + args = self.parse_dates[self.partition_col] + args = {"format": args} if isinstance(args, str) else args + return pd.to_datetime(val, **args) + + @classmethod + def _tile_partition(cls, op: "DataFrameReadSQL"): + df = op.outputs[0] + + selectable = op._get_selectable(None) + + if op.low_limit is None or op.high_limit is None: + import sqlalchemy as sa + from sqlalchemy import sql + + engine = sa.create_engine(op.con, **(op.engine_kwargs or dict())) + try: + part_col = selectable.columns[op.partition_col] + range_results = engine.execute( + sql.select([sql.func.min(part_col), sql.func.max(part_col)]) + ) + + op.low_limit, op.high_limit = next(range_results) + if op.parse_dates and op.partition_col in op.parse_dates: + op.low_limit = op._parse_datetime(op.low_limit) + op.high_limit = op._parse_datetime(op.high_limit) + finally: + engine.dispose() + + if isinstance(op.low_limit, (datetime.datetime, np.datetime64, pd.Timestamp)): + seps = pd.date_range(op.low_limit, op.high_limit, op.num_partitions + 1) + else: + seps = np.linspace( + op.low_limit, op.high_limit, op.num_partitions + 1, endpoint=True + ) + + out_chunks = [] + for i, (start, end) in enumerate(zip(seps, seps[1:])): + chunk_op = op.copy().reset_key() + chunk_op.row_memory_usage = None # no need for chunk + chunk_op.num_partitions = None + chunk_op.low_limit = start + chunk_op.high_limit = end + chunk_op.left_end = i == 0 + chunk_op.right_end = i == op.num_partitions - 1 + + if df.index_value.has_value(): + # range index + index_value = parse_index(-1, chunk_op.key, chunk_op.index_value.key) + else: + index_value = parse_index( + 
df.index_value.to_pandas(), str(selectable), op.con, i + ) + out_chunk = chunk_op.new_chunk( + None, + shape=(np.nan, df.shape[1]), + columns_value=df.columns_value, + index_value=index_value, + dtypes=df.dtypes, + index=(i, 0), + ) + out_chunks.append(out_chunk) + + nsplits = ((np.nan,) * len(out_chunks), (df.shape[1],)) + new_op = op.copy() + return new_op.new_dataframes( + None, chunks=out_chunks, nsplits=nsplits, **df.params + ) + + @classmethod + def tile(cls, op: "DataFrameReadSQL"): + if op.method == "offset": + return cls._tile_offset(op) + else: + return cls._tile_partition(op) + + @classmethod + def post_tile(cls, op: OperandType, results: List[TileableType]): + if op.method != "offset": + # method `offset` knows shape of each chunk + # just skip incremental process + return super().post_tile(op, results) + + @classmethod + def execute(cls, ctx, op: "DataFrameReadSQL"): + import sqlalchemy as sa + + def _adapt_datetime(dt): + if isinstance(dt, np.datetime64): + return dt.astype("<M8[ms]").astype(datetime.datetime) + if isinstance(dt, pd.Timestamp): + return dt.to_pydatetime() + return dt + + out = op.outputs[0] + + engine = sa.create_engine(op.con, **(op.engine_kwargs or dict())) + try: + selectable = op._get_selectable(engine) + + columns = [selectable.columns[col] for col in op.columns] + column_names = set(op.columns) + if op.index_col: + for icol in op.index_col: + if icol not in column_names: + columns.append(selectable.columns[icol]) + + # convert numpy / pandas timestamps to native datetime for the DB driver + op.low_limit = _adapt_datetime(op.low_limit) + op.high_limit = _adapt_datetime(op.high_limit) + + query = sa.sql.select(columns) + if op.method == "partition": + part_col = selectable.columns[op.partition_col] + if op.left_end: + query = query.where(part_col < op.high_limit) + elif op.right_end: + query = query.where(part_col >= op.low_limit) + else: + query = query.where( + (part_col >= op.low_limit) & (part_col < op.high_limit) + ) + + if hasattr(selectable, "primary_key") and len(selectable.primary_key) > 0: + # if table has primary key, sort as the order + query = query.order_by(*list(selectable.primary_key)) + elif op.index_col: + # if no primary key, sort as the index_col + query = query.order_by( + *[selectable.columns[col] for col in op.index_col] + ) + else: + # at last, we sort by all the columns + query = query.order_by(*columns) + + if op.method == "offset": + query = query.limit(out.shape[0]) + if op.offset > 0: + query = query.offset(op.offset) + + if op.nrows is not None: + query = query.limit(op.nrows) + + df = pd.read_sql( + query, + engine, + index_col=op.index_col, + coerce_float=op.coerce_float, + parse_dates=op.parse_dates, + ) + if op.method == "offset" and op.index_col is None and op.offset > 0: + index = pd.RangeIndex(op.offset, op.offset + out.shape[0]) + if op.nrows is not None: + index = index[: op.nrows] + df.index = index + + use_arrow_dtype = op.use_arrow_dtype + if use_arrow_dtype is None: + use_arrow_dtype = options.dataframe.use_arrow_dtype + if use_arrow_dtype: + dtypes = to_arrow_dtypes(df.dtypes, test_df=df) + for i in range(len(dtypes)): + dtype = dtypes.iloc[i] + if isinstance(dtype, ArrowStringDtype): + df.iloc[:, i] = df.iloc[:, i].astype(dtype) + + if out.ndim == 2: + ctx[out.key] = df + else: + # this happens when column pruning results in one single series + ctx[out.key] = df.iloc[:, 0] + finally: + engine.dispose() + + @classmethod + def post_execute(cls, ctx: Union[dict, Context], op: OperandType): + if op.method != "offset": + # method `offset` knows shape of each chunk + # just skip incremental process + return super().post_execute(ctx, op) + + +def _read_sql( + table_or_sql, + con, + schema=None, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + columns=None, + chunksize=None, + incremental_index=False, + use_arrow_dtype=None, + test_rows=None, + chunk_size=None, + engine_kwargs=None, + partition_col=None, + num_partitions=None, + low_limit=None, + high_limit=None, +): + if chunksize is not None: + raise NotImplementedError("read_sql_query with chunksize not supported") + method = "offset" if partition_col is None else "partition" + + op = DataFrameReadSQL( + table_or_sql=table_or_sql, + selectable=None, + con=con, + schema=schema, + index_col=index_col, + coerce_float=coerce_float, + params=params, + parse_dates=parse_dates, + columns=columns, +
engine_kwargs=engine_kwargs, + incremental_index=incremental_index, + use_arrow_dtype=use_arrow_dtype, + method=method, + partition_col=partition_col, + num_partitions=num_partitions, + low_limit=low_limit, + high_limit=high_limit, + chunk_size=chunk_size, + ) + return op(test_rows, chunk_size) + + +def read_sql( + sql, + con, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + columns=None, + chunksize=None, + test_rows=5, + chunk_size=None, + engine_kwargs=None, + incremental_index=True, + partition_col=None, + num_partitions=None, + low_limit=None, + high_limit=None, +): + """ + Read SQL query or database table into a DataFrame. + + This function is a convenience wrapper around ``read_sql_table`` and + ``read_sql_query`` (for backward compatibility). It will delegate + to the specific function depending on the provided input. A SQL query + will be routed to ``read_sql_query``, while a database table name will + be routed to ``read_sql_table``. Note that the delegated function might + have more specific notes about their functionality not listed here. + + Parameters + ---------- + sql : str or SQLAlchemy Selectable (select or text object) + SQL query to be executed or a table name. + con : SQLAlchemy connectable (engine/connection) or database str URI + or DBAPI2 connection (fallback mode)' + + Using SQLAlchemy makes it possible to use any DB supported by that + library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible + for engine disposal and connection closure for the SQLAlchemy connectable. See + `here `_ + index_col : str or list of strings, optional, default: None + Column(s) to set as index(MultiIndex). + coerce_float : bool, default True + Attempts to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + params : list, tuple or dict, optional, default: None + List of parameters to pass to execute method. The syntax used + to pass parameters is database driver dependent. Check your + database driver documentation for which of the five syntax styles, + described in PEP 249's paramstyle, is supported. + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}. + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite. + columns : list, default: None + List of column names to select from SQL table (only used when reading + a table). + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number of rows + to include in each chunk. Note that this argument is only kept for + compatibility. If a non-none value passed, an error will be reported. + test_rows: int, default 5 + The number of rows to fetch for inferring dtypes. + chunk_size: : int or tuple of ints, optional + Specifies chunk size for each dimension. + engine_kwargs: dict, default None + Extra kwargs to pass to sqlalchemy.create_engine + incremental_index: bool, default True + If index_col not specified, ensure range index incremental, + gain a slightly better performance if setting False. 
+ partition_col : str, default None + Specify name of the column to split the result of the query. If + specified, the range ``[low_limit, high_limit]`` will be divided + into ``n_partitions`` chunks with equal lengths. We do not + guarantee the sizes of chunks be equal. When the value is None, + ``OFFSET`` and ``LIMIT`` clauses will be used to cut the result + of the query. + num_partitions : int, default None + The number of chunks to divide the result of the query into, + when ``partition_col`` is specified. + low_limit : default None + The lower bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the minimum of + the column. + high_limit : default None + The higher bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the maximum of + the column. + + Returns + ------- + DataFrame + + See Also + -------- + read_sql_table : Read SQL database table into a DataFrame. + read_sql_query : Read SQL query into a DataFrame. + """ + return _read_sql( + table_or_sql=sql, + con=con, + index_col=index_col, + coerce_float=coerce_float, + params=params, + parse_dates=parse_dates, + columns=columns, + engine_kwargs=engine_kwargs, + incremental_index=incremental_index, + chunksize=chunksize, + test_rows=test_rows, + chunk_size=chunk_size, + partition_col=partition_col, + num_partitions=num_partitions, + low_limit=low_limit, + high_limit=high_limit, + ) + + +def read_sql_table( + table_name, + con, + schema=None, + index_col=None, + coerce_float=True, + parse_dates=None, + columns=None, + chunksize=None, + test_rows=5, + chunk_size=None, + engine_kwargs=None, + incremental_index=True, + use_arrow_dtype=None, + partition_col=None, + num_partitions=None, + low_limit=None, + high_limit=None, +): + """ + Read SQL database table into a DataFrame. + + Given a table name and a SQLAlchemy connectable, returns a DataFrame. + This function does not support DBAPI connections. + + Parameters + ---------- + table_name : str + Name of SQL table in database. + con : SQLAlchemy connectable or str + A database URI could be provided as as str. + SQLite DBAPI connection mode not supported. + schema : str, default None + Name of SQL schema in database to query (if database flavor + supports this). Uses default schema if None (default). + index_col : str or list of str, optional, default: None + Column(s) to set as index(MultiIndex). + coerce_float : bool, default True + Attempts to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point. Can result in loss of Precision. + parse_dates : list or dict, default None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite. + columns : list, default None + List of column names to select from SQL table. + chunksize : int, default None + If specified, returns an iterator where `chunksize` is the number of + rows to include in each chunk. Note that this argument is only kept + for compatibility. If a non-none value passed, an error will be + reported. + test_rows: int, default 5 + The number of rows to fetch for inferring dtypes. 
+ chunk_size: : int or tuple of ints, optional + Specifies chunk size for each dimension. + engine_kwargs: dict, default None + Extra kwargs to pass to sqlalchemy.create_engine + incremental_index: bool, default True + If index_col not specified, ensure range index incremental, + gain a slightly better performance if setting False. + use_arrow_dtype: bool, default None + If True, use arrow dtype to store columns. + partition_col : str, default None + Specify name of the column to split the result of the query. If + specified, the range ``[low_limit, high_limit]`` will be divided + into ``n_partitions`` chunks with equal lengths. We do not + guarantee the sizes of chunks be equal. When the value is None, + ``OFFSET`` and ``LIMIT`` clauses will be used to cut the result + of the query. + num_partitions : int, default None + The number of chunks to divide the result of the query into, + when ``partition_col`` is specified. + low_limit : default None + The lower bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the minimum of + the column. + high_limit : default None + The higher bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the maximum of + the column. + + Returns + ------- + DataFrame + A SQL table is returned as two-dimensional data structure with labeled + axes. + + See Also + -------- + read_sql_query : Read SQL query into a DataFrame. + read_sql : Read SQL query or database table into a DataFrame. + + Notes + ----- + Any datetime values with time zone information will be converted to UTC. + + Examples + -------- + >>> import mars.dataframe as md + >>> md.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP + """ + return _read_sql( + table_or_sql=table_name, + con=con, + schema=schema, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + engine_kwargs=engine_kwargs, + incremental_index=incremental_index, + use_arrow_dtype=use_arrow_dtype, + chunksize=chunksize, + test_rows=test_rows, + chunk_size=chunk_size, + partition_col=partition_col, + num_partitions=num_partitions, + low_limit=low_limit, + high_limit=high_limit, + ) + + +def read_sql_query( + sql, + con, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + chunksize=None, + test_rows=5, + chunk_size=None, + engine_kwargs=None, + incremental_index=True, + use_arrow_dtype=None, + partition_col=None, + num_partitions=None, + low_limit=None, + high_limit=None, +): + """ + Read SQL query into a DataFrame. + + Returns a DataFrame corresponding to the result set of the query + string. Optionally provide an `index_col` parameter to use one of the + columns as the index, otherwise default integer index will be used. + + Parameters + ---------- + sql : str SQL query or SQLAlchemy Selectable (select or text object) + SQL query to be executed. + con : SQLAlchemy connectable(engine/connection), database str URI, + or sqlite3 DBAPI2 connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. + If a DBAPI2 object, only sqlite3 is supported. + index_col : str or list of strings, optional, default: None + Column(s) to set as index(MultiIndex). + coerce_float : bool, default True + Attempts to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point. Useful for SQL result sets. + params : list, tuple or dict, optional, default: None + List of parameters to pass to execute method. 
The syntax used + to pass parameters is database driver dependent. Check your + database driver documentation for which of the five syntax styles, + described in PEP 249's paramstyle, is supported. + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}. + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite. + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number of + rows to include in each chunk. Note that this argument is only kept + for compatibility. If a non-none value passed, an error will be + reported. + incremental_index: bool, default True + If index_col not specified, ensure range index incremental, + gain a slightly better performance if setting False. + use_arrow_dtype: bool, default None + If True, use arrow dtype to store columns. + test_rows: int, default 5 + The number of rows to fetch for inferring dtypes. + chunk_size: : int or tuple of ints, optional + Specifies chunk size for each dimension. + engine_kwargs: dict, default None + Extra kwargs to pass to sqlalchemy.create_engine + partition_col : str, default None + Specify name of the column to split the result of the query. If + specified, the range ``[low_limit, high_limit]`` will be divided + into ``n_partitions`` chunks with equal lengths. We do not + guarantee the sizes of chunks be equal. When the value is None, + ``OFFSET`` and ``LIMIT`` clauses will be used to cut the result + of the query. + num_partitions : int, default None + The number of chunks to divide the result of the query into, + when ``partition_col`` is specified. + low_limit : default None + The lower bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the minimum of + the column. + high_limit : default None + The higher bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the maximum of + the column. + + Returns + ------- + DataFrame + + See Also + -------- + read_sql_table : Read SQL database table into a DataFrame. + read_sql + + Notes + ----- + Any datetime values with time zone information parsed via the `parse_dates` + parameter will be converted to UTC. + """ + return _read_sql( + table_or_sql=sql, + con=con, + index_col=index_col, + coerce_float=coerce_float, + params=params, + parse_dates=parse_dates, + engine_kwargs=engine_kwargs, + incremental_index=incremental_index, + use_arrow_dtype=use_arrow_dtype, + chunksize=chunksize, + test_rows=test_rows, + chunk_size=chunk_size, + partition_col=partition_col, + num_partitions=num_partitions, + low_limit=low_limit, + high_limit=high_limit, + ) diff --git a/python/xorbits/_mars/dataframe/datasource/series.py b/python/xorbits/_mars/dataframe/datasource/series.py new file mode 100644 index 000000000..bb06a3a91 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/series.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
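To make the two chunking strategies in read_sql.py above concrete, here is a hedged sketch against a throwaway SQLite database. The file name, table name, and column names are made up for illustration, and executing the resulting DataFrames assumes a running Mars/xorbits session.

import numpy as np
import pandas as pd
import sqlalchemy as sa

from xorbits._mars.dataframe.datasource.read_sql import read_sql_table

uri = "sqlite:///demo.db"  # hypothetical database file
engine = sa.create_engine(uri)
pd.DataFrame({"a": np.arange(100), "b": np.arange(100) * 0.5}).to_sql(
    "demo", engine, index=False, if_exists="replace"
)

# default method="offset": chunk boundaries are cut with LIMIT/OFFSET clauses
df_offset = read_sql_table("demo", uri, chunk_size=30)

# method="partition": the numeric column "a" is split into equal-width ranges
# and each range becomes one chunk selected with WHERE clauses
df_part = read_sql_table("demo", uri, partition_col="a", num_partitions=4)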
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import DataTypeField, SeriesField +from ...tensor.utils import get_chunk_slices +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import decide_series_chunk_size, is_cudf, parse_index + + +class SeriesDataSource(DataFrameOperand, DataFrameOperandMixin): + """ + Represents data from pandas Series + """ + + _op_type_ = OperandDef.SERIES_DATA_SOURCE + + data = SeriesField("data") + dtype = DataTypeField("dtype") + + def __init__(self, data=None, dtype=None, gpu=None, **kw): + if dtype is None and data is not None: + dtype = data.dtype + if gpu is None and is_cudf(data): # pragma: no cover + gpu = True + super().__init__( + data=data, dtype=dtype, gpu=gpu, _output_types=[OutputType.series], **kw + ) + + def __call__(self, shape, chunk_size=None): + return self.new_series( + None, + shape=shape, + dtype=self.dtype, + index_value=parse_index(self.data.index), + name=self.data.name, + raw_chunk_size=chunk_size, + ) + + @classmethod + def tile(cls, op: "SeriesDataSource"): + series = op.outputs[0] + raw_series = op.data + + memory_usage = raw_series.memory_usage(index=False, deep=True) + chunk_size = series.extra_params.raw_chunk_size or options.chunk_size + chunk_size = decide_series_chunk_size(series.shape, chunk_size, memory_usage) + chunk_size_idxes = (range(len(size)) for size in chunk_size) + + out_chunks = [] + for chunk_shape, chunk_idx in zip( + itertools.product(*chunk_size), itertools.product(*chunk_size_idxes) + ): + chunk_op = op.copy().reset_key() + slc = get_chunk_slices(chunk_size, chunk_idx) + if is_cudf(raw_series): # pragma: no cover + chunk_op.data = raw_series.iloc[slc[0]] + else: + chunk_op.data = raw_series.iloc[slc] + chunk_op.dtype = chunk_op.data.dtype + out_chunk = chunk_op.new_chunk( + None, + shape=chunk_shape, + dtype=op.dtype, + index=chunk_idx, + index_value=parse_index(chunk_op.data.index), + name=series.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_seriess( + None, + series.shape, + dtype=op.dtype, + index_value=series.index_value, + name=series.name, + chunks=out_chunks, + nsplits=chunk_size, + ) + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = op.data + + +def from_pandas(data, chunk_size=None, gpu=None, sparse=False): + op = SeriesDataSource(data=data, gpu=gpu, sparse=sparse) + return op(data.shape, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/dataframe/datasource/tests/__init__.py b/python/xorbits/_mars/dataframe/datasource/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
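A small sketch of the Series datasource above, mirroring the unit tests that follow: from_pandas keeps the original pandas data on the operand, and tiling slices it into chunks according to chunk_size (or the configured chunk store limit when no size is given). tile() is used here only to inspect the resulting chunk layout.

import numpy as np
import pandas as pd

from xorbits._mars.core import tile
from xorbits._mars.dataframe.datasource.series import from_pandas

s = pd.Series(np.random.rand(10), name="x")

ms = from_pandas(s, chunk_size=4)  # lazy Mars Series backed by the pandas data
tiled = tile(ms)

# 10 rows with chunk_size=4 -> row chunks of 4, 4 and 2
assert tiled.nsplits == ((4, 4, 2),)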
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py new file mode 100644 index 000000000..c8e181a56 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py @@ -0,0 +1,649 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import string +import tempfile +from collections import OrderedDict + +import numpy as np +import pandas as pd +import pytest + +from .... import tensor as mt +from ....config import option_context +from ....core import tile +from ....tests.core import require_ray +from ....utils import lazy_import +from ...core import DatetimeIndex, Float64Index, IndexValue, Int64Index +from ...utils import ray_deprecate_ml_dataset +from ..core import merge_small_files +from ..dataframe import from_pandas as from_pandas_df +from ..date_range import date_range +from ..from_records import from_records +from ..from_tensor import ( + dataframe_from_1d_tileables, + dataframe_from_tensor, + series_from_tensor, +) +from ..index import from_pandas as from_pandas_index +from ..index import from_tileable +from ..read_csv import DataFrameReadCSV, read_csv +from ..read_raydataset import ( + DataFrameReadMLDataset, + DataFrameReadRayDataset, + read_ray_dataset, + read_ray_mldataset, +) +from ..read_sql import DataFrameReadSQL, read_sql_query, read_sql_table +from ..series import from_pandas as from_pandas_series + +ray = lazy_import("ray") + + +def test_from_pandas_dataframe(): + data = pd.DataFrame( + np.random.rand(10, 10), columns=["c" + str(i) for i in range(10)] + ) + df = from_pandas_df(data, chunk_size=4) + + pd.testing.assert_series_equal(df.op.dtypes, data.dtypes) + assert isinstance(df.index_value._index_value, IndexValue.RangeIndex) + assert df.index_value._index_value._slice == slice(0, 10, 1) + assert df.index_value.is_monotonic_increasing is True + assert df.index_value.is_monotonic_decreasing is False + assert df.index_value.is_unique is True + assert df.index_value.min_val == 0 + assert df.index_value.max_val == 9 + np.testing.assert_equal(df.columns_value._index_value._data, data.columns.values) + + df = tile(df) + + assert len(df.chunks) == 9 + pd.testing.assert_frame_equal(df.chunks[0].op.data, df.op.data.iloc[:4, :4]) + assert df.chunks[0].index_value._index_value._slice == slice(0, 4, 1) + assert df.chunks[0].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[0].index_value._index_value._is_monotonic_decreasing is False + assert 
df.chunks[0].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[1].op.data, df.op.data.iloc[:4, 4:8]) + assert df.chunks[1].index_value._index_value._slice == slice(0, 4, 1) + assert df.chunks[1].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[1].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[1].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[2].op.data, df.op.data.iloc[:4, 8:]) + assert df.chunks[2].index_value._index_value._slice == slice(0, 4, 1) + assert df.chunks[2].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[2].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[2].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[3].op.data, df.op.data.iloc[4:8, :4]) + assert df.chunks[3].index_value._index_value._slice == slice(4, 8, 1) + assert df.chunks[3].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[3].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[3].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[4].op.data, df.op.data.iloc[4:8, 4:8]) + assert df.chunks[4].index_value._index_value._slice == slice(4, 8, 1) + assert df.chunks[4].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[4].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[4].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[5].op.data, df.op.data.iloc[4:8, 8:]) + assert df.chunks[5].index_value._index_value._slice == slice(4, 8, 1) + assert df.chunks[5].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[5].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[5].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[6].op.data, df.op.data.iloc[8:, :4]) + assert df.chunks[6].index_value._index_value._slice == slice(8, 10, 1) + assert df.chunks[6].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[6].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[6].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[7].op.data, df.op.data.iloc[8:, 4:8]) + assert df.chunks[7].index_value._index_value._slice == slice(8, 10, 1) + assert df.chunks[7].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[7].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[7].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[8].op.data, df.op.data.iloc[8:, 8:]) + assert df.chunks[8].index_value._index_value._slice == slice(8, 10, 1) + assert df.chunks[8].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[8].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[8].index_value._index_value._is_unique is True + + data2 = data[::2] + df2 = from_pandas_df(data2, chunk_size=4) + + pd.testing.assert_series_equal(df.op.dtypes, data2.dtypes) + assert isinstance(df2.index_value._index_value, IndexValue.RangeIndex) + assert df2.index_value._index_value._slice == slice(0, 10, 2) + + df2 = tile(df2) + + assert len(df2.chunks) == 6 + pd.testing.assert_frame_equal(df2.chunks[0].op.data, df2.op.data.iloc[:4, :4]) + assert df2.chunks[0].index_value._index_value._slice == 
slice(0, 8, 2) + pd.testing.assert_frame_equal(df2.chunks[1].op.data, df2.op.data.iloc[:4, 4:8]) + assert df2.chunks[1].index_value._index_value._slice == slice(0, 8, 2) + pd.testing.assert_frame_equal(df2.chunks[2].op.data, df2.op.data.iloc[:4, 8:]) + assert df2.chunks[2].index_value._index_value._slice == slice(0, 8, 2) + pd.testing.assert_frame_equal(df2.chunks[3].op.data, df2.op.data.iloc[4:, :4]) + assert df2.chunks[3].index_value._index_value._slice == slice(8, 10, 2) + pd.testing.assert_frame_equal(df2.chunks[4].op.data, df2.op.data.iloc[4:, 4:8]) + assert df2.chunks[3].index_value._index_value._slice == slice(8, 10, 2) + pd.testing.assert_frame_equal(df2.chunks[5].op.data, df2.op.data.iloc[4:, 8:]) + assert df2.chunks[3].index_value._index_value._slice == slice(8, 10, 2) + + raw = pd.DataFrame( + { + "a": [ + string.printable[i : i + 15] + for i in np.random.randint(len(string.printable), size=100) + ], + "b": np.random.rand(100), + } + ) + with option_context({"chunk_store_limit": raw["a"].memory_usage(deep=True) / 10}): + df = tile(from_pandas_df(raw)) + # see GH#2985, empty chunks are wrongly generated + assert len([ns for ns in df.nsplits[1] if ns == 0]) == 0 + + +def test_from_pandas_series(): + data = pd.Series(np.random.rand(10), name="a") + series = from_pandas_series(data, chunk_size=4) + + assert series.name == data.name + assert isinstance(series.index_value._index_value, IndexValue.RangeIndex) + assert series.index_value._index_value._slice == slice(0, 10, 1) + assert series.index_value.is_monotonic_increasing is True + assert series.index_value.is_monotonic_decreasing is False + assert series.index_value.is_unique is True + assert series.index_value.min_val == 0 + assert series.index_value.max_val == 9 + + series = tile(series) + + assert len(series.chunks) == 3 + pd.testing.assert_series_equal(series.chunks[0].op.data, series.op.data.iloc[:4]) + assert series.chunks[0].index_value._index_value._slice == slice(0, 4, 1) + assert series.chunks[0].index_value._index_value._is_monotonic_increasing is True + assert series.chunks[0].index_value._index_value._is_monotonic_decreasing is False + assert series.chunks[0].index_value._index_value._is_unique is True + pd.testing.assert_series_equal(series.chunks[1].op.data, series.op.data.iloc[4:8]) + assert series.chunks[1].index_value._index_value._slice == slice(4, 8, 1) + assert series.chunks[1].index_value._index_value._is_monotonic_increasing is True + assert series.chunks[1].index_value._index_value._is_monotonic_decreasing is False + assert series.chunks[1].index_value._index_value._is_unique is True + pd.testing.assert_series_equal(series.chunks[2].op.data, series.op.data.iloc[8:]) + assert series.chunks[2].index_value._index_value._slice == slice(8, 10, 1) + assert series.chunks[2].index_value._index_value._is_monotonic_increasing is True + assert series.chunks[2].index_value._index_value._is_monotonic_decreasing is False + assert series.chunks[2].index_value._index_value._is_unique is True + + +def test_from_pandas_index(): + data = pd.date_range("2020-1-1", periods=10, name="date") + index = from_pandas_index(data, chunk_size=4) + + assert isinstance(index, DatetimeIndex) + assert index.name == data.name + assert index.dtype == data.dtype + assert isinstance(index.index_value.value, IndexValue.DatetimeIndex) + + index = tile(index) + + for i, c in enumerate(index.chunks): + assert c.name == data.name + pd.testing.assert_index_equal(c.op.data, data[i * 4 : (i + 1) * 4]) + assert c.dtype == data.dtype + assert 
isinstance(c.index_value.value, IndexValue.DatetimeIndex) + + +def test_from_tileable_index(): + t = mt.random.rand(10, 4) + + with pytest.raises(ValueError): + from_tileable(t) + + pd_df = pd.DataFrame( + np.random.rand(10, 4), index=np.arange(10, 0, -1).astype(np.int64) + ) + pd_df.index.name = "ind" + df = from_pandas_df(pd_df, chunk_size=6) + + for o in [df, df[0]]: + index = o.index + assert isinstance(index, Int64Index) + assert index.dtype == np.int64 + assert index.name == pd_df.index.name + assert isinstance(index.index_value.value, IndexValue.Int64Index) + + index = tile(index) + + assert len(index.chunks) == 2 + for c in index.chunks: + assert c.dtype == np.int64 + assert c.name == pd_df.index.name + assert isinstance(c.index_value.value, IndexValue.Int64Index) + + t = mt.random.rand(10, chunk_size=6) + index = from_tileable(t, name="new_name") + + assert isinstance(index, Float64Index) + assert index.dtype == np.float64 + assert index.name == "new_name" + assert isinstance(index.index_value.value, IndexValue.Float64Index) + + index = tile(index) + + assert len(index.chunks) == 2 + for c in index.chunks: + assert c.dtype == np.float64 + assert c.name == "new_name" + assert isinstance(c.index_value.value, IndexValue.Float64Index) + + +def test_from_tensor(): + tensor = mt.random.rand(10, 10, chunk_size=5) + df = dataframe_from_tensor(tensor) + assert isinstance(df.index_value._index_value, IndexValue.RangeIndex) + assert df.dtypes[0] == tensor.dtype + + df = tile(df) + assert len(df.chunks) == 4 + assert isinstance(df.chunks[0].index_value._index_value, IndexValue.RangeIndex) + assert isinstance(df.chunks[0].index_value, IndexValue) + + # test converted from 1-d tensor + tensor2 = mt.array([1, 2, 3]) + # in fact, tensor3 is (3,1) + tensor3 = mt.array([tensor2]).T + + df2 = dataframe_from_tensor(tensor2) + df3 = dataframe_from_tensor(tensor3) + df2 = tile(df2) + df3 = tile(df3) + np.testing.assert_equal(df2.chunks[0].index, (0, 0)) + np.testing.assert_equal(df3.chunks[0].index, (0, 0)) + + # test converted from scalar + scalar = mt.array(1) + np.testing.assert_equal(scalar.ndim, 0) + with pytest.raises(TypeError): + dataframe_from_tensor(scalar) + + # from tensor with given index + df = dataframe_from_tensor(tensor, index=np.arange(0, 20, 2)) + df = tile(df) + pd.testing.assert_index_equal(df.chunks[0].op.index, pd.Index(np.arange(0, 10, 2))) + pd.testing.assert_index_equal(df.chunks[1].op.index, pd.Index(np.arange(0, 10, 2))) + pd.testing.assert_index_equal(df.chunks[2].op.index, pd.Index(np.arange(10, 20, 2))) + pd.testing.assert_index_equal(df.chunks[3].op.index, pd.Index(np.arange(10, 20, 2))) + + # from tensor with index that is a tensor as well + df = dataframe_from_tensor(tensor, index=mt.arange(0, 20, 2)) + df = tile(df) + assert len(df.chunks[0].inputs) == 2 + assert df.chunks[0].index_value.has_value() is False + + # from tensor with given columns + df = dataframe_from_tensor(tensor, columns=list("abcdefghij")) + df = tile(df) + pd.testing.assert_index_equal(df.dtypes.index, pd.Index(list("abcdefghij"))) + pd.testing.assert_index_equal( + df.chunks[0].columns_value.to_pandas(), pd.Index(["a", "b", "c", "d", "e"]) + ) + pd.testing.assert_index_equal( + df.chunks[0].dtypes.index, pd.Index(["a", "b", "c", "d", "e"]) + ) + pd.testing.assert_index_equal( + df.chunks[1].columns_value.to_pandas(), pd.Index(["f", "g", "h", "i", "j"]) + ) + pd.testing.assert_index_equal( + df.chunks[1].dtypes.index, pd.Index(["f", "g", "h", "i", "j"]) + ) + pd.testing.assert_index_equal( + 
df.chunks[2].columns_value.to_pandas(), pd.Index(["a", "b", "c", "d", "e"]) + ) + pd.testing.assert_index_equal( + df.chunks[2].dtypes.index, pd.Index(["a", "b", "c", "d", "e"]) + ) + pd.testing.assert_index_equal( + df.chunks[3].columns_value.to_pandas(), pd.Index(["f", "g", "h", "i", "j"]) + ) + pd.testing.assert_index_equal( + df.chunks[3].dtypes.index, pd.Index(["f", "g", "h", "i", "j"]) + ) + + # test series from tensor + tensor = mt.random.rand(10, chunk_size=4) + series = series_from_tensor(tensor, name="a") + + assert series.dtype == tensor.dtype + assert series.name == "a" + pd.testing.assert_index_equal(series.index_value.to_pandas(), pd.RangeIndex(10)) + + series = tile(series) + assert len(series.chunks) == 3 + pd.testing.assert_index_equal( + series.chunks[0].index_value.to_pandas(), pd.RangeIndex(0, 4) + ) + assert series.chunks[0].name == "a" + pd.testing.assert_index_equal( + series.chunks[1].index_value.to_pandas(), pd.RangeIndex(4, 8) + ) + assert series.chunks[1].name == "a" + pd.testing.assert_index_equal( + series.chunks[2].index_value.to_pandas(), pd.RangeIndex(8, 10) + ) + assert series.chunks[2].name == "a" + + d = OrderedDict( + [(0, mt.tensor(np.random.rand(4))), (1, mt.tensor(np.random.rand(4)))] + ) + df = dataframe_from_1d_tileables(d) + pd.testing.assert_index_equal(df.columns_value.to_pandas(), pd.RangeIndex(2)) + + df = tile(df) + + pd.testing.assert_index_equal( + df.chunks[0].index_value.to_pandas(), pd.RangeIndex(4) + ) + + series = series_from_tensor(mt.random.rand(4)) + pd.testing.assert_index_equal(series.index_value.to_pandas(), pd.RangeIndex(4)) + + series = series_from_tensor(mt.random.rand(4), index=[1, 2, 3]) + pd.testing.assert_index_equal(series.op.index, pd.Index([1, 2, 3])) + + series = series_from_tensor( + mt.random.rand(4), index=pd.Index([1, 2, 3], name="my_index") + ) + pd.testing.assert_index_equal(series.op.index, pd.Index([1, 2, 3], name="my_index")) + assert series.index_value.name == "my_index" + + with pytest.raises(TypeError): + series_from_tensor(mt.ones((10, 10))) + + # index has wrong shape + with pytest.raises(ValueError): + dataframe_from_tensor(mt.random.rand(4, 3), index=mt.random.rand(5)) + + # columns have wrong shape + with pytest.raises(ValueError): + dataframe_from_tensor(mt.random.rand(4, 3), columns=["a", "b"]) + + # index should be 1-d + with pytest.raises(ValueError): + dataframe_from_tensor( + mt.tensor(np.random.rand(3, 2)), index=mt.tensor(np.random.rand(3, 2)) + ) + + # 1-d tensors should have same shape + with pytest.raises(ValueError): + dataframe_from_1d_tileables( + OrderedDict( + [(0, mt.tensor(np.random.rand(3))), (1, mt.tensor(np.random.rand(2)))] + ) + ) + + # index has wrong shape + with pytest.raises(ValueError): + dataframe_from_1d_tileables( + {0: mt.tensor(np.random.rand(3))}, index=mt.tensor(np.random.rand(2)) + ) + + # columns have wrong shape + with pytest.raises(ValueError): + dataframe_from_1d_tileables( + {0: mt.tensor(np.random.rand(3))}, columns=["a", "b"] + ) + + # index should be 1-d + with pytest.raises(ValueError): + series_from_tensor(mt.random.rand(4), index=mt.random.rand(4, 3)) + + +def test_from_records(): + dtype = np.dtype([("x", "int"), ("y", "double"), ("z", " 1 + finally: + shutil.rmtree(tempdir) + + +def test_read_sql(): + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + + with tempfile.TemporaryDirectory() as d: + table_name = "test" + uri = "sqlite:///" + os.path.join(d, "test.db") + + 
test_df.to_sql(table_name, uri, index=False) + + df = read_sql_table(table_name, uri, chunk_size=4) + + assert df.shape == test_df.shape + pd.testing.assert_index_equal(df.index_value.to_pandas(), test_df.index) + pd.testing.assert_series_equal(df.dtypes, test_df.dtypes) + + df = tile(df) + assert df.nsplits == ((4, 4, 2), (2,)) + for c in df.chunks: + assert isinstance(c.op, DataFrameReadSQL) + assert c.op.offset is not None + + with pytest.raises(NotImplementedError): + read_sql_table(table_name, uri, chunksize=4, index_col=b"a") + with pytest.raises(TypeError): + read_sql_table(table_name, uri, chunk_size=4, index_col=b"a") + with pytest.raises(TypeError): + read_sql_query("select * from " + table_name, uri, partition_col="b") + + +@require_ray +def test_read_ray_dataset(ray_start_regular): + test_df1 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + test_df2 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + df = pd.concat([test_df1, test_df2]) + ds = ray.data.from_pandas_refs([ray.put(test_df1), ray.put(test_df2)]) + mdf = read_ray_dataset(ds) + + assert mdf.shape[1] == 2 + pd.testing.assert_index_equal(df.columns, mdf.columns_value.to_pandas()) + pd.testing.assert_series_equal(df.dtypes, mdf.dtypes) + + mdf = tile(mdf) + assert len(mdf.chunks) == 2 + for chunk in mdf.chunks: + assert isinstance(chunk.op, DataFrameReadRayDataset) + + +def test_date_range(): + with pytest.raises(TypeError): + _ = date_range("2020-1-1", periods="2") + + with pytest.raises(ValueError): + _ = date_range("2020-1-1", "2020-1-10", periods=10, freq="D") + + with pytest.raises(ValueError): + _ = date_range(pd.NaT, periods=10) + + expected = pd.date_range("2020-1-1", periods=9.0, name="date") + + dr = date_range("2020-1-1", periods=9.0, name="date", chunk_size=3) + assert isinstance(dr, DatetimeIndex) + assert dr.shape == (9,) + assert dr.dtype == expected.dtype + assert isinstance(dr.index_value.value, IndexValue.DatetimeIndex) + assert dr.index_value.min_val == expected.min() + assert dr.index_value.min_val_close is True + assert dr.index_value.max_val == expected.max() + assert dr.index_value.max_val_close is True + assert dr.index_value.is_unique == expected.is_unique + assert dr.index_value.is_monotonic_increasing == expected.is_monotonic_increasing + assert dr.name == expected.name + + dr = tile(dr) + + for i, c in enumerate(dr.chunks): + ec = expected[i * 3 : (i + 1) * 3] + assert c.shape == (3,) + assert c.dtype == ec.dtype + assert isinstance(c.index_value.value, IndexValue.DatetimeIndex) + assert c.index_value.min_val == ec.min() + assert c.index_value.min_val_close is True + assert c.index_value.max_val == ec.max() + assert c.index_value.max_val_close is True + assert c.index_value.is_unique == ec.is_unique + assert c.index_value.is_monotonic_increasing == ec.is_monotonic_increasing + assert c.name == ec.name + + +@require_ray +@pytest.mark.skipif( + ray_deprecate_ml_dataset in (True, None), + reason="Ray (>=2.0) has deprecated MLDataset.", +) +def test_read_ray_mldataset(ray_start_regular): + test_df1 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + test_df2 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + df = pd.concat([test_df1, test_df2]) + import ray.util.iter + from ray.util.data import from_parallel_iter + + ml_dataset = 
from_parallel_iter( + ray.util.iter.from_items([test_df1, test_df2], num_shards=2), need_convert=False + ) + mdf = read_ray_mldataset(ml_dataset) + + assert mdf.shape[1] == 2 + pd.testing.assert_index_equal(df.columns, mdf.columns_value.to_pandas()) + pd.testing.assert_series_equal(df.dtypes, mdf.dtypes) + + mdf = tile(mdf) + assert len(mdf.chunks) == 2 + for chunk in mdf.chunks: + assert isinstance(chunk.op, DataFrameReadMLDataset) + + +def test_merge_small_files(): + raw = pd.DataFrame(np.random.rand(16, 4)) + df = tile(from_pandas_df(raw, chunk_size=4)) + + chunk_size = 4 * 4 * 8 + # number of chunks < 10 + assert df is merge_small_files(df, n_sample_file=10) + # merged_chunk_size + assert df is merge_small_files( + df, n_sample_file=2, merged_file_size=chunk_size + 0.1 + ) + + df2 = merge_small_files(df, n_sample_file=2, merged_file_size=2 * chunk_size) + assert len(df2.chunks) == 2 + assert df2.chunks[0].shape == (8, 4) + pd.testing.assert_index_equal( + df2.chunks[0].index_value.to_pandas(), pd.RangeIndex(8) + ) + assert df2.chunks[1].shape == (8, 4) + pd.testing.assert_index_equal( + df2.chunks[1].index_value.to_pandas(), pd.RangeIndex(8, 16) + ) + assert df2.nsplits == ((8, 8), (4,)) diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py new file mode 100644 index 000000000..4f8a815de --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py @@ -0,0 +1,1325 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import os +import tempfile +import time +from collections import OrderedDict +from datetime import datetime +from string import printable + +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None +try: + import fastparquet +except ImportError: # pragma: no cover + fastparquet = None +try: + import sqlalchemy +except ImportError: # pragma: no cover + sqlalchemy = None + + +from .... import dataframe as md +from .... 
import tensor as mt +from ....config import option_context +from ....tests.core import require_cudf, require_ray +from ....utils import arrow_array_to_objects, lazy_import, pd_release_version +from ...utils import ray_deprecate_ml_dataset +from ..dataframe import from_pandas as from_pandas_df +from ..from_records import from_records +from ..from_tensor import dataframe_from_1d_tileables, dataframe_from_tensor +from ..index import from_pandas as from_pandas_index +from ..index import from_tileable +from ..series import from_pandas as from_pandas_series + +ray = lazy_import("ray") +_date_range_use_inclusive = pd_release_version[:2] >= (1, 4) + + +def test_from_pandas_dataframe_execution(setup): + # test empty DataFrame + pdf = pd.DataFrame() + df = from_pandas_df(pdf) + + result = df.execute().fetch() + pd.testing.assert_frame_equal(pdf, result) + + pdf = pd.DataFrame(columns=list("ab")) + df = from_pandas_df(pdf) + + result = df.execute().fetch() + pd.testing.assert_frame_equal(pdf, result) + + pdf = pd.DataFrame( + np.random.rand(20, 30), index=[np.arange(20), np.arange(20, 0, -1)] + ) + df = from_pandas_df(pdf, chunk_size=(13, 21)) + + result = df.execute().fetch() + pd.testing.assert_frame_equal(pdf, result) + + +def test_from_pandas_series_execution(setup): + # test empty Series + ps = pd.Series(name="a") + series = from_pandas_series(ps, chunk_size=13) + + result = series.execute().fetch() + pd.testing.assert_series_equal(ps, result) + + series = from_pandas_series(ps) + + result = series.execute().fetch() + pd.testing.assert_series_equal(ps, result) + + ps = pd.Series( + np.random.rand(20), index=[np.arange(20), np.arange(20, 0, -1)], name="a" + ) + series = from_pandas_series(ps, chunk_size=13) + + result = series.execute().fetch() + pd.testing.assert_series_equal(ps, result) + + +def test_from_pandas_index_execution(setup): + pd_index = pd.timedelta_range("1 days", periods=10) + index = from_pandas_index(pd_index, chunk_size=7) + + result = index.execute().fetch() + pd.testing.assert_index_equal(pd_index, result) + + +def test_index_execution(setup): + rs = np.random.RandomState(0) + pdf = pd.DataFrame( + rs.rand(20, 10), + index=np.arange(20, 0, -1), + columns=["a" + str(i) for i in range(10)], + ) + df = from_pandas_df(pdf, chunk_size=13) + + # test df.index + result = df.index.execute().fetch() + pd.testing.assert_index_equal(result, pdf.index) + + result = df.columns.execute().fetch() + pd.testing.assert_index_equal(result, pdf.columns) + + # df has unknown chunk shape on axis 0 + df = df[df.a1 < 0.5] + + # test df.index + result = df.index.execute().fetch() + pd.testing.assert_index_equal(result, pdf[pdf.a1 < 0.5].index) + + s = pd.Series(pdf["a1"], index=pd.RangeIndex(20)) + series = from_pandas_series(s, chunk_size=13) + + # test series.index which has value + result = series.index.execute().fetch() + pd.testing.assert_index_equal(result, s.index) + + s = pdf["a2"] + series = from_pandas_series(s, chunk_size=13) + + # test series.index + result = series.index.execute().fetch() + pd.testing.assert_index_equal(result, s.index) + + # test tensor + raw = rs.random(20) + t = mt.tensor(raw, chunk_size=13) + + result = from_tileable(t).execute().fetch() + pd.testing.assert_index_equal(result, pd.Index(raw)) + + +def test_initializer_execution(setup): + arr = np.random.rand(20, 30) + + pdf = pd.DataFrame(arr, index=[np.arange(20), np.arange(20, 0, -1)]) + df = md.DataFrame(pdf, chunk_size=(15, 10)) + result = df.execute().fetch() + pd.testing.assert_frame_equal(pdf, result) + + df = 
md.DataFrame(arr, index=md.date_range("2020-1-1", periods=20)) + result = df.execute().fetch() + pd.testing.assert_frame_equal( + result, pd.DataFrame(arr, index=pd.date_range("2020-1-1", periods=20)) + ) + + df = md.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, + index=md.date_range("1/1/2010", periods=6, freq="D"), + ) + result = df.execute().fetch() + pd.testing.assert_frame_equal( + result, + pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, + index=pd.date_range("1/1/2010", periods=6, freq="D"), + ), + ) + + s = np.random.rand(20) + + ps = pd.Series(s, index=[np.arange(20), np.arange(20, 0, -1)], name="a") + series = md.Series(ps, chunk_size=7) + result = series.execute().fetch() + pd.testing.assert_series_equal(ps, result) + + series = md.Series(s, index=md.date_range("2020-1-1", periods=20)) + result = series.execute().fetch() + pd.testing.assert_series_equal( + result, pd.Series(s, index=pd.date_range("2020-1-1", periods=20)) + ) + + pi = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + index = md.Index(md.Index(pi)) + result = index.execute().fetch() + pd.testing.assert_index_equal(pi, result) + + +def test_index_only(setup): + df = md.DataFrame(index=[1, 2, 3]) + pd.testing.assert_frame_equal(df.execute().fetch(), pd.DataFrame(index=[1, 2, 3])) + + s = md.Series(index=[1, 2, 3]) + pd.testing.assert_series_equal(s.execute().fetch(), pd.Series(index=[1, 2, 3])) + + df = md.DataFrame(index=md.Index([1, 2, 3])) + pd.testing.assert_frame_equal(df.execute().fetch(), pd.DataFrame(index=[1, 2, 3])) + + s = md.Series(index=md.Index([1, 2, 3]), dtype=object) + pd.testing.assert_series_equal( + s.execute().fetch(), pd.Series(index=[1, 2, 3], dtype=object) + ) + + +def test_series_from_tensor(setup): + data = np.random.rand(10) + series = md.Series(mt.tensor(data), name="a") + pd.testing.assert_series_equal(series.execute().fetch(), pd.Series(data, name="a")) + + series = md.Series(mt.tensor(data, chunk_size=3)) + pd.testing.assert_series_equal(series.execute().fetch(), pd.Series(data)) + + series = md.Series(mt.ones((10,), chunk_size=4)) + pd.testing.assert_series_equal( + series.execute().fetch(), + pd.Series(np.ones(10)), + ) + + index_data = np.random.rand(10) + series = md.Series( + mt.tensor(data, chunk_size=3), + name="a", + index=mt.tensor(index_data, chunk_size=4), + ) + pd.testing.assert_series_equal( + series.execute().fetch(), pd.Series(data, name="a", index=index_data) + ) + + series = md.Series( + mt.tensor(data, chunk_size=3), + name="a", + index=md.date_range("2020-1-1", periods=10), + ) + pd.testing.assert_series_equal( + series.execute().fetch(), + pd.Series(data, name="a", index=pd.date_range("2020-1-1", periods=10)), + ) + + +def test_from_tensor_execution(setup): + tensor = mt.random.rand(10, 10, chunk_size=5) + df = dataframe_from_tensor(tensor) + tensor_res = tensor.execute().fetch() + pdf_expected = pd.DataFrame(tensor_res) + df_result = df.execute().fetch() + pd.testing.assert_index_equal(df_result.index, pd.RangeIndex(0, 10)) + pd.testing.assert_index_equal(df_result.columns, pd.RangeIndex(0, 10)) + pd.testing.assert_frame_equal(df_result, pdf_expected) + + # test from tensor with unknown shape + tensor2 = tensor[tensor[:, 0] < 0.9] + df = dataframe_from_tensor(tensor2) + df_result = df.execute().fetch() + tensor_res = tensor2.execute().fetch() + pdf_expected = pd.DataFrame(tensor_res) + pd.testing.assert_frame_equal(df_result.reset_index(drop=True), pdf_expected) + + # test converted with specified index_value and columns + tensor2 = 
mt.random.rand(2, 2, chunk_size=1) + df2 = dataframe_from_tensor( + tensor2, index=pd.Index(["a", "b"]), columns=pd.Index([3, 4]) + ) + df_result = df2.execute().fetch() + pd.testing.assert_index_equal(df_result.index, pd.Index(["a", "b"])) + pd.testing.assert_index_equal(df_result.columns, pd.Index([3, 4])) + + # test converted from 1-d tensor + tensor3 = mt.array([1, 2, 3]) + df3 = dataframe_from_tensor(tensor3) + result3 = df3.execute().fetch() + pdf_expected = pd.DataFrame(np.array([1, 2, 3])) + pd.testing.assert_frame_equal(pdf_expected, result3) + + # test converted from identical chunks + tensor4 = mt.ones((10, 10), chunk_size=3) + df4 = dataframe_from_tensor(tensor4) + result4 = df4.execute().fetch() + pdf_expected = pd.DataFrame(tensor4.execute().fetch()) + pd.testing.assert_frame_equal(pdf_expected, result4) + + # from tensor with given index + tensor5 = mt.ones((10, 10), chunk_size=3) + df5 = dataframe_from_tensor(tensor5, index=np.arange(0, 20, 2)) + result5 = df5.execute().fetch() + pdf_expected = pd.DataFrame(np.ones((10, 10)), index=np.arange(0, 20, 2)) + pd.testing.assert_frame_equal(pdf_expected, result5) + + # from tensor with given index that is a tensor + raw7 = np.random.rand(10, 10) + tensor7 = mt.tensor(raw7, chunk_size=3) + index_raw7 = np.random.rand(10) + index7 = mt.tensor(index_raw7, chunk_size=4) + df7 = dataframe_from_tensor(tensor7, index=index7) + result7 = df7.execute().fetch() + pdf_expected = pd.DataFrame(raw7, index=index_raw7) + pd.testing.assert_frame_equal(pdf_expected, result7) + + # from tensor with given index is a md.Index + raw10 = np.random.rand(10, 10) + tensor10 = mt.tensor(raw10, chunk_size=3) + index10 = md.date_range("2020-1-1", periods=10, chunk_size=3) + df10 = dataframe_from_tensor(tensor10, index=index10) + result10 = df10.execute().fetch() + pdf_expected = pd.DataFrame(raw10, index=pd.date_range("2020-1-1", periods=10)) + pd.testing.assert_frame_equal(pdf_expected, result10) + + # from tensor with given columns + tensor6 = mt.ones((10, 10), chunk_size=3) + df6 = dataframe_from_tensor(tensor6, columns=list("abcdefghij")) + result6 = df6.execute().fetch() + pdf_expected = pd.DataFrame(tensor6.execute().fetch(), columns=list("abcdefghij")) + pd.testing.assert_frame_equal(pdf_expected, result6) + + # from 1d tensors + raws8 = [ + ("a", np.random.rand(8)), + ("b", np.random.randint(10, size=8)), + ("c", ["".join(np.random.choice(list(printable), size=6)) for _ in range(8)]), + ] + tensors8 = OrderedDict((r[0], mt.tensor(r[1], chunk_size=3)) for r in raws8) + raws8.append(("d", 1)) + raws8.append(("e", pd.date_range("2020-1-1", periods=8))) + tensors8["d"] = 1 + tensors8["e"] = raws8[-1][1] + df8 = dataframe_from_1d_tileables(tensors8, columns=[r[0] for r in raws8]) + result = df8.execute().fetch() + pdf_expected = pd.DataFrame(OrderedDict(raws8)) + pd.testing.assert_frame_equal(result, pdf_expected) + + # from 1d tensors and specify index with a tensor + index_raw9 = np.random.rand(8) + index9 = mt.tensor(index_raw9, chunk_size=4) + df9 = dataframe_from_1d_tileables( + tensors8, columns=[r[0] for r in raws8], index=index9 + ) + result = df9.execute().fetch() + pdf_expected = pd.DataFrame(OrderedDict(raws8), index=index_raw9) + pd.testing.assert_frame_equal(result, pdf_expected) + + # from 1d tensors and specify index + df11 = dataframe_from_1d_tileables( + tensors8, + columns=[r[0] for r in raws8], + index=md.date_range("2020-1-1", periods=8), + ) + result = df11.execute().fetch() + pdf_expected = pd.DataFrame( + OrderedDict(raws8), 
index=pd.date_range("2020-1-1", periods=8) + ) + pd.testing.assert_frame_equal(result, pdf_expected) + + df12 = dataframe_from_1d_tileables({"a": [md.Series([1, 2, 3]).sum() + 1]}) + result = df12.execute().fetch() + pdf_expected = pd.DataFrame({"a": [pd.Series([1, 2, 3]).sum() + 1]}) + pd.testing.assert_frame_equal(result, pdf_expected) + + +def test_from_records_execution(setup): + dtype = np.dtype([("x", "int"), ("y", "double"), ("z", " 0.5", uri, parse_dates=["d"], chunk_size=4 + ) + result = r.execute().fetch() + pd.testing.assert_frame_equal( + result, test_df[test_df.c > 0.5].reset_index(drop=True) + ) + + # test read with sql string and partition method with integer cols + r = md.read_sql( + "select * from test where b > 's5'", + uri, + parse_dates=["d"], + partition_col="a", + num_partitions=3, + ) + result = r.execute().fetch() + pd.testing.assert_frame_equal( + result, test_df[test_df.b > "s5"].reset_index(drop=True) + ) + + # test read with sql string and partition method with datetime cols + r = md.read_sql_query( + "select * from test where b > 's5'", + uri, + parse_dates={"d": "%Y-%m-%d %H:%M:%S"}, + partition_col="d", + num_partitions=3, + ) + result = r.execute().fetch() + pd.testing.assert_frame_equal( + result, test_df[test_df.b > "s5"].reset_index(drop=True) + ) + + # test read with sql string and partition method with datetime cols + r = md.read_sql_query( + "select * from test where b > 's5'", + uri, + parse_dates=["d"], + partition_col="d", + num_partitions=3, + index_col="d", + ) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, test_df[test_df.b > "s5"].set_index("d")) + + # test SQL that return no result + r = md.read_sql_query("select * from test where a > 1000", uri) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(columns=test_df.columns)) + + engine = sa.create_engine(uri) + m = sa.MetaData() + try: + # test index_col and columns + r = md.read_sql_table( + "test", + engine.connect(), + chunk_size=4, + index_col="a", + columns=["b", "d"], + ) + result = r.execute().fetch() + expected = test_df.copy(deep=True) + expected.set_index("a", inplace=True) + del expected["c"] + pd.testing.assert_frame_equal(result, expected) + + # do not specify chunk_size + r = md.read_sql_table( + "test", engine.connect(), index_col="a", columns=["b", "d"] + ) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, expected) + + table = sa.Table(table_name, m, autoload=True, autoload_with=engine) + r = md.read_sql_table( + table, + engine, + chunk_size=4, + index_col=[table.columns["a"], table.columns["b"]], + columns=[table.columns["c"], "d"], + ) + result = r.execute().fetch() + expected = test_df.copy(deep=True) + expected.set_index(["a", "b"], inplace=True) + pd.testing.assert_frame_equal(result, expected) + + # test table with primary key + sa.Table( + table_name2, + m, + sa.Column("id", sa.Integer, primary_key=True), + sa.Column("a", sa.Integer), + sa.Column("b", sa.String), + sa.Column("c", sa.Float), + sa.Column("d", sa.DateTime), + ) + m.create_all(engine) + test_df = test_df.copy(deep=True) + test_df.index.name = "id" + test_df.to_sql(table_name2, uri, if_exists="append") + + r = md.read_sql_table(table_name2, engine, chunk_size=4, index_col="id") + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, test_df) + finally: + engine.dispose() + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_read_sql_use_arrow_dtype(setup): + rs = np.random.RandomState(0) + 
test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": rs.rand(10), + "d": [ + datetime.fromtimestamp(time.time() + 3600 * (i - 5)) for i in range(10) + ], + } + ) + + with tempfile.TemporaryDirectory() as d: + table_name = "test" + uri = "sqlite:///" + os.path.join(d, "test.db") + + test_df.to_sql(table_name, uri, index=False) + + r = md.read_sql_table("test", uri, chunk_size=4, use_arrow_dtype=True) + result = r.execute().fetch() + assert isinstance(r.dtypes.iloc[1], md.ArrowStringDtype) + assert isinstance(result.dtypes.iloc[1], md.ArrowStringDtype) + pd.testing.assert_frame_equal(arrow_array_to_objects(result), test_df) + + # test read with sql string and offset method + r = md.read_sql_query( + "select * from test where c > 0.5", + uri, + parse_dates=["d"], + chunk_size=4, + use_arrow_dtype=True, + ) + result = r.execute().fetch() + assert isinstance(r.dtypes.iloc[1], md.ArrowStringDtype) + assert isinstance(result.dtypes.iloc[1], md.ArrowStringDtype) + pd.testing.assert_frame_equal( + arrow_array_to_objects(result), + test_df[test_df.c > 0.5].reset_index(drop=True), + ) + + +@pytest.mark.pd_compat +def test_date_range_execution(setup): + chunk_sizes = [None, 3] + inclusives = ["both", "neither", "left", "right"] + + if _date_range_use_inclusive: + with pytest.warns(FutureWarning, match="closed"): + md.date_range("2020-1-1", periods=10, closed="right") + + for chunk_size, inclusive in itertools.product(chunk_sizes, inclusives): + kw = dict() + if _date_range_use_inclusive: + kw["inclusive"] = inclusive + else: + if inclusive == "neither": + continue + elif inclusive == "both": + inclusive = None + kw["closed"] = inclusive + + # start, periods, freq + dr = md.date_range("2020-1-1", periods=10, chunk_size=chunk_size, **kw) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", periods=10, **kw) + pd.testing.assert_index_equal(result, expected) + + # end, periods, freq + dr = md.date_range(end="2020-1-10", periods=10, chunk_size=chunk_size, **kw) + + result = dr.execute().fetch() + expected = pd.date_range(end="2020-1-10", periods=10, **kw) + pd.testing.assert_index_equal(result, expected) + + # start, end, freq + dr = md.date_range("2020-1-1", "2020-1-10", chunk_size=chunk_size, **kw) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", "2020-1-10", **kw) + pd.testing.assert_index_equal(result, expected) + + # start, end and periods + dr = md.date_range( + "2020-1-1", "2020-1-10", periods=19, chunk_size=chunk_size, **kw + ) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", "2020-1-10", periods=19, **kw) + pd.testing.assert_index_equal(result, expected) + + # start, end and freq + dr = md.date_range( + "2020-1-1", "2020-1-10", freq="12H", chunk_size=chunk_size, **kw + ) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", "2020-1-10", freq="12H", **kw) + pd.testing.assert_index_equal(result, expected) + + # test timezone + dr = md.date_range("2020-1-1", periods=10, tz="Asia/Shanghai", chunk_size=7) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", periods=10, tz="Asia/Shanghai") + pd.testing.assert_index_equal(result, expected) + + # test periods=0 + dr = md.date_range("2020-1-1", periods=0) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", periods=0) + pd.testing.assert_index_equal(result, expected) + + # test start == end + dr = md.date_range("2020-1-1", "2020-1-1", periods=1) + + result = 
dr.execute().fetch() + expected = pd.date_range("2020-1-1", "2020-1-1", periods=1) + pd.testing.assert_index_equal(result, expected) + + # test normalize=True + dr = md.date_range("2020-1-1", periods=10, normalize=True, chunk_size=4) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", periods=10, normalize=True) + pd.testing.assert_index_equal(result, expected) + + # test freq + dr = md.date_range(start="1/1/2018", periods=5, freq="M", chunk_size=3) + + result = dr.execute().fetch() + expected = pd.date_range(start="1/1/2018", periods=5, freq="M") + pd.testing.assert_index_equal(result, expected) + + dr = md.date_range(start="2018/01/01", end="2018/07/01", freq="M") + result = dr.execute().fetch() + expected = pd.date_range(start="2018/01/01", end="2018/07/01", freq="M") + pd.testing.assert_index_equal(result, expected) + + +parquet_engines = ["auto"] +if pa is not None: + parquet_engines.append("pyarrow") +if fastparquet is not None: + parquet_engines.append("fastparquet") + + +@pytest.mark.skipif( + len(parquet_engines) == 1, reason="pyarrow and fastparquet are not installed" +) +@pytest.mark.parametrize("engine", parquet_engines) +def test_read_parquet_arrow(setup, engine): + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.csv") + test_df.to_parquet(file_path) + + df = md.read_parquet(file_path, engine=engine) + result = df.execute().fetch() + pd.testing.assert_frame_equal(result, test_df) + # size_res = self.executor.execute_dataframe(df, mock=True) + # assert sum(s[0] for s in size_res) > test_df.memory_usage(deep=True).sum() + + if engine != "fastparquet": + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.parquet") + test_df.to_parquet(file_path, row_group_size=3) + + df = md.read_parquet( + file_path, groups_as_chunks=True, columns=["a", "b"], engine=engine + ) + result = df.execute().fetch() + pd.testing.assert_frame_equal( + result.reset_index(drop=True), test_df[["a", "b"]] + ) + + if engine != "fastparquet": + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.parquet") + test_df.to_parquet(file_path, row_group_size=5) + + df = md.read_parquet( + file_path, + groups_as_chunks=True, + use_arrow_dtype=True, + incremental_index=True, + engine=engine, + ) + result = df.execute().fetch() + assert isinstance(df.dtypes.iloc[1], md.ArrowStringDtype) + assert isinstance(result.dtypes.iloc[1], md.ArrowStringDtype) + pd.testing.assert_frame_equal(arrow_array_to_objects(result), test_df) + + # test wildcards in path + for merge_small_file_option in [{"n_sample_file": 1}, None]: + with tempfile.TemporaryDirectory() as tempdir: + df = pd.DataFrame( + { + "a": np.arange(300).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(300)], + "c": np.random.rand(300), + } + ) + + file_paths = [os.path.join(tempdir, f"test{i}.parquet") for i in range(3)] + df[:100].to_parquet(file_paths[0], row_group_size=50) + df[100:200].to_parquet(file_paths[1], row_group_size=30) + df[200:].to_parquet(file_paths[2]) + + mdf = md.read_parquet(f"{tempdir}/*.parquet", engine=engine) + r = mdf.execute().fetch() + pd.testing.assert_frame_equal(df, r.sort_values("a").reset_index(drop=True)) + + mdf = md.read_parquet(f"{tempdir}", engine=engine) + r = mdf.execute().fetch() + pd.testing.assert_frame_equal(df, 
r.sort_values("a").reset_index(drop=True)) + + file_list = [os.path.join(tempdir, name) for name in os.listdir(tempdir)] + mdf = md.read_parquet(file_list, engine=engine) + r = mdf.execute().fetch() + pd.testing.assert_frame_equal(df, r.sort_values("a").reset_index(drop=True)) + + # test `use_arrow_dtype=True` + mdf = md.read_parquet( + f"{tempdir}/*.parquet", engine=engine, use_arrow_dtype=True + ) + result = mdf.execute().fetch() + assert isinstance(mdf.dtypes.iloc[1], md.ArrowStringDtype) + assert isinstance(result.dtypes.iloc[1], md.ArrowStringDtype) + + if engine != "fastparquet": + mdf = md.read_parquet( + f"{tempdir}/*.parquet", + groups_as_chunks=True, + engine=engine, + merge_small_file_options=merge_small_file_option, + ) + r = mdf.execute().fetch() + pd.testing.assert_frame_equal( + df, r.sort_values("a").reset_index(drop=True) + ) + + # test partitioned + with tempfile.TemporaryDirectory() as tempdir: + df = pd.DataFrame( + { + "a": np.random.rand(300), + "b": [f"s{i}" for i in range(300)], + "c": np.random.choice(["a", "b", "c"], (300,)), + } + ) + df.to_parquet(tempdir, partition_cols=["c"]) + mdf = md.read_parquet(tempdir, engine=engine) + r = mdf.execute().fetch().astype(df.dtypes) + pd.testing.assert_frame_equal( + df.sort_values("a").reset_index(drop=True), + r.sort_values("a").reset_index(drop=True), + ) + + +@pytest.mark.skipif(fastparquet is None, reason="fastparquet not installed") +def test_read_parquet_fast_parquet(setup): + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + + # test fastparquet engine + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.csv") + test_df.to_parquet(file_path, compression=None) + + df = md.read_parquet(file_path, engine="fastparquet") + result = df.execute().fetch() + pd.testing.assert_frame_equal(result, test_df) + # size_res = self.executor.execute_dataframe(df, mock=True) + # assert sum(s[0] for s in size_res) > test_df.memory_usage(deep=True).sum() + + +@require_cudf +def test_read_parquet_gpu_execution(setup_gpu): + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.parquet") + + df = pd.DataFrame( + { + "col1": np.random.rand(100), + "col2": np.random.choice(["a", "b", "c"], (100,)), + "col3": np.arange(100), + } + ) + df.to_parquet(file_path, index=False) + + pdf = pd.read_parquet(file_path) + mdf = md.read_parquet(file_path, gpu=True).execute().fetch() + pd.testing.assert_frame_equal( + pdf.reset_index(drop=True), mdf.to_pandas().reset_index(drop=True) + ) + + mdf2 = md.read_parquet(file_path, gpu=True).execute().fetch() + pd.testing.assert_frame_equal( + pdf.reset_index(drop=True), mdf2.to_pandas().reset_index(drop=True) + ) + + mdf3 = md.read_parquet(file_path, gpu=True).head(3).execute().fetch() + pd.testing.assert_frame_equal( + pdf.reset_index(drop=True).head(3), mdf3.to_pandas().reset_index(drop=True) + ) + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.parquet") + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + test_df.to_parquet(file_path, row_group_size=3) + + df = md.read_parquet( + file_path, groups_as_chunks=True, columns=["a", "b"], gpu=True + ) + result = df.execute().fetch().to_pandas() + pd.testing.assert_frame_equal( + result.reset_index(drop=True), test_df[["a", "b"]] + ) + + # test 
partitioned + with tempfile.TemporaryDirectory() as tempdir: + df = pd.DataFrame( + { + "a": np.random.rand(300), + "b": [f"s{i}" for i in range(300)], + "c": np.random.choice(["a", "b", "c"], (300,)), + } + ) + df.to_parquet(tempdir, partition_cols=["c"]) + mdf = md.read_parquet(tempdir, gpu=True) + r = mdf.execute().fetch().to_pandas().astype(df.dtypes) + pd.testing.assert_frame_equal( + df.sort_values("a").reset_index(drop=True), + r.sort_values("a").reset_index(drop=True), + ) + + +@require_ray +@pytest.mark.skip_ray_dag # raydataset is not compatible with Ray DAG +def test_read_raydataset(ray_start_regular, ray_create_mars_cluster): + test_df1 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + test_df2 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + df = pd.concat([test_df1, test_df2]) + ds = ray.data.from_pandas_refs([ray.put(test_df1), ray.put(test_df2)]) + mdf = md.read_ray_dataset(ds) + assert df.equals(mdf.execute().fetch()) + + n = 10000 + pdf = pd.DataFrame({"a": list(range(n)), "b": list(range(n, 2 * n))}) + df = md.DataFrame(pdf) + + # Convert mars dataframe to ray dataset + ds = md.to_ray_dataset(df) + pd.testing.assert_frame_equal(ds.to_pandas(), df.to_pandas()) + ds2 = ds.filter(lambda row: row["a"] % 2 == 0) + assert ds2.take(5) == [{"a": 2 * i, "b": n + 2 * i} for i in range(5)] + + # Convert ray dataset to mars dataframe + df2 = md.read_ray_dataset(ds2) + pd.testing.assert_frame_equal( + df2.head(5).to_pandas(), + pd.DataFrame({"a": list(range(0, 10, 2)), "b": list(range(n, n + 10, 2))}), + ) + + # Test Arrow Dataset + pdf2 = pd.DataFrame({c: range(5) for c in "abc"}) + ds3 = ray.data.from_arrow([pa.Table.from_pandas(pdf2) for _ in range(3)]) + df3 = md.read_ray_dataset(ds3) + pd.testing.assert_frame_equal( + df3.head(5).to_pandas(), + pdf2, + ) + + # Test simple datasets + with pytest.raises(NotImplementedError): + ray.data.range(10).to_mars() + + +@require_ray +@pytest.mark.skipif( + ray_deprecate_ml_dataset in (True, None), + reason="Ray (>=2.0) has deprecated MLDataset.", +) +def test_read_ray_mldataset(ray_start_regular, ray_create_mars_cluster): + test_dfs = [ + pd.DataFrame( + { + "a": np.arange(i * 10, (i + 1) * 10).astype(np.int64, copy=False), + "b": [f"s{j}" for j in range(i * 10, (i + 1) * 10)], + } + ) + for i in range(5) + ] + import ray.util.iter + from ray.util.data import from_parallel_iter + + ml_dataset = from_parallel_iter( + ray.util.iter.from_items(test_dfs, num_shards=4), need_convert=False + ) + dfs = [] + for shard in ml_dataset.shards(): + dfs.extend(list(shard)) + df = pd.concat(dfs).reset_index(drop=True) + mdf = md.read_ray_mldataset(ml_dataset) + pd.testing.assert_frame_equal(df, mdf.execute().fetch()) + pd.testing.assert_frame_equal(df.head(5), mdf.head(5).execute().fetch()) + pd.testing.assert_frame_equal(df.head(15), mdf.head(15).execute().fetch()) diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_hdfs.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_hdfs.py new file mode 100644 index 000000000..f1f3ae5fe --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_hdfs.py @@ -0,0 +1,141 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +from io import BytesIO, StringIO + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from ....tests.core import require_hadoop + +TEST_DIR = "/tmp/test" + + +@require_hadoop +@pytest.fixture(scope="module") +def setup_hdfs(): + import pyarrow + + hdfs = pyarrow.hdfs.connect(host="localhost", port=8020) + if hdfs.exists(TEST_DIR): + hdfs.rm(TEST_DIR, recursive=True) + try: + yield hdfs + finally: + if hdfs.exists(TEST_DIR): + hdfs.rm(TEST_DIR, recursive=True) + + +@require_hadoop +def test_read_csv_execution(setup, setup_hdfs): + hdfs = setup_hdfs + + with hdfs.open(f"{TEST_DIR}/simple_test.csv", "wb", replication=1) as f: + f.write(b"name,amount,id\nAlice,100,1\nBob,200,2") + + df = md.read_csv(f"hdfs://localhost:8020{TEST_DIR}/simple_test.csv") + expected = pd.read_csv(BytesIO(b"name,amount,id\nAlice,100,1\nBob,200,2")) + res = df.to_pandas() + pd.testing.assert_frame_equal(expected, res) + + test_df = pd.DataFrame( + { + "A": np.random.rand(20), + "B": [ + pd.Timestamp("2020-01-01") + pd.Timedelta(days=random.randint(0, 31)) + for _ in range(20) + ], + "C": np.random.rand(20), + "D": np.random.randint(0, 100, size=(20,)), + "E": ["foo" + str(random.randint(0, 999999)) for _ in range(20)], + } + ) + buf = StringIO() + test_df[:10].to_csv(buf) + csv_content = buf.getvalue().encode() + + buf = StringIO() + test_df[10:].to_csv(buf) + csv_content2 = buf.getvalue().encode() + + with hdfs.open(f"{TEST_DIR}/chunk_test.csv", "wb", replication=1) as f: + f.write(csv_content) + + df = md.read_csv(f"hdfs://localhost:8020{TEST_DIR}/chunk_test.csv", chunk_bytes=50) + expected = pd.read_csv(BytesIO(csv_content)) + res = df.to_pandas() + pd.testing.assert_frame_equal( + expected.reset_index(drop=True), res.reset_index(drop=True) + ) + + test_read_dir = f"{TEST_DIR}/test_read_csv_directory" + hdfs.mkdir(test_read_dir) + with hdfs.open(f"{test_read_dir}/part.csv", "wb", replication=1) as f: + f.write(csv_content) + with hdfs.open(f"{test_read_dir}/part2.csv", "wb", replication=1) as f: + f.write(csv_content2) + + df = md.read_csv(f"hdfs://localhost:8020{test_read_dir}", chunk_bytes=50) + expected = pd.concat( + [pd.read_csv(BytesIO(csv_content)), pd.read_csv(BytesIO(csv_content2))] + ) + res = df.to_pandas() + pd.testing.assert_frame_equal( + expected.reset_index(drop=True), res.reset_index(drop=True) + ) + + +@require_hadoop +def test_read_parquet_execution(setup, setup_hdfs): + hdfs = setup_hdfs + + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + test_df2 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + + with hdfs.open(f"{TEST_DIR}/test.parquet", "wb", replication=1) as f: + test_df.to_parquet(f, row_group_size=3) + + df = md.read_parquet(f"hdfs://localhost:8020{TEST_DIR}/test.parquet") + res = df.to_pandas() + pd.testing.assert_frame_equal(res, test_df) + + hdfs.mkdir(f"{TEST_DIR}/test_partitioned") + + with hdfs.open( + 
f"{TEST_DIR}/test_partitioned/file1.parquet", "wb", replication=1 + ) as f: + test_df.to_parquet(f, row_group_size=3) + with hdfs.open( + f"{TEST_DIR}/test_partitioned/file2.parquet", "wb", replication=1 + ) as f: + test_df2.to_parquet(f, row_group_size=3) + + df = md.read_parquet(f"hdfs://localhost:8020{TEST_DIR}/test_partitioned") + res = df.to_pandas() + pd.testing.assert_frame_equal(res, pd.concat([test_df, test_df2])) diff --git a/python/xorbits/_mars/dataframe/datastore/__init__.py b/python/xorbits/_mars/dataframe/datastore/__init__.py new file mode 100644 index 000000000..3931660d4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/__init__.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def _install(): + from ..operands import DATAFRAME_TYPE, SERIES_TYPE + from .to_csv import to_csv + from .to_parquet import to_parquet + from .to_sql import to_sql + from .to_vineyard import to_vineyard + + for cls in DATAFRAME_TYPE: + setattr(cls, "to_csv", to_csv) + setattr(cls, "to_sql", to_sql) + setattr(cls, "to_parquet", to_parquet) + setattr(cls, "to_vineyard", to_vineyard) + + for cls in SERIES_TYPE: + setattr(cls, "to_csv", to_csv) + setattr(cls, "to_sql", to_sql) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/datastore/tests/__init__.py b/python/xorbits/_mars/dataframe/datastore/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/datastore/tests/test_datastore.py b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore.py new file mode 100644 index 000000000..e13685a3a --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + +from ....core import tile +from ... import DataFrame + + +def test_to_csv(): + raw = pd.DataFrame(np.random.rand(10, 5)) + df = DataFrame(raw, chunk_size=4) + + r = df.to_csv("*.csv") + r = tile(r) + + assert r.chunk_shape[1] == 1 + for i, c in enumerate(r.chunks): + assert type(c.op).__name__ == "DataFrameToCSV" + assert c.inputs[0] is r.inputs[0].chunks[i].data + + # test one file + r = df.to_csv("out.csv") + r = tile(r) + + assert r.chunk_shape[1] == 1 + for i, c in enumerate(r.chunks): + assert len(c.inputs) == 2 + assert c.inputs[0].inputs[0] is r.inputs[0].chunks[i].data + assert type(c.inputs[1].op).__name__ == "DataFrameToCSVStat" diff --git a/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_execution.py b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_execution.py new file mode 100644 index 000000000..83dca8737 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_execution.py @@ -0,0 +1,253 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pandas as pd +import pytest + +try: + import vineyard +except ImportError: + vineyard = None +try: + import sqlalchemy +except ImportError: + sqlalchemy = None +try: + import pyarrow as pa +except ImportError: + pa = None +try: + import fastparquet +except ImportError: + fastparquet = None + +from .... import dataframe as md +from ....tests.core import flaky +from ... 
import DataFrame + + +def test_to_csv_execution(setup): + index = pd.RangeIndex(100, 0, -1, name="index") + raw = pd.DataFrame( + { + "col1": np.random.rand(100), + "col2": np.random.choice(["a", "b", "c"], (100,)), + "col3": np.arange(100), + }, + index=index, + ) + df = DataFrame(raw, chunk_size=33) + + with tempfile.TemporaryDirectory() as base_path: + # DATAFRAME TESTS + # test one file with dataframe + path = os.path.join(base_path, "out.csv") + + df.to_csv(path).execute() + + result = pd.read_csv(path, dtype=raw.dtypes.to_dict()) + result.set_index("index", inplace=True) + pd.testing.assert_frame_equal(result, raw) + + # test multi files with dataframe + path = os.path.join(base_path, "out-*.csv") + df.to_csv(path).execute() + + dfs = [ + pd.read_csv( + os.path.join(base_path, f"out-{i}.csv"), dtype=raw.dtypes.to_dict() + ) + for i in range(4) + ] + result = pd.concat(dfs, axis=0) + result.set_index("index", inplace=True) + pd.testing.assert_frame_equal(result, raw) + pd.testing.assert_frame_equal(dfs[1].set_index("index"), raw.iloc[33:66]) + + # test df with unknown shape + df2 = DataFrame(raw, chunk_size=(50, 2)) + df2 = df2[df2["col1"] < 1] + path2 = os.path.join(base_path, "out2.csv") + df2.to_csv(path2).execute() + + result = pd.read_csv(path2, dtype=raw.dtypes.to_dict()) + result.set_index("index", inplace=True) + pd.testing.assert_frame_equal(result, raw) + + # SERIES TESTS + series = md.Series(raw.col1, chunk_size=33) + + # test one file with series + path = os.path.join(base_path, "out.csv") + series.to_csv(path).execute() + + result = pd.read_csv(path, dtype=raw.dtypes.to_dict()) + result.set_index("index", inplace=True) + pd.testing.assert_frame_equal(result, raw.col1.to_frame()) + + # test multi files with series + path = os.path.join(base_path, "out-*.csv") + series.to_csv(path).execute() + + dfs = [ + pd.read_csv( + os.path.join(base_path, f"out-{i}.csv"), dtype=raw.dtypes.to_dict() + ) + for i in range(4) + ] + result = pd.concat(dfs, axis=0) + result.set_index("index", inplace=True) + pd.testing.assert_frame_equal(result, raw.col1.to_frame()) + pd.testing.assert_frame_equal( + dfs[1].set_index("index"), raw.col1.to_frame().iloc[33:66] + ) + + +@pytest.mark.skipif(sqlalchemy is None, reason="sqlalchemy not installed") +def test_to_sql(): + index = pd.RangeIndex(100, 0, -1, name="index") + raw = pd.DataFrame( + { + "col1": np.random.rand(100), + "col2": np.random.choice(["a", "b", "c"], (100,)), + "col3": np.arange(100).astype("int64"), + }, + index=index, + ) + + with tempfile.TemporaryDirectory() as d: + table_name1 = "test_table" + table_name2 = "test_table2" + uri = "sqlite:///" + os.path.join(d, "test.db") + + engine = sqlalchemy.create_engine(uri) + + # test write dataframe + df = DataFrame(raw, chunk_size=33) + df.to_sql(table_name1, con=engine).execute() + + written = pd.read_sql(table_name1, con=engine, index_col="index").sort_index( + ascending=False + ) + pd.testing.assert_frame_equal(raw, written) + + # test write with existing table + with pytest.raises(ValueError): + df.to_sql(table_name1, con=uri).execute() + + # test write series + series = md.Series(raw.col1, chunk_size=33) + with engine.connect() as conn: + series.to_sql(table_name2, con=conn).execute() + + written = pd.read_sql(table_name2, con=engine, index_col="index").sort_index( + ascending=False + ) + pd.testing.assert_frame_equal(raw.col1.to_frame(), written) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +@flaky(max_runs=3) +def test_to_parquet_arrow_execution(setup): + 
raw = pd.DataFrame( + { + "col1": np.random.rand(100), + "col2": np.arange(100), + "col3": np.random.choice(["a", "b", "c"], (100,)), + } + ) + df = DataFrame(raw, chunk_size=33) + + with tempfile.TemporaryDirectory() as base_path: + # DATAFRAME TESTS + path = os.path.join(base_path, "out-*.parquet") + df.to_parquet(path).execute() + + read_df = md.read_parquet(path) + result = read_df.execute().fetch() + result = result.sort_index() + pd.testing.assert_frame_equal(result, raw) + + # test read_parquet then to_parquet + read_df = md.read_parquet(path) + read_df.to_parquet(path).execute() + + # test partition_cols + path = os.path.join(base_path, "out-partitioned") + df.to_parquet(path, partition_cols=["col3"]).execute() + + read_df = md.read_parquet(path) + result = read_df.execute().fetch() + result["col3"] = result["col3"].astype("object") + pd.testing.assert_frame_equal( + result.sort_values("col1").reset_index(drop=True), + raw.sort_values("col1").reset_index(drop=True), + ) + + +@pytest.mark.skipif(fastparquet is None, reason="fastparquet not installed") +def test_to_parquet_fast_parquet_execution(): + raw = pd.DataFrame( + { + "col1": np.random.rand(100), + "col2": np.arange(100), + "col3": np.random.choice(["a", "b", "c"], (100,)), + } + ) + df = DataFrame(raw, chunk_size=33) + + with tempfile.TemporaryDirectory() as base_path: + # test fastparquet + path = os.path.join(base_path, "out-fastparquet-*.parquet") + df.to_parquet(path, engine="fastparquet", compression="gzip").execute() + + +@pytest.mark.skipif(vineyard is None, reason="vineyard not installed") +def test_vineyard_execution(setup): + raw = np.random.RandomState(0).rand(55, 55) + + extra_config = { + "check_dtype": False, + "check_nsplits": False, + "check_shape": False, + "check_dtypes": False, + "check_columns_value": False, + "check_index_value": False, + } + + with vineyard.deploy.local.start_vineyardd() as (_, vineyard_socket, _): + raw = pd.DataFrame({"a": np.arange(0, 55), "b": np.arange(55, 110)}) + a = md.DataFrame(raw, chunk_size=15) + a.execute() # n.b.: pre-execute + + b = a.to_vineyard(vineyard_socket=vineyard_socket) + object_id = b.execute(extra_config=extra_config).fetch()[0][0] + + c = md.from_vineyard(object_id, vineyard_socket=vineyard_socket) + df = c.execute(extra_config=extra_config).fetch() + pd.testing.assert_frame_equal(df, raw) + + raw = pd.DataFrame({"a": np.arange(0, 55), "b": np.arange(55, 110)}) + a = md.DataFrame(raw, chunk_size=15) # n.b.: no pre-execute + + b = a.to_vineyard(vineyard_socket=vineyard_socket) + object_id = b.execute(extra_config=extra_config).fetch()[0][0] + + c = md.from_vineyard(object_id, vineyard_socket=vineyard_socket) + df = c.execute(extra_config=extra_config).fetch() + pd.testing.assert_frame_equal(df, raw) diff --git a/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_hdfs.py b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_hdfs.py new file mode 100644 index 000000000..47e794d2c --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_hdfs.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from ....tests.core import require_hadoop + +TEST_DIR = "/tmp/test" + + +@require_hadoop +@pytest.fixture(scope="module") +def setup_hdfs(): + import pyarrow + + hdfs = pyarrow.hdfs.connect(host="localhost", port=8020) + if hdfs.exists(TEST_DIR): + hdfs.rm(TEST_DIR, recursive=True) + + yield hdfs + + if hdfs.exists(TEST_DIR): + hdfs.rm(TEST_DIR, recursive=True) + + +@require_hadoop +def test_to_parquet_execution(setup, setup_hdfs): + hdfs = setup_hdfs + + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + df = md.DataFrame(test_df, chunk_size=5) + + dir_name = f"hdfs://localhost:8020{TEST_DIR}/test_to_parquet/" + hdfs.mkdir(dir_name) + df.to_parquet(dir_name).execute() + + result = md.read_parquet(dir_name).to_pandas() + pd.testing.assert_frame_equal(result.reset_index(drop=True), test_df) + + # test wildcard + dir_name = f"hdfs://localhost:8020{TEST_DIR}/test_to_parquet2/*.parquet" + hdfs.mkdir(dir_name.rsplit("/", 1)[0]) + df.to_parquet(dir_name).execute() + + result = md.read_parquet(dir_name.rsplit("/", 1)[0]).to_pandas() + pd.testing.assert_frame_equal(result.reset_index(drop=True), test_df) diff --git a/python/xorbits/_mars/dataframe/datastore/to_csv.py b/python/xorbits/_mars/dataframe/datastore/to_csv.py new file mode 100644 index 000000000..3bd4500c2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/to_csv.py @@ -0,0 +1,601 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from io import StringIO + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...lib.filesystem import open_file +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + Int64Field, + KeyField, + ListField, + StringField, +) +from ...tensor.core import TensorOrder +from ...tensor.operands import TensorOperand, TensorOperandMixin +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameToCSV(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.TO_CSV + + _input = KeyField("input") + _path = AnyField("path") + _sep = StringField("sep") + _na_rep = StringField("na_rep") + _float_format = StringField("float_format") + _columns = ListField("columns") + _header = AnyField("header") + _index = BoolField("index") + _index_label = AnyField("index_label") + _mode = StringField("mode") + _encoding = StringField("encoding") + _compression = AnyField("compression") + _quoting = Int32Field("quoting") + _quotechar = StringField("quotechar") + _line_terminator = StringField("line_terminator") + _chunksize = Int64Field("chunksize") + _date_format = StringField("date_format") + _doublequote = BoolField("doublequote") + _escapechar = StringField("escapechar") + _decimal = StringField("decimal") + _storage_options = DictField("storage_options") + # for chunk + _output_stat = BoolField("output_stat") + + def __init__( + self, + path=None, + sep=None, + na_rep=None, + float_format=None, + columns=None, + header=None, + index=None, + index_label=None, + mode=None, + encoding=None, + compression=None, + quoting=None, + quotechar=None, + line_terminator=None, + chunksize=None, + date_format=None, + doublequote=None, + escapechar=None, + decimal=None, + output_stat=None, + storage_options=None, + output_types=None, + **kw + ): + super().__init__( + _path=path, + _sep=sep, + _na_rep=na_rep, + _float_format=float_format, + _columns=columns, + _header=header, + _index=index, + _index_label=index_label, + _mode=mode, + _encoding=encoding, + _compression=compression, + _quoting=quoting, + _quotechar=quotechar, + _line_terminator=line_terminator, + _chunksize=chunksize, + _date_format=date_format, + _doublequote=doublequote, + _escapechar=escapechar, + _decimal=decimal, + _output_stat=output_stat, + _storage_options=storage_options, + _output_types=output_types, + **kw + ) + + @property + def input(self): + return self._input + + @property + def path(self): + return self._path + + @property + def sep(self): + return self._sep + + @property + def na_rep(self): + return self._na_rep + + @property + def float_format(self): + return self._float_format + + @property + def columns(self): + return self._columns + + @property + def header(self): + return self._header + + @property + def index(self): + return self._index + + @property + def index_label(self): + return self._index_label + + @property + def mode(self): + return self._mode + + @property + def encoding(self): + return self._encoding + + @property + def compression(self): + return self._compression + + @property + def quoting(self): + return self._quoting + + @property + def quotechar(self): + return self._quotechar + + @property + def line_terminator(self): + return self._line_terminator + + @property + def chunksize(self): + return self._chunksize + + @property + def date_format(self): + return self._date_format + + @property + def doublequote(self): + return self._doublequote + + @property + def 
escapechar(self): + return self._escapechar + + @property + def decimal(self): + return self._decimal + + @property + def storage_options(self): + return self._storage_options + + @property + def one_file(self): + # if wildcard in path, write csv into multiple files + return "*" not in self._path + + @property + def output_stat(self): + return self._output_stat + + @property + def output_limit(self): + return 1 if not self.output_stat else 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def tile(cls, op: "DataFrameToCSV"): + in_df = op.input + out_df = op.outputs[0] + + if in_df.ndim == 2 and in_df.chunk_shape[1] > 1: + # make sure only 1 chunk on the column axis + in_df = yield from recursive_tile(in_df.rechunk({1: in_df.shape[1]})) + + one_file = op.one_file + + out_chunks = [], [] + for chunk in in_df.chunks: + chunk_op = op.copy().reset_key() + if not one_file: + index_value = parse_index(chunk.index_value.to_pandas()[:0], chunk) + if chunk.ndim == 2: + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(0, 0), + index_value=index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + index=chunk.index, + ) + else: + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(0,), + index_value=index_value, + dtype=out_df.dtype, + index=chunk.index, + ) + out_chunks[0].append(out_chunk) + else: + chunk_op._output_stat = True + chunk_op.stage = OperandStage.map + chunk_op.output_types = [OutputType.scalar] * 2 + # bytes of csv + kws = [ + { + "shape": (), + "dtype": np.dtype(np.str_), + "index": chunk.index, + "order": TensorOrder.C_ORDER, + "output_type": OutputType.scalar, + "type": "csv", + }, + { + "shape": (), + "dtype": np.dtype(np.intp), + "index": chunk.index, + "order": TensorOrder.C_ORDER, + "output_type": OutputType.scalar, + "type": "stat", + }, + ] + chunks = chunk_op.new_chunks([chunk], kws=kws, output_limit=len(kws)) + out_chunks[0].append(chunks[0]) + out_chunks[1].append(chunks[1]) + + if not one_file: + out_chunks = out_chunks[0] + else: + stat_chunk = DataFrameToCSVStat( + path=op.path, + dtype=np.dtype(np.int64), + storage_options=op.storage_options, + ).new_chunk( + out_chunks[1], shape=(len(out_chunks[0]),), order=TensorOrder.C_ORDER + ) + new_out_chunks = [] + for c in out_chunks[0]: + op = DataFrameToCSV( + stage=OperandStage.agg, + path=op.path, + storage_options=op.storage_options, + output_types=op.output_types, + ) + if out_df.ndim == 2: + out_chunk = op.new_chunk( + [c, stat_chunk], + shape=(0, 0), + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + index=c.index, + ) + else: + out_chunk = op.new_chunk( + [c, stat_chunk], + shape=(0,), + dtype=out_df.dtype, + index_value=out_df.index_value, + index=c.index, + ) + new_out_chunks.append(out_chunk) + out_chunks = new_out_chunks + + new_op = op.copy() + params = out_df.params.copy() + if out_df.ndim == 2: + params.update( + dict(chunks=out_chunks, nsplits=((0,) * in_df.chunk_shape[0], (0,))) + ) + else: + params.update( + dict(chunks=out_chunks, nsplits=((0,) * in_df.chunk_shape[0],)) + ) + return new_op.new_tileables([in_df], **params) + + def __call__(self, df): + index_value = parse_index(df.index_value.to_pandas()[:0], df) + if df.ndim == 2: + columns_value = parse_index( + df.columns_value.to_pandas()[:0], store_data=True + ) + return self.new_dataframe( + [df], + shape=(0, 0), + dtypes=df.dtypes[:0], + index_value=index_value, + columns_value=columns_value, + ) + else: + return 
self.new_series( + [df], shape=(0,), dtype=df.dtype, index_value=index_value + ) + + @classmethod + def _to_csv(cls, op, df, path, header=None): + if header is None: + header = op.header + df.to_csv( + path, + sep=op.sep, + na_rep=op.na_rep, + float_format=op.float_format, + columns=op.columns, + header=header, + index=op.index, + index_label=op.index_label, + mode=op.mode, + encoding=op.encoding, + compression=op.compression, + quoting=op.quoting, + quotechar=op.quotechar, + line_terminator=op.line_terminator, + chunksize=op.chunksize, + date_format=op.date_format, + doublequote=op.doublequote, + escapechar=op.escapechar, + decimal=op.decimal, + ) + + @classmethod + def _execute_map(cls, ctx, op): + out = op.outputs[0] + + df = ctx[op.input.key] + sio = StringIO() + header = op.header if out.index[0] == 0 else False + # do not output header if index of chunk > 0 + cls._to_csv(op, df, sio, header=header) + + ret = sio.getvalue().encode(op.encoding or "utf-8") + ctx[op.outputs[0].key] = ret + ctx[op.outputs[1].key] = len(ret) + + @classmethod + def _execute_agg(cls, ctx, op): + out = op.outputs[0] + i = out.index[0] + path = cls._get_path(op.path, i) + + csv_bytes, offsets = [ctx[inp.key] for inp in op.inputs] + offset_start = offsets[i] + + # write csv bytes into file + with open_file(path, mode="r+b", storage_options=op.storage_options) as f: + f.seek(offset_start) + f.write(csv_bytes) + + ctx[out.key] = ( + pd.DataFrame() if out.ndim == 2 else pd.Series([], dtype=out.dtype) + ) + + @classmethod + def _get_path(cls, path, i): + if "*" not in path: + return path + return path.replace("*", str(i)) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.agg: + cls._execute_agg(ctx, op) + else: + assert op.stage is None + df = ctx[op.input.key] + out = op.outputs[0] + path = cls._get_path(op.path, op.outputs[0].index[0]) + with open_file(path, mode="w", storage_options=op.storage_options) as f: + cls._to_csv(op, df, f) + ctx[out.key] = ( + pd.DataFrame() if out.ndim == 2 else pd.Series([], dtype=out.dtype) + ) + + +class DataFrameToCSVStat(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.TO_CSV_STAT + + _path = AnyField("path") + _storage_options = DictField("storage_options") + + def __init__(self, path=None, storage_options=None, dtype=None, **kw): + super().__init__( + _path=path, _storage_options=storage_options, dtype=dtype, **kw + ) + + @property + def path(self): + return self._path + + @property + def storage_options(self): + return self._storage_options + + @classmethod + def execute(cls, ctx, op): + sizes = [ctx[inp.key] for inp in op.inputs] + total_bytes = sum(sizes) + offsets = np.cumsum([0] + sizes)[:-1] + + # write NULL bytes into file + with open_file(op.path, mode="wb", storage_options=op.storage_options) as f: + rest = total_bytes + while rest > 0: + # at most 4M + write_bytes = min(4 * 1024**2, rest) + f.write(b"\00" * write_bytes) + rest -= write_bytes + + ctx[op.outputs[0].key] = offsets + + +def to_csv( + df, + path, + sep=",", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + mode="w", + encoding=None, + compression="infer", + quoting=None, + quotechar='"', + line_terminator=None, + chunksize=None, + date_format=None, + doublequote=True, + escapechar=None, + decimal=".", + storage_options=None, +): + r""" + Write object to a comma-separated values (csv) file. + + Parameters + ---------- + path : str + File path. 
+ If path is a string with wildcard e.g. '/to/path/out-*.csv', + to_csv will try to write multiple files, for instance, + chunk (0, 0) will write data into '/to/path/out-0.csv'. + If path is a string without wildcard, + all data will be written into a single file. + sep : str, default ',' + String of length 1. Field delimiter for the output file. + na_rep : str, default '' + Missing data representation. + float_format : str, default None + Format string for floating point numbers. + columns : sequence, optional + Columns to write. + header : bool or list of str, default True + Write out the column names. If a list of strings is given it is + assumed to be aliases for the column names. + index : bool, default True + Write row names (index). + index_label : str or sequence, or False, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the object uses MultiIndex. If + False do not print fields for index names. Use index_label=False + for easier importing in R. + mode : str + Python write mode, default 'w'. + encoding : str, optional + A string representing the encoding to use in the output file, + defaults to 'utf-8'. + compression : str or dict, default 'infer' + If str, represents compression mode. If dict, value at 'method' is + the compression mode. Compression mode may be any of the following + possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If + compression mode is 'infer' and `path_or_buf` is path-like, then + detect compression mode from the following extensions: '.gz', + '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given + and mode is 'zip' or inferred as 'zip', other entries passed as + additional compression options. + quoting : optional constant from csv module + Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` + then floats are converted to strings and thus csv.QUOTE_NONNUMERIC + will treat them as non-numeric. + quotechar : str, default '\"' + String of length 1. Character used to quote fields. + line_terminator : str, optional + The newline character or character sequence to use in the output + file. Defaults to `os.linesep`, which depends on the OS in which + this method is called ('\n' for linux, '\r\n' for Windows, i.e.). + chunksize : int or None + Rows to write at a time. + date_format : str, default None + Format string for datetime objects. + doublequote : bool, default True + Control quoting of `quotechar` inside a field. + escapechar : str, default None + String of length 1. Character used to escape `sep` and `quotechar` + when appropriate. + decimal : str, default '.' + Character recognized as decimal separator. E.g. use ',' for + European data. + Returns + ------- + None or str + If path_or_buf is None, returns the resulting csv format as a + string. Otherwise returns None. + + See Also + -------- + read_csv : Load a CSV file into a DataFrame. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'name': ['Raphael', 'Donatello'], + ... 'mask': ['red', 'purple'], + ... 
'weapon': ['sai', 'bo staff']}) + >>> df.to_csv('out.csv', index=False).execute() + """ + + if mode != "w": # pragma: no cover + raise NotImplementedError("only support to_csv with mode 'w' for now") + op = DataFrameToCSV( + path=path, + sep=sep, + na_rep=na_rep, + float_format=float_format, + columns=columns, + header=header, + index=index, + index_label=index_label, + mode=mode, + encoding=encoding, + compression=compression, + quoting=quoting, + quotechar=quotechar, + line_terminator=line_terminator, + chunksize=chunksize, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, + decimal=decimal, + storage_options=storage_options, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/datastore/to_parquet.py b/python/xorbits/_mars/dataframe/datastore/to_parquet.py new file mode 100644 index 000000000..e9793d82b --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/to_parquet.py @@ -0,0 +1,282 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...lib.filesystem import get_fs, open_file +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + KeyField, + ListField, + StringField, +) +from ...utils import has_unknown_shape +from ..datasource.read_parquet import check_engine +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +try: + import pyarrow as pa + import pyarrow.parquet as pq +except ImportError: # pragma: no cover + pq = None + pa = None + + +class DataFrameToParquet(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.TO_PARQUET + + _input = KeyField("input") + _path = AnyField("path") + _engine = StringField("engine") + _index = BoolField("index") + _compression = AnyField("compression") + _partition_cols = ListField("partition_cols") + _additional_kwargs = DictField("additional_kwargs") + _storage_options = DictField("storage_options") + + def __init__( + self, + path=None, + engine=None, + index=None, + compression=None, + partition_cols=None, + storage_options=None, + additional_kwargs=None, + **kw, + ): + super().__init__( + _path=path, + _engine=engine, + _index=index, + _compression=compression, + _partition_cols=partition_cols, + _storage_options=storage_options, + _additional_kwargs=additional_kwargs, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def path(self): + return self._path + + @property + def engine(self): + return self._engine + + @property + def index(self): + return self._index + + @property + def compression(self): + return self._compression + + @property + def partition_cols(self): + return self._partition_cols + + @property + def storage_options(self): + return self._storage_options + + @property + def additional_kwargs(self): + return self._additional_kwargs + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + 
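A minimal standalone sketch of the single-file write protocol implemented by the to_csv operands above (illustrative only; the chunk bytes, sizes and output file name are made up). The map stage serializes each chunk to CSV bytes and reports its size, the stat operand pre-allocates the target file and turns the sizes into start offsets with a cumulative sum, and each agg chunk then seeks to its own offset and writes its bytes independently:

    import numpy as np

    # per-chunk CSV bytes produced by the map stage (header only in the first chunk)
    chunk_bytes = [b"a,b\n1,2\n", b"3,4\n", b"5,6\n"]
    sizes = [len(b) for b in chunk_bytes]

    # "stat" step: derive start offsets and pre-allocate the file with NUL bytes
    offsets = np.cumsum([0] + sizes)[:-1]
    with open("out.csv", "wb") as f:
        f.write(b"\x00" * sum(sizes))

    # "agg" step: every chunk seeks to its offset and writes its own bytes
    for data, offset in zip(chunk_bytes, offsets):
        with open("out.csv", "r+b") as f:
            f.seek(int(offset))
            f.write(data)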
@classmethod + def _get_path(cls, path, i): + if "*" not in path: + return path + return path.replace("*", str(i)) + + @classmethod + def tile(cls, op): + in_df = op.input + out_df = op.outputs[0] + + # make sure only 1 chunk on the column axis + if in_df.chunk_shape[1] > 1: + if has_unknown_shape(in_df): + yield + in_df = yield from recursive_tile(in_df.rechunk({1: in_df.shape[1]})) + + out_chunks = [] + for chunk in in_df.chunks: + chunk_op = op.copy().reset_key() + index_value = parse_index(chunk.index_value.to_pandas()[:0], chunk) + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(0, 0), + index_value=index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + index=chunk.index, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_df.params.copy() + params.update( + dict(chunks=out_chunks, nsplits=((0,) * in_df.chunk_shape[0], (0,))) + ) + return new_op.new_tileables([in_df], **params) + + @classmethod + def execute(cls, ctx, op): + df = ctx[op.input.key] + out = op.outputs[0] + i = op.outputs[0].index[0] + path = op.path + has_wildcard = False + if "*" in path: + path = path.replace("*", str(i)) + has_wildcard = True + + if op.partition_cols is None: + if not has_wildcard: + fs = get_fs(path, op.storage_options) + path = fs.pathsep.join([path.rstrip(fs.pathsep), f"{i}.parquet"]) + if op.engine == "fastparquet": + df.to_parquet( + path, + engine=op.engine, + compression=op.compression, + index=op.index, + open_with=open_file, + **op.additional_kwargs, + ) + else: + with open_file( + path, mode="wb", storage_options=op.storage_options + ) as f: + df.to_parquet( + f, + engine=op.engine, + compression=op.compression, + index=op.index, + **op.additional_kwargs or dict(), + ) + else: + if op.engine == "pyarrow": + pq.write_to_dataset( + pa.Table.from_pandas(df), path, partition_cols=op.partition_cols + ) + else: # pragma: no cover + raise NotImplementedError( + "Only support pyarrow engine when specify `partition_cols`." + ) + + ctx[out.key] = pd.DataFrame() + + def __call__(self, df): + index_value = parse_index(df.index_value.to_pandas()[:0], df) + columns_value = parse_index(df.columns_value.to_pandas()[:0], store_data=True) + return self.new_dataframe( + [df], + shape=(0, 0), + dtypes=df.dtypes[:0], + index_value=index_value, + columns_value=columns_value, + ) + + +def to_parquet( + df, + path, + engine="auto", + compression="snappy", + index=None, + partition_cols=None, + storage_options: dict = None, + **kwargs, +): + """ + Write a DataFrame to the binary parquet format, each chunk will be + written to a Parquet file. + + Parameters + ---------- + path : str or file-like object + If path is a string with wildcard e.g. '/to/path/out-*.parquet', + `to_parquet` will try to write multiple files, for instance, + chunk (0, 0) will write data into '/to/path/out-0.parquet'. + If path is a string without wildcard, we will treat it as a directory. + + engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' + Parquet library to use. The default behavior is to try 'pyarrow', + falling back to 'fastparquet' if 'pyarrow' is unavailable. + + compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' + Name of the compression to use. Use ``None`` for no compression. + + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. + If ``False``, they will not be written to the file. + If ``None``, similar to ``True`` the dataframe's index(es) + will be saved. 
However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + + partition_cols : list, optional, default None + Column names by which to partition the dataset. + Columns are partitioned in the order they are given. + Must be None if path is not a string. + + **kwargs + Additional arguments passed to the parquet library. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) + >>> df.to_parquet('*.parquet.gzip', + ... compression='gzip').execute() # doctest: +SKIP + >>> md.read_parquet('*.parquet.gzip').execute() # doctest: +SKIP + col1 col2 + 0 1 3 + 1 2 4 + + >>> import io + >>> f = io.BytesIO() + >>> df.to_parquet(f).execute() + >>> f.seek(0) + 0 + >>> content = f.read() + """ + engine = check_engine(engine) + op = DataFrameToParquet( + path=path, + engine=engine, + compression=compression, + index=index, + partition_cols=partition_cols, + storage_options=storage_options, + additional_kwargs=kwargs, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/datastore/to_sql.py b/python/xorbits/_mars/dataframe/datastore/to_sql.py new file mode 100644 index 000000000..00d4e33aa --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/to_sql.py @@ -0,0 +1,352 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cloudpickle +import pandas as pd + +from ... 
import opcodes +from ...core import recursive_tile +from ...core.operand import OperatorLogicKeyGeneratorMixin +from ...serialization.serializables import ( + AnyField, + BoolField, + BytesField, + Int64Field, + StringField, +) +from ..core import DATAFRAME_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_empty_df, + build_empty_series, + create_sa_connection, + parse_index, +) + + +class DataFrameToSQLTableLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin): + def _get_logic_key_token_values(self): + fields_to_tokenize = [ + getattr(self, k, None) + for k in [ + "table_name", + "schema", + "if_exists", + "index", + "index_label", + "chunksize", + "dtype", + "method", + ] + ] + return super()._get_logic_key_token_values() + fields_to_tokenize + + +class DataFrameToSQLTable( + DataFrameOperand, DataFrameOperandMixin, DataFrameToSQLTableLogicKeyGeneratorMixin +): + _op_type_ = opcodes.TO_SQL + + table_name = StringField("table_name") + con = AnyField("con") + schema = StringField("schema") + if_exists = StringField("if_exists") + index = BoolField("index") + index_label = AnyField("index_label") + chunksize = Int64Field("chunksize") + dtype = AnyField("dtype") + method = AnyField("method") + engine_kwargs = BytesField( + "engine_kwargs", + on_serialize=cloudpickle.dumps, + on_deserialize=cloudpickle.loads, + default=None, + ) + + def __call__(self, df_or_series): + with create_sa_connection(self.con, **(self.engine_kwargs or dict())) as con: + self.con = str(con.engine.url) + empty_index = df_or_series.index_value.to_pandas()[:0] + if isinstance(df_or_series, DATAFRAME_TYPE): + empty_obj = build_empty_df(df_or_series.dtypes, index=empty_index) + else: + empty_obj = build_empty_series( + df_or_series.dtype, index=empty_index, name=df_or_series.name + ) + + empty_obj.to_sql( + self.table_name, + con=con, + schema=self.schema, + if_exists=self.if_exists, + index=self.index, + index_label=self.index_label, + dtype=self.dtype, + ) + + index_value = parse_index( + df_or_series.index_value.to_pandas()[:0], df_or_series.key, "index" + ) + if isinstance(df_or_series, DATAFRAME_TYPE): + columns_value = parse_index( + df_or_series.columns_value.to_pandas()[:0], + df_or_series.key, + "columns", + store_data=True, + ) + return self.new_dataframe( + [df_or_series], + shape=(0, 0), + dtypes=df_or_series.dtypes[:0], + index_value=index_value, + columns_value=columns_value, + ) + else: + return self.new_series( + [df_or_series], + shape=(0,), + dtype=df_or_series.dtype, + index_value=index_value, + ) + + @classmethod + def tile(cls, op: "DataFrameToSQLTable"): + inp = op.inputs[0] + out = op.outputs[0] + if inp.ndim == 2: + inp = yield from recursive_tile(inp.rechunk({1: (inp.shape[1],)})) + + chunks = [] + for c in inp.chunks: + new_op = op.copy().reset_key() + new_op.if_exists = "append" + + index_value = parse_index(c.index_value.to_pandas()[:0], c) + if c.ndim == 2: + columns_value = parse_index( + c.columns_value.to_pandas()[:0], store_data=True + ) + chunks.append( + new_op.new_chunk( + [c], + shape=(0, 0), + index=c.index, + dtypes=out.dtypes, + index_value=index_value, + columns_value=columns_value, + ) + ) + else: + chunks.append( + new_op.new_chunk( + [c], + shape=(0,), + index=c.index, + dtype=out.dtype, + index_value=index_value, + ) + ) + + new_op = op.copy().reset_key() + params = out.params.copy() + params["nsplits"] = tuple((0,) * len(sp) for sp in inp.nsplits) + return new_op.new_tileables([inp], chunks=chunks, **params) + + @classmethod 
+ def execute(cls, ctx, op: "DataFrameToSQLTable"): + in_df = op.inputs[0] + out_df = op.outputs[0] + in_data = ctx[in_df.key] + + import sqlalchemy as sa + + engine = sa.create_engine(op.con, **(op.engine_kwargs or dict())) + + try: + with engine.connect() as connection: + with connection.begin(): + in_data.to_sql( + op.table_name, + con=connection, + if_exists=op.if_exists, + index=op.index, + index_label=op.index_label, + chunksize=op.chunksize, + dtype=op.dtype, + method=op.method, + ) + finally: + engine.dispose() + + if in_df.ndim == 2: + ctx[out_df.key] = pd.DataFrame() + else: + ctx[out_df.key] = pd.Series([], dtype=in_data.dtype) + + +def to_sql( + df, + name: str, + con, + schema=None, + if_exists: str = "fail", + index: bool = True, + index_label=None, + chunksize=None, + dtype=None, + method=None, +): + """ + Write records stored in a DataFrame to a SQL database. + + Databases supported by SQLAlchemy [1]_ are supported. Tables can be + newly created, appended to, or overwritten. + + Parameters + ---------- + name : str + Name of SQL table. + con : sqlalchemy.engine.Engine or sqlite3.Connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. Legacy support is provided for sqlite3.Connection objects. The user + is responsible for engine disposal and connection closure for the SQLAlchemy + connectable See `here `_ + + schema : str, optional + Specify the schema (if database flavor supports this). If None, use + default schema. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + How to behave if the table already exists. + + * fail: Raise a ValueError. + * replace: Drop the table before inserting new values. + * append: Insert new values to the existing table. + + index : bool, default True + Write DataFrame index as a column. Uses `index_label` as the column + name in the table. + index_label : str or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + chunksize : int, optional + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 legacy mode. If a + scalar is provided, it will be applied to all columns. + method : {None, 'multi', callable}, optional + Controls the SQL insertion clause used: + + * None : Uses standard SQL ``INSERT`` clause (one per row). + * 'multi': Pass multiple values in a single ``INSERT`` clause. + * callable with signature ``(pd_table, conn, keys, data_iter)``. + + Details and a sample callable implementation can be found in the + section :ref:`insert method `. + + .. versionadded:: 0.24.0 + + Raises + ------ + ValueError + When the table already exists and `if_exists` is 'fail' (the + default). + + See Also + -------- + read_sql : Read a DataFrame from a table. + + Notes + ----- + Timezone aware datetime columns will be written as + ``Timestamp with timezone`` type with SQLAlchemy if supported by the + database. Otherwise, the datetimes will be stored as timezone unaware + timestamps local to the original timezone. + + .. versionadded:: 0.24.0 + + References + ---------- + .. [1] http://docs.sqlalchemy.org + .. 
[2] https://www.python.org/dev/peps/pep-0249/ + + Examples + -------- + + Create an in-memory SQLite database. + + >>> import mars.dataframe as md + >>> from sqlalchemy import create_engine + >>> engine = create_engine('sqlite:////tmp/temp.db') + + Create a table from scratch with 3 rows. + + >>> df = md.DataFrame({'name' : ['User 1', 'User 2', 'User 3']}) + >>> df.execute() + name + 0 User 1 + 1 User 2 + 2 User 3 + + >>> df.to_sql('users', con=engine).execute() + >>> engine.execute("SELECT * FROM users").fetchall() + [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')] + + >>> df1 = md.DataFrame({'name' : ['User 4', 'User 5']}) + >>> df1.to_sql('users', con=engine, if_exists='append').execute() + >>> engine.execute("SELECT * FROM users").fetchall() + [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'), + (0, 'User 4'), (1, 'User 5')] + + Overwrite the table with just ``df1``. + + >>> df1.to_sql('users', con=engine, if_exists='replace', + ... index_label='id').execute() + >>> engine.execute("SELECT * FROM users").fetchall() + [(0, 'User 4'), (1, 'User 5')] + + Specify the dtype (especially useful for integers with missing values). + Notice that while pandas is forced to store the data as floating point, + the database supports nullable integers. When fetching the data with + Python, we get back integer scalars. + + >>> df = md.DataFrame({"A": [1, None, 2]}) + >>> df.execute() + A + 0 1.0 + 1 NaN + 2 2.0 + + >>> from sqlalchemy.types import Integer + >>> df.to_sql('integers', con=engine, index=False, + ... dtype={"A": Integer()}).execute() + + >>> engine.execute("SELECT * FROM integers").fetchall() + [(1,), (None,), (2,)] + """ + op = DataFrameToSQLTable( + table_name=name, + con=con, + schema=schema, + if_exists=if_exists, + index=index, + index_label=index_label, + chunksize=chunksize, + dtype=dtype, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/datastore/to_vineyard.py b/python/xorbits/_mars/dataframe/datastore/to_vineyard.py new file mode 100644 index 000000000..25236caa1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/to_vineyard.py @@ -0,0 +1,192 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import FieldTypes, StringField, TupleField +from ...tensor.datastore.to_vineyard import resolve_vineyard_socket +from ...utils import lazy_import +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +vineyard = lazy_import("vineyard") +vy_data_df = lazy_import("vineyard.data.dataframe", rename="vy_data_df") +vy_data_utils = lazy_import("vineyard.data.utils", rename="vy_data_utils") + + +class DataFrameToVineyardChunk(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_STORE_VINEYARD_CHUNK + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # a dummy attr to make sure ops have different keys + operator_index = TupleField("operator_index", FieldTypes.int32) + + def __init__(self, vineyard_socket=None, dtypes=None, **kw): + super().__init__( + vineyard_socket=vineyard_socket, + _dtypes=dtypes, + _output_types=[OutputType.dataframe], + **kw + ) + + def __call__(self, df): + return self.new_dataframe( + [df], + shape=(0, 0), + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _process_out_chunks(cls, op, out_chunks): + dtypes = pd.Series([np.dtype("O")], index=pd.Index([0])) + merge_op = DataFrameToVinyardStoreMeta( + vineyard_socket=op.vineyard_socket, + chunk_shape=op.inputs[0].chunk_shape, + shape=(1, 1), + dtypes=dtypes, + ) + return merge_op.new_chunks( + out_chunks, shape=(1, 1), dtypes=dtypes, index=(0, 0) + ) + + @classmethod + def tile(cls, op): + out_chunks = [] + dtypes = pd.Series([np.dtype("O")], index=pd.Index([0])) + for idx, chunk in enumerate(op.inputs[0].chunks): + chunk_op = op.copy().reset_key() + chunk_op.operator_index = chunk.index + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(1, 1), + dtypes=dtypes, + index_value=chunk.index_value, + columns_value=chunk.columns_value, + index=(idx, 0), + ) + out_chunks.append(out_chunk) + out_chunks = cls._process_out_chunks(op, out_chunks) + + in_df = op.inputs[0] + new_op = op.copy().reset_key() + return new_op.new_dataframes( + op.inputs, + shape=(len(out_chunks), 1), + dtypes=dtypes, + index_value=in_df.index_value, + columns_value=in_df.columns_value, + chunks=out_chunks, + nsplits=((np.prod(op.inputs[0].chunk_shape),),), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket, needs_put = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + # some op might be fused and executed twice on different workers + if not needs_put: + # might be fused + try: # pragma: no cover + meta = ctx.get_chunks_meta([op.inputs[0].key])[0] + df_id = vineyard.ObjectID(meta["object_ref"]) + if not client.exists(df_id): + needs_put = True + except KeyError: + needs_put = True + if needs_put: + df_id = client.put( + ctx[op.inputs[0].key], partition_index=op.inputs[0].index + ) + else: # pragma: no cover + meta = client.get_meta(df_id) + new_meta = vineyard.ObjectMeta() + for k, v in meta.items(): + if k not in ["id", "signature", "instance_id"]: + if isinstance(v, vineyard.ObjectMeta): + new_meta.add_member(k, v) + else: + new_meta[k] = v + new_meta["partition_index_"] = vy_data_utils.to_json(op.inputs[0].index) + df_id = client.create_metadata(new_meta).id + + client.persist(df_id) + ctx[op.outputs[0].key] = pd.DataFrame({0: [df_id]}) + + +class DataFrameToVinyardStoreMeta(DataFrameOperand, 
DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_STORE_VINEYARD_META + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + def __init__(self, vineyard_socket=None, dtypes=None, **kw): + super().__init__( + vineyard_socket=vineyard_socket, + dtypes=dtypes, + _output_types=[OutputType.dataframe], + **kw + ) + + @classmethod + def tile(cls, op): + dtypes = pd.Series([np.dtype("O")], index=pd.Index([0])) + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + op.inputs[0].chunks, + shape=(1, 1), + dtypes=dtypes, + index_value=parse_index(pd.Index([-1])), + columns_value=parse_index(pd.Index([0])), + index=(0, 0), + ) + new_op = op.copy().reset_key() + return new_op.new_dataframes( + op.inputs, + shape=(1, 1), + dtypes=dtypes, + index_value=parse_index(pd.Index([0])), + columns_value=parse_index(pd.Index([0])), + chunks=[out_chunk], + nsplits=((1,), (1,)), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket, _ = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + # # store the result object id to execution context + chunks = [ctx[chunk.key][0][0] for chunk in op.inputs] + ctx[op.outputs[0].key] = pd.DataFrame( + {0: [vy_data_df.make_global_dataframe(client, chunks).id]} + ) + + +def to_vineyard(df, vineyard_socket=None): + op = DataFrameToVineyardChunk(vineyard_socket=vineyard_socket) + return op(df) diff --git a/python/xorbits/_mars/dataframe/fetch/__init__.py b/python/xorbits/_mars/dataframe/fetch/__init__.py new file mode 100644 index 000000000..358b547f5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/fetch/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import DataFrameFetch, DataFrameFetchShuffle diff --git a/python/xorbits/_mars/dataframe/fetch/core.py b/python/xorbits/_mars/dataframe/fetch/core.py new file mode 100644 index 000000000..2557abc9e --- /dev/null +++ b/python/xorbits/_mars/dataframe/fetch/core.py @@ -0,0 +1,94 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
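A minimal usage sketch for the vineyard store path above, assuming a vineyard instance is reachable at the given IPC socket (the socket path and data are made up; the result is a one-cell frame whose value is the global vineyard ObjectID of the persisted dataframe):

    import mars.dataframe as md

    df = md.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}, chunk_size=2)
    # store every chunk in vineyard, then fetch the 1x1 frame holding the global ObjectID
    result = df.to_vineyard(vineyard_socket="/tmp/vineyard.sock").execute().fetch()
    object_id = result.iloc[0, 0]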
+ +from ...core import OutputType, register_fetch_class +from ...core.operand import Fetch, FetchMixin, FetchShuffle +from ...serialization.serializables import FieldTypes, TupleField +from ...utils import on_deserialize_shape, on_serialize_shape +from ..operands import DataFrameOperandMixin + + +class DataFrameFetchMixin(DataFrameOperandMixin, FetchMixin): + __slots__ = () + + +class DataFrameFetch(Fetch, DataFrameFetchMixin): + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + def _extract_dataframe_or_series_kws(self, kws, **kw): + if kws is None: + kws = [kw] + new_kws = [] + new_output_types = [] + for output_type, kwargs in zip(self._output_types, kws): + if output_type == OutputType.df_or_series: + data_params = kwargs["data_params"] + data_type = kwargs["data_type"] + if data_type == "series": + new_output_types.append(OutputType.series) + else: + new_output_types.append(OutputType.dataframe) + new_kws.append(data_params) + else: + new_output_types.append(output_type) + new_kws.append(kwargs) + self._output_types = new_output_types + return new_kws + + def _new_chunks(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + if "_shape" in kw and self._shape is None: + self._shape = kw["_shape"] + new_kws = self._extract_dataframe_or_series_kws(kws, **kw) + return super()._new_chunks(inputs, kws=new_kws, **kw) + + def _new_tileables(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + new_kws = self._extract_dataframe_or_series_kws(kws, **kw) + return super()._new_tileables(inputs, kws=new_kws, **kw) + + +class DataFrameFetchShuffle(FetchShuffle, DataFrameFetchMixin): + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + +register_fetch_class(OutputType.dataframe, DataFrameFetch, DataFrameFetchShuffle) +register_fetch_class( + OutputType.dataframe_groupby, DataFrameFetch, DataFrameFetchShuffle +) +register_fetch_class(OutputType.df_or_series, DataFrameFetch, DataFrameFetchShuffle) +register_fetch_class(OutputType.series, DataFrameFetch, DataFrameFetchShuffle) +register_fetch_class(OutputType.series_groupby, DataFrameFetch, DataFrameFetchShuffle) +register_fetch_class(OutputType.index, DataFrameFetch, DataFrameFetchShuffle) +register_fetch_class(OutputType.categorical, DataFrameFetch, DataFrameFetchShuffle) diff --git a/python/xorbits/_mars/dataframe/groupby/__init__.py b/python/xorbits/_mars/dataframe/groupby/__init__.py new file mode 100644 index 000000000..21d22109b --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/__init__.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# noinspection PyUnresolvedReferences +from ..core import DataFrameGroupBy, GroupBy, SeriesGroupBy + + +def _install(): + from ..core import DATAFRAME_GROUPBY_TYPE, DATAFRAME_TYPE, GROUPBY_TYPE, SERIES_TYPE + from .aggregation import agg + from .apply import groupby_apply + from .core import groupby + from .cum import cumcount, cummax, cummin, cumprod, cumsum + from .fill import bfill, ffill, fillna + from .getitem import df_groupby_getitem + from .head import head + + # Just for enabling custom agg function registration. + # Therefore, del this immediately after import. + from .nunique import DataFrameCustomGroupByNuniqueMixin + from .sample import groupby_sample + from .transform import groupby_transform + + del DataFrameCustomGroupByNuniqueMixin + + for cls in DATAFRAME_TYPE: + setattr(cls, "groupby", groupby) + + for cls in SERIES_TYPE: + setattr(cls, "groupby", groupby) + + for cls in GROUPBY_TYPE: + setattr(cls, "agg", agg) + setattr(cls, "aggregate", agg) + + setattr(cls, "sum", lambda groupby, **kw: agg(groupby, "sum", **kw)) + setattr(cls, "prod", lambda groupby, **kw: agg(groupby, "prod", **kw)) + setattr(cls, "max", lambda groupby, **kw: agg(groupby, "max", **kw)) + setattr(cls, "min", lambda groupby, **kw: agg(groupby, "min", **kw)) + setattr(cls, "count", lambda groupby, **kw: agg(groupby, "count", **kw)) + setattr(cls, "size", lambda groupby, **kw: agg(groupby, "size", **kw)) + setattr(cls, "mean", lambda groupby, **kw: agg(groupby, "mean", **kw)) + setattr(cls, "var", lambda groupby, **kw: agg(groupby, "var", **kw)) + setattr(cls, "std", lambda groupby, **kw: agg(groupby, "std", **kw)) + setattr(cls, "all", lambda groupby, **kw: agg(groupby, "all", **kw)) + setattr(cls, "any", lambda groupby, **kw: agg(groupby, "any", **kw)) + setattr(cls, "skew", lambda groupby, **kw: agg(groupby, "skew", **kw)) + setattr(cls, "kurt", lambda groupby, **kw: agg(groupby, "kurt", **kw)) + setattr(cls, "kurtosis", lambda groupby, **kw: agg(groupby, "kurtosis", **kw)) + setattr(cls, "sem", lambda groupby, **kw: agg(groupby, "sem", **kw)) + setattr(cls, "nunique", lambda groupby, **kw: agg(groupby, "nunique", **kw)) + + setattr(cls, "apply", groupby_apply) + setattr(cls, "transform", groupby_transform) + + setattr(cls, "cumcount", cumcount) + setattr(cls, "cummin", cummin) + setattr(cls, "cummax", cummax) + setattr(cls, "cumprod", cumprod) + setattr(cls, "cumsum", cumsum) + + setattr(cls, "head", head) + + setattr(cls, "sample", groupby_sample) + + setattr(cls, "ffill", ffill) + setattr(cls, "bfill", bfill) + setattr(cls, "backfill", bfill) + setattr(cls, "fillna", fillna) + + for cls in DATAFRAME_GROUPBY_TYPE: + setattr(cls, "__getitem__", df_groupby_getitem) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/groupby/aggregation.py b/python/xorbits/_mars/dataframe/groupby/aggregation.py new file mode 100644 index 000000000..fd449d271 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/aggregation.py @@ -0,0 +1,1350 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import itertools +import logging +import uuid +from typing import Callable, Dict, List, Union + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import ENTITY_TYPE, OutputType +from ...core.context import get_context +from ...core.custom_log import redirect_custom_log +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + Int64Field, + ListField, + StringField, +) +from ...typing import ChunkType, TileableType +from ...utils import ( + enter_current_session, + estimate_pandas_size, + lazy_import, + pd_release_version, +) +from ..arrays import ArrowArray +from ..core import GROUPBY_TYPE +from ..merge import DataFrameConcat +from ..operands import DataFrameOperand, DataFrameOperandMixin, DataFrameShuffleProxy +from ..reduction.aggregation import is_funcs_aggregate, normalize_reduction_funcs +from ..reduction.core import ReductionAggStep, ReductionCompiler, ReductionSteps +from ..utils import ( + build_concatenated_rows_frame, + concat_on_columns, + is_cudf, + parse_index, +) +from .core import DataFrameGroupByOperand +from .custom_aggregation import custom_agg_functions +from .sort import ( + DataFrameGroupbyConcatPivot, + DataFrameGroupbySortShuffle, + DataFramePSRSGroupbySample, +) + +cp = lazy_import("cupy", rename="cp") +cudf = lazy_import("cudf") + +logger = logging.getLogger(__name__) +CV_THRESHOLD = 0.2 +MEAN_RATIO_THRESHOLD = 2 / 3 +_support_get_group_without_as_index = pd_release_version[:2] > (1, 0) + + +class SizeRecorder: + def __init__(self): + self._raw_records = [] + self._agg_records = [] + + def record(self, raw_record: int, agg_record: int): + self._raw_records.append(raw_record) + self._agg_records.append(agg_record) + + def get(self): + return self._raw_records, self._agg_records + + +_agg_functions = { + "sum": lambda x: x.sum(), + "prod": lambda x: x.prod(), + "product": lambda x: x.product(), + "min": lambda x: x.min(), + "max": lambda x: x.max(), + "all": lambda x: x.all(), + "any": lambda x: x.any(), + "count": lambda x: x.count(), + "size": lambda x: x._reduction_size(), + "mean": lambda x: x.mean(), + "var": lambda x, ddof=1: x.var(ddof=ddof), + "std": lambda x, ddof=1: x.std(ddof=ddof), + "sem": lambda x, ddof=1: x.sem(ddof=ddof), + "skew": lambda x, bias=False: x.skew(bias=bias), + "kurt": lambda x, bias=False: x.kurt(bias=bias), + "kurtosis": lambda x, bias=False: x.kurtosis(bias=bias), + "nunique": lambda x: x.nunique(), +} +_series_col_name = "col_name" + + +def _patch_groupby_kurt(): + try: + from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy + + if not hasattr(DataFrameGroupBy, "kurt"): # pragma: no branch + + def _kurt_by_frame(a, *args, **kwargs): + data = a.to_frame().kurt(*args, **kwargs).iloc[0] + if is_cudf(data): # pragma: no cover + data = data.copy() + return data + + def _group_kurt(x, *args, **kwargs): + if kwargs.get("numeric_only") is not None: + return x.agg(functools.partial(_kurt_by_frame, *args, **kwargs)) + else: + return 
x.agg(functools.partial(pd.Series.kurt, *args, **kwargs)) + + DataFrameGroupBy.kurt = DataFrameGroupBy.kurtosis = _group_kurt + SeriesGroupBy.kurt = SeriesGroupBy.kurtosis = _group_kurt + except (AttributeError, ImportError): # pragma: no cover + pass + + +_patch_groupby_kurt() +del _patch_groupby_kurt + + +def build_mock_agg_result( + groupby: GROUPBY_TYPE, + groupby_params: Dict, + raw_func: Callable, + **raw_func_kw, +): + try: + agg_result = groupby.op.build_mock_groupby().aggregate(raw_func, **raw_func_kw) + except ValueError: + if ( + groupby_params.get("as_index") or _support_get_group_without_as_index + ): # pragma: no cover + raise + agg_result = ( + groupby.op.build_mock_groupby(as_index=True) + .aggregate(raw_func, **raw_func_kw) + .to_frame() + ) + agg_result.index.names = [None] * agg_result.index.nlevels + return agg_result + + +class DataFrameGroupByAgg(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY_AGG + + raw_func = AnyField("raw_func") + raw_func_kw = DictField("raw_func_kw") + func = AnyField("func") + func_rename = ListField("func_rename") + + raw_groupby_params = DictField("raw_groupby_params") + groupby_params = DictField("groupby_params") + + method = StringField("method") + use_inf_as_na = BoolField("use_inf_as_na") + + # for chunk + combine_size = Int32Field("combine_size") + chunk_store_limit = Int64Field("chunk_store_limit") + pre_funcs = ListField("pre_funcs") + agg_funcs = ListField("agg_funcs") + post_funcs = ListField("post_funcs") + index_levels = Int32Field("index_levels") + size_recorder_name = StringField("size_recorder_name") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs[1:]) + if len(self._inputs) > 1: + by = [] + for v in self.groupby_params["by"]: + if isinstance(v, ENTITY_TYPE): + by.append(next(inputs_iter)) + else: + by.append(v) + self.groupby_params["by"] = by + + def _get_inputs(self, inputs): + if isinstance(self.groupby_params["by"], list): + for v in self.groupby_params["by"]: + if isinstance(v, ENTITY_TYPE): + inputs.append(v) + return inputs + + def _get_index_levels(self, groupby, mock_index): + if not self.groupby_params["as_index"]: + try: + as_index_agg_df = groupby.op.build_mock_groupby( + as_index=True + ).aggregate(self.raw_func, **self.raw_func_kw) + except: # noqa: E722 # nosec # pylint: disable=bare-except + # handling cases like mdf.groupby("b", as_index=False).b.agg({"c": "count"}) + if isinstance(self.groupby_params["by"], list): + return len(self.groupby_params["by"]) + raise # pragma: no cover + pd_index = as_index_agg_df.index + else: + pd_index = mock_index + return 1 if not isinstance(pd_index, pd.MultiIndex) else len(pd_index.levels) + + def _fix_as_index(self, result_index: pd.Index): + # make sure if as_index=False takes effect + if isinstance(result_index, pd.MultiIndex): + # if MultiIndex, as_index=False definitely takes no effect + self.groupby_params["as_index"] = True + elif result_index.name is not None: + # if not MultiIndex and agg_df.index has a name + # means as_index=False takes no effect + self.groupby_params["as_index"] = True + + def _call_dataframe(self, groupby, input_df): + agg_df = build_mock_agg_result( + groupby, self.groupby_params, self.raw_func, **self.raw_func_kw + ) + + shape = (np.nan, agg_df.shape[1]) + if isinstance(agg_df.index, pd.RangeIndex): + index_value = parse_index( + pd.RangeIndex(-1), groupby.key, groupby.index_value.key + ) + else: + index_value = parse_index( + agg_df.index, groupby.key, 
groupby.index_value.key + ) + + # make sure if as_index=False takes effect + self._fix_as_index(agg_df.index) + + # determine num of indices to group in intermediate steps + self.index_levels = self._get_index_levels(groupby, agg_df.index) + + inputs = self._get_inputs([input_df]) + return self.new_dataframe( + inputs, + shape=shape, + dtypes=agg_df.dtypes, + index_value=index_value, + columns_value=parse_index(agg_df.columns, store_data=True), + ) + + def _call_series(self, groupby, in_series): + agg_result = build_mock_agg_result( + groupby, self.groupby_params, self.raw_func, **self.raw_func_kw + ) + + # make sure if as_index=False takes effect + self._fix_as_index(agg_result.index) + + index_value = parse_index( + agg_result.index, groupby.key, groupby.index_value.key + ) + + inputs = self._get_inputs([in_series]) + + # determine num of indices to group in intermediate steps + self.index_levels = self._get_index_levels(groupby, agg_result.index) + + # update value type + if isinstance(agg_result, pd.DataFrame): + return self.new_dataframe( + inputs, + shape=(np.nan, len(agg_result.columns)), + dtypes=agg_result.dtypes, + index_value=index_value, + columns_value=parse_index(agg_result.columns, store_data=True), + ) + else: + return self.new_series( + inputs, + shape=(np.nan,), + dtype=agg_result.dtype, + name=agg_result.name, + index_value=index_value, + ) + + def __call__(self, groupby): + normalize_reduction_funcs(self, ndim=groupby.ndim) + df = groupby + while df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + df = df.inputs[0] + + if self.raw_func == "size": + self.output_types = [OutputType.series] + else: + self.output_types = ( + [OutputType.dataframe] + if groupby.op.output_types[0] == OutputType.dataframe_groupby + else [OutputType.series] + ) + + if self.output_types[0] == OutputType.dataframe: + return self._call_dataframe(groupby, df) + else: + return self._call_series(groupby, df) + + @classmethod + def partition_merge_data( + cls, + op: "DataFrameGroupByAgg", + partition_chunks: List[ChunkType], + proxy_chunk: ChunkType, + ): + # stage 4: all *ith* classes are gathered and merged + partition_sort_chunks = [] + properties = dict(by=op.groupby_params["by"], gpu=op.is_gpu()) + out_df = op.outputs[0] + + for i, partition_chunk in enumerate(partition_chunks): + output_types = ( + [OutputType.dataframe_groupby] + if out_df.ndim == 2 + else [OutputType.series_groupby] + ) + partition_shuffle_reduce = DataFrameGroupbySortShuffle( + stage=OperandStage.reduce, + reducer_index=(i, 0), + n_reducers=len(partition_chunks), + output_types=output_types, + **properties, + ) + chunk_shape = list(partition_chunk.shape) + chunk_shape[0] = np.nan + + kw = dict( + shape=tuple(chunk_shape), + index=partition_chunk.index, + index_value=partition_chunk.index_value, + ) + if op.outputs[0].ndim == 2: + kw.update( + dict( + columns_value=partition_chunk.columns_value, + dtypes=partition_chunk.dtypes, + ) + ) + else: + kw.update(dict(dtype=partition_chunk.dtype, name=partition_chunk.name)) + cs = partition_shuffle_reduce.new_chunks([proxy_chunk], **kw) + partition_sort_chunks.append(cs[0]) + return partition_sort_chunks + + @classmethod + def partition_local_data( + cls, + op: "DataFrameGroupByAgg", + sorted_chunks: List[ChunkType], + concat_pivot_chunk: ChunkType, + in_df: TileableType, + ): + out_df = op.outputs[0] + map_chunks = [] + chunk_shape = (in_df.chunk_shape[0], 1) + for chunk in sorted_chunks: + chunk_inputs = [chunk, concat_pivot_chunk] + output_types = ( + 
[OutputType.dataframe_groupby] + if out_df.ndim == 2 + else [OutputType.series_groupby] + ) + map_chunk_op = DataFrameGroupbySortShuffle( + shuffle_size=chunk_shape[0], + stage=OperandStage.map, + n_partition=len(sorted_chunks), + output_types=output_types, + ) + kw = dict() + if out_df.ndim == 2: + kw.update( + dict( + columns_value=chunk_inputs[0].columns_value, + dtypes=chunk_inputs[0].dtypes, + ) + ) + else: + kw.update(dict(dtype=chunk_inputs[0].dtype, name=chunk_inputs[0].name)) + + map_chunks.append( + map_chunk_op.new_chunk( + chunk_inputs, + shape=chunk_shape, + index=chunk.index, + index_value=chunk_inputs[0].index_value, + # **kw + ) + ) + + return map_chunks + + @classmethod + def _gen_shuffle_chunks_with_pivot( + cls, + op: "DataFrameGroupByAgg", + in_df: TileableType, + chunks: List[ChunkType], + pivot: ChunkType, + ): + map_chunks = cls.partition_local_data(op, chunks, pivot, in_df) + + proxy_chunk = DataFrameShuffleProxy( + output_types=[OutputType.dataframe] + ).new_chunk(map_chunks, shape=()) + + partition_sort_chunks = cls.partition_merge_data(op, map_chunks, proxy_chunk) + + return partition_sort_chunks + + @classmethod + def _gen_shuffle_chunks(cls, op, chunks): + # generate map chunks + map_chunks = [] + chunk_shape = (len(chunks), 1) + for chunk in chunks: + # no longer consider as_index=False for the intermediate phases, + # will do reset_index at last if so + map_op = DataFrameGroupByOperand( + stage=OperandStage.map, + shuffle_size=chunk_shape[0], + output_types=[OutputType.dataframe_groupby], + ) + map_chunks.append( + map_op.new_chunk( + [chunk], + shape=(np.nan, np.nan), + index=chunk.index, + index_value=op.outputs[0].index_value, + ) + ) + + proxy_chunk = DataFrameShuffleProxy( + output_types=[OutputType.dataframe] + ).new_chunk(map_chunks, shape=()) + + # generate reduce chunks + reduce_chunks = [] + out_indices = list(itertools.product(*(range(s) for s in chunk_shape))) + for out_idx in out_indices: + reduce_op = DataFrameGroupByOperand( + stage=OperandStage.reduce, + output_types=[OutputType.dataframe_groupby], + n_reducers=len(out_indices), + ) + reduce_chunks.append( + reduce_op.new_chunk( + [proxy_chunk], + shape=(np.nan, np.nan), + index=out_idx, + index_value=None, + ) + ) + return reduce_chunks + + @classmethod + def _gen_map_chunks( + cls, + op: "DataFrameGroupByAgg", + in_chunks: List[ChunkType], + out_df: TileableType, + func_infos: ReductionSteps, + ): + map_chunks = [] + for chunk in in_chunks: + chunk_inputs = [chunk] + map_op = op.copy().reset_key() + # force as_index=True for map phase + map_op.output_types = op.output_types + map_op.groupby_params = map_op.groupby_params.copy() + map_op.groupby_params["as_index"] = True + if isinstance(map_op.groupby_params["by"], list): + by = [] + for v in map_op.groupby_params["by"]: + if isinstance(v, ENTITY_TYPE): + by_chunk = v.cix[chunk.index[0],] + chunk_inputs.append(by_chunk) + by.append(by_chunk) + else: + by.append(v) + map_op.groupby_params["by"] = by + map_op.stage = OperandStage.map + map_op.pre_funcs = func_infos.pre_funcs + map_op.agg_funcs = func_infos.agg_funcs + new_index = chunk.index if len(chunk.index) == 2 else (chunk.index[0],) + if out_df.ndim == 2: + new_index = (new_index[0], 0) if len(new_index) == 1 else new_index + map_chunk = map_op.new_chunk( + chunk_inputs, + shape=out_df.shape, + index=new_index, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + else: + new_index = new_index[:1] if len(new_index) == 2 else new_index + 
map_chunk = map_op.new_chunk( + chunk_inputs, + shape=(out_df.shape[0],), + index=new_index, + index_value=out_df.index_value, + dtype=out_df.dtype, + ) + map_chunks.append(map_chunk) + return map_chunks + + @classmethod + def _compile_funcs(cls, op: "DataFrameGroupByAgg", in_df) -> ReductionSteps: + compiler = ReductionCompiler(store_source=True) + if isinstance(op.func, list): + func_iter = ((None, f) for f in op.func) + else: + func_iter = ((col, f) for col, funcs in op.func.items() for f in funcs) + + func_renames = ( + op.func_rename + if getattr(op, "func_rename", None) is not None + else itertools.repeat(None) + ) + for func_rename, (col, f) in zip(func_renames, func_iter): + func_name = None + if isinstance(f, str): + f, func_name = _agg_functions[f], f + if func_rename is not None: + func_name = func_rename + + func_cols = None + if col is not None: + func_cols = [col] + compiler.add_function(f, in_df.ndim, cols=func_cols, func_name=func_name) + return compiler.compile() + + @classmethod + def _tile_with_shuffle( + cls, + op: "DataFrameGroupByAgg", + in_df: TileableType, + out_df: TileableType, + func_infos: ReductionSteps, + ): + # First, perform groupby and aggregation on each chunk. + agg_chunks = cls._gen_map_chunks(op, in_df.chunks, out_df, func_infos) + return cls._perform_shuffle(op, agg_chunks, in_df, out_df, func_infos) + + @classmethod + def _gen_pivot_chunk( + cls, + op: "DataFrameGroupByAgg", + sample_chunks: List[ChunkType], + agg_chunk_len: int, + ): + properties = dict( + by=op.groupby_params["by"], + gpu=op.is_gpu(), + ) + + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + kind = "quicksort" + output_types = [OutputType.tensor] + + concat_pivot_op = DataFrameGroupbyConcatPivot( + kind=kind, + n_partition=agg_chunk_len, + output_types=output_types, + **properties, + ) + + concat_pivot_chunk = concat_pivot_op.new_chunk( + sample_chunks, + shape=(agg_chunk_len,), + dtype=np.dtype(object), + ) + return concat_pivot_chunk + + @classmethod + def _sample_chunks( + cls, + op: "DataFrameGroupByAgg", + agg_chunks: List[ChunkType], + ): + chunk_shape = len(agg_chunks) + sampled_chunks = [] + + properties = dict( + by=op.groupby_params["by"], + gpu=op.is_gpu(), + ) + + for i, chunk in enumerate(agg_chunks): + kws = [] + sampled_shape = ( + (chunk_shape, chunk.shape[1]) if chunk.ndim == 2 else (chunk_shape,) + ) + chunk_index = (i, 0) if chunk.ndim == 2 else (i,) + chunk_op = DataFramePSRSGroupbySample( + kind="quicksort", + n_partition=chunk_shape, + output_types=op.output_types, + **properties, + ) + if op.output_types[0] == OutputType.dataframe: + kws.append( + { + "shape": sampled_shape, + "index_value": chunk.index_value, + "index": chunk_index, + "type": "regular_sampled", + } + ) + else: + kws.append( + { + "shape": sampled_shape, + "index_value": chunk.index_value, + "index": chunk_index, + "type": "regular_sampled", + "dtype": chunk.dtype, + } + ) + chunk = chunk_op.new_chunk([chunk], kws=kws) + sampled_chunks.append(chunk) + + return sampled_chunks + + @classmethod + def _perform_shuffle( + cls, + op: "DataFrameGroupByAgg", + agg_chunks: List[ChunkType], + in_df: TileableType, + out_df: TileableType, + func_infos: ReductionSteps, + ): + if op.groupby_params["sort"] and len(in_df.chunks) > 1: + agg_chunk_len = len(agg_chunks) + sample_chunks = cls._sample_chunks(op, agg_chunks) + pivot_chunk = cls._gen_pivot_chunk(op, sample_chunks, agg_chunk_len) + reduce_chunks = cls._gen_shuffle_chunks_with_pivot( + op, in_df, agg_chunks, pivot_chunk + ) + 
else: + reduce_chunks = cls._gen_shuffle_chunks(op, agg_chunks) + + # Combine groups + agg_chunks = [] + for chunk in reduce_chunks: + agg_op = op.copy().reset_key() + agg_op.tileable_op_key = op.key + agg_op.groupby_params = agg_op.groupby_params.copy() + agg_op.groupby_params.pop("selection", None) + # use levels instead of by for reducer + agg_op.groupby_params.pop("by", None) + agg_op.groupby_params["level"] = list(range(op.index_levels)) + agg_op.stage = OperandStage.agg + agg_op.agg_funcs = func_infos.agg_funcs + agg_op.post_funcs = func_infos.post_funcs + if op.output_types[0] == OutputType.dataframe: + agg_chunk = agg_op.new_chunk( + [chunk], + shape=out_df.shape, + index=chunk.index, + index_value=out_df.index_value, + dtypes=out_df.dtypes, + columns_value=out_df.columns_value, + ) + else: + agg_chunk = agg_op.new_chunk( + [chunk], + shape=out_df.shape, + index=(chunk.index[0],), + dtype=out_df.dtype, + index_value=out_df.index_value, + name=out_df.name, + ) + agg_chunks.append(agg_chunk) + + new_op = op.copy() + if op.output_types[0] == OutputType.dataframe: + nsplits = ((np.nan,) * len(agg_chunks), (out_df.shape[1],)) + else: + nsplits = ((np.nan,) * len(agg_chunks),) + kw = out_df.params.copy() + kw.update(dict(chunks=agg_chunks, nsplits=nsplits)) + return new_op.new_tileables([in_df], **kw) + + @classmethod + def _tile_with_tree( + cls, + op: "DataFrameGroupByAgg", + in_df: TileableType, + out_df: TileableType, + func_infos: ReductionSteps, + ): + chunks = cls._gen_map_chunks(op, in_df.chunks, out_df, func_infos) + return cls._combine_tree(op, chunks, out_df, func_infos) + + @classmethod + def _build_tree_chunks( + cls, + op: "DataFrameGroupByAgg", + chunks: List[ChunkType], + func_infos: ReductionSteps, + combine_size: int, + input_chunk_size: float = None, + chunk_store_limit: int = None, + ): + out_df = op.outputs[0] + # if concat chunk's size is greater than chunk_store_limit, + # stop combining them. 
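        # Descriptive note (not in the original patch): the loop below repeatedly groups
        # `combine_size` pre-aggregated chunks, concatenates each group and re-aggregates
        # it with a combine-stage copy of the operand. `concat_chunk_size` estimates the
        # size of one combined chunk and is multiplied by `combine_size` per tree level,
        # so combining stops once a further concat would exceed `chunk_store_limit`
        # (when a limit is given) or once at most `combine_size` chunks remain.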
+ check_size = False + if chunk_store_limit is not None: + assert input_chunk_size is not None + check_size = True + concat_chunk_size = input_chunk_size + while (not check_size or concat_chunk_size < chunk_store_limit) and ( + len(chunks) > combine_size + ): + new_chunks = [] + for idx, i in enumerate(range(0, len(chunks), combine_size)): + chks = chunks[i : i + combine_size] + if len(chks) == 1: + chk = chks[0] + else: + concat_op = DataFrameConcat(output_types=out_df.op.output_types) + # Change index for concatenate + for j, c in enumerate(chks): + c._index = (j, 0) + if out_df.ndim == 2: + chk = concat_op.new_chunk(chks, dtypes=chks[0].dtypes) + else: + chk = concat_op.new_chunk(chks, dtype=chunks[0].dtype) + chunk_op = op.copy().reset_key() + chunk_op.tileable_op_key = None + chunk_op.output_types = out_df.op.output_types + chunk_op.stage = OperandStage.combine + chunk_op.groupby_params = chunk_op.groupby_params.copy() + chunk_op.groupby_params.pop("selection", None) + # use levels instead of by for agg + chunk_op.groupby_params.pop("by", None) + chunk_op.groupby_params["level"] = list(range(op.index_levels)) + chunk_op.agg_funcs = func_infos.agg_funcs + + new_shape = ( + (np.nan, out_df.shape[1]) if len(out_df.shape) == 2 else (np.nan,) + ) + + new_chunks.append( + chunk_op.new_chunk( + [chk], + index=(idx, 0), + shape=new_shape, + index_value=chks[0].index_value, + columns_value=getattr(out_df, "columns_value", None), + ) + ) + chunks = new_chunks + if concat_chunk_size is not None: + concat_chunk_size *= combine_size + if concat_chunk_size: + return chunks, concat_chunk_size + else: + return chunks + + @classmethod + def _build_out_tileable( + cls, + op: "DataFrameGroupByAgg", + out_df: TileableType, + combined_chunks: List[ChunkType], + func_infos: ReductionSteps, + ): + if len(combined_chunks) == 1: + chk = combined_chunks[0] + else: + concat_op = DataFrameConcat(output_types=out_df.op.output_types) + if out_df.ndim == 2: + chk = concat_op.new_chunk( + combined_chunks, dtypes=combined_chunks[0].dtypes + ) + else: + chk = concat_op.new_chunk( + combined_chunks, dtype=combined_chunks[0].dtype + ) + chunk_op = op.copy().reset_key() + chunk_op.tileable_op_key = op.key + chunk_op.stage = OperandStage.agg + chunk_op.groupby_params = chunk_op.groupby_params.copy() + chunk_op.groupby_params.pop("selection", None) + # use levels instead of by for agg + chunk_op.groupby_params.pop("by", None) + chunk_op.groupby_params["level"] = list(range(op.index_levels)) + chunk_op.agg_funcs = func_infos.agg_funcs + chunk_op.post_funcs = func_infos.post_funcs + kw = out_df.params.copy() + kw["index"] = (0, 0) if op.output_types[0] == OutputType.dataframe else (0,) + chunk = chunk_op.new_chunk([chk], **kw) + new_op = op.copy() + if op.output_types[0] == OutputType.dataframe: + nsplits = ((out_df.shape[0],), (out_df.shape[1],)) + else: + nsplits = ((out_df.shape[0],),) + + kw = out_df.params.copy() + kw.update(dict(chunks=[chunk], nsplits=nsplits)) + return new_op.new_tileables(op.inputs, **kw) + + @classmethod + def _combine_tree( + cls, + op: "DataFrameGroupByAgg", + chunks: List[ChunkType], + out_df: TileableType, + func_infos: ReductionSteps, + ): + combine_size = op.combine_size + chunks = cls._build_tree_chunks(op, chunks, func_infos, combine_size) + return cls._build_out_tileable(op, out_df, chunks, func_infos) + + @classmethod + def _build_tree_and_shuffle_chunks( + cls, + op: "DataFrameGroupByAgg", + in_df: TileableType, + out_df: TileableType, + func_infos: ReductionSteps, + sample_map_chunks: 
List[ChunkType], + sample_agg_sizes: List[int], + ): + combine_size = op.combine_size + left_chunks = cls._gen_map_chunks( + op, in_df.chunks[combine_size:], out_df, func_infos + ) + input_size = sum(sample_agg_sizes) / len(sample_agg_sizes) + combine_chunk_limit = op.chunk_store_limit / 4 + combined_chunks, concat_size = cls._build_tree_chunks( + op, + sample_map_chunks + left_chunks, + func_infos, + combine_size, + input_size, + combine_chunk_limit, + ) + logger.debug( + "Combine map chunks to %s chunks for groupby operand %s", + len(combined_chunks), + op, + ) + if concat_size <= combine_chunk_limit: + logger.debug( + "Choose tree method after combining chunks for groupby operand %s", op + ) + return cls._build_out_tileable(op, out_df, combined_chunks, func_infos) + else: + logger.debug( + "Choose shuffle method after combining chunks for " + "groupby operand %s, chunk count is %s", + op, + len(combined_chunks), + ) + return cls._perform_shuffle( + op, + combined_chunks, + in_df, + out_df, + func_infos, + ) + + @classmethod + def _tile_auto( + cls, + op: "DataFrameGroupByAgg", + in_df: TileableType, + out_df: TileableType, + func_infos: ReductionSteps, + ): + ctx = get_context() + combine_size = op.combine_size + size_recorder_name = str(uuid.uuid4()) + size_recorder = ctx.create_remote_object(size_recorder_name, SizeRecorder) + + # collect the first combine_size chunks, run it + # to get the size before and after agg + chunks = cls._gen_map_chunks( + op, in_df.chunks[:combine_size], out_df, func_infos + ) + for chunk in chunks: + chunk.op.size_recorder_name = size_recorder_name + # yield to trigger execution + yield chunks + + raw_sizes, agg_sizes = size_recorder.get() + # destroy size recorder + ctx.destroy_remote_object(size_recorder_name) + + logger.debug( + "Start to choose method for Groupby, agg_sizes: %s, raw_sizes: %s, " + "sample_count: %s, total_count: %s, chunk_store_limit: %s", + agg_sizes, + raw_sizes, + len(agg_sizes), + len(in_df.chunks), + op.chunk_store_limit, + ) + + return cls._build_tree_and_shuffle_chunks( + op, in_df, out_df, func_infos, chunks, agg_sizes + ) + + @classmethod + def tile(cls, op: "DataFrameGroupByAgg"): + in_df = op.inputs[0] + if len(in_df.shape) > 1: + in_df = build_concatenated_rows_frame(in_df) + out_df = op.outputs[0] + + func_infos = cls._compile_funcs(op, in_df) + + if op.method == "auto": + logger.debug("Choose auto method for groupby operand %s", op) + if len(in_df.chunks) <= op.combine_size: + return cls._tile_with_tree(op, in_df, out_df, func_infos) + else: + return (yield from cls._tile_auto(op, in_df, out_df, func_infos)) + if op.method == "shuffle": + logger.debug("Choose shuffle method for groupby operand %s", op) + return cls._tile_with_shuffle(op, in_df, out_df, func_infos) + elif op.method == "tree": + logger.debug("Choose tree method for groupby operand %s", op) + return cls._tile_with_tree(op, in_df, out_df, func_infos) + else: # pragma: no cover + raise NotImplementedError + + @classmethod + def _get_grouped(cls, op: "DataFrameGroupByAgg", df, ctx, copy=False, grouper=None): + if copy: + df = df.copy() + + params = op.groupby_params.copy() + params.pop("as_index", None) + selection = params.pop("selection", None) + + if grouper is not None: + params["by"] = grouper + params.pop("level", None) + elif isinstance(params.get("by"), list): + new_by = [] + for v in params["by"]: + if isinstance(v, ENTITY_TYPE): + new_by.append(ctx[v.key]) + else: + new_by.append(v) + params["by"] = new_by + + try: + grouped = df.groupby(**params) + 
except ValueError: # pragma: no cover + if isinstance(df.index.values, ArrowArray): + df = df.copy() + df.index = pd.Index(df.index.to_numpy(), name=df.index.name) + grouped = df.groupby(**params) + else: + raise + + if selection is not None: + grouped = grouped[selection] + return grouped + + @staticmethod + def _pack_inputs(agg_funcs: List[ReductionAggStep], in_data): + pos = 0 + out_dict = dict() + for step in agg_funcs: + if step.custom_reduction is None: + out_dict[step.output_key] = in_data[pos] + else: + out_dict[step.output_key] = tuple( + in_data[pos : pos + step.output_limit] + ) + pos += step.output_limit + return out_dict + + @staticmethod + def _do_custom_agg( + func_name: str, op: "DataFrameGroupByAgg", in_data: pd.DataFrame + ) -> Union[pd.Series, pd.DataFrame]: + if op.stage == OperandStage.map: + return custom_agg_functions[func_name].execute_map(op, in_data) + elif op.stage == OperandStage.combine: + return custom_agg_functions[func_name].execute_combine(op, in_data) + else: # must be OperandStage.agg, since OperandStage.reduce has been excluded in the execute function. + return custom_agg_functions[func_name].execute_agg(op, in_data) + + @staticmethod + def _do_predefined_agg(input_obj, agg_func, single_func=False, **kwds): + ndim = getattr(input_obj, "ndim", None) or input_obj.obj.ndim + if agg_func == "str_concat": + agg_func = lambda x: x.str.cat(**kwds) + elif isinstance(agg_func, str) and not kwds.get("skipna", True): + func_name = agg_func + agg_func = lambda x: getattr(x, func_name)(skipna=False) + agg_func.__name__ = func_name + + if ndim == 2: + if single_func: + result = input_obj.agg(agg_func) + if result.ndim == 1: + # when agg_func == size, agg only returns one single series. + result = result.to_frame(agg_func) + else: + result = input_obj.agg([agg_func]) + result.columns = result.columns.droplevel(-1) + return result + else: + return input_obj.agg(agg_func) + + @staticmethod + def _series_to_df(in_series, gpu): + xdf = cudf if gpu else pd + + in_df = in_series.to_frame() + if in_series.name is not None: + in_df.columns = xdf.Index([in_series.name]) + return in_df + + @classmethod + def _execute_map(cls, ctx, op: "DataFrameGroupByAgg"): + xdf = cudf if op.gpu else pd + + in_data = ctx[op.inputs[0].key] + if ( + isinstance(in_data, xdf.Series) + and op.output_types[0] == OutputType.dataframe + ): + in_data = cls._series_to_df(in_data, op.gpu) + + # map according to map groups + ret_map_groupbys = dict() + grouped = cls._get_grouped(op, in_data, ctx) + grouper = None + drop_names = False + + for input_key, output_key, cols, func in op.pre_funcs: + if input_key == output_key: + if cols is None or getattr(grouped, "_selection", None) is not None: + ret_map_groupbys[output_key] = grouped + else: + ret_map_groupbys[output_key] = grouped[cols] + else: + + def _wrapped_func(col): + try: + return func(col, gpu=op.is_gpu()) + except TypeError: + return col + + pre_df = in_data if cols is None else in_data[cols] + try: + pre_df = func(pre_df, gpu=op.is_gpu()) + except TypeError: + pre_df = pre_df.transform(_wrapped_func) + + if grouper is None: + try: + grouper = grouped.grouper + except AttributeError: # cudf does not have GroupBy.grouper + grouper = xdf.Series( + grouped.grouping.keys, index=grouped.obj.index + ) + if in_data.ndim == 2: + drop_names = True + + if drop_names: + pre_df = pre_df.drop( + columns=grouped.grouping.names, errors="ignore" + ) + ret_map_groupbys[output_key] = cls._get_grouped( + op, pre_df, ctx, grouper=grouper + ) + + agg_dfs = [] + for ( + 
input_key, + raw_func_name, + map_func_name, + _agg_func_name, + custom_reduction, + _output_key, + _output_limit, + kwds, + ) in op.agg_funcs: + input_obj = ret_map_groupbys[input_key] + if map_func_name == "custom_reduction": + agg_dfs.append(cls._do_custom_agg(raw_func_name, op, in_data)) + else: + single_func = map_func_name == op.raw_func + agg_dfs.append( + cls._do_predefined_agg( + input_obj, map_func_name, single_func, **kwds + ) + ) + + if getattr(op, "size_recorder_name", None) is not None: + # record_size + raw_size = estimate_pandas_size(in_data) + # when agg by a list of methods, agg_size should be sum + agg_size = sum([estimate_pandas_size(item) for item in agg_dfs]) + size_recorder = ctx.get_remote_object(op.size_recorder_name) + size_recorder.record(raw_size, agg_size) + + ctx[op.outputs[0].key] = tuple(agg_dfs) + + @classmethod + def _execute_combine(cls, ctx, op: "DataFrameGroupByAgg"): + xdf = cudf if op.gpu else pd + + in_data_tuple = ctx[op.inputs[0].key] + in_data_list = [] + for in_data in in_data_tuple: + if ( + isinstance(in_data, xdf.Series) + and op.output_types[0] == OutputType.dataframe + ): + in_data = cls._series_to_df(in_data, op.gpu) + in_data_list.append(cls._get_grouped(op, in_data, ctx)) + in_data_tuple = tuple(in_data_list) + in_data_dict = cls._pack_inputs(op.agg_funcs, in_data_tuple) + + combines = [] + for raw_input, ( + _input_key, + raw_func_name, + _map_func_name, + agg_func_name, + custom_reduction, + output_key, + _output_limit, + kwds, + ) in zip(ctx[op.inputs[0].key], op.agg_funcs): + input_obj = in_data_dict[output_key] + if agg_func_name == "custom_reduction": + combines.append(cls._do_custom_agg(raw_func_name, op, raw_input)) + else: + combines.append( + cls._do_predefined_agg(input_obj, agg_func_name, **kwds) + ) + ctx[op.outputs[0].key] = tuple(combines) + + @classmethod + def _execute_agg(cls, ctx, op: "DataFrameGroupByAgg"): + xdf = cudf if op.gpu else pd + out_chunk = op.outputs[0] + col_value = ( + out_chunk.columns_value.to_pandas() + if hasattr(out_chunk, "columns_value") + else None + ) + + in_data_tuple = ctx[op.inputs[0].key] + in_data_list = [] + for in_data in in_data_tuple: + if ( + isinstance(in_data, xdf.Series) + and op.output_types[0] == OutputType.dataframe + ): + in_data = cls._series_to_df(in_data, op.gpu) + in_data_list.append(in_data) + in_data_tuple = tuple(in_data_list) + in_data_dict = cls._pack_inputs(op.agg_funcs, in_data_tuple) + + for ( + _input_key, + raw_func_name, + _map_func_name, + agg_func_name, + custom_reduction, + output_key, + _output_limit, + kwds, + ) in op.agg_funcs: + if agg_func_name == "custom_reduction": + in_data_dict[output_key] = cls._do_custom_agg( + raw_func_name, op, in_data_dict[output_key] + ) + else: + input_obj = cls._get_grouped(op, in_data_dict[output_key], ctx) + in_data_dict[output_key] = cls._do_predefined_agg( + input_obj, agg_func_name, **kwds + ) + + aggs = [] + for input_keys, _output_key, func_name, cols, func in op.post_funcs: + if func_name in custom_agg_functions: + agg_df = in_data_dict[_output_key] + else: + if cols is None: + func_inputs = [in_data_dict[k] for k in input_keys] + else: + func_inputs = [in_data_dict[k][cols] for k in input_keys] + + if ( + func_inputs[0].ndim == 2 + and len(set(inp.shape[1] for inp in func_inputs)) > 1 + ): + common_cols = func_inputs[0].columns + for inp in func_inputs[1:]: + common_cols = common_cols.join(inp.columns, how="inner") + func_inputs = [inp[common_cols] for inp in func_inputs] + + agg_df = func(*func_inputs, gpu=op.is_gpu()) 
+ if isinstance(agg_df, np.ndarray): + agg_df = xdf.DataFrame(agg_df, index=func_inputs[0].index) + + new_cols = None + if out_chunk.ndim == 2 and col_value is not None: + if col_value.nlevels > agg_df.columns.nlevels: + new_cols = xdf.MultiIndex.from_product( + [agg_df.columns, [func_name]] + ) + elif agg_df.shape[-1] == 1 and func_name in col_value: + new_cols = xdf.Index([func_name]) + aggs.append((agg_df, new_cols)) + + for agg_df, new_cols in aggs: + if new_cols is not None: + agg_df.columns = new_cols + aggs = [item[0] for item in aggs] + + if out_chunk.ndim == 2: + result = concat_on_columns(aggs) + if ( + not op.groupby_params.get("as_index", True) + and col_value.nlevels == result.columns.nlevels + ): + result.reset_index( + inplace=True, drop=result.index.name in result.columns + ) + result = result.reindex(col_value, axis=1) + + if result.ndim == 2 and len(result) == 0: + result = result.astype(out_chunk.dtypes) + else: + result = xdf.concat(aggs) + if result.ndim == 2: + result = result.iloc[:, 0] + if is_cudf(result): # pragma: no cover + result = result.copy() + result.name = out_chunk.name + + ctx[out_chunk.key] = result + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op: "DataFrameGroupByAgg"): + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + elif op.stage == OperandStage.agg: + cls._execute_agg(ctx, op) + else: # pragma: no cover + raise ValueError("Aggregation operand not executable") + finally: + pd.reset_option("mode.use_inf_as_na") + + +def agg(groupby, func=None, method="auto", combine_size=None, *args, **kwargs): + """ + Aggregate using one or more operations on grouped data. + + Parameters + ---------- + groupby : Mars Groupby + Groupby data. + func : str or list-like + Aggregation functions. + method : {'auto', 'shuffle', 'tree'}, default 'auto' + The 'tree' method provides better performance; 'shuffle' is recommended + if the aggregated result is very large; 'auto' uses the 'shuffle' method + in distributed mode and the 'tree' method in local mode. + combine_size : int + The number of chunks to combine when method is 'tree'. + + Returns + ------- + Series or DataFrame + Aggregated result. + """ + + # When performing a computation on the grouped data, we don't shuffle + # the data in the groupby stage; the shuffle happens after aggregation.
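A hypothetical usage sketch of the `method` argument documented above (assumes a running local session; the data and column names are made up):

```python
import mars.dataframe as md

df = md.DataFrame({"a": [1, 1, 2, 2], "b": [1.0, 2.0, 3.0, 4.0]})
# 'auto' (the default) samples chunk sizes, then picks 'tree' or 'shuffle'
print(df.groupby("a").agg("sum").execute())
# force a strategy when the aggregated result is known to be very large
print(df.groupby("a").agg(["sum", "max"], method="shuffle").execute())
```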
+ + if not isinstance(groupby, GROUPBY_TYPE): + raise TypeError(f"Input should be a groupby object, not {type(groupby)}") + + if method is None: + method = "auto" + if method not in ["shuffle", "tree", "auto"]: + raise ValueError( + f"Method {method} is not available, please specify 'tree', 'shuffle' or 'auto'" + ) + + if not is_funcs_aggregate(func, ndim=groupby.ndim): + # pass index to transform, otherwise it will lose name info for index + agg_result = build_mock_agg_result( + groupby, groupby.op.groupby_params, func, **kwargs + ) + if isinstance(agg_result.index, pd.RangeIndex): + # set -1 to represent unknown size for RangeIndex + index_value = parse_index( + pd.RangeIndex(-1), groupby.key, groupby.index_value.key + ) + else: + index_value = parse_index( + agg_result.index, groupby.key, groupby.index_value.key + ) + return groupby.transform( + func, *args, _call_agg=True, index=index_value, **kwargs + ) + + use_inf_as_na = kwargs.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na) + + agg_op = DataFrameGroupByAgg( + raw_func=func, + raw_func_kw=kwargs, + method=method, + raw_groupby_params=groupby.op.groupby_params, + groupby_params=groupby.op.groupby_params, + combine_size=combine_size or options.combine_size, + chunk_store_limit=options.chunk_store_limit, + use_inf_as_na=use_inf_as_na, + ) + return agg_op(groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/apply.py b/python/xorbits/_mars/dataframe/groupby/apply.py new file mode 100644 index 000000000..c16fefdbd --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/apply.py @@ -0,0 +1,358 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ...
import opcodes +from ...core import OutputType +from ...core.context import get_context +from ...core.custom_log import redirect_custom_log +from ...core.operand import OperatorLogicKeyGeneratorMixin +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FunctionField, + StringField, + TupleField, +) +from ...utils import enter_current_session, get_func_token, quiet_stdio, tokenize +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + auto_merge_chunks, + build_empty_df, + build_empty_series, + clean_up_func, + make_dtype, + make_dtypes, + parse_index, + restore_func, + validate_output_types, +) + + +class GroupByApplyLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin): + def _get_logic_key_token_values(self): + token_values = super()._get_logic_key_token_values() + if self.func: + return token_values + [get_func_token(self.func)] + else: # pragma: no cover + return token_values + + +class GroupByApply( + DataFrameOperand, DataFrameOperandMixin, GroupByApplyLogicKeyGeneratorMixin +): + _op_type_ = opcodes.APPLY + _op_module_ = "dataframe.groupby" + + func = FunctionField("func") + args = TupleField("args", default_factory=tuple) + kwds = DictField("kwds", default_factory=dict) + maybe_agg = BoolField("maybe_agg", default=None) + logic_key = StringField("logic_key", default=None) + func_key = AnyField("func_key", default=None) + need_clean_up_func = BoolField("need_clean_up_func", default=False) + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + def _update_key(self): + values = [v for v in self._values_ if v is not self.func] + [ + get_func_token(self.func) + ] + self._obj_set("_key", tokenize(type(self).__name__, *values)) + return self + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op): + restore_func(ctx, op) + in_data = ctx[op.inputs[0].key] + out = op.outputs[0] + if not in_data: + if op.output_types[0] == OutputType.dataframe: + ctx[op.outputs[0].key] = build_empty_df(op.outputs[0].dtypes) + elif op.output_types[0] == OutputType.series: + ctx[op.outputs[0].key] = build_empty_series( + op.outputs[0].dtype, name=out.name + ) + else: + raise ValueError( + "Chunk can not be empty except for dataframe/series, " + "please specify output types" + ) + return + + applied = in_data.apply(op.func, *op.args, **op.kwds) + + if isinstance(applied, pd.DataFrame): + # when there is only one group, pandas tend to return a DataFrame, while + # we need to convert it into a compatible series + if op.output_types[0] == OutputType.series: + assert len(applied.index) == 1 + applied_idx = pd.MultiIndex.from_arrays( + [ + [applied.index[0]] * len(applied.columns), + applied.columns.tolist(), + ] + ) + applied_idx.names = [applied.index.name, None] + applied = pd.Series( + np.array(applied.iloc[0]), applied_idx, name=applied.columns.name + ) + else: + applied.columns.name = None + ctx[out.key] = applied + + @classmethod + def tile(cls, op): + clean_up_func(op) + in_groupby = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in in_groupby.chunks: + inp_chunks = [c] + + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + if op.output_types[0] == OutputType.df_or_series: + chunks.append( + new_op.new_chunk(inp_chunks, index=c.index, collapse_axis=1) + ) + elif op.output_types[0] == OutputType.dataframe: + chunks.append( + new_op.new_chunk( + inp_chunks, + index=c.index, + shape=(np.nan, len(out_df.dtypes)), + 
dtypes=out_df.dtypes, + columns_value=out_df.columns_value, + index_value=out_df.index_value, + ) + ) + else: + chunks.append( + new_op.new_chunk( + inp_chunks, + name=out_df.name, + index=(c.index[0],), + shape=(np.nan,), + dtype=out_df.dtype, + index_value=out_df.index_value, + ) + ) + + new_op = op.copy() + kw = out_df.params.copy() + kw["chunks"] = chunks + if op.output_types[0] == OutputType.dataframe: + kw["nsplits"] = ((np.nan,) * len(chunks), (out_df.shape[1],)) + else: + kw["nsplits"] = ((np.nan,) * len(chunks),) + ret = new_op.new_tileable([in_groupby], **kw) + if not op.maybe_agg: + return [ret] + else: + # auto merge small chunks if df.groupby().apply(func) + # may be an aggregation operation + yield ret.chunks # trigger execution for chunks + return [auto_merge_chunks(get_context(), ret)] + + def _infer_df_func_returns( + self, in_groupby, in_df, dtypes=None, dtype=None, name=None, index=None + ): + index_value, output_type, new_dtypes = None, None, None + + if self.output_types is not None and (dtypes is not None or dtype is not None): + ret_dtypes = dtypes if dtypes is not None else (dtype, name) + ret_index_value = parse_index(index) if index is not None else None + return ret_dtypes, ret_index_value + + try: + infer_df = in_groupby.op.build_mock_groupby().apply( + self.func, *self.args, **self.kwds + ) + + if len(infer_df) <= 2: + # we create mock df with 4 rows, 2 groups + # if return df has 2 rows, we assume that + # it's an aggregation operation + self.maybe_agg = True + + # todo return proper index when sort=True is implemented + index_value = parse_index(infer_df.index[:0], in_df.key, self.func) + + # for backward compatibility + dtype = dtype if dtype is not None else dtypes + if isinstance(infer_df, pd.DataFrame): + output_type = output_type or OutputType.dataframe + new_dtypes = new_dtypes or infer_df.dtypes + elif isinstance(infer_df, pd.Series): + output_type = output_type or OutputType.series + new_dtypes = new_dtypes or ( + name or infer_df.name, + dtype or infer_df.dtype, + ) + else: + output_type = OutputType.series + new_dtypes = (name, dtype or pd.Series(infer_df).dtype) + except: # noqa: E722 # nosec + pass + + self.output_types = ( + [output_type] if not self.output_types else self.output_types + ) + dtypes = new_dtypes if dtypes is None else dtypes + index_value = index_value if index is None else parse_index(index) + return dtypes, index_value + + def __call__(self, groupby, dtypes=None, dtype=None, name=None, index=None): + in_df = groupby + if self.output_types and self.output_types[0] == OutputType.df_or_series: + return self.new_df_or_series([groupby]) + while in_df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + in_df = in_df.inputs[0] + + with quiet_stdio(): + dtypes, index_value = self._infer_df_func_returns( + groupby, in_df, dtypes, dtype=dtype, name=name, index=index + ) + if index_value is None: + index_value = parse_index(None, (in_df.key, in_df.index_value.key)) + for arg, desc in zip((self.output_types, dtypes), ("output_types", "dtypes")): + if arg is None: + raise TypeError( + f"Cannot determine {desc} by calculating with enumerate data, " + "please specify it as arguments" + ) + + if self.output_types[0] == OutputType.dataframe: + new_shape = (np.nan, len(dtypes)) + return self.new_dataframe( + [groupby], + shape=new_shape, + dtypes=dtypes, + index_value=index_value, + columns_value=parse_index(dtypes.index, store_data=True), + ) + else: + name = name or dtypes[0] + dtype = dtype or dtypes[1] + new_shape = 
(np.nan,) + return self.new_series( + [groupby], + name=name, + shape=new_shape, + dtype=dtype, + index_value=index_value, + ) + + +def groupby_apply( + groupby, + func, + *args, + output_type=None, + dtypes=None, + dtype=None, + name=None, + index=None, + skip_infer=None, + **kwargs, +): + """ + Apply function `func` group-wise and combine the results together. + + The function passed to `apply` must take a dataframe as its first + argument and return a DataFrame, Series or scalar. `apply` will + then take care of combining the results back together into a single + dataframe or series. `apply` is therefore a highly flexible + grouping method. + + While `apply` is a very flexible method, its downside is that + using it can be quite a bit slower than using more specific methods + like `agg` or `transform`. Pandas offers a wide range of method that will + be much faster than using `apply` for their specific purposes, so try to + use them before reaching for `apply`. + + Parameters + ---------- + func : callable + A callable that takes a dataframe as its first argument, and + returns a dataframe, a series or a scalar. In addition the + callable may take positional and keyword arguments. + + output_type : {'dataframe', 'series'}, default None + Specify type of returned object. See `Notes` for more details. + + dtypes : Series, default None + Specify dtypes of returned DataFrames. See `Notes` for more details. + + dtype : numpy.dtype, default None + Specify dtype of returned Series. See `Notes` for more details. + + name : str, default None + Specify name of returned Series. See `Notes` for more details. + + index : Index, default None + Specify index of returned object. See `Notes` for more details. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + args, kwargs : tuple and dict + Optional positional and keyword arguments to pass to `func`. + + Returns + ------- + applied : Series or DataFrame + + See Also + -------- + pipe : Apply function to the full GroupBy object instead of to each + group. + aggregate : Apply aggregate function to the GroupBy object. + transform : Apply function column-by-column to the GroupBy object. + Series.apply : Apply a function to a Series. + DataFrame.apply : Apply a function to each row or column of a DataFrame. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock grouped object, and the apply call + may fail. When this happens, you need to specify the type of apply + call (DataFrame or Series) in output_type. + + * For DataFrame output, you need to specify a list or a pandas Series + as ``dtypes`` of output DataFrame. ``index`` of output can also be + specified. + * For Series output, you need to specify ``dtype`` and ``name`` of + output Series. 
+ """ + output_types = kwargs.pop("output_types", None) + object_type = kwargs.pop("object_type", None) + output_types = validate_output_types( + output_types=output_types, output_type=output_type, object_type=object_type + ) + if output_types is None and skip_infer: + output_types = [OutputType.df_or_series] + + dtypes = make_dtypes(dtypes) + dtype = make_dtype(dtype) + op = GroupByApply(func=func, args=args, kwds=kwargs, output_types=output_types) + return op(groupby, dtypes=dtypes, dtype=dtype, name=name, index=index) diff --git a/python/xorbits/_mars/dataframe/groupby/core.py b/python/xorbits/_mars/dataframe/groupby/core.py new file mode 100644 index 000000000..52f6ef8df --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/core.py @@ -0,0 +1,533 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from functools import partial + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, Entity, OutputType +from ...core.operand import MapReduceOperand, OperandStage +from ...lib.groupby_wrapper import wrapped_groupby +from ...serialization.serializables import AnyField, BoolField, Int32Field +from ...utils import lazy_import, no_default, pd_release_version +from ..align import align_dataframe_series, align_series_series +from ..core import SERIES_CHUNK_TYPE, SERIES_TYPE +from ..initializer import Series as asseries +from ..operands import DataFrameOperandMixin, DataFrameShuffleProxy +from ..utils import ( + build_concatenated_rows_frame, + build_df, + build_series, + hash_dataframe_on, + is_cudf, + parse_index, +) + +cudf = lazy_import("cudf") + +_GROUP_KEYS_NO_DEFAULT = pd_release_version >= (1, 5, 0) +_default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True + + +class DataFrameGroupByOperand(MapReduceOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY + + _by = AnyField("by", on_serialize=lambda x: x.data if isinstance(x, Entity) else x) + _level = AnyField("level") + _as_index = BoolField("as_index") + _sort = BoolField("sort") + _group_keys = BoolField("group_keys") + + _shuffle_size = Int32Field("shuffle_size") + + def __init__( + self, + by=None, + level=None, + as_index=None, + sort=None, + group_keys=None, + shuffle_size=None, + output_types=None, + **kw + ): + super().__init__( + _by=by, + _level=level, + _as_index=as_index, + _sort=sort, + _group_keys=group_keys, + _shuffle_size=shuffle_size, + _output_types=output_types, + **kw + ) + if output_types: + if self.stage in (OperandStage.map, OperandStage.reduce): + if output_types[0] in ( + OutputType.dataframe, + OutputType.dataframe_groupby, + ): + output_types = [OutputType.dataframe] + else: + output_types = [OutputType.series] + else: + if output_types[0] in ( + OutputType.dataframe, + OutputType.dataframe_groupby, + ): + output_types = [OutputType.dataframe_groupby] + elif output_types[0] == OutputType.series: + output_types = [OutputType.series_groupby] + self.output_types = output_types 
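The constructor above downgrades the declared groupby output type to a plain dataframe/series while the operand runs its shuffle (map/reduce) stages, and promotes it to a `*_groupby` type otherwise. A stand-alone restatement of that mapping, with simple strings in place of the real enums (illustrative only):

```python
def resolve_output_type(declared: str, stage: str) -> str:
    # Intermediate shuffle data is a concrete frame/series; only the terminal
    # operand yields a groupby object that further calls can be chained on.
    if stage in ("map", "reduce"):
        return "dataframe" if declared.startswith("dataframe") else "series"
    return "dataframe_groupby" if declared.startswith("dataframe") else "series_groupby"

assert resolve_output_type("dataframe_groupby", "map") == "dataframe"
assert resolve_output_type("series", "tile") == "series_groupby"
```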
+ + @property + def by(self): + return self._by + + @property + def level(self): + return self._level + + @property + def as_index(self): + return self._as_index + + @property + def sort(self): + return self._sort + + @property + def group_keys(self): + return self._group_keys + + @property + def shuffle_size(self): + return self._shuffle_size + + @property + def is_dataframe_obj(self): + return self.output_types[0] in ( + OutputType.dataframe_groupby, + OutputType.dataframe, + ) + + @property + def groupby_params(self): + return dict( + by=self.by, + level=self.level, + as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + ) + + def build_mock_groupby(self, **kwargs): + in_df = self.inputs[0] + if self.is_dataframe_obj: + mock_obj = build_df( + in_df, size=[2, 2], fill_value=[1, 2], ensure_string=True + ) + else: + mock_obj = build_series( + in_df, + size=[2, 2], + fill_value=[1, 2], + name=in_df.name, + ensure_string=True, + ) + + new_kw = self.groupby_params + new_kw.update(kwargs) + if new_kw.get("level"): + new_kw["level"] = 0 + if isinstance(new_kw["by"], list): + new_by = [] + for v in new_kw["by"]: + if isinstance(v, ENTITY_TYPE): + build_fun = build_df if v.ndim == 2 else build_series + mock_by = pd.concat( + [ + build_fun(v, size=2, fill_value=1, name=v.name), + build_fun(v, size=2, fill_value=2, name=v.name), + ] + ) + new_by.append(mock_by) + else: + new_by.append(v) + new_kw["by"] = new_by + return mock_obj.groupby(**new_kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs[1:]) + if len(inputs) > 1: + by = [] + for k in self._by: + if isinstance(k, (SERIES_TYPE, SERIES_CHUNK_TYPE)): + by.append(next(inputs_iter)) + else: + by.append(k) + self._by = by + + def __call__(self, df): + params = df.params.copy() + params["index_value"] = parse_index(None, df.key, df.index_value.key) + if df.ndim == 2: + if isinstance(self.by, list): + index, types = [], [] + for k in self.by: + if isinstance(k, SERIES_TYPE): + index.append(k.name) + types.append(k.dtype) + elif k in df.dtypes: + index.append(k) + types.append(df.dtypes[k]) + else: + raise KeyError(k) + params["key_dtypes"] = pd.Series(types, index=index) + + inputs = [df] + if isinstance(self.by, list): + for k in self.by: + if isinstance(k, SERIES_TYPE): + inputs.append(k) + + return self.new_tileable(inputs, **params) + + @classmethod + def _align_input_and_by(cls, op, inp, by): + align_method = ( + partial(align_dataframe_series, axis="index") + if op.is_dataframe_obj + else align_series_series + ) + nsplits, _, inp_chunks, by_chunks = align_method(inp, by) + + inp_params = inp.params + inp_params["chunks"] = inp_chunks + inp_params["nsplits"] = nsplits + inp = inp.op.copy().new_tileable(op.inputs, kws=[inp_params]) + + by_params = by.params + by_params["chunks"] = by_chunks + if len(nsplits) == 2: + by_nsplits = nsplits[:1] + else: + by_nsplits = nsplits + by_params["nsplits"] = by_nsplits + by = by.op.copy().new_tileable(by.op.inputs, kws=[by_params]) + + return inp, by + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + by = op.by + + series_in_by = False + new_inputs = [] + if len(op.inputs) > 1: + # by series + new_by = [] + for k in by: + if isinstance(k, SERIES_TYPE): + in_df, k = cls._align_input_and_by(op, in_df, k) + if len(new_inputs) == 0: + new_inputs.append(in_df) + new_inputs.append(k) + series_in_by = True + new_by.append(k) + by = new_by + else: + new_inputs = op.inputs + + is_dataframe_obj = op.is_dataframe_obj + if 
is_dataframe_obj: + in_df = build_concatenated_rows_frame(in_df) + output_type = OutputType.dataframe + chunk_shape = (in_df.chunk_shape[0], 1) + else: + output_type = OutputType.series + chunk_shape = (in_df.chunk_shape[0],) + + # generate map chunks + map_chunks = [] + for chunk in in_df.chunks: + map_op = op.copy().reset_key() + map_op.stage = OperandStage.map + map_op._shuffle_size = chunk_shape[0] + map_op._output_types = [output_type] + chunk_inputs = [chunk] + if len(op.inputs) > 1: + chunk_by = [] + for k in by: + if isinstance(k, SERIES_TYPE): + by_chunk = k.cix[chunk.index[0],] + chunk_by.append(by_chunk) + chunk_inputs.append(by_chunk) + else: + chunk_by.append(k) + map_op._by = chunk_by + map_chunks.append( + map_op.new_chunk( + chunk_inputs, + shape=(np.nan, np.nan), + index=chunk.index, + ) + ) + + proxy_chunk = DataFrameShuffleProxy(output_types=[output_type]).new_chunk( + map_chunks, shape=() + ) + + # generate reduce chunks + reduce_chunks = [] + out_indices = list(itertools.product(*(range(s) for s in chunk_shape))) + for ordinal, out_idx in enumerate(out_indices): + reduce_op = op.copy().reset_key() + reduce_op._by = None + reduce_op._output_types = [output_type] + reduce_op.stage = OperandStage.reduce + reduce_op.reducer_ordinal = ordinal + reduce_op.n_reducers = len(out_indices) + reduce_chunks.append( + reduce_op.new_chunk( + [proxy_chunk], shape=(np.nan, np.nan), index=out_idx + ) + ) + + # generate groupby chunks + out_chunks = [] + for chunk in reduce_chunks: + groupby_op = op.copy().reset_key() + if series_in_by: + # set by to None, cuz data of by will be passed from map to reduce to groupby + groupby_op._by = None + if is_dataframe_obj: + new_shape = (np.nan, in_df.shape[1]) + else: + new_shape = (np.nan,) + params = dict(shape=new_shape, index=chunk.index) + if op.is_dataframe_obj: + params.update( + dict( + dtypes=in_df.dtypes, + columns_value=in_df.columns_value, + index_value=parse_index(None, chunk.key, proxy_chunk.key), + ) + ) + else: + params.update( + dict( + name=in_df.name, + dtype=in_df.dtype, + index_value=parse_index(None, chunk.key, proxy_chunk.key), + ) + ) + out_chunks.append(groupby_op.new_chunk([chunk], **params)) + + new_op = op.copy() + params = op.outputs[0].params.copy() + if is_dataframe_obj: + params["nsplits"] = ((np.nan,) * len(out_chunks), (in_df.shape[1],)) + else: + params["nsplits"] = ((np.nan,) * len(out_chunks),) + params["chunks"] = out_chunks + return new_op.new_tileables(new_inputs, **params) + + @classmethod + def execute_map(cls, ctx, op): + is_dataframe_obj = op.is_dataframe_obj + by = op.by + chunk = op.outputs[0] + df = ctx[op.inputs[0].key] + + deliver_by = False # output by for the upcoming process + if isinstance(by, list): + new_by = [] + for v in by: + if isinstance(v, ENTITY_TYPE): + deliver_by = True + new_by.append(ctx[v.key]) + else: + new_by.append(v) + by = new_by + + if isinstance(by, list) or callable(by): + on = by + else: + on = None + + # Get the filter rule corresponding to each df. 
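The map-stage code below computes, for every incoming frame, one filter per reducer by hashing the group key, so that each reducer ends up holding complete groups. A stand-alone sketch of the idea (the `hash_partition` helper is hypothetical; the real code goes through `hash_dataframe_on` and keeps boolean filters instead of materialized parts):

```python
import pandas as pd

def hash_partition(df: pd.DataFrame, on: str, n_reducers: int):
    # Bucket rows by the hash of their group key; rows of the same group
    # always land in the same bucket, so reducers can group independently.
    buckets = pd.util.hash_pandas_object(df[on], index=False) % n_reducers
    return [df[buckets == i] for i in range(n_reducers)]

df = pd.DataFrame({"key": list("abab"), "val": [1, 2, 3, 4]})
parts = hash_partition(df, "key", 2)
assert sum(len(p) for p in parts) == len(df)
```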
+ dfs = df if isinstance(df, tuple) else (df,) + counter = itertools.count() + df_filters = [] + idx_to_index_and_filters = dict() + for item in dfs: + is_new = True + for _, (index, filters) in idx_to_index_and_filters.items(): + if item.index.equals(index): + df_filters.append(filters) + is_new = False + break + if is_new: + filters = hash_dataframe_on(item, on, op.shuffle_size, level=op.level) + idx_to_index_and_filters[next(counter)] = (item.index, filters) + df_filters.append(filters) + + def _take_index(src, f): + result = src.iloc[f] + if src.index.names: + result.index.names = src.index.names + if isinstance(src.index, pd.MultiIndex): + result.index = result.index.remove_unused_levels() + if is_cudf(result): # pragma: no cover + result = result.copy() + return result + + for index_idx in range(len(df_filters[0])): + if is_dataframe_obj: + reducer_index = (index_idx, chunk.index[1]) + else: + reducer_index = (index_idx,) + filtered = [] + filtered_by = [] + for d, filters in zip(dfs, df_filters): + index_filter = filters[index_idx] + if deliver_by: + for v in by: + if isinstance(v, pd.Series): + filtered_by.append(_take_index(v, index_filter)) + else: + filtered_by.append(v) + filtered.append(_take_index(d, index_filter)) + if deliver_by: + ctx[chunk.key, reducer_index] = ctx.get_current_chunk().index, ( + *filtered, + filtered_by, + deliver_by, + ) + else: + if isinstance(df, tuple): + ctx[chunk.key, reducer_index] = ( + ctx.get_current_chunk().index, + tuple(filtered) + (deliver_by,), + ) + else: + ctx[chunk.key, reducer_index] = ( + ctx.get_current_chunk().index, + filtered[0], + ) + + @classmethod + def execute_reduce(cls, ctx, op: "DataFrameGroupByOperand"): + xdf = cudf if op.gpu else pd + chunk = op.outputs[0] + input_idx_to_df = dict(op.iter_mapper_data(ctx)) + row_idxes = sorted(input_idx_to_df.keys()) + + res = [] + for row_idx in row_idxes: + row_df = input_idx_to_df.get(row_idx, None) + if row_df is not None: + res.append(row_df) + by = None + if isinstance(res[0], tuple): + # By is series + deliver_by = res[0][-1] + r = [] + part_len = len(res[0]) + part_len -= 1 if not deliver_by else 2 + for n in range(part_len): + r.append(xdf.concat([it[n] for it in res], axis=0)) + r = tuple(r) + + if deliver_by: + by = [None] * len(res[0][-2]) + for it in res: + for i, v in enumerate(it[1]): + if isinstance(v, pd.Series): + if by[i] is None: + by[i] = v + else: + by[i] = pd.concat([by[i], v], axis=0) + else: + by[i] = v + else: + r = pd.concat(res, axis=0) + + if chunk.index_value is not None: + if isinstance(r, tuple): + for s in r: + s.index.name = chunk.index_value.name + else: + r.index.name = chunk.index_value.name + if by is None: + ctx[chunk.key] = r + elif isinstance(r, tuple): + ctx[chunk.key] = r + (by,) + else: + ctx[chunk.key] = (r, by) + + @classmethod + def execute(cls, ctx, op: "DataFrameGroupByOperand"): + if op.stage == OperandStage.map: + cls.execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + cls.execute_reduce(ctx, op) + else: + inp = ctx[op.inputs[0].key] + if isinstance(inp, tuple): + # df, by + df, by = inp + else: + df = inp + by = op.by + ctx[op.outputs[0].key] = wrapped_groupby( + df, + by=by, + level=op.level, + as_index=op.as_index, + sort=op.sort, + group_keys=op.group_keys if op.group_keys is not None else no_default, + ) + + +def groupby( + df, by=None, level=None, as_index=True, sort=True, group_keys=_default_group_keys +): + if not as_index and df.op.output_types[0] == OutputType.series: + raise TypeError("as_index=False only valid with 
DataFrame") + + output_types = ( + [OutputType.dataframe_groupby] if df.ndim == 2 else [OutputType.series_groupby] + ) + if isinstance(by, (SERIES_TYPE, pd.Series)): + if isinstance(by, pd.Series): + by = asseries(by) + by = [by] + elif df.ndim > 1 and by is not None and not isinstance(by, list): + by = [by] + op = DataFrameGroupByOperand( + by=by, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys if group_keys is not no_default else None, + output_types=output_types, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/groupby/cum.py b/python/xorbits/_mars/dataframe/groupby/cum.py new file mode 100644 index 000000000..4ca56c857 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/cum.py @@ -0,0 +1,200 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import OutputType +from ...serialization.serializables import AnyField, BoolField +from ...utils import lazy_import +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, build_empty_series, parse_index, validate_axis + +cudf = lazy_import("cudf") + + +class GroupByCumReductionOperand(DataFrameOperandMixin, DataFrameOperand): + _op_module_ = "dataframe.groupby" + + _axis = AnyField("axis") + _ascending = BoolField("ascending") + + def __init__(self, axis=None, ascending=None, output_types=None, **kw): + super().__init__( + _axis=axis, _ascending=ascending, _output_types=output_types, **kw + ) + + @property + def axis(self) -> int: + return self._axis + + @property + def ascending(self) -> bool: + return self._ascending + + def _calc_out_dtypes(self, in_groupby): + mock_groupby = in_groupby.op.build_mock_groupby() + func_name = getattr(self, "_func_name") + + if func_name == "cumcount": + result_df = mock_groupby.cumcount(ascending=self.ascending) + else: + result_df = getattr(mock_groupby, func_name)(axis=self.axis) + + if isinstance(result_df, pd.DataFrame): + self.output_types = [OutputType.dataframe] + return result_df.dtypes + else: + self.output_types = [OutputType.series] + return result_df.name, result_df.dtype + + def __call__(self, groupby): + in_df = groupby + while in_df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + in_df = in_df.inputs[0] + + self._axis = validate_axis(self.axis or 0, in_df) + + out_dtypes = self._calc_out_dtypes(groupby) + + kw = in_df.params.copy() + kw["index_value"] = parse_index(pd.RangeIndex(-1), groupby.key) + if self.output_types[0] == OutputType.dataframe: + kw.update( + dict( + columns_value=parse_index(out_dtypes.index, store_data=True), + dtypes=out_dtypes, + shape=(groupby.shape[0], len(out_dtypes)), + ) + ) + else: + name, dtype = out_dtypes + kw.update(dtype=dtype, name=name, shape=(groupby.shape[0],)) + return self.new_tileable([groupby], **kw) + + @classmethod + def tile(cls, op): + in_groupby = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in 
in_groupby.chunks: + new_op = op.copy().reset_key() + + new_index = parse_index(pd.RangeIndex(-1), c.key) + if op.output_types[0] == OutputType.dataframe: + chunks.append( + new_op.new_chunk( + [c], + index=c.index, + shape=(np.nan, len(out_df.dtypes)), + dtypes=out_df.dtypes, + columns_value=out_df.columns_value, + index_value=new_index, + ) + ) + else: + chunks.append( + new_op.new_chunk( + [c], + index=(c.index[0],), + shape=(np.nan,), + dtype=out_df.dtype, + index_value=new_index, + name=out_df.name, + ) + ) + + new_op = op.copy().reset_key() + kw = out_df.params.copy() + kw["chunks"] = chunks + if op.output_types[0] == OutputType.dataframe: + kw["nsplits"] = ((np.nan,) * len(chunks), (len(out_df.dtypes),)) + else: + kw["nsplits"] = ((np.nan,) * len(chunks),) + return new_op.new_tileables([in_groupby], **kw) + + @classmethod + def execute(cls, ctx, op: "GroupByCumReductionOperand"): + in_data = ctx[op.inputs[0].key] + out_chunk = op.outputs[0] + + if not in_data or in_data.empty: + ctx[out_chunk.key] = ( + build_empty_df(out_chunk.dtypes) + if op.output_types[0] == OutputType.dataframe + else build_empty_series(out_chunk.dtype, name=out_chunk.name) + ) + return + + func_name = getattr(op, "_func_name") + if func_name == "cumcount": + ctx[out_chunk.key] = in_data.cumcount(ascending=op.ascending) + else: + result = getattr(in_data, func_name)(axis=op.axis) + if result.ndim == 2: + ctx[out_chunk.key] = result.astype(out_chunk.dtypes, copy=False) + else: + ctx[out_chunk.key] = result.astype(out_chunk.dtype, copy=False) + + +class GroupByCummin(GroupByCumReductionOperand): + _op_type_ = opcodes.CUMMIN + _func_name = "cummin" + + +class GroupByCummax(GroupByCumReductionOperand): + _op_type_ = opcodes.CUMMAX + _func_name = "cummax" + + +class GroupByCumsum(GroupByCumReductionOperand): + _op_type_ = opcodes.CUMSUM + _func_name = "cumsum" + + +class GroupByCumprod(GroupByCumReductionOperand): + _op_type_ = opcodes.CUMPROD + _func_name = "cumprod" + + +class GroupByCumcount(GroupByCumReductionOperand): + _op_type_ = opcodes.CUMCOUNT + _func_name = "cumcount" + + +def cumcount(groupby, ascending: bool = True): + op = GroupByCumcount(ascending=ascending) + return op(groupby) + + +def cummin(groupby, axis=0): + op = GroupByCummin(axis=axis) + return op(groupby) + + +def cummax(groupby, axis=0): + op = GroupByCummax(axis=axis) + return op(groupby) + + +def cumprod(groupby, axis=0): + op = GroupByCumprod(axis=axis) + return op(groupby) + + +def cumsum(groupby, axis=0): + op = GroupByCumsum(axis=axis) + return op(groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/custom_aggregation.py b/python/xorbits/_mars/dataframe/groupby/custom_aggregation.py new file mode 100644 index 000000000..14c3818fc --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/custom_aggregation.py @@ -0,0 +1,86 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
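The module whose header appears just above (custom_aggregation.py) defines a registration hook for three-stage (map/combine/agg) custom groupby aggregations; see the `DataFrameCustomGroupByAggMixin` ABC and `register_custom_groupby_agg_func` below. A purely illustrative sketch of how such a hook might be wired up, with an invented "my_count" aggregation and deliberately simplified stage logic:

```python
import pandas as pd

@register_custom_groupby_agg_func("my_count")  # invented name, for illustration
class MyCountAgg(DataFrameCustomGroupByAggMixin):
    @classmethod
    def execute_map(cls, op, in_data: pd.DataFrame):
        # per-chunk partial counts (assumes the group key is in the index here)
        return in_data.groupby(level=0).size().to_frame("my_count")

    @classmethod
    def execute_combine(cls, op, in_data: pd.DataFrame):
        # merge partial counts coming from several chunks
        return in_data.groupby(level=0).sum()

    @classmethod
    def execute_agg(cls, op, in_data: pd.DataFrame):
        # final per-group counts
        return in_data.groupby(level=0).sum()
```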
+from abc import ABC, abstractmethod +from typing import Dict, Type, Union + +import pandas as pd + + +class DataFrameCustomGroupByAggMixin(ABC): + @classmethod + @abstractmethod + def execute_map(cls, op, in_data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]: + """ + Map stage implement. + + Parameters + ------- + op : Any operand + DataFrame operand. + in_data : pd.DataFrame + Input dataframe. + + Returns + ------- + The result of op map stage. + """ + + @classmethod + @abstractmethod + def execute_combine( + cls, op, in_data: pd.DataFrame + ) -> Union[pd.DataFrame, pd.Series]: + """ + Combine stage implement. + + Parameters + ---------- + op : Any operand + DataFrame operand. + in_data : pd.Dataframe + Input dataframe. + + Returns + ------- + The result of op combine stage. + """ + + @classmethod + @abstractmethod + def execute_agg(cls, op, in_data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]: + """ + Agg stage implement. + + Parameters + ---------- + op : Any operand + DataFrame operand. + in_data : pd.Dataframe + Input dataframe. + + Returns + ------- + The result of op agg stage. + """ + + +custom_agg_functions: Dict[str, Type[DataFrameCustomGroupByAggMixin]] = {} + + +def register_custom_groupby_agg_func(method_name: str): + def wrap(func_type: Type[DataFrameCustomGroupByAggMixin]): + custom_agg_functions[method_name] = func_type + return func_type + + return wrap diff --git a/python/xorbits/_mars/dataframe/groupby/fill.py b/python/xorbits/_mars/dataframe/groupby/fill.py new file mode 100644 index 000000000..5a4549af4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/fill.py @@ -0,0 +1,212 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import OutputType +from ...serialization.serializables import AnyField, DictField, Int64Field, StringField +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, build_empty_series, parse_index + + +class GroupByFillOperand(DataFrameOperand, DataFrameOperandMixin): + _op_module_ = "dataframe.groupby" + + value = AnyField("value", default=None) + method = StringField("method", default=None) + axis = AnyField("axis", default=0) + limit = Int64Field("limit", default=None) + downcast = DictField("downcast", default=None) + + def _calc_out_dtypes(self, in_groupby): + mock_groupby = in_groupby.op.build_mock_groupby() + func_name = getattr(self, "_func_name") + + if func_name == "fillna": + result_df = mock_groupby.fillna( + value=self.value, + method=self.method, + axis=self.axis, + limit=self.limit, + downcast=self.downcast, + ) + else: + result_df = getattr(mock_groupby, func_name)(limit=self.limit) + + if isinstance(result_df, pd.DataFrame): + self.output_types = [OutputType.dataframe] + return result_df.dtypes + else: + self.output_types = [OutputType.series] + return result_df.name, result_df.dtype + + def __call__(self, groupby): + in_df = groupby + while in_df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + in_df = in_df.inputs[0] + out_dtypes = self._calc_out_dtypes(groupby) + + kw = in_df.params.copy() + kw["index_value"] = parse_index(pd.RangeIndex(-1), groupby.key) + if self.output_types[0] == OutputType.dataframe: + kw.update( + dict( + columns_value=parse_index(out_dtypes.index, store_data=True), + dtypes=out_dtypes, + shape=(groupby.shape[0], len(out_dtypes)), + ) + ) + else: + name, dtype = out_dtypes + kw.update(dtype=dtype, name=name, shape=(groupby.shape[0],)) + return self.new_tileable([groupby], **kw) + + @classmethod + def tile(cls, op): + in_groupby = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in in_groupby.chunks: + new_op = op.copy().reset_key() + + new_index = parse_index(pd.RangeIndex(-1), c.key) + if op.output_types[0] == OutputType.dataframe: + chunks.append( + new_op.new_chunk( + [c], + index=c.index, + shape=(np.nan, len(out_df.dtypes)), + dtypes=out_df.dtypes, + columns_value=out_df.columns_value, + index_value=new_index, + ) + ) + else: + chunks.append( + new_op.new_chunk( + [c], + index=(c.index[0],), + shape=(np.nan,), + dtype=out_df.dtype, + index_value=new_index, + name=out_df.name, + ) + ) + new_op = op.copy().reset_key() + kw = out_df.params.copy() + kw["chunks"] = chunks + if op.output_types[0] == OutputType.dataframe: + kw["nsplits"] = ((np.nan,) * len(chunks), (len(out_df.dtypes),)) + else: + kw["nsplits"] = ((np.nan,) * len(chunks),) + return new_op.new_tileables([in_groupby], **kw) + + @classmethod + def execute(cls, ctx, op: "GroupByFillOperand"): + in_data = ctx[op.inputs[0].key] + out_chunk = op.outputs[0] + + if not in_data or in_data.empty: + ctx[out_chunk.key] = ( + build_empty_df(out_chunk.dtypes) + if op.output_types[0] == OutputType.dataframe + else build_empty_series(out_chunk.dtype, name=out_chunk.name) + ) + return + + func_name = getattr(op, "_func_name") + if func_name == "fillna": + ctx[out_chunk.key] = in_data.fillna( + value=op.value, + method=op.method, + axis=op.axis, + limit=op.limit, + downcast=op.downcast, + ) + else: + result = getattr(in_data, func_name)(limit=op.limit) + if result.ndim == 2: + ctx[out_chunk.key] = result.astype(out_chunk.dtypes, copy=False) + else: + ctx[out_chunk.key] = 
result.astype(out_chunk.dtype, copy=False) + + +class GroupByFFill(GroupByFillOperand): + _op_type_ = opcodes.FILL_NA + _func_name = "ffill" + + +class GroupByBFill(GroupByFillOperand): + _op_type_ = opcodes.FILL_NA + _func_name = "bfill" + + +class GroupByFillNa(GroupByFillOperand): + _op_type_ = opcodes.FILL_NA + _func_name = "fillna" + + +def ffill(groupby, limit=None): + """ + Forward fill the values. + + limit: int, default None + Limit number of values to fill + + return: Series or DataFrame + """ + op = GroupByFFill(limit=limit) + return op(groupby) + + +def bfill(groupby, limit=None): + """ + Backward fill the values. + + limit: int, default None + Limit number of values to fill + + return: Series or DataFrame + """ + op = GroupByBFill(limit=limit) + return op(groupby) + + +def fillna(groupby, value=None, method=None, axis=None, limit=None, downcast=None): + """ + Fill NA/NaN values using the specified method. + + value: scalar, dict, Series, or DataFrame + Value to use to fill holes (e.g. 0), alternately a dict/Series/DataFrame + of values specifying which value to use for each index (for a Series) or + column (for a DataFrame). Values not in the dict/Series/DataFrame + will not be filled. This value cannot be a list. + method: {'backfill', 'bfill', 'ffill', None}, default None + axis: {0 or 'index', 1 or 'columns'} + limit: int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill + downcast: dict, default None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate equal type + + return: DataFrame or None + """ + op = GroupByFillNa( + value=value, method=method, axis=axis, limit=limit, downcast=downcast + ) + return op(groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/getitem.py b/python/xorbits/_mars/dataframe/groupby/getitem.py new file mode 100644 index 000000000..c4cbd8c7a --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/getitem.py @@ -0,0 +1,137 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +from ...
import opcodes +from ...core import OutputType +from ...serialization.serializables import AnyField +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class GroupByIndex(DataFrameOperandMixin, DataFrameOperand): + _op_type_ = opcodes.INDEX + _op_module_ = "dataframe.groupby" + + _selection = AnyField("selection") + + def __init__(self, selection=None, output_types=None, **kw): + super().__init__(_selection=selection, _output_types=output_types, **kw) + + @property + def selection(self): + return self._selection + + @property + def groupby_params(self): + params = self.inputs[0].op.groupby_params + params["selection"] = self.selection + return params + + def build_mock_groupby(self, **kwargs): + groupby_op = self.inputs[0].op + return groupby_op.build_mock_groupby(**kwargs)[self.selection] + + def __call__(self, groupby): + indexed = groupby.op.build_mock_groupby()[self.selection] + + if indexed.ndim == 1: + self.output_types = [OutputType.series_groupby] + params = dict( + shape=(groupby.shape[0],), + name=self.selection, + dtype=groupby.dtypes[self.selection], + index_value=groupby.index_value, + key_dtypes=groupby.key_dtypes, + ) + else: + self.output_types = [OutputType.dataframe_groupby] + + if isinstance(self.selection, Iterable) and not isinstance( + self.selection, str + ): + item_list = list(self.selection) + else: + item_list = [self.selection] + + params = groupby.params.copy() + params["dtypes"] = new_dtypes = groupby.dtypes[item_list] + params["selection"] = self.selection + params["shape"] = (groupby.shape[0], len(item_list)) + params["columns_value"] = parse_index(new_dtypes.index, store_data=True) + + return self.new_tileable([groupby], **params) + + @classmethod + def tile(cls, op: "GroupByIndex"): + in_groupby = op.inputs[0] + out_groupby = op.outputs[0] + + chunks = [] + for c in in_groupby.chunks: + if op.output_types[0] == OutputType.series_groupby: + params = dict( + shape=(c.shape[0],), + name=op.selection, + index=(c.index[0],), + dtype=c.dtypes[op.selection], + index_value=c.index_value, + key_dtypes=c.key_dtypes, + ) + else: + params = c.params.copy() + params["dtypes"] = out_groupby.dtypes + params["selection"] = op.selection + params["shape"] = (c.shape[0], len(op.selection)) + params["columns_value"] = out_groupby.columns_value + + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + params = out_groupby.params.copy() + new_nsplits = ( + (in_groupby.nsplits[0], (len(op.selection),)) + if out_groupby.ndim == 2 + else (in_groupby.nsplits[0],) + ) + params.update(dict(chunks=chunks, nsplits=new_nsplits)) + return new_op.new_tileables([in_groupby], **params) + + @classmethod + def execute(cls, ctx, op: "GroupByIndex"): + in_data = ctx[op.inputs[0].key] + ctx[op.outputs[0].key] = in_data[op.selection] + + +def df_groupby_getitem(df_groupby, item): + try: + hash(item) + hashable = True + except TypeError: + hashable = False + + if hashable and item in df_groupby.dtypes: + output_types = [OutputType.series_groupby] + elif isinstance(item, Iterable) and all(it in df_groupby.dtypes for it in item): + output_types = [OutputType.dataframe_groupby] + else: + raise NameError(f"Cannot slice groupby with {item!r}") + + if df_groupby.selection: + raise IndexError(f"Column(s) {df_groupby.selection!r} already selected") + + op = GroupByIndex(selection=item, output_types=output_types) + return op(df_groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/head.py 
b/python/xorbits/_mars/dataframe/groupby/head.py new file mode 100644 index 000000000..b1b4f4a86 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/head.py @@ -0,0 +1,225 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import OutputType, get_output_types, recursive_tile +from ...serialization.serializables import BoolField, DictField, Int64Field +from ...utils import pd_release_version +from ..core import IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_concatenated_rows_frame, parse_index + +_pandas_enable_negative = pd_release_version >= (1, 4, 0) + + +class GroupByHead(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.GROUPBY_HEAD + _op_module_ = "dataframe.groupby" + + row_count = Int64Field("row_count") + groupby_params = DictField("groupby_params") + enable_negative = BoolField("enable_negative") + + def __call__(self, groupby): + df = groupby + while df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + df = df.inputs[0] + + selection = groupby.op.groupby_params.pop("selection", None) + if df.ndim > 1 and selection: + if isinstance(selection, tuple) and selection not in df.dtypes: + selection = list(selection) + + result_df = df[selection] + else: + result_df = df + + self._output_types = ( + [OutputType.dataframe] if result_df.ndim == 2 else [OutputType.series] + ) + + params = result_df.params + params["shape"] = (np.nan,) + result_df.shape[1:] + if isinstance(df.index_value.value, IndexValue.RangeIndex): + params["index_value"] = parse_index(pd.RangeIndex(-1), df.key) + + return self.new_tileable([df], **params) + + @classmethod + def tile(cls, op: "GroupByHead"): + in_df = op.inputs[0] + groupby_params = op.groupby_params.copy() + selection = groupby_params.pop("selection", None) + + enable_negative = _pandas_enable_negative and op.enable_negative + + if len(in_df.shape) > 1: + in_df = build_concatenated_rows_frame(in_df) + out_df = op.outputs[0] + + # when row_count is not positive and pandas does not support negative head, + # or there is only one chunk, tile with a single chunk + if (not enable_negative and op.row_count <= 0) or len(in_df.chunks) <= 1: + row_num = 0 if not enable_negative and op.row_count <= 0 else np.nan + new_shape = (row_num,) + new_nsplits = ((row_num,),) + if out_df.ndim > 1: + new_shape += (out_df.shape[1],) + new_nsplits += ((out_df.shape[1],),) + + c = in_df.chunks[0] + chunk_op = op.copy().reset_key() + params = out_df.params + params["shape"] = new_shape + params["index"] = (0,) * out_df.ndim + out_chunk = chunk_op.new_chunk([c], **params) + + tileable_op = op.copy().reset_key() + return tileable_op.new_tileables( + [in_df], nsplits=new_nsplits, chunks=[out_chunk], **params + ) + + if in_df.ndim > 1 and selection: + if isinstance(selection, tuple) and selection not in in_df.dtypes: + selection = list(selection) + + if not 
isinstance(selection, list): + pre_selection = [selection] + else: + pre_selection = list(selection) + + if isinstance(groupby_params.get("by"), list): + pre_selection += [ + el for el in groupby_params["by"] if el not in pre_selection + ] + + if len(pre_selection) != in_df.shape[1]: + in_df = yield from recursive_tile(in_df[pre_selection]) + + # generate pre chunks + if op.row_count < 0: + # when we have negative row counts, pre-groupby optimization is not possible + pre_chunks = in_df.chunks + else: + pre_chunks = [] + for c in in_df.chunks: + pre_op = op.copy().reset_key() + pre_op._output_types = get_output_types(c) + pre_op.groupby_params = op.groupby_params.copy() + pre_op.groupby_params.pop("selection", None) + params = c.params + params["shape"] = (np.nan,) + c.shape[1:] + pre_chunks.append(pre_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + new_op._output_types = get_output_types(in_df) + new_nsplits = ((np.nan,) * len(in_df.nsplits[0]),) + in_df.nsplits[1:] + pre_tiled = new_op.new_tileable( + [in_df], chunks=pre_chunks, nsplits=new_nsplits, **in_df.params + ) + + # generate groupby + grouped = yield from recursive_tile(pre_tiled.groupby(**groupby_params)) + if selection: + grouped = yield from recursive_tile(grouped[selection]) + + # generate post chunks + post_chunks = [] + for c in grouped.chunks: + post_op = op.copy().reset_key() + post_op.groupby_params = op.groupby_params.copy() + post_op.groupby_params.pop("selection", None) + if op.output_types[0] == OutputType.dataframe: + index = c.index + else: + index = (c.index[0],) + params = out_df.params + params["index"] = index + post_chunks.append(post_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + new_nsplits = ((np.nan,) * len(in_df.nsplits[0]),) + if out_df.ndim > 1: + new_nsplits += ((out_df.shape[1],),) + return new_op.new_tileables( + [in_df], chunks=post_chunks, nsplits=new_nsplits, **out_df.params + ) + + @classmethod + def execute(cls, ctx, op: "GroupByHead"): + in_data = ctx[op.inputs[0].key] + + params = op.groupby_params.copy() + selection = params.pop("selection", None) + + if hasattr(in_data, "groupby"): + grouped = in_data.groupby(**params) + else: + grouped = in_data + + if selection: + grouped = grouped[selection] + + result = grouped.head(op.row_count) + if not op.enable_negative and op.row_count < 0: + result = result.iloc[:0] + ctx[op.outputs[0].key] = result + + +def head(groupby, n=5): + """ + Return first n rows of each group. + + Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows + from the original Series or DataFrame with original index and order preserved + (``as_index`` flag is ignored). + + Does not work for negative values of `n`. + + Returns + ------- + Series or DataFrame + + See Also + -------- + Series.groupby + DataFrame.groupby + + Examples + -------- + + >>> import mars.dataframe as md + >>> df = md.DataFrame([[1, 2], [1, 4], [5, 6]], + ... 
columns=['A', 'B']) + >>> df.groupby('A').head(1).execute() + A B + 0 1 2 + 2 5 6 + >>> df.groupby('A').head(-1).execute() + Empty DataFrame + Columns: [A, B] + Index: [] + """ + groupby_params = groupby.op.groupby_params.copy() + groupby_params.pop("as_index", None) + + op = GroupByHead( + row_count=n, + groupby_params=groupby_params, + enable_negative=_pandas_enable_negative, + ) + return op(groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/nunique.py b/python/xorbits/_mars/dataframe/groupby/nunique.py new file mode 100644 index 000000000..9a9f97c31 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/nunique.py @@ -0,0 +1,157 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Union + +import pandas as pd + +from ...core import OutputType +from ...utils import implements +from .aggregation import DataFrameGroupByAgg +from .custom_aggregation import ( + DataFrameCustomGroupByAggMixin, + register_custom_groupby_agg_func, +) + + +@register_custom_groupby_agg_func("nunique") +class DataFrameCustomGroupByNuniqueMixin(DataFrameCustomGroupByAggMixin): + @classmethod + def _get_level_indexes( + cls, op: DataFrameGroupByAgg, data: pd.DataFrame + ) -> List[int]: + """ + When group by level, get the level index list. + Level can be int, level name, or sequence of such. + This function calculates the corresponding indexes. + Parameters + ---------- + op + data + + Returns + ------- + + """ + index = [data.index.name] if data.index.name else data.index.names + index = pd.Index(index) + level = op.groupby_params["level"] + if isinstance(level, int): + indexes = [level] + elif isinstance(level, str): + indexes = [index.get_loc(level)] + else: + level = list(level) + if isinstance(level[0], int): + indexes = level + else: + indexes = index.get_indexer(level).tolist() + return indexes + + @classmethod + def _get_selection_columns(cls, op: DataFrameGroupByAgg) -> Union[None, List]: + """ + Get groupby selection columns from op parameters. + If this returns None, it means all columns are required. 
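+ For example, a ``selection`` of ``"a"`` or ``("a", "b")`` in ``op.groupby_params`` is normalized to ``["a"]`` or ``["a", "b"]`` respectively; when no selection is present, ``None`` is returned.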
+ Parameters + ---------- + op + + Returns + ------- + + """ + if "selection" in op.groupby_params: + selection = op.groupby_params["selection"] + if isinstance(selection, (tuple, list)): + selection = [n for n in selection] + else: + selection = [selection] + return selection + + @classmethod + def _get_execute_map_result( + cls, op: DataFrameGroupByAgg, in_data: pd.DataFrame + ) -> Union[pd.DataFrame, pd.Series]: + selections = cls._get_selection_columns(op) + by_cols = op.raw_groupby_params["by"] + if by_cols is not None: + cols = ( + [*selections, *by_cols] if selections is not None else in_data.columns + ) + res = in_data[cols].drop_duplicates(subset=cols).set_index(by_cols) + else: # group by level + selections = selections if selections is not None else in_data.columns + level_indexes = cls._get_level_indexes(op, in_data) + in_data = in_data.reset_index() + index_names = in_data.columns[level_indexes].tolist() + cols = [*index_names, *selections] + res = in_data[cols].drop_duplicates().set_index(index_names) + + # if sort=True is specified, sort index when finishing drop_duplicates. + if op.raw_groupby_params["sort"]: + res = res.sort_index() + + if op.output_types[0] == OutputType.series: + res = res.squeeze() + + return res + + @classmethod + def _get_execute_combine_result( + cls, op: DataFrameGroupByAgg, in_data: pd.DataFrame + ) -> Union[pd.DataFrame, pd.Series]: + # in_data.index.names means MultiIndex (groupby on multi cols) + index_col = in_data.index.name or in_data.index.names + res = in_data.reset_index().drop_duplicates().set_index(index_col) + if op.output_types[0] == OutputType.series: + res = res.squeeze() + return res + + @classmethod + def _get_execute_agg_result( + cls, op: DataFrameGroupByAgg, in_data: pd.DataFrame + ) -> Union[pd.DataFrame, pd.Series]: + groupby_params = op.groupby_params.copy() + cols = in_data.index.name or in_data.index.names + by = op.raw_groupby_params["by"] + + if by is not None: + if op.output_types[0] == OutputType.dataframe: + groupby_params.pop("level", None) + groupby_params["by"] = cols + in_data = in_data.reset_index() + else: + # When group by multi levels, we must get the actual all levels from raw_groupby_params, + # since level field in op.groupby_params is not correct. + groupby_params["level"] = op.raw_groupby_params["level"] + + res = in_data.groupby(**groupby_params).nunique() + return res + + @classmethod + @implements(DataFrameCustomGroupByAggMixin.execute_map) + def execute_map(cls, op, in_data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]: + return cls._get_execute_map_result(op, in_data) + + @classmethod + @implements(DataFrameCustomGroupByAggMixin.execute_combine) + def execute_combine( + cls, op, in_data: pd.DataFrame + ) -> Union[pd.DataFrame, pd.Series]: + return cls._get_execute_combine_result(op, in_data) + + @classmethod + @implements(DataFrameCustomGroupByAggMixin.execute_agg) + def execute_agg(cls, op, in_data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]: + return cls._get_execute_agg_result(op, in_data) diff --git a/python/xorbits/_mars/dataframe/groupby/sample.py b/python/xorbits/_mars/dataframe/groupby/sample.py new file mode 100644 index 000000000..49b2c9635 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/sample.py @@ -0,0 +1,626 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import itertools +import random +from collections.abc import Iterable +from typing import Optional, Sequence, Union + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import ENTITY_TYPE, OutputType, get_output_types, recursive_tile +from ...core.operand import MapReduceOperand, OperandStage +from ...serialization.serializables import ( + BoolField, + DictField, + Float32Field, + Int32Field, + Int64Field, + KeyField, + NDArrayField, + StringField, +) +from ...tensor.operands import TensorShuffleProxy +from ...tensor.random import RandomStateField +from ...tensor.utils import gen_random_seeds +from ...utils import has_unknown_shape +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +_ILOC_COL_HEADER = "_gsamp_iloc_col_" +_WEIGHT_COL_HEADER = "_gsamp_weight_col_" + + +# code adapted from pandas.core.groupby.groupby.DataFrameGroupBy.sample +def _sample_groupby_iter( + groupby, obj_index, n, frac, replace, weights, random_state=None, errors="ignore" +): + if weights is None: + ws = [None] * groupby.ngroups + elif not isinstance(weights, Iterable) or isinstance(weights, str): + ws = [weights] * groupby.ngroups + else: + weights = pd.Series(weights, index=obj_index) + ws = [weights.iloc[idx] for idx in groupby.indices.values()] + + group_iterator = groupby.grouper.get_iterator(groupby._selected_obj) + if not replace and errors == "ignore": + for (_, obj), w in zip(group_iterator, ws): + yield obj.sample( + n=n, frac=frac, replace=replace, weights=w, random_state=random_state + ) if len(obj) > n else obj + else: + for (_, obj), w in zip(group_iterator, ws): + yield obj.sample( + n=n, frac=frac, replace=replace, weights=w, random_state=random_state + ) + + +class GroupBySampleILoc(DataFrameOperand, DataFrameOperandMixin): + _op_code_ = opcodes.GROUPBY_SAMPLE_ILOC + _op_module_ = "dataframe.groupby" + + groupby_params = DictField("groupby_params", default=None) + size = Int64Field("size", default=None) + frac = Float32Field("frac", default=None) + replace = BoolField("replace", default=None) + weights = KeyField("weights", default=None) + seed = Int32Field("seed", default=None) + _random_state = RandomStateField("random_state", default=None) + errors = StringField("errors", default=None) + + random_col_id = Int32Field("random_col_id", default=None) + + # for chunks + # num of instances for chunks + left_iloc_bound = Int64Field("left_iloc_bound", default=None) + + def __init__(self, random_state=None, **kw): + super().__init__(_random_state=random_state, **kw) + if self.random_col_id is None: + self.random_col_id = random.randint(10000, 99999) + + @property + def random_state(self): + if self._random_state is None: + self._random_state = np.random.RandomState(self.seed) + return self._random_state + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(inputs) + next(input_iter) + if isinstance(self.weights, ENTITY_TYPE): + self.weights = next(input_iter) + + def __call__(self, df): + self._output_types = 
[OutputType.tensor] + inp_tileables = [df] + if self.weights is not None: + inp_tileables.append(self.weights) + return self.new_tileable( + inp_tileables, dtype=np.dtype(np.int_), shape=(np.nan,) + ) + + @classmethod + def tile(cls, op: "GroupBySampleILoc"): + in_df = op.inputs[0] + out_tensor = op.outputs[0] + iloc_col_header = _ILOC_COL_HEADER + str(op.random_col_id) + weight_col_header = _WEIGHT_COL_HEADER + str(op.random_col_id) + + if has_unknown_shape(in_df): + yield + + if op.weights is None: + weights_iter = itertools.repeat(None) + else: + weights_iter = iter(op.weights.chunks) + + if isinstance(op.groupby_params["by"], list): + map_cols = list(op.groupby_params["by"]) + else: # pragma: no cover + map_cols = [] + + dtypes = in_df.dtypes.copy() + dtypes.at[iloc_col_header] = np.dtype(np.int_) + map_cols.append(iloc_col_header) + if op.weights is not None: + dtypes.at[weight_col_header] = op.weights.dtype + map_cols.append(weight_col_header) + + new_dtypes = dtypes[map_cols] + new_columns_value = parse_index(new_dtypes.index, store_data=True) + + map_chunks = [] + left_ilocs = np.array((0,) + in_df.nsplits[0]).cumsum() + for inp_chunk, weight_chunk in zip(in_df.chunks, weights_iter): + new_op = op.copy().reset_key() + new_op.left_iloc_bound = int(left_ilocs[inp_chunk.index[0]]) + new_op.stage = OperandStage.map + new_op.output_types = [OutputType.dataframe] + + inp_chunks = [inp_chunk] + if weight_chunk is not None: + inp_chunks.append(weight_chunk) + params = inp_chunk.params + params.update( + dict( + dtypes=new_dtypes, + columns_value=new_columns_value, + shape=(inp_chunk.shape[0], len(new_dtypes)), + index=inp_chunk.index, + ) + ) + map_chunks.append(new_op.new_chunk(inp_chunks, **params)) + + new_op = op.copy().reset_key() + new_op._output_types = [OutputType.dataframe] + params = in_df.params + params.update( + dict( + chunks=map_chunks, + nsplits=(in_df.nsplits[0], (len(new_dtypes),)), + dtypes=new_dtypes, + columns_value=new_columns_value, + shape=(in_df.shape[0], len(new_dtypes)), + ) + ) + map_df = new_op.new_tileable(op.inputs, **params) + + groupby_params = op.groupby_params.copy() + groupby_params.pop("selection", None) + grouped = yield from recursive_tile(map_df.groupby(**groupby_params)) + + result_chunks = [] + seeds = gen_random_seeds(len(grouped.chunks), op.random_state) + for group_chunk, seed in zip(grouped.chunks, seeds): + new_op = op.copy().reset_key() + new_op.stage = OperandStage.reduce + new_op.weights = None + new_op._random_state = None + new_op.seed = seed + + result_chunks.append( + new_op.new_chunk( + [group_chunk], + shape=(np.nan,), + index=(group_chunk.index[0],), + dtype=out_tensor.dtype, + ) + ) + + new_op = op.copy().reset_key() + params = out_tensor.params + params.update( + dict(chunks=result_chunks, nsplits=((np.nan,) * len(result_chunks),)) + ) + return new_op.new_tileables(op.inputs, **params) + + @classmethod + def execute(cls, ctx, op: "GroupBySampleILoc"): + in_data = ctx[op.inputs[0].key] + iloc_col = _ILOC_COL_HEADER + str(op.random_col_id) + weight_col = _WEIGHT_COL_HEADER + str(op.random_col_id) + if op.stage == OperandStage.map: + if op.weights is not None: + ret = pd.DataFrame( + { + iloc_col: np.arange( + op.left_iloc_bound, op.left_iloc_bound + len(in_data) + ), + weight_col: ctx[op.weights.key], + }, + index=in_data.index, + ) + else: + ret = pd.DataFrame( + { + iloc_col: np.arange( + op.left_iloc_bound, op.left_iloc_bound + len(in_data) + ), + }, + index=in_data.index, + ) + + if isinstance(op.groupby_params["by"], list): + 
ret = pd.concat([in_data[op.groupby_params["by"]], ret], axis=1) + + ctx[op.outputs[0].key] = ret + else: + if weight_col not in in_data.obj.columns: + weight_col = None + + if len(in_data.obj) == 0 or in_data.ngroups == 0: + ctx[op.outputs[0].key] = np.array([], dtype=np.int_) + else: + ctx[op.outputs[0].key] = np.concatenate( + [ + sample_pd[iloc_col].to_numpy() + for sample_pd in _sample_groupby_iter( + in_data, + in_data.obj.index, + n=op.size, + frac=op.frac, + replace=op.replace, + weights=weight_col, + random_state=op.random_state, + errors=op.errors, + ) + ] + ) + + +class GroupBySample(MapReduceOperand, DataFrameOperandMixin): + _op_code_ = opcodes.RAND_SAMPLE + _op_module_ = "dataframe.groupby" + + groupby_params = DictField("groupby_params", default=None) + size = Int64Field("size", default=None) + frac = Float32Field("frac", default=None) + replace = BoolField("replace", default=None) + weights = KeyField("weights", default=None) + seed = Int32Field("seed", default=None) + _random_state = RandomStateField("random_state", default=None) + errors = StringField("errors", default=None) + + # for chunks + # num of instances for chunks + input_nsplits = NDArrayField("input_nsplits", default=None) + + def __init__(self, random_state=None, **kw): + super().__init__(_random_state=random_state, **kw) + + @property + def random_state(self): + return self._random_state + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(inputs) + next(input_iter) + if isinstance(self.weights, ENTITY_TYPE): + self.weights = next(input_iter) + + def __call__(self, groupby): + df = groupby + while df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + df = df.inputs[0] + + selection = groupby.op.groupby_params.pop("selection", None) + if df.ndim > 1 and selection: + if isinstance(selection, tuple) and selection not in df.dtypes: + selection = list(selection) + result_df = df[selection] + else: + result_df = df + + params = result_df.params + params["shape"] = ( + (np.nan,) if result_df.ndim == 1 else (np.nan, result_df.shape[-1]) + ) + params["index_value"] = parse_index(result_df.index_value.to_pandas()[:0]) + + input_dfs = [df] + if isinstance(self.weights, ENTITY_TYPE): + input_dfs.append(self.weights) + + self._output_types = get_output_types(result_df) + return self.new_tileable(input_dfs, **params) + + @classmethod + def _tile_one_chunk(cls, op: "GroupBySample", in_df, weights): + out = op.outputs[0] + + input_dfs = [in_df] + if isinstance(weights, ENTITY_TYPE): + input_dfs.append(weights) + + params = out.params + chunk_op = op.copy().reset_key() + if isinstance(weights, ENTITY_TYPE): + chunk_op._weights = weights + params["index"] = (0,) * out.ndim + chunk = chunk_op.new_chunk([c.chunks[0] for c in input_dfs], **params) + + df_op = op.copy().reset_key() + return df_op.new_tileables( + input_dfs, chunks=[chunk], nsplits=((s,) for s in out.shape), **params + ) + + @classmethod + def _tile_distributed(cls, op: "GroupBySample", in_df, weights): + out_df = op.outputs[0] + if has_unknown_shape(in_df): + yield + + sample_iloc_op = GroupBySampleILoc( + groupby_params=op.groupby_params, + size=op.size, + frac=op.frac, + replace=op.replace, + weights=weights, + random_state=op.random_state, + errors=op.errors, + seed=None, + left_iloc_bound=None, + ) + sampled_iloc = yield from recursive_tile(sample_iloc_op(in_df)) + + map_chunks = [] + for c in sampled_iloc.chunks: + new_op = op.copy().reset_key() + new_op.stage = OperandStage.map + new_op.weights = None + 
new_op.output_types = [OutputType.tensor] + new_op.input_nsplits = np.array(in_df.nsplits[0]) + + map_chunks.append( + new_op.new_chunk( + [c], dtype=sampled_iloc.dtype, shape=(np.nan,), index=c.index + ) + ) + + proxy_chunk = TensorShuffleProxy(dtype=sampled_iloc.dtype).new_chunk( + map_chunks, shape=() + ) + + reduce_chunks = [] + for ordinal, src_chunk in enumerate(in_df.chunks): + new_op = op.copy().reset_key() + new_op.weights = None + new_op.output_types = [OutputType.tensor] + new_op.stage = OperandStage.reduce + new_op.reducer_index = (src_chunk.index[0],) + new_op.reducer_ordinal = ordinal + new_op.n_reducers = len(in_df.chunks) + new_op.input_nsplits = np.array(in_df.nsplits[0]) + + reduce_chunks.append( + new_op.new_chunk( + [proxy_chunk], + index=src_chunk.index, + dtype=sampled_iloc.dtype, + shape=(np.nan,), + ) + ) + + combine_chunks = [] + for src_chunk, reduce_chunk in zip(in_df.chunks, reduce_chunks): + new_op = op.copy().reset_key() + new_op.stage = OperandStage.combine + new_op._weights = None + + params = out_df.params + if out_df.ndim == 2: + params.update( + dict( + index=src_chunk.index, + dtypes=out_df.dtypes, + shape=(np.nan, out_df.shape[1]), + columns_value=out_df.columns_value, + ) + ) + else: + params.update( + dict( + index=(src_chunk.index[0],), + dtype=out_df.dtype, + shape=(np.nan,), + name=out_df.name, + ) + ) + combine_chunks.append(new_op.new_chunk([src_chunk, reduce_chunk], **params)) + + new_op = op.copy().reset_key() + if out_df.ndim == 2: + new_nsplits = ((np.nan,) * in_df.chunk_shape[0], (out_df.shape[1],)) + else: + new_nsplits = ((np.nan,) * in_df.chunk_shape[0],) + return new_op.new_tileables( + out_df.inputs, chunks=combine_chunks, nsplits=new_nsplits, **out_df.params + ) + + @classmethod + def tile(cls, op: "GroupBySample"): + in_df = op.inputs[0] + if in_df.ndim == 2: + in_df = yield from recursive_tile(in_df.rechunk({1: (in_df.shape[1],)})) + + weights = op.weights + if isinstance(weights, ENTITY_TYPE): + weights = yield from recursive_tile(weights.rechunk({0: in_df.nsplits[0]})) + + if len(in_df.chunks) == 1: + return cls._tile_one_chunk(op, in_df, weights) + return (yield from cls._tile_distributed(op, in_df, weights)) + + @classmethod + def execute(cls, ctx, op: "GroupBySample"): + out_df = op.outputs[0] + + if op.stage == OperandStage.map: + in_data = ctx[op.inputs[0].key] + in_data = np.sort(in_data) + input_nsplits = np.copy(op.input_nsplits).tolist() + pos_array = np.cumsum([0] + input_nsplits) + poses = np.searchsorted(in_data, pos_array).tolist() + for idx, (left, right) in enumerate(zip(poses, poses[1:])): + ctx[op.outputs[0].key, (idx,)] = in_data[left:right] + elif op.stage == OperandStage.reduce: + in_indexes = list(op.iter_mapper_data(ctx)) + idx = np.sort(np.concatenate(in_indexes)) + if op.outputs[0].index[0] > 0: + acc_nsplits = np.cumsum(op.input_nsplits) + idx -= acc_nsplits[op.outputs[0].index[0] - 1] + ctx[op.outputs[0].key] = idx + elif op.stage == OperandStage.combine: + in_data = ctx[op.inputs[0].key] + idx = ctx[op.inputs[1].key] + selection = op.groupby_params.get("selection") + if selection: + in_data = in_data[selection] + ctx[op.outputs[0].key] = in_data.iloc[idx] + else: + in_data = ctx[op.inputs[0].key] + weights = op.weights + if isinstance(weights, ENTITY_TYPE): + weights = ctx[weights.key] + params = op.groupby_params.copy() + selection = params.pop("selection", None) + + grouped = in_data.groupby(**params) + if selection is not None: + grouped = grouped[selection] + + result = pd.concat( + [ + sample_df + 
for sample_df in _sample_groupby_iter( + grouped, + in_data.index, + n=op.size, + frac=op.frac, + replace=op.replace, + weights=weights, + random_state=op.random_state, + errors=op.errors, + ) + ] + ) + ctx[out_df.key] = result + + +def groupby_sample( + groupby, + n: Optional[int] = None, + frac: Optional[float] = None, + replace: bool = False, + weights: Union[Sequence, pd.Series, None] = None, + random_state: Optional[np.random.RandomState] = None, + errors: str = "ignore", +): + """ + Return a random sample of items from each group. + + You can use `random_state` for reproducibility. + + Parameters + ---------- + n : int, optional + Number of items to return for each group. Cannot be used with + `frac` and must be no larger than the smallest group unless + `replace` is True. Default is one if `frac` is None. + frac : float, optional + Fraction of items to return. Cannot be used with `n`. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + weights : list-like, optional + Default None results in equal probability weighting. + If passed a list-like then values must have the same length as + the underlying DataFrame or Series object and will be used as + sampling probabilities after normalization within each group. + Values must be non-negative with at least one positive element + within each group. + random_state : int, array-like, BitGenerator, np.random.RandomState, optional + If int, array-like, or BitGenerator (NumPy>=1.17), seed for + random number generator + If np.random.RandomState, use as numpy RandomState object. + errors : {'ignore', 'raise'}, default 'ignore' + If ignore, errors will not be raised when `replace` is False + and size of some group is less than `n`. + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing items randomly + sampled within each group from the caller object. + + See Also + -------- + DataFrame.sample: Generate random samples from a DataFrame object. + numpy.random.choice: Generate a random sample from a given 1-D numpy + array. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame( + ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)} + ... ) + >>> df.execute() + a b + 0 red 0 + 1 red 1 + 2 blue 2 + 3 blue 3 + 4 black 4 + 5 black 5 + + Select one row at random for each distinct value in column a. The + `random_state` argument can be used to guarantee reproducibility: + + >>> df.groupby("a").sample(n=1, random_state=1).execute() + a b + 4 black 4 + 2 blue 2 + 1 red 1 + + Set `frac` to sample fixed proportions rather than counts: + + >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2).execute() + 5 5 + 2 2 + 0 0 + Name: b, dtype: int64 + + Control sample probabilities within groups by setting weights: + + >>> df.groupby("a").sample( + ... n=1, + ... weights=[1, 1, 1, 0, 0, 1], + ... random_state=1, + ... 
).execute() + a b + 5 black 5 + 2 blue 2 + 0 red 0 + """ + groupby_params = groupby.op.groupby_params.copy() + groupby_params.pop("as_index", None) + + if weights is not None and not isinstance(weights, ENTITY_TYPE): + weights = asseries(weights) + + n = 1 if n is None and frac is None else n + rs = copy.deepcopy( + random_state.to_numpy() if hasattr(random_state, "to_numpy") else random_state + ) + if not isinstance(rs, np.random.RandomState): # pragma: no cover + rs = np.random.RandomState(rs) + + op = GroupBySample( + size=n, + frac=frac, + replace=replace, + weights=weights, + random_state=rs, + groupby_params=groupby_params, + errors=errors, + ) + return op(groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/sort.py b/python/xorbits/_mars/dataframe/groupby/sort.py new file mode 100644 index 000000000..df43b4156 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/sort.py @@ -0,0 +1,167 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...core.operand import MapReduceOperand, OperandStage +from ...serialization.serializables import Int32Field, ListField +from ...utils import lazy_import +from ..operands import DataFrameOperandMixin +from ..sort.psrs import DataFramePSRSChunkOperand + +cudf = lazy_import("cudf") + + +def _series_to_df(in_series, xdf): + in_df = in_series.to_frame() + if in_series.name is not None: + in_df.columns = xdf.Index([in_series.name]) + return in_df + + +class DataFrameGroupbyConcatPivot(DataFramePSRSChunkOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY_SORT_PIVOT + + @property + def output_limit(self): + return 1 + + @classmethod + def execute(cls, ctx, op: "DataFrameGroupbyConcatPivot"): + inputs = [ctx[c.key] for c in op.inputs if len(ctx[c.key]) > 0] + + xdf = pd if isinstance(inputs[0], (pd.DataFrame, pd.Series)) else cudf + + a = xdf.concat(inputs, axis=0) + a = a.sort_index() + index = a.index.drop_duplicates() + + p = len(inputs) + if len(index) < p: + num = p // len(index) + 1 + index = index.append([index] * (num - 1)) + + index = index.sort_values() + + values = index.values + + slc = np.linspace( + p - 1, len(index) - 1, num=len(op.inputs) - 1, endpoint=False + ).astype(int) + out = values[slc] + ctx[op.outputs[-1].key] = out + + +class DataFramePSRSGroupbySample(DataFramePSRSChunkOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY_SORT_REGULAR_SAMPLE + + @property + def output_limit(self): + return 1 + + @classmethod + def execute(cls, ctx, op: "DataFramePSRSGroupbySample"): + a = ctx[op.inputs[0].key][0] + xdf = pd if isinstance(a, (pd.DataFrame, pd.Series)) else cudf + if isinstance(a, xdf.Series) and op.output_types[0] == OutputType.dataframe: + a = _series_to_df(a, xdf) + + n = op.n_partition + if a.shape[0] < n: + num = n // a.shape[0] + 1 + a = xdf.concat([a] * num).sort_index() + + w = a.shape[0] * 1.0 / (n + 1) + + slc = np.linspace(max(w - 1, 
0), a.shape[0] - 1, num=n, endpoint=False).astype( + int + ) + + out = a.iloc[slc] + if op.output_types[0] == OutputType.series and out.ndim == 2: + assert out.shape[1] == 1 + out = out.iloc[:, 0] + ctx[op.outputs[-1].key] = out + + +class DataFrameGroupbySortShuffle(MapReduceOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY_SORT_SHUFFLE + + # for shuffle map + by = ListField("by") + n_partition = Int32Field("n_partition") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + @property + def output_limit(self): + return 1 + + @classmethod + def _execute_map(cls, ctx, op: "DataFrameGroupbySortShuffle"): + df, pivots = [ctx[c.key] for c in op.inputs] + out = op.outputs[0] + + def _get_out_df(p_index, in_df): + if p_index == 0: + out_df = in_df.loc[: pivots[p_index]] + elif p_index == op.n_partition - 1: + out_df = in_df.loc[pivots[p_index - 1] :].drop( + index=pivots[p_index - 1], errors="ignore" + ) + else: + out_df = in_df.loc[pivots[p_index - 1] : pivots[p_index]].drop( + index=pivots[p_index - 1], errors="ignore" + ) + return out_df + + for i in range(op.n_partition): + index = (i, 0) + out_df = tuple(_get_out_df(i, x) for x in df) + ctx[out.key, index] = out_df + + @classmethod + def _execute_reduce(cls, ctx, op: "DataFrameGroupbySortShuffle"): + raw_inputs = list(op.iter_mapper_data(ctx, pop=False)) + by = op.by + xdf = cudf if op.gpu else pd + + r = [] + + tuple_len = len(raw_inputs[0]) + for i in range(tuple_len): + r.append(xdf.concat([inp[i] for inp in raw_inputs], axis=0)) + r = tuple(r) + + ctx[op.outputs[0].key] = r + (by,) + + @classmethod + def estimate_size(cls, ctx, op: "DataFrameGroupbySortShuffle"): + super().estimate_size(ctx, op) + result = ctx[op.outputs[0].key] + if op.stage == OperandStage.reduce: + ctx[op.outputs[0].key] = (result[0], result[1] * 1.5) + else: + ctx[op.outputs[0].key] = result + + @classmethod + def execute(cls, ctx, op: "DataFrameGroupbySortShuffle"): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) diff --git a/python/xorbits/_mars/dataframe/groupby/tests/__init__.py b/python/xorbits/_mars/dataframe/groupby/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby.py b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby.py new file mode 100644 index 000000000..79e4eec0b --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby.py @@ -0,0 +1,526 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... import opcodes +from ....config import option_context +from ....core import OutputType, tile +from ....core.operand import OperandStage +from ...core import DataFrame, DataFrameGroupBy, SeriesGroupBy +from ..aggregation import DataFrameGroupByAgg +from ..core import DataFrameGroupByOperand, DataFrameShuffleProxy +from ..getitem import GroupByIndex +from ..sort import DataFrameGroupbySortShuffle + + +def test_groupby(): + df = pd.DataFrame( + {"a": [3, 4, 5, 3, 5, 4, 1, 2, 3], "b": [1, 3, 4, 5, 6, 5, 4, 4, 4]} + ) + mdf = md.DataFrame(df, chunk_size=2) + with pytest.raises(KeyError): + mdf.groupby("c2") + with pytest.raises(KeyError): + mdf.groupby(["b", "c2"]) + + grouped = mdf.groupby("b") + assert isinstance(grouped, DataFrameGroupBy) + assert isinstance(grouped.op, DataFrameGroupByOperand) + assert list(grouped.key_dtypes.index) == ["b"] + + grouped = tile(grouped) + assert len(grouped.chunks) == 5 + for chunk in grouped.chunks: + assert isinstance(chunk.op, DataFrameGroupByOperand) + + series = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms = md.Series(series, chunk_size=3) + grouped = ms.groupby(lambda x: x + 1) + + assert isinstance(grouped, SeriesGroupBy) + assert isinstance(grouped.op, DataFrameGroupByOperand) + + grouped = tile(grouped) + assert len(grouped.chunks) == 3 + for chunk in grouped.chunks: + assert isinstance(chunk.op, DataFrameGroupByOperand) + + with pytest.raises(TypeError): + ms.groupby(lambda x: x + 1, as_index=False) + + +def test_groupby_get_item(): + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + } + ) + mdf = md.DataFrame(df1, chunk_size=3) + + r = tile(mdf.groupby("b")[["a", "b"]]) + assert isinstance(r, DataFrameGroupBy) + assert isinstance(r.op, GroupByIndex) + assert r.selection == ["a", "b"] + assert list(r.key_dtypes.index) == ["b"] + assert len(r.chunks) == 3 + + r = tile(mdf.groupby("b").a) + assert isinstance(r, SeriesGroupBy) + assert isinstance(r.op, GroupByIndex) + assert r.name == "a" + assert list(r.key_dtypes.index) == ["b"] + assert len(r.chunks) == 3 + + with pytest.raises(IndexError): + getattr(mdf.groupby("b")[["a", "b"]], "a") + + +def test_groupby_agg(): + df = pd.DataFrame( + { + "a": np.random.choice([2, 3, 4], size=(20,)), + "b": np.random.choice([2, 3, 4], size=(20,)), + } + ) + mdf = md.DataFrame(df, chunk_size=3) + r = mdf.groupby("a").agg("sum", method="tree") + assert isinstance(r.op, DataFrameGroupByAgg) + assert isinstance(r, DataFrame) + assert r.op.method == "tree" + r = tile(r) + assert len(r.chunks) == 1 + assert r.chunks[0].op.stage == OperandStage.agg + assert len(r.chunks[0].inputs) == 1 + assert len(r.chunks[0].inputs[0].inputs) == 2 + + df = pd.DataFrame( + { + "c1": range(10), + "c2": np.random.choice(["a", "b", "c"], (10,)), + "c3": np.random.rand(10), + } + ) + mdf = md.DataFrame(df, chunk_size=2) + r = mdf.groupby("c2", sort=False).sum(method="shuffle") + + assert isinstance(r.op, 
DataFrameGroupByAgg) + assert isinstance(r, DataFrame) + + r = tile(r) + assert len(r.chunks) == 5 + for chunk in r.chunks: + assert isinstance(chunk.op, DataFrameGroupByAgg) + assert chunk.op.stage == OperandStage.agg + assert isinstance(chunk.inputs[0].op, DataFrameGroupByOperand) + assert chunk.inputs[0].op.stage == OperandStage.reduce + assert isinstance(chunk.inputs[0].inputs[0].op, DataFrameShuffleProxy) + assert isinstance( + chunk.inputs[0].inputs[0].inputs[0].op, DataFrameGroupByOperand + ) + assert chunk.inputs[0].inputs[0].inputs[0].op.stage == OperandStage.map + + agg_chunk = chunk.inputs[0].inputs[0].inputs[0].inputs[0] + assert agg_chunk.op.stage == OperandStage.map + + r = mdf.groupby( + "c2", + ).sum(method="shuffle") + + assert isinstance(r.op, DataFrameGroupByAgg) + assert isinstance(r, DataFrame) + + r = tile(r) + assert len(r.chunks) == 5 + for chunk in r.chunks: + assert isinstance(chunk.op, DataFrameGroupByAgg) + assert chunk.op.stage == OperandStage.agg + assert isinstance(chunk.inputs[0].op, DataFrameGroupbySortShuffle) + assert chunk.inputs[0].op.stage == OperandStage.reduce + assert isinstance(chunk.inputs[0].inputs[0].op, DataFrameShuffleProxy) + assert isinstance( + chunk.inputs[0].inputs[0].inputs[0].op, DataFrameGroupbySortShuffle + ) + assert chunk.inputs[0].inputs[0].inputs[0].op.stage == OperandStage.map + + agg_chunk = chunk.inputs[0].inputs[0].inputs[0].inputs[0] + assert agg_chunk.op.stage == OperandStage.map + + # test unknown method + with pytest.raises(ValueError): + mdf.groupby("c2").sum(method="not_exist") + + +def test_groupby_auto_on_cluster(): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + # test DataFrameGroupByAgg._tile_auto_on_distributed + with option_context({"chunk_store_limit": 80}): + # chunk_store_limit is 30, each chunk's size is 8, + # will combine once, then shuffle 5 combined chunk + mdf = md.DataFrame(raw, chunk_size=5) + tiled_mdf = tile(mdf) + r = mdf.groupby("c2").sum() + func_infos = DataFrameGroupByAgg._compile_funcs(r.op, mdf) + tiled = DataFrameGroupByAgg._build_tree_and_shuffle_chunks( + r.op, tiled_mdf, r, func_infos, tiled_mdf.chunks[:4], [8] * 4 + )[0] + assert len(tiled.chunks) == 5 + + +def test_groupby_apply(): + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + } + ) + + def apply_call_with_err(_): + raise ValueError + + def apply_df(df): + return df.sort_index() + + def apply_df_with_error(df): + assert len(df) > 2 + return df.sort_index() + + def apply_series(s): + return s.sort_index() + + mdf = md.DataFrame(df1, chunk_size=3) + + # when dtype and output_type specified, apply function + # shall not be called + applied = mdf.groupby("b").apply( + apply_call_with_err, output_type="series", dtype=int + ) + assert applied.dtype == int + assert applied.op.output_types[0] == OutputType.series + + with pytest.raises(TypeError): + mdf.groupby("b").apply(apply_df_with_error) + + applied = tile( + mdf.groupby("b").apply( + apply_df_with_error, output_type="dataframe", dtypes=df1.dtypes + ) + ) + pd.testing.assert_series_equal(applied.dtypes, df1.dtypes) + assert applied.shape == (np.nan, 3) + assert applied.op._op_type_ == opcodes.APPLY + assert applied.op.output_types[0] == OutputType.dataframe + assert len(applied.chunks) == 3 + assert applied.chunks[0].shape == (np.nan, 3) + pd.testing.assert_series_equal(applied.chunks[0].dtypes, 
df1.dtypes) + + applied = tile(mdf.groupby("b").apply(apply_df)) + pd.testing.assert_series_equal(applied.dtypes, df1.dtypes) + assert applied.shape == (np.nan, 3) + assert applied.op._op_type_ == opcodes.APPLY + assert applied.op.output_types[0] == OutputType.dataframe + assert len(applied.chunks) == 3 + assert applied.chunks[0].shape == (np.nan, 3) + pd.testing.assert_series_equal(applied.chunks[0].dtypes, df1.dtypes) + + applied = tile(mdf.groupby("b").apply(lambda df: df.a)) + assert applied.dtype == df1.a.dtype + assert applied.shape == (np.nan,) + assert applied.op._op_type_ == opcodes.APPLY + assert applied.op.output_types[0] == OutputType.series + assert len(applied.chunks) == 3 + assert applied.chunks[0].shape == (np.nan,) + assert applied.chunks[0].dtype == df1.a.dtype + + applied = mdf.groupby("b").apply(lambda df: df.a.sum()) + assert applied.op.maybe_agg is True + # force set to pass test + applied.op.maybe_agg = None + applied = tile(applied) + assert applied.dtype == df1.a.dtype + assert applied.shape == (np.nan,) + assert applied.op._op_type_ == opcodes.APPLY + assert applied.op.output_types[0] == OutputType.series + assert len(applied.chunks) == 3 + assert applied.chunks[0].shape == (np.nan,) + assert applied.chunks[0].dtype == df1.a.dtype + + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + + ms1 = md.Series(series1, chunk_size=3) + applied = tile(ms1.groupby(lambda x: x % 3).apply(apply_series)) + assert applied.dtype == series1.dtype + assert applied.shape == (np.nan,) + assert applied.op._op_type_ == opcodes.APPLY + assert applied.op.output_types[0] == OutputType.series + assert len(applied.chunks) == 3 + assert applied.chunks[0].shape == (np.nan,) + assert applied.chunks[0].dtype == series1.dtype + + +def test_groupby_transform(): + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + "d": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "e": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "f": list("aabaaddce"), + } + ) + + def transform_df(df): + return df.sort_index() + + def transform_df_with_err(df): + assert len(df) > 2 + return df.sort_index() + + mdf = md.DataFrame(df1, chunk_size=3) + + with pytest.raises(TypeError): + mdf.groupby("b").transform(["cummax", "cumcount"]) + + with pytest.raises(TypeError): + mdf.groupby("b").transform(transform_df_with_err) + + r = tile( + mdf.groupby("b").transform(transform_df_with_err, dtypes=df1.dtypes.drop("b")) + ) + assert r.dtypes.index.tolist() == list("acdef") + assert r.shape == (9, 5) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 5) + assert r.chunks[0].dtypes.index.tolist() == list("acdef") + + r = tile(mdf.groupby("b").transform(transform_df)) + assert r.dtypes.index.tolist() == list("acdef") + assert r.shape == (9, 5) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 5) + assert r.chunks[0].dtypes.index.tolist() == list("acdef") + + r = tile(mdf.groupby("b").transform(["cummax", "cumcount"], _call_agg=True)) + assert r.shape == (np.nan, 6) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 6) + + agg_dict = OrderedDict([("d", "cummax"), ("b", "cumsum")]) + r = tile(mdf.groupby("b").transform(agg_dict, _call_agg=True)) + assert 
r.shape == (np.nan, 2) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 2) + + agg_list = ["sum", lambda s: s.sum()] + r = tile(mdf.groupby("b").transform(agg_list, _call_agg=True)) + assert r.shape == (np.nan, 10) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 10) + + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms1 = md.Series(series1, chunk_size=3) + + r = tile(ms1.groupby(lambda x: x % 3).transform(lambda x: x + 1)) + assert r.dtype == series1.dtype + assert r.shape == series1.shape + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan,) + assert r.chunks[0].dtype == series1.dtype + + r = tile(ms1.groupby(lambda x: x % 3).transform("cummax", _call_agg=True)) + assert r.shape == (np.nan,) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan,) + + agg_list = ["cummax", "cumcount"] + r = tile(ms1.groupby(lambda x: x % 3).transform(agg_list, _call_agg=True)) + assert r.shape == (np.nan, 2) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 2) + + +def test_groupby_cum(): + df1 = pd.DataFrame( + { + "a": [3, 5, 2, 7, 1, 2, 4, 6, 2, 4], + "b": [8, 3, 4, 1, 8, 2, 2, 2, 2, 3], + "c": [1, 8, 8, 5, 3, 5, 0, 0, 5, 4], + } + ) + mdf = md.DataFrame(df1, chunk_size=3) + + for fun in ["cummin", "cummax", "cumprod", "cumsum"]: + r = tile(getattr(mdf.groupby("b"), fun)()) + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 4 + assert r.shape == (len(df1), 2) + assert r.chunks[0].shape == (np.nan, 2) + pd.testing.assert_index_equal( + r.chunks[0].columns_value.to_pandas(), pd.Index(["a", "c"]) + ) + + r = tile(getattr(mdf.groupby("b"), fun)(axis=1)) + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 4 + assert r.shape == (len(df1), 3) + assert r.chunks[0].shape == (np.nan, 3) + pd.testing.assert_index_equal( + r.chunks[0].columns_value.to_pandas(), df1.columns + ) + + r = tile(mdf.groupby("b").cumcount()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(df1),) + assert r.chunks[0].shape == (np.nan,) + + series1 = pd.Series([2, 2, 5, 7, 3, 7, 8, 8, 5, 6]) + ms1 = md.Series(series1, chunk_size=3) + + for fun in ["cummin", "cummax", "cumprod", "cumsum", "cumcount"]: + r = tile(getattr(ms1.groupby(lambda x: x % 2), fun)()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(series1),) + assert r.chunks[0].shape == (np.nan,) + + +def test_groupby_fill(): + df1 = pd.DataFrame( + [ + [1, 1, 10], + [1, 1, np.nan], + [1, 1, np.nan], + [1, 2, np.nan], + [1, 2, 20], + [1, 2, np.nan], + [1, 3, np.nan], + [1, 3, np.nan], + ], + columns=["one", "two", "three"], + ) + mdf = md.DataFrame(df1, chunk_size=3) + + r = tile(mdf.groupby(["one", "two"]).ffill()) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (len(df1), 1) + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 1) + assert r.dtypes.index.tolist() == ["three"] + + r = 
tile(mdf.groupby(["two"]).bfill()) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (len(df1), 2) + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 2) + assert r.dtypes.index.tolist() == ["one", "three"] + + r = tile(mdf.groupby(["two"]).backfill()) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (len(df1), 2) + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 2) + assert r.dtypes.index.tolist() == ["one", "three"] + + r = tile(mdf.groupby(["one"]).fillna(5)) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (len(df1), 2) + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 2) + assert r.dtypes.index.tolist() == ["two", "three"] + + s1 = pd.Series([4, 3, 9, np.nan, np.nan, 7, 10, 8, 1, 6]) + ms1 = md.Series(s1, chunk_size=3) + r = tile(ms1.groupby(lambda x: x % 2).ffill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).bfill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).backfill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).fillna(5)) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + s1 = pd.Series([4, 3, 9, np.nan, np.nan, 7, 10, 8, 1, 6]) + ms1 = md.Series(s1, chunk_size=3) + + r = tile(ms1.groupby(lambda x: x % 2).ffill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).bfill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).backfill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).fillna(5)) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) diff --git a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py new file mode 100644 index 000000000..08b1e9bd7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py @@ -0,0 +1,1513 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from collections import OrderedDict + +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from .... import dataframe as md +from ....config import option_context +from ....core.operand import OperandStage +from ....tests.core import assert_groupby_equal, require_cudf +from ....utils import arrow_array_to_objects, pd_release_version +from ...core import DATAFRAME_OR_SERIES_TYPE +from ..aggregation import DataFrameGroupByAgg + +pytestmark = pytest.mark.pd_compat + +_agg_size_as_frame = pd_release_version[:2] > (1, 0) + + +class MockReduction1(md.CustomReduction): + def agg(self, v1): + return v1.sum() + + +def test_groupby(setup): + rs = np.random.RandomState(0) + data_size = 100 + data_dict = { + "a": rs.randint(0, 10, size=(data_size,)), + "b": rs.randint(0, 10, size=(data_size,)), + "c": rs.choice(list("abcd"), size=(data_size,)), + } + + # test groupby with DataFrames and RangeIndex + df1 = pd.DataFrame(data_dict) + mdf = md.DataFrame(df1, chunk_size=13) + grouped = mdf.groupby("b") + assert_groupby_equal(grouped.execute().fetch(), df1.groupby("b")) + + # test groupby with string index with duplications + df2 = pd.DataFrame(data_dict, index=["i" + str(i % 3) for i in range(data_size)]) + mdf = md.DataFrame(df2, chunk_size=13) + grouped = mdf.groupby("b") + assert_groupby_equal(grouped.execute().fetch(), df2.groupby("b")) + + # test groupby with DataFrames by series + grouped = mdf.groupby(mdf["b"]) + assert_groupby_equal(grouped.execute().fetch(), df2.groupby(df2["b"])) + + # test groupby with DataFrames by multiple series + grouped = mdf.groupby(by=[mdf["b"], mdf["c"]]) + assert_groupby_equal( + grouped.execute().fetch(), df2.groupby(by=[df2["b"], df2["c"]]) + ) + + # test groupby with DataFrames with MultiIndex + df3 = pd.DataFrame( + data_dict, + index=pd.MultiIndex.from_tuples( + [(i % 3, "i" + str(i)) for i in range(data_size)] + ), + ) + mdf = md.DataFrame(df3, chunk_size=13) + grouped = mdf.groupby(level=0) + assert_groupby_equal(grouped.execute().fetch(), df3.groupby(level=0)) + + # test groupby with DataFrames by integer columns + df4 = pd.DataFrame(list(data_dict.values())).T + mdf = md.DataFrame(df4, chunk_size=13) + grouped = mdf.groupby(0) + assert_groupby_equal(grouped.execute().fetch(), df4.groupby(0)) + + series1 = pd.Series(data_dict["a"]) + ms1 = md.Series(series1, chunk_size=13) + grouped = ms1.groupby(lambda x: x % 3) + assert_groupby_equal(grouped.execute().fetch(), series1.groupby(lambda x: x % 3)) + + # test groupby series + grouped = ms1.groupby(ms1) + assert_groupby_equal(grouped.execute().fetch(), series1.groupby(series1)) + + series2 = pd.Series(data_dict["a"], index=["i" + str(i) for i in range(data_size)]) + ms2 = md.Series(series2, chunk_size=13) + grouped = ms2.groupby(lambda x: int(x[1:]) % 3) + assert_groupby_equal( + grouped.execute().fetch(), series2.groupby(lambda x: int(x[1:]) % 3) + ) + + +def test_groupby_getitem(setup): + rs = np.random.RandomState(0) + data_size = 100 + raw = pd.DataFrame( + { + "a": rs.randint(0, 10, size=(data_size,)), + "b": rs.randint(0, 10, size=(data_size,)), + "c": rs.choice(list("abcd"), size=(data_size,)), + }, + index=pd.MultiIndex.from_tuples( + [(i % 3, "i" + str(i)) for i in range(data_size)] + ), + ) + mdf = md.DataFrame(raw, chunk_size=13) + + r = mdf.groupby(level=0)[["a", "b"]] + assert_groupby_equal( + r.execute().fetch(), raw.groupby(level=0)[["a", "b"]], with_selection=True + ) + + for method in ("tree", "shuffle"): + r = 
mdf.groupby(level=0)[["a", "b"]].sum(method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby(level=0)[["a", "b"]].sum().sort_index(), + ) + + r = mdf.groupby(level=0)[["a", "b"]].apply(lambda x: x + 1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby(level=0)[["a", "b"]].apply(lambda x: x + 1).sort_index(), + ) + + r = mdf.groupby("b")[["a", "b"]] + assert_groupby_equal( + r.execute().fetch(), raw.groupby("b")[["a", "b"]], with_selection=True + ) + + r = mdf.groupby("b")[["a", "c"]] + assert_groupby_equal( + r.execute().fetch(), raw.groupby("b")[["a", "c"]], with_selection=True + ) + + for method in ("tree", "shuffle"): + r = mdf.groupby("b")[["a", "b"]].sum(method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "b"]].sum().sort_index(), + ) + + r = mdf.groupby("b")[["a", "b"]].agg(["sum", "count"], method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "b"]].agg(["sum", "count"]).sort_index(), + ) + + r = mdf.groupby("b")[["a", "c"]].agg(["sum", "count"], method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "c"]].agg(["sum", "count"]).sort_index(), + ) + + r = mdf.groupby("b")[["a", "b"]].apply(lambda x: x + 1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "b"]].apply(lambda x: x + 1).sort_index(), + ) + + r = mdf.groupby("b")[["a", "b"]].transform(lambda x: x + 1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "b"]].transform(lambda x: x + 1).sort_index(), + ) + + r = mdf.groupby("b")[["a", "b"]].cumsum() + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "b"]].cumsum().sort_index(), + ) + + r = mdf.groupby("b").a + assert_groupby_equal(r.execute().fetch(), raw.groupby("b").a, with_selection=True) + + for method in ("shuffle", "tree"): + r = mdf.groupby("b").a.sum(method=method) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), raw.groupby("b").a.sum().sort_index() + ) + + r = mdf.groupby("b").a.agg(["sum", "mean", "var"], method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b").a.agg(["sum", "mean", "var"]).sort_index(), + ) + + r = mdf.groupby("b", as_index=False).a.sum(method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values("b", ignore_index=True), + raw.groupby("b", as_index=False) + .a.sum() + .sort_values("b", ignore_index=True), + ) + + r = mdf.groupby("b", as_index=False).b.count(method=method) + result = r.execute().fetch().sort_values("b", ignore_index=True) + try: + expected = ( + raw.groupby("b", as_index=False) + .b.count() + .sort_values("b", ignore_index=True) + ) + except ValueError: + expected = raw.groupby("b").b.count().to_frame() + expected.index.names = [None] * expected.index.nlevels + expected = expected.sort_values("b", ignore_index=True) + pd.testing.assert_frame_equal(result, expected) + + r = mdf.groupby("b", as_index=False).b.agg({"cnt": "count"}, method=method) + result = r.execute().fetch().sort_values("b", ignore_index=True) + try: + expected = ( + raw.groupby("b", as_index=False) + .b.agg({"cnt": "count"}) + .sort_values("b", ignore_index=True) + ) + except ValueError: + expected = raw.groupby("b").b.agg({"cnt": "count"}).to_frame() + expected.index.names = [None] * expected.index.nlevels + 
expected = expected.sort_values("b", ignore_index=True) + pd.testing.assert_frame_equal(result, expected) + + r = mdf.groupby("b").a.apply(lambda x: x + 1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + raw.groupby("b").a.apply(lambda x: x + 1).sort_index(), + ) + + r = mdf.groupby("b").a.transform(lambda x: x + 1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + raw.groupby("b").a.transform(lambda x: x + 1).sort_index(), + ) + + r = mdf.groupby("b").a.cumsum() + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), raw.groupby("b").a.cumsum().sort_index() + ) + + # special test for selection key == 0 + raw = pd.DataFrame(rs.rand(data_size, 10)) + raw[0] = 0 + mdf = md.DataFrame(raw, chunk_size=13) + r = mdf.groupby(0, as_index=False)[0].agg({"cnt": "count"}, method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby(0, as_index=False)[0].agg({"cnt": "count"}), + ) + + # test groupby getitem then agg(#GH 2640) + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(0, 10, size=(100,)).astype(np.int64), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + "c4": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=20) + + r = mdf.groupby(["c2"])[["c1", "c3"]].agg({"c1": "max", "c3": "min"}, method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby(["c2"])[["c1", "c3"]].agg({"c1": "max", "c3": "min"}), + ) + + mdf = md.DataFrame(raw.copy(), chunk_size=30) + r = mdf.groupby(["c2"])[["c1", "c4"]].agg( + {"c1": "max", "c4": "mean"}, method="shuffle" + ) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby(["c2"])[["c1", "c4"]].agg({"c1": "max", "c4": "mean"}), + ) + + # test anonymous function lists + agg_funs = [lambda x: (x + 1).sum()] + r = mdf.groupby(["c2"])["c1"].agg(agg_funs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.groupby(["c2"])["c1"].agg(agg_funs) + ) + + # test group by multiple cols + r = mdf.groupby(["c1", "c2"], as_index=False)["c3"].sum() + expected = raw.groupby(["c1", "c2"], as_index=False)["c3"].sum() + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values(["c1", "c2"]).reset_index(drop=True), + expected.sort_values(["c1", "c2"]).reset_index(drop=True), + ) + + r = mdf.groupby(["c1", "c2"], as_index=False)["c3"].agg(["sum"]) + expected = raw.groupby(["c1", "c2"], as_index=False)["c3"].agg(["sum"]) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values(["c1", "c2"]), + expected.sort_values(["c1", "c2"]), + ) + + +def test_dataframe_groupby_agg(setup): + agg_funs = [ + "std", + "mean", + "var", + "max", + "count", + "size", + "all", + "any", + "skew", + "kurt", + "sem", + "nunique", + ] + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": np.arange(100).astype(np.int64), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=13) + + for method in ["tree", "shuffle"]: + for sort in [True, False]: + r = mdf.groupby("c2").agg("size", method=method) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg("size").sort_index(), + ) + + for agg_fun in agg_funs: + if agg_fun == "size": + continue + r = mdf.groupby("c2", sort=sort).agg(agg_fun, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg(agg_fun).sort_index(), + ) + + r = mdf.groupby("c2", sort=sort).agg(agg_funs, method=method) + 
pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg(agg_funs).sort_index(), + ) + + agg = OrderedDict([("c1", ["min", "mean"]), ("c3", "std")]) + r = mdf.groupby("c2", sort=sort).agg(agg, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg(agg).sort_index(), + ) + + agg = OrderedDict([("c1", "min"), ("c3", "sum")]) + r = mdf.groupby("c2", sort=sort).agg(agg, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg(agg).sort_index(), + ) + + r = mdf.groupby("c2", sort=sort).agg( + {"c1": "min", "c3": "min"}, method=method + ) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg({"c1": "min", "c3": "min"}).sort_index(), + ) + + r = mdf.groupby("c2", sort=sort).agg({"c1": "min"}, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg({"c1": "min"}).sort_index(), + ) + + # test groupby series + r = mdf.groupby(mdf["c2"], sort=sort).sum(method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby(raw["c2"]).sum().sort_index(), + ) + + r = mdf.groupby("c2").size(method="tree") + pd.testing.assert_series_equal(r.execute().fetch(), raw.groupby("c2").size()) + + # test inserted kurt method + r = mdf.groupby("c2").kurtosis(method="tree") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.groupby("c2").kurtosis()) + + for agg_fun in agg_funs: + if agg_fun == "size" or callable(agg_fun): + continue + r = getattr(mdf.groupby("c2"), agg_fun)(method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch(), getattr(raw.groupby("c2"), agg_fun)() + ) + + # test as_index=False + for method in ["tree", "shuffle"]: + r = mdf.groupby("c2", as_index=False).agg("size", method=method) + if _agg_size_as_frame: + result = r.execute().fetch().sort_values("c2", ignore_index=True) + expected = ( + raw.groupby("c2", as_index=False) + .agg("size") + .sort_values("c2", ignore_index=True) + ) + pd.testing.assert_frame_equal(result, expected) + else: + result = r.execute().fetch().sort_index() + expected = raw.groupby("c2", as_index=False).agg("size").sort_index() + pd.testing.assert_series_equal(result, expected) + + r = mdf.groupby("c2", as_index=False).agg("mean", method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values("c2", ignore_index=True), + raw.groupby("c2", as_index=False) + .agg("mean") + .sort_values("c2", ignore_index=True), + ) + assert r.op.groupby_params["as_index"] is False + + r = mdf.groupby(["c1", "c2"], as_index=False).agg("mean", method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values(["c1", "c2"], ignore_index=True), + raw.groupby(["c1", "c2"], as_index=False) + .agg("mean") + .sort_values(["c1", "c2"], ignore_index=True), + ) + assert r.op.groupby_params["as_index"] is False + + # test as_index=False takes no effect + r = mdf.groupby(["c1", "c2"], as_index=False).agg(["mean", "count"]) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby(["c1", "c2"], as_index=False).agg(["mean", "count"]), + ) + assert r.op.groupby_params["as_index"] is True + + r = mdf.groupby("c2").agg(["cumsum", "cumcount"]) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg(["cumsum", "cumcount"]).sort_index(), + ) + + r = mdf.groupby("c2").agg( + sum_c1=md.NamedAgg("c1", "sum"), + min_c1=md.NamedAgg("c1", "min"), + 
mean_c3=md.NamedAgg("c3", "mean"), + method="tree", + ) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby("c2").agg( + sum_c1=md.NamedAgg("c1", "sum"), + min_c1=md.NamedAgg("c1", "min"), + mean_c3=md.NamedAgg("c3", "mean"), + ), + ) + + +def test_dataframe_groupby_agg_sort(setup): + agg_funs = [ + "std", + "mean", + "var", + "max", + "count", + "size", + "all", + "any", + "skew", + "kurt", + "sem", + "nunique", + ] + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": np.arange(100).astype(np.int64), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=13) + + for method in ["tree", "shuffle"]: + r = mdf.groupby("c2").agg("size", method=method) + pd.testing.assert_series_equal( + r.execute().fetch(), raw.groupby("c2").agg("size") + ) + + for agg_fun in agg_funs: + if agg_fun == "size": + continue + r = mdf.groupby("c2").agg(agg_fun, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby("c2").agg(agg_fun), + ) + + r = mdf.groupby("c2").agg(agg_funs, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby("c2").agg(agg_funs), + ) + + agg = OrderedDict([("c1", ["min", "mean"]), ("c3", "std")]) + r = mdf.groupby("c2").agg(agg, method=method) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.groupby("c2").agg(agg)) + + agg = OrderedDict([("c1", "min"), ("c3", "sum")]) + r = mdf.groupby("c2").agg(agg, method=method) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.groupby("c2").agg(agg)) + + r = mdf.groupby("c2").agg({"c1": "min", "c3": "min"}, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby("c2").agg({"c1": "min", "c3": "min"}), + ) + + r = mdf.groupby("c2").agg({"c1": "min"}, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby("c2").agg({"c1": "min"}), + ) + + # test groupby series + r = mdf.groupby(mdf["c2"]).sum(method=method) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.groupby(raw["c2"]).sum()) + + r = mdf.groupby("c2").size(method="tree") + pd.testing.assert_series_equal(r.execute().fetch(), raw.groupby("c2").size()) + + # test inserted kurt method + r = mdf.groupby("c2").kurtosis(method="tree") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.groupby("c2").kurtosis()) + + for agg_fun in agg_funs: + if agg_fun == "size" or callable(agg_fun): + continue + r = getattr(mdf.groupby("c2"), agg_fun)(method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch(), getattr(raw.groupby("c2"), agg_fun)() + ) + + # test as_index=False takes no effect + r = mdf.groupby(["c1", "c2"], as_index=False).agg(["mean", "count"]) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby(["c1", "c2"], as_index=False).agg(["mean", "count"]), + ) + assert r.op.groupby_params["as_index"] is True + + +def test_series_groupby_agg(setup): + rs = np.random.RandomState(0) + series1 = pd.Series(rs.rand(10)) + ms1 = md.Series(series1, chunk_size=3) + + agg_funs = [ + "std", + "mean", + "var", + "max", + "count", + "size", + "all", + "any", + "skew", + "kurt", + "sem", + ] + + for method in ["tree", "shuffle"]: + for agg_fun in agg_funs: + r = ms1.groupby(lambda x: x % 2).agg(agg_fun, method=method) + pd.testing.assert_series_equal( + r.execute().fetch(), series1.groupby(lambda x: x % 2).agg(agg_fun) + ) + + r = ms1.groupby(lambda x: x % 2).agg(agg_funs, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch(), series1.groupby(lambda x: x 
% 2).agg(agg_funs) + ) + + # test groupby series + r = ms1.groupby(ms1).sum(method=method) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + series1.groupby(series1).sum().sort_index(), + ) + + r = ms1.groupby(ms1).sum(method=method) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + series1.groupby(series1).sum().sort_index(), + ) + + # test inserted kurt method + r = ms1.groupby(ms1).kurtosis(method="tree") + pd.testing.assert_series_equal( + r.execute().fetch(), series1.groupby(series1).kurtosis() + ) + + for agg_fun in agg_funs: + r = getattr(ms1.groupby(lambda x: x % 2), agg_fun)(method="tree") + pd.testing.assert_series_equal( + r.execute().fetch(), getattr(series1.groupby(lambda x: x % 2), agg_fun)() + ) + + r = ms1.groupby(lambda x: x % 2).agg(["cumsum", "cumcount"], method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 2).agg(["cumsum", "cumcount"]).sort_index(), + ) + + r = ms1.groupby(lambda x: x % 2).agg(col_var="var", col_skew="skew", method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch(), + series1.groupby(lambda x: x % 2).agg(col_var="var", col_skew="skew"), + ) + + +def test_groupby_agg_auto_method(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=20) + + def _disallow_reduce(ctx, op): + assert op.stage != OperandStage.reduce + op.execute(ctx, op) + + r = mdf.groupby("c2").agg("sum") + operand_executors = {DataFrameGroupByAgg: _disallow_reduce} + result = r.execute( + extra_config={"operand_executors": operand_executors, "check_all": False} + ).fetch() + pd.testing.assert_frame_equal(result.sort_index(), raw.groupby("c2").agg("sum")) + + r = mdf.groupby("c3").agg("min") + operand_executors = {DataFrameGroupByAgg: _disallow_reduce} + result = r.execute( + extra_config={"operand_executors": operand_executors, "check_all": False} + ).fetch() + pd.testing.assert_frame_equal(result.sort_index(), raw.groupby("c3").agg("min")) + + def _disallow_combine_and_agg(ctx, op): + assert op.stage != OperandStage.combine + op.execute(ctx, op) + + with option_context({"chunk_store_limit": 1}): + raw2 = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.rand(100), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw2, chunk_size=20) + r = mdf.groupby("c3").agg("min") + operand_executors = {DataFrameGroupByAgg: _disallow_combine_and_agg} + result = r.execute( + extra_config={"operand_executors": operand_executors, "check_all": False} + ).fetch() + pd.testing.assert_frame_equal( + result.sort_index(), raw2.groupby("c3").agg("min") + ) + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": list(range(4)) * 12, + "c2": rs.choice(["a", "b", "c"], (48,)), + "c3": rs.rand(48), + } + ) + + mdf = md.DataFrame(raw, chunk_size=8) + r = mdf.groupby("c1").agg("sum") + operand_executors = {DataFrameGroupByAgg: _disallow_reduce} + result = r.execute( + extra_config={"operand_executors": operand_executors, "check_all": False} + ).fetch() + pd.testing.assert_frame_equal(result.sort_index(), raw.groupby("c1").agg("sum")) + + +@pytest.mark.skip_ray_dag # _fetch_infos() is not supported by ray backend. 
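+# The chunk counts asserted via _fetch_infos()["memory_size"] below indicate whether
+# the tree or the shuffle strategy was chosen for the distributed groupby aggregation.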
+def test_distributed_groupby_agg(setup_cluster): + rs = np.random.RandomState(0) + raw = pd.DataFrame(rs.rand(50000, 10)) + df = md.DataFrame(raw, chunk_size=raw.shape[0] // 2) + with option_context({"chunk_store_limit": 1024**2}): + r = df.groupby(0).sum(combine_size=1) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, raw.groupby(0).sum()) + # test use shuffle + assert len(r._fetch_infos()["memory_size"]) > 1 + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=20) + r = mdf.groupby("c2").sum().execute() + pd.testing.assert_frame_equal(r.fetch(), raw.groupby("c2").sum()) + # test use tree + assert len(r._fetch_infos()["memory_size"]) == 1 + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=10) + with option_context({"chunk_store_limit": 2048}): + r = mdf.groupby("c2", sort=False).sum().execute() + pd.testing.assert_frame_equal( + r.fetch().sort_index(), raw.groupby("c2", sort=False).sum().sort_index() + ) + # use tree and shuffle + assert len(r._fetch_infos()["memory_size"]) == 3 + + +def test_groupby_agg_str_cat(setup): + agg_fun = lambda x: x.str.cat(sep="_", na_rep="NA") + + rs = np.random.RandomState(0) + raw_df = pd.DataFrame( + { + "a": rs.choice(["A", "B", "C"], size=(100,)), + "b": rs.choice([None, "alfa", "bravo", "charlie"], size=(100,)), + } + ) + + mdf = md.DataFrame(raw_df, chunk_size=13) + + r = mdf.groupby("a").agg(agg_fun, method="tree") + pd.testing.assert_frame_equal(r.execute().fetch(), raw_df.groupby("a").agg(agg_fun)) + + raw_series = pd.Series(rs.choice([None, "alfa", "bravo", "charlie"], size=(100,))) + + ms = md.Series(raw_series, chunk_size=13) + + r = ms.groupby(lambda x: x % 2).agg(agg_fun, method="tree") + pd.testing.assert_series_equal( + r.execute().fetch(), raw_series.groupby(lambda x: x % 2).agg(agg_fun) + ) + + +@require_cudf +def test_gpu_groupby_agg(setup_gpu): + rs = np.random.RandomState(0) + df1 = pd.DataFrame( + {"a": rs.choice([2, 3, 4], size=(100,)), "b": rs.choice([2, 3, 4], size=(100,))} + ) + mdf = md.DataFrame(df1, chunk_size=13).to_gpu() + + r = mdf.groupby("a").sum() + pd.testing.assert_frame_equal( + r.execute().fetch().to_pandas(), df1.groupby("a").sum() + ) + + r = mdf.groupby("a").kurt() + pd.testing.assert_frame_equal( + r.execute().fetch().to_pandas(), df1.groupby("a").kurt() + ) + + r = mdf.groupby("a").agg(["sum", "var"]) + pd.testing.assert_frame_equal( + r.execute().fetch().to_pandas(), df1.groupby("a").agg(["sum", "var"]) + ) + + rs = np.random.RandomState(0) + idx = pd.Index(np.where(rs.rand(10) > 0.5, "A", "B")) + series1 = pd.Series(rs.rand(10), index=idx) + ms = md.Series(series1, index=idx, chunk_size=3).to_gpu().to_gpu() + + r = ms.groupby(level=0).sum() + pd.testing.assert_series_equal( + r.execute().fetch().to_pandas(), series1.groupby(level=0).sum() + ) + + r = ms.groupby(level=0).kurt() + pd.testing.assert_series_equal( + r.execute().fetch().to_pandas(), series1.groupby(level=0).kurt() + ) + + r = ms.groupby(level=0).agg(["sum", "var"]) + pd.testing.assert_frame_equal( + r.execute().fetch().to_pandas(), series1.groupby(level=0).agg(["sum", "var"]) + ) + + +def test_groupby_apply(setup): + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": 
list("aabaaddce"), + } + ) + + def apply_df(df, ret_series=False): + df = df.sort_index() + df.a += df.b + if len(df.index) > 0: + if not ret_series: + df = df.iloc[:-1, :] + else: + df = df.iloc[-1, :] + return df + + def apply_series(s, truncate=True): + s = s.sort_index() + if truncate and len(s.index) > 0: + s = s.iloc[:-1] + return s + + mdf = md.DataFrame(df1, chunk_size=3) + + applied = mdf.groupby("b").apply(lambda df: None) + pd.testing.assert_frame_equal( + applied.execute().fetch(), df1.groupby("b").apply(lambda df: None) + ) + + applied = mdf.groupby("b").apply(apply_df) + pd.testing.assert_frame_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(apply_df).sort_index(), + ) + + applied = mdf.groupby("b").apply(apply_df, ret_series=True) + pd.testing.assert_frame_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(apply_df, ret_series=True).sort_index(), + ) + + applied = mdf.groupby("b").apply(lambda df: df.a, output_type="series") + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(lambda df: df.a).sort_index(), + ) + + applied = mdf.groupby("b").apply(lambda df: df.a.sum()) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(lambda df: df.a.sum()).sort_index(), + ) + + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms1 = md.Series(series1, chunk_size=3) + + applied = ms1.groupby(lambda x: x % 3).apply(lambda df: None) + pd.testing.assert_series_equal( + applied.execute().fetch(), + series1.groupby(lambda x: x % 3).apply(lambda df: None), + ) + + applied = ms1.groupby(lambda x: x % 3).apply(apply_series) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).apply(apply_series).sort_index(), + ) + + sindex2 = pd.MultiIndex.from_arrays([list(range(9)), list("ABCDEFGHI")]) + series2 = pd.Series(list("CDECEDABC"), index=sindex2) + ms2 = md.Series(series2, chunk_size=3) + + applied = ms2.groupby(lambda x: x[0] % 3).apply(apply_series) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + series2.groupby(lambda x: x[0] % 3).apply(apply_series).sort_index(), + ) + + +def test_groupby_apply_with_df_or_series_output(setup): + raw = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [6, 3, 3, 5, 6, 5, 4, 4, 4], + "c": list("aabaabbbb"), + } + ) + mdf = md.DataFrame(raw, chunk_size=3) + + def f1(df): + return df.a.iloc[2] + + with pytest.raises(TypeError): + mdf.groupby("c").apply(f1) + + with pytest.raises(ValueError): + mdf.groupby("c").apply(f1, output_types=["df_or_series"]).execute() + + for kwargs in [dict(output_type="df_or_series"), dict(skip_infer=True)]: + mdf = md.DataFrame(raw, chunk_size=5) + applied = mdf.groupby("c").apply(f1, **kwargs) + assert isinstance(applied, DATAFRAME_OR_SERIES_TYPE) + applied = applied.execute() + assert applied.data_type == "series" + assert not ("dtypes" in applied.data_params) + assert applied.shape == (2,) + pd.testing.assert_series_equal( + applied.fetch().sort_index(), raw.groupby("c").apply(f1).sort_index() + ) + + def f2(df): + return df[["a"]] + + mdf = md.DataFrame(raw, chunk_size=5) + applied = mdf.groupby("c").apply(f2, output_types=["df_or_series"]) + assert isinstance(applied, DATAFRAME_OR_SERIES_TYPE) + applied = applied.execute() + assert applied.data_type == "dataframe" + assert not ("dtype" in applied.data_params) + assert applied.shape == (9, 1) + expected = raw.groupby("c", 
as_index=True).apply(f2) + pd.testing.assert_series_equal(applied.dtypes, expected.dtypes) + pd.testing.assert_frame_equal(applied.fetch().sort_index(), expected.sort_index()) + + +def test_groupby_apply_closure(setup): + # DataFrame + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + } + ) + + x, y = 10, 11 + + def apply_closure_df(df): + return df["a"].max() * x + + def apply_closure_series(s): + return s.mean() * y + + class callable_df: + def __init__(self): + self.x = 10 + + def __call__(self, df): + return df["a"].max() * x + + class callable_series: + def __init__(self): + self.y = 11 + + def __call__(self, s): + return s.mean() * y + + mdf = md.DataFrame(df1, chunk_size=3) + + applied = mdf.groupby("b").apply(apply_closure_df) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(apply_closure_df).sort_index(), + ) + + cdf = callable_df() + applied = mdf.groupby("b").apply(cdf) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(cdf).sort_index(), + ) + + # Series + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms1 = md.Series(series1, chunk_size=3) + + applied = ms1.groupby(lambda x: x % 3).apply(apply_closure_series) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).apply(apply_closure_series).sort_index(), + ) + + cs = callable_series() + applied = ms1.groupby(lambda x: x % 3).apply(cs) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).apply(cs).sort_index(), + ) + + +def test_groupby_transform(setup): + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + "d": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "e": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "f": list("aabaaddce"), + } + ) + + def transform_series(s, truncate=True): + s = s.sort_index() + if truncate and len(s.index) > 1: + s = s.iloc[:-1].reset_index(drop=True) + return s + + mdf = md.DataFrame(df1, chunk_size=3) + + r = mdf.groupby("b").transform(transform_series, truncate=False) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + df1.groupby("b").transform(transform_series, truncate=False).sort_index(), + ) + + df2 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaabbba"), + } + ) + + def f(df): + if df.iloc[2]: + return df + else: + return df + df.max() + + mdf2 = md.DataFrame(df2, chunk_size=5) + with pytest.raises(TypeError): + mdf2.groupby("c").transform(f) + + r = mdf2.groupby("c").transform(f, skip_infer=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + df2.groupby("c").transform(f).sort_index(), + ) + + if pd.__version__ != "1.1.0": + r = mdf.groupby("b").transform(["cummax", "cumsum"], _call_agg=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + df1.groupby("b").agg(["cummax", "cumsum"]).sort_index(), + ) + + agg_list = ["cummax", "cumsum"] + r = mdf.groupby("b").transform(agg_list, _call_agg=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + df1.groupby("b").agg(agg_list).sort_index(), + ) + + agg_dict = OrderedDict([("d", "cummax"), ("b", "cumsum")]) + r = mdf.groupby("b").transform(agg_dict, _call_agg=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + 
df1.groupby("b").agg(agg_dict).sort_index(), + ) + + agg_list = ["sum", lambda s: s.sum()] + r = mdf.groupby("b").transform(agg_list, _call_agg=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").agg(agg_list).sort_index() + ) + + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms1 = md.Series(series1, chunk_size=3) + + r = ms1.groupby(lambda x: x % 3).transform(lambda x: x + 1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).transform(lambda x: x + 1).sort_index(), + ) + + r = ms1.groupby(lambda x: x % 3).transform("cummax", _call_agg=True) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).agg("cummax").sort_index(), + ) + + agg_list = ["cummax", "cumcount"] + r = ms1.groupby(lambda x: x % 3).transform(agg_list, _call_agg=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).agg(agg_list).sort_index(), + ) + + +def test_groupby_cum(setup): + df1 = pd.DataFrame( + { + "a": [3, 5, 2, 7, 1, 2, 4, 6, 2, 4], + "b": [8, 3, 4, 1, 8, 2, 2, 2, 2, 3], + "c": [1, 8, 8, 5, 3, 5, 0, 0, 5, 4], + } + ) + mdf = md.DataFrame(df1, chunk_size=3) + + for fun in ["cummin", "cummax", "cumprod", "cumsum"]: + r1 = getattr(mdf.groupby("b"), fun)() + pd.testing.assert_frame_equal( + r1.execute().fetch().sort_index(), + getattr(df1.groupby("b"), fun)().sort_index(), + ) + + r2 = getattr(mdf.groupby("b"), fun)(axis=1) + pd.testing.assert_frame_equal( + r2.execute().fetch().sort_index(), + getattr(df1.groupby("b"), fun)(axis=1).sort_index(), + ) + + r3 = mdf.groupby("b").cumcount() + pd.testing.assert_series_equal( + r3.execute().fetch().sort_index(), df1.groupby("b").cumcount().sort_index() + ) + + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms1 = md.Series(series1, chunk_size=3) + + for fun in ["cummin", "cummax", "cumprod", "cumsum", "cumcount"]: + r1 = getattr(ms1.groupby(lambda x: x % 2), fun)() + pd.testing.assert_series_equal( + r1.execute().fetch().sort_index(), + getattr(series1.groupby(lambda x: x % 2), fun)().sort_index(), + ) + + +def test_groupby_fill(setup): + df1 = pd.DataFrame( + [ + [1, 1, 10], + [1, 1, np.nan], + [1, 1, np.nan], + [1, 2, np.nan], + [1, 2, 20], + [1, 2, np.nan], + [1, 3, np.nan], + [1, 3, np.nan], + ], + columns=["one", "two", "three"], + ) + mdf = md.DataFrame(df1, chunk_size=3) + r1 = getattr(mdf.groupby(["one", "two"]), "ffill")() + pd.testing.assert_frame_equal( + r1.execute().fetch().sort_index(), + getattr(df1.groupby(["one", "two"]), "ffill")().sort_index(), + ) + + r2 = getattr(mdf.groupby("two"), "bfill")() + pd.testing.assert_frame_equal( + r2.execute().fetch().sort_index(), + getattr(df1.groupby("two"), "bfill")().sort_index(), + ) + + r3 = getattr(mdf.groupby("one"), "fillna")(5) + pd.testing.assert_frame_equal( + r3.execute().fetch().sort_index(), + getattr(df1.groupby("one"), "fillna")(5).sort_index(), + ) + + r4 = getattr(mdf.groupby("two"), "backfill")() + pd.testing.assert_frame_equal( + r4.execute().fetch().sort_index(), + getattr(df1.groupby("two"), "backfill")().sort_index(), + ) + + s1 = pd.Series([4, 3, 9, np.nan, np.nan, 7, 10, 8, 1, 6]) + ms1 = md.Series(s1, chunk_size=3) + + r1 = getattr(ms1.groupby(lambda x: x % 2), "ffill")() + pd.testing.assert_series_equal( + r1.execute().fetch().sort_index(), + getattr(s1.groupby(lambda x: x % 2), "ffill")().sort_index(), + ) + + r2 = getattr(ms1.groupby(lambda x: x % 2), "bfill")() + 
pd.testing.assert_series_equal( + r2.execute().fetch().sort_index(), + getattr(s1.groupby(lambda x: x % 2), "bfill")().sort_index(), + ) + + r4 = getattr(ms1.groupby(lambda x: x % 2), "backfill")() + pd.testing.assert_series_equal( + r4.execute().fetch().sort_index(), + getattr(s1.groupby(lambda x: x % 2), "backfill")().sort_index(), + ) + + +def test_groupby_head(setup): + df1 = pd.DataFrame( + { + "a": [3, 5, 2, 7, 1, 2, 4, 6, 2, 4], + "b": [8, 3, 4, 1, 8, 2, 2, 2, 2, 3], + "c": [1, 8, 8, 5, 3, 5, 0, 0, 5, 4], + "d": [9, 7, 6, 3, 6, 3, 2, 1, 5, 8], + } + ) + # test single chunk + mdf = md.DataFrame(df1) + + r = mdf.groupby("b").head(1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").head(1) + ) + r = mdf.groupby("b").head(-1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").head(-1) + ) + r = mdf.groupby("b")["a", "c"].head(1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b")["a", "c"].head(1) + ) + + # test multiple chunks + mdf = md.DataFrame(df1, chunk_size=3) + + r = mdf.groupby("b").head(1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").head(1) + ) + + r = mdf.groupby("b").head(-1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").head(-1) + ) + + # test head with selection + r = mdf.groupby("b")["a", "d"].head(1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b")["a", "d"].head(1) + ) + r = mdf.groupby("b")["c", "a", "d"].head(1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b")["c", "a", "d"].head(1) + ) + r = mdf.groupby("b")["c"].head(1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), df1.groupby("b")["c"].head(1) + ) + + # test single chunk + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms = md.Series(series1) + + r = ms.groupby(lambda x: x % 2).head(1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), series1.groupby(lambda x: x % 2).head(1) + ) + r = ms.groupby(lambda x: x % 2).head(-1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), series1.groupby(lambda x: x % 2).head(-1) + ) + + # test multiple chunk + ms = md.Series(series1, chunk_size=3) + + r = ms.groupby(lambda x: x % 2).head(1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), series1.groupby(lambda x: x % 2).head(1) + ) + + # test with special index + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3], index=[4, 1, 2, 3, 5, 8, 6, 7, 9]) + ms = md.Series(series1, chunk_size=3) + + r = ms.groupby(lambda x: x % 2).head(1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 2).head(1).sort_index(), + ) + + +def test_groupby_sample(setup): + rs = np.random.RandomState(0) + sample_count = 10 + src_data_list = [] + for b in range(5): + data_count = int(rs.randint(20, 100)) + src_data_list.append( + pd.DataFrame( + { + "a": rs.randint(0, 100, size=data_count), + "b": np.array([b] * data_count), + "c": rs.randint(0, 100, size=data_count), + "d": rs.randint(0, 100, size=data_count), + } + ) + ) + df1 = pd.concat(src_data_list) + shuffle_idx = np.arange(len(df1)) + rs.shuffle(shuffle_idx) + df1 = df1.iloc[shuffle_idx].reset_index(drop=True) + + # test single chunk + mdf = md.DataFrame(df1) + + r1 = mdf.groupby("b").sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").sample(sample_count, 
random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b").sample( + sample_count, weights=df1["c"] / df1["c"].sum(), random_state=rs + ) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").sample( + sample_count, weights=df1["c"] / df1["c"].sum(), random_state=rs + ) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b")[["b", "c"]].sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b")[["b", "c"]].sample(sample_count, random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert len(result1.columns) == 2 + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b").c.sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").c.sample(sample_count, random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_series_equal(result1, result2) + + r1 = mdf.groupby("b").c.sample(len(df1), random_state=rs) + result1 = r1.execute().fetch() + assert len(result1) == len(df1) + + with pytest.raises(ValueError): + r1 = mdf.groupby("b").c.sample(len(df1), random_state=rs, errors="raises") + r1.execute().fetch() + + # test multiple chunks + mdf = md.DataFrame(df1, chunk_size=47) + + r1 = mdf.groupby("b").sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").sample(sample_count, random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b").sample( + sample_count, weights=df1["c"] / df1["c"].sum(), random_state=rs + ) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").sample( + sample_count, weights=df1["c"] / df1["c"].sum(), random_state=rs + ) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b")[["b", "c"]].sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b")[["b", "c"]].sample(sample_count, random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert len(result1.columns) == 2 + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b").c.sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").c.sample(sample_count, random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_series_equal(result1, result2) + + r1 = mdf.groupby("b").c.sample(len(df1), random_state=rs) + result1 = r1.execute().fetch() + assert len(result1) == len(df1) + + with pytest.raises(ValueError): + r1 = mdf.groupby("b").c.sample(len(df1), random_state=rs, errors="raises") + r1.execute().fetch() + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_groupby_agg_with_arrow_dtype(setup): + df1 = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]}) + mdf = md.DataFrame(df1) + mdf["b"] = mdf["b"].astype("Arrow[string]") + + r = mdf.groupby("a").count() + result = r.execute().fetch() + expected = df1.groupby("a").count() + pd.testing.assert_frame_equal(result, expected) + + r = mdf.groupby("b").count() + result = 
r.execute().fetch() + result.index = result.index.astype(object) + expected = df1.groupby("b").count() + pd.testing.assert_frame_equal(result, expected) + + series1 = df1["b"] + mseries = md.Series(series1).astype("Arrow[string]") + + r = mseries.groupby(mseries).count() + result = r.execute().fetch() + result.index = result.index.astype(object) + expected = series1.groupby(series1).count() + pd.testing.assert_series_equal(result, expected) + + series2 = series1.copy() + series2.index = pd.MultiIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + mseries = md.Series(series2).astype("Arrow[string]") + + r = mseries.groupby(mseries).count() + result = r.execute().fetch() + result.index = result.index.astype(object) + expected = series2.groupby(series2).count() + pd.testing.assert_series_equal(result, expected) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_groupby_apply_with_arrow_dtype(setup): + df1 = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]}) + mdf = md.DataFrame(df1) + mdf["b"] = mdf["b"].astype("Arrow[string]") + + applied = mdf.groupby("b").apply(lambda df: df.a.sum()) + result = applied.execute().fetch() + result.index = result.index.astype(object) + expected = df1.groupby("b").apply(lambda df: df.a.sum()) + pd.testing.assert_series_equal(result, expected) + + series1 = df1["b"] + mseries = md.Series(series1).astype("Arrow[string]") + + applied = mseries.groupby(mseries).apply(lambda s: s) + result = applied.execute().fetch() + result.index = result.index.astype(np.int64) + expected = series1.groupby(series1).apply(lambda s: s) + pd.testing.assert_series_equal(arrow_array_to_objects(result), expected) + + +def test_groupby_nunique(setup): + rs = np.random.RandomState(0) + data_size = 100 + data_dict = { + "a": rs.randint(0, 10, size=(data_size,)), + "b": rs.choice(list("abcd"), size=(data_size,)), + "c": rs.choice(list("abcd"), size=(data_size,)), + } + df1 = pd.DataFrame(data_dict) + + # one chunk + mdf = md.DataFrame(df1) + pd.testing.assert_frame_equal( + mdf.groupby("c").nunique().execute().fetch().sort_index(), + df1.groupby("c").nunique().sort_index(), + ) + + # multiple chunks + mdf = md.DataFrame(df1, chunk_size=13) + pd.testing.assert_frame_equal( + mdf.groupby("b").nunique().execute().fetch().sort_index(), + df1.groupby("b").nunique().sort_index(), + ) + + # getitem and nunique + mdf = md.DataFrame(df1, chunk_size=13) + pd.testing.assert_series_equal( + mdf.groupby("b")["a"].nunique().execute().fetch().sort_index(), + df1.groupby("b")["a"].nunique().sort_index(), + ) + + # test with as_index=False + mdf = md.DataFrame(df1, chunk_size=13) + if _agg_size_as_frame: + pd.testing.assert_frame_equal( + mdf.groupby("b", as_index=False)["a"] + .nunique() + .execute() + .fetch() + .sort_values(by="b", ignore_index=True), + df1.groupby("b", as_index=False)["a"] + .nunique() + .sort_values(by="b", ignore_index=True), + ) diff --git a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_nunique_execution.py b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_nunique_execution.py new file mode 100644 index 000000000..f05deee11 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_nunique_execution.py @@ -0,0 +1,330 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pandas as pd +import pytest + + +from .... import dataframe as md + + +@pytest.fixture +def gen_data1(): + rs = np.random.RandomState(0) + data_size = 100 + data_dict = { + "a": rs.randint(0, 10, size=(data_size,)), + "b": rs.choice(list("abcd"), size=(data_size,)), + "c": rs.choice(list("abcd"), size=(data_size,)), + } + df = pd.DataFrame(data_dict) + yield df + + +@pytest.fixture +def gen_data2(): + rs = np.random.RandomState(0) + data_size = 100 + data_dict = { + "a": rs.randint(0, 10, size=(data_size,)), + "b": rs.choice(list("abcd"), size=(data_size,)), + "c": rs.choice(list("abcd"), size=(data_size,)), + "d": rs.randint(0, 10, size=(data_size,)), + } + df = pd.DataFrame(data_dict) + yield df + + +@pytest.fixture +def gen_data3(): + arrays = [ + ["Falcon", "Falcon", "Parrot", "Parrot"], + ["Captive", "Wild", "Captive", "Wild"], + ] + index = pd.MultiIndex.from_arrays(arrays, names=("Animal", "Type")) + df = pd.DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index) + yield df + + +def test_groupby_nunique_without_index(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + r1 = mdf.groupby("b", sort=False)[["a"]].nunique(method="tree").execute().fetch() + r2 = ( + mdf.groupby("b", sort=False)[["a"]] + .nunique(method="shuffle") + .execute() + .fetch() + .sort_index(level=0) + ) + r3 = ( + mdf.groupby("b", sort=False)[["a"]] + .nunique(method="auto") + .execute() + .fetch() + .sort_index(level=0) + ) + + expected = df.groupby("b", sort=False)[["a"]].nunique() + pd.testing.assert_frame_equal(r1, expected) + pd.testing.assert_frame_equal(r2, expected.sort_index(level=0)) + pd.testing.assert_frame_equal(r3, expected.sort_index(level=0)) + + +def test_groupby_nunique_with_index(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + + r1 = ( + mdf.groupby("b", as_index=False, sort=False)["a"] + .nunique(method="tree") + .execute() + .fetch() + ) + # shuffle cannot ensure its order + r2 = ( + mdf.groupby("b", as_index=False, sort=False)["a"] + .nunique(method="auto") + .execute() + .fetch() + .sort_values(by="b") + .reset_index(drop=True) + ) + r3 = ( + mdf.groupby("b", as_index=False, sort=False)["a"] + .nunique(method="shuffle") + .execute() + .fetch() + .sort_values(by="b") + .reset_index(drop=True) + ) + + expected = df.groupby("b", as_index=False, sort=False)["a"].nunique() + pd.testing.assert_frame_equal(r1, expected) + pd.testing.assert_frame_equal( + r2, expected.sort_values(by="b").reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + r3, expected.sort_values(by="b").reset_index(drop=True) + ) + + +def test_groupby_nunique_series(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + # When method = shuffle and output is series, mars has issue about that. + # Therefore, skip the case. 
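+ # Only the "tree" and "auto" methods are exercised for the Series result below.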
+ r1 = mdf.groupby("b", sort=False)["a"].nunique(method="tree").execute().fetch() + r2 = ( + mdf.groupby("b", sort=False)["a"] + .nunique(method="auto") + .execute() + .fetch() + .sort_index(level=0) + ) + + expected = df.groupby("b", sort=False)["a"].nunique() + pd.testing.assert_series_equal(r1, expected) + pd.testing.assert_series_equal(r2, expected.sort_index(level=0)) + + +def test_groupby_nunique_frame(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + + r1 = mdf.groupby("b", sort=False)["a", "c"].nunique(method="tree").execute().fetch() + r2 = ( + mdf.groupby("b", sort=False)["a", "c"] + .nunique(method="auto") + .execute() + .fetch() + .sort_values(by="b") + .reset_index() + ) + r3 = ( + mdf.groupby("b", sort=False)["a", "c"] + .nunique(method="shuffle") + .execute() + .fetch() + .sort_values(by="b") + .reset_index() + ) + + expected = df.groupby("b", sort=False)["a", "c"].nunique() + pd.testing.assert_frame_equal(r1, expected) + pd.testing.assert_frame_equal(r2, expected.sort_values(by="b").reset_index()) + pd.testing.assert_frame_equal(r3, expected.sort_values(by="b").reset_index()) + + +def test_groupby_nunique_with_sort(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + + r = mdf.groupby("b", sort=True)["a", "c"].nunique().execute().fetch() + + expected = df.groupby("b", sort=True)["a", "c"].nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(["b", "c"], sort=True)["a"].nunique().execute().fetch() + expected = df.groupby(["b", "c"], sort=True)["a"].nunique() + pd.testing.assert_series_equal(r, expected) + + +def test_groupby_nunique_multiindex(setup, gen_data2): + df = gen_data2 + mdf = md.DataFrame(df, chunk_size=13) + + r1 = ( + mdf.groupby(["b", "c"], sort=False)["a", "d"] + .nunique(method="tree") + .execute() + .fetch() + ) + r2 = ( + mdf.groupby(["b", "c"], sort=False)["a", "d"] + .nunique(method="shuffle") + .execute() + .fetch() + .sort_values(by=["b", "c"]) + .reset_index() + ) + r3 = ( + mdf.groupby(["b", "c"], sort=False)["a", "d"] + .nunique(method="auto") + .execute() + .fetch() + .sort_values(by=["b", "c"]) + .reset_index() + ) + + expected = df.groupby(["b", "c"], sort=False)["a", "d"].nunique() + pd.testing.assert_frame_equal(r1, expected) + pd.testing.assert_frame_equal(r2, expected.sort_values(by=["b", "c"]).reset_index()) + pd.testing.assert_frame_equal(r3, expected.sort_values(by=["b", "c"]).reset_index()) + + +def test_groupby_nunique_level(setup, gen_data1, gen_data3): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + + r = ( + mdf.groupby(level=0, as_index=False, sort=False)["a"] + .nunique() + .execute() + .fetch() + ) + + expected = df.groupby(level=0, as_index=False, sort=False)["a"].nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(level=0, sort=False)["a"].nunique().execute().fetch() + expected = df.groupby(level=0, sort=False)["a"].nunique() + pd.testing.assert_series_equal(r, expected, check_index=False) + + r = mdf.groupby(level=0, sort=False)["a", "b"].nunique().execute().fetch() + expected = df.groupby(level=0, sort=False)["a", "b"].nunique() + pd.testing.assert_frame_equal( + r.reset_index(drop=True), expected.reset_index(drop=True) + ) + + df2 = gen_data3 + mdf2 = md.DataFrame(df2, chunk_size=2) + r = mdf2.groupby(level="Type", sort=False).nunique().execute().fetch() + expected = df2.groupby(level="Type", sort=False).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf2.groupby(level=["Animal", "Type"], 
sort=False).nunique().execute().fetch() + expected = df2.groupby(level=["Animal", "Type"], sort=False).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf2.groupby(level=(0, 1), sort=False).nunique().execute().fetch() + expected = df2.groupby(level=(0, 1), sort=False).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf2.groupby(level=["Type", "Animal"]).nunique().execute().fetch() + expected = df2.groupby(level=["Type", "Animal"]).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = ( + mdf2.groupby(level=(0, 1), sort=False) + .nunique(method="shuffle") + .execute() + .fetch() + ) + expected = df2.groupby(level=(0, 1), sort=False).nunique() + pd.testing.assert_frame_equal(r.sort_index(), expected.sort_index()) + + r = mdf2.groupby(level=["Type", "Animal"]).nunique(method="tree").execute().fetch() + expected = df2.groupby(level=["Type", "Animal"]).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf2.groupby(level=["Type", "Animal"]).nunique(method="auto").execute().fetch() + expected = df2.groupby(level=["Type", "Animal"]).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = ( + mdf2.groupby(level=["Type", "Animal"], sort=False) + .nunique(method="shuffle") + .execute() + .fetch() + ) + expected = df2.groupby(level=["Type", "Animal"]).nunique() + pd.testing.assert_frame_equal(r.sort_index(), expected.sort_index()) + + +def test_groupby_agg_nunique(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + + r = mdf.groupby(["b", "c"]).agg("nunique").execute().fetch() + expected = df.groupby(["b", "c"]).agg("nunique") + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(["b", "c"]).agg(["nunique"], method="tree").execute().fetch() + expected = df.groupby(["b", "c"]).agg(["nunique"]) + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(["b", "c"]).agg(["nunique"], method="auto").execute().fetch() + expected = df.groupby(["b", "c"]).agg(["nunique"]) + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(["b", "c"]).agg(["nunique"], method="shuffle").execute().fetch() + expected = df.groupby(["b", "c"]).agg(["nunique"]) + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(["b", "c"], as_index=False).agg("nunique").execute().fetch() + expected = df.groupby(["b", "c"], as_index=False).agg("nunique") + pd.testing.assert_frame_equal(r, expected) + + r = ( + mdf.groupby(["b", "c"], as_index=False, sort=False) + .agg("nunique") + .execute() + .fetch() + ) + expected = df.groupby(["b", "c"], as_index=False, sort=False).agg("nunique") + pd.testing.assert_frame_equal(r, expected) + + is_sort = [True, False] + methods = ["auto", "shuffle", "tree"] + for sort in is_sort: + for method in methods: + r = ( + mdf.groupby("b", sort=sort) + .agg(["sum", "nunique"], method=method) + .execute() + .fetch() + ) + expected = df.groupby("b", sort=sort).agg(["sum", "nunique"]) + pd.testing.assert_frame_equal(r.sort_index(), expected.sort_index()) diff --git a/python/xorbits/_mars/dataframe/groupby/transform.py b/python/xorbits/_mars/dataframe/groupby/transform.py new file mode 100644 index 000000000..d2050faa2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/transform.py @@ -0,0 +1,374 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import OutputType +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import AnyField, BoolField, DictField, TupleField +from ...utils import enter_current_session, quiet_stdio +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, build_empty_series, parse_index + + +class GroupByTransform(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.TRANSFORM + _op_module_ = "dataframe.groupby" + + _func = AnyField("func") + _args = TupleField("args") + _kwds = DictField("kwds") + + _call_agg = BoolField("call_agg") + + def __init__( + self, func=None, args=None, kwds=None, call_agg=None, output_types=None, **kw + ): + super().__init__( + _func=func, + _args=args, + _kwds=kwds, + _call_agg=call_agg, + _output_types=output_types, + **kw, + ) + + @property + def func(self): + return self._func + + @property + def args(self): + return getattr(self, "_args", None) or () + + @property + def kwds(self): + return getattr(self, "_kwds", None) or dict() + + @property + def call_agg(self): + return self._call_agg + + def _infer_df_func_returns(self, in_groupby, dtypes, index): + index_value, output_types, new_dtypes = None, None, None + + output_types = ( + [OutputType.dataframe] + if in_groupby.op.output_types[0] == OutputType.dataframe_groupby + else [OutputType.series] + ) + + try: + mock_groupby = in_groupby.op.build_mock_groupby() + with np.errstate(all="ignore"), quiet_stdio(): + if self.call_agg: + infer_df = mock_groupby.agg(self.func, *self.args, **self.kwds) + else: + infer_df = mock_groupby.transform( + self.func, *self.args, **self.kwds + ) + + # todo return proper index when sort=True is implemented + index_value = parse_index(None, in_groupby.key, self.func) + + if isinstance(infer_df, pd.DataFrame): + output_types = [OutputType.dataframe] + new_dtypes = new_dtypes or infer_df.dtypes + else: + output_types = [OutputType.series] + new_dtypes = new_dtypes or (infer_df.name, infer_df.dtype) + except: # noqa: E722 # nosec + pass + + self.output_types = output_types if not self.output_types else self.output_types + dtypes = new_dtypes if dtypes is None else dtypes + index_value = index_value if index is None else parse_index(index) + return dtypes, index_value + + def __call__( + self, groupby, dtypes=None, dtype=None, name=None, index=None, skip_infer=None + ): + in_df = groupby.inputs[0] + + if dtypes is None and dtype is not None: + dtypes = (name, dtype) + if skip_infer: + dtypes, index_value = None, None + self.output_types = ( + [OutputType.dataframe] + if groupby.op.output_types[0] == OutputType.dataframe_groupby + else [OutputType.series] + ) + else: + dtypes, index_value = self._infer_df_func_returns(groupby, dtypes, index) + for arg, desc in zip( + (self.output_types, dtypes), ("output_types", "dtypes") + ): + if arg is None: + raise TypeError( + f"Cannot determine {desc} by calculating with enumerate data, " + "please specify it as arguments" + ) + if index_value is None: + index_value = 
parse_index(None, (in_df.key, in_df.index_value.key)) + + if self.output_types[0] == OutputType.dataframe: + new_shape = ( + np.nan if self.call_agg else in_df.shape[0], + len(dtypes) if dtypes is not None else np.nan, + ) + columns_value = ( + parse_index(dtypes.index, store_data=True) + if dtypes is not None + else None + ) + return self.new_dataframe( + [groupby], + shape=new_shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + else: + name, dtype = dtypes + new_shape = (np.nan,) if self.call_agg else groupby.shape + return self.new_series( + [groupby], + name=name, + shape=new_shape, + dtype=dtype, + index_value=index_value, + ) + + @classmethod + def tile(cls, op): + in_groupby = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + if op.output_types[0] == OutputType.dataframe: + chunk_shape = ( + np.nan, + len(out_df.dtypes) if out_df.dtypes is not None else np.nan, + ) + else: + chunk_shape = (np.nan,) + for c in in_groupby.chunks: + inp_chunks = [c] + + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + if op.output_types[0] == OutputType.dataframe: + new_index = c.index if c.ndim == 2 else c.index + (0,) + chunks.append( + new_op.new_chunk( + inp_chunks, + index=new_index, + shape=chunk_shape, + dtypes=out_df.dtypes, + columns_value=out_df.columns_value, + index_value=out_df.index_value, + ) + ) + else: + chunks.append( + new_op.new_chunk( + inp_chunks, + name=out_df.name, + index=(c.index[0],), + shape=chunk_shape, + dtype=out_df.dtype, + index_value=out_df.index_value, + ) + ) + + new_op = op.copy() + kw = out_df.params.copy() + kw["chunks"] = chunks + if op.output_types[0] == OutputType.dataframe: + kw["nsplits"] = ( + (np.nan,) * len(chunks), + (len(out_df.dtypes) if out_df.dtypes is not None else np.nan,), + ) + else: + kw["nsplits"] = ((np.nan,) * len(chunks),) + return new_op.new_tileables([in_groupby], **kw) + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + out_chunk = op.outputs[0] + + if in_data is None: + if op.output_types[0] == OutputType.dataframe: + ctx[op.outputs[0].key] = build_empty_df( + out_chunk.dtypes, index=out_chunk.index_value.to_pandas() + ) + else: + ctx[op.outputs[0].key] = build_empty_series( + out_chunk.dtype, + name=out_chunk.name, + index=out_chunk.index_value.to_pandas(), + ) + return + + if op.call_agg: + result = in_data.agg(op.func, *op.args, **op.kwds) + elif in_data.shape[0] > 0: + # cannot perform groupby-transform over empty dataframe + result = in_data.transform(op.func, *op.args, **op.kwds) + else: + if out_chunk.ndim == 2: + result = pd.DataFrame(columns=out_chunk.dtypes.index) + else: + result = pd.Series([], name=out_chunk.name, dtype=out_chunk.dtype) + + if result.ndim == 2: + if out_chunk.dtypes is not None: + result = result.astype(out_chunk.dtypes, copy=False) + else: + if out_chunk.dtype is not None: + result = result.astype(out_chunk.dtype, copy=False) + ctx[op.outputs[0].key] = result + + +def groupby_transform( + groupby, + f, + *args, + dtypes=None, + dtype=None, + name=None, + index=None, + output_types=None, + skip_infer=False, + **kwargs, +): + """ + Call function producing a like-indexed DataFrame on each group and + return a DataFrame having the same indexes as the original object + filled with the transformed values + + Parameters + ---------- + f : function + Function to apply to each group. + + dtypes : Series, default None + Specify dtypes of returned DataFrames. 
See `Notes` for more details. + + dtype : numpy.dtype, default None + Specify dtype of returned Series. See `Notes` for more details. + + name : str, default None + Specify name of returned Series. See `Notes` for more details. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + *args + Positional arguments to pass to func + + **kwargs + Keyword arguments to be passed into func. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.groupby.apply + DataFrame.groupby.aggregate + DataFrame.transform + + Notes + ----- + Each group is endowed the attribute 'name' in case you need to know + which group you are working on. + + The current implementation imposes three requirements on f: + + * f must return a value that either has the same shape as the input + subframe or can be broadcast to the shape of the input subframe. + For example, if `f` returns a scalar it will be broadcast to have the + same shape as the input subframe. + * if this is a DataFrame, f must support application column-by-column + in the subframe. If f also supports application to the entire subframe, + then a fast path is used starting from the second chunk. + * f must not mutate groups. Mutation is not supported and may + produce unexpected results. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock grouped object, and the transform call + may fail. + + * For DataFrame output, you need to specify a list or a pandas Series + as ``dtypes`` of output DataFrame. ``index`` of output can also be + specified. + * For Series output, you need to specify ``dtype`` and ``name`` of + output Series. + + Examples + -------- + + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + ... 'foo', 'bar'], + ... 'B' : ['one', 'one', 'two', 'three', + ... 'two', 'two'], + ... 'C' : [1, 5, 5, 2, 5, 5], + ... 'D' : [2.0, 5., 8., 1., 2., 9.]}) + >>> grouped = df.groupby('A') + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()).execute() + C D + 0 -1.154701 -0.577350 + 1 0.577350 0.000000 + 2 0.577350 1.154701 + 3 -1.154701 -1.000000 + 4 0.577350 -0.577350 + 5 0.577350 1.000000 + + Broadcast result of the transformation + + >>> grouped.transform(lambda x: x.max() - x.min()).execute() + C D + 0 4 6.0 + 1 3 8.0 + 2 4 6.0 + 3 3 8.0 + 4 4 6.0 + 5 3 8.0 + """ + call_agg = kwargs.pop("_call_agg", False) + if not call_agg and isinstance(f, (dict, list)): + raise TypeError(f"Does not support transform with {type(f)}") + + op = GroupByTransform( + func=f, args=args, kwds=kwargs, output_types=output_types, call_agg=call_agg + ) + return op( + groupby, + dtypes=dtypes, + dtype=dtype, + name=name, + index=index, + skip_infer=skip_infer, + ) diff --git a/python/xorbits/_mars/dataframe/indexing/__init__.py b/python/xorbits/_mars/dataframe/indexing/__init__.py new file mode 100644 index 000000000..10805c3a4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/__init__.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +def _install(): + from pandas.util import cache_readonly + + from ..operands import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE + from .add_prefix_suffix import ( + df_add_prefix, + df_add_suffix, + series_add_prefix, + series_add_suffix, + ) + from .align import align + from .at import at + from .getitem import dataframe_getitem, series_getitem + from .iat import iat + from .iloc import head, iloc, index_getitem, index_setitem, tail + from .insert import df_insert + from .loc import loc + from .reindex import reindex, reindex_like + from .rename import df_rename, index_rename, index_set_names, series_rename + from .rename_axis import rename_axis + from .reset_index import df_reset_index, series_reset_index + from .sample import sample + from .set_axis import df_set_axis, series_set_axis + from .set_index import set_index + from .setitem import dataframe_setitem + from .where import mask, where + + for cls in DATAFRAME_TYPE + SERIES_TYPE: + setattr(cls, "iloc", cache_readonly(iloc)) + setattr(cls, "loc", cache_readonly(loc)) + setattr(cls, "iat", cache_readonly(iat)) + setattr(cls, "at", cache_readonly(at)) + setattr(cls, "head", head) + setattr(cls, "reindex", reindex) + setattr(cls, "reindex_like", reindex_like) + setattr(cls, "rename_axis", rename_axis) + setattr(cls, "tail", tail) + setattr(cls, "mask", mask) + setattr(cls, "where", where) + setattr(cls, "sample", sample) + + for cls in DATAFRAME_TYPE: + setattr(cls, "set_index", set_index) + setattr(cls, "__getitem__", dataframe_getitem) + setattr(cls, "__setitem__", dataframe_setitem) + setattr(cls, "insert", df_insert) + setattr(cls, "reset_index", df_reset_index) + setattr(cls, "rename", df_rename) + setattr(cls, "set_axis", df_set_axis) + setattr(cls, "add_prefix", df_add_prefix) + setattr(cls, "add_suffix", df_add_suffix) + setattr(cls, "align", align) + + for cls in SERIES_TYPE: + setattr(cls, "__getitem__", series_getitem) + setattr(cls, "reset_index", series_reset_index) + setattr(cls, "rename", series_rename) + setattr(cls, "set_axis", series_set_axis) + setattr(cls, "add_prefix", series_add_prefix) + setattr(cls, "add_suffix", series_add_suffix) + setattr(cls, "align", align) + + for cls in INDEX_TYPE: + setattr(cls, "__getitem__", index_getitem) + setattr(cls, "__setitem__", index_setitem) + setattr(cls, "rename", index_rename) + setattr(cls, "set_names", index_set_names) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/indexing/add_prefix_suffix.py b/python/xorbits/_mars/dataframe/indexing/add_prefix_suffix.py new file mode 100644 index 000000000..ed87cea1a --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/add_prefix_suffix.py @@ -0,0 +1,110 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
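The ``_install()`` hook above wires the indexing accessors (``iloc``, ``loc``, ``at``, ``iat``, ``head``, ``reindex`` and friends) onto the DataFrame, Series and Index types at import time via ``setattr``. A minimal, self-contained sketch of that registration pattern, using toy stand-in classes rather than the actual Mars types:

# Hypothetical stand-ins used purely for illustration; the real code patches
# DATAFRAME_TYPE / SERIES_TYPE / INDEX_TYPE and wraps accessors such as `iloc`
# in `cache_readonly`.
class ToyDataFrame:
    pass


class ToySeries:
    pass


def head(obj, n=5):
    # placeholder for the real accessor implementation
    return f"first {n} rows of {type(obj).__name__}"


def _install():
    for cls in (ToyDataFrame, ToySeries):
        setattr(cls, "head", head)


_install()
del _install

print(ToyDataFrame().head(3))  # -> first 3 rows of ToyDataFrame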
+ +import textwrap +from functools import partial + + +def _get_prefix_suffix_docs(is_prefix: bool): + if is_prefix: + action, pos = "prefix", "before" + r_action = "suffix" + else: + action, pos = "suffix", "after" + r_action = "prefix" + + def mk_col(ch: str, s: str): + return f"{ch}_{s}" if is_prefix else f"{s}_{ch}" + + doc = f""" + {action.capitalize()} labels with string `{action}`. + + For Series, the row labels are {action}ed. + For DataFrame, the column labels are {action}ed. + + Parameters + ---------- + {action} : str + The string to add {pos} each label. + + Returns + ------- + Series or DataFrame + New Series or DataFrame with updated labels. + + See Also + -------- + Series.add_{r_action}: Suffix row labels with string `{r_action}`. + DataFrame.add_{r_action}: Suffix column labels with string `{r_action}`. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3, 4]) + >>> s.execute() + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.add_prefix({mk_col('item', '')!r}).execute() + {mk_col('item', '0')} 1 + {mk_col('item', '1')} 2 + {mk_col('item', '2')} 3 + {mk_col('item', '3')} 4 + dtype: int64 + + >>> df = md.DataFrame({{'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}}) + >>> df.execute() + A B + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + + >>> df.add_prefix({mk_col('col', '')!r}).execute() + {mk_col('col', 'A')} {mk_col('col', 'B')} + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + """ + return textwrap.dedent(doc).strip() + + +def df_add_prefix(df, prefix): + f = partial("{prefix}{}".format, prefix=prefix) + return df.rename(columns=f) + + +def series_add_prefix(series, prefix): + f = partial("{prefix}{}".format, prefix=prefix) + return series.rename(index=f) + + +def df_add_suffix(df, suffix): + f = partial("{}{suffix}".format, suffix=suffix) + return df.rename(columns=f) + + +def series_add_suffix(series, suffix): + f = partial("{}{suffix}".format, suffix=suffix) + return series.rename(index=f) + + +df_add_prefix.__doc__ = _get_prefix_suffix_docs(True) +series_add_prefix.__doc__ = df_add_prefix.__doc__ +df_add_suffix.__doc__ = _get_prefix_suffix_docs(False) +series_add_suffix.__doc__ = df_add_suffix.__doc__ diff --git a/python/xorbits/_mars/dataframe/indexing/align.py b/python/xorbits/_mars/dataframe/indexing/align.py new file mode 100644 index 000000000..c12b4ed8d --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/align.py @@ -0,0 +1,554 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Optional, Union + +import numpy as np + +from ... 
import opcodes +from ...core import OutputType, get_output_types, recursive_tile +from ...serialization.serializables import ( + AnyField, + Int16Field, + Int64Field, + KeyField, + StringField, +) +from ...typing import TileableType +from ..align import ( + align_dataframe_dataframe, + align_dataframe_series, + align_series_series, +) +from ..core import IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, parse_index, validate_axis + + +class _NoNeedToAlign(Exception): + pass + + +class DataFrameAlign(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.ALIGN + + lhs = KeyField("lhs") + rhs = KeyField("rhs") + join = StringField("join", default=None) + axis = Int16Field("axis", default=None) + level = AnyField("level", default=None) + fill_value = AnyField("fill_value", default=None) + method = StringField("method", default=None) + limit = Int64Field("limit", default=None) + fill_axis = Int16Field("fill_axis", default=None) + broadcast_axis = Int16Field("broadcast_axis", default=None) + + @property + def output_limit(self) -> int: + return 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.lhs = inputs[0] + self.rhs = inputs[1] + + def __call__(self, lhs: TileableType, rhs: TileableType): + if self.broadcast_axis != 1 or lhs.ndim == rhs.ndim: + self._output_types = get_output_types(lhs, rhs) + else: + self._output_types = [OutputType.dataframe, OutputType.dataframe] + + if lhs.ndim == rhs.ndim: + if lhs.ndim == 1: + return self._call_series_series(lhs, rhs) + else: + return self._call_dataframe_dataframe(lhs, rhs) + else: + if lhs.ndim == 1: + # join order need to be reversed if not symmetric + asym_joins = {"left", "right"} - {self.join} + if len(asym_joins) == 1: # self.join in {"left", "right"} + self.join = asym_joins.pop() + # need to put dataframe first + self._output_types = get_output_types(rhs, lhs) + return self._call_dataframe_series(rhs, lhs)[::-1] + else: + return self._call_dataframe_series(lhs, rhs) + + def _call_dataframe_dataframe(self, lhs: TileableType, rhs: TileableType): + l_shape = list(lhs.shape) + r_shape = list(rhs.shape) + if self.axis is None or self.axis == 0: + l_idx_val = r_idx_val = self._merge_index( + lhs.index_value, rhs.index_value, how=self.join + ) + l_shape[0] = r_shape[0] = np.nan + else: + l_idx_val, r_idx_val = lhs.index_value, rhs.index_value + + if self.axis is None or self.axis == 1: + l_empty = build_empty_df(lhs.dtypes) + r_empty = build_empty_df(rhs.dtypes) + aligned, _ = l_empty.align(r_empty, axis=1) + l_dtypes = r_dtypes = aligned.dtypes + l_col_val = r_col_val = parse_index(aligned.columns, store_data=True) + l_shape[1] = r_shape[1] = len(l_dtypes) + else: + l_dtypes, r_dtypes = lhs.dtypes, rhs.dtypes + l_col_val, r_col_val = lhs.columns_value, rhs.columns_value + + l_kws = { + "index_value": l_idx_val, + "dtypes": l_dtypes, + "shape": tuple(l_shape), + "columns_value": l_col_val, + } + r_kws = { + "index_value": r_idx_val, + "dtypes": r_dtypes, + "shape": tuple(r_shape), + "columns_value": r_col_val, + } + return self.new_tileables([lhs, rhs], kws=[l_kws, r_kws]) + + def _call_dataframe_series(self, lhs: TileableType, rhs: TileableType): + l_shape = list(lhs.shape) + if self.axis == 0 or self.broadcast_axis == 1: + dtypes = lhs.dtypes + col_val = lhs.columns_value + l_idx_val = r_idx_val = self._merge_index( + lhs.index_value, rhs.index_value, how=self.join + ) + l_shape[0] = r_size = np.nan + else: + l_idx_val = lhs.index_value + if not 
rhs.index_value.has_value(): + dtypes = None + l_shape[1] = r_size = np.nan + col_val = r_idx_val = self._merge_index( + lhs.columns_value, rhs.index_value, how=self.join + ) + else: + series_index = rhs.index_value.to_pandas() + dtypes = lhs.dtypes.reindex( + lhs.dtypes.index.join(series_index, how=self.join) + ).fillna(np.dtype(np.float_)) + l_shape[1] = r_size = len(dtypes) + col_val = r_idx_val = parse_index(dtypes.index, store_data=True) + + l_kws = { + "index_value": l_idx_val, + "dtypes": dtypes, + "shape": tuple(l_shape), + "columns_value": col_val, + } + if self.broadcast_axis == 1: + r_kws = { + "index_value": r_idx_val, + "dtypes": dtypes, + "shape": tuple(l_shape), + "columns_value": col_val, + } + else: + r_kws = { + "index_value": r_idx_val, + "shape": (r_size,), + "dtype": rhs.dtype, + } + return self.new_tileables([lhs, rhs], kws=[l_kws, r_kws]) + + def _call_series_series(self, lhs: TileableType, rhs: TileableType): + idx = self._merge_index(lhs.index_value, rhs.index_value, how=self.join) + kws = [ + {"index_value": idx, "shape": (np.nan,), "dtype": lhs.dtype}, + {"index_value": idx, "shape": (np.nan,), "dtype": rhs.dtype}, + ] + return self.new_tileables([lhs, rhs], kws=kws) + + @staticmethod + def _merge_index( + left_index_value: IndexValue, right_index_value: IndexValue, how: str = "outer" + ): + left_pd = left_index_value.to_pandas() + right_pd = right_index_value.to_pandas() + + if not left_index_value.has_value() or not right_index_value.has_value(): + left_pd = left_pd[:0] + right_pd = right_pd[:0] + store_data = False + else: + store_data = True + + joined = left_pd.join(right_pd, how=how) + if store_data: + return parse_index(joined, store_data=store_data) + else: + return parse_index( + joined, + {left_index_value.key, right_index_value.key}, + store_data=store_data, + ) + + @classmethod + def _select_nsplits( + cls, op: "DataFrameAlign", tileable: TileableType, val_to_replace: list + ): + if op.axis is None: + return val_to_replace[: tileable.ndim] + else: + attr_val = tileable.nsplits + axis = op.axis % tileable.ndim + return [ + tuple(val_to_replace[op.axis]) if i == axis else attr_val[i] + for i in range(len(attr_val)) + ] + + @classmethod + def _build_tiled_kw( + cls, op: "DataFrameAlign", idx: int, chunks: list, nsplits: list + ): + in_tileable = op.inputs[idx] + out_tileable = op.outputs[idx] + kw = out_tileable.params.copy() + kw.update( + { + "chunks": chunks, + "nsplits": tuple(cls._select_nsplits(op, in_tileable, nsplits)), + } + ) + return kw + + @classmethod + def _check_align_needed( + cls, op: "DataFrameAlign", left_chunks: list, right_chunks: list + ): + lhs, rhs = op.lhs, op.rhs + if all(lc.key == rc.key for lc, rc in zip(lhs.chunks, left_chunks)) and all( + lc.key == rc.key for lc, rc in zip(rhs.chunks, right_chunks) + ): + raise _NoNeedToAlign + + @classmethod + def _tile_dataframe_dataframe(cls, op: "DataFrameAlign"): + lhs, rhs = op.lhs, op.rhs + nsplits, chunk_shapes, left_chunks, right_chunks = align_dataframe_dataframe( + lhs, rhs, axis=op.axis + ) + cls._check_align_needed(op, left_chunks, right_chunks) + + left_chunk_array = np.array(left_chunks, dtype="O").reshape(chunk_shapes[0]) + right_chunk_array = np.array(right_chunks, dtype="O").reshape(chunk_shapes[1]) + + left_idx_to_chunk = dict() + l_chunks, r_chunks = [], [] + + iterator = np.nditer(right_chunk_array, flags=["refs_ok", "multi_index"]) + for rc_obj in iterator: + rc = rc_obj.tolist() + r_index = iterator.multi_index + l_index = tuple(r_index[i] % chunk_shapes[0][i] for i in 
(0, 1)) + lc = left_chunk_array[l_index] + + kws = [lc.params, rc.params] + kws[0]["index"] = l_index + kws[1]["index"] = r_index + + chunk_op = op.copy().reset_key() + l_chunk, r_chunk = chunk_op.new_chunks([lc, rc], kws=kws) + left_idx_to_chunk[l_index] = l_chunk + r_chunks.append(r_chunk) + + iterator = np.nditer(left_chunk_array, flags=["refs_ok", "multi_index"]) + for lc_obj in iterator: + lc = lc_obj.tolist() + l_index = iterator.multi_index + try: + l_chunk = left_idx_to_chunk[l_index] + l_chunks.append(l_chunk) + continue + except KeyError: + pass + + r_index = tuple(l_index[i] % chunk_shapes[1][i] for i in (0, 1)) + rc = right_chunk_array[r_index] + + kws = [lc.params, rc.params] + kws[0]["index"] = l_index + + chunk_op = op.copy().reset_key() + l_chunk, _r_chunk = chunk_op.new_chunks([lc, rc], kws=kws) + l_chunks.append(l_chunk) + + return nsplits, l_chunks, r_chunks + + @classmethod + def _tile_dataframe_series(cls, op: "DataFrameAlign"): + lhs, rhs = op.lhs, op.rhs + nsplits, left_chunk_shape, left_chunks, right_chunks = align_dataframe_series( + lhs, rhs, axis=op.axis + ) + cls._check_align_needed(op, left_chunks, right_chunks) + + left_chunk_array = np.array(left_chunks, dtype="O").reshape(left_chunk_shape) + axis = op.axis if op.broadcast_axis != 1 else 0 + l_chunks, r_chunks = [], [] + iterator = np.nditer(left_chunk_array, flags=["refs_ok", "multi_index"]) + for c_obj in iterator: + c = c_obj.tolist() + l_index = iterator.multi_index + + right_chunk = right_chunks[l_index[axis]] + kws = [c.params, right_chunk.params] + kws[0]["index"] = l_index + if op.broadcast_axis != 1: + kws[1]["index"] = (l_index[axis],) + else: + kws[1]["index"] = l_index + + chunk_op = op.copy().reset_key() + l_chunk, r_chunk = chunk_op.new_chunks([c, right_chunk], kws=kws) + + l_chunks.append(l_chunk) + if op.broadcast_axis == 1 or l_index[1 - axis] == 0: + r_chunks.append(r_chunk) + + return nsplits, l_chunks, r_chunks + + @classmethod + def _tile_series_series(cls, op: "DataFrameAlign"): + nsplits, _, left_chunks, right_chunks = align_series_series(op.lhs, op.rhs) + cls._check_align_needed(op, left_chunks, right_chunks) + + l_chunks, r_chunks = [], [] + for idx, (lc, rc) in enumerate(zip(left_chunks, right_chunks)): + kws = [lc.params, rc.params] + kws[0]["index"] = kws[1]["index"] = (idx,) + + chunk_op = op.copy().reset_key() + l_chunk, r_chunk = chunk_op.new_chunks([lc, rc], kws=kws) + l_chunks.append(l_chunk) + r_chunks.append(r_chunk) + return nsplits, l_chunks, r_chunks + + @classmethod + def _tile_with_fillna(cls, tileable: TileableType): + op = tileable.op + if op.method is None: + return tileable + axis = op.fill_axis if tileable.ndim == 2 else 0 + tileable = tileable.fillna(method=op.method, limit=op.limit, axis=axis) + return (yield from recursive_tile(tileable)) + + @classmethod + def _make_direct_output_kws(cls, left: TileableType, right: TileableType): + kws = [left.params, right.params] + kws[0].update(dict(chunks=left.chunks, nsplits=left.nsplits)) + kws[1].update(dict(chunks=right.chunks, nsplits=right.nsplits)) + return kws + + @classmethod + def tile(cls, op: "DataFrameAlign"): + try: + if op.lhs.ndim == op.rhs.ndim: + if op.lhs.ndim == 2: + nsplits, left_chunks, right_chunks = cls._tile_dataframe_dataframe( + op + ) + else: + nsplits, left_chunks, right_chunks = cls._tile_series_series(op) + else: + nsplits, left_chunks, right_chunks = cls._tile_dataframe_series(op) + except _NoNeedToAlign: + kws = cls._make_direct_output_kws(op.lhs, op.rhs) + else: + kws = [ + 
cls._build_tiled_kw(op, 0, left_chunks, nsplits), + cls._build_tiled_kw(op, 1, right_chunks, nsplits), + ] + new_left, new_right = op.copy().new_tileables(op.inputs, kws=kws) + + new_left_filled = yield from cls._tile_with_fillna(new_left) + new_right_filled = yield from cls._tile_with_fillna(new_right) + if new_left_filled is not new_left or new_right_filled is not new_right: + kws = cls._make_direct_output_kws(new_left_filled, new_right_filled) + new_left, new_right = op.copy().new_tileables(op.inputs, kws=kws) + + return [new_left, new_right] + + @classmethod + def execute(cls, ctx, op: "DataFrameAlign"): + lhs_val = ctx[op.lhs.key] + rhs_val = ctx[op.rhs.key] + l_res, r_res = lhs_val.align( + rhs_val, + axis=op.axis, + join=op.join, + fill_value=op.fill_value, + broadcast_axis=op.broadcast_axis, + ) + ctx[op.outputs[0].key] = l_res + ctx[op.outputs[1].key] = r_res + + +def align( + df, + other, + join: str = "outer", + axis: Union[int, str, None] = None, + level: Union[int, str, None] = None, + copy: bool = True, + fill_value: Any = None, + method: str = None, + limit: Optional[int] = None, + fill_axis: Union[int, str] = 0, + broadcast_axis: Union[int, str] = None, +): + """ + Align two objects on their axes with the specified join method. + + Join method is specified for each axis Index. + + Parameters + ---------- + other : DataFrame or Series + join : {'outer', 'inner', 'left', 'right'}, default 'outer' + axis : allowed axis of the other object, default None + Align on index (0), columns (1), or both (None). + level : int or level name, default None + Broadcast across a level, matching Index values on the + passed MultiIndex level. + copy : bool, default True + Always returns new objects. If copy=False and no reindexing is + required then original objects are returned. + fill_value : scalar, default np.NaN + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series: + + - pad / ffill: propagate last valid observation forward to next valid. + - backfill / bfill: use NEXT valid observation to fill gap. + + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + fill_axis : {0 or 'index', 1 or 'columns'}, default 0 + Filling axis, method and limit. + broadcast_axis : {0 or 'index', 1 or 'columns'}, default None + Broadcast values along this axis, if aligning two objects of + different dimensions. + + Notes + ----- + Currently argument `level` is not supported. + + Returns + ------- + (left, right) : (DataFrame, type of other) + Aligned objects. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame( + ... [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2] + ... ) + >>> other = md.DataFrame( + ... [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]], + ... columns=["A", "B", "C", "D"], + ... index=[2, 3, 4], + ... 
) + >>> df.execute() + D B E A + 1 1 2 3 4 + 2 6 7 8 9 + >>> other.execute() + A B C D + 2 10 20 30 40 + 3 60 70 80 90 + 4 600 700 800 900 + + Align on columns: + + >>> left, right = df.align(other, join="outer", axis=1) + >>> left.execute() + A B C D E + 1 4 2 NaN 1 3 + 2 9 7 NaN 6 8 + >>> right.execute() + A B C D E + 2 10 20 30 40 NaN + 3 60 70 80 90 NaN + 4 600 700 800 900 NaN + + We can also align on the index: + + >>> left, right = df.align(other, join="outer", axis=0) + >>> left.execute() + D B E A + 1 1.0 2.0 3.0 4.0 + 2 6.0 7.0 8.0 9.0 + 3 NaN NaN NaN NaN + 4 NaN NaN NaN NaN + >>> right.execute() + A B C D + 1 NaN NaN NaN NaN + 2 10.0 20.0 30.0 40.0 + 3 60.0 70.0 80.0 90.0 + 4 600.0 700.0 800.0 900.0 + + Finally, the default `axis=None` will align on both index and columns: + + >>> left, right = df.align(other, join="outer", axis=None) + >>> left.execute() + A B C D E + 1 4.0 2.0 NaN 1.0 3.0 + 2 9.0 7.0 NaN 6.0 8.0 + 3 NaN NaN NaN NaN NaN + 4 NaN NaN NaN NaN NaN + >>> right.execute() + A B C D E + 1 NaN NaN NaN NaN NaN + 2 10.0 20.0 30.0 40.0 NaN + 3 60.0 70.0 80.0 90.0 NaN + 4 600.0 700.0 800.0 900.0 NaN + """ + axis = validate_axis(axis) if axis is not None else None + fill_axis = validate_axis(fill_axis) if fill_axis is not None else None + broadcast_axis = ( + validate_axis(broadcast_axis) if broadcast_axis is not None else None + ) + + if level is not None: + raise NotImplementedError(f"Argument `level` not supported") + if df.ndim != other.ndim and axis is None: + raise ValueError("Must specify axis=0 or 1") + + op = DataFrameAlign( + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis, + ) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/indexing/at.py b/python/xorbits/_mars/dataframe/indexing/at.py new file mode 100644 index 000000000..35b37460f --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/at.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .loc import DataFrameLoc + + +class DataFrameAt: + def __init__(self, obj): + self._obj = obj + self._loc = DataFrameLoc(self._obj) + + def __getitem__(self, indexes): + if not isinstance(indexes, tuple): + indexes = (indexes,) + + for index in indexes: + if not np.isscalar(index): + raise ValueError("Invalid call for scalar access (getting)!") + + return self._loc[indexes] + + +def at(a): + """ + Access a single value for a row/column label pair. + + Similar to ``loc``, in that both provide label-based lookups. Use + ``at`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + KeyError + If 'label' does not exist in DataFrame. + + See Also + -------- + DataFrame.iat : Access a single value for a row/column pair by integer + position. + DataFrame.loc : Access a group of rows and columns by label(s). + Series.at : Access a single value using a label. 
+ + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> df.execute() + A B C + 4 0 2 3 + 5 0 4 1 + 6 10 20 30 + + Get value at specified row/column pair + + >>> df.at[4, 'B'].execute() + 2 + + # Set value at specified row/column pair + # + # >>> df.at[4, 'B'] = 10 + # >>> df.at[4, 'B'] + # 10 + + Get value within a Series + + >>> df.loc[5].at['B'].execute() + 4 + """ + return DataFrameAt(a) diff --git a/python/xorbits/_mars/dataframe/indexing/getitem.py b/python/xorbits/_mars/dataframe/indexing/getitem.py new file mode 100644 index 000000000..aea1f3e90 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/getitem.py @@ -0,0 +1,635 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from numbers import Integral + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int32Field +from ...tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ...tensor.datasource import tensor as astensor +from ...utils import has_unknown_shape +from ..align import align_dataframe_dataframe, align_dataframe_series +from ..core import ( + DATAFRAME_CHUNK_TYPE, + DATAFRAME_TYPE, + SERIES_CHUNK_TYPE, + SERIES_TYPE, + is_chunk_meta_lazy, +) +from ..merge import DataFrameConcat +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import in_range_index, parse_index +from .utils import calc_columns_index + + +class SeriesIndex(DataFrameOperand, DataFrameOperandMixin): + _op_module_ = "series" + _op_type_ = OperandDef.INDEX + + _labels = AnyField("labels") + + _combine_size = Int32Field("combine_size") + _is_intermediate = BoolField("is_intermediate") + + def __init__( + self, + labels=None, + combine_size=None, + is_intermediate=None, + output_types=None, + **kw, + ): + super().__init__( + _labels=labels, + _combine_size=combine_size, + _is_intermediate=is_intermediate, + _output_types=output_types, + **kw, + ) + + @property + def labels(self): + return self._labels + + @property + def combine_size(self): + return self._combine_size + + @property + def is_intermediate(self): + return self._is_intermediate + + def __call__(self, series, name=None): + return self.new_tileable([series], dtype=series.dtype, name=name) + + def _new_tileables(self, inputs, kws=None, **kw): + # Override this method to automatically decide the output type, + # when `labels` is a list, we will set `output_types` as series, + # otherwise it will be a scalar. 
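# For example (hypothetical series ``s``): ``s[['a', 'b']]`` hits the list branch below and
# produces a Series of length 2, while ``s['a']`` takes the scalar branch and produces a
# scalar result with shape ``()``.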
+ output_types = getattr(self, "_output_types", None) + shape = kw.pop("shape", None) + is_scalar = not isinstance(self._labels, list) + if not output_types: + output_types = [OutputType.scalar] if is_scalar else [OutputType.series] + self.output_types = output_types + if shape is None: + shape = () if is_scalar else ((len(self._labels)),) + kw["shape"] = shape + if not is_scalar: + index_value = kw.pop("index_value", None) or parse_index( + pd.Index(self._labels) + ) + kw["index_value"] = index_value + return super()._new_tileables(inputs, kws=kws, **kw) + + def _new_chunks(self, inputs, kws=None, **kw): + # Override this method to automatically decide the output type, + # when `labels` is a list, we will set `output_types` as series, + # otherwise it will be a scalar. + output_types = getattr(self, "_output_types", None) + is_scalar = not isinstance(self._labels, list) + if not output_types: + output_types = [OutputType.scalar] if is_scalar else [OutputType.series] + self.output_types = output_types + if kw.get("shape", None) is None: + shape = () if is_scalar else ((len(self._labels)),) + kw["shape"] = shape + if not is_scalar: + index_value = kw.pop("index_value", None) or parse_index( + pd.Index(self._labels) + ) + kw["index_value"] = index_value + else: + # tensor chunk cannot accept index_value + kw.pop("index_value", None) + return super()._new_chunks(inputs, kws=kws, **kw) + + @classmethod + def _calc_chunk_index(cls, label, chunk_indexes): + for i, index in enumerate(chunk_indexes): + if isinstance(index, pd.RangeIndex) and in_range_index(label, index): + return i + elif label in index: + return i + raise TypeError(f"label {label} doesn't exist") + + @classmethod + def _tile_one_chunk(cls, op): + in_series = op.inputs[0] + out_series = op.outputs[0] + + index_op = SeriesIndex(labels=op.labels) + kw = {"name": out_series.name} if hasattr(out_series, "name") else {} + index_chunk = index_op.new_chunk(in_series.chunks, dtype=out_series.dtype, **kw) + new_op = op.copy() + nsplits = ((len(op.labels),),) if isinstance(op.labels, list) else () + return new_op.new_tileables( + op.inputs, chunks=[index_chunk], nsplits=nsplits, dtype=out_series.dtype + ) + + @classmethod + def _tree_getitem(cls, op): + """ + DataFrame doesn't store the index value except RangeIndex or specify `store=True` in `parse_index`, + So we build a tree structure to avoid too much dependence for getitem node. 
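        Chunks are concatenated in groups of at most ``combine_size`` and filtered by the
        requested labels at every level, so no single node in the graph depends on more
        than ``combine_size`` inputs.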
+ """ + out_series = op.outputs[0] + combine_size = options.combine_size + chunks = op.inputs[0].chunks + while len(chunks) > combine_size: + new_chunks = [] + for i in range(0, len(chunks), combine_size): + chks = chunks[i : i + combine_size] + if len(chks) == 1: + chk = chks[0] + else: + concat_op = DataFrameConcat(output_types=[OutputType.series]) + chk = concat_op.new_chunk(chks, dtype=chks[0].dtype) + chk_op = SeriesIndex(labels=op.labels, is_intermediate=True) + kw = {"name": out_series.name} if hasattr(out_series, "name") else {} + chk = chk_op.new_chunk( + [chk], + shape=(np.nan,), + dtype=chk.dtype, + index_value=parse_index(pd.RangeIndex(-1)), + **kw, + ) + new_chunks.append(chk) + chunks = new_chunks + + concat_op = DataFrameConcat(output_types=[OutputType.series]) + kw = {"name": out_series.name} if hasattr(out_series, "name") else {} + kw["index"] = (0,) + chk = concat_op.new_chunk(chunks, dtype=chunks[0].dtype, **kw) + index_op = SeriesIndex(labels=op.labels) + chunk = index_op.new_chunk([chk], dtype=chk.dtype, **kw) + new_op = op.copy() + nsplits = ((len(op.labels),),) if isinstance(op.labels, list) else () + kw = out_series.params + kw["nsplits"] = nsplits + kw["chunks"] = [chunk] + return new_op.new_tileables(op.inputs, kws=[kw]) + + @classmethod + def tile(cls, op): + in_series = op.inputs[0] + out_series = op.outputs[0] + + if len(in_series.chunks) == 1: + return cls._tile_one_chunk(op) + if not in_series.index_value.has_value(): + return cls._tree_getitem(op) + + chunk_indexes = [c.index_value.to_pandas() for c in in_series.chunks] + if not isinstance(op.labels, list): + selected_chunk = in_series.chunks[ + cls._calc_chunk_index(op.labels, chunk_indexes) + ] + index_op = op.copy().reset_key() + out_chunk = index_op.new_chunk( + [selected_chunk], shape=(), dtype=selected_chunk.dtype + ) + new_op = op.copy() + return new_op.new_scalars( + op.inputs, dtype=out_series.dtype, chunks=[out_chunk] + ) + else: + # When input series's index is RangeIndex(5), chunk_size is 3, and labels is [4, 2, 3, 4], + # Combine the labels in the same chunk, so the splits will be [[4], [2], [3, 4]], + # the corresponding chunk index is [1, 0, 1]. + selected_index = [ + cls._calc_chunk_index(label, chunk_indexes) for label in op.labels + ] + condition = np.where(np.diff(selected_index))[0] + 1 + column_splits = np.split(op.labels, condition) + column_indexes = np.split(selected_index, condition) + + out_chunks = [] + nsplits = [] + for i, (labels, idx) in enumerate(zip(column_splits, column_indexes)): + index_op = SeriesIndex(labels=list(labels)) + c = in_series.chunks[idx[0]] + nsplits.append(len(labels)) + index_value = parse_index( + pd.Index([], dtype=c.index_value.to_pandas().dtype), c, labels + ) + out_chunks.append( + index_op.new_chunk( + [c], + shape=(len(labels),), + dtype=c.dtype, + index_value=index_value, + name=c.name, + index=(i,), + ) + ) + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + shape=out_series.shape, + dtype=out_series.dtype, + index_value=out_series.index_value, + nsplits=(tuple(nsplits),), + chunks=out_chunks, + name=out_series.name, + ) + + @classmethod + def execute(cls, ctx, op): + series = ctx[op.inputs[0].key] + labels = op.labels + if op.is_intermediate: + # for intermediate result, it is always a series even if labels is a scalar. 
+ labels = labels if isinstance(labels, list) else [labels] + labels = [label for label in set(labels) if label in series] + ctx[op.outputs[0].key] = series[labels] + + +class DataFrameIndex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.INDEX + + col_names = AnyField("col_names", default=None) + + # for bool index + mask = AnyField("mask", default=None) + identical_index = BoolField("identical_index") + + def __init__(self, output_types=None, **kw): + output_types = output_types or [OutputType.series] + super().__init__(_output_types=output_types, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if isinstance(self.col_names, ENTITY_TYPE): + self.col_names = self._inputs[0] + if isinstance(self.mask, ENTITY_TYPE): + self.mask = self._inputs[-1] + + def __call__(self, df): + if self.col_names is not None: + # if col_names is a list, return a DataFrame, else return a Series + col_names = self.col_names + if not isinstance(col_names, list): + col_names = [col_names] + is_list = False + else: + is_list = True + + dtypes_list = df._get_dtypes_by_columns(col_names) + if is_list or len(dtypes_list) > 1: + if len(col_names) != len(dtypes_list): + col_names = df._get_columns_by_columns(col_names) + columns = parse_index(pd.Index(col_names), store_data=True) + return self.new_dataframe( + [df], + shape=(df.shape[0], len(col_names)), + dtypes=pd.Series(dtypes_list, index=col_names, dtype=np.dtype("O")), + index_value=df.index_value, + columns_value=columns, + ) + else: + dtype = dtypes_list[0] + return self.new_series( + [df], + shape=(df.shape[0],), + dtype=dtype, + index_value=df.index_value, + name=self.col_names, + ) + else: + if isinstance(self.mask, (SERIES_TYPE, DATAFRAME_TYPE, TENSOR_TYPE)): + index_value = parse_index( + pd.Index( + [], + dtype=df.index_value.to_pandas().dtype, + name=df.index_value.name, + ), + df, + self.mask, + ) + return self.new_dataframe( + [df, self.mask], + shape=(np.nan, df.shape[1]), + dtypes=df.dtypes, + index_value=index_value, + columns_value=df.columns_value, + ) + else: + index_value = parse_index( + pd.Index( + [], + dtype=df.index_value.to_pandas().dtype, + name=df.index_value.name, + ), + df, + self.mask, + ) + return self.new_dataframe( + [df], + shape=(np.nan, df.shape[1]), + dtypes=df.dtypes, + index_value=index_value, + columns_value=df.columns_value, + ) + + @classmethod + def tile(cls, op): + if op.col_names is not None: + return cls.tile_with_columns(op) + else: + return (yield from cls.tile_with_mask(op)) + + @classmethod + def tile_with_mask(cls, op: "DataFrameIndex"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + out_chunks = [] + + if isinstance(op.mask, (SERIES_TYPE, DATAFRAME_TYPE, TENSOR_TYPE)): + mask = op.inputs[1] + + if hasattr(mask, "index_value") and mask.ndim == 1 and op.identical_index: + if has_unknown_shape(in_df, mask): + yield + nsplits = ((np.nan,) * in_df.chunk_shape[0], in_df.nsplits[1]) + out_shape = in_df.chunk_shape + df_chunks = in_df.chunks + aligned_mask = yield from recursive_tile( + mask.rechunk(in_df.nsplits[: mask.ndim]) + ) + mask_chunks = aligned_mask.chunks + elif isinstance(mask, SERIES_TYPE): + nsplits, out_shape, df_chunks, mask_chunks = align_dataframe_series( + in_df, mask, axis="index" + ) + elif isinstance(mask, DATAFRAME_TYPE): + nsplits, out_shapes, df_chunks, mask_chunks = align_dataframe_dataframe( + in_df, mask + ) + out_shape = out_shapes[0] + else: + # tensor + nsplits = in_df.nsplits + mask = yield from recursive_tile(mask.rechunk(nsplits[: 
mask.ndim])) + out_shape = in_df.chunk_shape + df_chunks = in_df.chunks + mask_chunks = mask.chunks + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape)) + + out_chunks = [] + for i, idx, df_chunk in zip( + itertools.count(), out_chunk_indexes, df_chunks + ): + if op.mask.ndim == 1: + mask_chunk = mask_chunks[df_chunk.index[0]] + else: + mask_chunk = mask_chunks[i] + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + [df_chunk, mask_chunk], + index=idx, + shape=(np.nan, df_chunk.shape[1]), + ) + ) + out_chunk._set_tileable_meta( + tileable_key=out_df.key, + nsplits=nsplits, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + out_chunks.append(out_chunk) + else: + if has_unknown_shape(in_df): + yield + nsplits_acc = np.cumsum((0,) + in_df.nsplits[0]) + for idx in range(in_df.chunk_shape[0]): + for idxj in range(in_df.chunk_shape[1]): + in_chunk = in_df.cix[idx, idxj] + chunk_op = op.copy().reset_key() + chunk_op.mask = op.mask.iloc[ + nsplits_acc[idx] : nsplits_acc[idx + 1] + ] + out_chunk = chunk_op.new_chunk( + [in_chunk], + index=in_chunk.index, + shape=(np.nan, in_chunk.shape[1]), + dtypes=in_chunk.dtypes, + index_value=in_df.index_value, + columns_value=in_chunk.columns_value, + ) + out_chunks.append(out_chunk) + + nsplits_on_columns = tuple(c.shape[1] for c in out_chunks if c.index[0] == 0) + row_chunk_num = len([c.shape[0] for c in out_chunks if c.index[1] == 0]) + nsplits = ((np.nan,) * row_chunk_num, nsplits_on_columns) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + shape=out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def tile_with_columns(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + col_names = op.col_names + chunk_meta_lazy = is_chunk_meta_lazy(in_df.chunks[0]) + if out_df.ndim < 2: + # Series + column_index = calc_columns_index(col_names, in_df)[0] + out_chunks = [] + dtype = in_df.dtypes[col_names] + out_nsplits = (in_df.nsplits[0],) + for i in range(in_df.chunk_shape[0]): + c = in_df.cix[(i, column_index)] + chunk_op = DataFrameIndex(col_names=col_names) + if chunk_meta_lazy: + out_chunk = chunk_op.new_chunk( + [c], + shape=(c.shape[0],), + index=(i,), + dtype=dtype, + name=col_names, + ) + out_chunk._set_tileable_meta( + tileable_key=out_df.key, + nsplits=out_nsplits, + index_value=out_df.index_value, + ) + else: + out_chunk = chunk_op.new_chunk( + [c], + shape=(c.shape[0],), + index=(i,), + dtype=dtype, + index_value=c.index_value, + name=col_names, + ) + out_chunks.append(out_chunk) + new_op = op.copy() + params = out_df.params.copy() + params["chunks"] = out_chunks + params["nsplits"] = out_nsplits + return new_op.new_seriess(op.inputs, kws=[params]) + else: + # combine columns into one chunk and keep the columns order at the same time. + # When chunk columns are ['c1', 'c2', 'c3'], ['c4', 'c5'], + # selected columns are ['c2', 'c3', 'c4', 'c2'], `column_splits` will be + # [(['c2', 'c3'], 0), ('c4', 1), ('c2', 0)]. 
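# (The grouping is computed by ``np.where(np.diff(selected_index))[0] + 1`` below: for the
# example above ``selected_index`` is [0, 0, 1, 0], ``condition`` becomes [2, 3], and
# ``np.split`` then yields exactly the three groups shown.)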
+ if not isinstance(col_names, _list_like_types): + col_names = [col_names] + selected_index = [calc_columns_index(col, in_df) for col in col_names] + selected_index = list(itertools.chain.from_iterable(selected_index)) + condition = np.where(np.diff(selected_index))[0] + 1 + column_splits = np.split(col_names, condition) + column_indexes = np.split(selected_index, condition) + + out_chunks = [[] for _ in range(in_df.chunk_shape[0])] + nsplits = [in_df.nsplits[0], []] + column_nsplits = nsplits[1] + for i, (columns, column_idx) in enumerate( + zip(column_splits, column_indexes) + ): + try: + dtypes = in_df.dtypes[columns] + except ValueError: # pragma: no cover + dtypes = in_df.dtypes[list(columns)] + column_nsplits.append(len(dtypes)) + for j in range(in_df.chunk_shape[0]): + c = in_df.cix[(j, column_idx[0])] + index_op = DataFrameIndex( + col_names=list(columns), output_types=[OutputType.dataframe] + ) + if chunk_meta_lazy: + out_chunk = index_op.new_chunk( + [c], shape=(c.shape[0], len(dtypes)), index=(j, i) + ) + out_chunk._set_tileable_meta( + tileable_key=out_df.key, + nsplits=nsplits, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + else: + out_chunk = index_op.new_chunk( + [c], + shape=(c.shape[0], len(dtypes)), + index=(j, i), + dtypes=dtypes, + index_value=c.index_value, + columns_value=parse_index( + pd.Index(dtypes.index), store_data=True + ), + ) + out_chunks[j].append(out_chunk) + out_chunks = [item for cl in out_chunks for item in cl] + new_op = op.copy() + params = out_df.params.copy() + params["chunks"] = out_chunks + params["nsplits"] = nsplits + return new_op.new_dataframes(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameIndex"): + if op.mask is None: + df = ctx[op.inputs[0].key] + ctx[op.outputs[0].key] = df[op.col_names] + else: + df = ctx[op.inputs[0].key] + if isinstance( + op.mask, (SERIES_CHUNK_TYPE, DATAFRAME_CHUNK_TYPE, TENSOR_CHUNK_TYPE) + ): + mask = ctx[op.inputs[1].key] + else: + mask = op.mask + if hasattr(mask, "reindex_like") and not op.identical_index: + mask = mask.reindex_like(df).fillna(False) + ctx[op.outputs[0].key] = df[mask] + + @classmethod + def estimate_size(cls, ctx: dict, op: "DataFrameIndex"): + super().estimate_size(ctx, op) + result_size = ctx[op.outputs[0].key][0] + ctx[op.outputs[0].key] = (result_size, result_size) + + +_list_like_types = (list, np.ndarray, SERIES_TYPE, pd.Series, TENSOR_TYPE) + + +def dataframe_getitem(df, item): + columns_set = set(df.dtypes.keys()) + + if isinstance(item, (np.ndarray, pd.Series)) and item.dtype != np.bool_: + item = item.tolist() + + if isinstance(item, slice): + edge = item.start if item.start is not None else item.stop + if isinstance(edge, Integral): + return df.iloc[item] + else: + return df.loc[item] + elif isinstance(item, list): + for col_name in item: + if col_name not in columns_set: + raise KeyError(f"{col_name} not in columns") + op = DataFrameIndex(col_names=item, output_types=[OutputType.dataframe]) + elif isinstance(item, _list_like_types) or hasattr(item, "dtypes"): + # NB: don't enforce the dtype of `item` to be `bool` since it may be unknown + if isinstance(item, DATAFRAME_TYPE + SERIES_TYPE): + identical_index = df.index_value.key == item.index_value.key + else: + identical_index = False + op = DataFrameIndex( + mask=item, + identical_index=identical_index, + output_types=[OutputType.dataframe], + ) + else: + if item not in columns_set: + raise KeyError(f"{item} not in columns {columns_set}") + op = 
DataFrameIndex(col_names=item)
+    return op(df)
+
+
+def series_getitem(series, labels, combine_size=None):
+    if isinstance(labels, list) or np.isscalar(labels):
+        op = SeriesIndex(labels=labels, combine_size=combine_size)
+        return op(series, name=series.name)
+    elif isinstance(labels, _list_like_types) and astensor(labels).dtype == np.bool_:
+        return series.loc[labels]
+    elif isinstance(labels, slice):
+        edge = labels.start if labels.start is not None else labels.stop
+        if isinstance(edge, Integral):
+            return series.iloc[labels]
+        else:
+            return series.loc[labels]
+    else:
+        raise NotImplementedError(f"type {type(labels)} is not supported for getitem")
diff --git a/python/xorbits/_mars/dataframe/indexing/iat.py b/python/xorbits/_mars/dataframe/indexing/iat.py
new file mode 100644
index 000000000..c03dfb17b
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/indexing/iat.py
@@ -0,0 +1,37 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from numbers import Integral
+
+from .iloc import DataFrameIloc
+
+
+class DataFrameIat:
+    def __init__(self, obj):
+        self._obj = obj
+        self._iloc = DataFrameIloc(self._obj)
+
+    def __getitem__(self, indexes):
+        if not isinstance(indexes, tuple):
+            indexes = (indexes,)
+
+        for index in indexes:
+            if not isinstance(index, Integral):
+                raise ValueError("Invalid call for scalar access (getting)!")
+
+        return self._iloc[indexes]
+
+
+def iat(a):
+    return DataFrameIat(a)
diff --git a/python/xorbits/_mars/dataframe/indexing/iloc.py b/python/xorbits/_mars/dataframe/indexing/iloc.py
new file mode 100644
index 000000000..225cf6fd7
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/indexing/iloc.py
@@ -0,0 +1,893 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+from numbers import Integral
+
+import numpy as np
+import pandas as pd
+from pandas.core.dtypes.cast import find_common_type
+from pandas.core.indexing import IndexingError
+
+from ...
import opcodes as OperandDef +from ...config import options +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...serialization.serializables import AnyField, KeyField, ListField +from ...tensor import asarray +from ...tensor.datasource.empty import empty +from ...tensor.indexing.core import calc_shape +from ...utils import ceildiv +from ..operands import DATAFRAME_TYPE, DataFrameOperand, DataFrameOperandMixin +from ..utils import indexing_index_value, is_cudf +from .index_lib import DataFrameIlocIndexesHandler + +_ILOC_ERROR_MSG = ( + "Location based indexing can only have [integer, " + "integer slice (START point is INCLUDED, END point is EXCLUDED), " + "listlike of integers, boolean array] types" +) + + +def process_iloc_indexes(inp, indexes): + ndim = inp.ndim + + if not isinstance(indexes, tuple): + indexes = (indexes,) + if len(indexes) < ndim: + indexes += (slice(None),) * (ndim - len(indexes)) + if len(indexes) > ndim: + raise IndexingError("Too many indexers") + + new_indexes = [] + # check each index + for ax, index in enumerate(indexes): + if isinstance(index, tuple): + # a tuple should already have been caught by this point + # so don't treat a tuple as a valid indexer + raise IndexingError("Too many indexers") + elif isinstance(index, slice): + if any(v is not None for v in [index.start, index.stop, index.step]): + pd_index = ( + inp.index_value if ax == 0 else inp.columns_value + ).to_pandas() + for val in [index.start, index.stop, index.step]: + if val is not None: + try: + pd_index[val] # check on the pandas + except IndexError: + pass + except TypeError: + raise TypeError( + f"cannot do slice indexing on {type(pd_index)} " + f"with these indexers [{val}] of {type(val)}" + ) + new_indexes.append(index) + elif isinstance(index, (list, np.ndarray, pd.Series, ENTITY_TYPE)): + if not isinstance(index, ENTITY_TYPE): + index = np.asarray(index) + else: + index = asarray(index) + if ax == 1: + # do not support tensor index on axis 1 + # because if so, the dtypes and columns_value would be unknown + try: + index = index.fetch() + except (RuntimeError, ValueError): + raise NotImplementedError( + "indexer on axis columns cannot be non-executed tensor" + ) + if index.dtype != np.bool_: + index = index.astype(np.int64) + if index.ndim != 1: + raise ValueError( + "Buffer has wrong number of dimensions " + f"(expected 1, got {index.ndim})" + ) + new_indexes.append(index) + elif isinstance(index, Integral): + shape = inp.shape[ax] + if not np.isnan(shape): + if index < -shape or index >= shape: + raise IndexError("single positional indexer is out-of-bounds") + new_indexes.append(index) + else: + raise ValueError(_ILOC_ERROR_MSG) + + return new_indexes + + +class DataFrameIloc: + def __init__(self, obj): + self._obj = obj + + def __getitem__(self, indexes): + if isinstance(self._obj, DATAFRAME_TYPE): + op = DataFrameIlocGetItem(indexes=process_iloc_indexes(self._obj, indexes)) + else: + op = SeriesIlocGetItem(indexes=process_iloc_indexes(self._obj, indexes)) + return op(self._obj) + + def __setitem__(self, indexes, value): + if not np.isscalar(value): + raise NotImplementedError("Only scalar value is supported to set by iloc") + + if isinstance(self._obj, DATAFRAME_TYPE): + op = DataFrameIlocSetItem( + indexes=process_iloc_indexes(self._obj, indexes), value=value + ) + else: + op = SeriesIlocSetItem( + indexes=process_iloc_indexes(self._obj, indexes), value=value + ) + + ret = op(self._obj) + self._obj.data = ret.data + + +class 
HeadTailOptimizedOperandMixin(DataFrameOperandMixin): + __slots__ = () + + @classmethod + def _is_head(cls, index0): + return ( + (index0.start is None or index0.start == 0) + and index0.stop is not None + and index0.stop > 0 + ) + + @classmethod + def _is_tail(cls, index0): + return index0.start is not None and index0.start < 0 and index0.stop is None + + @classmethod + def _is_indexes_head_or_tail(cls, indexes): + index0 = indexes[0] + if not isinstance(index0, slice): + # have to be slice + return False + if index0.step is not None and index0.step != 1: + return False + if len(indexes) == 2: + if not isinstance(indexes[1], slice): + return False + if indexes[1] != slice(None): + return False + if cls._is_tail(index0): + # tail + return True + if cls._is_head(index0): + # head + return True + return False + + @classmethod + def _need_tile_head_tail(cls, op): + # first, the input DataFrame should + # have unknown chunk shapes on the index axis, + inp = op.input + if not any(np.isnan(s) for s in inp.nsplits[0]): + return False + + # if input is a DataFrame, + # should have 1 chunk on columns axis + if inp.ndim > 1 and inp.chunk_shape[1] > 1: + return False + + return cls._is_indexes_head_or_tail(op.indexes) + + @classmethod + def _tile_head_tail(cls, op): + from ..merge import DataFrameConcat + + inp = op.input + out = op.outputs[0] + combine_size = options.combine_size + + chunks = inp.chunks + + new_chunks = [] + for c in chunks: + chunk_op = op.copy().reset_key() + params = out.params + params["index"] = c.index + params["shape"] = c.shape if np.isnan(c.shape[0]) else out.shape + new_chunks.append(chunk_op.new_chunk([c], kws=[params])) + chunks = new_chunks + + while len(chunks) > 1: + new_size = ceildiv(len(chunks), combine_size) + new_chunks = [] + for i in range(new_size): + in_chunks = chunks[combine_size * i : combine_size * (i + 1)] + chunk_index = (i, 0) if in_chunks[0].ndim == 2 else (i,) + if len(inp.shape) == 1: + shape = (sum(c.shape[0] for c in in_chunks),) + else: + shape = (sum(c.shape[0] for c in in_chunks), in_chunks[0].shape[1]) + concat_chunk = DataFrameConcat( + axis=0, output_types=in_chunks[0].op.output_types + ).new_chunk(in_chunks, index=chunk_index, shape=shape) + chunk_op = op.copy().reset_key() + params = out.params + params["index"] = chunk_index + params["shape"] = ( + in_chunks[0].shape if np.isnan(in_chunks[0].shape[0]) else out.shape + ) + new_chunks.append(chunk_op.new_chunk([concat_chunk], kws=[params])) + chunks = new_chunks + + new_op = op.copy() + params = out.params + params["nsplits"] = tuple((s,) for s in out.shape) + params["chunks"] = chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + def can_be_optimized(self): + return ( + self._is_indexes_head_or_tail(self._indexes) + and self._is_head(self._indexes[0]) + and self._indexes[0].stop <= options.optimize.head_optimize_threshold + ) + + @classmethod + def tile(cls, op): + if cls._need_tile_head_tail(op): + return cls._tile_head_tail(op) + + +class DataFrameIlocGetItem(DataFrameOperand, HeadTailOptimizedOperandMixin): + _op_type_ = OperandDef.DATAFRAME_ILOC_GETITEM + + _input = KeyField("input") + _indexes = ListField("indexes") + + def __init__(self, indexes=None, gpu=None, sparse=False, output_types=None, **kw): + super().__init__( + _indexes=indexes, gpu=gpu, sparse=sparse, _output_types=output_types, **kw + ) + if not self.output_types: + self.output_types = [OutputType.dataframe] + + @property + def input(self): + return self._input + + @property + def indexes(self): + return 
self._indexes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + indexes = [] + for index in self._indexes: + if isinstance(index, ENTITY_TYPE): + indexes.append(next(inputs_iter)) + else: + indexes.append(index) + self._indexes = indexes + + def __call__(self, df): + # Note [Fancy Index of Numpy and Pandas] + # + # The numpy and pandas.iloc have different semantic when processing fancy index: + # + # >>> np.ones((3,3))[[1,2],[1,2]] + # array([1., 1.]) + # + # >>> pd.DataFrame(np.ones((3,3))).iloc[[1,2],[1,2]] + # 1 2 + # 1 1.0 1.0 + # 2 1.0 1.0 + # + # Thus, we processing the index along two axis of DataFrame separately. + shape0 = tuple(calc_shape((df.shape[0],), (self.indexes[0],))) + shape1 = tuple(calc_shape((df.shape[1],), (self.indexes[1],))) + + inputs = [df] + [ + index for index in self._indexes if isinstance(index, ENTITY_TYPE) + ] + + # NB: pandas only compresses the result to series when index on one of axis is integral + if isinstance(self.indexes[1], Integral): + shape = shape0 + dtype = df.dtypes.iloc[self.indexes[1]] + index_value = indexing_index_value(df.index_value, self.indexes[0]) + if isinstance(self.indexes[0], Integral): + # scalar + return self.new_scalar(inputs, dtype=dtype) + else: + return self.new_series( + inputs, + shape=shape, + dtype=dtype, + index_value=index_value, + name=df.dtypes.index[self.indexes[1]], + ) + elif isinstance(self.indexes[0], Integral): + shape = shape1 + dtype = find_common_type(list(df.dtypes.iloc[self.indexes[1]].values)) + index_value = indexing_index_value(df.columns_value, self.indexes[1]) + return self.new_series( + inputs, shape=shape, dtype=dtype, index_value=index_value + ) + else: + return self.new_dataframe( + inputs, + shape=shape0 + shape1, + dtypes=df.dtypes.iloc[self.indexes[1]], + index_value=indexing_index_value(df.index_value, self.indexes[0]), + columns_value=indexing_index_value( + df.columns_value, self.indexes[1], store_data=True + ), + ) + + # FIXME The view behavior of DataFrame.iloc + # + # The pandas's iloc has complicated behavior about whether to create a view or not, it depends + # on the further operation on the view, as illustrated by the following example: + # + # >>> df = pd.DataFrame([[1,2], [3,4]]) + # >>> x = df.iloc[:] + # >>> df + # 0 1 + # 0 1 2 + # 1 3 4 + # >>> x + # 0 1 + # 0 1 2 + # 1 3 4 + # + # >>> x.iloc[:] = 1000 + # >>> x + # 0 1 + # 0 1000 1000 + # 1 1000 1000 + # df + # 0 1 + # 0 1000 1000 + # 1 1000 1000 + # + # >>> x.iloc[:] = 2000.0 + # >>> x + # 0 1 + # 0 2000.0 2000.0 + # 1 2000.0 2000.0 + # >>> df + # 0 1 + # 0 1000 1000 + # 1 1000 1000 + + @classmethod + def tile(cls, op): + tileds = super().tile(op) + if tileds is not None: + return tileds + + handler = DataFrameIlocIndexesHandler() + return [(yield from handler.handle(op))] + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + df = ctx[op.input.key] + if len(op.inputs) > 1: + indexes = tuple( + ctx[index.key] if hasattr(index, "key") else index + for index in op.indexes + ) + else: + indexes = tuple(op.indexes) + r = df.iloc[indexes] + if isinstance(r, pd.Series) and r.dtype != chunk.dtype: + r = r.astype(chunk.dtype) + if is_cudf(r): # pragma: no cover + r = r.copy() + ctx[chunk.key] = r + + +class DataFrameIlocSetItem(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_ILOC_SETITEM + + _indexes = ListField("indexes") + _value = AnyField("value") + + def __init__( + self, indexes=None, value=None, 
gpu=None, sparse=False, output_types=None, **kw + ): + super().__init__( + _indexes=indexes, + _value=value, + gpu=gpu, + sparse=sparse, + _output_types=output_types, + **kw, + ) + if not self.output_types: + self.output_types = [OutputType.dataframe] + + @property + def indexes(self): + return self._indexes + + @property + def value(self): + return self._value + + def __call__(self, df): + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + + # See Note [Fancy Index of Numpy and Pandas] + tensor0 = yield from recursive_tile( + empty(in_df.shape[0], chunk_size=(in_df.nsplits[0],))[op.indexes[0]] + ) + tensor1 = yield from recursive_tile( + empty(in_df.shape[1], chunk_size=(in_df.nsplits[1],))[op.indexes[1]] + ) + + chunk_mapping = { + c0.inputs[0].index + c1.inputs[0].index: (c0, c1) + for c0, c1 in itertools.product(tensor0.chunks, tensor1.chunks) + } + + out_chunks = [] + for chunk in in_df.chunks: + if chunk.index not in chunk_mapping: + out_chunks.append(chunk) + else: + chunk_op = op.copy().reset_key() + index_chunk, column_chunk = chunk_mapping[chunk.index] + chunk_op._indexes = [ + index_chunk.op.indexes[0], + column_chunk.op.indexes[0], + ] + chunk_op._value = op.value + out_chunk = chunk_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + dtypes=chunk.dtypes, + index_value=chunk.index_value, + columns_value=chunk.columns_value, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + shape=out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=in_df.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + r = ctx[op.inputs[0].key].copy(deep=True) + r.iloc[tuple(op.indexes)] = op.value + ctx[chunk.key] = r + + +class SeriesIlocGetItem(DataFrameOperand, HeadTailOptimizedOperandMixin): + _op_module_ = "series" + _op_type_ = OperandDef.DATAFRAME_ILOC_GETITEM + + _input = KeyField("input") + _indexes = ListField("indexes") + + def __init__(self, indexes=None, gpu=None, sparse=False, output_types=None, **kw): + super().__init__( + _indexes=indexes, gpu=gpu, sparse=sparse, _output_types=output_types, **kw + ) + if not self.output_types: + self.output_types = [OutputType.series] + + @property + def input(self): + return self._input + + @property + def indexes(self): + return self._indexes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + + indexes = [] + for index in self._indexes: + if isinstance(index, ENTITY_TYPE): + indexes.append(next(inputs_iter)) + else: + indexes.append(index) + self._indexes = indexes + + @classmethod + def tile(cls, op): + tileds = super().tile(op) + if tileds is not None: + return tileds + + handler = DataFrameIlocIndexesHandler() + return [(yield from handler.handle(op))] + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + series = ctx[op.input.key] + if len(op.inputs) > 1: + indexes = tuple( + ctx[index.key] if hasattr(index, "key") else index + for index in op.indexes + ) + else: + indexes = tuple(op.indexes) + if hasattr(series, "iloc"): + ctx[chunk.key] = series.iloc[indexes] + else: + # index, only happen for calling from rechunk + ctx[chunk.key] = series[indexes if len(indexes) > 1 else indexes[0]] 
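+    # A minimal, hedged sketch of the user-facing behaviour this operand backs
+    # (doctest-style; assumes a default local session; the sample data is
+    # illustrative only and follows the `md` alias used in docstrings below):
+    #
+    #   >>> import mars.dataframe as md
+    #   >>> s = md.Series([10, 20, 30, 40], chunk_size=2)
+    #   >>> s.iloc[1:3].execute()   # positional slice crossing a chunk boundary
+    #   1    20
+    #   2    30
+    #   dtype: int64
+    #
+    # At execution time each chunk applies plain ``pandas.Series.iloc`` with the
+    # per-chunk indexes computed during tiling, as in ``execute`` above.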
+ + def __call__(self, series): + if isinstance(self._indexes[0], Integral): + return self.new_scalar([series], dtype=series.dtype) + else: + shape = tuple(calc_shape(series.shape, self.indexes)) + index_value = indexing_index_value(series.index_value, self.indexes[0]) + inputs = [series] + [ + index for index in self._indexes if isinstance(index, ENTITY_TYPE) + ] + return self.new_series( + inputs, + shape=shape, + dtype=series.dtype, + index_value=index_value, + name=series.name, + ) + + +class SeriesIlocSetItem(DataFrameOperand, DataFrameOperandMixin): + _op_module_ = "series" + _op_type_ = OperandDef.DATAFRAME_ILOC_SETITEM + + _indexes = ListField("indexes") + _value = AnyField("value") + + def __init__(self, indexes=None, value=None, gpu=None, sparse=False, **kw): + super().__init__( + _indexes=indexes, + _value=value, + gpu=gpu, + sparse=sparse, + _output_types=[OutputType.series], + **kw, + ) + + @property + def indexes(self): + return self._indexes + + @property + def value(self): + return self._value + + def __call__(self, series): + return self.new_series( + [series], + shape=series.shape, + dtype=series.dtype, + index_value=series.index_value, + name=series.name, + ) + + @classmethod + def tile(cls, op): + in_series = op.inputs[0] + out = op.outputs[0] + + # Reuse the logic of fancy indexing in tensor module. + tensor = yield from recursive_tile( + empty(in_series.shape, chunk_size=in_series.nsplits)[op.indexes[0]] + ) + + chunk_mapping = dict((c.inputs[0].index, c) for c in tensor.chunks) + + out_chunks = [] + for chunk in in_series.chunks: + if chunk.index not in chunk_mapping: + out_chunks.append(chunk) + else: + chunk_op = op.copy().reset_key() + index_chunk = chunk_mapping[chunk.index] + chunk_op._indexes = index_chunk.op.indexes + chunk_op._value = op.value + out_chunk = chunk_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + dtype=chunk.dtype, + index_value=chunk.index_value, + name=chunk.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + shape=out.shape, + dtype=out.dtype, + index_value=out.index_value, + name=out.name, + chunks=out_chunks, + nsplits=in_series.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + r = ctx[op.inputs[0].key].copy(deep=True) + r.iloc[tuple(op.indexes)] = op.value + ctx[chunk.key] = r + + +class IndexIlocGetItem(DataFrameOperand, DataFrameOperandMixin): + _op_module_ = "index" + _op_type_ = OperandDef.DATAFRAME_ILOC_GETITEM + + _input = KeyField("input") + _indexes = ListField("indexes") + + def __init__(self, indexes=None, gpu=None, sparse=False, output_types=None, **kw): + super().__init__( + _indexes=indexes, gpu=gpu, sparse=sparse, _output_types=output_types, **kw + ) + if not self.output_types: + self.output_types = [OutputType.index] + + @property + def input(self): + return self._input + + @property + def indexes(self): + return self._indexes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + + indexes = [] + for index in self._indexes: + if isinstance(index, ENTITY_TYPE): + indexes.append(next(inputs_iter)) + else: + indexes.append(index) + self._indexes = indexes + + @classmethod + def tile(cls, op): + handler = DataFrameIlocIndexesHandler() + return [(yield from handler.handle(op))] + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + idx = ctx[op.input.key] + if len(op.inputs) > 1: + indexes = tuple( + ctx[index.key] if 
hasattr(index, "key") else index + for index in op.indexes + ) + else: + indexes = tuple(op.indexes) + if len(indexes) == 1: + indexes = indexes[0] + ctx[chunk.key] = idx[indexes] + + def __call__(self, idx): + if isinstance(self._indexes[0], Integral): + return self.new_scalar([idx], dtype=idx.dtype) + else: + shape = tuple(calc_shape(idx.shape, self.indexes)) + index_value = indexing_index_value(idx.index_value, self.indexes[0]) + inputs = [idx] + [ + index for index in self._indexes if isinstance(index, ENTITY_TYPE) + ] + return self.new_index( + inputs, + shape=shape, + dtype=idx.dtype, + index_value=index_value, + name=idx.name, + ) + + +def index_getitem(idx, indexes): + op = IndexIlocGetItem(indexes=process_iloc_indexes(idx, indexes)) + return op(idx) + + +def index_setitem(_idx, *_): + raise TypeError("Index does not support mutable operations") + + +def iloc(a): + return DataFrameIloc(a) + + +def head(a, n=5): + """ + Return the first `n` rows. + + This function returns the first `n` rows for the object based + on position. It is useful for quickly testing if your object + has the right type of data in it. + + For negative values of `n`, this function returns all rows except + the last `n` rows, equivalent to ``df[:-n]``. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + same type as caller + The first `n` rows of the caller object. + + See Also + -------- + DataFrame.tail: Returns the last `n` rows. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', + ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df.execute() + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the first 5 lines + + >>> df.head().execute() + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + + Viewing the first `n` lines (three in this case) + + >>> df.head(3).execute() + animal + 0 alligator + 1 bee + 2 falcon + + For negative values of `n` + + >>> df.head(-3).execute() + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + """ + return DataFrameIloc(a)[0:n] + + +def tail(a, n=5): + """ + Return the last `n` rows. + + This function returns last `n` rows from the object based on + position. It is useful for quickly verifying data, for example, + after sorting or appending rows. + + For negative values of `n`, this function returns all rows except + the first `n` rows, equivalent to ``df[n:]``. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + type of caller + The last `n` rows of the caller object. + + See Also + -------- + DataFrame.head : The first `n` rows of the caller object. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', + ... 
'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df.execute() + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the last 5 lines + + >>> df.tail().execute() + animal + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the last `n` lines (three in this case) + + >>> df.tail(3).execute() + animal + 6 shark + 7 whale + 8 zebra + + For negative values of `n` + + >>> df.tail(-3).execute() + animal + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + """ + return DataFrameIloc(a)[-n:] diff --git a/python/xorbits/_mars/dataframe/indexing/index_lib.py b/python/xorbits/_mars/dataframe/indexing/index_lib.py new file mode 100644 index 000000000..bc701e40c --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/index_lib.py @@ -0,0 +1,1202 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections import namedtuple +from typing import List, Tuple, Union + +import numpy as np +import pandas as pd +from pandas.core.dtypes.cast import find_common_type + +from ...core import Chunk, OutputType, Tileable, recursive_tile +from ...core.operand import OperandStage +from ...tensor.core import TENSOR_TYPE +from ...tensor.indexing.index_lib import ChunkIndexInfo as ChunkIndexInfoBase +from ...tensor.indexing.index_lib import ( + IndexesHandler, + IndexHandler, + IndexHandlerContext, + IndexInfo, + IndexType, + IntegralIndexHandler, +) +from ...tensor.indexing.index_lib import ( + NDArrayBoolIndexHandler as NDArrayBoolIndexHandlerBase, +) +from ...tensor.indexing.index_lib import SliceIndexHandler as SliceIndexHandlerBase +from ...tensor.indexing.index_lib import ( + TensorBoolIndexHandler as TensorBoolIndexHandlerBase, +) +from ...tensor.utils import ( + calc_pos, + calc_sliced_size, + filter_inputs, + normalize_chunk_sizes, + slice_split, + split_indexes_into_chunks, + to_numpy, +) +from ...utils import classproperty, has_unknown_shape, is_full_slice +from ..core import SERIES_CHUNK_TYPE, SERIES_TYPE, IndexValue +from ..utils import parse_index +from .utils import convert_labels_into_positions + +ChunkIndexAxisInfo = namedtuple( + "chunk_index_axis_info", + ["output_axis_index", "processed_index", "output_shape", "index_value", "dtypes"], +) + + +class ChunkIndexInfo(ChunkIndexInfoBase): + def __init__(self): + super().__init__() + self.index_values = [] + self.dtypes = None + + def set(self, info: ChunkIndexAxisInfo): + super().set(info) + if getattr(info, "index_value", None) is not None: + self.index_values.append(info.index_value) + if getattr(info, "dtypes", None) is not None: + self.dtypes = info.dtypes + + +class FancyIndexInfo(IndexInfo): + def __init__( + self, + index_type: IndexType, + input_axis: int, + output_axis: int, + raw_index, + handler, + ): + super().__init__(index_type, input_axis, output_axis, raw_index, handler) + + # extra info for DataFrame fancy index + # split info + # - chunk_index_to_fancy_index_arrays + # - 
chunk_index_to_raw_positions + # - is_fancy_index_asc_sorted + self.split_info = None + + +class LabelFancyIndexInfo(IndexInfo): + def __init__( + self, + index_type: IndexType, + input_axis: int, + output_axis: int, + raw_index, + handler, + ): + super().__init__(index_type, input_axis, output_axis, raw_index, handler) + + # store chunk_index -> labels + self.chunk_index_to_labels = None + self.is_label_asc_sorted = None + + +class DataFrameIndexHandlerContext(IndexHandlerContext): + def set_tileable(self, tileable: Tileable): + for chunk in tileable.chunks: + self.chunk_index_to_info[chunk.index] = ChunkIndexInfo() + + def concat_chunks(self, chunks: List[Chunk], axis: Union[Tuple[int], int]) -> Chunk: + dataframe_op_type = type(chunks[0].op) + # create tileable from chunks + concat_tileable = dataframe_op_type.create_tileable_from_chunks( + chunks, inputs=chunks + ) + # concat chunks + chunk = dataframe_op_type.concat_tileable_chunks(concat_tileable).chunks[0] + if chunk.ndim > 1 and ( + (isinstance(axis, tuple) and len(axis) == 1) or isinstance(axis, int) + ): + # adjust index and axis + axis = axis[0] if isinstance(axis, tuple) else axis + chunk.op._axis = axis + chunk_index = list(chunk.index) + chunk_index[1 - axis] = chunks[0].index[1 - axis] + chunk._index = tuple(chunk_index) + return chunk + + def create_chunk( + self, chunk_index: Tuple[int], chunk_index_info: ChunkIndexInfo + ) -> Chunk: + chunk_op = self.op.copy().reset_key() + chunk_op._indexes = indexes = chunk_index_info.indexes + chunk_op.stage = OperandStage.map + + chunk_input = self.tileable.cix[chunk_index] + chunk_inputs = filter_inputs([chunk_input] + indexes) + + kw = {} + kw["shape"] = shape = tuple(chunk_index_info.output_chunk_shape) + kw["index"] = tuple(chunk_index_info.output_chunk_index) + index_values = chunk_index_info.index_values + if len(shape) == 0: + # scalar + chunk_op.output_types = [OutputType.scalar] + kw["dtype"] = self.op.outputs[0].dtype + elif len(shape) == 1: + # Series or Index + chunk_op.output_types = ( + [OutputType.index] + if chunk_op._op_module_ == "index" + else [OutputType.series] + ) + kw["index_value"] = index_values[0] + kw["dtype"] = self.op.outputs[0].dtype + kw["name"] = getattr(self.op.outputs[0], "name", None) + else: + # dataframe + chunk_op.output_types = [OutputType.dataframe] + kw["index_value"] = index_values[0] + kw["columns_value"] = index_values[1] + kw["dtypes"] = chunk_index_info.dtypes + + return chunk_op.new_chunk(chunk_inputs, kws=[kw]) + + +class SliceIndexHandler(SliceIndexHandlerBase): + @classmethod + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + tileable = context.tileable + chunk_input = tileable.cix[chunk_index] + slc = index + + kw = { + "output_axis_index": output_axis_index, + "processed_index": slc, + "output_shape": output_shape, + "dtypes": None, + } + if index_info.input_axis == 0: + if is_full_slice(slc): + kw["index_value"] = chunk_input.index_value + else: + index = chunk_input.index_value.to_pandas() + kw["index_value"] = parse_index( + index[slc], chunk_input, slc, store_data=False + ) + else: + assert index_info.input_axis == 1 + index = chunk_input.columns_value.to_pandas() + # do not store index value if output axis is 0 + store_data = True if index_info.output_axis == 1 else False + kw["index_value"] = parse_index(index[slc], store_data=store_data) + kw["dtypes"] = 
chunk_input.dtypes[slc] + + chunk_index_info.set(ChunkIndexAxisInfo(**kw)) + + +class LabelSliceIndexHandler(IndexHandler): + def accept(cls, raw_index): + return isinstance(raw_index, slice) + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = IndexInfo( + IndexType.label_slice, + context.input_axis, + context.output_axis, + raw_index, + self, + ) + context.input_axis += 1 + context.output_axis += 1 + context.append(info) + return info + + @staticmethod + def _slice_all(slc): + return ( + slc.start is None + and slc.stop is None + and (slc.step is None or slc.step == 1) + ) + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + if isinstance(tileable, SERIES_TYPE): + index_value = tileable.index_value + else: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + + # check if chunks have unknown shape + if ( + not self._slice_all(index_info.raw_index) + and index_value.has_value() + and any(np.isnan(ns) for ns in tileable.nsplits[input_axis]) + ): # pragma: no cover + yield + + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + tileable = context.tileable + chunk_input = tileable.cix[chunk_index] + slc = index + + kw = { + "output_axis_index": output_axis_index, + "processed_index": slc, + "output_shape": output_shape, + "dtypes": None, + } + if index_info.input_axis == 0: + if is_full_slice(index): + kw["index_value"] = chunk_input.index_value + else: + index = chunk_input.index_value.to_pandas() + start, stop = index.slice_locs( + slc.start, slc.stop, slc.step, kind="loc" + ) + pos_slc = slice(start, stop, slc.step) + kw["index_value"] = parse_index( + index[pos_slc], chunk_input, slc, store_data=False + ) + else: + assert index_info.input_axis == 1 + dtypes = chunk_input.dtypes + # do not store index value if output axis is 0 + store_data = True if index_info.output_axis == 1 else False + columns = dtypes.loc[slc].index + kw["index_value"] = parse_index(columns, store_data=store_data) + kw["dtypes"] = chunk_input.dtypes[slc] + + chunk_index_info.set(ChunkIndexAxisInfo(**kw)) + + def _process_slice_all_index( + self, + tileable: Tileable, + index_info: IndexInfo, + input_axis: int, + context: IndexHandlerContext, + ) -> None: + index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in index_to_info.items(): + i = chunk_index[input_axis] + size = tileable.nsplits[input_axis][i] + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + i, + slice(None), + size, + ) + + def _process_has_value_index( + self, + tileable: Tileable, + index_info: IndexInfo, + index_value, + input_axis: int, + context: IndexHandlerContext, + ) -> None: + pd_index = index_value.to_pandas() + # turn label-based slice into position-based slice + start, end = pd_index.slice_locs( + index_info.raw_index.start, + index_info.raw_index.stop, + index_info.raw_index.step, + kind="loc", + ) + slc = slice(start, end, index_info.raw_index.step) + + cum_nsplit = [0] + np.cumsum(tileable.nsplits[index_info.input_axis]).tolist() + # split position-based slice into chunk slices + effected_i_to_slc = slice_split(slc, tileable.nsplits[index_info.input_axis]) + is_reversed = (slc.step or 0) < 0 + output_axis_index_range = ( + 
range(len(effected_i_to_slc)) + if not is_reversed + else range(len(effected_i_to_slc) - 1, -1, -1) + ) + other_index_to_iter = dict() + + index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in index_to_info.items(): + i = chunk_index[input_axis] + other_index = chunk_index[:input_axis] + chunk_index[input_axis + 1 :] + size = tileable.nsplits[input_axis][i] + if i not in effected_i_to_slc: + # delete it, the input chunk could be ignored + del context.chunk_index_to_info[chunk_index] + else: + chunk_slc = effected_i_to_slc[i] + output_shape = calc_sliced_size(size, chunk_slc) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = iter(output_axis_index_range) + output_axis_index = next(other_index_to_iter[other_index]) + + # turn position-based slice back into label-based slice + start = chunk_slc.start + if start is not None: + abs_start = cum_nsplit[i] + start + label_start = pd_index[abs_start] + else: + label_start = None + stop = chunk_slc.stop + if stop is not None: + abs_stop = cum_nsplit[i] + stop - 1 # label slice include the stop + label_stop = ( + pd_index[abs_stop] if abs_stop < len(pd_index) else None + ) + else: + label_stop = None + + label_slc = slice(label_start, label_stop, chunk_slc.step) + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + label_slc, + output_shape, + ) + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + if isinstance(tileable, SERIES_TYPE): + index_value = tileable.index_value + else: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + + if self._slice_all(index_info.raw_index): + self._process_slice_all_index(tileable, index_info, input_axis, context) + elif index_value.has_value(): + self._process_has_value_index( + tileable, index_info, index_value, input_axis, context + ) + else: + other_index_to_iter = dict() + # slice on all chunks on the specified axis + for chunk_index, chunk_index_info in context.chunk_index_to_info.items(): + other_index = chunk_index[:1] if input_axis == 1 else chunk_index[1:] + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + index_info.raw_index, + np.nan, + ) + + +class LabelIndexHandler(IndexHandler): + def accept(cls, raw_index): + # accept type other than slice, ndarray and tensor + return not isinstance(raw_index, (slice, np.ndarray, TENSOR_TYPE)) + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + tileable = context.tileable + input_axis = context.input_axis + if tileable.ndim == 2: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + else: + index_value = tileable.index_value + + if index_value.has_value(): + pd_index = index_value.to_pandas() + loc = pd_index.get_loc(raw_index) + if isinstance(loc, slice): + # if is slice, means index not unique, but monotonic + # just call LabelSliceIndexHandler + new_raw_index = slice(raw_index, raw_index) + return LabelSliceIndexHandler.get_instance().parse( + new_raw_index, context + ) + elif isinstance(loc, np.ndarray): + # bool indexing, non unique, and not monotonic + return NDArrayBoolIndexHandler.get_instance().parse(loc, context) + else: + return 
LabelNDArrayFancyIndexHandler.get_instance().parse( + raw_index, context + ) + + info = IndexInfo( + IndexType.label, context.input_axis, context.output_axis, raw_index, self + ) + context.input_axis += 1 + context.append(info) + return info + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + # if index has value on input axis, + # label will be converted to position, + # thus chunks cannot have unknown shape on this axis + tileable = context.tileable + input_axis = index_info.input_axis + if tileable.ndim == 1: + index_value = tileable.index_value + else: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + if index_value.has_value(): + if any( + np.isnan(ns) for ns in tileable.nsplits[input_axis] + ): # pragma: no cover + yield + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + if tileable.ndim == 1: + index_value = tileable.index_value + else: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + + if index_value.has_value(): + pd_index = index_value.to_pandas() + loc = pd_index.get_loc(index_info.raw_index) + + # other situations have been delegated to different handlers + assert isinstance(loc, int) + + effected_i_to_slc = slice_split( + loc, tileable.nsplits[index_info.input_axis] + ) + + index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in index_to_info.items(): + i = chunk_index[input_axis] + if i not in effected_i_to_slc: + # delete it, the input chunk could be ignored + del context.chunk_index_to_info[chunk_index] + else: + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=None, + processed_index=index_info.raw_index, + output_shape=None, + index_value=None, + dtypes=None, + ) + ) + + +class DataFrameIndexHandler: + @classmethod + def _calc_dtypes(cls, dtypes, index, context: IndexHandlerContext): + if getattr(context.op, "can_index_miss", False): + # reindex + return dtypes.reindex(index).fillna(np.dtype(np.float64)) + else: + # loc, iloc + return getattr(dtypes, cls.kind)[index] + + @classmethod + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + tileable = context.tileable + chunk_input = tileable.cix[chunk_index] + + dtypes = None + if index_info.input_axis == 0: + index_value = parse_index( + chunk_input.index_value.to_pandas()[:0], + chunk_input, + index, + store_data=False, + ) + else: + dtypes = cls._calc_dtypes(chunk_input.dtypes, index, context) + columns = dtypes.index + index_value = parse_index(columns, store_data=True) + + info = ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=index, + output_shape=output_shape, + index_value=index_value, + dtypes=dtypes, + ) + chunk_index_info.set(info) + + +class NDArrayBoolIndexHandler(NDArrayBoolIndexHandlerBase): + @classmethod + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + tileable = context.tileable + chunk_input = tileable.cix[chunk_index] + + if index_info.input_axis == 0: + dtype = chunk_input.index_value.to_pandas().dtype + index_value = parse_index( + pd.Index([], dtype=dtype), chunk_input, index, store_data=False + ) + dtypes = 
None + else: + pd_index = chunk_input.columns_value.to_pandas() + filtered_index = pd_index[index] + index_value = parse_index(filtered_index, store_data=True) + dtypes = chunk_input.dtypes[index] + + info = ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=index, + output_shape=output_shape, + index_value=index_value, + dtypes=dtypes, + ) + chunk_index_info.set(info) + + +class TensorBoolIndexHandler(TensorBoolIndexHandlerBase): + @classmethod + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + tileable = context.tileable + chunk_input = tileable.cix[chunk_index] + + assert ( + index_info.input_axis == 0 + ), "bool indexing on axis columns cannot be tensor" + + index_value = parse_index( + pd.Index([], chunk_input.index_value.to_pandas().dtype), + chunk_input, + index, + store_data=False, + ) + + info = ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=index, + output_shape=output_shape, + index_value=index_value, + dtypes=None, + ) + chunk_index_info.set(info) + + +class _FancyIndexHandler(DataFrameIndexHandler, IndexHandler): + @classproperty + def kind(self): # pylint: disable=no-self-use + return "iloc" + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = FancyIndexInfo( + IndexType.fancy_index, + context.input_axis, + context.output_axis, + raw_index, + self, + ) + context.input_axis += 1 + context.output_axis += 1 + context.append(info) + return info + + +class NDArrayFancyIndexHandler(_FancyIndexHandler): + def accept(cls, raw_index): + # raw index like list, and pd.Series + # would have been converted to ndarray or tensor already + return isinstance(raw_index, np.ndarray) and raw_index.dtype != np.bool_ + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + if has_unknown_shape(tileable): # pragma: no cover + yield + + # split raw index into chunks on the given axis + split_info = split_indexes_into_chunks( + [tileable.nsplits[index_info.input_axis]], [index_info.raw_index] + ) + index_info.split_info = split_info + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + chunk_index_to_fancy_index_arrays = index_info.split_info[0] + + other_index_to_iter = dict() + chunk_index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in chunk_index_to_info.items(): + i = chunk_index[index_info.input_axis] + fancy_index_array = chunk_index_to_fancy_index_arrays[(i,)][0] + + if fancy_index_array.size == 0: + # not effected + del context.chunk_index_to_info[chunk_index] + continue + + other_index = ( + chunk_index[:1] if index_info.input_axis == 1 else chunk_index[1:] + ) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + output_axis_shape = fancy_index_array.shape[0] + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + fancy_index_array, + output_axis_shape, + ) + + @classmethod + def need_postprocess(cls, index_info: IndexInfo, context: IndexHandlerContext): + tileable = context.tileable + + if tileable.chunk_shape[index_info.input_axis] == 1: + # if tileable only has 1 chunk on this axis + # do not need postprocess + return False + # if ascending sorted, 
no need to postprocess + return not index_info.split_info[2] + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + # could be 2 fancy indexes at most + fancy_indexes = context.get_indexes(index_info.index_type) + i_fancy_index = fancy_indexes.index(index_info) + need_postprocesses = [ + fancy_index.handler.need_postprocess(fancy_index, context) + for fancy_index in fancy_indexes + ] + + if not need_postprocesses[i_fancy_index]: + # do not need postprocess + return + + if ( + i_fancy_index == 0 + and len(fancy_indexes) == 2 + and need_postprocesses[1] + and isinstance(fancy_indexes[1].raw_index, np.ndarray) + ): + # check if need postprocess if 2 fancy indexes and now it's the first, + # if so, skip postprocess for this one, + # and do MapReduce just once for the second postprocess + return + + chunks, nsplits = context.out_chunks, context.out_nsplits + index_to_chunks = {c.index: c for c in chunks} + + to_concat_axes = tuple( + fancy_index.output_axis + for i, fancy_index in enumerate(fancy_indexes) + if need_postprocesses[i] + ) + reorder_indexes = [ + calc_pos(fancy_index.raw_index.shape, fancy_index.split_info[1]) + for i, fancy_index in enumerate(fancy_indexes) + if need_postprocesses[i] + ] + new_out_chunks = [] + for chunk_index in itertools.product( + *( + range(len(ns)) + for ax, ns in enumerate(nsplits) + if ax not in to_concat_axes + ) + ): + if len(to_concat_axes) == 2: + to_concat_chunks = chunks + else: + to_concat_chunks = [] + for i in range(len(nsplits[to_concat_axes[0]])): + to_concat_index = list(chunk_index) + to_concat_index.insert(to_concat_axes[0], i) + to_concat_chunks.append(index_to_chunks[tuple(to_concat_index)]) + concat_chunk = context.concat_chunks(to_concat_chunks, to_concat_axes) + reorder_chunk = self._create_reorder_chunk( + concat_chunk, to_concat_axes, reorder_indexes, context + ) + new_out_chunks.append(reorder_chunk) + + new_nsplits = list(nsplits) + for fancy_index in fancy_indexes: + new_nsplits[fancy_index.output_axis] = (fancy_index.raw_index.shape[0],) + context.out_chunks = new_out_chunks + context.out_nsplits = new_nsplits + + @classmethod + def _create_reorder_chunk( + cls, + concat_chunk: Chunk, + to_concat_axes: Tuple, + reorder_indexes: List, + context: IndexHandlerContext, + ): + reorder_chunk_op = context.op.copy().reset_key() + indexes = [slice(None)] * concat_chunk.ndim + for ax, reorder_index in zip(to_concat_axes, reorder_indexes): + indexes[ax] = reorder_index + reorder_chunk_op._indexes = indexes + + params = concat_chunk.params + if isinstance(concat_chunk, SERIES_CHUNK_TYPE): + if concat_chunk.index_value.has_value(): + # if concat chunk's index has value, we could calculate the new index + reorder_index = concat_chunk.index_value.to_pandas()[reorder_indexes[0]] + params["index_value"] = parse_index(reorder_index, store_data=True) + else: + params["index_value"] = parse_index( + concat_chunk.index_value.to_pandas(), indexes + ) + return reorder_chunk_op.new_chunk([concat_chunk], kws=[params]) + else: + if 0 in to_concat_axes: + if concat_chunk.index_value.has_value(): + # if concat chunk's index has value, and index on axis 0, + # we could calculate the new index + reorder_index = concat_chunk.index_value.to_pandas()[ + reorder_indexes[0] + ] + params["index_value"] = parse_index(reorder_index, store_data=True) + else: + params["index_value"] = parse_index( + concat_chunk.index_value.to_pandas(), indexes[0] + ) + if 1 in to_concat_axes: + reorder_columns = concat_chunk.columns_value.to_pandas()[ + 
reorder_indexes[-1] + ] + params["columns_value"] = parse_index(reorder_columns, store_data=True) + params["dtypes"] = concat_chunk.dtypes[reorder_indexes[-1]] + + return reorder_chunk_op.new_chunk([concat_chunk], kws=[params]) + + +class _LabelFancyIndexHandler(DataFrameIndexHandler, IndexHandler): + @classproperty + def kind(self): # pylint: disable=no-self-use + return "loc" + + +class LabelNDArrayFancyIndexHandler(_LabelFancyIndexHandler): + def accept(cls, raw_index): + return isinstance(raw_index, np.ndarray) and raw_index.dtype != np.bool_ + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = LabelFancyIndexInfo( + IndexType.label_fancy_index, + context.input_axis, + context.output_axis, + raw_index, + self, + ) + context.input_axis += 1 + if not np.isscalar(raw_index): + context.output_axis += 1 + context.append(info) + return info + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + op = context.op + + input_axis = index_info.input_axis + + # check unknown shape + if any(np.isnan(s) for s in tileable.nsplits[input_axis]): + yield + + if tileable.ndim == 2: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + else: + index_value = tileable.index_value + cum_nsplit = [0] + np.cumsum(tileable.nsplits[input_axis]).tolist() + if not op.can_index_miss and index_value.has_value(): + # df.loc cannot have missed index, reindex can have + # thus for reindex, do not try to resolve by converting to positions + # turn label-based fancy index into position-based + pd_index = index_value.to_pandas() + positions = convert_labels_into_positions(pd_index, index_info.raw_index) + split_info = split_indexes_into_chunks( + [tileable.nsplits[input_axis]], [positions] + ) + chunk_index_to_pos = split_info[0] + is_asc_sorted = split_info[-1] + + # convert back to labels for chunk_index + chunk_index_to_labels = dict() + for chunk_index, pos in chunk_index_to_pos.items(): + # chunk_index and pos are all list with 1 element + abs_pos = pos[0] + cum_nsplit[chunk_index[0]] + if isinstance(pd_index, pd.RangeIndex) and len(abs_pos) == 0: + chunk_labels = np.array([], dtype=pd_index.dtype) + else: + chunk_labels = to_numpy(pd_index[abs_pos]) + chunk_index_to_labels[chunk_index[0]] = chunk_labels + + index_info.is_label_asc_sorted = is_asc_sorted + index_info.chunk_index_to_labels = chunk_index_to_labels + else: + index = index_info.raw_index + if np.isscalar(index): + # delegation from label index handler + index = np.atleast_1d(index) + # does not know the right positions, need postprocess always + index_info.is_label_asc_sorted = False + # do df.loc on each chunk + index_info.chunk_index_to_labels = { + i: index for i in range(tileable.chunk_shape[input_axis]) + } + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + chunk_index_to_labels = index_info.chunk_index_to_labels + full_label_size = sum(labels.size for labels in chunk_index_to_labels.values()) + + other_index_to_iter = dict() + chunk_index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in chunk_index_to_info.items(): + i = chunk_index[input_axis] + chunk_labels = chunk_index_to_labels[i] + size = chunk_labels.size + + if size == 0 and full_label_size > 0 and tileable.shape[0] > 0: + # not effected when + # 1) tileable not empty + # 2) full index not empty + # 3) no index chosen for this chunk + del 
context.chunk_index_to_info[chunk_index] + continue + + if ( + np.isscalar(index_info.raw_index) + and isinstance(tileable.index_value.value, IndexValue.DatetimeIndex) + and isinstance(chunk_labels[0], str) + ): + # special case when index is DatetimeIndex and loc by string + # convert back list to scalar because if keep list, + # KeyError will always happen + chunk_labels = chunk_labels[0].item() + + other_index = chunk_index[:1] if input_axis == 1 else chunk_index[1:] + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + output_axis_shape = size + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + chunk_labels, + output_axis_shape, + ) + + @classmethod + def need_postprocess(cls, index_info: IndexInfo, context: IndexHandlerContext): + # if ascending sorted, no need to postprocess + return not index_info.is_label_asc_sorted + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + if not self.need_postprocess(index_info, context): + # do not need postprocess + return + + chunks, nsplits = context.out_chunks, context.out_nsplits + index_to_chunks = {c.index: c for c in chunks} + + axis = index_info.output_axis + new_out_chunks = [] + chunk_axis_shapes = dict() + for chunk_index in itertools.product( + *(range(len(ns)) for ax, ns in enumerate(nsplits) if ax != axis) + ): + to_concat_chunks = [] + for i in range(len(nsplits[axis])): + if axis == 0: + to_concat_index = (i,) + chunk_index + else: + to_concat_index = chunk_index + (i,) + to_concat_chunks.append(index_to_chunks[to_concat_index]) + concat_chunk = context.concat_chunks(to_concat_chunks, axis) + chunk_op = context.op.copy().reset_key() + indexes = [slice(None)] * len(nsplits) + indexes[axis] = index_info.raw_index + params = concat_chunk.params + if np.isscalar(index_info.raw_index): + assert axis == 0 + if "columns_value" in params: + params["index_value"] = params.pop("columns_value") + params["dtype"] = find_common_type(params["dtypes"].tolist()) + del params["dtypes"] + if getattr(context.op.outputs[0], "name", None) is not None: + params["name"] = context.op.outputs[0].name + if len(params["index"]) == chunks[0].ndim: + index = list(params["index"]) + index.pop(index_info.output_axis) + params["index"] = tuple(index) + shape = list(params["shape"]) + shape.pop(index_info.output_axis) + params["shape"] = tuple(shape) + if context.op.outputs[0].ndim == 0: + del params["index_value"] + elif axis == 0: + pd_index = pd.Index(index_info.raw_index) + params["index_value"] = parse_index(pd_index, store_data=False) + shape = list(params["shape"]) + shape[0] = len(pd_index) + params["shape"] = shape + else: + if context.op.can_index_miss: + # reindex + params["dtypes"] = dtypes = to_concat_chunks[0].dtypes + else: + params["dtypes"] = dtypes = concat_chunk.dtypes.loc[ + index_info.raw_index + ] + params["columns_value"] = parse_index(dtypes.index, store_data=True) + shape = list(params["shape"]) + shape[1] = len(dtypes) + params["shape"] = tuple(shape) + chunk_op._indexes = indexes + chunk_op.stage = OperandStage.agg + out_chunk = chunk_op.new_chunk([concat_chunk], kws=[params]) + if len(out_chunk.shape) != 0: + chunk_axis_shapes[out_chunk.index[axis]] = out_chunk.shape[axis] + new_out_chunks.append(out_chunk) + + new_nsplits = list(nsplits) + if np.isscalar(index_info.raw_index): + new_nsplits = new_nsplits[:axis] + new_nsplits[axis + 1 :] 
+ else: + new_nsplits[axis] = (sum(chunk_axis_shapes.values()),) + context.out_chunks = new_out_chunks + context.out_nsplits = new_nsplits + + +class LabelTensorFancyIndexHandler(_LabelFancyIndexHandler): + def accept(cls, raw_index): + return isinstance(raw_index, TENSOR_TYPE) and raw_index.dtype != np.bool_ + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + if context.input_axis == 1: # pragma: no cover + raise NotImplementedError( + "do not support tensor-based index on columns axis" + ) + info = LabelFancyIndexInfo( + IndexType.label_fancy_index, + context.input_axis, + context.output_axis, + raw_index, + self, + ) + context.input_axis += 1 + context.output_axis += 1 + context.append(info) + return info + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + if has_unknown_shape(index_info.raw_index): + yield + # rechunk index into one + index_info.unprocessed_raw_index = index_info.raw_index + index_info.raw_index = yield from recursive_tile( + index_info.raw_index.rechunk(index_info.raw_index.shape) + ) + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + + assert len(index_info.raw_index.chunks) == 1 + chunk_labels = index_info.raw_index.chunks[0] + + other_index_to_iter = dict() + for chunk in tileable.chunks: + chunk_index = chunk.index + other_index = chunk_index[:1] if input_axis == 1 else chunk_index[1:] + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + self.set_chunk_index_info( + context, + index_info, + chunk_index, + context.chunk_index_to_info[chunk.index], + output_axis_index, + chunk_labels, + len(chunk_labels), + ) + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + from .iloc import DataFrameIlocGetItem + + tileable = context.tileable + out = context.op.outputs[0] + max_chunk_size = max(tileable.nsplits[index_info.input_axis]) + max_chunk_size = 0 if np.isnan(max_chunk_size) else max_chunk_size + max_chunk_size = max( + max_chunk_size, max(index_info.unprocessed_raw_index.nsplits[0]) + ) + new_chunk_sizes = normalize_chunk_sizes( + index_info.raw_index.shape[0], max_chunk_size + )[0] + cum_new_chunk_sizes = [0] + np.cumsum(new_chunk_sizes).tolist() + + chunks, nsplits = context.out_chunks, context.out_nsplits + index_to_chunks = {c.index: c for c in chunks} + + axis = index_info.output_axis + assert axis == 0 + new_out_chunks = [] + for chunk_index in itertools.product( + *(range(len(ns)) for ax, ns in enumerate(nsplits) if ax != axis) + ): + select_chunks = [] + for i in range(len(nsplits[axis])): + select_index = (i,) + chunk_index + select_chunks.append(index_to_chunks[select_index]) + + for j in range(len(new_chunk_sizes)): + slc = slice(cum_new_chunk_sizes[j], cum_new_chunk_sizes[j + 1]) + indexes = [slice(None)] * len(nsplits) + indexes[axis] = slc + + slice_chunks = [] + for select_chunk in select_chunks: + output_types = ( + [OutputType.series] + if len(nsplits) == 1 + else [OutputType.dataframe] + ) + slc_op = DataFrameIlocGetItem( + indexes=indexes, output_types=output_types + ) + slice_chunk_shape = list(select_chunk.shape) + slice_chunk_shape[axis] = new_chunk_sizes[j] + slice_chunk = slc_op.new_chunk( + [select_chunk], shape=tuple(slice_chunk_shape) + ) + slice_chunks.append(slice_chunk) + + chunk_op = context.op.copy().reset_key() + chunk_op.stage = 
OperandStage.agg + chunk_op._indexes = (None,) * len(nsplits) + chunk_op._fill_value = None + assert axis == 0 + params = dict() + params["index"] = ( + (j,) + chunk_index if axis == 0 else chunk_index + (j,) + ) + params["index_value"] = parse_index( + out.index_value.to_pandas()[slc], slice_chunks, store_data=False + ) + if select_chunks[0].ndim == 2: + params["columns_value"] = select_chunks[0].columns_value + params["dtypes"] = select_chunks[0].dtypes + else: + params["dtype"] = select_chunks[0].dtype + params["name"] = select_chunks[0].name + params["shape"] = slice_chunks[0].shape + out_chunk = chunk_op.new_chunk(slice_chunks, kws=[params]) + new_out_chunks.append(out_chunk) + + new_nsplits = list(nsplits) + new_nsplits[axis] = tuple(new_chunk_sizes) + context.out_chunks = new_out_chunks + context.out_nsplits = new_nsplits + + +class DataFrameIlocIndexesHandler(IndexesHandler): + def __init__(self): + super().__init__() + self.register( + IntegralIndexHandler, + SliceIndexHandler, + NDArrayBoolIndexHandler, + TensorBoolIndexHandler, + NDArrayFancyIndexHandler, + ) + + def create_context(self, op): + return DataFrameIndexHandlerContext(op) + + +class DataFrameLocIndexesHandler(IndexesHandler): + def __init__(self): + super().__init__() + self.register( + LabelIndexHandler, + LabelSliceIndexHandler, + NDArrayBoolIndexHandler, + TensorBoolIndexHandler, + LabelNDArrayFancyIndexHandler, + ) + + def create_context(self, op): + return DataFrameIndexHandlerContext(op) + + +class DataFrameReindexHandler(IndexesHandler): + def __init__(self): + super().__init__() + self.register( + LabelTensorFancyIndexHandler, + LabelNDArrayFancyIndexHandler, + LabelSliceIndexHandler, + ) + + def create_context(self, op): + return DataFrameIndexHandlerContext(op) diff --git a/python/xorbits/_mars/dataframe/indexing/insert.py b/python/xorbits/_mars/dataframe/indexing/insert.py new file mode 100644 index 000000000..72f5d4277 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/insert.py @@ -0,0 +1,190 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int64Field +from ...tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ..core import SERIES_CHUNK_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, parse_index + + +class DataFrameInsert(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.INSERT + + _loc = Int64Field("loc") + _column = AnyField("column") + _value = AnyField("value") + _allow_duplicates = BoolField("allow_duplicates") + + def __init__(self, loc=None, column=None, value=None, allow_duplicates=None, **kw): + super().__init__( + _loc=loc, + _column=column, + _value=value, + _allow_duplicates=allow_duplicates, + **kw, + ) + + @property + def loc(self) -> int: + return self._loc + + @property + def column(self): + return self._column + + @property + def value(self): + return self._value + + @property + def allow_duplicates(self): + return self._allow_duplicates + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if len(inputs) > 1: + self._value = self._inputs[-1] + + def __call__(self, df): + inputs = [df] + if isinstance(self.value, (SERIES_TYPE, TENSOR_TYPE)): + value_dtype = self.value.dtype + inputs.append(self.value) + else: + value_dtype = pd.Series(self.value).dtype + + empty_df = build_empty_df(df.dtypes) + empty_df.insert( + loc=self.loc, + column=self.column, + allow_duplicates=self.allow_duplicates, + value=pd.Series([], dtype=value_dtype), + ) + + params = df.params + params["columns_value"] = parse_index(empty_df.columns, store_data=True) + params["dtypes"] = empty_df.dtypes + params["shape"] = (df.shape[0], df.shape[1] + 1) + return self.new_dataframe(inputs, **params) + + @classmethod + def tile(cls, op: "DataFrameInsert"): + inp = op.inputs[0] + value = op.value + if isinstance(value, (SERIES_TYPE, TENSOR_TYPE)): + value = yield from recursive_tile(value.rechunk({0: inp.nsplits[0]})) + out = op.outputs[0] + + chunk_bounds = np.cumsum((0,) + inp.nsplits[1]) + chunk_bounds[-1] += 1 + + chunks = [] + new_split = list(inp.nsplits[1]) + chunk_dtypes = None + chunk_columns_value = None + for c in inp.chunks: + left_bound = int(chunk_bounds[c.index[1]]) + right_bound = int(chunk_bounds[c.index[1] + 1]) + if left_bound > op.loc or right_bound <= op.loc: + chunks.append(c) + continue + + if chunk_dtypes is None: + new_split[c.index[1]] = inp.nsplits[1][c.index[1]] + 1 + + if isinstance(value, (SERIES_TYPE, TENSOR_TYPE)): + value_dtype = value.dtype + else: + value_dtype = pd.Series(value).dtype + + empty_df = build_empty_df(c.dtypes) + empty_df.insert( + loc=op.loc - left_bound, + column=op.column, + allow_duplicates=op.allow_duplicates, + value=pd.Series([], dtype=value_dtype), + ) + + chunk_dtypes = empty_df.dtypes + chunk_columns_value = parse_index(chunk_dtypes.index, store_data=True) + + params = c.params + params["columns_value"] = chunk_columns_value + params["dtypes"] = chunk_dtypes + params["shape"] = (c.shape[0], c.shape[1] + 1) + + new_op = op.copy().reset_key() + new_op._loc = op.loc - left_bound + + if isinstance(value, (SERIES_TYPE, TENSOR_TYPE)): + inputs = [c, value.chunks[c.index[0]]] + else: + inputs = [c] + chunks.append(new_op.new_chunk(inputs, **params)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + [inp], + chunks=chunks, + nsplits=(inp.nsplits[0], tuple(new_split)), + **out.params, + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameInsert"): + input_ = 
ctx[op.inputs[0].key] + value = op.value + if isinstance(value, (SERIES_CHUNK_TYPE, TENSOR_CHUNK_TYPE)): + value = ctx[value.key] + ctx[op.outputs[0].key] = copied = input_.copy() + copied.insert( + loc=op.loc, + column=op.column, + allow_duplicates=op.allow_duplicates, + value=value, + ) + + +def df_insert(df, loc, column, value, allow_duplicates=False): + """ + Insert column into DataFrame at specified location. + + Raises a ValueError if `column` is already contained in the DataFrame, + unless `allow_duplicates` is set to True. + + Parameters + ---------- + loc : int + Insertion index. Must verify 0 <= loc <= len(columns). + column : str, number, or hashable object + Label of the inserted column. + value : int, Series, or array-like + allow_duplicates : bool, optional + """ + if isinstance(value, TENSOR_TYPE) and value.ndim > 1: + raise ValueError( + f"Wrong number of items passed {value.ndim}, placement implies 1" + ) + + op = DataFrameInsert( + loc=loc, column=column, value=value, allow_duplicates=allow_duplicates + ) + out_df = op(df) + df.data = out_df.data diff --git a/python/xorbits/_mars/dataframe/indexing/loc.py b/python/xorbits/_mars/dataframe/indexing/loc.py new file mode 100644 index 000000000..bc3aca577 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/loc.py @@ -0,0 +1,555 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Integral +from typing import Dict + +import numpy as np +import pandas as pd +from pandas.core.dtypes.cast import find_common_type +from pandas.core.indexing import IndexingError + +from ... 
import opcodes as OperandDef +from ...core import ENTITY_TYPE, OutputType +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, KeyField, ListField +from ...tensor.datasource import asarray +from ...tensor.utils import calc_sliced_size, filter_inputs +from ...utils import is_full_slice, lazy_import +from ..core import DATAFRAME_TYPE, IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import is_index_value_identical, parse_index +from .iloc import DataFrameIlocSetItem +from .index_lib import DataFrameLocIndexesHandler + +cudf = lazy_import("cudf") + + +def process_loc_indexes(inp, indexes, fetch_index: bool = True): + ndim = inp.ndim + + if not isinstance(indexes, tuple): + indexes = (indexes,) + if len(indexes) < ndim: + indexes += (slice(None),) * (ndim - len(indexes)) + if len(indexes) > ndim: + raise IndexingError("Too many indexers") + + new_indexes = [] + for ax, index in enumerate(indexes): + if isinstance(index, (list, np.ndarray, pd.Series, ENTITY_TYPE)): + if not isinstance(index, ENTITY_TYPE): + index = np.asarray(index) + elif fetch_index: + index = asarray(index) + if ax == 1: + # do not support tensor index on axis 1 + # because if so, the dtypes and columns_value would be unknown + try: + index = index.fetch() + except (RuntimeError, ValueError): + raise NotImplementedError( + "indexer on axis columns cannot be non-executed tensor" + ) + new_indexes.append(index) + + return new_indexes + + +class DataFrameLoc: + def __init__(self, obj): + self._obj = obj + + def _use_iloc(self, indexes): + # for RangeIndex from 0, use iloc instead of loc + index_value = self._obj.index_value.value + if len(indexes) == 2: + if not isinstance(indexes[1], slice): + return False, None + elif indexes[1] != slice(None): + return False, None + if not isinstance(index_value, IndexValue.RangeIndex): + return False, None + if index_value.slice.start != 0 and index_value.slice.start is not None: + return False, None + if not isinstance(indexes[0], (Integral, slice)): + return False, None + if isinstance(indexes[0], Integral): + if indexes[0] < 0: + return False, None + else: + index0 = indexes[0] + for v in (index0.start, index0.stop, index0.step): + if v is None: + continue + if not isinstance(v, Integral): + return False, None + if v < 0: + return False, None + if index0.stop is not None: + # adjust slice right bound + return ( + True, + [slice(index0.start, index0.stop + 1, index0.step)] + indexes[1:], + ) + return True, None + + def __getitem__(self, indexes): + indexes = process_loc_indexes(self._obj, indexes) + + use_iloc, new_indexes = self._use_iloc(indexes) + if use_iloc: + # use iloc instead + return self._obj.iloc[tuple(new_indexes or indexes)] + + op = DataFrameLocGetItem(indexes=indexes) + return op(self._obj) + + def __setitem__(self, indexes, value): + if not np.isscalar(value): + raise NotImplementedError("Only scalar value is supported to set by loc") + if not isinstance(self._obj, DATAFRAME_TYPE): + raise NotImplementedError("Only DataFrame is supported to set by loc") + indexes = process_loc_indexes(self._obj, indexes, fetch_index=False) + use_iloc, new_indexes = self._use_iloc(indexes) + if use_iloc: + op = DataFrameIlocSetItem(indexes=new_indexes, value=value) + ret = op(self._obj) + self._obj.data = ret.data + else: + other_indices = [] + indices_tileable = [ + idx + for idx in indexes + if isinstance(idx, ENTITY_TYPE) or other_indices.append(idx) + ] + op = DataFramelocSetItem(indexes=other_indices, 
value=value) + ret = op([self._obj] + indices_tileable) + self._obj.data = ret.data + + +class DataFramelocSetItem(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_ILOC_SETITEM + + _indexes = ListField("indexes") + _value = AnyField("value") + + def __init__( + self, indexes=None, value=None, gpu=None, sparse=False, output_types=None, **kw + ): + super().__init__( + _indexes=indexes, + _value=value, + gpu=gpu, + sparse=sparse, + _output_types=output_types, + **kw, + ) + if not self.output_types: + self.output_types = [OutputType.dataframe] + + @property + def indexes(self): + return self._indexes + + @property + def value(self): + return self._value + + def __call__(self, inputs): + df = inputs[0] + return self.new_dataframe( + inputs, + shape=df.shape, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + out_chunks = [] + if len(op.inputs) > 1: + index_series = op.inputs[1] + is_identical = is_index_value_identical(in_df, index_series) + if not is_identical: + raise NotImplementedError("Only identical index value is supported") + if len(in_df.nsplits[1]) != 1: + raise NotImplementedError("Column-split chunks are not supported") + for target_chunk, index_chunk in zip(in_df.chunks, index_series.chunks): + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [target_chunk, index_chunk], + shape=target_chunk.shape, + index=target_chunk.index, + dtypes=target_chunk.dtypes, + index_value=target_chunk.index_value, + columns_value=target_chunk.columns_value, + ) + out_chunks.append(out_chunk) + else: + for target_chunk in in_df.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [target_chunk], + shape=target_chunk.shape, + index=target_chunk.index, + dtypes=target_chunk.dtypes, + index_value=target_chunk.index_value, + columns_value=target_chunk.columns_value, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + shape=out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=in_df.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + r = ctx[op.inputs[0].key].copy(deep=True) + if len(op.inputs) > 1: + row_index = ctx[op.inputs[1].key] + r.loc[(row_index,) + tuple(op.indexes)] = op.value + else: + r.loc[tuple(op.indexes)] = op.value + ctx[chunk.key] = r + + +class DataFrameLocGetItem(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_LOC_GETITEM + + _input = KeyField("input") + _indexes = ListField("indexes") + + def __init__(self, indexes=None, gpu=None, sparse=False, output_types=None, **kw): + super().__init__( + _indexes=indexes, gpu=gpu, sparse=sparse, _output_types=output_types, **kw + ) + + @property + def input(self): + return self._input + + @property + def indexes(self): + return self._indexes + + @property + def can_index_miss(self): + return False + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + indexes = [] + for index in self._indexes: + if isinstance(index, ENTITY_TYPE): + indexes.append(next(inputs_iter)) + else: + indexes.append(index) + self._indexes = list(indexes) + + @classmethod + def _calc_slice_param( + cls, + input_index_value: IndexValue, + pd_index: pd.Index, + inp, + index: slice, + axis: int, + ) -> 
Dict: + param = dict() + if is_full_slice(index): + # full slice on this axis + param["shape"] = inp.shape[axis] + param["index_value"] = input_index_value + if axis == 1: + param["dtypes"] = inp.dtypes + elif input_index_value.has_value(): + start, end = pd_index.slice_locs( + index.start, index.stop, index.step, kind="loc" + ) + slc = slice(start, end, index.step) + size = calc_sliced_size(inp.shape[axis], slc) + param["shape"] = size + out_index = pd_index[slc] + param["index_value"] = parse_index(out_index, store_data=axis == 1) + if axis == 1: + param["dtypes"] = inp.dtypes[slc] + else: + assert axis == 0 + if index.start is None and index.stop is None: + param["shape"] = calc_sliced_size(inp.shape[axis], index) + else: + param["shape"] = np.nan + param["index_value"] = parse_index(pd_index, inp, index) + + return param + + @classmethod + def _calc_bool_index_param( + cls, input_index_value: IndexValue, pd_index: pd.Index, inp, index, axis: int + ) -> Dict: + param = dict() + if input_index_value.has_value(): + if isinstance(index, np.ndarray): + filtered_index = pd_index[index] + param["shape"] = len(filtered_index) + param["index_value"] = parse_index(filtered_index, store_data=axis == 1) + if axis == 1: + param["dtypes"] = inp.dtypes[index] + else: + # tensor, cannot be indexer on axis 1 + assert axis == 0 + param["shape"] = np.nan + param["index_value"] = parse_index( + pd.Index([], dtype=pd_index.dtype), inp, index, store_data=False + ) + else: + assert axis == 0 + if isinstance(index, np.ndarray): + param["shape"] = int(index.sum()) + else: + param["shape"] = np.nan + param["index_value"] = parse_index(pd_index, inp, index, store_data=False) + + return param + + @classmethod + def _calc_fancy_index_param( + cls, input_index_value: IndexValue, pd_index: pd.Index, inp, index, axis: int + ) -> Dict: + param = dict() + if input_index_value.has_value(): + if isinstance(index, np.ndarray): + if not pd_index.is_unique: + assert axis == 1 + # as there's no direct method in pandas to handle fancy indexes + # we creates a empty + new_dtypes = inp.dtypes.loc[index] + param["shape"] = len(new_dtypes) + param["index_value"] = parse_index( + new_dtypes.index, store_data=True + ) + param["dtypes"] = new_dtypes + else: + for it in index: + if it not in pd_index: + axis_name = "index" if axis == 0 else "columns" + raise KeyError( + f"Label [{it}] not found in the [{axis_name}]" + ) + param["shape"] = len(index) + param["index_value"] = parse_index(pd.Index(index), store_data=True) + if axis == 1: + param["dtypes"] = inp.dtypes[index] + else: + assert axis == 0 + param["shape"] = index.shape[0] + param["index_value"] = parse_index( + pd.Index([], dtype=pd_index.dtype), inp, index + ) + else: + assert axis == 0 + param["shape"] = index.shape[0] + param["index_value"] = parse_index(pd_index, inp, index) + + return param + + @classmethod + def _calc_param(cls, inp, axis: int, index) -> Dict: + input_index_value = inp.index_value if axis == 0 else inp.columns_value + pd_index = input_index_value.to_pandas() + + if isinstance(index, slice): + return cls._calc_slice_param(input_index_value, pd_index, inp, index, axis) + elif hasattr(index, "dtype") and index.ndim == 1: + if index.dtype == np.bool_: + # bool indexing + return cls._calc_bool_index_param( + input_index_value, pd_index, inp, index, axis + ) + else: + # fancy indexing + return cls._calc_fancy_index_param( + input_index_value, pd_index, inp, index, axis + ) + else: + param = dict() + if input_index_value.has_value(): + loc = 
pd_index.get_loc(index) + if isinstance(loc, (slice, np.ndarray)): + assert axis == 1 + new_dtypes = inp.dtypes[loc] + param["shape"] = len(new_dtypes) + param["index_value"] = parse_index( + new_dtypes.index, store_data=True + ) + param["dtypes"] = new_dtypes + else: + # append None to indicate returning Series + param["shape"] = None + else: + param["shape"] = None + return param + + def __call__(self, inp): + inputs = [inp] + filter_inputs(self._indexes) + + shape = [] + sizes = [] + index_value = columns_value = dtypes = None + for ax, index in enumerate(self._indexes): + param = self._calc_param(inp, ax, index) + + size = param.get("shape") + sizes.append(size) + if size is not None: + shape.append(size) + + if ax == 0: + index_value = param.get("index_value") + else: + columns_value = param.get("index_value") + dtypes = param.get("dtypes") + + shape = tuple(shape) + if len(shape) == 0: + # scalar + if isinstance(inp, DATAFRAME_TYPE): + dtype = inp.dtypes[self._indexes[1]] + else: + dtype = inp.dtype + return self.new_scalar(inputs, dtype=dtype) + elif len(shape) == 1: + # series + if isinstance(inp, DATAFRAME_TYPE): + if sizes[0] is None: + # label on axis 0 + dtype = find_common_type(list(dtypes)) + return self.new_series( + inputs, + shape=shape, + dtype=dtype, + index_value=columns_value, + name=self._indexes[0], + ) + else: + # label on axis 1 + dtype = inp.dtypes[self._indexes[1]] + return self.new_series( + inputs, + shape=shape, + dtype=dtype, + index_value=index_value, + name=self._indexes[1], + ) + else: + return self.new_series( + inputs, + shape=shape, + dtype=inp.dtype, + index_value=index_value, + name=inp.name, + ) + else: + # dataframe + return self.new_dataframe( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op): + handler = DataFrameLocIndexesHandler() + return [(yield from handler.handle(op))] + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + df = ctx[op.input.key] + if len(op.inputs) > 1: + indexes = tuple( + ctx[index.key] if hasattr(index, "key") else index + for index in op.indexes + ) + else: + indexes = tuple(op.indexes) + xdf = pd if isinstance(df, (pd.Series, pd.DataFrame)) or cudf is None else cudf + + if op.stage != OperandStage.map: + try: + r = df.loc[indexes] + except AttributeError: + # workaround for error when calling series.loc[(index,)] + r = df.loc[indexes[0]] + else: + # for map stage, and when some index is fancy index + # ignore keys that do not exist + new_indexes = [] + str_loc_on_datetime_index = False + for ax, index in enumerate(indexes): + if ax == 0: + if isinstance(index, np.ndarray) and index.dtype != np.bool_: + new_indexes.append(df.index.intersection(index)) + elif isinstance(df.index, pd.DatetimeIndex) and isinstance( + index, str + ): + # special process for datetime index + str_loc_on_datetime_index = True + new_indexes.append(index) + else: + new_indexes.append(index) + else: + new_indexes.append(index) + + try: + r = df.loc[tuple(new_indexes)] + if str_loc_on_datetime_index: + # convert back to DataFrame or Series + if r.ndim == 0: + index = df.index[df.index.get_loc(new_indexes[0])] + r = xdf.Series([r], index=[index]) + elif r.ndim == 1: + rdf = xdf.DataFrame(columns=r.index) + rdf.loc[r.name] = r + r = rdf + except KeyError: + if str_loc_on_datetime_index: + new_indexes[0] = [] + r = df.loc[tuple(new_indexes)] + else: # pragma: no cover + raise + + if isinstance(r, pd.Series) and r.dtype != chunk.dtype: + r = 
r.astype(chunk.dtype) + ctx[chunk.key] = r + + +def loc(a): + return DataFrameLoc(a) diff --git a/python/xorbits/_mars/dataframe/indexing/reindex.py b/python/xorbits/_mars/dataframe/indexing/reindex.py new file mode 100644 index 000000000..f17290b58 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/reindex.py @@ -0,0 +1,900 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +try: + import scipy.sparse as sps +except ImportError: # pragma: no cover + sps = None + +from ... import opcodes +from ...core import ENTITY_TYPE, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + Int64Field, + KeyField, + StringField, +) +from ...tensor import tensor as astensor +from ...utils import lazy_import, pd_release_version +from ..core import INDEX_TYPE +from ..core import Index as DataFrameIndexType +from ..initializer import Index as asindex +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, validate_axis_style_args +from .index_lib import DataFrameReindexHandler + +cudf = lazy_import("cudf") + +# under pandas<1.1, SparseArray ignores zeros on creation +_pd_sparse_miss_zero = pd_release_version[:2] < (1, 1) + + +class DataFrameReindex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.REINDEX + + _input = KeyField("input") + _index = AnyField("index") + _index_freq = AnyField("index_freq") + _columns = AnyField("columns") + _method = StringField("method") + _level = AnyField("level") + _fill_value = AnyField("fill_value") + _limit = Int64Field("limit") + _enable_sparse = BoolField("enable_sparse") + + def __init__( + self, + index=None, + index_freq=None, + columns=None, + method=None, + level=None, + fill_value=None, + limit=None, + enable_sparse=None, + **kw, + ): + super().__init__( + _index=index, + _index_freq=index_freq, + _columns=columns, + _method=method, + _level=level, + _fill_value=fill_value, + _limit=limit, + _enable_sparse=enable_sparse, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def index(self): + return self._index + + @property + def index_freq(self): + return self._index_freq + + @property + def columns(self): + return self._columns + + @property + def method(self): + return self._method + + @property + def level(self): + return self._level + + @property + def fill_value(self): + return self._fill_value + + @property + def limit(self): + return self._limit + + @property + def enable_sparse(self): + return self._enable_sparse + + @property + def _indexes(self): + # used for index_lib + indexes = [] + names = ("index", "columns") + for ax in range(self.input.ndim): + index = names[ax] + val = getattr(self, index) + if val is not None: + indexes.append(val) + else: + indexes.append(slice(None)) + return indexes + + @_indexes.setter + def _indexes(self, new_indexes): + for index_field, new_index in zip(["_index", "_columns"], 
new_indexes): + setattr(self, index_field, new_index) + + @property + def indexes(self): + return self._indexes + + @property + def can_index_miss(self): + return True + + def _new_chunks(self, inputs, kws=None, **kw): + if self.stage == OperandStage.map and len(inputs) < len(self._inputs): + assert len(inputs) == len(self._inputs) - 1 + inputs.append(self._fill_value.chunks[0]) + + if self.stage == OperandStage.agg and self._fill_value is not None: + # fill_value is not required + self._fill_value = None + + return super()._new_chunks(inputs, kws=kws, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + if self._index is not None and isinstance(self._index, ENTITY_TYPE): + self._index = next(inputs_iter) + if self._fill_value is not None and isinstance(self._fill_value, ENTITY_TYPE): + self._fill_value = next(inputs_iter) + + def __call__(self, df_or_series): + inputs = [df_or_series] + shape = list(df_or_series.shape) + index_value = df_or_series.index_value + columns_value = dtypes = None + if df_or_series.ndim == 2: + columns_value = df_or_series.columns_value + dtypes = df_or_series.dtypes + + if self._index is not None: + shape[0] = self._index.shape[0] + index_value = asindex(self._index).index_value + self._index = astensor(self._index) + if isinstance(self._index, ENTITY_TYPE): + inputs.append(self._index) + if self._columns is not None: + shape[1] = self._columns.shape[0] + dtypes = df_or_series.dtypes.reindex(index=self._columns).fillna( + np.dtype(np.float64) + ) + columns_value = parse_index(dtypes.index, store_data=True) + if self._fill_value is not None and isinstance(self._fill_value, ENTITY_TYPE): + inputs.append(self._fill_value) + + if df_or_series.ndim == 1: + return self.new_series( + inputs, + shape=tuple(shape), + dtype=df_or_series.dtype, + index_value=index_value, + name=df_or_series.name, + ) + else: + return self.new_dataframe( + inputs, + shape=tuple(shape), + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op): + if all(len(inp.chunks) == 1 for inp in op.inputs): + # tile one chunk + out = op.outputs[0] + + chunk_op = op.copy().reset_key() + chunk_params = out.params.copy() + chunk_params["index"] = (0,) * out.ndim + out_chunk = chunk_op.new_chunk( + [inp.chunks[0] for inp in op.inputs], kws=[chunk_params] + ) + + params = out.params.copy() + params["nsplits"] = ((s,) for s in out.shape) + params["chunks"] = [out_chunk] + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + handler = DataFrameReindexHandler() + result = yield from handler.handle(op) + if op.method is None and op.fill_value is None: + return [result] + else: + axis = 1 if op.columns is not None and op.index is None else 0 + result = result.fillna( + value=op.fill_value, method=op.method, axis=axis, limit=op.limit + ) + return [(yield from recursive_tile(result))] + + @classmethod + def _get_value(cls, ctx, obj): + if obj is not None and hasattr(obj, "key"): + return ctx[obj.key] + return obj + + @classmethod + def _convert_to_writable(cls, obj): + if isinstance(obj, np.ndarray) and not obj.flags.writeable: + return obj.copy() + return obj + + @classmethod + def _sparse_reindex(cls, inp, index=None, columns=None): + if inp.ndim == 2: + columns = inp.columns if columns is None else columns + index_shape = len(index) if index is not None else len(inp) + i_to_columns = dict() + + for i, col in enumerate(columns): + if col in 
inp.dtypes: + if index is None: + i_to_columns[i] = inp[col] + else: + indexer = inp.index.reindex(index)[1] + cond = indexer >= 0 + available_indexer = indexer[cond] + del indexer + data = inp[col].iloc[available_indexer].to_numpy() + ind = cond.nonzero()[0] + spmatrix = sps.csc_matrix( + (data, (ind, np.zeros_like(ind))), + shape=(index_shape, 1), + dtype=inp[col].dtype, + ) + # convert to SparseDtype(xxx, np.nan) + # to ensure 0 in sparse_array not converted to np.nan + if not _pd_sparse_miss_zero: + sparse_array = pd.arrays.SparseArray.from_spmatrix(spmatrix) + sparse_array = pd.arrays.SparseArray( + sparse_array.sp_values, + sparse_index=sparse_array.sp_index, + fill_value=np.nan, + dtype=pd.SparseDtype(sparse_array.dtype, np.nan), + ) + else: + from pandas._libs.sparse import IntIndex + + sparse_array = pd.arrays.SparseArray( + data, + sparse_index=IntIndex(index_shape, ind), + fill_value=np.nan, + dtype=pd.SparseDtype(data.dtype, np.nan), + ) + series = pd.Series(sparse_array, index=index) + + i_to_columns[i] = series + else: + ind = index if index is not None else inp.index + i_to_columns[i] = pd.DataFrame.sparse.from_spmatrix( + sps.coo_matrix((index_shape, 1), dtype=np.float64), index=ind + ).iloc[:, 0] + + df = pd.DataFrame(i_to_columns) + df.columns = columns + return df + else: + indexer = inp.index.reindex(index)[1] + cond = indexer >= 0 + available_indexer = indexer[cond] + del indexer + data = inp.iloc[available_indexer].to_numpy() + ind = cond.nonzero()[0] + spmatrix = sps.csc_matrix( + (data, (ind, np.zeros_like(ind))), + shape=(len(index), 1), + dtype=inp.dtype, + ) + sparse_array = pd.arrays.SparseArray.from_spmatrix(spmatrix) + # convert to SparseDtype(xxx, np.nan) + # to ensure 0 in sparse_array not converted to np.nan + sparse_array = pd.arrays.SparseArray( + sparse_array.sp_values, + sparse_index=sparse_array.sp_index, + fill_value=np.nan, + dtype=pd.SparseDtype(sparse_array.dtype, np.nan), + ) + series = pd.Series(sparse_array, index=index, name=inp.name) + return series + + @classmethod + def _reindex(cls, ctx, op, fill=True, try_sparse=None): + inp = cls._convert_to_writable(ctx[op.input.key]) + index = cls._get_value(ctx, op.index) + if op.index_freq is not None: + index = pd.Index(index, freq=op.index_freq) + columns = cls._get_value(ctx, op.columns) + kw = {"level": op.level} + if index is not None and not isinstance(index, slice): + kw["index"] = cls._convert_to_writable(index) + if columns is not None and not isinstance(columns, slice): + kw["columns"] = cls._convert_to_writable(columns) + if fill: + kw["method"] = op.method + kw["fill_value"] = cls._get_value(ctx, op.fill_value) + kw["limit"] = op.limit + + if ( + try_sparse + and not fill + and op.level is None + and isinstance(inp, (pd.DataFrame, pd.Series)) + and sps is not None + ): + # 1. sparse is used in map only + # 2. for MultiIndex, sparse is not needed as well + # 3. only consider cpu + # 4. 
scipy is installed + + if op.enable_sparse is None: + # try to use sparse if estimated size > 2 * input_size + cur_size = inp.memory_usage(deep=True) + if inp.ndim == 2: + cur_size = cur_size.sum() + element_size = cur_size / inp.size + shape = list(inp.shape) + if "index" in kw: + shape[0] = len(kw["index"]) + if "columns" in kw: + shape[1] = len(kw["columns"]) + estimate_size = np.prod(shape) * element_size + + fitted = estimate_size > cur_size * 2 + else: + # specified when op.enable_sparse == True + fitted = True + + if fitted: + # use sparse instead + return cls._sparse_reindex( + inp, index=kw.get("index"), columns=kw.get("columns") + ) + + return inp.reindex(**kw) + + @classmethod + def _execute_reindex(cls, ctx, op): + ctx[op.outputs[0].key] = cls._reindex(ctx, op) + + @classmethod + def _execute_map(cls, ctx, op): + if op.enable_sparse is not None: + try_sparse = op.enable_sparse + else: + try_sparse = True + ctx[op.outputs[0].key] = cls._reindex( + ctx, op, fill=False, try_sparse=try_sparse + ) + + @classmethod + def _convert_to_dense(cls, series): + if isinstance(series.dtype, pd.SparseDtype): + return series.astype( + pd.SparseDtype(series.dtype.subtype, np.nan) + ).sparse.to_dense() + return series + + @classmethod + def _merge_chunks(cls, inputs): + xdf = cls._get_xdf(inputs[0]) + + ndim = inputs[0].ndim + if ndim == 2: + columns = inputs[0].columns + result = xdf.DataFrame( + np.full(inputs[0].shape, np.nan), columns=columns, index=inputs[0].index + ) + else: + columns = [inputs[0].name] + result = None + + for i in range(len(columns)): + if ndim == 1: + curr = cls._convert_to_dense(inputs[0]).copy() + else: + curr = cls._convert_to_dense(inputs[0].iloc[:, i]).copy() + for j in range(len(inputs) - 1): + if ndim == 2: + left = cls._convert_to_dense(inputs[j].iloc[:, i]) + right = cls._convert_to_dense(inputs[j + 1].iloc[:, i]) + else: + left = cls._convert_to_dense(inputs[j]) + right = cls._convert_to_dense(inputs[j + 1]) + + left_notna = left.notna() + right_notna = right.notna() + if (left_notna & right_notna).sum() > 0: + raise ValueError("cannot reindex from a duplicate axis") + curr.loc[left_notna] = left.loc[left_notna] + curr.loc[right_notna] = right.loc[right_notna] + if ndim == 1: + result = curr + else: + result.iloc[:, i] = curr + + return result + + @classmethod + def _get_xdf(cls, obj): + return ( + pd if isinstance(obj, (pd.DataFrame, pd.Series)) or cudf is None else cudf + ) + + @classmethod + def _execute_agg(cls, ctx, op): + out = op.outputs[0] + + if op.index is None and op.columns is None: + # index is tensor + inputs = [ctx[inp.key] for inp in op.inputs] + + xdf = cls._get_xdf(inputs[0]) + + if inputs[0].index.nlevels > 1 and op.level is not None: + # multi index + result = xdf.concat(inputs) + else: + result = cls._merge_chunks(inputs) if len(inputs) > 1 else inputs[0] + + ctx[out.key] = result + + else: + # ndarray index or columns + if isinstance(op.index, slice) and op.index == slice(None): + axis = 1 + labels = op.columns + else: + assert op.columns is None or ( + isinstance(op.columns, slice) and op.columns == slice(None) + ) + axis = 0 + labels = op.index + + inp = ctx[op.inputs[0].key] + if inp.index.nlevels > 1 and op.level is not None: + new_inp = inp + else: + # split input + size = out.shape[axis] + assert inp.shape[axis] % size == 0 + inputs = [] + for i in range(inp.shape[axis] // size): + slc = [slice(None)] * inp.ndim + slc[axis] = slice(size * i, size * (i + 1)) + inputs.append(inp.iloc[tuple(slc)]) + new_inp = cls._merge_chunks(inputs) + 
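        # [editor's note -- illustrative sketch, not part of this patch] The block
        # below mirrors, in plain pandas, the column-wise merge performed by
        # _merge_chunks above: each partial reindex result is NaN where its source
        # chunk lacked the requested label, overlapping non-NaN positions mean the
        # axis holds duplicates ("cannot reindex from a duplicate axis"), and the
        # surviving values are combined. Variable names are hypothetical and the
        # snippet is meant to be run standalone, not here inside _execute_agg.
        #
        #     import numpy as np
        #     import pandas as pd
        #
        #     part_a = pd.Series([1.0, np.nan, np.nan], index=["x", "y", "z"])
        #     part_b = pd.Series([np.nan, 2.0, np.nan], index=["x", "y", "z"])
        #     if (part_a.notna() & part_b.notna()).any():
        #         raise ValueError("cannot reindex from a duplicate axis")
        #     merged = part_a.copy()
        #     merged[part_b.notna()] = part_b[part_b.notna()]
        #     # merged is now x=1.0, y=2.0, z=NaN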
+ labels = cls._convert_to_writable(labels) + if out.ndim == 2: + result = new_inp.reindex(labels=labels, axis=axis, level=op.level) + else: + result = new_inp.reindex(index=labels, level=op.level) + ctx[out.key] = result + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + elif op.stage == OperandStage.agg: + return cls._execute_agg(ctx, op) + else: + assert op.stage is None + return cls._execute_reindex(ctx, op) + + +def reindex(df_or_series, *args, **kwargs): + """ + Conform Series/DataFrame to new index with optional filling logic. + + Places NA/NaN in locations having no value in the previous index. A new object + is produced unless the new index is equivalent to the current one and + ``copy=False``. + + Parameters + ---------- + labels : array-like, optional + New labels / index to conform the axis specified by 'axis' to. + index, columns : array-like, optional + New labels / index to conform to, should be specified using + keywords. Preferably an Index object to avoid duplicating data. + axis : int or str, optional + Axis to target. Can be either the axis name ('index', 'columns') + or number (0, 1). + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. + Please note: this is only applicable to DataFrames/Series with a + monotonically increasing/decreasing index. + + * None (default): don't fill gaps + * pad / ffill: Propagate last valid observation forward to next + valid. + * backfill / bfill: Use next valid observation to fill gap. + * nearest: Use nearest valid observations to fill gap. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : scalar, default np.NaN + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value. + limit : int, default None + Maximum number of consecutive elements to forward or backward fill. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + Returns + ------- + Series/DataFrame with changed index. + + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + + ``DataFrame.reindex`` supports two calling conventions + + * ``(index=index_labels, columns=column_labels, ...)`` + * ``(labels, axis={'index', 'columns'}, ...)`` + + We *highly* recommend using keyword arguments to clarify your + intent. + + Create a dataframe with some fictional data. + + >>> import mars.dataframe as md + >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] + >>> df = md.DataFrame({'http_status': [200, 200, 404, 404, 301], + ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, + ... 
index=index) + >>> df.execute() + http_status response_time + Firefox 200 0.04 + Chrome 200 0.02 + Safari 404 0.07 + IE10 404 0.08 + Konqueror 301 1.00 + + Create a new index and reindex the dataframe. By default + values in the new index that do not have corresponding + records in the dataframe are assigned ``NaN``. + + >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', + ... 'Chrome'] + >>> df.reindex(new_index).execute() + http_status response_time + Safari 404.0 0.07 + Iceweasel NaN NaN + Comodo Dragon NaN NaN + IE10 404.0 0.08 + Chrome 200.0 0.02 + + We can fill in the missing values by passing a value to + the keyword ``fill_value``. Because the index is not monotonically + increasing or decreasing, we cannot use arguments to the keyword + ``method`` to fill the ``NaN`` values. + + >>> df.reindex(new_index, fill_value=0).execute() + http_status response_time + Safari 404 0.07 + Iceweasel 0 0.00 + Comodo Dragon 0 0.00 + IE10 404 0.08 + Chrome 200 0.02 + + >>> df.reindex(new_index, fill_value='missing').execute() + http_status response_time + Safari 404 0.07 + Iceweasel missing missing + Comodo Dragon missing missing + IE10 404 0.08 + Chrome 200 0.02 + + We can also reindex the columns. + + >>> df.reindex(columns=['http_status', 'user_agent']).execute() + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + Or we can use "axis-style" keyword arguments + + >>> df.reindex(['http_status', 'user_agent'], axis="columns").execute() + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + To further illustrate the filling functionality in + ``reindex``, we will create a dataframe with a + monotonically increasing index (for example, a sequence + of dates). + + >>> date_index = md.date_range('1/1/2010', periods=6, freq='D') + >>> df2 = md.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, + ... index=date_index) + >>> df2.execute() + prices + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + + Suppose we decide to expand the dataframe to cover a wider + date range. + + >>> date_index2 = md.date_range('12/29/2009', periods=10, freq='D') + >>> df2.reindex(date_index2).execute() + prices + 2009-12-29 NaN + 2009-12-30 NaN + 2009-12-31 NaN + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + The index entries that did not have a value in the original data frame + (for example, '2009-12-29') are by default filled with ``NaN``. + If desired, we can fill in the missing values using one of several + options. + + For example, to back-propagate the last valid value to fill the ``NaN`` + values, pass ``bfill`` as an argument to the ``method`` keyword. + + >>> df2.reindex(date_index2, method='bfill').execute() + prices + 2009-12-29 100.0 + 2009-12-30 100.0 + 2009-12-31 100.0 + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + Please note that the ``NaN`` value present in the original dataframe + (at index value 2010-01-03) will not be filled by any of the + value propagation schemes. This is because filling while reindexing + does not look at dataframe values, but only compares the original and + desired indexes. If you do want to fill in the ``NaN`` values present + in the original dataframe, use the ``fillna()`` method. 
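    [Editor's note, not part of the pandas-derived docstring] The implementation in
    this patch also accepts an ``enable_sparse`` keyword, popped from ``kwargs`` in
    the function body below and forwarded to ``DataFrameReindex``. Judging from
    ``_execute_map``/``_reindex`` above, ``None`` (the default) lets the map stage
    decide heuristically whether to build intermediate results as pandas sparse
    columns, while ``True``/``False`` appear to force or disable that path. A
    hedged, illustrative call (``df`` and ``new_index`` as in the examples above):

    >>> df.reindex(new_index, enable_sparse=False)  # doctest: +SKIP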
+ + See the :ref:`user guide ` for more. + """ + axes = validate_axis_style_args(df_or_series, args, kwargs, "labels", "reindex") + # Pop these, since the values are in `kwargs` under different names + kwargs.pop("index", None) + if df_or_series.ndim > 1: + kwargs.pop("columns", None) + kwargs.pop("axis", None) + kwargs.pop("labels", None) + method = kwargs.pop("method", None) + level = kwargs.pop("level", None) + copy = kwargs.pop("copy", True) + limit = kwargs.pop("limit", None) + tolerance = kwargs.pop("tolerance", None) + fill_value = kwargs.pop("fill_value", None) + enable_sparse = kwargs.pop("enable_sparse", None) + + if kwargs: + raise TypeError( + "reindex() got an unexpected keyword " + f'argument "{list(kwargs.keys())[0]}"' + ) + + if tolerance is not None: # pragma: no cover + raise NotImplementedError("`tolerance` is not supported yet") + + if method == "nearest": # pragma: no cover + raise NotImplementedError("method=nearest is not supported yet") + + index = axes.get("index") + index_freq = None + if isinstance(index, ENTITY_TYPE): + if isinstance(index, DataFrameIndexType): + index_freq = getattr(index.index_value.value, "freq", None) + if not isinstance(index, INDEX_TYPE): + index = astensor(index) + elif index is not None: + index = np.asarray(index) + index_freq = getattr(index, "freq", None) + + columns = axes.get("columns") + if isinstance(columns, ENTITY_TYPE): # pragma: no cover + try: + columns = columns.fetch() + except ValueError: + raise NotImplementedError( + "`columns` need to be executed first if it's a Mars object" + ) + elif columns is not None: + columns = np.asarray(columns) + + if isinstance(fill_value, ENTITY_TYPE) and getattr(fill_value, "ndim", 0) != 0: + raise ValueError("fill_value must be a scalar") + + op = DataFrameReindex( + index=index, + index_freq=index_freq, + columns=columns, + method=method, + level=level, + fill_value=fill_value, + limit=limit, + enable_sparse=enable_sparse, + ) + ret = op(df_or_series) + + if copy: + return ret.copy() + return ret + + +def reindex_like( + df_or_series, other, method=None, copy=True, limit=None, tolerance=None +): + """ + Return an object with matching indices as other object. + + Conform the object to the same index on all axes. Optional + filling logic, placing NaN in locations having no value + in the previous index. A new object is produced unless the + new index is equivalent to the current one and copy=False. + + Parameters + ---------- + other : Object of the same data type + Its row and column indices are used to define the new indices + of this object. + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. + Please note: this is only applicable to DataFrames/Series with a + monotonically increasing/decreasing index. + + * None (default): don't fill gaps + * pad / ffill: propagate last valid observation forward to next + valid + * backfill / bfill: use next valid observation to fill gap + * nearest: use nearest valid observations to fill gap. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + limit : int, default None + Maximum number of consecutive labels to fill for inexact matches. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations must + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. 
+ + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + Returns + ------- + Series or DataFrame + Same type as caller, but with changed indices on each axis. + + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex : Change to new indices or expand indices. + + Notes + ----- + Same as calling + ``.reindex(index=other.index, columns=other.columns,...)``. + + Examples + -------- + >>> import pandas as pd + >>> import mars.dataframe as md + >>> df1 = md.DataFrame([[24.3, 75.7, 'high'], + ... [31, 87.8, 'high'], + ... [22, 71.6, 'medium'], + ... [35, 95, 'medium']], + ... columns=['temp_celsius', 'temp_fahrenheit', + ... 'windspeed'], + ... index=md.date_range(start='2014-02-12', + ... end='2014-02-15', freq='D')) + + >>> df1.execute() + temp_celsius temp_fahrenheit windspeed + 2014-02-12 24.3 75.7 high + 2014-02-13 31 87.8 high + 2014-02-14 22 71.6 medium + 2014-02-15 35 95 medium + + >>> df2 = md.DataFrame([[28, 'low'], + ... [30, 'low'], + ... [35.1, 'medium']], + ... columns=['temp_celsius', 'windspeed'], + ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', + ... '2014-02-15'])) + + >>> df2.execute() + temp_celsius windspeed + 2014-02-12 28.0 low + 2014-02-13 30.0 low + 2014-02-15 35.1 medium + + >>> df2.reindex_like(df1).execute() + temp_celsius temp_fahrenheit windspeed + 2014-02-12 28.0 NaN low + 2014-02-13 30.0 NaN low + 2014-02-14 NaN NaN NaN + 2014-02-15 35.1 NaN medium + """ + cond = df_or_series.index_value.key == other.index_value.key + if df_or_series.ndim == 2: + cond &= df_or_series.columns_value.key == other.columns_value.key + if cond and not copy: + return df_or_series + + kw = { + "index": other.index, + "method": method, + "limit": limit, + "tolerance": tolerance, + } + if df_or_series.ndim == 2: + kw["columns"] = other.dtypes.index + return reindex(df_or_series, **kw) diff --git a/python/xorbits/_mars/dataframe/indexing/rename.py b/python/xorbits/_mars/dataframe/indexing/rename.py new file mode 100644 index 000000000..5a38462a6 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/rename.py @@ -0,0 +1,555 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +from ... 
import opcodes +from ...core import OutputType, get_output_types +from ...serialization.serializables import AnyField, StringField +from ..core import SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_series, parse_index, validate_axis + + +class DataFrameRename(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.RENAME + + _columns_mapper = AnyField("columns_mapper") + _index_mapper = AnyField("index_mapper") + _new_name = AnyField("new_name") + _level = AnyField("level") + _errors = StringField("errors") + + def __init__( + self, + columns_mapper=None, + index_mapper=None, + new_name=None, + level=None, + errors=None, + output_types=None, + **kw + ): + super().__init__( + _columns_mapper=columns_mapper, + _index_mapper=index_mapper, + _new_name=new_name, + _level=level, + _errors=errors, + _output_types=output_types, + **kw + ) + + @property + def columns_mapper(self): + return self._columns_mapper + + @property + def index_mapper(self): + return self._index_mapper + + @property + def new_name(self): + return self._new_name + + @property + def level(self): + return self._level + + @property + def errors(self) -> str: + return self._errors + + def _calc_renamed_df(self, df, errors="ignore"): + empty_df = build_df(df) + return empty_df.rename( + columns=self._columns_mapper, + index=self._index_mapper, + level=self._level, + errors=errors, + ) + + def _calc_renamed_series(self, df, errors="ignore"): + empty_series = build_series(df, name=df.name) + new_series = empty_series.rename( + index=self._index_mapper, level=self._level, errors=errors + ) + if self._new_name: + new_series.name = self._new_name + return new_series + + def __call__(self, df): + params = df.params + raw_index = df.index_value.to_pandas() + if df.ndim == 2: + new_df = self._calc_renamed_df(df, errors=self.errors) + new_index = new_df.index + elif isinstance(df, SERIES_TYPE): + new_df = self._calc_renamed_series(df, errors=self.errors) + new_index = new_df.index + else: + new_df = new_index = raw_index.set_names( + self._index_mapper or self._new_name, level=self._level + ) + + if self._columns_mapper is not None: + params["columns_value"] = parse_index(new_df.columns, store_data=True) + params["dtypes"] = new_df.dtypes + if self._index_mapper is not None: + params["index_value"] = parse_index(new_index) + if df.ndim == 1: + params["name"] = new_df.name + return self.new_tileable([df], **params) + + @classmethod + def tile(cls, op: "DataFrameRename"): + inp = op.inputs[0] + out = op.outputs[0] + chunks = [] + + dtypes_cache = dict() + for c in inp.chunks: + params = c.params + new_op = op.copy().reset_key() + + if op.columns_mapper is not None: + try: + new_dtypes = dtypes_cache[c.index[1]] + except KeyError: + new_dtypes = dtypes_cache[c.index[1]] = op._calc_renamed_df( + c + ).dtypes + + params["columns_value"] = parse_index(new_dtypes.index, store_data=True) + params["dtypes"] = new_dtypes + if op.index_mapper is not None: + params["index_value"] = out.index_value + if out.ndim == 1: + params["name"] = out.name + + if isinstance(op.columns_mapper, dict): + idx = params["dtypes"].index + if op._level is not None: + idx = idx.get_level_values(op._level) + new_op._columns_mapper = { + k: v for k, v in op.columns_mapper.items() if v in idx + } + chunks.append(new_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + [inp], chunks=chunks, nsplits=inp.nsplits, **out.params + ) + + @classmethod + def 
execute(cls, ctx, op: "DataFrameRename"): + input_ = ctx[op.inputs[0].key] + if input_.ndim == 2: + ctx[op.outputs[0].key] = input_.rename( + index=op.index_mapper, columns=op.columns_mapper, level=op.level + ) + elif op.output_types[0] == OutputType.series: + ctx[op.outputs[0].key] = input_.rename( + index=op.index_mapper or op.new_name, level=op.level + ) + else: + ctx[op.outputs[0].key] = input_.set_names( + op.index_mapper or op.new_name, level=op.level + ) + + +def _rename( + df_obj, + index_mapper=None, + columns_mapper=None, + copy=True, + inplace=False, + level=None, + errors="ignore", +): + if not copy: + raise NotImplementedError("`copy=False` not implemented") + + if index_mapper is not None and errors == "raise" and not inplace: + warnings.warn("Errors will not raise for non-existing indices") + + op = DataFrameRename( + columns_mapper=columns_mapper, + index_mapper=index_mapper, + level=level, + errors=errors, + output_types=get_output_types(df_obj), + ) + ret = op(df_obj) + if inplace: + df_obj.data = ret.data + else: + return ret + + +def df_rename( + df, + mapper=None, + index=None, + columns=None, + axis="index", + copy=True, + inplace=False, + level=None, + errors="ignore", +): + """ + Alter axes labels. + + Function / dict values must be unique (1-to-1). Labels not contained in + a dict / Series will be left as-is. Extra labels listed don't throw an + error. + + Parameters + ---------- + mapper : dict-like or function + Dict-like or functions transformations to apply to + that axis' values. Use either ``mapper`` and ``axis`` to + specify the axis to target with ``mapper``, or ``index`` and + ``columns``. + index : dict-like or function + Alternative to specifying axis (``mapper, axis=0`` + is equivalent to ``index=mapper``). + columns : dict-like or function + Alternative to specifying axis (``mapper, axis=1`` + is equivalent to ``columns=mapper``). + axis : int or str + Axis to target with ``mapper``. Can be either the axis name + ('index', 'columns') or number (0, 1). The default is 'index'. + copy : bool, default True + Also copy underlying data. + inplace : bool, default False + Whether to return a new DataFrame. If True then value of copy is + ignored. + level : int or level name, default None + In case of a MultiIndex, only rename labels in the specified + level. + errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, + or `columns` contains labels that are not present in the Index + being transformed. + If 'ignore', existing keys will be renamed and extra keys will be + ignored. + + Returns + ------- + DataFrame + DataFrame with the renamed axis labels. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis and + "errors='raise'". + + See Also + -------- + DataFrame.rename_axis : Set the name of the axis. + + Examples + -------- + + ``DataFrame.rename`` supports two calling conventions + + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` + + We *highly* recommend using keyword arguments to clarify your + intent. 
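    [Editor's note] The function body below warns that when an index mapper is
    combined with ``errors="raise"`` and ``inplace=False``, errors will not be
    raised for non-existing indices; the ``KeyError`` example further down therefore
    illustrates the column-label case only. A minimal, hedged sketch (``df`` as in
    the examples below; the index label is made up):

    >>> df.rename(index={99: "zz"}, errors="raise")  # doctest: +SKIP  (emits a warning)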
+ + Rename columns using a mapping: + + >>> import mars.dataframe as md + >>> df = md.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + >>> df.rename(columns={"A": "a", "B": "c"}).execute() + a c + 0 1 4 + 1 2 5 + 2 3 6 + + Rename index using a mapping: + + >>> df.rename(index={0: "x", 1: "y", 2: "z"}).execute() + A B + x 1 4 + y 2 5 + z 3 6 + + Cast index labels to a different type: + + >>> df.index.execute() + RangeIndex(start=0, stop=3, step=1) + >>> df.rename(index=str).index.execute() + Index(['0', '1', '2'], dtype='object') + + >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise").execute() + Traceback (most recent call last): + KeyError: ['C'] not found in axis + + Using axis-style parameters + + >>> df.rename(str.lower, axis='columns').execute() + a b + 0 1 4 + 1 2 5 + 2 3 6 + + >>> df.rename({1: 2, 2: 4}, axis='index').execute() + A B + 0 1 4 + 2 2 5 + 4 3 6 + + """ + axis = validate_axis(axis, df) + if axis == 0: + index_mapper = index if index is not None else mapper + columns_mapper = columns + else: + columns_mapper = columns if columns is not None else mapper + index_mapper = index + + if index_mapper is not None and errors == "raise" and not inplace: + warnings.warn("Errors will not raise for non-existing indices") + + return _rename( + df, + index_mapper=index_mapper, + columns_mapper=columns_mapper, + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) + + +def series_rename( + series, + index=None, + *, + axis="index", + copy=True, + inplace=False, + level=None, + errors="ignore" +): + """ + Alter Series index labels or name. + + Function / dict values must be unique (1-to-1). Labels not contained in + a dict / Series will be left as-is. Extra labels listed don't throw an + error. + + Alternatively, change ``Series.name`` with a scalar value. + + Parameters + ---------- + axis : {0 or "index"} + Unused. Accepted for compatibility with DataFrame method only. + index : scalar, hashable sequence, dict-like or function, optional + Functions or dict-like are transformations to apply to + the index. + Scalar or hashable sequence-like will alter the ``Series.name`` + attribute. + + **kwargs + Additional keyword arguments passed to the function. Only the + "inplace" keyword is used. + + Returns + ------- + Series + Series with index labels or name altered. + + See Also + -------- + DataFrame.rename : Corresponding DataFrame method. + Series.rename_axis : Set the name of the axis. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3]) + >>> s.execute() + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.rename("my_name").execute() # scalar, changes Series.name.execute() + 0 1 + 1 2 + 2 3 + Name: my_name, dtype: int64 + >>> s.rename(lambda x: x ** 2).execute() # function, changes labels.execute() + 0 1 + 1 2 + 4 3 + dtype: int64 + >>> s.rename({1: 3, 2: 5}).execute() # mapping, changes labels.execute() + 0 1 + 3 2 + 5 3 + dtype: int64 + """ + validate_axis(axis) + return _rename( + series, + index_mapper=index, + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) + + +def index_rename(index, name, inplace=False): + """ + Alter Index or MultiIndex name. + + Able to set new names without level. Defaults to returning new index. + Length of names must match number of levels in MultiIndex. + + Parameters + ---------- + name : label or list of labels + Name(s) to set. + inplace : bool, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. 
+ + Returns + ------- + Index + The same type as the caller or None if inplace is True. + + See Also + -------- + Index.set_names : Able to set new names partially and by level. + + Examples + -------- + >>> import mars.dataframe as md + >>> idx = md.Index(['A', 'C', 'A', 'B'], name='score') + >>> idx.rename('grade').execute() + Index(['A', 'C', 'A', 'B'], dtype='object', name='grade') + + >>> idx = md.Index([('python', 2018), + ... ('python', 2019), + ... ('cobra', 2018), + ... ('cobra', 2019)], + ... names=['kind', 'year']) + >>> idx.execute() + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['kind', 'year']) + >>> idx.rename(['species', 'year']).execute() + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['species', 'year']) + >>> idx.rename('species').execute() + Traceback (most recent call last): + TypeError: Must pass list-like as `names`. + """ + op = DataFrameRename(index_mapper=name, output_types=get_output_types(index)) + ret = op(index) + if inplace: + index.data = ret.data + else: + return ret + + +def index_set_names(index, names, level=None, inplace=False): + """ + Set Index or MultiIndex name. + + Able to set new names partially and by level. + + Parameters + ---------- + names : label or list of label + Name(s) to set. + level : int, label or list of int or label, optional + If the index is a MultiIndex, level(s) to set (None for all + levels). Otherwise level must be None. + inplace : bool, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. + + Returns + ------- + Index + The same type as the caller or None if inplace is True. + + See Also + -------- + Index.rename : Able to set new names without level. + + Examples + -------- + >>> import mars.dataframe as md + >>> idx = md.Index([1, 2, 3, 4]) + >>> idx.execute() + Int64Index([1, 2, 3, 4], dtype='int64') + >>> idx.set_names('quarter').execute() + Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') + + >>> idx = md.MultiIndex.from_product([['python', 'cobra'], + ... [2018, 2019]]) + >>> idx.execute() + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + ) + >>> idx.set_names(['kind', 'year'], inplace=True) + >>> idx.execute() + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['kind', 'year']) + >>> idx.set_names('species', level=0).execute() + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['species', 'year']) + """ + op = DataFrameRename( + index_mapper=names, level=level, output_types=get_output_types(index) + ) + ret = op(index) + + if inplace: + df_or_series = getattr(index, "_get_df_or_series", lambda: None)() + if df_or_series is not None: + from .rename_axis import rename_axis_with_level + + rename_axis_with_level( + df_or_series, names, axis=index._axis, level=level, inplace=True + ) + index.data = df_or_series.axes[index._axis].data + else: + index.data = ret.data + else: + return ret diff --git a/python/xorbits/_mars/dataframe/indexing/rename_axis.py b/python/xorbits/_mars/dataframe/indexing/rename_axis.py new file mode 100644 index 000000000..6bee645ea --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/rename_axis.py @@ -0,0 +1,277 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes +from ...core import OutputType +from ...serialization.serializables import AnyField, BoolField +from ..core import DATAFRAME_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_series, parse_index, validate_axis + + +class DataFrameRenameAxis(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.RENAME_AXIS + + _index = AnyField("index") + _columns = AnyField("columns") + _copy_value = BoolField("copy_value") + _level = AnyField("level") + + def __init__(self, index=None, columns=None, copy_value=None, level=None, **kw): + super().__init__( + _index=index, _columns=columns, _copy_value=copy_value, _level=level, **kw + ) + + @property + def index(self): + return self._index + + @property + def columns(self): + return self._columns + + @property + def copy_value(self): + return self._copy_value + + @property + def level(self): + return self._level + + @staticmethod + def _update_params(params, obj, mapper, axis, level): + if obj.ndim == 2: + test_obj = build_df(obj) + else: + test_obj = build_series(obj) + + if level is None: + test_obj = test_obj.rename_axis(mapper, axis=axis) + else: + test_obj.axes[axis].set_names(mapper, level=level, inplace=True) + + if axis == 0: + params["index_value"] = parse_index(test_obj.index, store_data=False) + else: + params["dtypes"] = test_obj.dtypes + params["columns_value"] = parse_index(test_obj.columns, store_data=True) + + def __call__(self, df_or_series): + params = df_or_series.params + + if isinstance(df_or_series, DATAFRAME_TYPE): + self._output_types = [OutputType.dataframe] + else: + self._output_types = [OutputType.series] + + if self.index is not None: + self._update_params( + params, df_or_series, self.index, axis=0, level=self.level + ) + else: + self._update_params( + params, df_or_series, self.columns, axis=1, level=self.level + ) + + return self.new_tileable([df_or_series], **params) + + @classmethod + def tile(cls, op: "DataFrameRenameAxis"): + in_obj = op.inputs[0] + out_obj = op.outputs[0] + + chunks = [] + idx_cache = dict() + for c in in_obj.chunks: + params = c.params + if op.index is not None: + try: + params["index_value"] = idx_cache[c.index[0]] + except KeyError: + cls._update_params(params, c, op.index, axis=0, level=op.level) + idx_cache[c.index[0]] = params["index_value"] + else: + try: + params["columns_value"], params["dtypes"] = idx_cache[c.index[1]] + except KeyError: + cls._update_params(params, c, op.columns, axis=1, level=op.level) + idx_cache[c.index[1]] = params["columns_value"], params["dtypes"] + + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + [in_obj], chunks=chunks, nsplits=in_obj.nsplits, **out_obj.params + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameRenameAxis"): + in_data = ctx[op.inputs[0].key] + if op.index is not None: + val, axis = op.index, 0 + else: + val, axis = op.columns, 1 + + if op.level is None: + ctx[op.outputs[0].key] = in_data.rename_axis( + val, axis=axis, copy=op.copy_value + ) + 
else: + ret = in_data.copy() if op.copy_value else in_data + ret.axes[axis].set_names(val, level=op.level, inplace=True) + ctx[op.outputs[0].key] = ret + + +def rename_axis_with_level( + df_or_series, + mapper=None, + index=None, + columns=None, + axis=0, + copy=True, + level=None, + inplace=False, +): + axis = validate_axis(axis, df_or_series) + if mapper is not None: + if axis == 0: + index = mapper + else: + columns = mapper + op = DataFrameRenameAxis(index=index, columns=columns, copy_value=copy, level=level) + result = op(df_or_series) + if not inplace: + return result + else: + df_or_series.data = result.data + + +def rename_axis( + df_or_series, + mapper=None, + index=None, + columns=None, + axis=0, + copy=True, + inplace=False, +): + """ + Set the name of the axis for the index or columns. + + Parameters + ---------- + mapper : scalar, list-like, optional + Value to set the axis name attribute. + index, columns : scalar, list-like, dict-like or function, optional + A scalar, list-like, dict-like or functions transformations to + apply to that axis' values. + Note that the ``columns`` parameter is not allowed if the + object is a Series. This parameter only apply for DataFrame + type objects. + + Use either ``mapper`` and ``axis`` to + specify the axis to target with ``mapper``, or ``index`` + and/or ``columns``. + + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to rename. + copy : bool, default True + Also copy underlying data. + inplace : bool, default False + Modifies the object directly, instead of creating a new Series + or DataFrame. + + Returns + ------- + Series, DataFrame, or None + The same type as the caller or None if `inplace` is True. + + See Also + -------- + Series.rename : Alter Series index labels or name. + DataFrame.rename : Alter DataFrame index labels or name. + Index.rename : Set new names on index. + + Notes + ----- + ``DataFrame.rename_axis`` supports two calling conventions + + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` + + The first calling convention will only modify the names of + the index and/or the names of the Index object that is the columns. + In this case, the parameter ``copy`` is ignored. + + The second calling convention will modify the names of the + the corresponding index if mapper is a list or a scalar. + However, if mapper is dict-like or a function, it will use the + deprecated behavior of modifying the axis *labels*. + + We *highly* recommend using keyword arguments to clarify your + intent. + + Examples + -------- + **Series** + + >>> import mars.dataframe as md + >>> s = md.Series(["dog", "cat", "monkey"]) + >>> s.execute() + 0 dog + 1 cat + 2 monkey + dtype: object + >>> s.rename_axis("animal").execute() + animal + 0 dog + 1 cat + 2 monkey + dtype: object + + **DataFrame** + + >>> df = md.DataFrame({"num_legs": [4, 4, 2], + ... "num_arms": [0, 0, 2]}, + ... 
["dog", "cat", "monkey"]) + >>> df.execute() + num_legs num_arms + dog 4 0 + cat 4 0 + monkey 2 2 + >>> df = df.rename_axis("animal") + >>> df.execute() + num_legs num_arms + animal + dog 4 0 + cat 4 0 + monkey 2 2 + >>> df = df.rename_axis("limbs", axis="columns") + >>> df.execute() + limbs num_legs num_arms + animal + dog 4 0 + cat 4 0 + monkey 2 2 + """ + return rename_axis_with_level( + df_or_series, + mapper=mapper, + index=index, + columns=columns, + axis=axis, + copy=copy, + inplace=inplace, + ) diff --git a/python/xorbits/_mars/dataframe/indexing/reset_index.py b/python/xorbits/_mars/dataframe/indexing/reset_index.py new file mode 100644 index 000000000..30fad4aab --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/reset_index.py @@ -0,0 +1,619 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import AnyField, BoolField +from ...utils import calc_nsplits, no_default +from ..core import IndexValue +from ..operands import DATAFRAME_TYPE, DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_empty_df, + build_empty_series, + parse_index, + standardize_range_index, +) + + +class DataFrameResetIndex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.RESET_INDEX + + _level = AnyField("level") + _drop = BoolField("drop") + _name = AnyField("name") + _col_level = AnyField("col_level") + _col_fill = AnyField("col_fill") + _incremental_index = BoolField("incremental_index") + + def __init__( + self, + level=None, + drop=None, + name=None, + col_level=None, + col_fill=None, + incremental_index=None, + output_types=None, + **kwargs + ): + super().__init__( + _level=level, + _drop=drop, + _name=name, + _col_level=col_level, + _col_fill=col_fill, + _incremental_index=incremental_index, + _output_types=output_types, + **kwargs + ) + + @property + def level(self): + return self._level + + @property + def drop(self): + return self._drop + + @property + def name(self): + return self._name + + @property + def col_level(self): + return self._col_level + + @property + def col_fill(self): + return self._col_fill + + @property + def incremental_index(self): + return self._incremental_index + + @classmethod + def _tile_series(cls, op: "DataFrameResetIndex"): + out_chunks = [] + out = op.outputs[0] + is_range_index = out.index_value.has_value() + cum_range = np.cumsum((0,) + op.inputs[0].nsplits[0]) + for c in op.inputs[0].chunks: + if is_range_index: + index_value = parse_index( + pd.RangeIndex(cum_range[c.index[0]], cum_range[c.index[0] + 1]) + ) + else: + index_value = out.index_value + chunk_op = op.copy().reset_key() + if op.drop: + out_chunk = chunk_op.new_chunk( + [c], + shape=c.shape, + index=c.index, + dtype=c.dtype, + name=c.name, + index_value=index_value, + ) + else: + shape = (c.shape[0], out.shape[1]) + out_chunk = chunk_op.new_chunk( + [c], + shape=shape, + 
index=c.index + (0,), + dtypes=out.dtypes, + index_value=index_value, + columns_value=out.columns_value, + ) + out_chunks.append(out_chunk) + if ( + not is_range_index + and isinstance(out.index_value.value, IndexValue.RangeIndex) + and op.incremental_index + ): + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + new_op = op.copy() + nsplits = calc_nsplits({c.index: c.shape for c in out_chunks}) + if op.drop: + return new_op.new_seriess( + op.inputs, + op.inputs[0].shape, + name=out.name, + chunks=out_chunks, + nsplits=nsplits, + dtype=out.dtype, + index_value=out.index_value, + ) + else: + return new_op.new_dataframes( + op.inputs, + out.shape, + nsplits=nsplits, + chunks=out_chunks, + index_value=out.index_value, + columns_value=out.columns_value, + dtypes=out.dtypes, + ) + + @classmethod + def _tile_dataframe(cls, op: "DataFrameResetIndex"): + in_df = op.inputs[0] + out_df = op.outputs[0] + added_columns_num = len(out_df.dtypes) - len(in_df.dtypes) + out_chunks = [] + index_has_value = out_df.index_value.has_value() + chunk_has_nan = any(np.isnan(s) for s in in_df.nsplits[0]) + cum_range = np.cumsum((0,) + in_df.nsplits[0]) + for c in in_df.chunks: + if index_has_value: + if chunk_has_nan: + index_value = parse_index(pd.RangeIndex(-1)) + else: + index_value = parse_index( + pd.RangeIndex(cum_range[c.index[0]], cum_range[c.index[0] + 1]) + ) + else: + index_value = out_df.index_value + if c.index[1] == 0: + chunk_op = op.copy().reset_key() + dtypes = out_df.dtypes[: (added_columns_num + len(c.dtypes))] + columns_value = parse_index(dtypes.index) + new_chunk = chunk_op.new_chunk( + [c], + shape=(c.shape[0], c.shape[1] + added_columns_num), + index=c.index, + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + ) + else: + chunk_op = op.copy().reset_key() + chunk_op._drop = True + new_chunk = chunk_op.new_chunk( + [c], + shape=c.shape, + index_value=index_value, + index=c.index, + columns_value=c.columns_value, + dtypes=c.dtypes, + ) + out_chunks.append(new_chunk) + if not index_has_value or chunk_has_nan: + if ( + isinstance(out_df.index_value.value, IndexValue.RangeIndex) + and op.incremental_index + ): + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + new_op = op.copy() + columns_splits = list(in_df.nsplits[1]) + columns_splits[0] += added_columns_num + nsplits = calc_nsplits({c.index: c.shape for c in out_chunks}) + return new_op.new_dataframes( + op.inputs, + out_df.shape, + nsplits=nsplits, + chunks=out_chunks, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + ) + + @classmethod + def tile(cls, op): + if isinstance(op.inputs[0], DATAFRAME_TYPE): + return (yield from cls._tile_dataframe(op)) + else: + return (yield from cls._tile_series(op)) + + @classmethod + def execute(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + out = op.outputs[0] + + kwargs = dict() + if op.name is not None: + kwargs["name"] = op.name + if op.col_level is not None: + kwargs["col_level"] = op.col_level + if op.col_fill is not None: + kwargs["col_fill"] = op.col_fill + + r = in_data.reset_index(level=op.level, drop=op.drop, **kwargs) + if out.index_value.has_value(): + r.index = out.index_value.to_pandas() + ctx[out.key] = r + + @classmethod + def _get_out_index(cls, df, out_shape): + if isinstance(df.index, pd.RangeIndex): + range_value = -1 if np.isnan(out_shape[0]) else out_shape[0] + index_value = parse_index(pd.RangeIndex(range_value)) + else: + index_value = parse_index(df.index) + return 
index_value + + def _call_series(self, a): + if self.drop: + range_value = -1 if np.isnan(a.shape[0]) else a.shape[0] + index_value = parse_index(pd.RangeIndex(range_value)) + return self.new_series( + [a], shape=a.shape, dtype=a.dtype, name=a.name, index_value=index_value + ) + else: + empty_series = build_empty_series( + dtype=a.dtype, index=a.index_value.to_pandas()[:0], name=a.name + ) + empty_df = empty_series.reset_index(level=self.level, name=self.name) + shape = (a.shape[0], len(empty_df.dtypes)) + index_value = self._get_out_index(empty_df, shape) + return self.new_dataframe( + [a], + shape=shape, + index_value=index_value, + columns_value=parse_index(empty_df.columns), + dtypes=empty_df.dtypes, + ) + + def _call_dataframe(self, a): + if self.drop: + shape = a.shape + columns_value = a.columns_value + dtypes = a.dtypes + range_value = -1 if np.isnan(a.shape[0]) else a.shape[0] + index_value = parse_index(pd.RangeIndex(range_value)) + else: + empty_df = build_empty_df(a.dtypes) + empty_df.index = a.index_value.to_pandas()[:0] + empty_df = empty_df.reset_index( + level=self.level, col_level=self.col_level, col_fill=self.col_fill + ) + shape = (a.shape[0], len(empty_df.columns)) + columns_value = parse_index(empty_df.columns, store_data=True) + dtypes = empty_df.dtypes + index_value = self._get_out_index(empty_df, shape) + return self.new_dataframe( + [a], + shape=shape, + columns_value=columns_value, + index_value=index_value, + dtypes=dtypes, + ) + + def __call__(self, a): + if isinstance(a, DATAFRAME_TYPE): + return self._call_dataframe(a) + else: + return self._call_series(a) + + +def df_reset_index( + df, + level=None, + drop=False, + inplace=False, + col_level=0, + col_fill="", + incremental_index=False, +): + """ + Reset the index, or a level of it. + + Reset the index of the DataFrame, and use the default one instead. + If the DataFrame has a MultiIndex, this method can remove one or more + levels. + + Parameters + ---------- + level : int, str, tuple, or list, default None + Only remove the given levels from the index. Removes all levels by + default. + drop : bool, default False + Do not try to insert index into dataframe columns. This resets + the index to the default integer index. + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). + col_level : int or str, default 0 + If the columns have multiple levels, determines which level the + labels are inserted into. By default it is inserted into the first + level. + col_fill : object, default '' + If the columns have multiple levels, determines how the other + levels are named. If None then the index name is repeated. + incremental_index: bool, default False + Ensure RangeIndex incremental, when output DataFrame has multiple chunks, + ensuring index incremental costs more computation, + so by default, each chunk will have index which starts from 0, + setting incremental_index=True,reset_index will guarantee that + output DataFrame's index is from 0 to n - 1. + + Returns + ------- + DataFrame or None + DataFrame with the new index or None if ``inplace=True``. + + See Also + -------- + DataFrame.set_index : Opposite of reset_index. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame([('bird', 389.0), + ... ('bird', 24.0), + ... ('mammal', 80.5), + ... ('mammal', mt.nan)], + ... 
index=['falcon', 'parrot', 'lion', 'monkey'], + ... columns=('class', 'max_speed')) + >>> df.execute() + class max_speed + falcon bird 389.0 + parrot bird 24.0 + lion mammal 80.5 + monkey mammal NaN + + When we reset the index, the old index is added as a column, and a + new sequential index is used: + + >>> df.reset_index().execute() + index class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + We can use the `drop` parameter to avoid the old index being added as + a column: + + >>> df.reset_index(drop=True).execute() + class max_speed + 0 bird 389.0 + 1 bird 24.0 + 2 mammal 80.5 + 3 mammal NaN + + You can also use `reset_index` with `MultiIndex`. + + >>> import pandas as pd + >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), + ... ('bird', 'parrot'), + ... ('mammal', 'lion'), + ... ('mammal', 'monkey')], + ... names=['class', 'name']) + >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), + ... ('species', 'type')]) + >>> df = md.DataFrame([(389.0, 'fly'), + ... ( 24.0, 'fly'), + ... ( 80.5, 'run'), + ... (mt.nan, 'jump')], + ... index=index, + ... columns=columns) + >>> df.execute() + speed species + max type + class name + bird falcon 389.0 fly + parrot 24.0 fly + mammal lion 80.5 run + monkey NaN jump + + If the index has multiple levels, we can reset a subset of them: + + >>> df.reset_index(level='class').execute() + class speed species + max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + If we are not dropping the index, by default, it is placed in the top + level. We can place it in another level: + + >>> df.reset_index(level='class', col_level=1).execute() + speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + When the index is inserted under another level, we can specify under + which one with the parameter `col_fill`: + + >>> df.reset_index(level='class', col_level=1, col_fill='species').execute() + species speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + If we specify a nonexistent level for `col_fill`, it is created: + + >>> df.reset_index(level='class', col_level=1, col_fill='genus').execute() + genus speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + """ + op = DataFrameResetIndex( + level=level, + drop=drop, + col_level=col_level, + col_fill=col_fill, + incremental_index=incremental_index, + output_types=[OutputType.dataframe], + ) + ret = op(df) + if not inplace: + return ret + else: + df.data = ret.data + + +def series_reset_index( + series, + level=None, + drop=False, + name=no_default, + inplace=False, + incremental_index=False, +): + """ + Generate a new DataFrame or Series with the index reset. + + This is useful when the index needs to be treated as a column, or + when the index is meaningless and needs to be reset to the default + before another operation. + + Parameters + ---------- + level : int, str, tuple, or list, default optional + For a Series with a MultiIndex, only remove the specified levels + from the index. Removes all levels by default. + drop : bool, default False + Just reset the index, without inserting it as a column in + the new DataFrame. + name : object, optional + The name to use for the column containing the original Series + values. Uses ``self.name`` by default. 
This argument is ignored + when `drop` is True. + inplace : bool, default False + Modify the Series in place (do not create a new object). + incremental_index: bool, default False + Ensure RangeIndex incremental, when output Series has multiple chunks, + ensuring index incremental costs more computation, + so by default, each chunk will have index which starts from 0, + setting incremental_index=True,reset_index will guarantee that + output Series's index is from 0 to n - 1. + + Returns + ------- + Series or DataFrame + When `drop` is False (the default), a DataFrame is returned. + The newly created columns will come first in the DataFrame, + followed by the original Series values. + When `drop` is True, a `Series` is returned. + In either case, if ``inplace=True``, no value is returned. + + See Also + -------- + DataFrame.reset_index: Analogous function for DataFrame. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3, 4], name='foo', + ... index=md.Index(['a', 'b', 'c', 'd'], name='idx')) + + Generate a DataFrame with default index. + + >>> s.reset_index().execute() + idx foo + 0 a 1 + 1 b 2 + 2 c 3 + 3 d 4 + + To specify the name of the new column use `name`. + + >>> s.reset_index(name='values').execute() + idx values + 0 a 1 + 1 b 2 + 2 c 3 + 3 d 4 + + To generate a new Series with the default set `drop` to True. + + >>> s.reset_index(drop=True).execute() + 0 1 + 1 2 + 2 3 + 3 4 + Name: foo, dtype: int64 + + To update the Series in place, without generating a new one + set `inplace` to True. Note that it also requires ``drop=True``. + + >>> s.reset_index(inplace=True, drop=True) + >>> s.execute() + 0 1 + 1 2 + 2 3 + 3 4 + Name: foo, dtype: int64 + + The `level` parameter is interesting for Series with a multi-level + index. + + >>> import numpy as np + >>> import pandas as pd + >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz']), + ... np.array(['one', 'two', 'one', 'two'])] + >>> s2 = md.Series( + ... range(4), name='foo', + ... index=pd.MultiIndex.from_arrays(arrays, + ... names=['a', 'b'])) + + To remove a specific level from the Index, use `level`. + + >>> s2.reset_index(level='a').execute() + a foo + b + one bar 0 + two bar 1 + one baz 2 + two baz 3 + + If `level` is not set, all levels are removed from the Index. + + >>> s2.reset_index().execute() + a b foo + 0 bar one 0 + 1 bar two 1 + 2 baz one 2 + 3 baz two 3 + """ + if name is no_default: + name = series.name if series.name is not None else 0 + + op = DataFrameResetIndex( + level=level, + drop=drop, + name=name, + incremental_index=incremental_index, + output_types=[OutputType.series], + ) + ret = op(series) + if not inplace: + return ret + elif ret.ndim == 2: + raise TypeError("Cannot reset_index inplace on a Series to create a DataFrame") + else: + series.data = ret.data diff --git a/python/xorbits/_mars/dataframe/indexing/sample.py b/python/xorbits/_mars/dataframe/indexing/sample.py new file mode 100644 index 000000000..29e4e0d67 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/sample.py @@ -0,0 +1,600 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
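For illustration, the ``incremental_index`` flag accepted by the reset_index
implementation above controls whether each chunk's new RangeIndex restarts at 0
(the default) or whether the chunks receive consecutive, non-overlapping ranges
covering 0 to n - 1. A minimal NumPy/pandas sketch of the offset arithmetic,
assuming hypothetical chunk sizes (the operand derives the same offsets from
``np.cumsum((0,) + nsplits[0])``):

    import numpy as np
    import pandas as pd

    nsplits = (3, 2, 4)                      # rows per chunk after tiling
    offsets = np.cumsum((0,) + nsplits)

    # incremental_index=False: every chunk restarts at 0.
    local = [pd.RangeIndex(n) for n in nsplits]

    # incremental_index=True: chunks get consecutive, globally unique ranges.
    consecutive = [
        pd.RangeIndex(offsets[i], offsets[i + 1]) for i in range(len(nsplits))
    ]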
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import itertools + +import numpy as np + +from ... import opcodes +from ...core import ENTITY_TYPE, get_output_types, recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + Float64Field, + Int8Field, + Int64Field, + KeyField, +) +from ...tensor import searchsorted +from ...tensor.base import TensorMapChunk +from ...tensor.merge import TensorConcatenate +from ...tensor.random import RandomState as TensorRandomState +from ...tensor.random import RandomStateField +from ...tensor.utils import gen_random_seeds, normalize_chunk_sizes +from ...utils import ceildiv, has_unknown_shape +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, validate_axis + + +class DataFrameSample(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.RAND_SAMPLE + + _size = Int64Field("size") + _frac = Float64Field("frac") + _replace = BoolField("replace") + _weights = AnyField("weights") + _axis = Int8Field("axis") + _seed = Int64Field("seed") + _random_state = RandomStateField("random_state") + _always_multinomial = BoolField("always_multinomial") + + # for chunks + # num of instances for chunks + _chunk_samples = KeyField("chunk_samples") + + def __init__( + self, + size=None, + frac=None, + replace=None, + weights=None, + seed=None, + axis=None, + random_state=None, + always_multinomial=None, + chunk_samples=None, + **kw + ): + super().__init__( + _size=size, + _frac=frac, + _replace=replace, + _weights=weights, + _seed=seed, + _axis=axis, + _random_state=random_state, + _always_multinomial=always_multinomial, + _chunk_samples=chunk_samples, + **kw + ) + + @property + def size(self): + return self._size + + @property + def frac(self): + return self._frac + + @property + def replace(self): + return self._replace + + @property + def weights(self): + return self._weights + + @property + def seed(self): + return self._seed + + @property + def axis(self): + return self._axis + + @property + def random_state(self): + if self._random_state is None: + self._random_state = np.random.RandomState(self.seed) + return self._random_state + + @property + def always_multinomial(self): + return self._always_multinomial + + @property + def chunk_samples(self): + return self._chunk_samples + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + it = iter(inputs) + next(it) + if isinstance(self.weights, ENTITY_TYPE): + self._weights = next(it) + if isinstance(self.chunk_samples, ENTITY_TYPE): + self._chunk_samples = next(it) + + def __call__(self, df): + params = df.params + new_shape = list(df.shape) + + if self.frac is not None and not np.isnan(df.shape[self.axis]): + self._size = int(self.frac * df.shape[self.axis]) + self._frac = None + + if self.size is not None: + new_shape[self.axis] = self.size + params["shape"] = tuple(new_shape) + params["index_value"] = parse_index(df.index_value.to_pandas()[:0]) + + input_dfs = [df] + if isinstance(self.weights, ENTITY_TYPE): + input_dfs.append(self.weights) + + self._output_types = get_output_types(df) + return self.new_tileable(input_dfs, **params) + + @classmethod + 
def _tile_one_chunk(cls, op: "DataFrameSample", in_df, weights): + out = op.outputs[0] + + input_dfs = [in_df] + if isinstance(weights, ENTITY_TYPE): + input_dfs.append(weights) + + params = out.params + chunk_op = op.copy().reset_key() + if isinstance(weights, ENTITY_TYPE): + chunk_op._weights = weights + params["index"] = (0,) * out.ndim + chunk = chunk_op.new_chunk([c.chunks[0] for c in input_dfs], **params) + + df_op = op.copy().reset_key() + return df_op.new_tileables( + input_dfs, chunks=[chunk], nsplits=((s,) for s in out.shape), **params + ) + + @classmethod + def _tile_multinomial(cls, op: "DataFrameSample", in_df, weights): + out_data = op.outputs[0] + input_dfs = [in_df] + size = op.size + + weight_chunks = itertools.repeat(None) + if isinstance(op.weights, ENTITY_TYPE): + input_dfs.append(weights) + weight_chunks = weights.chunks + + chunks = [] + new_nsplits = list(in_df.nsplits) + rs = op.random_state + seeds = gen_random_seeds(len(in_df.chunks), op.random_state) + if weights is None: + # weights is None, use nsplits to sample num of instances for each chunk + probs = np.array(in_df.nsplits[op.axis]) + probs = 1.0 * probs / probs.sum() + chunk_sizes = rs.multinomial(size, probs) + new_nsplits[op.axis] = tuple(int(s) for s in chunk_sizes if s > 0) + + chunk_idx = 0 + for data_chunk, chunk_size, seed in zip(in_df.chunks, chunk_sizes, seeds): + if chunk_size == 0: + continue + + chunk_op = op.copy().reset_key() + chunk_op._random_state = None + chunk_op._seed = seed + chunk_op._size = int(chunk_size) + + params = data_chunk.params + params["index_value"] = parse_index( + params["index_value"].to_pandas()[:0] + ) + new_shape = list(data_chunk.shape) + new_shape[op.axis] = int(chunk_size) + params["shape"] = tuple(new_shape) + + idx_list = [0] * data_chunk.ndim + idx_list[op.axis] = chunk_idx + params["index"] = tuple(idx_list) + + chunks.append(chunk_op.new_chunk([data_chunk], **params)) + chunk_idx += 1 + else: + mn_seed = gen_random_seeds(1, op.random_state)[0] + + # weights is specified, use weights to sample num of instances for each chunk + chunk_weights = yield from recursive_tile( + weights.to_tensor().map_chunk(lambda x: x.sum(keepdims=True)) + ) + chunk_weights_chunk = TensorConcatenate( + dtype=chunk_weights.dtype + ).new_chunk( + chunk_weights.chunks, shape=(len(chunk_weights.chunks),), index=(0,) + ) + chunk_samples = TensorMapChunk( + func=lambda x: np.random.RandomState(mn_seed).multinomial( + size, x / x.sum() + ) + ).new_chunk( + [chunk_weights_chunk], shape=(len(chunk_weights.chunks),), index=(0,) + ) + new_nsplits[op.axis] = (np.nan,) * len(chunk_weights.chunks) + for chunk_idx, (data_chunk, weight_chunk, seed) in enumerate( + zip(in_df.chunks, weight_chunks, seeds) + ): + input_chunks = [data_chunk] + + chunk_op = op.copy().reset_key() + chunk_op._size = None + chunk_op._random_state = None + chunk_op._seed = seed + chunk_op._chunk_samples = chunk_samples + if weight_chunk is not None: + chunk_op._weights = weight_chunk + input_chunks.append(weight_chunk) + + params = data_chunk.params + params["index_value"] = parse_index( + params["index_value"].to_pandas()[:0] + ) + new_shape = list(data_chunk.shape) + new_shape[op.axis] = np.nan + params["shape"] = tuple(new_shape) + + idx_list = [0] * data_chunk.ndim + idx_list[op.axis] = chunk_idx + params["index"] = tuple(idx_list) + + chunks.append( + chunk_op.new_chunk(input_chunks + [chunk_samples], **params) + ) + + params = out_data.params + new_shape = list(in_df.shape) + new_shape[op.axis] = size + params["shape"] 
= tuple(new_shape) + + df_op = op.copy().reset_key() + return df_op.new_tileables( + input_dfs, chunks=chunks, nsplits=tuple(new_nsplits), **params + ) + + @classmethod + def _tile_reservoirs(cls, op: "DataFrameSample", in_df, weights): + out_data = op.outputs[0] + input_dfs = [in_df] + size = op.size + + weight_chunks = itertools.repeat(None) + if isinstance(weights, ENTITY_TYPE): + input_dfs.append(weights) + weight_chunks = weights.chunks + + if any(cs < size for cs in in_df.nsplits[op.axis]): + # make sure all chunk > m + n_records = in_df.shape[op.axis] + n_chunk = min(max(ceildiv(n_records, size), 1), in_df.chunk_shape[0]) + chunk_size = ceildiv(in_df.shape[op.axis], n_chunk) + chunk_sizes = list(normalize_chunk_sizes(n_records, chunk_size)[0]) + if chunk_sizes[-1] < size and len(chunk_sizes) > 1: + # the last chunk may still less than m + # merge it into previous one + chunk_sizes[-2] += chunk_sizes[-1] + chunk_sizes = chunk_sizes[:-1] + in_df = yield from recursive_tile(in_df.rechunk({0: tuple(chunk_sizes)})) + if isinstance(weights, ENTITY_TYPE): + weights = yield from recursive_tile( + weights.rechunk({0: tuple(chunk_sizes)}) + ) + if len(chunk_sizes) == 1: + return cls._tile_one_chunk(op, in_df, weights) + + # for each chunk in a, do regular sampling + sampled_chunks = [] + seeds = gen_random_seeds(len(in_df.chunks), op.random_state) + for data_chunk, weights_chunk, seed in zip(in_df.chunks, weight_chunks, seeds): + input_chunks = [data_chunk] + + chunk_op = op.copy().reset_key() + chunk_op._random_state = None + chunk_op._seed = seed + if isinstance(op.weights, ENTITY_TYPE): + input_chunks.append(weights_chunk) + chunk_op._weights = weights_chunk + + params = data_chunk.params + new_shape = list(data_chunk.shape) + new_shape[op.axis] = size + params["shape"] = tuple(new_shape) + sampled_chunks.append(chunk_op.new_chunk(input_chunks, **params)) + + # generate a random variable for samples in every chunk + state = TensorRandomState.from_numpy(op.random_state) + indices = state.rand(size) + + if weights is None: + # weights not specified, use nsplits to calculate cumulative probability + # to distribute samples in each chunk + cum_offsets = np.cumsum(in_df.nsplits[op.axis]) + cum_offsets = cum_offsets * 1.0 / cum_offsets[-1] + else: + # weights specified, use weights to calculate cumulative probability + # to distribute samples in each chunk + chunk_weights = yield from recursive_tile( + weights.to_tensor().map_chunk(lambda x: x.sum(keepdims=True)) + ) + chunk_weights_chunk = TensorConcatenate( + dtype=chunk_weights.dtype + ).new_chunk( + chunk_weights.chunks, shape=(len(chunk_weights.chunks),), index=(0,) + ) + + cum_chunk = TensorMapChunk(func=lambda x: (x / x.sum()).cumsum()).new_chunk( + [chunk_weights_chunk], shape=(len(chunk_weights.chunks),), index=(0,) + ) + cum_offsets = TensorMapChunk(func=cum_chunk.op.func).new_tensor( + [weights], + chunks=[cum_chunk], + nsplits=((s,) for s in cum_chunk.shape), + **cum_chunk.params + ) + + index_chunks = [] + # seek which chunk the final sample will select + chunk_sel = yield from recursive_tile( + searchsorted(cum_offsets, indices, side="right") + ) + # for every chunk, select samples with bool indexing + for idx, sampled_chunk in enumerate(sampled_chunks): + chunk_index = chunk_sel.map_chunk( + func=lambda x, i: x == i, args=(idx,), elementwise=True, dtype=bool + ) + sampled_df_op = sampled_chunk.op.copy().reset_key() + sampled_chunk._index = (0,) * sampled_chunk.ndim + sampled_df = sampled_df_op.new_tileable( + input_dfs, + 
chunks=[sampled_chunk], + nsplits=((s,) for s in sampled_chunk.shape), + **sampled_chunk.params + ) + index_chunk = ( + yield from recursive_tile(sampled_df.iloc[chunk_index]) + ).chunks[0] + + chunk_idx = [0] * sampled_chunk.ndim + chunk_idx[op.axis] = idx + index_chunk._index = tuple(chunk_idx) + index_chunks.append(index_chunk) + + params = out_data.params + new_shape = list(in_df.shape) + new_shape[op.axis] = size + params["shape"] = tuple(new_shape) + + new_nsplits = list(in_df.nsplits) + new_nsplits[op.axis] = (np.nan,) * len(index_chunks) + + df_op = op.copy().reset_key() + return df_op.new_tileables( + input_dfs, chunks=index_chunks, nsplits=tuple(new_nsplits), **params + ) + + @classmethod + def tile(cls, op: "DataFrameSample"): + if has_unknown_shape(*op.inputs): + yield + + in_df = op.inputs[0] + if in_df.ndim == 2: + in_df = yield from recursive_tile( + in_df.rechunk({(1 - op.axis): (in_df.shape[1 - op.axis],)}) + ) + + if op.size is None: + op._size = int(op.frac * in_df.shape[op.axis]) + + weights = op.weights + if isinstance(weights, ENTITY_TYPE): + weights = yield from recursive_tile( + weights.rechunk({0: in_df.nsplits[op.axis]}) + ) + elif in_df.ndim > 1 and weights in in_df.dtypes.index: + weights = yield from recursive_tile(in_df[weights]) + + if len(in_df.chunks) == 1: + return cls._tile_one_chunk(op, in_df, weights) + + if op.replace or op.always_multinomial: + return (yield from cls._tile_multinomial(op, in_df, weights)) + else: + return (yield from cls._tile_reservoirs(op, in_df, weights)) + + @classmethod + def execute(cls, ctx, op: "DataFrameSample"): + in_data = ctx[op.inputs[0].key] + weights = op.weights + if isinstance(weights, ENTITY_TYPE): + weights = ctx[weights.key] + + size = op.size + chunk_samples = op.chunk_samples + if isinstance(chunk_samples, ENTITY_TYPE): + chunk_samples = ctx[chunk_samples.key] + if chunk_samples is not None: + size = chunk_samples[op.inputs[0].index[op.axis]] + + try: + ctx[op.outputs[0].key] = in_data.sample( + n=size, + frac=op.frac, + replace=op.replace, + weights=weights, + random_state=op.random_state, + axis=op.axis, + ) + except ValueError: # pragma: no cover + ctx[op.outputs[0].key] = in_data.copy().sample( + n=size, + frac=op.frac, + replace=op.replace, + weights=weights, + random_state=op.random_state, + axis=op.axis, + ) + + +def sample( + df_or_series, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + always_multinomial=False, +): + """ + Return a random sample of items from an axis of object. + + You can use `random_state` for reproducibility. + + Parameters + ---------- + n : int, optional + Number of items from axis to return. Cannot be used with `frac`. + Default = 1 if `frac` = None. + frac : float, optional + Fraction of axis items to return. Cannot be used with `n`. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + weights : str or ndarray-like, optional + Default 'None' results in equal probability weighting. + If passed a Series, will align with target object on index. Index + values in weights not found in sampled object will be ignored and + index values in sampled object not in weights will be assigned + weights of zero. + If called on a DataFrame, will accept the name of a column + when axis = 0. + Unless weights are a Series, weights must be same length as axis + being sampled. + If weights do not sum to 1, they will be normalized to sum to 1. + Missing values in the weights column will be treated as zero. 
+ Infinite values not allowed. + random_state : int, array-like, BitGenerator, np.random.RandomState, optional + If int, array-like, or BitGenerator (NumPy>=1.17), seed for + random number generator + If np.random.RandomState, use as numpy RandomState object. + axis : {0 or ‘index’, 1 or ‘columns’, None}, default None + Axis to sample. Accepts axis number or name. Default is stat axis + for given data type (0 for Series and DataFrames). + always_multinomial : bool, default False + If True, always treat distribution of sample counts between data chunks + as multinomial distribution. This will accelerate sampling when data + is huge, but may affect randomness of samples when number of instances + is not very large. + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing `n` items randomly + sampled from the caller object. + + See Also + -------- + DataFrameGroupBy.sample: Generates random samples from each group of a + DataFrame object. + SeriesGroupBy.sample: Generates random samples from each group of a + Series object. + numpy.random.choice: Generates a random sample from a given 1-D numpy + array. + + Notes + ----- + If `frac` > 1, `replacement` should be set to `True`. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'num_legs': [2, 4, 8, 0], + ... 'num_wings': [2, 0, 0, 0], + ... 'num_specimen_seen': [10, 2, 1, 8]}, + ... index=['falcon', 'dog', 'spider', 'fish']) + >>> df.execute() + num_legs num_wings num_specimen_seen + falcon 2 2 10 + dog 4 0 2 + spider 8 0 1 + fish 0 0 8 + + Extract 3 random elements from the ``Series`` ``df['num_legs']``: + Note that we use `random_state` to ensure the reproducibility of + the examples. + + >>> df['num_legs'].sample(n=3, random_state=1).execute() + fish 0 + spider 8 + falcon 2 + Name: num_legs, dtype: int64 + + A random 50% sample of the ``DataFrame`` with replacement: + + >>> df.sample(frac=0.5, replace=True, random_state=1).execute() + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + + An upsample sample of the ``DataFrame`` with replacement: + Note that `replace` parameter has to be `True` for `frac` parameter > 1. + + >>> df.sample(frac=2, replace=True, random_state=1).execute() + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + falcon 2 2 10 + falcon 2 2 10 + fish 0 0 8 + dog 4 0 2 + fish 0 0 8 + dog 4 0 2 + + Using a DataFrame column as weights. Rows with larger value in the + `num_specimen_seen` column are more likely to be sampled. + + >>> df.sample(n=2, weights='num_specimen_seen', random_state=1).execute() + num_legs num_wings num_specimen_seen + falcon 2 2 10 + fish 0 0 8 + + """ + axis = validate_axis(axis or 0, df_or_series) + if axis == 1: + raise NotImplementedError("Currently cannot sample over columns") + rs = copy.deepcopy( + random_state.to_numpy() if hasattr(random_state, "to_numpy") else random_state + ) + if isinstance(rs, (int, np.ndarray)): + rs = np.random.RandomState(rs) + op = DataFrameSample( + size=n, + frac=frac, + replace=replace, + weights=weights, + random_state=rs, + axis=axis, + always_multinomial=always_multinomial, + ) + return op(df_or_series) diff --git a/python/xorbits/_mars/dataframe/indexing/set_axis.py b/python/xorbits/_mars/dataframe/indexing/set_axis.py new file mode 100644 index 000000000..282f6e657 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/set_axis.py @@ -0,0 +1,292 @@ +# Copyright 2022-2023 XProbe Inc. 
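For illustration, the sampling operator above splits the requested sample across
chunks in one of two ways: with replacement (or ``always_multinomial=True``) the
per-chunk counts follow a multinomial over chunk sizes or chunk weight sums,
while the without-replacement path assigns every drawn sample to a chunk by
``searchsorted`` over cumulative probabilities. A small NumPy sketch of both
ideas, assuming hypothetical chunk sizes:

    import numpy as np

    rs = np.random.RandomState(42)
    nsplits = np.array([100, 80, 120])       # rows per chunk
    size = 30                                 # rows requested

    # Multinomial split of the sample, proportional to chunk sizes.
    per_chunk = rs.multinomial(size, nsplits / nsplits.sum())
    assert per_chunk.sum() == size

    # Without replacement: draw uniforms and map each one onto a chunk.
    cum = np.cumsum(nsplits / nsplits.sum())
    chunk_of_sample = np.searchsorted(cum, rs.rand(size), side="right")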
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import ENTITY_TYPE, get_output_types, recursive_tile +from ...serialization.serializables import AnyField, Int8Field, KeyField +from ...utils import has_unknown_shape +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, validate_axis + + +class DataFrameSetAxis(DataFrameOperand, DataFrameOperandMixin): + _op_code_ = opcodes.DATAFRAME_SET_AXIS + + _input = KeyField("input") + _axis = Int8Field("axis") + _value = AnyField("value") + + def __init__(self, value=None, axis=None, **kw): + super().__init__(_value=value, _axis=axis, **kw) + + @property + def input(self): + return self._input + + @property + def value(self): + return self._value + + @property + def axis(self): + return self._axis + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = inputs[0] + if isinstance(self.value, ENTITY_TYPE): + self._value = inputs[-1] + + def __call__(self, df_or_series): + new_size = self.value.shape[0] + expect_size = df_or_series.axes[self.axis].shape[0] + if ( + not np.isnan(new_size) + and not np.isnan(expect_size) + and new_size != expect_size + ): + raise ValueError( + f"Length mismatch: Expected axis has {expect_size} elements, " + f"new values have {new_size} elements" + ) + + params = df_or_series.params + if self.axis == 0: + params["index_value"] = ( + parse_index(self.value) + if isinstance(self.value, pd.Index) + else self.value.index_value + ) + else: + params["columns_value"] = ( + parse_index(self.value, store_data=True) + if isinstance(self.value, pd.Index) + else self.value.index_value + ) + pd_columns = ( + self.value.index_value.to_pandas() + if isinstance(self.value, ENTITY_TYPE) + else self.value + ) + params["dtypes"] = params["dtypes"].set_axis(pd_columns) + + self._output_types = get_output_types(df_or_series) + inputs = [df_or_series] + if isinstance(self.value, ENTITY_TYPE): + inputs += [self.value] + return self.new_tileable(inputs, **params) + + @classmethod + def tile(cls, op: "DataFrameSetAxis"): + output = op.outputs[0] + input_tileables = [op.input] + + value = op.value + if isinstance(value, ENTITY_TYPE): + input_tileables.append(value) + if has_unknown_shape(value): + yield + + if any(np.isnan(s) for s in op.input.nsplits[op.axis]): + yield + + if op.input.shape[op.axis] != value.shape[0]: + raise ValueError( + f"Length mismatch: Expected axis has {value.shape[0]} elements, " + f"new values have {op.input.shape[op.axis]} elements" + ) + + if isinstance(value, ENTITY_TYPE): + value = yield from recursive_tile( + value.rechunk({0: op.input.nsplits[op.axis]}) + ) + input_tileables[-1] = value + + slices = np.array((0,) + op.input.nsplits[op.axis]).cumsum() + slice_left = slices[:-1] + slice_right = slices[1:] + + chunks = [] + param_cache = [None] * len(op.input.nsplits[op.axis]) + for inp_chunk in op.input.chunks: + input_chunks = [inp_chunk] + value_index = 
inp_chunk.index[op.axis] + params = inp_chunk.params + + if isinstance(value, ENTITY_TYPE): + value_data = value.chunks[value_index] + input_chunks.append(value_data) + else: + value_data = value[slice_left[value_index] : slice_right[value_index]] + + if param_cache[value_index] is None: + cached_params = param_cache[value_index] = dict() + if isinstance(value, ENTITY_TYPE): + if op.axis == 0: + cached_params["index_value"] = value_data.index_value + else: + cached_params["columns_value"] = value_data.index_value + cached_params["dtypes"] = output.dtypes.iloc[ + slice_left[value_index] : slice_right[value_index] + ] + else: + if op.axis == 0: + cached_params["index_value"] = parse_index(value_data) + else: + cached_params["columns_value"] = parse_index( + value_data, store_data=True + ) + cached_params["dtypes"] = params["dtypes"].set_axis(value_data) + + params.update(param_cache[value_index]) + + new_op = op.copy().reset_key() + new_op._value = value_data + chunks.append(new_op.new_chunk(input_chunks, **params)) + + params = op.outputs[0].params + params["chunks"] = chunks + params["nsplits"] = op.input.nsplits + new_op = op.copy().reset_key() + return new_op.new_tileables(input_tileables, **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameSetAxis"): + in_data = ctx[op.input.key] + value = op.value + if isinstance(value, ENTITY_TYPE): + value = ctx[value.key] + ctx[op.outputs[0].key] = in_data.set_axis(value, axis=op.axis) + + +def _set_axis(df_or_axis, labels, axis=0, inplace=False): + axis = validate_axis(axis, df_or_axis) + if not isinstance(labels, ENTITY_TYPE) and not isinstance(labels, pd.Index): + labels = pd.Index(labels) + + op = DataFrameSetAxis(value=labels, axis=axis) + result = op(df_or_axis) + if inplace: + df_or_axis.data = result.data + else: + return result + + +def df_set_axis(df, labels, axis=0, inplace=False): + """ + Assign desired index to given axis. + + Indexes for column or row labels can be changed by assigning + a list-like or Index. + + Parameters + ---------- + labels : list-like, Index + The values for the new index. + + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to update. The value 0 identifies the rows, and 1 identifies the columns. + + inplace : bool, default False + Whether to return a new DataFrame instance. + + Returns + ------- + renamed : DataFrame or None + An object of type DataFrame or None if ``inplace=True``. + + See Also + -------- + DataFrame.rename_axis : Alter the name of the index or columns. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + + Change the row labels. + + >>> df.set_axis(['a', 'b', 'c'], axis='index').execute() + A B + a 1 4 + b 2 5 + c 3 6 + + Change the column labels. + + >>> df.set_axis(['I', 'II'], axis='columns').execute() + I II + 0 1 4 + 1 2 5 + 2 3 6 + + Now, update the labels inplace. + + >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True) + >>> df.execute() + i ii + 0 1 4 + 1 2 5 + 2 3 6 + """ + return _set_axis(df, labels, axis=axis, inplace=inplace) + + +def series_set_axis(series, labels, axis=0, inplace=False): + """ + Assign desired index to given axis. + + Indexes for row labels can be changed by assigning + a list-like or Index. + + Parameters + ---------- + labels : list-like, Index + The values for the new index. + + axis : {0 or 'index'}, default 0 + The axis to update. The value 0 identifies the rows. + + inplace : bool, default False + Whether to return a new Series instance. 
+ + Returns + ------- + renamed : Series or None + An object of type Series or None if ``inplace=True``. + + See Also + -------- + Series.rename_axis : Alter the name of the index. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3]) + >>> s.execute() + 0 1 + 1 2 + 2 3 + dtype: int64 + + >>> s.set_axis(['a', 'b', 'c'], axis=0).execute() + a 1 + b 2 + c 3 + dtype: int64 + """ + return _set_axis(series, labels, axis=axis, inplace=inplace) diff --git a/python/xorbits/_mars/dataframe/indexing/set_index.py b/python/xorbits/_mars/dataframe/indexing/set_index.py new file mode 100644 index 000000000..762a7fc03 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/set_index.py @@ -0,0 +1,212 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import AnyField, BoolField +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, parse_index + + +class DataFrameSetIndex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_SET_INDEX + + _keys = AnyField("keys") + _drop = BoolField("drop") + _append = BoolField("append") + _verify_integrity = BoolField("verify_integrity") + + def __init__( + self, + keys=None, + drop=True, + append=False, + verify_integrity=False, + output_types=None, + **kw + ): + super().__init__( + _keys=keys, + _drop=drop, + _append=append, + _verify_integrity=verify_integrity, + _output_types=output_types, + **kw + ) + + @property + def keys(self): + return self._keys + + @property + def drop(self): + return self._drop + + @property + def append(self): + return self._append + + @property + def verify_integrity(self): + return self._verify_integrity + + def __call__(self, df): + new_df = build_empty_df(df.dtypes).set_index( + keys=self.keys, + drop=self.drop, + append=self.append, + verify_integrity=self.verify_integrity, + ) + return self.new_dataframe( + [df], + shape=(df.shape[0], new_df.shape[1]), + dtypes=new_df.dtypes, + index_value=parse_index(new_df.index), + columns_value=parse_index(new_df.columns, store_data=True), + ) + + @classmethod + def _tile_column_axis_n_chunk(cls, op, in_df, out_df, out_chunks): + if not isinstance(op.keys, str): # pragma: no cover + raise NotImplementedError("DataFrame.set_index only support label") + if op.verify_integrity: # pragma: no cover + raise NotImplementedError( + "DataFrame.set_index not support verify_integrity yet" + ) + + try: + column_index = in_df.columns_value.to_pandas().get_loc(op.keys) + except KeyError: # pragma: no cover + raise NotImplementedError( + "The new index label must be a column of the original dataframe" + ) + + chunk_index = np.searchsorted(np.cumsum(in_df.nsplits[1]), column_index + 1) + + for row_idx in range(in_df.chunk_shape[0]): + index_chunk = in_df.cix[row_idx, chunk_index] + for col_idx in range(in_df.chunk_shape[1]): + 
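                # Pair every chunk of this row with the chunk that holds the
                # new index column; only the chunk actually containing the key
                # loses a column when drop=True, the rest keep shape and dtypes.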
input_chunk = in_df.cix[row_idx, col_idx] + if op.drop and input_chunk.key == index_chunk.key: + new_shape = (input_chunk.shape[0], input_chunk.shape[1] - 1) + selected = input_chunk.columns_value.to_pandas().drop(op.keys) + columns = parse_index(selected, store_data=True) + dtypes = input_chunk.dtypes.loc[selected] + else: + new_shape = input_chunk.shape + columns = input_chunk.columns_value + dtypes = input_chunk.dtypes + out_op = op.copy().reset_key() + out_chunk = out_op.new_chunk( + [index_chunk, input_chunk], + shape=new_shape, + dtypes=dtypes, + index=input_chunk.index, + index_value=parse_index(pd.Index([], dtype=np.int64)), + columns_value=columns, + ) + out_chunks.append(out_chunk) + + @classmethod + def _tile_column_axis_1_chunk(cls, op, in_df, out_df, out_chunks): + out_pd_index = out_df.index_value.to_pandas() + for c in in_df.chunks: + chunk_op = op.copy().reset_key() + chunk_shape = (c.shape[0], out_df.shape[1]) + index_value = parse_index(out_pd_index, c) + out_chunk = chunk_op.new_chunk( + [c], + shape=chunk_shape, + dtypes=out_df.dtypes, + index=c.index, + index_value=index_value, + columns_value=out_df.columns_value, + ) + out_chunks.append(out_chunk) + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + + out_chunks = [] + if in_df.chunk_shape[1] > 1: + cls._tile_column_axis_n_chunk(op, in_df, out_df, out_chunks) + else: + cls._tile_column_axis_1_chunk(op, in_df, out_df, out_chunks) + + new_op = op.copy() + columns_nsplits = list(in_df.nsplits[1]) + if op.drop: + columns_nsplits = tuple( + split - 1 if i == 0 else split + for i, split in enumerate(columns_nsplits) + ) + nsplits = (in_df.nsplits[0], columns_nsplits) + return new_op.new_dataframes( + op.inputs, + out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + + if len(op.inputs) == 2: + # axis 1 has more than 1 chunk + index_chunk, input_chunk = op.inputs + # Optimization: we don't need to get value of the column + # that is set as new index. + if input_chunk.key == index_chunk.key: + new_index = op.keys + else: + new_index = ctx[index_chunk.key][op.keys] + ctx[chunk.key] = ctx[input_chunk.key].set_index( + new_index, + drop=op.drop, + append=op.append, + verify_integrity=op.verify_integrity, + ) + else: + # axis 1 has 1 chunk + inp = ctx[op.inputs[0].key] + ctx[chunk.key] = inp.set_index( + op.keys, + drop=op.drop, + append=op.append, + verify_integrity=op.verify_integrity, + ) + + +def set_index(df, keys, drop=True, append=False, inplace=False, verify_integrity=False): + op = DataFrameSetIndex( + keys=keys, + drop=drop, + append=append, + verify_integrity=verify_integrity, + output_types=[OutputType.dataframe], + ) + result = op(df) + if not inplace: + return result + else: + df.data = result.data diff --git a/python/xorbits/_mars/dataframe/indexing/setitem.py b/python/xorbits/_mars/dataframe/indexing/setitem.py new file mode 100644 index 000000000..b7ced7513 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/setitem.py @@ -0,0 +1,337 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
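For illustration, the setitem operand defined below reindexes a Series or
DataFrame value to the target's index before tiling whenever the two indexes
differ, which matches pandas' own alignment rule for column assignment. A
plain-pandas sketch with hypothetical data:

    import pandas as pd

    target = pd.DataFrame({"a": [1, 2, 3]}, index=[0, 1, 2])
    value = pd.Series([10, 20], index=[1, 5])   # partially overlapping index
    target["b"] = value                          # aligned on target.index
    # target["b"] is now [NaN, 10.0, NaN], the same result the operand gets
    # by calling value.reindex(target.index) up front.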
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections + +import numpy as np +import pandas as pd +from pandas.api.types import is_list_like + +from ... import opcodes +from ...core import OutputType, recursive_tile +from ...serialization.serializables import AnyField, KeyField +from ...tensor.core import TENSOR_TYPE +from ...utils import pd_release_version +from ..core import DATAFRAME_TYPE, SERIES_TYPE, DataFrame +from ..initializer import DataFrame as asframe +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import is_index_value_identical, parse_index + +# in pandas 1.0.x, __setitem__ with a list with missing items are not allowed +_allow_set_missing_list = pd_release_version[:2] >= (1, 1) + + +class DataFrameSetitem(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.INDEXSETVALUE + + _target = KeyField("target") + _indexes = AnyField("indexes") + _value = AnyField("value") + + def __init__(self, target=None, indexes=None, value=None, output_types=None, **kw): + super().__init__( + _target=target, + _indexes=indexes, + _value=value, + _output_types=output_types, + **kw, + ) + if self.output_types is None: + self.output_types = [OutputType.dataframe] + + @property + def target(self): + return self._target + + @property + def indexes(self): + return self._indexes + + @property + def value(self): + return self._value + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._target = self._inputs[0] + if len(inputs) > 1: + self._value = self._inputs[-1] + + @staticmethod + def _is_scalar_tensor(t): + return isinstance(t, TENSOR_TYPE) and t.ndim == 0 + + def __call__(self, target: DataFrame, value): + raw_target = target + + inputs = [target] + if np.isscalar(value): + value_dtype = np.array(value).dtype + elif self._is_scalar_tensor(value): + inputs.append(value) + value_dtype = value.dtype + else: + if isinstance(value, (pd.Series, SERIES_TYPE)): + value = asseries(value) + value_dtype = value.dtype + elif isinstance(value, (pd.DataFrame, DATAFRAME_TYPE)): + if len(self.indexes) != value.shape[1]: # pragma: no cover + raise ValueError("Columns must be same length as key") + + value = asframe(value) + value_dtype = pd.Series(list(value.dtypes), index=self._indexes) + elif is_list_like(value) or isinstance(value, TENSOR_TYPE): + # convert to numpy to get actual dim and shape + if is_list_like(value): + value = np.array(value) + + if value.ndim == 1: + value = asseries(value, index=target.index) + value_dtype = value.dtype + else: + if len(self.indexes) != value.shape[1]: # pragma: no cover + raise ValueError("Columns must be same length as key") + + value = asframe(value, index=target.index) + value_dtype = pd.Series(list(value.dtypes), index=self._indexes) + else: # pragma: no cover + raise TypeError( + "Wrong value type, could be one of scalar, Series or tensor" + ) + + if target.shape[0] == 0: + # target empty, reindex target first + target = target.reindex(value.index) + inputs[0] = target + elif value.index_value.key != target.index_value.key: + # need reindex when target df is not empty and index different + 
value = value.reindex(target.index) + inputs.append(value) + + index_value = target.index_value + dtypes = target.dtypes.copy(deep=True) + + try: + dtypes.loc[self._indexes] = value_dtype + except KeyError: + # when some index not exist, try update one by one + if isinstance(value_dtype, pd.Series): + for idx in self._indexes: + dtypes.loc[idx] = value_dtype.loc[idx] + else: + for idx in self._indexes: + dtypes.loc[idx] = value_dtype + + columns_value = parse_index(dtypes.index, store_data=True) + ret = self.new_dataframe( + inputs, + shape=(target.shape[0], len(dtypes)), + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + raw_target.data = ret.data + + @classmethod + def tile(cls, op: "DataFrameSetitem"): + from ..merge.concat import DataFrameConcat + + out = op.outputs[0] + target = op.target + value = op.value + indexes = op.indexes + columns = target.columns_value.to_pandas() + last_column_index = target.chunk_shape[1] - 1 + is_value_scalar = np.isscalar(value) or cls._is_scalar_tensor(value) + has_multiple_cols = getattr(out.dtypes[indexes], "ndim", 0) > 0 + target_index_to_value_index = collections.defaultdict(list) + + if has_multiple_cols: + append_cols = [c for c in indexes if c not in columns] + else: + append_cols = [indexes] if indexes not in columns else [] + + if not is_value_scalar: + rechunk_arg = {} + + # check if all chunk's index_value are identical + is_identical = is_index_value_identical(target, value) + if not is_identical: + # do rechunk + if any(np.isnan(s) for s in target.nsplits[0]) or any( + np.isnan(s) for s in value.nsplits[0] + ): # pragma: no cover + yield + + rechunk_arg[0] = target.nsplits[0] + + if isinstance(value, DATAFRAME_TYPE): + if len(append_cols) < len(indexes): + # rechunk in column dim given distribution of indexes in target chunks + target_col_to_chunk_index = { + col: head_chunk.index[1] + for head_chunk in target.cix[0, :] + for col in head_chunk.dtypes.keys() + } + value_chunk_indexes = [ + target_col_to_chunk_index.get(vc, None) for vc in indexes + ] + col_nsplits = [] + last_cidx = value_chunk_indexes[0] + match_idxes = [] + for cidx, idx in zip(value_chunk_indexes, indexes): + if cidx != last_cidx: + target_index_to_value_index[last_cidx].append( + len(col_nsplits) + ) + col_nsplits.append(len(match_idxes)) + last_cidx = cidx + match_idxes = [idx] + else: + match_idxes.append(idx) + target_index_to_value_index[last_cidx].append(len(col_nsplits)) + col_nsplits.append(len(match_idxes)) + + # merge last column indexes and keep column order + last_value_index = target_index_to_value_index.pop( + last_column_index, [] + ) + append_value_index = target_index_to_value_index.pop(None, []) + target_index_to_value_index[None] = ( + last_value_index + append_value_index + ) + + rechunk_arg[1] = col_nsplits + else: + target_index_to_value_index[None] = [0] + rechunk_arg[1] = [len(append_cols)] + + if rechunk_arg: + value = yield from recursive_tile(value.rechunk(rechunk_arg)) + + out_chunks = [] + nsplits = [list(ns) for ns in target.nsplits] + nsplits[1][-1] += len(append_cols) + nsplits = tuple(tuple(ns) for ns in nsplits) + + for c in target.chunks: + result_chunk = c + + if has_multiple_cols: + new_indexes = [vc for vc in indexes if vc in c.dtypes] + else: + new_indexes = [indexes] if indexes in c.dtypes else [] + + if c.index[-1] == last_column_index: + new_indexes.extend(append_cols) + + if new_indexes: + # update needed on current chunk + chunk_op = op.copy().reset_key() + chunk_op._indexes = new_indexes if 
has_multiple_cols else new_indexes[0] + + if pd.api.types.is_scalar(value): + chunk_inputs = [c] + elif is_value_scalar: + chunk_inputs = [c, value.chunks[0]] + else: + # get proper chunk from value chunks + if has_multiple_cols: + value_chunks = [] + target_index = ( + None if c.index[-1] == last_column_index else c.index[1] + ) + for value_index in target_index_to_value_index[target_index]: + value_chunk = value.cix[c.index[0], value_index] + value_chunks.append(value_chunk) + if len(value_chunks) == 1: + value_chunk = value_chunks[0] + else: + # concat multiple columns by order + shape = ( + value_chunks[0].shape[0], + sum(c.shape[1] for c in value_chunks), + ) + dtypes = pd.concat([c.dtypes for c in value_chunks]) + concat_op = DataFrameConcat(output_types=op.output_types) + value_chunk = concat_op.new_chunk( + value_chunks, shape=shape, dtypes=dtypes + ) + else: + value_chunk = value.cix[c.index[0],] + + chunk_inputs = [c, value_chunk] + + shape = c.shape + if append_cols and c.index[-1] == last_column_index: + # some columns appended at the last column of chunks + shape = (shape[0], shape[1] + len(append_cols)) + + result_chunk = chunk_op.new_chunk( + chunk_inputs, + shape=shape, + index=c.index, + ) + result_chunk._set_tileable_meta( + tileable_key=out.key, + nsplits=nsplits, + index_value=out.index_value, + columns_value=out.columns_value, + dtypes=out.dtypes, + ) + out_chunks.append(result_chunk) + + params = out.params + params["nsplits"] = nsplits + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def estimate_size(cls, ctx: dict, op: "DataFrameSetitem"): + result_size = ctx[op.target.key][0] + ctx[op.outputs[0].key] = (result_size, result_size) + + @classmethod + def execute(cls, ctx, op: "DataFrameSetitem"): + target = ctx[op.target.key] + # only deep copy when updating + indexes = ( + (op.indexes,) + if not isinstance(op.indexes, (tuple, list, set)) + else op.indexes + ) + deep = bool(set(indexes) & set(target.columns)) + target = ctx[op.target.key].copy(deep=deep) + value = ctx[op.value.key] if not np.isscalar(op.value) else op.value + try: + target[op.indexes] = value + except KeyError: + if _allow_set_missing_list: # pragma: no cover + raise + else: + existing = set(target.columns) + new_columns = target.columns.append( + pd.Index([idx for idx in op.indexes if idx not in existing]) + ) + target = target.reindex(new_columns, axis=1) + target[op.indexes] = value + + ctx[op.outputs[0].key] = target + + +def dataframe_setitem(df, col, value): + op = DataFrameSetitem(target=df, indexes=col, value=value) + return op(df, value) diff --git a/python/xorbits/_mars/dataframe/indexing/tests/__init__.py b/python/xorbits/_mars/dataframe/indexing/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
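For illustration, at execution time the setitem operand above only deep-copies
the chunk data when the assignment overwrites an existing column; a pure append
can work on a shallow copy. A minimal pandas sketch of that decision, with
hypothetical data:

    import pandas as pd

    target = pd.DataFrame({"a": [1, 2, 3]})
    assigned = {"b"}                                  # columns being set
    deep = bool(assigned & set(target.columns))       # False here: pure append
    target = target.copy(deep=deep)                   # shallow copy suffices
    target["b"] = [4, 5, 6]                           # appended after "a"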
diff --git a/python/xorbits/_mars/dataframe/indexing/tests/test_indexing.py b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing.py new file mode 100644 index 000000000..ac8ac194d --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing.py @@ -0,0 +1,959 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... import tensor as mt +from ....core import tile +from ....tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, Tensor +from ...core import ( + DATAFRAME_CHUNK_TYPE, + DATAFRAME_TYPE, + SERIES_CHUNK_TYPE, + SERIES_TYPE, + DataFrame, + Series, +) +from ...datasource.from_tensor import dataframe_from_tensor +from ..iloc import ( + DataFrameIlocGetItem, + DataFrameIlocSetItem, + HeadTailOptimizedOperandMixin, + IndexingError, +) +from ..loc import DataFrameLocGetItem + + +def test_set_index(): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df2 = md.DataFrame(df1, chunk_size=2) + + df3 = df2.set_index("y", drop=True) + df3 = tile(df3) + assert df3.chunk_shape == (2, 2) + pd.testing.assert_index_equal( + df3.chunks[0].columns_value.to_pandas(), pd.Index(["x"]) + ) + pd.testing.assert_index_equal( + df3.chunks[1].columns_value.to_pandas(), pd.Index(["z"]) + ) + + df4 = df2.set_index("y", drop=False) + df4 = tile(df4) + assert df4.chunk_shape == (2, 2) + pd.testing.assert_index_equal( + df4.chunks[0].columns_value.to_pandas(), pd.Index(["x", "y"]) + ) + pd.testing.assert_index_equal( + df4.chunks[1].columns_value.to_pandas(), pd.Index(["z"]) + ) + + +def test_iloc_getitem(): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df2 = md.DataFrame(df1, chunk_size=2) + + with pytest.raises(IndexingError): + _ = df2.iloc[1, 1, 1] + + # index cannot be tuple + with pytest.raises(IndexingError): + _ = df2.iloc[((1,),)] + + # index wrong type + with pytest.raises(TypeError): + _ = df2.iloc["a1":] + + with pytest.raises(NotImplementedError): + _ = df2.iloc[0, md.Series(["a2", "a3"])] + + # fancy index should be 1-d + with pytest.raises(ValueError): + _ = df2.iloc[[[0, 1], [1, 2]]] + + with pytest.raises(ValueError): + _ = df2.iloc[1, ...] 
+ + with pytest.raises(IndexError): + _ = df2.iloc[-4] + + with pytest.raises(IndexError): + _ = df2.iloc[3] + + # plain index + df3 = df2.iloc[1] + df3 = tile(df3) + assert isinstance(df3, SERIES_TYPE) + assert isinstance(df3.op, DataFrameIlocGetItem) + assert df3.shape == (3,) + assert df3.chunk_shape == (2,) + assert df3.chunks[0].shape == (2,) + assert df3.chunks[1].shape == (1,) + assert df3.chunks[0].op.indexes == [1, slice(None, None, None)] + assert df3.chunks[1].op.indexes == [1, slice(None, None, None)] + assert df3.chunks[0].inputs[0].index == (0, 0) + assert df3.chunks[0].inputs[0].shape == (2, 2) + assert df3.chunks[1].inputs[0].index == (0, 1) + assert df3.chunks[1].inputs[0].shape == (2, 1) + + # slice index + df4 = df2.iloc[:, 2:4] + df4 = tile(df4) + assert isinstance(df4, DATAFRAME_TYPE) + assert isinstance(df4.op, DataFrameIlocGetItem) + assert df4.index_value.key == df2.index_value.key + assert df4.shape == (3, 1) + assert df4.chunk_shape == (2, 1) + assert df4.chunks[0].shape == (2, 1) + pd.testing.assert_index_equal( + df4.chunks[0].columns_value.to_pandas(), df1.columns[2:3] + ) + pd.testing.assert_series_equal(df4.chunks[0].dtypes, df1.dtypes[2:3]) + assert isinstance(df4.chunks[0].index_value.to_pandas(), type(df1.index)) + assert df4.chunks[1].shape == (1, 1) + pd.testing.assert_index_equal( + df4.chunks[1].columns_value.to_pandas(), df1.columns[2:3] + ) + pd.testing.assert_series_equal(df4.chunks[1].dtypes, df1.dtypes[2:3]) + assert df4.chunks[0].index_value.key != df4.chunks[1].index_value.key + assert isinstance(df4.chunks[1].index_value.to_pandas(), type(df1.index)) + assert df4.chunks[0].op.indexes == [ + slice(None, None, None), + slice(None, None, None), + ] + assert df4.chunks[1].op.indexes == [ + slice(None, None, None), + slice(None, None, None), + ] + assert df4.chunks[0].inputs[0].index == (0, 1) + assert df4.chunks[0].inputs[0].shape == (2, 1) + assert df4.chunks[1].inputs[0].index == (1, 1) + assert df4.chunks[1].inputs[0].shape == (1, 1) + + # plain fancy index + df5 = df2.iloc[[0], [0, 1, 2]] + df5 = tile(df5) + assert isinstance(df5, DATAFRAME_TYPE) + assert isinstance(df5.op, DataFrameIlocGetItem) + assert df5.shape == (1, 3) + assert df5.chunk_shape == (1, 2) + assert df5.chunks[0].shape == (1, 2) + pd.testing.assert_index_equal( + df5.chunks[0].columns_value.to_pandas(), df1.columns[:2] + ) + pd.testing.assert_series_equal(df5.chunks[0].dtypes, df1.dtypes[:2]) + assert isinstance(df5.chunks[0].index_value.to_pandas(), type(df1.index)) + assert df5.chunks[1].shape == (1, 1) + pd.testing.assert_index_equal( + df5.chunks[1].columns_value.to_pandas(), df1.columns[2:] + ) + pd.testing.assert_series_equal(df5.chunks[1].dtypes, df1.dtypes[2:]) + assert isinstance(df5.chunks[1].index_value.to_pandas(), type(df1.index)) + np.testing.assert_array_equal(df5.chunks[0].op.indexes[0], [0]) + np.testing.assert_array_equal(df5.chunks[0].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df5.chunks[1].op.indexes[0], [0]) + np.testing.assert_array_equal(df5.chunks[1].op.indexes[1], [0]) + assert df5.chunks[0].inputs[0].index == (0, 0) + assert df5.chunks[0].inputs[0].shape == (2, 2) + assert df5.chunks[1].inputs[0].index == (0, 1) + assert df5.chunks[1].inputs[0].shape == (2, 1) + + # fancy index + df6 = df2.iloc[[1, 2], [0, 1, 2]] + df6 = tile(df6) + assert isinstance(df6, DATAFRAME_TYPE) + assert isinstance(df6.op, DataFrameIlocGetItem) + assert df6.shape == (2, 3) + assert df6.chunk_shape == (2, 2) + assert df6.chunks[0].shape == (1, 2) + assert 
df6.chunks[1].shape == (1, 1) + assert df6.chunks[2].shape == (1, 2) + assert df6.chunks[3].shape == (1, 1) + np.testing.assert_array_equal(df6.chunks[0].op.indexes[0], [1]) + np.testing.assert_array_equal(df6.chunks[0].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df6.chunks[1].op.indexes[0], [1]) + np.testing.assert_array_equal(df6.chunks[1].op.indexes[1], [0]) + np.testing.assert_array_equal(df6.chunks[2].op.indexes[0], [0]) + np.testing.assert_array_equal(df6.chunks[2].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df6.chunks[3].op.indexes[0], [0]) + np.testing.assert_array_equal(df6.chunks[3].op.indexes[1], [0]) + assert df6.chunks[0].inputs[0].index == (0, 0) + assert df6.chunks[0].inputs[0].shape == (2, 2) + assert df6.chunks[1].inputs[0].index == (0, 1) + assert df6.chunks[1].inputs[0].shape == (2, 1) + assert df6.chunks[2].inputs[0].index == (1, 0) + assert df6.chunks[2].inputs[0].shape == (1, 2) + assert df6.chunks[3].inputs[0].index == (1, 1) + assert df6.chunks[3].inputs[0].shape == (1, 1) + + # plain index + df7 = df2.iloc[1, 2] + df7 = tile(df7) + assert isinstance(df7, TENSOR_TYPE) # scalar + assert isinstance(df7.op, DataFrameIlocGetItem) + assert df7.shape == () + assert df7.chunk_shape == () + assert df7.chunks[0].dtype == df7.dtype + assert df7.chunks[0].shape == () + assert df7.chunks[0].op.indexes == [1, 0] + assert df7.chunks[0].inputs[0].index == (0, 1) + assert df7.chunks[0].inputs[0].shape == (2, 1) + + # test Series iloc getitem + + # slice + series = md.Series(pd.Series(np.arange(10)), chunk_size=3).iloc[4:8] + series = tile(series) + + assert series.shape == (4,) + + assert len(series.chunks) == 2 + assert series.chunks[0].shape == (2,) + assert series.chunks[0].index == (0,) + assert series.chunks[0].op.indexes == [slice(1, 3, 1)] + assert series.chunks[1].shape == (2,) + assert series.chunks[1].op.indexes == [slice(0, 2, 1)] + assert series.chunks[1].index == (1,) + + # fancy index + series = md.Series(pd.Series(np.arange(10)), chunk_size=3).iloc[[2, 4, 8]] + series = tile(series) + + assert series.shape == (3,) + + assert len(series.chunks) == 3 + assert series.chunks[0].shape == (1,) + assert series.chunks[0].index == (0,) + assert series.chunks[0].op.indexes[0] == [2] + assert series.chunks[1].shape == (1,) + assert series.chunks[1].op.indexes[0] == [1] + assert series.chunks[1].index == (1,) + assert series.chunks[2].shape == (1,) + assert series.chunks[2].op.indexes[0] == [2] + assert series.chunks[2].index == (2,) + + +def test_iloc_setitem(): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df2 = md.DataFrame(df1, chunk_size=2) + df2 = tile(df2) + + # plain index + df3 = md.DataFrame(df1, chunk_size=2) + df3.iloc[1] = 100 + df3 = tile(df3) + assert isinstance(df3.op, DataFrameIlocSetItem) + assert df3.chunk_shape == df2.chunk_shape + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df3.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df3.columns_value.to_pandas() + ) + for c1, c2 in zip(df2.chunks, df3.chunks): + assert c1.shape == c2.shape + pd.testing.assert_index_equal( + c1.index_value.to_pandas(), c2.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + c1.columns_value.to_pandas(), c2.columns_value.to_pandas() + ) + if isinstance(c2.op, DataFrameIlocSetItem): + assert c1.key == c2.inputs[0].key + else: + assert c1.key == c2.key + assert df3.chunks[0].op.indexes == [1, slice(None, None, None)] + 
assert df3.chunks[1].op.indexes == [1, slice(None, None, None)] + + # # slice index + df4 = md.DataFrame(df1, chunk_size=2) + df4.iloc[:, 2:4] = 1111 + df4 = tile(df4) + assert isinstance(df4.op, DataFrameIlocSetItem) + assert df4.chunk_shape == df2.chunk_shape + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df4.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df4.columns_value.to_pandas() + ) + for c1, c2 in zip(df2.chunks, df4.chunks): + assert c1.shape == c2.shape + pd.testing.assert_index_equal( + c1.index_value.to_pandas(), c2.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + c1.columns_value.to_pandas(), c2.columns_value.to_pandas() + ) + if isinstance(c2.op, DataFrameIlocSetItem): + assert c1.key == c2.inputs[0].key + else: + assert c1.key == c2.key + assert df4.chunks[1].op.indexes == [ + slice(None, None, None), + slice(None, None, None), + ] + assert df4.chunks[3].op.indexes == [ + slice(None, None, None), + slice(None, None, None), + ] + + # plain fancy index + df5 = md.DataFrame(df1, chunk_size=2) + df5.iloc[[0], [0, 1, 2]] = 2222 + df5 = tile(df5) + assert isinstance(df5.op, DataFrameIlocSetItem) + assert df5.chunk_shape == df2.chunk_shape + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df5.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df5.columns_value.to_pandas() + ) + for c1, c2 in zip(df2.chunks, df5.chunks): + assert c1.shape == c2.shape + pd.testing.assert_index_equal( + c1.index_value.to_pandas(), c2.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + c1.columns_value.to_pandas(), c2.columns_value.to_pandas() + ) + if isinstance(c2.op, DataFrameIlocSetItem): + assert c1.key == c2.inputs[0].key + else: + assert c1.key == c2.key + np.testing.assert_array_equal(df5.chunks[0].op.indexes[0], [0]) + np.testing.assert_array_equal(df5.chunks[0].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df5.chunks[1].op.indexes[0], [0]) + np.testing.assert_array_equal(df5.chunks[1].op.indexes[1], [0]) + + # fancy index + df6 = md.DataFrame(df1, chunk_size=2) + df6.iloc[[1, 2], [0, 1, 2]] = 3333 + df6 = tile(df6) + assert isinstance(df6.op, DataFrameIlocSetItem) + assert df6.chunk_shape == df2.chunk_shape + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df6.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df6.columns_value.to_pandas() + ) + for c1, c2 in zip(df2.chunks, df6.chunks): + assert c1.shape == c2.shape + pd.testing.assert_index_equal( + c1.index_value.to_pandas(), c2.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + c1.columns_value.to_pandas(), c2.columns_value.to_pandas() + ) + if isinstance(c2.op, DataFrameIlocSetItem): + assert c1.key == c2.inputs[0].key + else: + assert c1.key == c2.key + np.testing.assert_array_equal(df6.chunks[0].op.indexes[0], [1]) + np.testing.assert_array_equal(df6.chunks[0].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df6.chunks[1].op.indexes[0], [1]) + np.testing.assert_array_equal(df6.chunks[1].op.indexes[1], [0]) + np.testing.assert_array_equal(df6.chunks[2].op.indexes[0], [0]) + np.testing.assert_array_equal(df6.chunks[2].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df6.chunks[3].op.indexes[0], [0]) + np.testing.assert_array_equal(df6.chunks[3].op.indexes[1], [0]) + + # plain index + df7 = md.DataFrame(df1, chunk_size=2) + df7.iloc[1, 2] = 4444 + df7 = tile(df7) + assert isinstance(df7.op, 
DataFrameIlocSetItem) + assert df7.chunk_shape == df2.chunk_shape + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df7.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df7.columns_value.to_pandas() + ) + for c1, c2 in zip(df2.chunks, df7.chunks): + assert c1.shape == c2.shape + pd.testing.assert_index_equal( + c1.index_value.to_pandas(), c2.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + c1.columns_value.to_pandas(), c2.columns_value.to_pandas() + ) + if isinstance(c2.op, DataFrameIlocSetItem): + assert c1.key == c2.inputs[0].key + else: + assert c1.key == c2.key + assert df7.chunks[1].op.indexes == [1, 0] + + # test Series + + # slice + series = md.Series(pd.Series(np.arange(10)), chunk_size=3) + series.iloc[:4] = 2 + series = tile(series) + + assert series.shape == (10,) + assert len(series.chunks) == 4 + + assert series.chunks[0].op.indexes == [ + slice(None, None, None), + ] + assert series.chunks[0].op.value == 2 + assert series.chunks[1].op.indexes == [ + slice(0, 1, 1), + ] + assert series.chunks[1].op.value == 2 + + raw = pd.DataFrame( + np.random.rand(9, 2), + index=["a1", "a2", "a3"] * 3, + columns=["x", "y"], + ) + df = md.DataFrame(raw, chunk_size=4) + iloc_df = df.iloc[:, 1:] + tiled_df, tiled_iloc_df = tile(df, iloc_df) + # for full slice, index_value should be same as input chunk + for loc_chunk, chunk in zip(tiled_iloc_df.chunks, tiled_df.chunks): + assert loc_chunk.index_value.key == chunk.index_value.key + + # fancy index + series = md.Series(pd.Series(np.arange(10)), chunk_size=3) + series.iloc[[2, 4, 9]] = 3 + series = tile(series) + + assert series.shape == (10,) + + assert len(series.chunks) == 4 + assert series.chunks[0].index == (0,) + assert series.chunks[0].op.indexes[0].tolist() == [2] + assert series.chunks[0].op.value == 3 + assert series.chunks[1].index == (1,) + assert series.chunks[1].op.indexes[0].tolist() == [1] + assert series.chunks[1].op.value == 3 + assert series.chunks[3].index == (3,) + assert series.chunks[3].op.indexes[0].tolist() == [0] + assert series.chunks[3].op.value == 3 + + +def test_dataframe_loc(): + raw = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df = md.DataFrame(raw, chunk_size=2) + raw2 = raw.copy() + raw2.reset_index(inplace=True, drop=True) + df3 = md.DataFrame(raw2, chunk_size=2) + s = pd.Series([1, 3, 5], index=["a1", "a2", "a3"]) + series = md.Series(s, chunk_size=2) + + # test return scalar + df2 = df.loc["a1", "z"] + assert isinstance(df2, Tensor) + assert df2.shape == () + assert df2.dtype == raw["z"].dtype + + df2 = tile(df2) + assert len(df2.chunks) == 1 + assert isinstance(df2.chunks[0], TENSOR_CHUNK_TYPE) + + # test return series for index axis + df2 = df.loc[:, "y"] + assert isinstance(df2, Series) + assert df2.shape == (3,) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.name == "y" + assert df2.index_value.key == df.index_value.key + + df2 = tile(df2) + assert len(df2.chunks) == 2 + for c in df2.chunks: + assert isinstance(c, SERIES_CHUNK_TYPE) + assert isinstance(c.index_value.to_pandas(), type(raw.index)) + assert c.name == "y" + assert c.dtype == raw["y"].dtype + + # test return series for column axis + df2 = df.loc["a2", :] + assert isinstance(df2, Series) + assert df2.shape == (3,) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.columns_value.to_pandas() + ) + assert df2.name == "a2" + + df2 = 
tile(df2) + assert len(df2.chunks) == 2 + for c in df2.chunks: + assert isinstance(c, SERIES_CHUNK_TYPE) + assert isinstance(c.index_value.to_pandas(), type(raw.columns)) + assert c.name == "a2" + assert c.dtype == raw.loc["a2"].dtype + + # test slice + df2 = df.loc["a2":"a3", "y":"z"] + assert isinstance(df2, DataFrame) + assert df2.shape == (np.nan, 2) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.index_value.key != df.index_value.key + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), raw.loc[:, "y":"z"].columns + ) + pd.testing.assert_series_equal(df2.dtypes, raw.loc[:, "y":"z"].dtypes) + + # test fancy index on index axis + df2 = df.loc[["a3", "a2"], [True, False, True]] + assert isinstance(df2, DataFrame) + assert df2.shape == (2, 2) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.index_value.key != df.index_value.key + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), raw.loc[:, [True, False, True]].columns + ) + pd.testing.assert_series_equal(df2.dtypes, raw.loc[:, [True, False, True]].dtypes) + + # test fancy index which is md.Series on index axis + df2 = df.loc[md.Series(["a3", "a2"]), [True, False, True]] + assert isinstance(df2, DataFrame) + assert df2.shape == (2, 2) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.index_value.key != df.index_value.key + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), raw.loc[:, [True, False, True]].columns + ) + pd.testing.assert_series_equal(df2.dtypes, raw.loc[:, [True, False, True]].dtypes) + + # test fancy index on columns axis + df2 = df.loc[[True, False, True], ["z", "x", "y"]] + assert isinstance(df2, DataFrame) + assert df2.shape == (2, 3) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.index_value.key != df.index_value.key + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), raw.loc[:, ["z", "x", "y"]].columns + ) + pd.testing.assert_series_equal(df2.dtypes, raw.loc[:, ["z", "x", "y"]].dtypes) + + df2 = tile(df2) + assert len(df2.chunks) == 2 + for c in df2.chunks: + assert isinstance(c, DATAFRAME_CHUNK_TYPE) + pd.testing.assert_index_equal( + c.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert c.index_value.key != df.index_value.key + pd.testing.assert_index_equal( + c.columns_value.to_pandas(), raw.loc[:, ["z", "x", "y"]].columns + ) + pd.testing.assert_series_equal(c.dtypes, raw.loc[:, ["z", "x", "y"]].dtypes) + + df2 = df.loc[md.Series([True, False, True])] + assert isinstance(df2, DataFrame) + assert df2.shape == (np.nan, 3) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.index_value.key != df.index_value.key + pd.testing.assert_index_equal(df2.columns_value.to_pandas(), raw.columns) + pd.testing.assert_series_equal(df2.dtypes, raw.dtypes) + + df2 = df3.loc[md.Series([True, False, True])] + assert isinstance(df2, DataFrame) + assert df2.shape == (np.nan, 3) + assert isinstance( + df2.index_value.to_pandas(), type(raw.loc[[True, False, True]].index) + ) + assert df2.index_value.key != df3.index_value.key + pd.testing.assert_index_equal(df2.columns_value.to_pandas(), raw.columns) + pd.testing.assert_series_equal(df2.dtypes, raw.dtypes) + + df2 = df3.loc[md.Series([2, 1])] + assert isinstance(df2, DataFrame) + assert df2.shape == (2, 3) + assert 
isinstance(df2.index_value.to_pandas(), type(raw2.loc[[2, 1]].index)) + assert df2.index_value.key != df3.index_value.key + pd.testing.assert_index_equal(df2.columns_value.to_pandas(), raw.columns) + pd.testing.assert_series_equal(df2.dtypes, raw.dtypes) + + series2 = series.loc["a2"] + assert isinstance(series2, Tensor) + assert series2.shape == () + assert series2.dtype == s.dtype + + series2 = series.loc[["a2", "a3"]] + assert isinstance(series2, Series) + assert series2.shape == (2,) + assert series2.dtype == s.dtype + assert series2.name == s.name + + with pytest.raises(IndexingError): + _ = df.loc["a1", "z", ...] + + with pytest.raises(NotImplementedError): + _ = df.loc[:, md.Series([True, False, True])] + + with pytest.raises(KeyError): + _ = df.loc[:, ["non_exist"]] + + # test loc chunk's index_value + raw = pd.DataFrame( + np.random.rand(9, 2), + index=["a1", "a2", "a3"] * 3, + columns=["x", "y"], + ) + df = md.DataFrame(raw, chunk_size=4) + loc_df = df.loc[:, ["x"]] + tiled_df, tiled_loc_df = tile(df, loc_df) + # for full slice, index_value should be same as input chunk + for loc_chunk, chunk in zip(tiled_loc_df.chunks, tiled_df.chunks): + assert loc_chunk.index_value.key == chunk.index_value.key + + # test loc on filtered df + df2 = df[df["x"] < 1] + loc_df = df2.loc[:, ["y", "x"]] + tiled_loc_df = tile(loc_df) + assert len(tiled_loc_df.chunks) == 3 + + +def test_loc_use_iloc(): + raw = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], columns=["x", "y", "z"]) + df = md.DataFrame(raw, chunk_size=2) + + assert isinstance(df.loc[:3].op, DataFrameIlocGetItem) + assert isinstance(df.loc[1:3].op, DataFrameIlocGetItem) + assert isinstance(df.loc[1].op, DataFrameIlocGetItem) + # negative + assert isinstance(df.loc[:-3].op, DataFrameLocGetItem) + with pytest.raises(KeyError): + _ = df.loc[-3] + # index 1 not None + assert isinstance(df.loc[:3, :"y"].op, DataFrameLocGetItem) + # index 1 not slice + assert isinstance(df.loc[:3, [True, False, True]].op, DataFrameLocGetItem) + assert isinstance(df.loc[[True, False, True]].op, DataFrameLocGetItem) + + raw2 = raw.copy() + raw2.index = pd.RangeIndex(1, 4) + df2 = md.DataFrame(raw2, chunk_size=2) + + assert isinstance(df2.loc[:3].op, DataFrameLocGetItem) + assert isinstance(df2.loc["a3":].op, DataFrameLocGetItem) + + raw2 = raw.copy() + raw2.index = [f"a{i}" for i in range(3)] + df2 = md.DataFrame(raw2, chunk_size=2) + + assert isinstance(df2.loc[:3].op, DataFrameLocGetItem) + + +def test_dataframe_getitem(): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=2) + + series = df["c3"] + assert isinstance(series, Series) + assert series.shape == (10,) + assert series.name == "c3" + assert series.dtype == data["c3"].dtype + assert series.index_value == df.index_value + + series = tile(series) + assert isinstance(series, SERIES_TYPE) + assert all(not i.is_coarse() for i in series.inputs) is True + assert series.nsplits == ((2, 2, 2, 2, 2),) + assert len(series.chunks) == 5 + for i, c in enumerate(series.chunks): + assert isinstance(c, SERIES_CHUNK_TYPE) + assert c.index == (i,) + assert c.shape == (2,) + + df1 = df[["c1", "c2", "c3"]] + assert isinstance(df1, DataFrame) + assert df1.shape == (10, 3) + assert df1.index_value == df.index_value + pd.testing.assert_index_equal( + df1.columns_value.to_pandas(), data[["c1", "c2", "c3"]].columns + ) + pd.testing.assert_series_equal(df1.dtypes, data[["c1", "c2", "c3"]].dtypes) + + df1 = tile(df1) + assert df1.nsplits == ((2, 2, 2, 2, 2), 
(2, 1)) + assert len(df1.chunks) == 10 + for i, c in enumerate(df1.chunks[slice(0, 10, 2)]): + assert isinstance(c, DATAFRAME_CHUNK_TYPE) + assert c.index == (i, 0) + assert c.shape == (2, 2) + for i, c in enumerate(df1.chunks[slice(1, 10, 2)]): + assert isinstance(c, DATAFRAME_CHUNK_TYPE) + assert c.index == (i, 1) + assert c.shape == (2, 1) + + +def test_dataframe_getitem_bool(): + data = pd.DataFrame( + np.random.rand(10, 5), + columns=["c1", "c2", "c3", "c4", "c5"], + index=pd.RangeIndex(10, name="i"), + ) + df = md.DataFrame(data, chunk_size=2) + + mask_data1 = data.c1 > 0.5 + mask_data2 = data.c1 < 0.5 + mask1 = md.Series(mask_data1, chunk_size=2) + mask2 = md.Series(mask_data2, chunk_size=2) + + r1 = df[mask1] + r2 = df[mask2] + r3 = df[mask1] + + assert r1.index_value.key != df.index_value.key + assert r1.index_value.key != mask1.index_value.key + assert r1.columns_value.key == df.columns_value.key + assert r1.columns_value is df.columns_value + assert r1.index_value.name == "i" + + assert r1.index_value.key != r2.index_value.key + assert r1.columns_value.key == r2.columns_value.key + assert r1.columns_value is r2.columns_value + + assert r1.index_value.key == r3.index_value.key + assert r1.columns_value.key == r3.columns_value.key + assert r1.columns_value is r3.columns_value + + +def test_series_getitem(): + data = pd.Series(np.random.rand(10), name="a") + series = md.Series(data, chunk_size=3) + + result1 = series[2] + assert result1.shape == () + + result1 = tile(result1) + assert result1.nsplits == () + assert len(result1.chunks) == 1 + assert isinstance(result1.chunks[0], TENSOR_CHUNK_TYPE) + assert result1.chunks[0].shape == () + assert result1.chunks[0].dtype == data.dtype + + result2 = series[[4, 5, 1, 2, 3]] + assert result2.shape == (5,) + + result2 = tile(result2) + assert result2.nsplits == ((2, 2, 1),) + assert len(result2.chunks) == 3 + assert result2.chunks[0].op.labels == [4, 5] + assert result2.chunks[1].op.labels == [1, 2] + assert result2.chunks[2].op.labels == [3] + + data = pd.Series(np.random.rand(10), index=["i" + str(i) for i in range(10)]) + series = md.Series(data, chunk_size=3) + + result1 = series["i2"] + assert result1.shape == () + + result1 = tile(result1) + assert result1.nsplits == () + assert result1.chunks[0].dtype == data.dtype + assert result1.chunks[0].op.labels == "i2" + + result2 = series[["i2", "i4"]] + assert result2.shape == (2,) + + result2 = tile(result2) + assert result2.nsplits == ((2,),) + assert result2.chunks[0].dtype == data.dtype + assert result2.chunks[0].op.labels == ["i2", "i4"] + + +def test_setitem(): + data = pd.DataFrame(np.random.rand(10, 2), columns=["c1", "c2"]) + df = md.DataFrame(data, chunk_size=4) + + df["new"] = 1 + assert df.shape == (10, 3) + pd.testing.assert_series_equal(df.inputs[0].dtypes, data.dtypes) + + tiled = tile(df) + assert tiled.chunks[0].shape == (4, 3) + pd.testing.assert_series_equal(tiled.inputs[0].dtypes, data.dtypes) + assert tiled.chunks[1].shape == (4, 3) + pd.testing.assert_series_equal(tiled.inputs[0].dtypes, data.dtypes) + assert tiled.chunks[2].shape == (2, 3) + pd.testing.assert_series_equal(tiled.inputs[0].dtypes, data.dtypes) + + for c in tiled.chunks: + pd.testing.assert_series_equal(c.inputs[0].dtypes, data.dtypes) + + +def test_reset_index(): + data = pd.DataFrame( + [("bird", 389.0), ("bird", 24.0), ("mammal", 80.5), ("mammal", np.nan)], + index=["falcon", "parrot", "lion", "monkey"], + columns=("class", "max_speed"), + ) + df = md.DataFrame(data, chunk_size=2).reset_index() + r = 
data.reset_index() + + assert df.shape == (4, 3) + pd.testing.assert_series_equal(df.dtypes, r.dtypes) + pd.testing.assert_index_equal(df.columns_value.to_pandas(), r.columns) + + df2 = tile(df) + + assert len(df2.chunks) == 2 + assert df2.chunks[0].shape == (2, 3) + pd.testing.assert_index_equal( + df2.chunks[0].index_value.to_pandas(), pd.RangeIndex(2) + ) + pd.testing.assert_series_equal(df2.chunks[0].dtypes, r.dtypes) + assert df2.chunks[1].shape == (2, 3) + pd.testing.assert_index_equal( + df2.chunks[1].index_value.to_pandas(), pd.RangeIndex(2, 4) + ) + pd.testing.assert_series_equal(df2.chunks[1].dtypes, r.dtypes) + + df = md.DataFrame(data, chunk_size=1).reset_index(drop=True) + r = data.reset_index(drop=True) + + assert df.shape == (4, 2) + pd.testing.assert_series_equal(df.dtypes, r.dtypes) + + df2 = tile(df) + + assert len(df2.chunks) == 8 + + for c in df2.chunks: + assert c.shape == (1, 1) + pd.testing.assert_index_equal( + c.index_value.to_pandas(), pd.RangeIndex(c.index[0], c.index[0] + 1) + ) + pd.testing.assert_series_equal(c.dtypes, r.dtypes[c.index[1] : c.index[1] + 1]) + + # test Series + series_data = pd.Series( + [1, 2, 3, 4], name="foo", index=pd.Index(["a", "b", "c", "d"], name="idx") + ) + s = md.Series(series_data, chunk_size=2).reset_index() + r = series_data.reset_index() + + assert s.shape == (4, 2) + pd.testing.assert_series_equal(s.dtypes, r.dtypes) + + s2 = tile(s) + assert len(s2.chunks) == 2 + assert s2.chunks[0].shape == (2, 2) + pd.testing.assert_index_equal( + s2.chunks[0].index_value.to_pandas(), pd.RangeIndex(2) + ) + assert s2.chunks[1].shape == (2, 2) + pd.testing.assert_index_equal( + s2.chunks[1].index_value.to_pandas(), pd.RangeIndex(2, 4) + ) + + with pytest.raises(TypeError): + md.Series(series_data, chunk_size=2).reset_index(inplace=True) + + +def test_head_tail_optimize(): + raw = pd.DataFrame(np.random.rand(4, 3)) + + df = md.DataFrame(raw, chunk_size=2) + + # no nan chunk shape + assert ( + HeadTailOptimizedOperandMixin._need_tile_head_tail(tile(df).head(2).op) is False + ) + + df2 = tile(df[df[0] < 0.5]) + # chunk shape on axis 1 greater than 1 + assert HeadTailOptimizedOperandMixin._need_tile_head_tail(df2.head(2).op) is False + + df = md.DataFrame(raw, chunk_size=(2, 3)) + df2 = tile(df[df[0] < 0.5]) + # not slice + assert HeadTailOptimizedOperandMixin._need_tile_head_tail(df2.iloc[2].op) is False + # step not None + assert ( + HeadTailOptimizedOperandMixin._need_tile_head_tail(df2.iloc[:2:2].op) is False + ) + # not head or tail + assert HeadTailOptimizedOperandMixin._need_tile_head_tail(df2.iloc[1:3].op) is False + # slice 1 is not slice(None) + assert ( + HeadTailOptimizedOperandMixin._need_tile_head_tail(df2.iloc[:3, :2].op) is False + ) + + +def test_reindex(): + raw = pd.DataFrame(np.random.rand(4, 3)) + + df = md.DataFrame(raw, chunk_size=2) + + with pytest.raises(TypeError): + df.reindex(unknown_arg=1) + + with pytest.raises(ValueError): + df.reindex([1, 2], fill_value=mt.tensor([1, 2])) + + +def test_getitem_lazy_chunk_meta(): + df = dataframe_from_tensor(mt.random.rand(10, 3, chunk_size=3)) + df2 = df[[0, 2]] + df2 = tile(df2) + + chunk = df2.chunks[0].data + assert chunk._FIELDS["_dtypes"].get(chunk) is None + pd.testing.assert_series_equal(chunk.dtypes, df.dtypes[[0, 2]]) + assert chunk._FIELDS["_dtypes"].get(chunk) is not None + assert chunk._FIELDS["_index_value"].get(chunk) is None + pd.testing.assert_index_equal(chunk.index_value.to_pandas(), pd.RangeIndex(3)) + assert chunk._FIELDS["_index_value"].get(chunk) is not None + 
assert chunk._FIELDS["_columns_value"].get(chunk) is None + pd.testing.assert_index_equal(chunk.columns_value.to_pandas(), pd.Index([0, 2])) + assert chunk._FIELDS["_columns_value"].get(chunk) is not None + + df2 = df[2] + df2 = tile(df2) + + chunk = df2.chunks[0].data + assert chunk._FIELDS["_index_value"].get(chunk) is None + pd.testing.assert_index_equal(chunk.index_value.to_pandas(), pd.RangeIndex(3)) + assert chunk._FIELDS["_index_value"].get(chunk) is not None diff --git a/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py new file mode 100644 index 000000000..2b5a9b0d2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py @@ -0,0 +1,1851 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import mars +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None +try: + import fastparquet as fp +except ImportError: # pragma: no cover + fp = None + +from .... import dataframe as md +from .... import tensor as mt +from ....utils import pd_release_version +from ...datasource.read_csv import DataFrameReadCSV +from ...datasource.read_parquet import DataFrameReadParquet +from ...datasource.read_sql import DataFrameReadSQL + +_allow_set_missing_list = pd_release_version[:2] >= (1, 1) + + +@pytest.mark.parametrize("chunk_size", [2, (2, 3)]) +def test_set_index(setup, chunk_size): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + + df2 = md.DataFrame(df1, chunk_size=chunk_size) + + expected = df1.set_index("y", drop=True) + df3 = df2.set_index("y", drop=True) + pd.testing.assert_frame_equal(expected, df3.execute().fetch()) + + expected = df1.set_index("y", drop=False) + df4 = df2.set_index("y", drop=False) + pd.testing.assert_frame_equal(expected, df4.execute().fetch()) + + expected = df1.set_index("y") + df2.set_index("y", inplace=True) + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + +def test_iloc_getitem(setup): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df2 = md.DataFrame(df1, chunk_size=2) + + # plain index + expected = df1.iloc[1] + df3 = df2.iloc[1] + result = df3.execute(extra_config={"check_series_name": False}).fetch() + pd.testing.assert_series_equal(expected, result) + + # plain index on axis 1 + expected = df1.iloc[:2, 1] + df4 = df2.iloc[:2, 1] + pd.testing.assert_series_equal(expected, df4.execute().fetch()) + + # slice index + expected = df1.iloc[:, 2:4] + df5 = df2.iloc[:, 2:4] + pd.testing.assert_frame_equal(expected, df5.execute().fetch()) + + # plain fancy index + expected = df1.iloc[[0], [0, 1, 2]] + df6 = df2.iloc[[0], [0, 1, 2]] + pd.testing.assert_frame_equal(expected, df6.execute().fetch()) + + # plain fancy index with shuffled 
order + expected = df1.iloc[[0], [1, 2, 0]] + df7 = df2.iloc[[0], [1, 2, 0]] + pd.testing.assert_frame_equal(expected, df7.execute().fetch()) + + # fancy index + expected = df1.iloc[[1, 2], [0, 1, 2]] + df8 = df2.iloc[[1, 2], [0, 1, 2]] + pd.testing.assert_frame_equal(expected, df8.execute().fetch()) + + # fancy index with shuffled order + expected = df1.iloc[[2, 1], [1, 2, 0]] + df9 = df2.iloc[[2, 1], [1, 2, 0]] + pd.testing.assert_frame_equal(expected, df9.execute().fetch()) + + # one fancy index + expected = df1.iloc[[2, 1]] + df10 = df2.iloc[[2, 1]] + pd.testing.assert_frame_equal(expected, df10.execute().fetch()) + + # plain index + expected = df1.iloc[1, 2] + df11 = df2.iloc[1, 2] + assert expected == df11.execute().fetch() + + # bool index array + expected = df1.iloc[[True, False, True], [2, 1]] + df12 = df2.iloc[[True, False, True], [2, 1]] + pd.testing.assert_frame_equal(expected, df12.execute().fetch()) + + # bool index array on axis 1 + expected = df1.iloc[[2, 1], [True, False, True]] + df14 = df2.iloc[[2, 1], [True, False, True]] + pd.testing.assert_frame_equal(expected, df14.execute().fetch()) + + # bool index + expected = df1.iloc[[True, False, True], [2, 1]] + df13 = df2.iloc[md.Series([True, False, True], chunk_size=1), [2, 1]] + pd.testing.assert_frame_equal(expected, df13.execute().fetch()) + + # test Series + data = pd.Series(np.arange(10)) + series = md.Series(data, chunk_size=3).iloc[:3] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[:3]) + + series = md.Series(data, chunk_size=3).iloc[4] + assert series.execute().fetch() == data.iloc[4] + + series = md.Series(data, chunk_size=3).iloc[[2, 3, 4, 9]] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[[2, 3, 4, 9]]) + + series = md.Series(data, chunk_size=3).iloc[[4, 3, 9, 2]] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[[4, 3, 9, 2]]) + + series = md.Series(data).iloc[5:] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[5:]) + + # bool index array + selection = np.random.RandomState(0).randint(2, size=10, dtype=bool) + series = md.Series(data).iloc[selection] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[selection]) + + # bool index + series = md.Series(data).iloc[md.Series(selection, chunk_size=4)] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[selection]) + + # test index + data = pd.Index(np.arange(10)) + index = md.Index(data, chunk_size=3)[:3] + pd.testing.assert_index_equal(index.execute().fetch(), data[:3]) + + index = md.Index(data, chunk_size=3)[4] + assert index.execute().fetch() == data[4] + + index = md.Index(data, chunk_size=3)[[2, 3, 4, 9]] + pd.testing.assert_index_equal(index.execute().fetch(), data[[2, 3, 4, 9]]) + + index = md.Index(data, chunk_size=3)[[4, 3, 9, 2]] + pd.testing.assert_index_equal(index.execute().fetch(), data[[4, 3, 9, 2]]) + + index = md.Index(data)[5:] + pd.testing.assert_index_equal(index.execute().fetch(), data[5:]) + + # bool index array + selection = np.random.RandomState(0).randint(2, size=10, dtype=bool) + index = md.Index(data)[selection] + pd.testing.assert_index_equal(index.execute().fetch(), data[selection]) + + index = md.Index(data)[mt.tensor(selection, chunk_size=4)] + pd.testing.assert_index_equal(index.execute().fetch(), data[selection]) + + +def test_iloc_setitem(setup): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df2 = md.DataFrame(df1, chunk_size=2) + + # plain 
index + expected = df1 + expected.iloc[1] = 100 + df2.iloc[1] = 100 + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + # slice index + expected.iloc[:, 2:4] = 1111 + df2.iloc[:, 2:4] = 1111 + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + # plain fancy index + expected.iloc[[0], [0, 1, 2]] = 2222 + df2.iloc[[0], [0, 1, 2]] = 2222 + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + # fancy index + expected.iloc[[1, 2], [0, 1, 2]] = 3333 + df2.iloc[[1, 2], [0, 1, 2]] = 3333 + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + # plain index + expected.iloc[1, 2] = 4444 + df2.iloc[1, 2] = 4444 + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + # test Series + data = pd.Series(np.arange(10)) + series = md.Series(data, chunk_size=3) + series.iloc[:3] = 1 + data.iloc[:3] = 1 + pd.testing.assert_series_equal(series.execute().fetch(), data) + + series.iloc[4] = 2 + data.iloc[4] = 2 + pd.testing.assert_series_equal(series.execute().fetch(), data) + + series.iloc[[2, 3, 4, 9]] = 3 + data.iloc[[2, 3, 4, 9]] = 3 + pd.testing.assert_series_equal(series.execute().fetch(), data) + + series.iloc[5:] = 4 + data.iloc[5:] = 4 + pd.testing.assert_series_equal(series.execute().fetch(), data) + + # test Index + data = pd.Index(np.arange(10)) + index = md.Index(data, chunk_size=3) + with pytest.raises(TypeError): + index[5:] = 4 + + +def test_loc_getitem(setup): + rs = np.random.RandomState(0) + # index and columns are labels + raw1 = pd.DataFrame( + rs.randint(10, size=(5, 4)), + index=["a1", "a2", "a3", "a4", "a5"], + columns=["a", "b", "c", "d"], + ) + # columns are labels + raw2 = raw1.copy() + raw2.reset_index(inplace=True, drop=True) + # columns are non unique and monotonic + raw3 = raw1.copy() + raw3.columns = ["a", "b", "b", "d"] + # columns are non unique and non monotonic + raw4 = raw1.copy() + raw4.columns = ["b", "a", "b", "d"] + # index that is timestamp + raw5 = raw1.copy() + raw5.index = pd.date_range("2020-1-1", periods=5) + raw6 = raw1[:0] + + df1 = md.DataFrame(raw1, chunk_size=2) + df2 = md.DataFrame(raw2, chunk_size=2) + df3 = md.DataFrame(raw3, chunk_size=2) + df4 = md.DataFrame(raw4, chunk_size=2) + df5 = md.DataFrame(raw5, chunk_size=2) + df6 = md.DataFrame(raw6) + + df = df2.loc[3, "b"] + result = df.execute().fetch() + expected = raw2.loc[3, "b"] + assert result == expected + + df = df1.loc["a3", "b"] + result = df.execute(extra_config={"check_shape": False}).fetch() + expected = raw1.loc["a3", "b"] + assert result == expected + + # test empty list + df = df1.loc[[]] + result = df.execute().fetch() + expected = raw1.loc[[]] + pd.testing.assert_frame_equal(result, expected) + + df = df2.loc[[]] + result = df.execute().fetch() + expected = raw2.loc[[]] + pd.testing.assert_frame_equal(result, expected) + + df = df2.loc[1:4] + result = df.execute().fetch() + expected = raw2.loc[1:4] + pd.testing.assert_frame_equal(result, expected) + + df = df2.loc[1:4, "b":"d"] + result = df.execute().fetch() + expected = raw2.loc[1:4, "b":"d"] + pd.testing.assert_frame_equal(result, expected) + + df = df2.loc[:4, "b":] + result = df.execute().fetch() + expected = raw2.loc[:4, "b":] + pd.testing.assert_frame_equal(result, expected) + + # slice on axis index whose index_value does not have value + df = df1.loc["a2":"a4", "b":] + result = df.execute().fetch() + expected = raw1.loc["a2":"a4", "b":] + pd.testing.assert_frame_equal(result, expected) + + df = df2.loc[:, "b"] + result = df.execute().fetch() + expected = 
raw2.loc[:, "b"] + pd.testing.assert_series_equal(result, expected) + df = df2.loc[:, ["b", "a"]] + result = df.execute().fetch() + expected = raw2.loc[:, ["b", "a"]] + pd.testing.assert_frame_equal(result, expected) + + # 'b' is non-unique + df = df3.loc[:, "b"] + result = df.execute().fetch() + expected = raw3.loc[:, "b"] + pd.testing.assert_frame_equal(result, expected) + + # 'b' is non-unique, and non-monotonic + df = df4.loc[:, "b"] + result = df.execute().fetch() + expected = raw4.loc[:, "b"] + pd.testing.assert_frame_equal(result, expected) + + # label on axis 0 + df = df1.loc["a2", :] + result = df.execute().fetch() + expected = raw1.loc["a2", :] + pd.testing.assert_series_equal(result, expected) + + # label-based fancy index + df = df2.loc[[3, 0, 1], ["c", "a", "d"]] + result = df.execute().fetch() + expected = raw2.loc[[3, 0, 1], ["c", "a", "d"]] + pd.testing.assert_frame_equal(result, expected) + df = df2[df2["a"] < 10] + df = df.loc[[3, 0, 1], ["c", "a", "d"]] + result = df.execute().fetch() + expected = raw2.loc[[3, 0, 1], ["c", "a", "d"]] + pd.testing.assert_frame_equal(result, expected) + + # label-based fancy index, asc sorted + df = df2.loc[[0, 1, 3], ["a", "c", "d"]] + result = df.execute().fetch() + expected = raw2.loc[[0, 1, 3], ["a", "c", "d"]] + pd.testing.assert_frame_equal(result, expected) + + # label-based fancy index in which non-unique exists + selection = rs.randint(2, size=(5,), dtype=bool) + df = df3.loc[selection, ["b", "a", "d"]] + result = df.execute().fetch() + expected = raw3.loc[selection, ["b", "a", "d"]] + pd.testing.assert_frame_equal(result, expected) + + df = df3.loc[md.Series(selection), ["b", "a", "d"]] + result = df.execute().fetch() + expected = raw3.loc[selection, ["b", "a", "d"]] + pd.testing.assert_frame_equal(result, expected) + + # label-based fancy index on index + # whose index_value does not have value + df = df1.loc[["a3", "a1"], ["b", "a", "d"]] + result = df.execute(extra_config={"check_nsplits": False}).fetch() + expected = raw1.loc[["a3", "a1"], ["b", "a", "d"]] + pd.testing.assert_frame_equal(result, expected) + + # get timestamp by str + df = df5.loc["20200101"] + result = df.execute(extra_config={"check_series_name": False}).fetch( + extra_config={"check_series_name": False} + ) + expected = raw5.loc["20200101"] + pd.testing.assert_series_equal(result, expected) + + # get timestamp by str, return scalar + df = df5.loc["2020-1-1", "c"] + result = df.execute().fetch() + expected = raw5.loc["2020-1-1", "c"] + assert result == expected + + # test empty df + df = df6.loc[[]] + result = df.execute().fetch() + expected = raw6.loc[[]] + pd.testing.assert_frame_equal(result, expected) + + +@pytest.mark.pd_compat +def test_dataframe_getitem(setup): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=2) + data2 = data.copy() + data2.index = pd.date_range("2020-1-1", periods=10) + mdf = md.DataFrame(data2, chunk_size=3) + + series1 = df["c2"] + pd.testing.assert_series_equal(series1.execute().fetch(), data["c2"]) + + series2 = df["c5"] + pd.testing.assert_series_equal(series2.execute().fetch(), data["c5"]) + + df1 = df[["c1", "c2", "c3"]] + pd.testing.assert_frame_equal(df1.execute().fetch(), data[["c1", "c2", "c3"]]) + + df2 = df[["c3", "c2", "c1"]] + pd.testing.assert_frame_equal(df2.execute().fetch(), data[["c3", "c2", "c1"]]) + + df3 = df[["c1"]] + pd.testing.assert_frame_equal(df3.execute().fetch(), data[["c1"]]) + + df4 = df[["c3", "c1", "c2", "c1"]] + 
pd.testing.assert_frame_equal(df4.execute().fetch(), data[["c3", "c1", "c2", "c1"]]) + + df5 = df[np.array(["c1", "c2", "c3"])] + pd.testing.assert_frame_equal(df5.execute().fetch(), data[["c1", "c2", "c3"]]) + + df6 = df[["c3", "c2", "c1"]] + pd.testing.assert_frame_equal(df6.execute().fetch(), data[["c3", "c2", "c1"]]) + + df7 = df[1:7:2] + pd.testing.assert_frame_equal(df7.execute().fetch(), data[1:7:2]) + + df8 = df[["c1", "c1"]]["c1"] + pd.testing.assert_frame_equal(df8.execute().fetch(), data[["c1", "c1"]]["c1"]) + + series3 = df["c1"][0] + assert series3.execute().fetch() == data["c1"][0] + + df8 = mdf[3:7] + pd.testing.assert_frame_equal(df8.execute().fetch(), data2[3:7]) + + df9 = mdf["2020-1-2":"2020-1-5"] + pd.testing.assert_frame_equal(df9.execute().fetch(), data2["2020-1-2":"2020-1-5"]) + + +def test_dataframe_getitem_bool(setup): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=2) + + mask_data = data.c1 > 0.5 + mask = md.Series(mask_data, chunk_size=2) + + # getitem by mars series + assert df[mask].execute().fetch().shape == data[mask_data].shape + pd.testing.assert_frame_equal(df[mask].execute().fetch(), data[mask_data]) + + # getitem by pandas series + pd.testing.assert_frame_equal(df[mask_data].execute().fetch(), data[mask_data]) + + # getitem by mars series with alignment but no shuffle + mask_data = pd.Series( + [True, True, True, False, False, True, True, False, False, True], + index=range(9, -1, -1), + ) + mask = md.Series(mask_data, chunk_size=2) + pd.testing.assert_frame_equal(df[mask].execute().fetch(), data[mask_data]) + + # getitem by mars series with shuffle alignment + mask_data = pd.Series( + [True, True, True, False, False, True, True, False, False, True], + index=[0, 3, 6, 2, 9, 8, 5, 7, 1, 4], + ) + mask = md.Series(mask_data, chunk_size=2) + pd.testing.assert_frame_equal( + df[mask].execute().fetch().sort_index(), data[mask_data] + ) + + # getitem by mars series with shuffle alignment and extra element + mask_data = pd.Series( + [True, True, True, False, False, True, True, False, False, True, False], + index=[0, 3, 6, 2, 9, 8, 5, 7, 1, 4, 10], + ) + mask = md.Series(mask_data, chunk_size=2) + pd.testing.assert_frame_equal( + df[mask].execute().fetch().sort_index(), data[mask_data] + ) + + # getitem by DataFrame with all bool columns + r = df[df > 0.5] + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, data[data > 0.5]) + + # getitem by tensor mask + r = df[(df["c1"] > 0.5).to_tensor()] + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, data[data["c1"] > 0.5]) + + # test input data with unknown shape + data = pd.DataFrame(np.random.rand(10, 2), columns=["c1", "c2"]) + mask_data = data[data["c2"] > 0.5] + df = md.DataFrame(data, chunk_size=2) + s = md.Series(mask_data["c1"] > 0.5, chunk_size=2) + df1 = df[df["c2"] > 0.5] + s._index_value = df1.index_value + r = df1[s] + pd.testing.assert_frame_equal(r.execute().fetch(), mask_data[mask_data["c1"] > 0.5]) + + +def test_dataframe_getitem_using_attr(setup): + data = pd.DataFrame( + np.random.rand(10, 5), columns=["c1", "c2", "key", "dtypes", "size"] + ) + df = md.DataFrame(data, chunk_size=2) + + series1 = df.c2 + pd.testing.assert_series_equal(series1.execute().fetch(), data.c2) + + # accessing column using attribute shouldn't overwrite existing attributes + assert df.key == getattr(getattr(df, "_data"), "_key") + assert df.size == data.size + pd.testing.assert_series_equal(df.dtypes, data.dtypes) + 
+ # accessing non-existing attributes should trigger exception + with pytest.raises(AttributeError): + _ = df.zzz # noqa: F841 + + +def test_series_getitem(setup): + data = pd.Series(np.random.rand(10)) + series = md.Series(data) + assert series[1].execute().fetch() == data[1] + + data = pd.Series(np.random.rand(10), name="a") + series = md.Series(data, chunk_size=4) + + for i in range(10): + series1 = series[i] + assert series1.execute().fetch() == data[i] + + series2 = series[[0, 1, 2, 3, 4]] + pd.testing.assert_series_equal(series2.execute().fetch(), data[[0, 1, 2, 3, 4]]) + + series3 = series[[4, 3, 2, 1, 0]] + pd.testing.assert_series_equal(series3.execute().fetch(), data[[4, 3, 2, 1, 0]]) + + series4 = series[[1, 2, 3, 2, 1, 0]] + pd.testing.assert_series_equal(series4.execute().fetch(), data[[1, 2, 3, 2, 1, 0]]) + # + index = ["i" + str(i) for i in range(20)] + data = pd.Series(np.random.rand(20), index=index, name="a") + series = md.Series(data, chunk_size=3) + + for idx in index: + series1 = series[idx] + assert series1.execute().fetch() == data[idx] + + selected = ["i1", "i2", "i3", "i4", "i5"] + series2 = series[selected] + pd.testing.assert_series_equal(series2.execute().fetch(), data[selected]) + + selected = ["i4", "i7", "i0", "i1", "i5"] + series3 = series[selected] + pd.testing.assert_series_equal(series3.execute().fetch(), data[selected]) + + selected = ["i0", "i1", "i5", "i4", "i0", "i1"] + series4 = series[selected] + pd.testing.assert_series_equal(series4.execute().fetch(), data[selected]) + + selected = ["i0"] + series5 = series[selected] + pd.testing.assert_series_equal(series5.execute().fetch(), data[selected]) + + data = pd.Series(np.random.rand(10)) + series = md.Series(data, chunk_size=3) + selected = series[:2] + pd.testing.assert_series_equal(selected.execute().fetch(), data[:2]) + + selected = series[2:8:2] + pd.testing.assert_series_equal(selected.execute().fetch(), data[2:8:2]) + + data = pd.Series(np.random.rand(9), index=["c" + str(i) for i in range(9)]) + series = md.Series(data, chunk_size=3) + selected = series[:"c2"] + pd.testing.assert_series_equal(selected.execute().fetch(), data[:"c2"]) + selected = series["c2":"c9"] + pd.testing.assert_series_equal(selected.execute().fetch(), data["c2":"c9"]) + + +def test_head(setup): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=2) + + pd.testing.assert_frame_equal(df.head().execute().fetch(), data.head()) + pd.testing.assert_frame_equal(df.head(3).execute().fetch(), data.head(3)) + pd.testing.assert_frame_equal(df.head(-3).execute().fetch(), data.head(-3)) + pd.testing.assert_frame_equal(df.head(8).execute().fetch(), data.head(8)) + pd.testing.assert_frame_equal(df.head(-8).execute().fetch(), data.head(-8)) + pd.testing.assert_frame_equal(df.head(13).execute().fetch(), data.head(13)) + pd.testing.assert_frame_equal(df.head(-13).execute().fetch(), data.head(-13)) + + +def test_tail(setup): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=2) + + pd.testing.assert_frame_equal(df.tail().execute().fetch(), data.tail()) + pd.testing.assert_frame_equal(df.tail(3).execute().fetch(), data.tail(3)) + pd.testing.assert_frame_equal(df.tail(-3).execute().fetch(), data.tail(-3)) + pd.testing.assert_frame_equal(df.tail(8).execute().fetch(), data.tail(8)) + pd.testing.assert_frame_equal(df.tail(-8).execute().fetch(), data.tail(-8)) + 
pd.testing.assert_frame_equal(df.tail(13).execute().fetch(), data.tail(13)) + pd.testing.assert_frame_equal(df.tail(-13).execute().fetch(), data.tail(-13)) + + +def test_at(setup): + data = pd.DataFrame( + np.random.rand(10, 5), + columns=["c" + str(i) for i in range(5)], + index=["i" + str(i) for i in range(10)], + ) + df = md.DataFrame(data, chunk_size=3) + data2 = data.copy() + data2.index = np.arange(10) + df2 = md.DataFrame(data2, chunk_size=3) + + with pytest.raises(ValueError): + _ = df.at[["i3, i4"], "c1"] + + result = df.at["i3", "c1"].execute().fetch() + assert result == data.at["i3", "c1"] + + result = df["c1"].at["i2"].execute().fetch() + assert result == data["c1"].at["i2"] + + result = df2.at[3, "c2"].execute().fetch() + assert result == data2.at[3, "c2"] + + result = df2.loc[3].at["c2"].execute().fetch() + assert result == data2.loc[3].at["c2"] + + +def test_iat(setup): + data = pd.DataFrame( + np.random.rand(10, 5), + columns=["c" + str(i) for i in range(5)], + index=["i" + str(i) for i in range(10)], + ) + df = md.DataFrame(data, chunk_size=3) + + with pytest.raises(ValueError): + _ = df.iat[[1, 2], 3] + + result = df.iat[3, 4].execute().fetch() + assert result == data.iat[3, 4] + + result = df.iloc[:, 2].iat[3].execute().fetch() + assert result == data.iloc[:, 2].iat[3] + + +@pytest.mark.pd_compat +def test_setitem(setup): + data = pd.DataFrame( + np.random.rand(10, 5), + columns=["c" + str(i) for i in range(5)], + index=["i" + str(i) for i in range(10)], + ) + data2 = np.random.rand(10) + data3 = np.random.rand(10, 2) + df = md.DataFrame(data, chunk_size=3) + + df["c3"] = df["c3"] + 1 + df["c10"] = 10 + df[4] = mt.tensor(data2, chunk_size=4) + df["d1"] = df["c4"].mean() + df["e1"] = data2 * 2 + + result = df.execute().fetch() + expected = data.copy() + expected["c3"] = expected["c3"] + 1 + expected["c10"] = 10 + expected[4] = data2 + expected["d1"] = data["c4"].mean() + expected["e1"] = data2 * 2 + pd.testing.assert_frame_equal(result, expected) + + # test set multiple cols with scalar + df = md.DataFrame(data, chunk_size=3) + df[["c0", "c2"]] = 1 + df[["c1", "c10"]] = df["c4"].mean() + df[["c11", "c12"]] = mt.tensor(data3, chunk_size=4) + + result = df.execute().fetch() + if not _allow_set_missing_list: + expected = data.copy().reindex( + ["c" + str(i) for i in range(5)] + ["c10", "c11", "c12"], + axis=1, + ) + else: + expected = data.copy() + expected[["c0", "c2"]] = 1 + expected[["c1", "c10"]] = expected["c4"].mean() + expected[["c11", "c12"]] = data3 + pd.testing.assert_frame_equal(result, expected) + + # test set multiple rows + df = md.DataFrame(data, chunk_size=3) + df[["c1", "c4", "c10"]] = df[["c2", "c3", "c4"]] * 2 + + result = df.execute().fetch() + expected = data.copy() + expected[["c1", "c4", "c10"]] = expected[["c2", "c3", "c4"]] * 2 + pd.testing.assert_frame_equal(result, expected) + + # test setitem into empty DataFrame + df = md.DataFrame() + df["a"] = md.Series(np.arange(1, 11), chunk_size=3) + pd.testing.assert_index_equal(df.index_value.to_pandas(), pd.RangeIndex(10)) + + result = df.execute().fetch() + expected = pd.DataFrame() + expected["a"] = pd.Series(np.arange(1, 11)) + pd.testing.assert_frame_equal(result, expected) + + df["b"] = md.Series(np.arange(2, 12), index=pd.RangeIndex(1, 11), chunk_size=3) + result = df.execute().fetch() + expected["b"] = pd.Series(np.arange(2, 12), index=pd.RangeIndex(1, 11)) + pd.testing.assert_frame_equal(result, expected) + + # test set multiple item order + data = pd.DataFrame( + [list(range(5))] * 10, + 
columns=["cc" + str(i) for i in range(5)], + index=["i" + str(i) for i in range(10)], + ) + df = md.DataFrame(data, chunk_size=3) + df2 = df.apply( + lambda x: x * 2, + axis=1, + result_type="expand", + dtypes=[np.int64, np.int64, np.int64, np.int64, np.int64], + output_type="dataframe", + ) + columns = ["dd" + str(i) for i in range(5)] + columns[1] = "cc2" + columns[3] = "cc1" + columns[4] = "cc3" + df2.columns = columns + df[columns] = df2[columns] + result = df.execute().fetch() + df2 = data.apply(lambda x: x * 2) + df2.columns = columns + data[columns] = df2[columns] + pd.testing.assert_frame_equal(result, data) + + +def test_reset_index_execution(setup): + data = pd.DataFrame( + [("bird", 389.0), ("bird", 24.0), ("mammal", 80.5), ("mammal", np.nan)], + index=["falcon", "parrot", "lion", "monkey"], + columns=("class", "max_speed"), + ) + df = md.DataFrame(data) + df2 = df.reset_index() + result = df2.execute().fetch() + expected = data.reset_index() + pd.testing.assert_frame_equal(result, expected) + + df = md.DataFrame(data, chunk_size=2) + df2 = df.reset_index() + result = df2.execute().fetch() + expected = data.reset_index() + pd.testing.assert_frame_equal(result, expected) + + df = md.DataFrame(data, chunk_size=1) + df2 = df.reset_index(drop=True) + result = df2.execute().fetch() + expected = data.reset_index(drop=True) + pd.testing.assert_frame_equal(result, expected) + + index = pd.MultiIndex.from_tuples( + [ + ("bird", "falcon"), + ("bird", "parrot"), + ("mammal", "lion"), + ("mammal", "monkey"), + ], + names=["class", "name"], + ) + data = pd.DataFrame( + [("bird", 389.0), ("bird", 24.0), ("mammal", 80.5), ("mammal", np.nan)], + index=index, + columns=("type", "max_speed"), + ) + df = md.DataFrame(data, chunk_size=1) + df2 = df.reset_index(level="class") + result = df2.execute().fetch() + expected = data.reset_index(level="class") + pd.testing.assert_frame_equal(result, expected) + + columns = pd.MultiIndex.from_tuples([("speed", "max"), ("species", "type")]) + data.columns = columns + df = md.DataFrame(data, chunk_size=2) + df2 = df.reset_index(level="class", col_level=1, col_fill="species") + result = df2.execute().fetch() + expected = data.reset_index(level="class", col_level=1, col_fill="species") + pd.testing.assert_frame_equal(result, expected) + + df = md.DataFrame(data, chunk_size=3) + df.reset_index(level="class", col_level=1, col_fill="species", inplace=True) + result = df.execute().fetch() + expected = data.reset_index(level="class", col_level=1, col_fill="species") + pd.testing.assert_frame_equal(result, expected) + + # Test Series + + s = pd.Series( + [1, 2, 3, 4], name="foo", index=pd.Index(["a", "b", "c", "d"], name="idx") + ) + + series = md.Series(s) + s2 = series.reset_index(name="bar") + result = s2.execute().fetch() + expected = s.reset_index(name="bar") + pd.testing.assert_frame_equal(result, expected) + + series = md.Series(s, chunk_size=2) + s2 = series.reset_index(drop=True) + result = s2.execute().fetch() + expected = s.reset_index(drop=True) + pd.testing.assert_series_equal(result, expected) + + # Test Unknown shape + data1 = pd.DataFrame(np.random.rand(10, 3), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9]) + df1 = md.DataFrame(data1, chunk_size=5) + data2 = pd.DataFrame(np.random.rand(10, 3), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3]) + df2 = md.DataFrame(data2, chunk_size=6) + df = (df1 + df2).reset_index(incremental_index=True) + result = df.execute().fetch() + pd.testing.assert_index_equal(result.index, pd.RangeIndex(12)) + # Inconsistent with Pandas when 
input dataframe's shape is unknown. + result = result.sort_values(by=result.columns[0]) + expected = (data1 + data2).reset_index() + np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy()) + + data1 = pd.Series( + np.random.rand(10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + ) + series1 = md.Series(data1, chunk_size=3) + data2 = pd.Series( + np.random.rand(10), + index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3], + ) + series2 = md.Series(data2, chunk_size=3) + df = (series1 + series2).reset_index(incremental_index=True) + result = df.execute().fetch() + pd.testing.assert_index_equal(result.index, pd.RangeIndex(12)) + # Inconsistent with Pandas when input dataframe's shape is unknown. + result = result.sort_values(by=result.columns[0]) + expected = (data1 + data2).reset_index() + np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy()) + + series1 = md.Series(data1, chunk_size=3) + series1.reset_index(inplace=True, drop=True) + result = series1.execute().fetch() + pd.testing.assert_index_equal(result.index, pd.RangeIndex(10)) + + # case from https://github.com/mars-project/mars/issues/1286 + data = pd.DataFrame(np.random.rand(10, 3), columns=list("abc")) + df = md.DataFrame(data, chunk_size=3) + + r = df.sort_values("a").reset_index(drop=True, incremental_index=True) + result = r.execute().fetch() + expected = data.sort_values("a").reset_index(drop=True) + pd.testing.assert_frame_equal(result, expected) + + +def test_rename(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame(rs.rand(10, 4), columns=["A", "B", "C", "D"]) + df = md.DataFrame(raw, chunk_size=3) + + with pytest.warns(Warning): + df.rename(str, errors="raise") + + with pytest.raises(NotImplementedError): + df.rename({"A": "a", "B": "b"}, axis=1, copy=False) + + r = df.rename(str) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.rename(str)) + + r = df.rename({"A": "a", "B": "b"}, axis=1) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.rename({"A": "a", "B": "b"}, axis=1) + ) + + df.rename({"A": "a", "B": "b"}, axis=1, inplace=True) + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.rename({"A": "a", "B": "b"}, axis=1) + ) + + raw = pd.DataFrame( + rs.rand(10, 4), + columns=pd.MultiIndex.from_tuples( + (("A", "C"), ("A", "D"), ("B", "E"), ("B", "F")) + ), + ) + df = md.DataFrame(raw, chunk_size=3) + + r = df.rename({"C": "a", "D": "b"}, level=1, axis=1) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.rename({"C": "a", "D": "b"}, level=1, axis=1) + ) + + raw = pd.Series(rs.rand(10), name="series") + series = md.Series(raw, chunk_size=3) + + r = series.rename("new_series") + pd.testing.assert_series_equal(r.execute().fetch(), raw.rename("new_series")) + + r = series.rename(lambda x: 2**x) + pd.testing.assert_series_equal(r.execute().fetch(), raw.rename(lambda x: 2**x)) + + with pytest.raises(TypeError): + series.name = {1: 10, 2: 20} + + series.name = "new_series" + pd.testing.assert_series_equal(series.execute().fetch(), raw.rename("new_series")) + + raw = pd.MultiIndex.from_frame(pd.DataFrame(rs.rand(10, 2), columns=["A", "B"])) + idx = md.Index(raw) + + r = idx.rename(["C", "D"]) + pd.testing.assert_index_equal(r.execute().fetch(), raw.rename(["C", "D"])) + + r = idx.set_names("C", level=0) + pd.testing.assert_index_equal(r.execute().fetch(), raw.set_names("C", level=0)) + + +def test_rename_axis(setup): + rs = np.random.RandomState(0) + + # test dataframe cases + raw = pd.DataFrame(rs.rand(10, 4), columns=["A", "B", "C", "D"]) + df = md.DataFrame(raw, 
chunk_size=3) + + r = df.rename_axis("idx") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.rename_axis("idx")) + + r = df.rename_axis("cols", axis=1) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.rename_axis("cols", axis=1)) + + df.rename_axis("c", axis=1, inplace=True) + pd.testing.assert_frame_equal(df.execute().fetch(), raw.rename_axis("c", axis=1)) + + df.columns.name = "df_cols" + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.rename_axis("df_cols", axis=1) + ) + + # test dataframe cases with MultiIndex + raw = pd.DataFrame( + rs.rand(10, 4), + columns=pd.MultiIndex.from_tuples([("A", 1), ("B", 2), ("C", 3), ("D", 4)]), + ) + df = md.DataFrame(raw, chunk_size=3) + + df.columns.names = ["c1", "c2"] + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.rename_axis(["c1", "c2"], axis=1) + ) + + df.columns.set_names("c2_1", level=1, inplace=True) + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.rename_axis(["c1", "c2_1"], axis=1) + ) + + # test series cases + raw = pd.Series(rs.rand(10)) + s = md.Series(raw, chunk_size=3) + + r = s.rename_axis("idx") + pd.testing.assert_series_equal(r.execute().fetch(), raw.rename_axis("idx")) + + s.index.name = "series_idx" + pd.testing.assert_series_equal(s.execute().fetch(), raw.rename_axis("series_idx")) + + +def test_insert(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame(rs.rand(10, 4), columns=["A", "B", "C", "D"]) + + with pytest.raises(ValueError): + tensor = mt.tensor(rs.rand(10, 10), chunk_size=4) + df = md.DataFrame(raw.copy(deep=True), chunk_size=3) + df.insert(4, "E", tensor) + + df = md.DataFrame(raw.copy(deep=True), chunk_size=3) + df.insert(4, "E", 0) + raw_dup = raw.copy(deep=True) + raw_dup.insert(4, "E", 0) + pd.testing.assert_frame_equal(df.execute().fetch(), raw_dup) + + raw_tensor = rs.rand(10) + tensor = mt.tensor(raw_tensor, chunk_size=4) + df = md.DataFrame(raw.copy(deep=True), chunk_size=3) + df.insert(4, "E", tensor) + raw_dup = raw.copy(deep=True) + raw_dup.insert(4, "E", raw_tensor) + pd.testing.assert_frame_equal(df.execute().fetch(), raw_dup) + + +def _wrap_execute_data_source(limit, op_cls): + def _execute_data_source(ctx, op): + op_cls.execute(ctx, op) + result = ctx[op.outputs[0].key] + if len(result) > limit: + raise RuntimeError("have data more than expected") # pragma: no cover + + return _execute_data_source + + +def _wrap_execute_data_source_usecols(usecols, op_cls): + def _execute_data_source(ctx, op): # pragma: no cover + op_cls.execute(ctx, op) + result = ctx[op.outputs[0].key] + if not isinstance(usecols, list): + if not isinstance(result, pd.Series): + raise RuntimeError(f"Out data should be a Series, got {type(result)}") + elif len(result.columns) > len(usecols): + params = dict( + (k, getattr(op, k, None)) + for k in op._keys_ + if k not in op._no_copy_attrs_ + ) + raise RuntimeError( + f"have data more than expected, got {result.columns}, " + f"result {result}, op params {params}" + ) + + return _execute_data_source + + +def _wrap_execute_data_source_mixed(limit, usecols, op_cls): + def _execute_data_source(ctx, op): # pragma: no cover + op_cls.execute(ctx, op) + result = ctx[op.outputs[0].key] + if not isinstance(usecols, list): + if not isinstance(result, pd.Series): + raise RuntimeError("Out data should be a Series") + elif len(result.columns) > len(usecols): + raise RuntimeError("have data more than expected") + if len(result) > limit: + raise RuntimeError("have data more than expected") + + return _execute_data_source + + 
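+# The three ``_wrap_execute_data_source*`` helpers above substitute an operand's
+# ``execute`` with a checked version, so the optimization tests below can assert
+# that head/column pruning really limited what each read operand produced.
+# A minimal sketch of the intended wiring (illustrative only, mirroring the
+# tests that follow; the limit of 3 is an arbitrary example value):
+#
+#     executors = {DataFrameReadCSV: _wrap_execute_data_source(3, DataFrameReadCSV)}
+#     md.read_csv(filename, chunk_bytes=chunk_bytes).head(3).execute(
+#         extra_config={"operand_executors": executors}
+#     )
+#
+# If the optimizer fails to push ``head(3)`` down into the read, the wrapped
+# executor raises ``RuntimeError("have data more than expected")``.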
+@pytest.mark.skip_ray_dag # operand_executors is not supported by ray backend. +@pytest.mark.pd_compat +def test_optimization(setup): + import sqlalchemy as sa + + with tempfile.TemporaryDirectory() as tempdir: + filename = os.path.join(tempdir, "test_head.csv") + rs = np.random.RandomState(0) + pd_df = pd.DataFrame( + { + "a": rs.randint(1000, size=(2000,)).astype(np.int64), + "b": rs.randint(1000, size=(2000,)).astype(np.int64), + "c": ["sss" for _ in range(2000)], + "d": ["eeee" for _ in range(2000)], + } + ) + pd_df.to_csv(filename, index=False) + + size = os.path.getsize(filename) + chunk_bytes = size / 3 - 2 + + df = md.read_csv(filename, chunk_bytes=chunk_bytes) + + cols = ["b", "a", "c"] + r = df[cols] + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source_usecols(cols, DataFrameReadCSV) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df[cols] + result.reset_index(drop=True, inplace=True) + pd.testing.assert_frame_equal(result, expected) + + cols = ["b", "a", "b"] + r = df[cols].head(20) + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source_usecols(cols, DataFrameReadCSV) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df[cols].head(20) + result.reset_index(drop=True, inplace=True) + pd.testing.assert_frame_equal(result, expected) + + r = df["c"] + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source_usecols("c", DataFrameReadCSV) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df["c"] + result.reset_index(drop=True, inplace=True) + pd.testing.assert_series_equal(result, expected) + + r = df["d"].head(3) + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source_mixed(3, "d", DataFrameReadCSV) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df["d"].head(3) + pd.testing.assert_series_equal(result, expected) + + # test DataFrame.head + r = df.head(3) + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source(3, DataFrameReadCSV) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df.head(3) + pd.testing.assert_frame_equal(result, expected) + + # test DataFrame.tail + r = df.tail(3) + result = r.execute().fetch() + expected = pd_df.tail(3) + pd.testing.assert_frame_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) + + # test head more than 1 chunk + r = df.head(99) + result = r.execute().fetch() + result.reset_index(drop=True, inplace=True) + expected = pd_df.head(99) + pd.testing.assert_frame_equal(result, expected) + + # test Series.tail more than 1 chunk + r = df.tail(99) + result = r.execute().fetch() + expected = pd_df.tail(99) + pd.testing.assert_frame_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) + + # test head number greater than limit + df = md.read_csv(filename, chunk_bytes=chunk_bytes) + r = df.head(1100) + + with pytest.raises(RuntimeError): + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source(3, DataFrameReadCSV) + } + r.execute(extra_config={"operand_executors": operand_executors}) + + result = r.execute().fetch() + expected = pd_df.head(1100) + pd.testing.assert_frame_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) + + filename = os.path.join(tempdir, "test_sql.db") + conn = 
sa.create_engine("sqlite:///" + filename) + pd_df.to_sql("test_sql", conn) + + df = md.read_sql("test_sql", conn, index_col="index", chunk_size=20) + + # test DataFrame.head + r = df.head(3) + operand_executors = { + DataFrameReadSQL: _wrap_execute_data_source(3, DataFrameReadSQL) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + result.index.name = None + expected = pd_df.head(3) + pd.testing.assert_frame_equal(result, expected) + + # test head on read_parquet + filename = os.path.join(tempdir, "test_parquet.db") + pd_df.to_parquet(filename, index=False, compression="gzip") + + engines = [] + if pa is not None: + engines.append("pyarrow") + if fp is not None: + engines.append("fastparquet") + + for engine in engines: + df = md.read_parquet(filename, engine=engine) + r = df.head(3) + + operand_executors = { + DataFrameReadParquet: _wrap_execute_data_source(3, DataFrameReadParquet) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df.head(3) + pd.testing.assert_frame_equal(result, expected) + + dirname = os.path.join(tempdir, "test_parquet2") + os.makedirs(dirname) + pd_df[:1000].to_parquet(os.path.join(dirname, "q1.parquet")) + pd_df[1000:].to_parquet(os.path.join(dirname, "q2.parquet")) + + df = md.read_parquet(dirname) + r = df.head(3) + + operand_executors = { + DataFrameReadParquet: _wrap_execute_data_source(3, DataFrameReadParquet) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df.head(3) + pd.testing.assert_frame_equal(result, expected) + + +def test_reindex_execution(setup): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=4) + + for enable_sparse in [True, False, None]: + r = df.reindex( + index=mt.arange(10, 1, -1, chunk_size=3), enable_sparse=enable_sparse + ) + + result = r.execute().fetch() + expected = data.reindex(index=np.arange(10, 1, -1)) + pd.testing.assert_frame_equal(result, expected) + + r = df.reindex(columns=["c5", "c6", "c2"], enable_sparse=enable_sparse) + + result = r.execute().fetch() + expected = data.reindex(columns=["c5", "c6", "c2"]) + pd.testing.assert_frame_equal(result, expected) + + for enable_sparse in [True, False]: + r = df.reindex( + index=[5, 11, 1], columns=["c5", "c6", "c2"], enable_sparse=enable_sparse + ) + + result = r.execute().fetch() + expected = data.reindex(index=[5, 11, 1], columns=["c5", "c6", "c2"]) + pd.testing.assert_frame_equal(result, expected) + + r = df.reindex( + index=mt.tensor([2, 4, 10]), + columns=["c2", "c3", "c5", "c7"], + method="bfill", + enable_sparse=enable_sparse, + ) + + result = r.execute().fetch() + expected = data.reindex( + index=[2, 4, 10], columns=["c2", "c3", "c5", "c7"], method="bfill" + ) + pd.testing.assert_frame_equal(result, expected) + + for fill_value, test_fill_value in [ + (3, 3), + (df.iloc[:, 0].max(), data.iloc[:, 0].max()), + ]: + r = df.reindex( + index=mt.tensor([2, 4, 10]), + columns=["c2", "c3", "c5", "c7"], + fill_value=fill_value, + enable_sparse=enable_sparse, + ) + + result = r.execute().fetch() + expected = data.reindex( + index=[2, 4, 10], + columns=["c2", "c3", "c5", "c7"], + fill_value=test_fill_value, + ) + pd.testing.assert_frame_equal(result, expected) + + # test date_range index + data = pd.DataFrame( + np.random.rand(10, 5), index=pd.date_range("2020-1-1", periods=10) + ) + df = md.DataFrame(data, chunk_size=5) + + r = df.reindex( + 
index=md.date_range("2020-1-6", periods=6), + method="ffill", + enable_sparse=enable_sparse, + ) + + result = r.execute().fetch() + expected = data.reindex( + index=pd.date_range("2020-1-6", periods=6), method="ffill" + ) + pd.testing.assert_frame_equal(result, expected) + + # test MultiIndex + data = pd.DataFrame( + np.random.rand(10, 5), + index=pd.MultiIndex.from_arrays([np.arange(10), np.arange(11, 1, -1)]), + ) + df = md.DataFrame(data, chunk_size=5) + + r = df.reindex([2, 4, 9, 12], level=1, enable_sparse=enable_sparse) + + result = r.execute(extra_config={"check_shape": False}).fetch( + extra_config={"check_shape": False} + ) + expected = data.reindex([2, 4, 9, 12], level=1) + pd.testing.assert_frame_equal(result, expected) + + r = df.reindex( + mt.tensor([2, 4, 9, 12], chunk_size=2), level=1, enable_sparse=enable_sparse + ) + + result = r.execute(extra_config={"check_shape": False}).fetch( + extra_config={"check_shape": False} + ) + expected = data.reindex([2, 4, 9, 12], level=1) + pd.testing.assert_frame_equal(result, expected) + + # test duplicate index + index = np.arange(10) + index[-1] = 0 + data = pd.DataFrame(np.random.rand(10, 5), index=index) + df = md.DataFrame(data, chunk_size=5) + + with pytest.raises(ValueError): + r = df.reindex([0, 1], enable_sparse=enable_sparse) + r.execute() + + # test one chunk + data = pd.DataFrame( + np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"] + ) + df = md.DataFrame(data, chunk_size=10) + + r = df.reindex( + index=mt.arange(10, 1, -1, chunk_size=10), + fill_value=df["c1"].max(), + enable_sparse=enable_sparse, + ) + + result = r.execute().fetch() + expected = data.reindex(index=np.arange(10, 1, -1), fill_value=data["c1"].max()) + pd.testing.assert_frame_equal(result, expected) + + # test series + s_data = pd.Series(np.random.rand(10), index=[f"c{i + 1}" for i in range(10)]) + series = md.Series(s_data, chunk_size=6) + + r = series.reindex(["c2", "c11", "c4"], copy=False, enable_sparse=enable_sparse) + + result = r.execute().fetch() + expected = s_data.reindex(["c2", "c11", "c4"], copy=False) + pd.testing.assert_series_equal(result, expected) + + +def test_reindex_like_execution(setup): + data = pd.DataFrame( + np.random.rand(10, 5), + columns=["c1", "c2", "c3", "c4", "c5"], + index=pd.date_range("2021-1-1", periods=10), + ) + data2 = pd.DataFrame( + np.random.rand(4, 2), + columns=["c2", "c4"], + index=pd.date_range("2020-1-2", periods=4), + ) + df = md.DataFrame(data, chunk_size=4) + df2 = md.DataFrame(data2, chunk_size=3) + + r = df.reindex_like(df2) + result = r.execute().fetch() + expected = data.reindex_like(data2) + pd.testing.assert_frame_equal(result, expected) + + r = df.reindex_like(df, copy=False) + result = r.execute().fetch() + expected = data.reindex_like(data) + pd.testing.assert_frame_equal(result, expected) + + s = md.Series(data["c2"], chunk_size=4) + s2 = md.Series(data2["c2"], chunk_size=3) + + r = s.reindex_like(s2) + result = r.execute().fetch() + expected = data["c2"].reindex_like(data2["c2"]) + pd.testing.assert_series_equal(result, expected) + + r = s.reindex_like(s, copy=False) + result = r.execute().fetch() + expected = data["c2"].reindex_like(data["c2"]) + pd.testing.assert_series_equal(result, expected) + + +def test_where_execution(setup): + dates = pd.date_range("1/1/2000", periods=20) + + raw_df = pd.DataFrame( + np.random.randn(20, 10), index=dates, columns=list("ABCDEFGHIJ") + ) + raw_df2 = pd.DataFrame( + np.random.randn(20, 10), index=dates, columns=list("ABCDEFGHIJ") + ) + df = 
md.DataFrame(raw_df, chunk_size=6) + df2 = md.DataFrame(raw_df2, chunk_size=7) + + raw_series = pd.Series(np.random.randn(20), index=dates) + raw_series2 = pd.Series(np.random.randn(20), index=dates) + raw_series3 = pd.Series(np.random.randn(10), index=list("ABCDEFGHIJ")) + series = md.Series(raw_series, chunk_size=6) + series2 = md.Series(raw_series2, chunk_size=7) + series3 = md.Series(raw_series3, chunk_size=7) + + # tests for dataframes + with pytest.raises(NotImplementedError): + df.mask(df < 0, md.DataFrame(np.random.randn(5, 5))) + with pytest.raises(NotImplementedError): + df.mask(series < 0, md.Series(np.random.randn(5)), axis=0) + + r = df.mask(df < 0) + pd.testing.assert_frame_equal(r.execute().fetch(), raw_df.mask(raw_df < 0)) + r = df.mask(raw_df < 0, df2) + pd.testing.assert_frame_equal(r.execute().fetch(), raw_df.mask(raw_df < 0, raw_df2)) + + # tests for series + with pytest.raises(NotImplementedError): + series.mask(series < 0, md.Series(np.random.randn(5))) + + r = series.where(series < 0, 0) + pd.testing.assert_series_equal( + r.execute().fetch(), raw_series.where(raw_series < 0, 0) + ) + r = series.where(series < 0, series2) + pd.testing.assert_series_equal( + r.execute().fetch(), raw_series.where(raw_series < 0, raw_series2) + ) + + # test for dataframe with series + with pytest.raises(ValueError): + df.mask(df < 0, series) + + r = df.mask(df < 0, series, axis=0) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.mask(raw_df < 0, raw_series, axis=0) + ) + r = df.mask(series < 0, df2) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.mask(raw_series < 0, raw_df2) + ) + r = df.mask(series < 0, series3, axis=1) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.mask(raw_series < 0, raw_series3, axis=1) + ) + + # test inplace + new_df = df.copy() + new_df.mask(new_df < 0, inplace=True) + pd.testing.assert_frame_equal(new_df.execute().fetch(), raw_df.mask(raw_df < 0)) + + +def test_set_axis_execution(setup): + raw_df = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(raw_df, chunk_size=3) + + # test axis=0 + idx_data = np.arange(0, 10) + np.random.shuffle(idx_data) + new_idx = md.Index(idx_data, chunk_size=4) + + r = df.set_axis(new_idx) + pd.testing.assert_frame_equal(r.execute().fetch(), raw_df.set_axis(idx_data)) + + new_idx = pd.Index(range(9, -1, -1)) + r = df.set_axis(new_idx) + pd.testing.assert_frame_equal(r.execute().fetch(), raw_df.set_axis(new_idx)) + + df1 = df.copy() + df1.index = pd.Index(range(9, -1, -1)) + pd.testing.assert_frame_equal(df1.execute().fetch(), raw_df.set_axis(new_idx)) + + ser = md.Series(idx_data) + with pytest.raises(ValueError): + df.set_axis(ser[ser > 5]).execute() + + # test axis=1 + new_axis = ["a1", "a2", "a3", "a4", "a5"] + r = df.set_axis(new_axis, axis=1) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.set_axis(new_axis, axis=1) + ) + + r = df.set_axis(md.Index(new_axis, store_data=True), axis=1) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.set_axis(new_axis, axis=1) + ) + + df1 = df.copy() + df1.columns = new_axis + pd.testing.assert_frame_equal( + df1.execute().fetch(), raw_df.set_axis(new_axis, axis=1) + ) + + with pytest.raises(ValueError): + df.set_axis(["a1", "a2", "a3", "a4"], axis=1) + + # test series + raw_series = pd.Series(np.random.rand(10)) + s = md.Series(raw_series, chunk_size=3) + + idx_data = np.arange(0, 10) + np.random.shuffle(idx_data) + new_idx = md.Index(idx_data, chunk_size=4) + + r = 
s.set_axis(new_idx) + pd.testing.assert_series_equal(r.execute().fetch(), raw_series.set_axis(idx_data)) + + s1 = s.copy() + s1.index = new_idx + pd.testing.assert_series_equal(s1.execute().fetch(), raw_series.set_axis(idx_data)) + + +def test_sample_execution(setup): + rs = np.random.RandomState(0) + + # test dataframe + raw_df = pd.DataFrame(rs.rand(100, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + + # test single chunk + df = md.DataFrame(raw_df) + r = df.sample(10, random_state=rs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.sample(10, random_state=rs) + ) + r = df.sample(frac=0.1, weights="c1", random_state=rs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.sample(frac=0.1, weights="c1", random_state=rs) + ) + r = df.sample(10, weights=df["c2"], random_state=rs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.sample(10, weights=raw_df["c2"], random_state=rs) + ) + + r = df.sample(10, weights=df["c2"], random_state=0) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.sample(10, weights=raw_df["c2"], random_state=0) + ) + + r = df.sample(10, weights=df["c2"], random_state=np.array([1, 2])) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw_df.sample(10, weights=raw_df["c2"], random_state=np.array([1, 2])), + ) + + # test multinomial tile & execution + df = md.DataFrame(raw_df, chunk_size=13) + r1 = df.sample(10, replace=True, random_state=rs) + r2 = df[:].sample(10, replace=True, random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights="c2", always_multinomial=True, random_state=rs) + r2 = df[:].sample(frac=0.1, weights="c2", always_multinomial=True, random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights=df["c2"], always_multinomial=True, random_state=rs) + r2 = df[:].sample( + frac=0.1, weights=df["c2"], always_multinomial=True, random_state=rs + ) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights=df["c2"], always_multinomial=True, random_state=0) + r2 = df[:].sample( + frac=0.1, weights=df["c2"], always_multinomial=True, random_state=0 + ) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample( + frac=0.1, + weights=df["c2"], + always_multinomial=True, + random_state=np.array([1, 2]), + ) + r2 = df[:].sample( + frac=0.1, + weights=df["c2"], + always_multinomial=True, + random_state=np.array([1, 2]), + ) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + # test reservoir tile & execution + df = md.DataFrame(raw_df, chunk_size=13) + r1 = df.sample(90, random_state=rs) + r2 = df[:].sample(90, random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(10, random_state=rs) + r2 = df[:].sample(10, random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights="c2", random_state=rs) + r2 = df[:].sample(frac=0.1, weights="c2", random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights=df["c2"], random_state=rs) + r2 = df[:].sample(frac=0.1, weights=df["c2"], random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights=df["c2"], random_state=0) + r2 = df[:].sample(frac=0.1, 
weights=df["c2"], random_state=0) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights=df["c2"], random_state=np.array([1, 2])) + r2 = df[:].sample(frac=0.1, weights=df["c2"], random_state=np.array([1, 2])) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + # test series + raw_series = pd.Series(rs.rand(100)) + raw_weights = pd.Series(rs.rand(100)) + + # test single chunk + s = md.Series(raw_series) + r = s.sample(10, random_state=rs) + pd.testing.assert_series_equal( + r.execute().fetch(), raw_series.sample(10, random_state=rs) + ) + weights = md.Series(raw_weights, chunk_size=13) + r = s.sample(10, weights=weights, random_state=rs) + pd.testing.assert_series_equal( + r.execute().fetch(), raw_series.sample(10, weights=raw_weights, random_state=rs) + ) + + # test multinomial tile & execution + s = md.Series(raw_series, chunk_size=13) + weights = md.Series(raw_weights, chunk_size=13) + + r1 = s.sample(10, replace=True, random_state=rs) + r2 = s[:].sample(10, replace=True, random_state=rs) + pd.testing.assert_series_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = s.sample(frac=0.1, weights=weights, always_multinomial=True, random_state=rs) + r2 = s[:].sample( + frac=0.1, weights=weights, always_multinomial=True, random_state=rs + ) + pd.testing.assert_series_equal(r1.execute().fetch(), r2.execute().fetch()) + + # test reservoir tile & execution + r1 = s.sample(10, random_state=rs) + r2 = s[:].sample(10, random_state=rs) + pd.testing.assert_series_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = s.sample(frac=0.1, weights=weights, random_state=rs) + r2 = s[:].sample(frac=0.1, weights=weights, random_state=rs) + pd.testing.assert_series_equal(r1.execute().fetch(), r2.execute().fetch()) + + +def test_loc_setitem(setup): + raw_df = pd.DataFrame({"a": [1, 2, 3, 4, 2, 4, 5, 7, 2, 8, 9], 1: [10] * 11}) + md_data = md.DataFrame(raw_df, chunk_size=3) + md_data.loc[md_data["a"] <= 4, 1] = "v1" + pd_data = raw_df.copy(True) + pd_data.loc[pd_data["a"] <= 4, 1] = "v1" + pd.testing.assert_frame_equal(md_data.to_pandas(), pd_data) + + md_data1 = md.DataFrame(raw_df, chunk_size=3) + md_data1.loc[1:3] = "v2" + pd_data1 = raw_df.copy(True) + pd_data1.loc[1:3] = "v2" + pd.testing.assert_frame_equal(md_data1.to_pandas(), pd_data1) + + md_data2 = md.DataFrame(raw_df, chunk_size=3) + md_data2.loc[1:3, 1] = "v2" + pd_data2 = raw_df.copy(True) + pd_data2.loc[1:3, 1] = "v2" + pd.testing.assert_frame_equal(md_data2.to_pandas(), pd_data2) + + +def test_add_prefix_suffix(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame(rs.rand(10, 4), columns=["A", "B", "C", "D"]) + df = md.DataFrame(raw, chunk_size=3) + + r = df.add_prefix("col_") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.add_prefix("col_")) + + r = df.add_suffix("_col") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.add_suffix("_col")) + + raw = pd.Series(rs.rand(10), name="series") + series = md.Series(raw, chunk_size=3) + + r = series.add_prefix("item_") + pd.testing.assert_series_equal(r.execute().fetch(), raw.add_prefix("item_")) + + r = series.add_suffix("_item") + pd.testing.assert_series_equal(r.execute().fetch(), raw.add_suffix("_item")) + + +@pytest.mark.parametrize("join", ["outer", "left"]) +def test_align_execution(setup, join): + rs = np.random.RandomState(0) + raw_df1 = pd.DataFrame( + rs.rand(10, 10), columns=list("ABCDEFGHIJ"), index=pd.RangeIndex(10) + ) + raw_df2 = pd.DataFrame( + rs.rand(10, 10), + 
columns=list("ACDFGIJKLM"), + index=[2, 3, 6, 7, 8, 9, 10, 13, 15, 17], + ) + raw_s1 = pd.Series(rs.rand(10), index=[2, 3, 6, 7, 8, 9, 10, 13, 15, 17]) + raw_s2 = pd.Series(rs.rand(10), index=pd.RangeIndex(10)) + raw_s3 = raw_s4 = raw_df2.iloc[0, :] + raw_s5 = raw_df1.iloc[0, :] + + df1 = md.DataFrame(raw_df1, chunk_size=5) + df2 = md.DataFrame(raw_df2, chunk_size=4) + s1 = md.Series(raw_s1, chunk_size=4) + s2 = md.Series(raw_s2, chunk_size=4) + s3 = md.Series(raw_s3, chunk_size=4) + s4 = df2.iloc[0, :] + s5 = df1.iloc[0, :] + + # test dataframe vs dataframe + r1, r2 = mars.fetch( + mars.execute(*df1.align(df1, join=join), extra_config={"check_nsplits": False}) + ) + pd.testing.assert_frame_equal(r1, raw_df1) + pd.testing.assert_frame_equal(r2, raw_df1) + + r1, r2 = mars.fetch( + mars.execute(*df1.align(df2, join=join), extra_config={"check_nsplits": False}) + ) + exp1, exp2 = raw_df1.align(raw_df2, join=join) + pd.testing.assert_frame_equal(r1, exp1) + pd.testing.assert_frame_equal(r2, exp2) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(df2, join=join, axis=0), extra_config={"check_nsplits": False} + ) + ) + exp1, exp2 = raw_df1.align(raw_df2, join=join, axis=0) + pd.testing.assert_frame_equal(r1.sort_index(axis=1), exp1) + pd.testing.assert_frame_equal(r2.sort_index(axis=1), exp2) + + r2, r1 = mars.fetch( + mars.execute( + *df2.align(df1, join=join, axis=0, fill_value=0.0), + extra_config={"check_nsplits": False}, + ) + ) + exp2, exp1 = raw_df2.align(raw_df1, join=join, axis=0, fill_value=0.0) + pd.testing.assert_frame_equal(r1.sort_index(axis=1), exp1) + pd.testing.assert_frame_equal(r2.sort_index(axis=1), exp2) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(df2, join=join, axis=1), extra_config={"check_nsplits": False} + ) + ) + exp1, exp2 = raw_df1.align(raw_df2, join=join, axis=1) + pd.testing.assert_frame_equal(r1.sort_index(), exp1) + pd.testing.assert_frame_equal(r2.sort_index(), exp2) + + # test dataframe vs series + with pytest.raises(ValueError): + # must specify align axis + df1.align(s1) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(s1, join=join, axis=0, method="ffill"), + extra_config={"check_nsplits": False}, + ) + ) + exp1, exp2 = raw_df1.align(raw_s1, join=join, axis=0, method="ffill") + pd.testing.assert_frame_equal(r1.sort_index(), exp1) + pd.testing.assert_series_equal(r2.sort_index(), exp2) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(s1, join=join, axis=0, broadcast_axis=1), + extra_config={"check_nsplits": False}, + ) + ) + exp1, exp2 = raw_df1.align(raw_s1, join=join, axis=0, broadcast_axis=1) + pd.testing.assert_frame_equal(r1.sort_index(), exp1) + pd.testing.assert_frame_equal(r2.sort_index(), exp2) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(s3, join=join, axis=1), extra_config={"check_nsplits": False} + ) + ) + exp1, exp2 = raw_df1.align(raw_s3, join=join, axis=1) + pd.testing.assert_frame_equal(r1.sort_index(axis=1), exp1) + pd.testing.assert_series_equal(r2.sort_index(), exp2) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(s4, join=join, axis=1), extra_config={"check_nsplits": False} + ) + ) + exp1, exp2 = raw_df1.align(raw_s4, join=join, axis=1) + pd.testing.assert_frame_equal(r1.sort_index(axis=1), exp1) + pd.testing.assert_series_equal(r2.sort_index(), exp2) + + r1, r2 = mars.fetch( + mars.execute( + *s1.align(df1, join=join, axis=0), extra_config={"check_nsplits": False} + ) + ) + exp1, exp2 = raw_s1.align(raw_df1, join=join, axis=0) + pd.testing.assert_series_equal(r1.sort_index(), exp1) + 
pd.testing.assert_frame_equal(r2.sort_index(), exp2) + + # test series vs series + r1, r2 = mars.fetch( + mars.execute(*s1.align(s2, join=join), extra_config={"check_nsplits": False}) + ) + exp1, exp2 = raw_s1.align(raw_s2, join=join) + pd.testing.assert_series_equal(r1.sort_index(), exp1) + pd.testing.assert_series_equal(r2.sort_index(), exp2) + + r1, r2 = mars.fetch( + mars.execute(*s4.align(s5, join=join), extra_config={"check_nsplits": False}) + ) + exp1, exp2 = raw_s4.align(raw_s5, join=join) + pd.testing.assert_series_equal(r1.sort_index(), exp1) + pd.testing.assert_series_equal(r2.sort_index(), exp2) diff --git a/python/xorbits/_mars/dataframe/indexing/utils.py b/python/xorbits/_mars/dataframe/indexing/utils.py new file mode 100644 index 000000000..b8dd957d2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/utils.py @@ -0,0 +1,53 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + + +def calc_columns_index(column_name, df): + """ + Calculate the chunk index on the axis 1 according to the selected column. + :param column_name: selected column name + :param df: input tiled DataFrame + :return: chunk index on the columns axis + """ + column_nsplits = df.nsplits[1] + # if has duplicate columns, will return multiple values + columns = df.columns_value.to_pandas().to_numpy() + column_locs = (columns == column_name).nonzero()[0] + + return [ + np.searchsorted(np.cumsum(column_nsplits), column_loc + 1) + for column_loc in column_locs + ] + + +def convert_labels_into_positions(pandas_index, labels): + """ + Convert labels into positions + + :param pandas_index: pandas Index + :param labels: labels + :return: positions + """ + result = [] + for label in labels: + loc = pandas_index.get_loc(label) + if isinstance(loc, (int, np.integer)): + result.append(loc) + else: + # slice or boolean array + result.extend(pd.RangeIndex(len(pandas_index))[loc].tolist()) + return np.asarray(result) diff --git a/python/xorbits/_mars/dataframe/indexing/where.py b/python/xorbits/_mars/dataframe/indexing/where.py new file mode 100644 index 000000000..ccf45611c --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/where.py @@ -0,0 +1,431 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int32Field, StringField +from ...tensor.utils import filter_inputs +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_series, validate_axis + + +class DataFrameWhere(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.WHERE + + _input = AnyField("input") + _cond = AnyField("cond") + _other = AnyField("other") + _axis = Int32Field("axis") + _level = AnyField("level") + _errors = StringField("errors") + _try_cast = BoolField("try_cast") + _replace_true = BoolField("replace_true") + + def __init__( + self, + input=None, + cond=None, + other=None, # pylint: disable=redefined-builtin + axis=None, + level=None, + errors=None, + try_cast=None, + replace_true=None, + **kw + ): + super().__init__( + _input=input, + _cond=cond, + _other=other, + _axis=axis, + _level=level, + _errors=errors, + _try_cast=try_cast, + _replace_true=replace_true, + **kw + ) + + @property + def input(self): + return self._input + + @property + def cond(self): + return self._cond + + @property + def other(self): + return self._other + + @property + def axis(self): + return self._axis + + @property + def level(self): + return self._level + + @property + def errors(self): + return self._errors + + @property + def try_cast(self): + return self._try_cast + + @property + def replace_true(self): + return self._replace_true + + def __call__(self, df_or_series): + def _check_input_index(obj, axis=None): + axis = axis if axis is not None else self.axis + if isinstance(obj, DATAFRAME_TYPE) and ( + df_or_series.columns_value.key != obj.columns_value.key + or df_or_series.index_value.key != obj.index_value.key + ): + raise NotImplementedError("Aligning different indices not supported") + elif ( + isinstance(obj, SERIES_TYPE) + and df_or_series.axes[axis].index_value.key != obj.index_value.key + ): + raise NotImplementedError("Aligning different indices not supported") + + _check_input_index(self.cond, axis=0) + _check_input_index(self.other) + + if isinstance(df_or_series, DATAFRAME_TYPE): + mock_obj = build_df(df_or_series) + else: + mock_obj = build_series(df_or_series) + + if isinstance(self.other, (pd.DataFrame, DATAFRAME_TYPE)): + mock_other = build_df(self.other) + elif isinstance(self.other, (pd.Series, SERIES_TYPE)): + mock_other = build_series(self.other) + else: + mock_other = self.other + + result_df = mock_obj.where( + np.zeros(mock_obj.shape).astype(bool), + other=mock_other, + axis=self.axis, + level=self.level, + errors=self.errors, + try_cast=self.try_cast, + ) + + inputs = filter_inputs([df_or_series, self.cond, self.other]) + if isinstance(df_or_series, DATAFRAME_TYPE): + return self.new_dataframe( + inputs, + shape=df_or_series.shape, + dtypes=result_df.dtypes, + index_value=df_or_series.index_value, + columns_value=df_or_series.columns_value, + ) + else: + return self.new_series( + inputs, + shape=df_or_series.shape, + name=df_or_series.name, + dtype=result_df.dtype, + index_value=df_or_series.index_value, + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + if isinstance(self._cond, ENTITY_TYPE): + self._cond = next(inputs_iter) + if isinstance(self._other, ENTITY_TYPE): + self._other = next(inputs_iter) + + @classmethod + def tile(cls, op: "DataFrameWhere"): + def rechunk_input(inp, 
axis=None): + axis = axis if axis is not None else op.axis + if isinstance(inp, DATAFRAME_TYPE): + inp = yield from recursive_tile(inp.rechunk(op.input.nsplits)) + elif isinstance(inp, SERIES_TYPE): + inp = yield from recursive_tile( + inp.rechunk({0: op.input.nsplits[axis]}) + ) + return inp + + def get_tiled_chunk(obj, index, axis=None): + if isinstance(obj, DATAFRAME_TYPE): + return obj.cix[index[0], index[1]] + elif isinstance(obj, SERIES_TYPE): + axis = axis if axis is not None else op.axis + return obj.cix[index[axis],] + else: + return obj + + # TODO support axis alignment for three objects + cond = yield from rechunk_input(op.cond, axis=0) + other = yield from rechunk_input(op.other) + + chunks = [] + for c in op.input.chunks: + cond_chunk = get_tiled_chunk(cond, c.index, axis=0) + other_chunk = get_tiled_chunk(other, c.index) + + new_op = op.copy().reset_key() + new_op._cond = cond_chunk + new_op._other = other_chunk + + inputs = filter_inputs([c, cond_chunk, other_chunk]) + chunks.append(new_op.new_chunk(inputs, **c.params)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, chunks=chunks, nsplits=op.input.nsplits, **op.input.params + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameWhere"): + out_obj = op.outputs[0] + + input_data = ctx[op.input.key] + cond = op.cond + if isinstance(cond, ENTITY_TYPE): + cond = ctx[cond.key] + + other = op.other + if isinstance(other, ENTITY_TYPE): + other = ctx[other.key] + + if op.replace_true: + ctx[out_obj.key] = input_data.mask( + cond, + other, + axis=op.axis, + level=op.level, + errors=op.errors, + try_cast=op.try_cast, + ) + else: + ctx[out_obj.key] = input_data.where( + cond, + other, + axis=op.axis, + level=op.level, + errors=op.errors, + try_cast=op.try_cast, + ) + + +_doc_template = """ +Replace values where the condition is {replace_true}. + +Parameters +---------- +cond : bool Series/DataFrame, array-like, or callable + Where `cond` is False, keep the original value. Where + True, replace with corresponding value from `other`. + If `cond` is callable, it is computed on the Series/DataFrame and + should return boolean Series/DataFrame or array. The callable must + not change input Series/DataFrame (though pandas doesn't check it). +other : scalar, Series/DataFrame, or callable + Entries where `cond` is True are replaced with + corresponding value from `other`. + If other is callable, it is computed on the Series/DataFrame and + should return scalar or Series/DataFrame. The callable must not + change input Series/DataFrame (though pandas doesn't check it). +inplace : bool, default False + Whether to perform the operation in place on the data. +axis : int, default None + Alignment axis if needed. +level : int, default None + Alignment level if needed. +errors : str, {{'raise', 'ignore'}}, default 'raise' + Note that currently this parameter won't affect + the results and will always coerce to a suitable dtype. + + - 'raise' : allow exceptions to be raised. + - 'ignore' : suppress exceptions. On error return original object. + +try_cast : bool, default False + Try to cast the result back to the input type (if possible). + +Returns +------- +Same type as caller + +See Also +-------- +:func:`DataFrame.{opposite}` : Return an object of same shape as + self. + +Notes +----- +The mask method is an application of the if-then idiom. For each +element in the calling DataFrame, if ``cond`` is ``False`` the +element is used; otherwise the corresponding element from the DataFrame +``other`` is used. 
+ +The signature for :func:`DataFrame.where` differs from +:func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to +``np.where(m, df1, df2)``. + +For further details and examples see the ``mask`` documentation in +:ref:`indexing `. + +Examples +-------- +>>> import mars.tensor as mt +>>> import mars.dataframe as md +>>> s = md.Series(range(5)) +>>> s.where(s > 0).execute() +0 NaN +1 1.0 +2 2.0 +3 3.0 +4 4.0 +dtype: float64 + +>>> s.mask(s > 0).execute() +0 0.0 +1 NaN +2 NaN +3 NaN +4 NaN +dtype: float64 + +>>> s.where(s > 1, 10).execute() +0 10 +1 10 +2 2 +3 3 +4 4 +dtype: int64 + +>>> df = md.DataFrame(mt.arange(10).reshape(-1, 2), columns=['A', 'B']) +>>> df.execute() + A B +0 0 1 +1 2 3 +2 4 5 +3 6 7 +4 8 9 +>>> m = df % 3 == 0 +>>> df.where(m, -df).execute() + A B +0 0 -1 +1 -2 3 +2 -4 -5 +3 6 -7 +4 -8 9 +>>> df.where(m, -df) == mt.where(m, df, -df).execute() + A B +0 True True +1 True True +2 True True +3 True True +4 True True +>>> df.where(m, -df) == df.mask(~m, -df).execute() + A B +0 True True +1 True True +2 True True +3 True True +4 True True +""" + + +def _where( + df_or_series, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + replace_true=False, +): + if df_or_series.ndim == 2 and getattr(other, "ndim", 2) == 1 and axis is None: + raise ValueError("Must specify axis=0 or 1") + + axis = validate_axis(axis or 0, df_or_series) + op = DataFrameWhere( + cond=cond, + other=other, + axis=axis, + level=level, + errors=errors, + try_cast=try_cast, + replace_true=replace_true, + ) + result = op(df_or_series) + if inplace: + df_or_series.data = result.data + else: + return result + + +def where( + df_or_series, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, +): + return _where( + df_or_series, + cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + errors=errors, + try_cast=try_cast, + replace_true=False, + ) + + +def mask( + df_or_series, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, +): + return _where( + df_or_series, + cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + errors=errors, + try_cast=try_cast, + replace_true=True, + ) + + +mask.__doc__ = _doc_template.format(replace_true=True, opposite="where") +where.__doc__ = _doc_template.format(replace_true=False, opposite="mask") diff --git a/python/xorbits/_mars/dataframe/initializer.py b/python/xorbits/_mars/dataframe/initializer.py new file mode 100644 index 000000000..46f4b8e90 --- /dev/null +++ b/python/xorbits/_mars/dataframe/initializer.py @@ -0,0 +1,255 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
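+
+# This module provides the pandas-like ``DataFrame``, ``Series`` and ``Index``
+# initializers. Each constructor inspects its ``data`` argument (a pandas
+# object, a tensor, another mars entity, or a dict/list containing tensors),
+# converts it through the matching ``datasource`` helper, and rebalances into
+# ``num_partitions`` chunks when requested. A minimal usage sketch (the
+# argument values are illustrative, not taken from this change):
+#
+#     import numpy as np
+#     import pandas as pd
+#
+#     df = DataFrame(pd.DataFrame(np.random.rand(10, 3)), chunk_size=4)
+#     s = Series(np.arange(10), num_partitions=2)
+#     idx = Index(pd.RangeIndex(10), chunk_size=5)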
+ +import pandas as pd +from pandas.core.dtypes.common import pandas_dtype + +from ..core import ENTITY_TYPE +from ..serialization.serializables import SerializableMeta +from ..tensor import stack +from ..tensor import tensor as astensor +from ..tensor.array_utils import is_cupy +from ..tensor.core import TENSOR_TYPE +from ..utils import ceildiv, lazy_import +from .core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE +from .core import DataFrame as _Frame +from .core import Index as _Index +from .core import Series as _Series +from .datasource.dataframe import from_pandas as from_pandas_df +from .datasource.from_tensor import ( + dataframe_from_1d_tileables, + dataframe_from_tensor, + series_from_tensor, +) +from .datasource.index import from_pandas as from_pandas_index +from .datasource.index import from_tileable as from_tileable_index +from .datasource.series import from_pandas as from_pandas_series +from .utils import is_cudf, is_index + +cudf = lazy_import("cudf") + + +class InitializerMeta(SerializableMeta): + def __instancecheck__(cls, instance): + return isinstance(instance, (cls.__base__,) + getattr(cls, "_allow_data_type_")) + + +class DataFrame(_Frame, metaclass=InitializerMeta): + def __init__( + self, + data=None, + index=None, + columns=None, + dtype=None, + copy=False, + chunk_size=None, + gpu=None, + sparse=None, + num_partitions=None, + ): + need_repart = False + if isinstance(data, TENSOR_TYPE): + if chunk_size is not None: + data = data.rechunk(chunk_size) + df = dataframe_from_tensor( + data, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + elif isinstance(data, SERIES_TYPE): + df = data.to_frame() + need_repart = num_partitions is not None + elif isinstance(data, DATAFRAME_TYPE): + if not hasattr(data, "data"): + # DataFrameData + df = _Frame(data) + else: + df = data + need_repart = num_partitions is not None + elif isinstance(data, dict) and self._can_process_by_1d_tileables(data): + # data is a dict and some value is tensor + df = dataframe_from_1d_tileables( + data, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + elif isinstance(data, list) and any(isinstance(v, ENTITY_TYPE) for v in data): + # stack data together + data = stack(data) + df = dataframe_from_tensor( + data, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + elif isinstance(index, (INDEX_TYPE, SERIES_TYPE)): + if isinstance(data, dict): + data = {k: astensor(v, chunk_size=chunk_size) for k, v in data.items()} + df = dataframe_from_1d_tileables( + data, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + else: + if data is not None: + data = astensor(data, chunk_size=chunk_size) + df = dataframe_from_tensor( + data, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + else: + if is_cudf(data) or is_cupy(data): # pragma: no cover + pdf = cudf.DataFrame(data, index=index, columns=columns, dtype=dtype) + if copy: + pdf = pdf.copy() + else: + pdf = pd.DataFrame( + data, index=index, columns=columns, dtype=dtype, copy=copy + ) + if num_partitions is not None: + chunk_size = ceildiv(len(pdf), num_partitions) + df = from_pandas_df(pdf, chunk_size=chunk_size, gpu=gpu, sparse=sparse) + + if need_repart: + df = df.rebalance(num_partitions=num_partitions) + super().__init__(df.data) + + @classmethod + def _can_process_by_1d_tileables(cls, data: dict): + for value in data.values(): + if isinstance(value, 
ENTITY_TYPE): + return True + elif isinstance(value, (list, tuple)) and any( + isinstance(v, ENTITY_TYPE) for v in value + ): + return True + return False + + +class Series(_Series, metaclass=InitializerMeta): + def __init__( + self, + data=None, + index=None, + dtype=None, + name=None, + copy=False, + chunk_size=None, + gpu=None, + sparse=None, + num_partitions=None, + ): + if dtype is not None: + dtype = pandas_dtype(dtype) + need_repart = False + if isinstance(data, (TENSOR_TYPE, INDEX_TYPE)): + if chunk_size is not None: + data = data.rechunk(chunk_size) + name = name or getattr(data, "name", None) + series = series_from_tensor( + data, index=index, name=name, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + elif isinstance(index, INDEX_TYPE): + if data is not None: + data = astensor(data, chunk_size=chunk_size) + series = series_from_tensor( + data, index=index, name=name, dtype=dtype, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + elif isinstance(data, SERIES_TYPE): + if not hasattr(data, "data"): + # SeriesData + series = _Series(data) + else: + series = data + need_repart = num_partitions is not None + else: + if is_cudf(data) or is_cupy(data): # pragma: no cover + pd_series = cudf.Series(data, index=index, dtype=dtype, name=name) + if copy: + pd_series = pd_series.copy() + else: + pd_series = pd.Series( + data, index=index, dtype=dtype, name=name, copy=copy + ) + if num_partitions is not None: + chunk_size = ceildiv(len(pd_series), num_partitions) + series = from_pandas_series( + pd_series, chunk_size=chunk_size, gpu=gpu, sparse=sparse + ) + + if need_repart: + series = series.rebalance(num_partitions=num_partitions) + super().__init__(series.data) + + +class Index(_Index, metaclass=InitializerMeta): + def __new__(cls, data, **_): + # just return cls always until we support other Index's initializers + return object.__new__(cls) + + def __init__( + self, + data=None, + dtype=None, + copy=False, + name=None, + tupleize_cols=True, + chunk_size=None, + gpu=None, + sparse=None, + names=None, + num_partitions=None, + store_data=False, + ): + need_repart = False + if isinstance(data, INDEX_TYPE): + if not hasattr(data, "data"): + # IndexData + index = _Index(data) + else: + index = data + need_repart = num_partitions is not None + else: + if isinstance(data, ENTITY_TYPE): + name = name if name is not None else getattr(data, "name", None) + index = from_tileable_index(data, dtype=dtype, name=name, names=names) + need_repart = num_partitions is not None + else: + if not is_index(data): + name = name if name is not None else getattr(data, "name", None) + xdf = cudf if is_cudf(data) or is_cupy(data) else pd + try: + pd_index = xdf.Index( + data=data, + dtype=dtype, + copy=copy, + name=name, + tupleize_cols=tupleize_cols, + ) + except TypeError: # pragma: no cover + pd_index = xdf.Index( + data=data, dtype=dtype, copy=copy, name=name + ) + else: + pd_index = data + + if num_partitions is not None: + chunk_size = ceildiv(len(pd_index), num_partitions) + index = from_pandas_index( + pd_index, + chunk_size=chunk_size, + gpu=gpu, + sparse=sparse, + store_data=store_data, + ) + + if need_repart: + index = index.rebalance(num_partitions=num_partitions) + super().__init__(index.data) diff --git a/python/xorbits/_mars/dataframe/merge/__init__.py b/python/xorbits/_mars/dataframe/merge/__init__.py new file mode 100644 index 000000000..db87d09ce --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2022-2023 
XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .concat import DataFrameConcat, concat +from .merge import join, merge, DataFrameMerge, DataFrameMergeAlign +from .append import DataFrameAppend, append # isort: skip + + +def _install(): + from ..core import DATAFRAME_TYPE, SERIES_TYPE + + for cls in DATAFRAME_TYPE: + setattr(cls, "join", join) + setattr(cls, "merge", merge) + + for cls in DATAFRAME_TYPE + SERIES_TYPE: + setattr(cls, "append", append) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/merge/append.py b/python/xorbits/_mars/dataframe/merge/append.py new file mode 100644 index 000000000..8b32231e5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/append.py @@ -0,0 +1,222 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType, recursive_tile +from ...serialization.serializables import BoolField +from ..datasource.dataframe import from_pandas +from ..indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem +from ..operands import ( + DATAFRAME_TYPE, + SERIES_TYPE, + DataFrameOperand, + DataFrameOperandMixin, +) +from ..utils import parse_index, standardize_range_index + + +class DataFrameAppend(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.APPEND + + ignore_index = BoolField("ignore_index") + verify_integrity = BoolField("verify_integrity") + sort = BoolField("sort") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + @classmethod + def _tile_dataframe(cls, op: "DataFrameAppend"): + out_df = op.outputs[0] + inputs = op.inputs + first_df, others = inputs[0], inputs[1:] + column_splits = first_df.nsplits[1] + new_others = [] + for item in others: + r = yield from recursive_tile(item.rechunk({1: column_splits})) + new_others.append(r) + others = new_others + out_chunks = [] + nsplits = [[], list(first_df.nsplits[1])] + row_index = 0 + for df in [first_df] + others: + for c in df.chunks: + index = (c.index[0] + row_index, c.index[1]) + iloc_op = DataFrameIlocGetItem(indexes=[slice(None)] * 2) + out_chunks.append( + iloc_op.new_chunk( + [c], + shape=c.shape, + index=index, + dtypes=c.dtypes, + index_value=c.index_value, + columns_value=c.columns_value, + ) + ) + nsplits[0] += df.nsplits[0] + row_index += len(df.nsplits[0]) + if op.ignore_index: + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + + nsplits = tuple(tuple(n) for n in nsplits) + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + out_df.shape, + nsplits=nsplits, + chunks=out_chunks, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + ) + + @classmethod + def _tile_series(cls, op: "DataFrameAppend"): + out_series = op.outputs[0] + inputs = op.inputs + first_series, others = inputs[0], inputs[1:] + out_chunks = [] + nsplits = () + row_index = 0 + for series in [first_series] + others: + for c in series.chunks: + index = (c.index[0] + row_index,) + iloc_op = SeriesIlocGetItem(indexes=(slice(None),)) + out_chunks.append( + iloc_op.new_chunk( + [c], + shape=c.shape, + index=index, + index_value=c.index_value, + dtype=c.dtype, + name=c.name, + ) + ) + nsplits += series.nsplits[0] + row_index += len(series.nsplits[0]) + + if op.ignore_index: + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + + nsplits = (tuple(nsplits),) + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + out_series.shape, + nsplits=nsplits, + chunks=out_chunks, + dtype=out_series.dtype, + index_value=out_series.index_value, + name=out_series.name, + ) + + @classmethod + def tile(cls, op: "DataFrameAppend"): + if op.output_types[0] == OutputType.dataframe: + return (yield from cls._tile_dataframe(op)) + else: + return (yield from cls._tile_series(op)) + + def _call_dataframe(self, df, other): + if isinstance(other, DATAFRAME_TYPE): + shape = (df.shape[0] + other.shape[0], df.shape[1]) + inputs = [df, other] + if self.ignore_index: + index_value = parse_index(pd.RangeIndex(shape[0])) + else: + index_value = parse_index( + df.index_value.to_pandas().append(other.index_value.to_pandas()) + ) + elif isinstance(other, list): + row_length = df.shape[0] + index = df.index_value.to_pandas() + for item in other: + if not isinstance(item, DATAFRAME_TYPE): 
# pragma: no cover + raise ValueError(f"Invalid type {type(item)} to append") + row_length += item.shape[0] + index = index.append(item.index_value.to_pandas()) + shape = (row_length, df.shape[1]) + if self.ignore_index: # pragma: no cover + index_value = parse_index(pd.RangeIndex(shape[0])) + else: + index_value = parse_index(index) + inputs = [df] + other + else: # pragma: no cover + raise ValueError(f"Invalid type {type(other)} to append") + return self.new_dataframe( + inputs, + shape=shape, + dtypes=df.dtypes, + index_value=index_value, + columns_value=df.columns_value, + ) + + def _call_series(self, df, other): + if isinstance(other, SERIES_TYPE): + shape = (df.shape[0] + other.shape[0],) + inputs = [df, other] + if self.ignore_index: + index_value = parse_index(pd.RangeIndex(shape[0])) + else: + index_value = parse_index( + df.index_value.to_pandas().append(other.index_value.to_pandas()) + ) + elif isinstance(other, list): + row_length = df.shape[0] + index = df.index_value.to_pandas() + for item in other: + if not isinstance(item, SERIES_TYPE): # pragma: no cover + raise ValueError(f"Invalid type {type(item)} to append") + row_length += item.shape[0] + index = index.append(item.index_value.to_pandas()) + shape = (row_length,) + if self.ignore_index: # pragma: no cover + index_value = parse_index(pd.RangeIndex(shape[0])) + else: + index_value = parse_index(index) + inputs = [df] + other + else: # pragma: no cover + raise ValueError(f"Invalid type {type(other)} to append") + return self.new_series( + inputs, shape=shape, dtype=df.dtype, index_value=index_value, name=df.name + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameAppend"): + first, others = ctx[op.inputs[0].key], [ctx[inp.key] for inp in op.inputs[1:]] + r = first.append(others, verify_integrity=op.verify_integrity, sort=op.sort) + ctx[op.outputs[0].key] = r + + def __call__(self, df, other): + if isinstance(df, DATAFRAME_TYPE): + self.output_types = [OutputType.dataframe] + return self._call_dataframe(df, other) + else: + self.output_types = [OutputType.series] + return self._call_series(df, other) + + +def append(df, other, ignore_index=False, verify_integrity=False, sort=False): + if verify_integrity or sort: # pragma: no cover + raise NotImplementedError("verify_integrity and sort are not supported now") + if isinstance(other, dict): + other = from_pandas(pd.DataFrame(dict((k, [v]) for k, v in other.items()))) + op = DataFrameAppend( + ignore_index=ignore_index, verify_integrity=verify_integrity, sort=sort + ) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/merge/concat.py b/python/xorbits/_mars/dataframe/merge/concat.py new file mode 100644 index 000000000..b3c21d66b --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/concat.py @@ -0,0 +1,617 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + ListField, + StringField, +) +from ...utils import has_unknown_shape, lazy_import +from ..operands import SERIES_TYPE, DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_empty_df, + build_empty_series, + parse_index, + standardize_range_index, + validate_axis, +) + +cudf = lazy_import("cudf") + + +class DataFrameConcat(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.CONCATENATE + + axis = AnyField("axis", default=None) + join = StringField("join", default=None) + ignore_index = BoolField("ignore_index", default=None) + keys = ListField("keys", default=None) + levels = ListField("levels", default=None) + names = ListField("names", default=None) + verify_integrity = BoolField("verify_integrity", default=None) + sort = BoolField("sort", default=None) + copy_ = BoolField("copy", default=None) + + def __init__(self, copy=None, output_types=None, **kw): + super().__init__(copy_=copy, _output_types=output_types, **kw) + + @property + def level(self): + return self.levels + + @property + def name(self): + return self.names + + @classmethod + def _tile_dataframe(cls, op): + from ..indexing.iloc import DataFrameIlocGetItem + + out_df = op.outputs[0] + inputs = op.inputs + axis = op.axis + + if not all( + inputs[i].nsplits[1 - axis] == inputs[i + 1].nsplits[1 - axis] + for i in range(len(inputs) - 1) + ): + # need rechunk + if has_unknown_shape(*inputs): + yield + normalized_nsplits = {1 - axis: inputs[0].nsplits[1 - axis]} + new_inputs = [] + for inp in inputs: + new_inputs.append( + (yield from recursive_tile(inp.rechunk(normalized_nsplits))) + ) + inputs = new_inputs + + out_chunks = [] + nsplits = [] + cum_index = 0 + for df in inputs: + for c in df.chunks: + if op.axis == 0: + index = (c.index[0] + cum_index, c.index[1]) + else: + index = (c.index[0], c.index[1] + cum_index) + + iloc_op = DataFrameIlocGetItem(indexes=[slice(None)] * 2) + out_chunks.append( + iloc_op.new_chunk( + [c], + shape=c.shape, + index=index, + dtypes=c.dtypes, + index_value=c.index_value, + columns_value=c.columns_value, + ) + ) + nsplits.extend(df.nsplits[op.axis]) + cum_index += len(df.nsplits[op.axis]) + out_nsplits = ( + (tuple(nsplits), inputs[0].nsplits[1]) + if op.axis == 0 + else (inputs[0].nsplits[0], tuple(nsplits)) + ) + + if op.ignore_index: + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + out_df.shape, + nsplits=out_nsplits, + chunks=out_chunks, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + ) + + @classmethod + def _tile_series(cls, op: "DataFrameConcat"): + from ..datasource.from_tensor import DataFrameFromTensor + from ..indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem + + out = op.outputs[0] + inputs = op.inputs + out_chunks = [] + + if op.axis == 1: + if has_unknown_shape(*inputs): + yield + new_inputs = [] + for inp in inputs: + new_inputs.append( + (yield from recursive_tile(inp.rechunk(op.inputs[0].nsplits))) + ) + inputs = new_inputs + + cum_index = 0 + offset = 0 + nsplits = [] + for series in inputs: + for c in series.chunks: + if op.axis == 0: + index = (c.index[0] + cum_index,) + shape = c.shape + iloc_op = SeriesIlocGetItem(indexes=(slice(None),)) + out_chunks.append( + iloc_op.new_chunk( + [c], + shape=shape, + index=index, + 
index_value=c.index_value, + dtype=c.dtype, + name=c.name, + ) + ) + else: + index = (c.index[0], cum_index) + shape = (c.shape[0], 1) + to_frame_op = DataFrameFromTensor( + input=c, + index=None, + columns=None, + ) + if c.name: + dtypes = pd.Series([c.dtype], index=[c.name]) + else: + dtypes = pd.Series( + [c.dtype], index=pd.RangeIndex(offset, offset + 1) + ) + df_chunk = to_frame_op.new_chunk( + [c], + shape=shape, + index=index, + index_value=c.index_value, + columns_value=parse_index(dtypes.index, store_data=True), + dtypes=dtypes, + ) + iloc_op = DataFrameIlocGetItem(indexes=[slice(None)] * 2) + out_chunks.append( + iloc_op.new_chunk( + [df_chunk], + shape=df_chunk.shape, + index=index, + dtypes=df_chunk.dtypes, + index_value=df_chunk.index_value, + columns_value=df_chunk.columns_value, + ) + ) + + if op.axis == 0: + nsplits.extend(series.nsplits[0]) + cum_index += len(series.nsplits[op.axis]) + else: + nsplits.append(1) + cum_index += 1 + offset += 1 + + if op.ignore_index: + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + + new_op = op.copy() + if op.axis == 0: + nsplits = (tuple(nsplits),) + return new_op.new_seriess( + op.inputs, + out.shape, + nsplits=nsplits, + chunks=out_chunks, + dtype=out.dtype, + index_value=out.index_value, + name=out.name, + ) + else: + nsplits = (inputs[0].nsplits[0], tuple(nsplits)) + return new_op.new_dataframes( + op.inputs, + out.shape, + nsplits=nsplits, + chunks=out_chunks, + dtypes=out.dtypes, + index_value=out.index_value, + columns_value=out.columns_value, + ) + + @classmethod + def tile(cls, op: "DataFrameConcat"): + if isinstance(op.inputs[0], SERIES_TYPE): + return (yield from cls._tile_series(op)) + else: + return (yield from cls._tile_dataframe(op)) + + @classmethod + def execute(cls, ctx, op: "DataFrameConcat"): + def _base_concat(chunk, inputs): + # auto generated concat when executing a DataFrame, Series or Index + if chunk.op.output_types[0] == OutputType.dataframe: + return _auto_concat_dataframe_chunks(chunk, inputs) + elif chunk.op.output_types[0] == OutputType.series: + return _auto_concat_series_chunks(chunk, inputs) + elif chunk.op.output_types[0] == OutputType.index: + return _auto_concat_index_chunks(chunk, inputs) + elif chunk.op.output_types[0] == OutputType.categorical: + return _auto_concat_categorical_chunks(chunk, inputs) + else: # pragma: no cover + raise TypeError( + "Only DataFrameChunk, SeriesChunk, IndexChunk, " + "and CategoricalChunk can be automatically concatenated" + ) + + def _auto_concat_dataframe_chunks(chunk, inputs): + xdf = ( + pd + if isinstance(inputs[0], (pd.DataFrame, pd.Series)) or cudf is None + else cudf + ) + + if chunk.op.axis is not None: + return xdf.concat(inputs, axis=op.axis) + + # auto generated concat when executing a DataFrame + if len(inputs) == 1: + ret = inputs[0] + else: + n_rows = len(set(inp.index[0] for inp in chunk.inputs)) + n_cols = int(len(inputs) // n_rows) + assert n_rows * n_cols == len(inputs) + + concats = [] + for i in range(n_rows): + if n_cols == 1: + concats.append(inputs[i]) + else: + concat = xdf.concat( + [inputs[i * n_cols + j] for j in range(n_cols)], axis=1 + ) + concats.append(concat) + + if xdf is pd: + # The `sort=False` is to suppress a `FutureWarning` of pandas, + # when the index or column of chunks to concatenate is not aligned, + # which may happens for certain ops. + # + # See also Note [Columns of Left Join] in test_merge_execution.py. 
+ ret = xdf.concat(concats, sort=False) + else: + ret = xdf.concat(concats) + # cuDF will lost index name when concat two seriess. + ret.index.name = concats[0].index.name + + return ret + + def _auto_concat_series_chunks(chunk, inputs): + # auto generated concat when executing a Series + if len(inputs) == 1: + concat = inputs[0] + else: + xdf = pd if isinstance(inputs[0], pd.Series) or cudf is None else cudf + if chunk.op.axis is not None: + concat = xdf.concat(inputs, axis=chunk.op.axis) + else: + concat = xdf.concat(inputs) + return concat + + def _auto_concat_index_chunks(chunk, inputs): + if len(inputs) == 1: + xdf = pd if isinstance(inputs[0], pd.Index) or cudf is None else cudf + concat_df = xdf.DataFrame(index=inputs[0]) + else: + xdf = pd if isinstance(inputs[0], pd.Index) or cudf is None else cudf + empty_dfs = [xdf.DataFrame(index=inp) for inp in inputs] + concat_df = xdf.concat(empty_dfs, axis=0) + return concat_df.index + + def _auto_concat_categorical_chunks(_, inputs): + if len(inputs) == 1: # pragma: no cover + return inputs[0] + else: + # convert categorical into array + arrays = [np.asarray(inp) for inp in inputs] + array = np.concatenate(arrays) + return pd.Categorical( + array, categories=inputs[0].categories, ordered=inputs[0].ordered + ) + + chunk = op.outputs[0] + inputs = [ctx[input.key] for input in op.inputs] + + if isinstance(inputs[0], tuple): + ctx[chunk.key] = tuple( + _base_concat(chunk, [input[i] for input in inputs]) + for i in range(len(inputs[0])) + ) + else: + ctx[chunk.key] = _base_concat(chunk, inputs) + + @classmethod + def _concat_index(cls, prev_index: pd.Index, cur_index: pd.Index): + if isinstance(prev_index, pd.RangeIndex) and isinstance( + cur_index, pd.RangeIndex + ): + # handle RangeIndex that append may generate huge amount of data + # e.g. 
pd.RangeIndex(10_000) and pd.RangeIndex(10_000) + # will generate a Int64Index full of data + # for details see GH#1647 + prev_stop = prev_index.start + prev_index.size * prev_index.step + cur_start = cur_index.start + if prev_stop == cur_start and prev_index.step == cur_index.step: + # continuous RangeIndex, still return RangeIndex + return prev_index.append(cur_index) + else: + # otherwise, return an empty index + return pd.Index([], dtype=prev_index.dtype) + elif isinstance(prev_index, pd.RangeIndex): + return pd.Index([], prev_index.dtype).append(cur_index) + elif isinstance(cur_index, pd.RangeIndex): + return prev_index.append(pd.Index([], cur_index.dtype)) + return prev_index.append(cur_index) + + def _call_series(self, objs): + if self.axis == 0: + row_length = 0 + index = None + for series in objs: + if index is None: + index = series.index_value.to_pandas() + else: + index = self._concat_index(index, series.index_value.to_pandas()) + row_length += series.shape[0] + if self.ignore_index: # pragma: no cover + index_value = parse_index(pd.RangeIndex(row_length)) + else: + index_value = parse_index(index, objs) + return self.new_series( + objs, + shape=(row_length,), + dtype=objs[0].dtype, + index_value=index_value, + name=objs[0].name, + ) + else: + col_length = 0 + columns = [] + dtypes = dict() + undefined_name = 0 + for series in objs: + if series.name is None: + dtypes[undefined_name] = series.dtype + undefined_name += 1 + columns.append(undefined_name) + else: + dtypes[series.name] = series.dtype + columns.append(series.name) + col_length += 1 + if self.ignore_index or undefined_name == len(objs): + columns_value = parse_index(pd.RangeIndex(col_length)) + else: + columns_value = parse_index(pd.Index(columns), store_data=True) + + shape = (objs[0].shape[0], col_length) + return self.new_dataframe( + objs, + shape=shape, + dtypes=pd.Series(dtypes), + index_value=objs[0].index_value, + columns_value=columns_value, + ) + + def _call_dataframes(self, objs): + if self.axis == 0: + row_length = 0 + index = None + empty_dfs = [] + for df in objs: + if index is None: + index = df.index_value.to_pandas() + else: + index = self._concat_index(index, df.index_value.to_pandas()) + row_length += df.shape[0] + if df.ndim == 2: + empty_dfs.append(build_empty_df(df.dtypes)) + else: + empty_dfs.append(build_empty_series(df.dtype, name=df.name)) + + emtpy_result = pd.concat(empty_dfs, join=self.join, sort=self.sort) + shape = (row_length, emtpy_result.shape[1]) + columns_value = parse_index(emtpy_result.columns, store_data=True) + + if self.join == "inner": + objs = [o[list(emtpy_result.columns)] for o in objs] + + if self.ignore_index: # pragma: no cover + index_value = parse_index(pd.RangeIndex(row_length)) + else: + index_value = parse_index(index, objs) + + new_objs = [] + for obj in objs: + if obj.ndim != 2: + # series + new_obj = obj.to_frame().reindex(columns=emtpy_result.dtypes.index) + else: + # dataframe + if list(obj.dtypes.index) != list(emtpy_result.dtypes.index): + new_obj = obj.reindex(columns=emtpy_result.dtypes.index) + else: + new_obj = obj + new_objs.append(new_obj) + + return self.new_dataframe( + new_objs, + shape=shape, + dtypes=emtpy_result.dtypes, + index_value=index_value, + columns_value=columns_value, + ) + else: + col_length = 0 + empty_dfs = [] + for df in objs: + if df.ndim == 2: + # DataFrame + col_length += df.shape[1] + empty_dfs.append(build_empty_df(df.dtypes)) + else: + # Series + col_length += 1 + empty_dfs.append(build_empty_series(df.dtype, name=df.name)) + + 
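+            # Note: the empty frames/series built just above carry only column
+            # names and dtypes; concatenating them is a cheap way to infer the
+            # columns and dtypes of the merged result without moving any data.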
emtpy_result = pd.concat(empty_dfs, join=self.join, axis=1, sort=True) + if self.ignore_index: + columns_value = parse_index(pd.RangeIndex(col_length)) + else: + columns_value = parse_index( + pd.Index(emtpy_result.columns), store_data=True + ) + + if self.ignore_index or len({o.index_value.key for o in objs}) == 1: + new_objs = [obj if obj.ndim == 2 else obj.to_frame() for obj in objs] + else: # pragma: no cover + raise NotImplementedError( + "Does not support concat dataframes which has different index" + ) + + shape = (objs[0].shape[0], col_length) + return self.new_dataframe( + new_objs, + shape=shape, + dtypes=emtpy_result.dtypes, + index_value=objs[0].index_value, + columns_value=columns_value, + ) + + def __call__(self, objs): + if all(isinstance(obj, SERIES_TYPE) for obj in objs): + self.output_types = [OutputType.series] + return self._call_series(objs) + else: + self.output_types = [OutputType.dataframe] + return self._call_dataframes(objs) + + +class GroupByConcat(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY_CONCAT + + _groups = ListField("groups", FieldTypes.key) + _groupby_params = AnyField("groupby_params") + + def __init__(self, groups=None, groupby_params=None, output_types=None, **kw): + super().__init__( + _groups=groups, + _groupby_params=groupby_params, + _output_types=output_types, + **kw + ) + + @property + def groups(self): + return self._groups + + @property + def groupby_params(self): + return self._groupby_params + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + + new_groups = [] + for _ in self._groups: + new_groups.append(next(inputs_iter)) + self._groups = new_groups + + if isinstance(self._groupby_params["by"], list): + by = [] + for v in self._groupby_params["by"]: + if isinstance(v, ENTITY_TYPE): + by.append(next(inputs_iter)) + else: + by.append(v) + self._groupby_params["by"] = by + + @classmethod + def execute(cls, ctx, op): + input_data = [ctx[input.key] for input in op.groups] + obj = pd.concat([d.obj for d in input_data]) + + params = op.groupby_params.copy() + if isinstance(params["by"], list): + by = [] + for v in params["by"]: + if isinstance(v, ENTITY_TYPE): + by.append(ctx[v.key]) + else: + by.append(v) + params["by"] = by + selection = params.pop("selection", None) + + result = obj.groupby(**params) + if selection: + result = result[selection] + + ctx[op.outputs[0].key] = result + + +def concat( + objs, + axis=0, + join="outer", + ignore_index=False, + keys=None, + levels=None, + names=None, + verify_integrity=False, + sort=False, + copy=True, +): + if not isinstance(objs, (list, tuple)): # pragma: no cover + raise TypeError( + "first argument must be an iterable of dataframe or series objects" + ) + axis = validate_axis(axis) + if isinstance(objs, dict): # pragma: no cover + keys = objs.keys() + objs = objs.values() + if axis == 1 and join == "inner": # pragma: no cover + raise NotImplementedError("inner join is not support when specify `axis=1`") + if verify_integrity or sort or keys: # pragma: no cover + raise NotImplementedError( + "verify_integrity, sort, keys arguments are not supported now" + ) + op = DataFrameConcat( + axis=axis, + join=join, + ignore_index=ignore_index, + keys=keys, + levels=levels, + names=names, + verify_integrity=verify_integrity, + sort=sort, + copy=copy, + ) + + return op(objs) diff --git a/python/xorbits/_mars/dataframe/merge/merge.py b/python/xorbits/_mars/dataframe/merge/merge.py new file mode 100644 index 000000000..fc86bf353 --- 
/dev/null +++ b/python/xorbits/_mars/dataframe/merge/merge.py @@ -0,0 +1,1342 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +from collections import namedtuple +from enum import Enum +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType, TileStatus, recursive_tile +from ...core.context import get_context +from ...core.operand import MapReduceOperand, OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + KeyField, + NamedTupleField, + StringField, + TupleField, +) +from ...typing import TileableType +from ...utils import has_unknown_shape, lazy_import +from ..base.bloom_filter import filter_by_bloom_filter +from ..core import DataFrame, DataFrameChunk, Series +from ..operands import DataFrameOperand, DataFrameOperandMixin, DataFrameShuffleProxy +from ..utils import ( + auto_merge_chunks, + build_concatenated_rows_frame, + build_df, + hash_dataframe_on, + infer_index_value, + is_cudf, + parse_index, +) + +logger = logging.getLogger(__name__) +DEFAULT_BLOOM_FILTER_CHUNK_THRESHOLD = 10 +# use bloom filter to filter large DataFrame +BLOOM_FILTER_OPTIONS = [ + "max_elements", + "error_rate", + "apply_chunk_size_threshold", + "filter", + "combine_size", +] +BLOOM_FILTER_ON_OPTIONS = ["large", "small", "both"] +DEFAULT_BLOOM_FILTER_ON = "large" + +cudf = lazy_import("cudf") + + +class DataFrameMergeAlign(MapReduceOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_SHUFFLE_MERGE_ALIGN + + index_shuffle_size = Int32Field("index_shuffle_size") + shuffle_on = AnyField("shuffle_on") + + input = KeyField("input") + # for mapper + mapper_id = Int32Field("mapper_id", default=0) + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + if output_types is None: + if self.stage == OperandStage.map: + output_types = [OutputType.dataframe] + elif self.stage == OperandStage.reduce: + output_types = [OutputType.dataframe] * 2 + self._output_types = output_types + + @property + def output_limit(self) -> int: + return len(self.output_types) + + @classmethod + def execute_map(cls, ctx, op): + chunk = op.outputs[0] + df = ctx[op.inputs[0].key] + shuffle_on = op.shuffle_on + + if shuffle_on is not None: + # shuffle on field may be resident in index + to_reset_index_names = [] + if not isinstance(shuffle_on, (list, tuple)): + if shuffle_on not in df.dtypes: + to_reset_index_names.append(shuffle_on) + else: + for son in shuffle_on: + if son not in df.dtypes: + to_reset_index_names.append(shuffle_on) + if len(to_reset_index_names) > 0: + df = df.reset_index(to_reset_index_names) + + filters = hash_dataframe_on(df, shuffle_on, op.index_shuffle_size) + + # shuffle on index + for index_idx, index_filter in enumerate(filters): + reducer_index = (index_idx, chunk.index[1]) + if index_filter is not None and 
index_filter is not list(): + ctx[chunk.key, reducer_index] = ( + op.mapper_id, + ctx.get_current_chunk().index, + df.iloc[index_filter], + ) + else: + ctx[chunk.key, reducer_index] = ( + op.mapper_id, + ctx.get_current_chunk().index, + None, + ) + + @classmethod + def execute_reduce(cls, ctx, op: "DataFrameMergeAlign"): + for i, chunk in enumerate(op.outputs): + input_idx_to_df = { + partition_index: data + for mapper_id, partition_index, data in op.iter_mapper_data( + ctx, skip_none=True + ) + if mapper_id == i + } + row_idxes = sorted({idx[0] for idx in input_idx_to_df}) + res = [] + for row_idx in row_idxes: + row_df = input_idx_to_df.get((row_idx, 0), None) + if row_df is not None: + res.append(row_df) + xdf = cudf if is_cudf(res[0]) else pd + ctx[chunk.key] = xdf.concat(res, axis=0) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls.execute_map(ctx, op) + else: + cls.execute_reduce(ctx, op) + + +MergeSplitInfo = namedtuple("MergeSplitInfo", "split_side, split_index, nsplits") + + +class MergeMethod(Enum): + one_chunk = 0 + broadcast = 1 + shuffle = 2 + + +class DataFrameMerge(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_MERGE + + how = StringField("how") + on = AnyField("on") + left_on = AnyField("left_on") + right_on = AnyField("right_on") + left_index = BoolField("left_index") + right_index = BoolField("right_index") + sort = BoolField("sort") + suffixes = TupleField("suffixes") + copy_ = BoolField("copy_") + indicator = BoolField("indicator") + validate = AnyField("validate") + method = StringField("method") + auto_merge = StringField("auto_merge") + auto_merge_threshold = Int32Field("auto_merge_threshold") + bloom_filter = AnyField("bloom_filter") + bloom_filter_options = DictField("bloom_filter_options") + + # only for broadcast merge + split_info = NamedTupleField("split_info") + + def __init__(self, copy=None, **kwargs): + super().__init__(copy_=copy, **kwargs) + + def __call__(self, left, right): + empty_left, empty_right = build_df(left), build_df(right) + + # validate arguments. + merged = empty_left.merge( + empty_right, + how=self.how, + on=self.on, + left_on=self.left_on, + right_on=self.right_on, + left_index=self.left_index, + right_index=self.right_index, + sort=self.sort, + suffixes=self.suffixes, + copy=self.copy_, + indicator=self.indicator, + validate=self.validate, + ) + + # update default values. + if self.on is None and self.left_on is None and self.right_on is None: + if not self.left_index or not self.right_index: + # use the common columns + left_cols = empty_left.columns + right_cols = empty_right.columns + common_cols = left_cols.intersection(right_cols) + self.left_on = self.right_on = list(common_cols) + + # the `index_value` doesn't matter. 
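+        # These objects only take part in tokenizing the output index key, so
+        # that merges with the same inputs and the same join parameters map to
+        # the same deterministic key.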
+ index_tokenize_objects = [ + left, + right, + self.how, + self.left_on, + self.right_on, + self.left_index, + self.right_index, + ] + return self.new_dataframe( + [left, right], + shape=(np.nan, merged.shape[1]), + dtypes=merged.dtypes, + index_value=parse_index(merged.index, *index_tokenize_objects), + columns_value=parse_index(merged.columns, store_data=True), + ) + + @classmethod + def _gen_map_chunk( + cls, + chunk: DataFrameChunk, + shuffle_on: Union[List, str], + out_size: int, + mapper_id: int = 0, + ): + map_op = DataFrameMergeAlign( + stage=OperandStage.map, + shuffle_on=shuffle_on, + sparse=chunk.issparse(), + mapper_id=mapper_id, + index_shuffle_size=out_size, + ) + return map_op.new_chunk( + [chunk], + shape=(np.nan, np.nan), + dtypes=chunk.dtypes, + index=chunk.index, + index_value=chunk.index_value, + columns_value=chunk.columns_value, + ) + + @classmethod + def _gen_shuffle_chunks( + cls, + out_shape: Tuple, + shuffle_on: Union[List, str], + df: Union[DataFrame, Series], + ): + # gen map chunks + map_chunks = [ + cls._gen_map_chunk(chunk, shuffle_on, out_shape[0]) for chunk in df.chunks + ] + + proxy_chunk = DataFrameShuffleProxy( + output_types=[OutputType.dataframe] + ).new_chunk( + map_chunks, + shape=(), + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + # gen reduce chunks + reduce_chunks = [] + out_indices = list(itertools.product(*(range(s) for s in out_shape))) + for out_idx in out_indices: + reduce_op = DataFrameMergeAlign( + stage=OperandStage.reduce, + n_reducers=len(out_indices), + sparse=proxy_chunk.issparse(), + output_types=[OutputType.dataframe], + ) + reduce_chunks.append( + reduce_op.new_chunk( + [proxy_chunk], + shape=(np.nan, np.nan), + dtypes=proxy_chunk.dtypes, + index=out_idx, + index_value=proxy_chunk.index_value, + columns_value=proxy_chunk.columns_value, + ) + ) + return reduce_chunks + + @classmethod + def _gen_both_shuffle_chunks( + cls, + out_shape: Tuple, + left_shuffle_on: Union[List, str], + right_shuffle_on: Union[List, str], + left: Union[DataFrame, Series], + right: Union[DataFrame, Series], + ): + # gen map chunks + # for left dataframe, use 0 as mapper_id + left_map_chunks = [ + cls._gen_map_chunk(chunk, left_shuffle_on, out_shape[0], mapper_id=0) + for chunk in left.chunks + ] + # for right dataframe, use 1 as mapper_id + right_map_chunks = [ + cls._gen_map_chunk(chunk, right_shuffle_on, out_shape[0], mapper_id=1) + for chunk in right.chunks + ] + map_chunks = left_map_chunks + right_map_chunks + + proxy_chunk = DataFrameShuffleProxy( + output_types=[OutputType.dataframe] + ).new_chunk( + map_chunks, + shape=(), + dtypes=left.dtypes, + index_value=left.index_value, + columns_value=left.columns_value, + ) + + # gen reduce chunks + left_reduce_chunks = [] + right_reduce_chunks = [] + out_indices = list(itertools.product(*(range(s) for s in out_shape))) + for out_idx in out_indices: + reduce_op = DataFrameMergeAlign( + stage=OperandStage.reduce, + sparse=proxy_chunk.issparse(), + n_reducers=len(out_indices), + ) + left_param = { + "shape": (np.nan, np.nan), + "dtypes": left.dtypes, + "index": out_idx, + "index_value": left.index_value, + "columns_value": left.columns_value, + } + right_param = { + "shape": (np.nan, np.nan), + "dtypes": right.dtypes, + "index": out_idx, + "index_value": right.index_value, + "columns_value": right.columns_value, + } + params = [left_param, right_param] + left_reduce, right_reduce = reduce_op.new_chunks([proxy_chunk], kws=params) + 
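+            # Each reducer index yields one aligned left chunk and one aligned
+            # right chunk; the two lists are zipped pairwise later so that
+            # matching partitions are merged chunk by chunk.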
left_reduce_chunks.append(left_reduce) + right_reduce_chunks.append(right_reduce) + return left_reduce_chunks, right_reduce_chunks + + @classmethod + def _apply_bloom_filter( + cls, + left: TileableType, + right: TileableType, + left_on: Union[List, str], + right_on: Union[List, str], + op: "DataFrameMerge", + ): + bloom_filter_params = dict() + bloom_filter_options = op.bloom_filter_options or dict() + for option in ["max_elements", "error_rate", "combine_size"]: + if option in bloom_filter_options: + bloom_filter_params[option] = bloom_filter_options[option] + if "max_elements" not in bloom_filter_params: + bloom_filter_params["max_elements"] = max( + c.shape[0] for c in left.chunks + right.chunks + ) + filter_on = bloom_filter_options.get("filter", DEFAULT_BLOOM_FILTER_ON) + if filter_on == "large": + if len(left.chunks) > len(right.chunks): + left = filter_by_bloom_filter( + left, right, left_on, right_on, **bloom_filter_params + ) + else: + right = filter_by_bloom_filter( + right, left, right_on, left_on, **bloom_filter_params + ) + elif filter_on == "small": + if len(left.chunks) < len(right.chunks): + left = filter_by_bloom_filter( + left, right, left_on, right_on, **bloom_filter_params + ) + else: + right = filter_by_bloom_filter( + right, left, right_on, left_on, **bloom_filter_params + ) + else: + assert filter_on == "both" + # both + left = filter_by_bloom_filter( + left, right, left_on, right_on, **bloom_filter_params + ) + right = filter_by_bloom_filter( + right, left, right_on, left_on, **bloom_filter_params + ) + return left, right + + @classmethod + def _tile_one_chunk( + cls, + op: "DataFrameMerge", + left: Union[DataFrame, Series], + right: Union[DataFrame, Series], + ): + df = op.outputs[0] + if len(left.chunks) == 1 and len(right.chunks) == 1: + merge_op = op.copy().reset_key() + out_chunk = merge_op.new_chunk( + [left.chunks[0], right.chunks[0]], + shape=df.shape, + index=left.chunks[0].index, + index_value=df.index_value, + dtypes=df.dtypes, + columns_value=df.columns_value, + ) + out_chunks = [out_chunk] + nsplits = ((np.nan,), (df.shape[1],)) + elif len(left.chunks) == 1: + out_chunks = [] + left_chunk = left.chunks[0] + left_chunk.is_broadcaster = True + for c in right.chunks: + merge_op = op.copy().reset_key() + out_chunk = merge_op.new_chunk( + [left_chunk, c], + shape=(np.nan, df.shape[1]), + index=c.index, + index_value=infer_index_value( + left_chunk.index_value, c.index_value + ), + dtypes=df.dtypes, + columns_value=df.columns_value, + ) + out_chunks.append(out_chunk) + nsplits = ((np.nan,) * len(right.chunks), (df.shape[1],)) + else: + out_chunks = [] + right_chunk = right.chunks[0] + # set `is_broadcaster` as True + right_chunk.is_broadcaster = True + for c in left.chunks: + merge_op = op.copy().reset_key() + out_chunk = merge_op.new_chunk( + [c, right_chunk], + shape=(np.nan, df.shape[1]), + index=c.index, + index_value=infer_index_value( + right_chunk.index_value, c.index_value + ), + dtypes=df.dtypes, + columns_value=df.columns_value, + ) + out_chunks.append(out_chunk) + nsplits = ((np.nan,) * len(left.chunks), (df.shape[1],)) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=nsplits, + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_shuffle( + cls, + op: "DataFrameMerge", + left: Union[DataFrame, Series], + right: Union[DataFrame, Series], + ): + df = op.outputs[0] + left_row_chunk_size = left.chunk_shape[0] + 
right_row_chunk_size = right.chunk_shape[0] + out_row_chunk_size = max(left_row_chunk_size, right_row_chunk_size) + + out_chunk_shape = (out_row_chunk_size, 1) + nsplits = [[np.nan for _ in range(out_row_chunk_size)], [df.shape[1]]] + + left_on = _prepare_shuffle_on(op.left_index, op.left_on, op.on) + right_on = _prepare_shuffle_on(op.right_index, op.right_on, op.on) + + # do shuffle + left_chunks, right_chunks = cls._gen_both_shuffle_chunks( + out_chunk_shape, left_on, right_on, left, right + ) + + out_chunks = [] + for left_chunk, right_chunk in zip(left_chunks, right_chunks): + merge_op = op.copy().reset_key() + out_chunk = merge_op.new_chunk( + [left_chunk, right_chunk], + shape=(np.nan, df.shape[1]), + index=left_chunk.index, + index_value=infer_index_value( + left_chunk.index_value, right_chunk.index_value + ), + dtypes=df.dtypes, + columns_value=df.columns_value, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_broadcast( + cls, + op: "DataFrameMerge", + left: Union[DataFrame, Series], + right: Union[DataFrame, Series], + ): + from .concat import DataFrameConcat + + out_df = op.outputs[0] + out_chunks = [] + if left.chunk_shape[0] < right.chunk_shape[0]: + # broadcast left + if op.how == "inner": + left_chunks = left.chunks + need_split = False + else: + left_on = _prepare_shuffle_on(op.left_index, op.left_on, op.on) + left_chunks = cls._gen_shuffle_chunks(left.chunk_shape, left_on, left) + need_split = True + # set is_broadcast property + for c in left_chunks: + c.is_broadcaster = True + right_chunks = right.chunks + for right_chunk in right_chunks: + merged_chunks = [] + # concat all merged results + for j, left_chunk in enumerate(left_chunks): + merge_op = op.copy().reset_key() + if need_split: + merge_op.split_info = MergeSplitInfo( + "right", j, len(left_chunks) + ) + merged_chunks.append( + merge_op.new_chunk( + [left_chunk, right_chunk], + index=(j, 0), + shape=(np.nan, out_df.shape[1]), + columns_value=out_df.columns_value, + ) + ) + concat_op = DataFrameConcat(output_types=[OutputType.dataframe]) + out_chunks.append( + concat_op.new_chunk( + merged_chunks, + shape=(np.nan, out_df.shape[1]), + dtypes=out_df.dtypes, + index=right_chunk.index, + index_value=infer_index_value( + left_chunks[0].index_value, right_chunk.index_value + ), + columns_value=out_df.columns_value, + ) + ) + nsplits = ((np.nan,) * len(right.chunks), (out_df.shape[1],)) + else: + # broadcast right + if op.how == "inner": + need_split = False + right_chunks = right.chunks + else: + need_split = True + right_on = _prepare_shuffle_on(op.right_index, op.right_on, op.on) + right_chunks = cls._gen_shuffle_chunks( + right.chunk_shape, right_on, right + ) + # set is_broadcast property + for c in right_chunks: + c.is_broadcaster = True + left_chunks = left.chunks + for left_chunk in left_chunks: + merged_chunks = [] + # concat all merged results + for j, right_chunk in enumerate(right_chunks): + merge_op = op.copy().reset_key() + if need_split: + merge_op.split_info = MergeSplitInfo( + "left", j, len(right_chunks) + ) + merged_chunks.append( + merge_op.new_chunk( + [left_chunk, right_chunk], + shape=(np.nan, out_df.shape[1]), + index=(j, 0), + columns_value=out_df.columns_value, + ) + ) + concat_op = DataFrameConcat(output_types=[OutputType.dataframe]) + 
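+                # Concatenate the partial results of merging this left chunk
+                # against every broadcast right chunk into a single output chunk.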
out_chunks.append( + concat_op.new_chunk( + merged_chunks, + shape=(np.nan, out_df.shape[1]), + dtypes=out_df.dtypes, + index=left_chunk.index, + index_value=infer_index_value( + left_chunk.index_value, right_chunks[0].index_value + ), + columns_value=out_df.columns_value, + ) + ) + nsplits = ((np.nan,) * len(left.chunks), (out_df.shape[1],)) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + out_df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + ) + + @classmethod + def _can_merge_with_one_chunk( + cls, left: TileableType, right: TileableType, how: str + ) -> bool: + return (len(left.chunks) == 1 and how in ["right", "inner"]) or ( + len(right.chunks) == 1 and how in ["left", "inner"] + ) + + @classmethod + def _can_merge_with_broadcast( + cls, big_chunk_size: int, small_chunk_size: int, big_side: str, how: str + ) -> bool: + return how in [big_side, "inner"] and np.log2(big_chunk_size) > small_chunk_size + + @classmethod + def _get_auto_merge_options(cls, auto_merge: str) -> Tuple[bool, bool]: + if auto_merge == "both": + return True, True + elif auto_merge == "none": + return False, False + elif auto_merge == "before": + return True, False + else: + assert auto_merge == "after" + return False, True + + @classmethod + def _choose_merge_method( + cls, op: "DataFrameMerge", left: TileableType, right: TileableType + ): + how = op.how + method = op.method + left_row_chunk_size = left.chunk_shape[0] + right_row_chunk_size = right.chunk_shape[0] + if left_row_chunk_size > right_row_chunk_size: + big_side = "left" + big_chunk_size = left_row_chunk_size + small_chunk_size = right_row_chunk_size + else: + big_side = "right" + big_chunk_size = right_row_chunk_size + small_chunk_size = left_row_chunk_size + if method == "auto": + if cls._can_merge_with_one_chunk(left, right, how): + return MergeMethod.one_chunk + elif cls._can_merge_with_broadcast( + big_chunk_size, small_chunk_size, big_side, how + ): + return MergeMethod.broadcast + else: + return MergeMethod.shuffle + elif method == "broadcast": + if cls._can_merge_with_one_chunk(left, right, how): + return MergeMethod.one_chunk + elif how in [big_side, "inner"]: + return MergeMethod.broadcast + else: # pragma: no cover + raise ValueError("Cannot specify merge method `broadcast`") + else: + assert method == "shuffle" + return MergeMethod.shuffle + + @classmethod + def _if_apply_bloom_filter( + cls, + method: MergeMethod, + op: "DataFrameMerge", + left: TileableType, + right: TileableType, + ): + # bloom filter can only work for inner merge + if op.how != "inner" or op.bloom_filter is False: + return False + elif op.bloom_filter is True: + return True + + bloom_filter_options = op.bloom_filter_options or dict() + bloom_filter_chunk_threshold = bloom_filter_options.get( + "apply_chunk_size_threshold", DEFAULT_BLOOM_FILTER_CHUNK_THRESHOLD + ) + + # TODO(hks): disable bloom_filter for now, when it is ready, turn it on them + # bloom_filter == auto + if len(left.chunks + right.chunks) <= bloom_filter_chunk_threshold: + # if size of input chunks <= threshold, skip bloom filter + return False + elif method == MergeMethod.shuffle: + # for shuffle, enable bloom filter by default + return False + + return False + + @classmethod + def tile(cls, op: "DataFrameMerge"): + left = build_concatenated_rows_frame(op.inputs[0]) + right = build_concatenated_rows_frame(op.inputs[1]) + + ctx = get_context() + 
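+        # Rough tiling flow: optionally auto-merge small input chunks first,
+        # optionally pre-filter one or both sides with a bloom filter for inner
+        # joins, pick a merge method (one_chunk / broadcast / shuffle), and
+        # finally auto-merge the output chunks of inner joins when there are
+        # too many of them.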
auto_merge_threshold = op.auto_merge_threshold + auto_merge_before, auto_merge_after = cls._get_auto_merge_options(op.auto_merge) + + if ( + auto_merge_before + and len(left.chunks) + len(right.chunks) > auto_merge_threshold + ): + yield TileStatus([left, right] + left.chunks + right.chunks, progress=0.2) + left_chunk_size = len(left.chunks) + right_chunk_size = len(right.chunks) + left = auto_merge_chunks(ctx, left) + right = auto_merge_chunks(ctx, right) + logger.info( + "Auto merge before %s, left data shape: %s, chunk count: %s -> %s, " + "right data shape: %s, chunk count: %s -> %s.", + op, + left.shape, + left_chunk_size, + len(left.chunks), + right.shape, + right_chunk_size, + len(right.chunks), + ) + else: + logger.info( + "Skip auto merge before %s, left data shape: %s, chunk count: %d, " + "right data shape: %s, chunk count: %d.", + op, + left.shape, + len(left.chunks), + right.shape, + len(right.chunks), + ) + + method = cls._choose_merge_method(op, left, right) + if cls._if_apply_bloom_filter(method, op, left, right): + if has_unknown_shape(left, right): # pragma: no cover + yield TileStatus(left.chunks + right.chunks, progress=0.3) + left_on = _prepare_shuffle_on(op.left_index, op.left_on, op.on) + right_on = _prepare_shuffle_on(op.right_index, op.right_on, op.on) + small_one = right if len(left.chunks) > len(right.chunks) else left + logger.info( + "Apply bloom filter for operand %s, use DataFrame %s to build bloom filter.", + op, + small_one, + ) + left, right = yield from recursive_tile( + *cls._apply_bloom_filter(left, right, left_on, right_on, op) + ) + # auto merge after bloom filter + yield TileStatus([left, right] + left.chunks + right.chunks, progress=0.5) + left = auto_merge_chunks(ctx, left) + right = auto_merge_chunks(ctx, right) + + if op.method == "auto": + # if method is auto, select new method after auto merge + method = cls._choose_merge_method(op, left, right) + logger.info("Choose %s method for merge operand %s.", method, op) + if method == MergeMethod.one_chunk: + ret = cls._tile_one_chunk(op, left, right) + elif method == MergeMethod.broadcast: + ret = cls._tile_broadcast(op, left, right) + else: + assert method == MergeMethod.shuffle + ret = cls._tile_shuffle(op, left, right) + + if ( + op.how == "inner" + and auto_merge_after + and len(ret[0].chunks) > auto_merge_threshold + ): + # if how=="inner", output data size will reduce greatly with high probability, + # use auto_merge_chunks to combine small chunks. 
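+            # Yielding TileStatus here asks the scheduler to execute the merged
+            # chunks first, so auto_merge_chunks below can use their actual
+            # sizes when deciding how to combine them.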
+ yield TileStatus( + ret[0].chunks, progress=0.8 + ) # trigger execution for chunks + merged = auto_merge_chunks(get_context(), ret[0]) + logger.info( + "Auto merge after %s, data shape: %s, chunk count: %s -> %s.", + op, + merged.shape, + len(ret[0].chunks), + len(merged.chunks), + ) + return [merged] + else: + logger.info( + "Skip auto merge after %s, data shape: %s, chunk count: %d.", + op, + ret[0].shape, + len(ret[0].chunks), + ) + return ret + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + left, right = ctx[op.inputs[0].key], ctx[op.inputs[1].key] + + if getattr(op, "split_info", None) is not None: + split_info = op.split_info + if split_info.split_side == "left": + index = hash_dataframe_on(left, on=op.on, size=split_info.nsplits)[ + split_info.split_index + ] + left = left.iloc[index] + else: + index = hash_dataframe_on(right, on=op.on, size=split_info.nsplits)[ + split_info.split_index + ] + right = right.iloc[index] + + def execute_merge(x, y): + if not op.gpu: + kwargs = dict( + copy=op.copy, validate=op.validate, indicator=op.indicator + ) + else: # pragma: no cover + # cudf doesn't support 'validate' and 'copy' + kwargs = dict(indicator=op.indicator) + return x.merge( + y, + how=op.how, + on=op.on, + left_on=op.left_on, + right_on=op.right_on, + left_index=op.left_index, + right_index=op.right_index, + sort=op.sort, + suffixes=op.suffixes, + **kwargs, + ) + + # workaround for: https://github.com/pandas-dev/pandas/issues/27943 + try: + r = execute_merge(left, right) + except ValueError: + r = execute_merge(left.copy(deep=True), right.copy(deep=True)) + + # make sure column's order + if not all( + n1 == n2 for n1, n2 in zip(chunk.columns_value.to_pandas(), r.columns) + ): + r = r[list(chunk.columns_value.to_pandas())] + ctx[chunk.key] = r + + +def _prepare_shuffle_on(use_index, side_on, on): + # consistent with pandas: `left_index` precedes `left_on` and `right_index` precedes `right_on` + if use_index: + # `None` means we will shuffle on df.index. + return None + elif side_on is not None: + return side_on + else: + return on + + +def merge( + df: Union[DataFrame, Series], + right: Union[DataFrame, Series], + how: str = "inner", + on: str = None, + left_on: str = None, + right_on: str = None, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, + suffixes: Tuple[Optional[str], Optional[str]] = ("_x", "_y"), + copy: bool = True, + indicator: bool = False, + validate: str = None, + method: str = "auto", + auto_merge: str = "both", + auto_merge_threshold: int = 8, + bloom_filter: Union[bool, str] = "auto", + bloom_filter_options: Dict[str, Any] = None, +) -> DataFrame: + """ + Merge DataFrame or named Series objects with a database-style join. + + A named Series object is treated as a DataFrame with a single named column. + + The join is done on columns or indexes. If joining columns on + columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes + on indexes or indexes on a column or columns, the index will be passed on. + When performing a cross merge, no column specifications to merge on are + allowed. + + Parameters + ---------- + right : DataFrame or named Series + Object to merge with. + how : {'left', 'right', 'outer', 'inner'}, default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. 
+ * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + + on : label or list + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. + left_on : label or list, or array-like + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. + right_on : label or list, or array-like + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. + left_index : bool, default False + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels. + right_index : bool, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index. + sort : bool, default False + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword). + suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. + copy : bool, default True + If False, avoid copy if possible. + indicator : bool or str, default False + If True, adds a column to the output DataFrame called "_merge" with + information on the source of each row. The column can be given a different + name by providing a string argument. The column will have a Categorical + type with the value of "left_only" for observations whose merge key only + appears in the left DataFrame, "right_only" for observations + whose merge key only appears in the right DataFrame, and "both" + if the observation's merge key is found in both DataFrames. + validate : str, optional + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + method : {"auto", "shuffle", "broadcast"}, default auto + "broadcast" is recommended when one DataFrame is much smaller than the other, + otherwise, "shuffle" will be a better choice. By default, we choose method + according to actual data size. 
+ auto_merge : {"both", "none", "before", "after"}, default both + Auto merge small chunks before or after merge + + * "both": auto merge small chunks before and after, + * "none": do not merge small chunks + * "before": only merge small chunks before merge + * "after": only merge small chunks after merge + auto_merge_threshold : int, default 8 + When how is "inner", merged result could be much smaller than original DataFrame, + if the number of chunks is greater than the threshold, + it will merge small chunks automatically. + bloom_filter: bool, str, default "auto" + Use bloom filter to optimize merge + bloom_filter_options: dict + * "max_elements": max elements in bloom filter, + default value is the max size of all input chunks + * "error_rate": error raite, default 0.1. + * "apply_chunk_size_threshold": min chunk size of input chunks to apply bloom filter, default 10 + when chunk size of left and right is greater than this threshold, apply bloom filter + * "filter": "large", "small", "both", default "large" + decides to filter on large, small or both DataFrames. + + Returns + ------- + DataFrame + A DataFrame of the two merged objects. + + Examples + -------- + >>> import mars.dataframe as md + >>> df1 = md.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], + ... 'value': [1, 2, 3, 5]}) + >>> df2 = md.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], + ... 'value': [5, 6, 7, 8]}) + >>> df1.execute() + lkey value + 0 foo 1 + 1 bar 2 + 2 baz 3 + 3 foo 5 + >>> df2.execute() + rkey value + 0 foo 5 + 1 bar 6 + 2 baz 7 + 3 foo 8 + + Merge df1 and df2 on the lkey and rkey columns. The value columns have + the default suffixes, _x and _y, appended. + + >>> df1.merge(df2, left_on='lkey', right_on='rkey').execute() + lkey value_x rkey value_y + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 foo 5 foo 5 + 3 foo 5 foo 8 + 4 bar 2 bar 6 + 5 baz 3 baz 7 + + Merge DataFrames df1 and df2 with specified left and right suffixes + appended to any overlapping columns. + + >>> df1.merge(df2, left_on='lkey', right_on='rkey', + ... suffixes=('_left', '_right')).execute() + lkey value_left rkey value_right + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 foo 5 foo 5 + 3 foo 5 foo 8 + 4 bar 2 bar 6 + 5 baz 3 baz 7 + + Merge DataFrames df1 and df2, but raise an exception if the DataFrames have + any overlapping columns. + + >>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False)).execute() + Traceback (most recent call last): + ... 
+ ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') + + >>> df1 = md.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) + >>> df2 = md.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) + >>> df1.execute() + a b + 0 foo 1 + 1 bar 2 + >>> df2.execute() + a c + 0 foo 3 + 1 baz 4 + + >>> df1.merge(df2, how='inner', on='a').execute() + a b c + 0 foo 1 3 + + >>> df1.merge(df2, how='left', on='a').execute() + a b c + 0 foo 1 3.0 + 1 bar 2 NaN + """ + if method is None: + method = "auto" + if method not in [ + "auto", + "shuffle", + "broadcast", + ]: # pragma: no cover + raise NotImplementedError(f"{method} merge is not supported") + if auto_merge not in ["both", "none", "before", "after"]: # pragma: no cover + raise ValueError( + f"auto_merge can only be `both`, `none`, `before` or `after`, got {auto_merge}" + ) + if bloom_filter not in [True, False, "auto"]: + raise ValueError( + f'bloom_filter can only be True, False, or "auto", got {bloom_filter}' + ) + if bloom_filter_options: + if not isinstance(bloom_filter_options, dict): + raise TypeError( + f"bloom_filter_options must be a dict, got {type(bloom_filter_options)}" + ) + for k, v in bloom_filter_options.items(): + if k not in BLOOM_FILTER_OPTIONS: + raise ValueError( + f"Invalid bloom filter option {k}, available: {BLOOM_FILTER_OPTIONS}" + ) + if k == "filter" and v not in BLOOM_FILTER_ON_OPTIONS: + raise ValueError( + f"Invalid filter {k}, available: {BLOOM_FILTER_ON_OPTIONS}" + ) + op = DataFrameMerge( + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + copy=copy, + indicator=indicator, + validate=validate, + method=method, + auto_merge=auto_merge, + auto_merge_threshold=auto_merge_threshold, + bloom_filter=bloom_filter, + bloom_filter_options=bloom_filter_options, + output_types=[OutputType.dataframe], + ) + return op(df, right) + + +def join( + df: Union[DataFrame, Series], + other: Union[DataFrame, Series], + on: str = None, + how: str = "left", + lsuffix: str = "", + rsuffix: str = "", + sort: bool = False, + method: str = None, + auto_merge: str = "both", + auto_merge_threshold: int = 8, + bloom_filter: Union[bool, Dict] = True, + bloom_filter_options: Dict[str, Any] = None, +) -> DataFrame: + """ + Join columns of another DataFrame. + + Join columns with `other` DataFrame either on index or on a key + column. Efficiently join multiple DataFrame objects by index at once by + passing a list. + + Parameters + ---------- + other : DataFrame, Series, or list of DataFrame + Index should be similar to one of the columns in this one. If a + Series is passed, its name attribute must be set, and that will be + used as the column name in the resulting joined DataFrame. + on : str, list of str, or array-like, optional + Column or index level name(s) in the caller to join on the index + in `other`, otherwise joins index-on-index. If multiple + values given, the `other` DataFrame must have a MultiIndex. Can + pass an array as the join key if it is not already contained in + the calling DataFrame. Like an Excel VLOOKUP operation. + how : {'left', 'right', 'outer', 'inner'}, default 'left' + How to handle the operation of the two objects. + + * left: use calling frame's index (or column if on is specified) + * right: use `other`'s index. + * outer: form union of calling frame's index (or column if on is + specified) with `other`'s index, and sort it. + lexicographically. 
+ * inner: form intersection of calling frame's index (or column if + on is specified) with `other`'s index, preserving the order + of the calling's one. + + lsuffix : str, default '' + Suffix to use from left frame's overlapping columns. + rsuffix : str, default '' + Suffix to use from right frame's overlapping columns. + sort : bool, default False + Order result DataFrame lexicographically by the join key. If False, + the order of the join key depends on the join type (how keyword). + method : {"shuffle", "broadcast"}, default None + "broadcast" is recommended when one DataFrame is much smaller than the other, + otherwise, "shuffle" will be a better choice. By default, we choose method + according to actual data size. + auto_merge : {"both", "none", "before", "after"}, default both + Auto merge small chunks before or after merge + + * "both": auto merge small chunks before and after, + * "none": do not merge small chunks + * "before": only merge small chunks before merge + * "after": only merge small chunks after merge + auto_merge_threshold : int, default 8 + When how is "inner", merged result could be much smaller than original DataFrame, + if the number of chunks is greater than the threshold, + it will merge small chunks automatically. + bloom_filter: bool, str, default "auto" + Use bloom filter to optimize merge + bloom_filter_options: dict + * "max_elements": max elements in bloom filter, + default value is the max size of all input chunks + * "error_rate": error raite, default 0.1. + * "apply_chunk_size_threshold": min chunk size of input chunks to apply bloom filter, default 10 + when chunk size of left and right is greater than this threshold, apply bloom filter + * "filter": "large", "small", "both", default "large" + decides to filter on large, small or both DataFrames. + + Returns + ------- + DataFrame + A dataframe containing columns from both the caller and `other`. + + See Also + -------- + DataFrame.merge : For column(s)-on-column(s) operations. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> df.execute() + key A + 0 K0 A0 + 1 K1 A1 + 2 K2 A2 + 3 K3 A3 + 4 K4 A4 + 5 K5 A5 + + >>> other = md.DataFrame({'key': ['K0', 'K1', 'K2'], + ... 'B': ['B0', 'B1', 'B2']}) + + >>> other.execute() + key B + 0 K0 B0 + 1 K1 B1 + 2 K2 B2 + + Join DataFrames using their indexes. + + >>> df.join(other, lsuffix='_caller', rsuffix='_other').execute() + key_caller A key_other B + 0 K0 A0 K0 B0 + 1 K1 A1 K1 B1 + 2 K2 A2 K2 B2 + 3 K3 A3 NaN NaN + 4 K4 A4 NaN NaN + 5 K5 A5 NaN NaN + + If we want to join using the key columns, we need to set key to be + the index in both `df` and `other`. The joined DataFrame will have + key as its index. + + >>> df.set_index('key').join(other.set_index('key')).execute() + A B + key + K0 A0 B0 + K1 A1 B1 + K2 A2 B2 + K3 A3 NaN + K4 A4 NaN + K5 A5 NaN + + Another option to join using the key columns is to use the `on` + parameter. DataFrame.join always uses `other`'s index but we can use + any column in `df`. This method preserves the original DataFrame's + index in the result. + + >>> df.join(other.set_index('key'), on='key').execute() + key A B + 0 K0 A0 B0 + 1 K1 A1 B1 + 2 K2 A2 B2 + 3 K3 A3 NaN + 4 K4 A4 NaN + 5 K5 A5 NaN + + Using non-unique key values shows how they are matched. + + >>> df = md.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], + ... 
'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> df.execute() + key A + 0 K0 A0 + 1 K1 A1 + 2 K1 A2 + 3 K3 A3 + 4 K0 A4 + 5 K1 A5 + + >>> df.join(other.set_index('key'), on='key').execute() + key A B + 0 K0 A0 B0 + 1 K1 A1 B1 + 2 K1 A2 B1 + 3 K3 A3 NaN + 4 K0 A4 B0 + 5 K1 A5 B1 + """ + return merge( + df, + other, + left_on=on, + how=how, + left_index=on is None, + right_index=True, + suffixes=(lsuffix, rsuffix), + sort=sort, + method=method, + auto_merge=auto_merge, + auto_merge_threshold=auto_merge_threshold, + bloom_filter=bloom_filter, + bloom_filter_options=bloom_filter_options, + ) diff --git a/python/xorbits/_mars/dataframe/merge/tests/__init__.py b/python/xorbits/_mars/dataframe/merge/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/merge/tests/test_merge.py b/python/xorbits/_mars/dataframe/merge/tests/test_merge.py new file mode 100644 index 000000000..cb5a7686b --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/tests/test_merge.py @@ -0,0 +1,345 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ....core import tile +from ....core.operand import OperandStage +from ...core import IndexValue +from ...datasource.dataframe import from_pandas +from .. 
import DataFrameMerge, DataFrameMergeAlign, concat + + +def test_merge(): + df1 = pd.DataFrame( + np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] + ) + df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=3) + + parameters = [ + {}, + {"how": "left", "right_on": "x", "left_index": True}, + {"how": "right", "left_on": "a", "right_index": True}, + {"how": "left", "left_on": "a", "right_on": "x"}, + {"how": "right", "left_on": "a", "right_index": True}, + {"how": "right", "on": "a"}, + {"how": "inner", "on": ["a", "b"]}, + ] + + for kw in parameters: + df = mdf1.merge(mdf2, **kw) + df = tile(df) + + assert df.chunk_shape == (2, 1) + for chunk in df.chunks: + assert isinstance(chunk.op, DataFrameMerge) + assert chunk.op.how == kw.get("how", "inner") + left, right = chunk.op.inputs + assert isinstance(left.op, DataFrameMergeAlign) + assert left.op.stage == OperandStage.reduce + assert isinstance(right.op, DataFrameMergeAlign) + assert right.op.stage == OperandStage.reduce + assert len(left.inputs[0].inputs) == 4 + assert len(right.inputs[0].inputs) == 4 + for lchunk in left.inputs[0].inputs[:2]: + assert isinstance(lchunk.op, DataFrameMergeAlign) + assert lchunk.op.stage == OperandStage.map + assert lchunk.op.index_shuffle_size == 2 + if kw.get("on", None) or kw.get("left_on", None): + # defaults to common columns + assert lchunk.op.shuffle_on == kw.get("on", None) or kw.get( + "left_on", None + ) + for rchunk in right.inputs[0].inputs[2:]: + assert isinstance(rchunk.op, DataFrameMergeAlign) + assert rchunk.op.stage == OperandStage.map + assert rchunk.op.index_shuffle_size == 2 + if kw.get("on", None) or kw.get("right_on", None): + # defaults to common columns + assert rchunk.op.shuffle_on == kw.get("on", None) or kw.get( + "right_on", None + ) + pd.testing.assert_index_equal( + chunk.columns_value.to_pandas(), df.columns_value.to_pandas() + ) + + +def test_merge_invalid_parameters(): + pdf1 = pd.DataFrame( + np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] + ) + pdf2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]) + + df1 = from_pandas(pdf1, chunk_size=2) + df2 = from_pandas(pdf2, chunk_size=3) + + with pytest.raises(ValueError): + df1.merge(df2, bloom_filter="wrong") + + with pytest.raises(TypeError): + df1.merge(df2, bloom_filter_options="wrong") + + with pytest.raises(ValueError): + df1.merge(df2, bloom_filter_options={"wrong": 1}) + + with pytest.raises(ValueError): + df1.merge(df2, bloom_filter_options={"filter": "wrong"}) + + +def test_join(): + df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], index=["a1", "a2", "a3"]) + df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], index=["a1", "b2", "b3"]) + 1 + df2 = pd.concat([df2, df2 + 1]) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=2) + + parameters = [ + {"lsuffix": "l_", "rsuffix": "r_"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "left"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "right"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "inner"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "left"}, + ] + + for kw in parameters: + df = mdf1.join(mdf2, auto_merge="none", bloom_filter=False, **kw) + df = tile(df) + + assert df.chunk_shape == (3, 1) + for chunk in df.chunks: + assert isinstance(chunk.op, DataFrameMerge) + assert chunk.op.how == kw.get("how", "left") + left, right = chunk.op.inputs + assert 
isinstance(left.op, DataFrameMergeAlign) + assert left.op.stage == OperandStage.reduce + assert isinstance(right.op, DataFrameMergeAlign) + assert right.op.stage == OperandStage.reduce + assert len(left.inputs[0].inputs) == 5 + assert len(right.inputs[0].inputs) == 5 + for lchunk in left.inputs[0].inputs: + assert isinstance(lchunk.op, DataFrameMergeAlign) + assert lchunk.op.stage == OperandStage.map + assert lchunk.op.index_shuffle_size == 3 + assert lchunk.op.shuffle_on == None + for rchunk in right.inputs[0].inputs: + assert isinstance(rchunk.op, DataFrameMergeAlign) + assert rchunk.op.stage == OperandStage.map + assert rchunk.op.index_shuffle_size == 3 + assert rchunk.op.shuffle_on == None + pd.testing.assert_index_equal( + chunk.columns_value.to_pandas(), df.columns_value.to_pandas() + ) + + +def test_join_on(): + df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], columns=["a1", "a2", "a3"]) + df2 = ( + pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], columns=["a1", "b2", "b3"]) + 1 + ) + df2 = pd.concat([df2, df2 + 1]) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=2) + + parameters = [ + {"lsuffix": "l_", "rsuffix": "r_"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "left", "on": "a1"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "right", "on": "a2"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "inner", "on": "a2"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "outer", "on": "a2"}, + ] + + for kw in parameters: + df = mdf1.join(mdf2, auto_merge="none", bloom_filter=False, **kw) + df = tile(df) + + assert df.chunk_shape == (3, 1) + for chunk in df.chunks: + assert isinstance(chunk.op, DataFrameMerge) + assert chunk.op.how == kw.get("how", "left") + left, right = chunk.op.inputs + assert isinstance(left.op, DataFrameMergeAlign) + assert left.op.stage == OperandStage.reduce + assert isinstance(right.op, DataFrameMergeAlign) + assert right.op.stage == OperandStage.reduce + assert len(left.inputs[0].inputs) == 5 + assert len(right.inputs[0].inputs) == 5 + for lchunk in left.inputs[0].inputs[:2]: + assert isinstance(lchunk.op, DataFrameMergeAlign) + assert lchunk.op.stage == OperandStage.map + assert lchunk.op.index_shuffle_size == 3 + assert lchunk.op.shuffle_on == kw.get("on", None) + for rchunk in right.inputs[0].inputs[2:]: + assert isinstance(rchunk.op, DataFrameMergeAlign) + assert rchunk.op.stage == OperandStage.map + assert rchunk.op.index_shuffle_size == 3 + assert rchunk.op.shuffle_on == None + pd.testing.assert_index_equal( + chunk.columns_value.to_pandas(), df.columns_value.to_pandas() + ) + + +def test_merge_one_chunk(): + df1 = pd.DataFrame({"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]}) + df2 = pd.DataFrame({"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}) + + # all have one chunk + mdf1 = from_pandas(df1) + mdf2 = from_pandas(df2) + df = mdf1.merge(mdf2, left_on="lkey", right_on="rkey") + tiled, tiled1, tiled2 = tile(df, mdf1, mdf2) + + assert tiled.chunk_shape == (1, 1) + assert tiled.chunks[0].inputs[0].key == tiled1.chunks[0].key + assert tiled.chunks[0].inputs[1].key == tiled2.chunks[0].key + + # left has one chunk + mdf1 = from_pandas(df1) + mdf2 = from_pandas(df2, chunk_size=2) + df = mdf1.merge(mdf2, left_on="lkey", right_on="rkey") + tiled, tiled1, tiled2 = tile(df, mdf1, mdf2) + + assert tiled.chunk_shape == (2, 1) + assert tiled.chunks[0].inputs[0].key == tiled1.chunks[0].key + assert tiled.chunks[0].inputs[1].key == tiled2.chunks[0].key + assert tiled.chunks[1].inputs[0].key == tiled1.chunks[0].key + 
assert tiled.chunks[1].inputs[1].key == tiled2.chunks[1].key + + # right has one chunk + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2) + df = mdf1.merge(mdf2, left_on="lkey", right_on="rkey") + tiled, tiled1, tiled2 = tile(df, mdf1, mdf2) + + assert tiled.chunk_shape == (2, 1) + assert tiled.chunks[0].inputs[0].key == tiled1.chunks[0].key + assert tiled.chunks[0].inputs[1].key == tiled2.chunks[0].key + assert tiled.chunks[1].inputs[0].key == tiled1.chunks[1].key + assert tiled.chunks[1].inputs[1].key == tiled2.chunks[0].key + + +def test_append(): + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + adf = mdf1.append(mdf2) + + assert adf.shape == (20, 4) + assert isinstance(adf.index_value.value, IndexValue.Int64Index) + + tiled = tile(adf) + assert tiled.nsplits == ((3, 3, 3, 1, 3, 3, 3, 1), (3, 1)) + assert tiled.chunk_shape == (8, 2) + for i, c in enumerate(tiled.chunks): + index = (i // 2, i % 2) + assert c.index == index + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + adf = mdf1.append(mdf2, ignore_index=True) + + assert adf.shape == (20, 4) + assert isinstance(adf.index_value.value, IndexValue.RangeIndex) + pd.testing.assert_index_equal(adf.index_value.to_pandas(), pd.RangeIndex(20)) + + +def test_concat(): + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + + mdf1 = from_pandas(df1, chunk_size=4) + mdf2 = from_pandas(df2, chunk_size=4) + r = concat([mdf1, mdf2], axis="index") + + assert r.shape == (20, 4) + pd.testing.assert_series_equal(r.dtypes, df1.dtypes) + + tiled = tile(r) + assert tiled.nsplits == ((4, 4, 2, 4, 4, 2), (4,)) + for i, c in enumerate(tiled.chunks): + assert c.index == (i, 0) + + df3 = pd.DataFrame( + np.random.rand(10, 4), columns=list("ABCD"), index=pd.RangeIndex(10, 20) + ) + + mdf3 = from_pandas(df3, chunk_size=4) + r = concat([mdf1, mdf3], axis="index") + + assert r.shape == (20, 4) + pd.testing.assert_series_equal(r.dtypes, df1.dtypes) + pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20)) + + df4 = pd.DataFrame( + np.random.rand(10, 4), + columns=list("ABCD"), + index=np.random.permutation(np.arange(10)), + ) + + mdf4 = from_pandas(df4, chunk_size=4) + r = concat([mdf1, mdf4], axis="index") + + assert r.shape == (20, 4) + pd.testing.assert_series_equal(r.dtypes, df1.dtypes) + pd.testing.assert_index_equal( + r.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + + r = concat([mdf4, mdf1], axis="index") + + assert r.shape == (20, 4) + pd.testing.assert_series_equal(r.dtypes, df1.dtypes) + pd.testing.assert_index_equal( + r.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + + r = concat([mdf4, mdf4], axis="index") + + assert r.shape == (20, 4) + pd.testing.assert_series_equal(r.dtypes, df1.dtypes) + pd.testing.assert_index_equal( + r.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=4) + r = concat([mdf1, mdf2], axis="columns") + + assert r.shape == (10, 8) + expected_dtypes = pd.concat([df1, df2], axis="columns").dtypes + pd.testing.assert_series_equal(r.dtypes, expected_dtypes) + + tiled = tile(r) + assert tiled.nsplits == ((3, 3, 3, 1), (3, 1, 4)) + for i, c in enumerate(tiled.chunks): + index = (i // 3, i % 3) + assert c.index == index + 
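For reference, the nsplits asserted above follow directly from the chunk_size used when the 10x4 frames are created and then appended or concatenated; a plain-Python sketch of that arithmetic (the `splits` helper below is hypothetical and not part of this change):

    def splits(length, chunk_size):
        # e.g. splits(10, 3) -> (3, 3, 3, 1)
        counts = [chunk_size] * (length // chunk_size)
        if length % chunk_size:
            counts.append(length % chunk_size)
        return tuple(counts)

    # appending two (10, 4) frames chunked by 3 stacks the row splits
    assert splits(10, 3) + splits(10, 3) == (3, 3, 3, 1, 3, 3, 3, 1)
    assert splits(4, 3) == (3, 1)

    # concatenating along columns with chunk_size 3 and 4 stacks the column splits
    assert splits(4, 3) + splits(4, 4) == (3, 1, 4)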
+ + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 3), columns=list("ABC")) + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + r = concat([mdf1, mdf2], join="inner") + + assert r.shape == (20, 3) + tiled = tile(r) + assert tiled.nsplits == ((3, 3, 3, 1, 3, 3, 3, 1), (3,)) diff --git a/python/xorbits/_mars/dataframe/merge/tests/test_merge_execution.py b/python/xorbits/_mars/dataframe/merge/tests/test_merge_execution.py new file mode 100644 index 000000000..ae5f5c12d --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/tests/test_merge_execution.py @@ -0,0 +1,846 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ....core.graph.builder.utils import build_graph +from ...datasource.dataframe import from_pandas +from ...datasource.series import from_pandas as series_from_pandas +from ...utils import sort_dataframe_inplace +from .. import DataFrameConcat, DataFrameMergeAlign, concat + + +def test_merge(setup): + df1 = pd.DataFrame( + np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] + ) + df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]) + df3 = df1.copy() + df3.index = pd.RangeIndex(2, 6, name="index") + df4 = df1.copy() + df4.index = pd.MultiIndex.from_tuples( + [(i, i + 1) for i in range(4)], names=["i1", "i2"] + ) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=2) + mdf3 = from_pandas(df3, chunk_size=3) + mdf4 = from_pandas(df4, chunk_size=2) + + # Note [Index of Merge] + # + # When `left_index` and `right_index` of `merge` are both false, pandas will generate a RangeIndex for + # the final result dataframe. + # + # We chunked the `left` and `right` dataframes, thus every result chunk will have its own RangeIndex. + # When they are concatenated we don't generate a new RangeIndex for the result, thus we cannot obtain the + # same index values as pandas. But we guarantee that the content of the dataframe is correct.
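Because of the RangeIndex caveat in Note [Index of Merge], the assertions below compare content rather than index. A minimal pandas-only sketch of that comparison idiom, on made-up data (not part of the diff):

    import pandas as pd

    left = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})
    right = pd.DataFrame({"a": [2, 4, 6], "x": [200, 400, 600]})

    expected = left.merge(right)  # pandas assigns a fresh RangeIndex here
    # stand-in for a chunk-wise result: same rows, different order and index
    shuffled = expected.sample(frac=1, random_state=0)

    # compare content only: sort on the join key and drop the index
    pd.testing.assert_frame_equal(
        expected.sort_values("a").reset_index(drop=True),
        shuffled.sort_values("a").reset_index(drop=True),
    )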
+ + # merge on index + expected0 = df1.merge(df2) + jdf0 = mdf1.merge(mdf2, auto_merge="none") + result0 = jdf0.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected0, 0), sort_dataframe_inplace(result0, 0) + ) + + # merge on left index and `right_on` + expected1 = df1.merge(df2, how="left", right_on="x", left_index=True) + jdf1 = mdf1.merge( + mdf2, how="left", right_on="x", left_index=True, auto_merge="none" + ) + result1 = jdf1.execute().fetch() + expected1.set_index("a_x", inplace=True) + result1.set_index("a_x", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected1, 0), sort_dataframe_inplace(result1, 0) + ) + + # merge on `left_on` and right index + expected2 = df1.merge(df2, how="right", left_on="a", right_index=True) + jdf2 = mdf1.merge( + mdf2, how="right", left_on="a", right_index=True, auto_merge="none" + ) + result2 = jdf2.execute().fetch() + expected2.set_index("a", inplace=True) + result2.set_index("a", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected2, 0), sort_dataframe_inplace(result2, 0) + ) + + # merge on `left_on` and `right_on` + expected3 = df1.merge(df2, how="left", left_on="a", right_on="x") + jdf3 = mdf1.merge(mdf2, how="left", left_on="a", right_on="x", auto_merge="none") + result3 = jdf3.execute().fetch() + expected3.set_index("a_x", inplace=True) + result3.set_index("a_x", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected3, 0), sort_dataframe_inplace(result3, 0) + ) + + # merge on `on` + expected4 = df1.merge(df2, how="right", on="a") + jdf4 = mdf1.merge(mdf2, how="right", on="a", auto_merge="none") + result4 = jdf4.execute().fetch() + expected4.set_index("a", inplace=True) + result4.set_index("a", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected4, 0), sort_dataframe_inplace(result4, 0) + ) + + # merge on multiple columns + expected5 = df1.merge(df2, how="inner", on=["a", "b"]) + jdf5 = mdf1.merge(mdf2, how="inner", on=["a", "b"], auto_merge="none") + result5 = jdf5.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected5, 0), sort_dataframe_inplace(result5, 0) + ) + + # merge when some on is index + expected6 = df3.merge(df2, how="inner", left_on="index", right_on="a") + jdf6 = mdf3.merge( + mdf2, how="inner", left_on="index", right_on="a", auto_merge="none" + ) + result6 = jdf6.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected6, 0), sort_dataframe_inplace(result6, 0) + ) + + # merge when on is in MultiIndex + expected7 = df4.merge(df2, how="inner", left_on="i1", right_on="a") + jdf7 = mdf4.merge(mdf2, how="inner", left_on="i1", right_on="a", auto_merge="none") + result7 = jdf7.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected7, 0), sort_dataframe_inplace(result7, 0) + ) + + mdf5 = from_pandas(df2, chunk_size=4) + mdf6 = from_pandas(df4, chunk_size=1) + expected7 = df4.merge(df2, how="inner", left_on="i1", right_on="a") + jdf7 = mdf6.merge(mdf5, how="inner", left_on="i1", right_on="a", auto_merge="none") + result7 = jdf7.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected7, 0), sort_dataframe_inplace(result7, 0) + ) + + # merge when on is in MultiIndex, and on not in index + expected8 = df4.merge(df2, how="inner", on=["a", "b"]) + jdf8 = mdf4.merge(mdf2, how="inner", on=["a", "b"], auto_merge="none") + result8 = jdf8.execute().fetch() + pd.testing.assert_frame_equal( + 
sort_dataframe_inplace(expected8, 0), sort_dataframe_inplace(result8, 0) + ) + + +def test_join(setup): + df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], index=["a1", "a2", "a3"]) + df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], index=["a1", "b2", "b3"]) + 1 + df2 = pd.concat([df2, df2 + 1]) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=2) + + # default `how` + expected0 = df1.join(df2, lsuffix="l_", rsuffix="r_") + jdf0 = mdf1.join(mdf2, lsuffix="l_", rsuffix="r_", auto_merge="none") + result0 = jdf0.execute().fetch() + pd.testing.assert_frame_equal(expected0.sort_index(), result0.sort_index()) + + # how = 'left' + expected1 = df1.join(df2, how="left", lsuffix="l_", rsuffix="r_") + jdf1 = mdf1.join(mdf2, how="left", lsuffix="l_", rsuffix="r_", auto_merge="none") + result1 = jdf1.execute().fetch() + pd.testing.assert_frame_equal(expected1.sort_index(), result1.sort_index()) + + # how = 'right' + expected2 = df1.join(df2, how="right", lsuffix="l_", rsuffix="r_") + jdf2 = mdf1.join(mdf2, how="right", lsuffix="l_", rsuffix="r_", auto_merge="none") + result2 = jdf2.execute().fetch() + pd.testing.assert_frame_equal(expected2.sort_index(), result2.sort_index()) + + # how = 'inner' + expected3 = df1.join(df2, how="inner", lsuffix="l_", rsuffix="r_") + jdf3 = mdf1.join(mdf2, how="inner", lsuffix="l_", rsuffix="r_", auto_merge="none") + result3 = jdf3.execute().fetch() + pd.testing.assert_frame_equal(expected3.sort_index(), result3.sort_index()) + + # how = 'outer' + expected4 = df1.join(df2, how="outer", lsuffix="l_", rsuffix="r_") + jdf4 = mdf1.join(mdf2, how="outer", lsuffix="l_", rsuffix="r_", auto_merge="none") + result4 = jdf4.execute().fetch() + pd.testing.assert_frame_equal(expected4.sort_index(), result4.sort_index()) + + +def test_join_on(setup): + df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], columns=["a1", "a2", "a3"]) + df2 = ( + pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], columns=["a1", "b2", "b3"]) + 1 + ) + df2 = pd.concat([df2, df2 + 1]) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=2) + + expected0 = df1.join(df2, on=None, lsuffix="_l", rsuffix="_r") + jdf0 = mdf1.join(mdf2, on=None, lsuffix="_l", rsuffix="_r", auto_merge="none") + result0 = jdf0.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected0, 0), sort_dataframe_inplace(result0, 0) + ) + + expected1 = df1.join(df2, how="left", on="a1", lsuffix="_l", rsuffix="_r") + jdf1 = mdf1.join( + mdf2, how="left", on="a1", lsuffix="_l", rsuffix="_r", auto_merge="none" + ) + result1 = jdf1.execute().fetch() + + # Note [Columns of Left Join] + # + # I believe we have no way to obtain exactly the same result as pandas here: + # + # Look at the following example: + # + # >>> df1 + # a1 a2 a3 + # 0 1 3 3 + # >>> df2 + # a1 b2 b3 + # 1 2 6 7 + # >>> df3 + # a1 b2 b3 + # 1 2 6 7 + # 1 2 6 7 + # + # >>> df1.merge(df2, how='left', left_on='a1', left_index=False, right_index=True) + # a1_x a2 a3 a1_y b2 b3 + # 0 1 3 3 2 6 7 + # >>> df1.merge(df3, how='left', left_on='a1', left_index=False, right_index=True) + # a1 a1_x a2 a3 a1_y b2 b3 + # 0 1 1 3 3 2 6 7 + # 0 1 1 3 3 2 6 7 + # + # Note that the result of `df1.merge(df3)` has an extra column `a` compared to `df1.merge(df2)`. + # The value of column `a` is the same as `a1_x`, just because `1` occurs twice in the index of `df3`. + # I haven't investigated why pandas has such behaviour...
+ # + # We cannot yield the same result as pandas, because `df3` is chunked: some of the + # result chunks have 6 columns, others may have 7 columns, and when concatenated into one DataFrame + # some cells of column `a` will have value `NaN`, which is different from the result of pandas. + # + # But we can guarantee that the other effective columns have exactly the same values as pandas. + + columns_to_compare = jdf1.columns_value.to_pandas() + + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected1[columns_to_compare], 0, 1), + sort_dataframe_inplace(result1[columns_to_compare], 0, 1), + ) + + # Note [Index of Join on EmptyDataFrame] + # + # It is non-trivial to get the same `index` result as pandas. + # + # Look at the following example: + # + # >>> df1 + # a1 a2 a3 + # 1 4 2 6 + # >>> df2 + # a1 b2 b3 + # 1 2 6 7 + # 2 8 9 10 + # >>> df3 + # Empty DataFrame + # Columns: [a1, a2, a3] + # Index: [] + # >>> df1.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r') + # a1_l a2 a3 a1_r b2 b3 + # 1.0 4.0 2 6.0 8 9 10 + # NaN NaN 1 NaN 2 6 7 + # >>> df3.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r') + # a1_l a2 a3 a1_r b2 b3 + # 1 NaN 1 NaN 2 6 7 + # 2 NaN 2 NaN 8 9 10 + # + # When the `left` dataframe is not empty, the mismatched rows in `right` will have index value `NaN`, + # and the matched rows have index value from `right`. When the `left` dataframe is empty, the mismatched + # rows have index value from `right`. + # + # Since we chunked the `left` dataframe, it is hard to obtain the same index values as pandas in the + # final result dataframe, but we guarantee that the dataframe content is correct. + + expected2 = df1.join(df2, how="right", on="a2", lsuffix="_l", rsuffix="_r") + jdf2 = mdf1.join( + mdf2, how="right", on="a2", lsuffix="_l", rsuffix="_r", auto_merge="none" + ) + result2 = jdf2.execute().fetch() + + expected2.set_index("a2", inplace=True) + result2.set_index("a2", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected2, 0), sort_dataframe_inplace(result2, 0) + ) + + expected3 = df1.join(df2, how="inner", on="a2", lsuffix="_l", rsuffix="_r") + jdf3 = mdf1.join( + mdf2, how="inner", on="a2", lsuffix="_l", rsuffix="_r", auto_merge="none" + ) + result3 = jdf3.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected3, 0), sort_dataframe_inplace(result3, 0) + ) + + expected4 = df1.join(df2, how="outer", on="a2", lsuffix="_l", rsuffix="_r") + jdf4 = mdf1.join( + mdf2, how="outer", on="a2", lsuffix="_l", rsuffix="_r", auto_merge="none" + ) + result4 = jdf4.execute().fetch() + + expected4.set_index("a2", inplace=True) + result4.set_index("a2", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected4, 0), sort_dataframe_inplace(result4, 0) + ) + + +def test_merge_one_chunk(setup): + df1 = pd.DataFrame( + {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]}, + index=["a1", "a2", "a3", "a4"], + ) + df2 = pd.DataFrame( + {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}, + index=["a1", "a2", "a3", "a4"], + ) + + # all have one chunk + mdf1 = from_pandas(df1) + mdf2 = from_pandas(df2) + + expected = df1.merge(df2, left_on="lkey", right_on="rkey") + jdf = mdf1.merge(mdf2, left_on="lkey", right_on="rkey", auto_merge="none") + result = jdf.execute().fetch() + + pd.testing.assert_frame_equal( + expected.sort_values(by=expected.columns[1]).reset_index(drop=True), + result.sort_values(by=result.columns[1]).reset_index(drop=True), + ) +
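The remaining cases rely on two comparison tricks: sorting on a value column as above, and, per Note [Index of Join on EmptyDataFrame], moving the join key into the index so the comparison does not depend on which row labels the chunked execution produced. A small pandas-only illustration with made-up data (not part of the diff):

    import pandas as pd

    left = pd.DataFrame({"a1": [4, 7], "a2": [2, 8]})
    right = pd.DataFrame({"b2": [6, 9], "b3": [7, 10]}, index=[2, 8])

    expected = left.join(right, on="a2", how="right")
    # pretend a chunk-wise run attached different row labels to the same rows
    relabelled = expected.copy()
    relabelled.index = ["x", "y"]

    # keying both frames on the join column makes the comparison label-independent
    pd.testing.assert_frame_equal(
        expected.set_index("a2").sort_index(),
        relabelled.set_index("a2").sort_index(),
    )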
+ # left have one chunk + mdf1 = from_pandas(df1) + mdf2 = from_pandas(df2, chunk_size=2) + + expected = df1.merge(df2, left_on="lkey", right_on="rkey") + jdf = mdf1.merge(mdf2, left_on="lkey", right_on="rkey", auto_merge="none") + result = jdf.execute().fetch() + + pd.testing.assert_frame_equal( + expected.sort_values(by=expected.columns[1]).reset_index(drop=True), + result.sort_values(by=result.columns[1]).reset_index(drop=True), + ) + + # right have one chunk + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2) + + expected = df1.merge(df2, left_on="lkey", right_on="rkey") + jdf = mdf1.merge(mdf2, left_on="lkey", right_on="rkey", auto_merge="none") + result = jdf.execute().fetch() + + pd.testing.assert_frame_equal( + expected.sort_values(by=expected.columns[1]).reset_index(drop=True), + result.sort_values(by=result.columns[1]).reset_index(drop=True), + ) + + # left have one chunk and how="left", then one chunk tile + # will result in wrong results, see #GH 2107 + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2) + + expected = df2.merge(df1, left_on="rkey", right_on="lkey", how="left") + jdf = mdf2.merge( + mdf1, left_on="rkey", right_on="lkey", how="left", auto_merge="none" + ) + result = jdf.execute().fetch() + + pd.testing.assert_frame_equal( + expected.sort_values(by=expected.columns[1]).reset_index(drop=True), + result.sort_values(by=result.columns[1]).reset_index(drop=True), + ) + + +def test_broadcast_merge(setup): + ns = np.random.RandomState(0) + # small dataframe + raw1 = pd.DataFrame( + { + "key": ns.randint(0, 10, size=10), + "value": np.arange(10), + }, + index=[f"a{i}" for i in range(10)], + ) + # big dataframe + raw2 = pd.DataFrame( + { + "key": ns.randint(0, 100, size=100), + "value": np.arange(100, 200), + }, + index=[f"a{i}" for i in range(100)], + ) + + # test broadcast right and how="inner" + df1 = from_pandas(raw1, chunk_size=5) + df2 = from_pandas(raw2, chunk_size=10) + r = df2.merge(df1, on="key", auto_merge="none", bloom_filter=False) + # make sure it selects broadcast merge, for broadcast, there must be + # DataFrameConcat operands + graph = build_graph([r], tile=True) + assert any(isinstance(c.op, DataFrameConcat) for c in graph) + # inner join doesn't need shuffle + assert all(not isinstance(c.op, DataFrameMergeAlign) for c in graph) + + result = r.execute().fetch() + expected = raw2.merge(raw1, on="key") + + expected.set_index("key", inplace=True) + result.set_index("key", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected, 0), sort_dataframe_inplace(result, 0) + ) + + # test broadcast right and how="left" + df1 = from_pandas(raw1, chunk_size=5) + df2 = from_pandas(raw2, chunk_size=10) + r = df2.merge(df1, on="key", how="left", auto_merge="none", method="broadcast") + # make sure it selects broadcast merge, for broadcast, there must be + # DataFrameConcat operands + graph = build_graph([r], tile=True) + assert any(isinstance(c.op, DataFrameConcat) for c in graph) + # left join need shuffle + assert any(isinstance(c.op, DataFrameMergeAlign) for c in graph) + + result = r.execute().fetch() + expected = raw2.merge(raw1, on="key", how="left") + + expected.set_index("key", inplace=True) + result.set_index("key", inplace=True) + pd.testing.assert_frame_equal( + expected.sort_values(by=["key", "value_x"]), + result.sort_values(by=["key", "value_x"]), + ) + + # test broadcast left + df1 = from_pandas(raw1, chunk_size=5) + df2 = from_pandas(raw2, chunk_size=10) + r = df1.merge(df2, on="key", auto_merge="none", 
bloom_filter=False) + # make sure it selects broadcast merge, for broadcast, there must be + # DataFrameConcat operands + graph = build_graph([r], tile=True) + assert any(isinstance(c.op, DataFrameConcat) for c in graph) + # inner join doesn't need shuffle + assert all(not isinstance(c.op, DataFrameMergeAlign) for c in graph) + + result = r.execute().fetch() + expected = raw1.merge(raw2, on="key") + + expected.set_index("key", inplace=True) + result.set_index("key", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected, 0), sort_dataframe_inplace(result, 0) + ) + + # test broadcast left and how="right" + df1 = from_pandas(raw1, chunk_size=5) + df2 = from_pandas(raw2, chunk_size=10) + r = df1.merge(df2, on="key", how="right", auto_merge="none") + # make sure it selects broadcast merge, for broadcast, there must be + # DataFrameConcat operands + graph = build_graph([r], tile=True) + assert any(isinstance(c.op, DataFrameConcat) for c in graph) + # right join need shuffle + assert any(isinstance(c.op, DataFrameMergeAlign) for c in graph) + + result = r.execute().fetch() + expected = raw1.merge(raw2, on="key", how="right") + + expected.set_index("key", inplace=True) + result.set_index("key", inplace=True) + pd.testing.assert_frame_equal( + expected.sort_values(by=["key", "value_x"]), + result.sort_values(by=["key", "value_x"]), + ) + + +def test_merge_with_bloom_filter(setup): + ns = np.random.RandomState(0) + raw_df1 = pd.DataFrame( + { + "col1": ns.random(100), + "col2": ns.randint(0, 10, size=(100,)), + "col3": ns.randint(0, 10, size=(100,)), + } + ) + raw_df2 = pd.DataFrame( + { + "col1": ns.random(100), + "col2": ns.randint(0, 10, size=(100,)), + "col3": ns.randint(0, 10, size=(100,)), + } + ) + + df1 = from_pandas(raw_df1, chunk_size=10) + df2 = from_pandas(raw_df2, chunk_size=15) + + expected = raw_df1.merge(raw_df2, on="col2") + + result = ( + df1.merge( + df2, + on="col2", + bloom_filter=True, + bloom_filter_options={"max_elements": 100, "error_rate": 0.01}, + auto_merge="none", + ) + .execute() + .fetch() + ) + pd.testing.assert_frame_equal( + expected.sort_values(by=["col1_x", "col2"]).reset_index(drop=True), + result.sort_values(by=["col1_x", "col2"]).reset_index(drop=True), + ) + + result = ( + df2.merge(df1, on=["col2", "col3"], bloom_filter=True, auto_merge="none") + .execute() + .fetch() + ) + expected = raw_df2.merge(raw_df1, on=["col2", "col3"]) + pd.testing.assert_frame_equal( + expected.sort_values(by=["col1_x", "col2"]).reset_index(drop=True), + result.sort_values(by=["col1_x", "col2"]).reset_index(drop=True), + ) + + # on index + result = df2.merge(df1, bloom_filter=True, auto_merge="none").execute().fetch() + expected = raw_df2.merge(raw_df1) + pd.testing.assert_frame_equal( + expected.sort_index().reset_index(drop=True), + result.sort_index().reset_index(drop=True), + ) + + # on float column + result = ( + df2.merge(df1, on="col1", bloom_filter=True, auto_merge="none") + .execute() + .fetch() + ) + expected = raw_df2.merge(raw_df1, on="col1") + pd.testing.assert_frame_equal( + expected.sort_values(by=["col1", "col2_x"]).reset_index(drop=True), + result.sort_values(by=["col1", "col2_x"]).reset_index(drop=True), + ) + + # on float columns + result = ( + df2.merge(df1, on=["col1", "col2"], bloom_filter=True, auto_merge="none") + .execute() + .fetch() + ) + expected = raw_df2.merge(raw_df1, on=["col1", "col2"]) + pd.testing.assert_frame_equal( + expected.sort_values(by=["col1", "col2"]).reset_index(drop=True), + result.sort_values(by=["col1", 
"col2"]).reset_index(drop=True), + ) + + # multi index + raw_df3 = raw_df1.copy() + raw_df3.index = pd.MultiIndex.from_tuples( + [(i, i + 1) for i in range(100)], names=["i1", "i2"] + ) + df3 = from_pandas(raw_df3, chunk_size=8) + result = ( + df3.merge( + df1, left_on="i1", right_on="col2", bloom_filter=True, auto_merge="none" + ) + .execute() + .fetch() + ) + expected = raw_df3.merge(raw_df1, left_on="i1", right_on="col2") + pd.testing.assert_frame_equal( + expected.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + result.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + ) + + df4 = from_pandas(raw_df3, chunk_size=20) + result = ( + df4.merge( + df1, left_on="i1", right_on="col2", bloom_filter=True, auto_merge="none" + ) + .execute() + .fetch() + ) + expected = raw_df3.merge(raw_df1, left_on="i1", right_on="col2") + pd.testing.assert_frame_equal( + expected.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + result.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + ) + + +@pytest.mark.parametrize("filter", ["small", "large", "both"]) +def test_merge_with_bloom_filter_options(setup, filter): + ns = np.random.RandomState(0) + raw_df1 = pd.DataFrame( + { + "col1": ns.random(100), + "col2": ns.randint(0, 10, size=(100,)), + "col3": ns.randint(0, 10, size=(100,)), + } + ) + raw_df2 = pd.DataFrame( + { + "col1": ns.random(100), + "col2": ns.randint(0, 10, size=(100,)), + "col3": ns.randint(0, 10, size=(100,)), + } + ) + + df1 = from_pandas(raw_df1, chunk_size=25) + df2 = from_pandas(raw_df2, chunk_size=30) + m = df1.merge( + df2, + on="col2", + auto_merge="none", + method="shuffle", + bloom_filter=True, + bloom_filter_options={"filter": filter, "apply_chunk_size_threshold": 0}, + ) + + expected = raw_df1.merge(raw_df2, on="col2") + result = m.execute().fetch() + pd.testing.assert_frame_equal( + expected.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + result.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + ) + + +@pytest.mark.parametrize("auto_merge", ["none", "both", "before", "after"]) +def test_merge_on_duplicate_columns(setup, auto_merge): + raw1 = pd.DataFrame( + [["foo", 1, "bar"], ["bar", 2, "foo"], ["baz", 3, "foo"]], + columns=["lkey", "value", "value"], + index=["a1", "a2", "a3"], + ) + raw2 = pd.DataFrame( + {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}, + index=["a1", "a2", "a3", "a4"], + ) + + df1 = from_pandas(raw1, chunk_size=2) + df2 = from_pandas(raw2, chunk_size=3) + + r = df1.merge( + df2, + left_on="lkey", + right_on="rkey", + auto_merge=auto_merge, + auto_merge_threshold=0, + ) + result = r.execute().fetch() + expected = raw1.merge(raw2, left_on="lkey", right_on="rkey") + pd.testing.assert_frame_equal(expected, result) + + +def test_append_execution(setup): + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + + adf = mdf1.append(mdf2) + expected = df1.append(df2) + result = adf.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + adf = mdf1.append(mdf2, ignore_index=True) + expected = df1.append(df2, ignore_index=True) + result = adf.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(expected, result) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=2) + + adf = mdf1.append(mdf2) + expected = df1.append(df2) + result 
= adf.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + adf = mdf1.append(mdf2, ignore_index=True) + expected = df1.append(df2, ignore_index=True) + result = adf.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(expected, result) + + df3 = pd.DataFrame(np.random.rand(8, 4), columns=list("ABCD")) + mdf3 = from_pandas(df3, chunk_size=3) + expected = df1.append([df2, df3]) + adf = mdf1.append([mdf2, mdf3]) + result = adf.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + adf = mdf1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True) + expected = df1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True) + result = adf.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(expected, result) + + # test for series + series1 = pd.Series(np.random.rand(10)) + series2 = pd.Series(np.random.rand(10)) + + mseries1 = series_from_pandas(series1, chunk_size=3) + mseries2 = series_from_pandas(series2, chunk_size=3) + + aseries = mseries1.append(mseries2) + expected = series1.append(series2) + result = aseries.execute().fetch() + pd.testing.assert_series_equal(expected, result) + + aseries = mseries1.append(mseries2, ignore_index=True) + expected = series1.append(series2, ignore_index=True) + result = aseries.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_series_equal(expected, result) + + mseries1 = series_from_pandas(series1, chunk_size=3) + mseries2 = series_from_pandas(series2, chunk_size=2) + + aseries = mseries1.append(mseries2) + expected = series1.append(series2) + result = aseries.execute().fetch() + pd.testing.assert_series_equal(expected, result) + + aseries = mseries1.append(mseries2, ignore_index=True) + expected = series1.append(series2, ignore_index=True) + result = aseries.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_series_equal(expected, result) + + series3 = pd.Series(np.random.rand(4)) + mseries3 = series_from_pandas(series3, chunk_size=2) + expected = series1.append([series2, series3]) + aseries = mseries1.append([mseries2, mseries3]) + result = aseries.execute().fetch() + pd.testing.assert_series_equal(expected, result) + + +def test_concat(setup): + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + + r = concat([mdf1, mdf2]) + expected = pd.concat([df1, df2]) + result = r.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + # test different chunk size and ignore_index=True + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=3) + + r = concat([mdf1, mdf2], ignore_index=True) + expected = pd.concat([df1, df2], ignore_index=True) + result = r.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(expected, result) + + # test axis=1 + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=3) + + r = concat([mdf1, mdf2], axis=1) + expected = pd.concat([df1, df2], axis=1) + result = r.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + # test multiply dataframes + r = concat([mdf1, mdf2, mdf1]) + expected = pd.concat([df1, df2, df1]) + result = r.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 3), 
columns=list("ABC")) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + + # test join=inner + r = concat([mdf1, mdf2], join="inner") + expected = pd.concat([df1, df2], join="inner") + result = r.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + # test for series + series1 = pd.Series(np.random.rand(10)) + series2 = pd.Series(np.random.rand(10)) + + mseries1 = series_from_pandas(series1, chunk_size=3) + mseries2 = series_from_pandas(series2, chunk_size=3) + + r = concat([mseries1, mseries2]) + expected = pd.concat([series1, series2]) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, expected) + + # test different series and ignore_index + mseries1 = series_from_pandas(series1, chunk_size=4) + mseries2 = series_from_pandas(series2, chunk_size=3) + + r = concat([mseries1, mseries2], ignore_index=True) + expected = pd.concat([series1, series2], ignore_index=True) + result = r.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_series_equal(result, expected) + + # test axis=1 + mseries1 = series_from_pandas(series1, chunk_size=3) + mseries2 = series_from_pandas(series2, chunk_size=3) + + r = concat([mseries1, mseries2], axis=1) + expected = pd.concat([series1, series2], axis=1) + result = r.execute(extra_config={"check_shape": False}).fetch() + pd.testing.assert_frame_equal(result, expected) + + # test merge dataframe and series + r = concat([mdf1, mseries2], ignore_index=True) + expected = pd.concat([df1, series2], ignore_index=True) + result = r.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(result, expected) + + # test merge series and dataframe + r = concat([mseries1, mdf2], ignore_index=True) + expected = pd.concat([series1, df2], ignore_index=True) + result = r.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(result, expected) + + # test merge dataframe and series, axis=1 + r = concat([mdf1, mseries2], axis=1) + expected = pd.concat([df1, series2], axis=1) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, expected) + + # test merge series and dataframe, axis=1 + r = concat([mseries1, mdf2], axis=1) + expected = pd.concat([series1, df2], axis=1) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, expected) diff --git a/python/xorbits/_mars/dataframe/missing/__init__.py b/python/xorbits/_mars/dataframe/missing/__init__.py new file mode 100644 index 000000000..5632476fc --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/__init__.py @@ -0,0 +1,51 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .checkna import isna, isnull, notna, notnull +from .dropna import df_dropna, index_dropna, series_dropna +from .fillna import bfill, ffill, fillna, index_fillna +from .replace import df_replace, series_replace + + +def _install(): + from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE + + for cls in DATAFRAME_TYPE + SERIES_TYPE: + setattr(cls, "fillna", fillna) + setattr(cls, "ffill", ffill) + setattr(cls, "pad", ffill) + setattr(cls, "backfill", bfill) + setattr(cls, "bfill", bfill) + setattr(cls, "isna", isna) + setattr(cls, "isnull", isnull) + setattr(cls, "notna", notna) + setattr(cls, "notnull", notnull) + + for cls in DATAFRAME_TYPE: + setattr(cls, "dropna", df_dropna) + setattr(cls, "replace", df_replace) + + for cls in SERIES_TYPE: + setattr(cls, "dropna", series_dropna) + setattr(cls, "replace", series_replace) + + for cls in INDEX_TYPE: + setattr(cls, "fillna", index_fillna) + setattr(cls, "dropna", index_dropna) + setattr(cls, "isna", isna) + setattr(cls, "notna", notna) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/missing/checkna.py b/python/xorbits/_mars/dataframe/missing/checkna.py new file mode 100644 index 000000000..65ab17555 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/checkna.py @@ -0,0 +1,295 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any + +import numpy as np +import pandas as pd + +from ... import dataframe as md +from ... import opcodes +from ... 
import tensor as mt +from ...config import options +from ...core import OutputType +from ...serialization.serializables import BoolField +from ..operands import ( + DATAFRAME_TYPE, + ENTITY_TYPE, + INDEX_TYPE, + SERIES_TYPE, + TENSOR_TYPE, + DataFrameOperand, + DataFrameOperandMixin, +) + + +class DataFrameCheckNA(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.CHECK_NA + + _positive = BoolField("positive") + _use_inf_as_na = BoolField("use_inf_as_na") + + def __init__( + self, positive=None, use_inf_as_na=None, sparse=None, output_types=None, **kw + ): + super().__init__( + _positive=positive, + _use_inf_as_na=use_inf_as_na, + _output_types=output_types, + sparse=sparse, + **kw, + ) + + @property + def positive(self) -> bool: + return self._positive + + @property + def use_inf_as_na(self) -> bool: + return self._use_inf_as_na + + def __call__(self, df): + if isinstance(df, DATAFRAME_TYPE): + self.output_types = [OutputType.dataframe] + elif isinstance(df, SERIES_TYPE): + self.output_types = [OutputType.series] + elif isinstance(df, TENSOR_TYPE) or isinstance(df, INDEX_TYPE): + self.output_types = [OutputType.tensor] + else: + raise TypeError( + f"Expecting mars dataframe, series, index, or tensor, got {type(df)}" + ) + + params = df.params.copy() + if self.output_types[0] == OutputType.dataframe: + params["dtypes"] = pd.Series( + [np.dtype("bool")] * len(df.dtypes), index=df.columns_value.to_pandas() + ) + else: + params["dtype"] = np.dtype("bool") + return self.new_tileable([df], **params) + + @classmethod + def tile(cls, op: "DataFrameCheckNA"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in in_df.chunks: + params = c.params.copy() + if op.output_types[0] == OutputType.dataframe: + params["dtypes"] = pd.Series( + [np.dtype("bool")] * len(c.dtypes), + index=c.columns_value.to_pandas(), + ) + else: + params["dtype"] = np.dtype("bool") + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + params = out_df.params.copy() + params.update(dict(chunks=chunks, nsplits=in_df.nsplits)) + return new_op.new_tileables([in_df], **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameCheckNA"): + in_data = ctx[op.inputs[0].key] + old_use_inf_as_na = pd.get_option("mode.use_inf_as_na") + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + if op.positive: + ctx[op.outputs[0].key] = in_data.isna() + else: + ctx[op.outputs[0].key] = in_data.notna() + finally: + pd.set_option("mode.use_inf_as_na", old_use_inf_as_na) + + +def _from_pandas(obj: Any): + if isinstance(obj, pd.DataFrame): + from ..datasource.dataframe import from_pandas + + return from_pandas(obj) + elif isinstance(obj, pd.Series): + from ..datasource.series import from_pandas + + return from_pandas(obj) + elif isinstance(obj, np.ndarray): + return mt.tensor(obj) + else: + return obj + + +def isna(obj): + """ + Detect missing values. + + Return a boolean same-sized object indicating if the values are NA. + NA values, such as None or :attr:`numpy.NaN`, gets mapped to True + values. + + Everything else gets mapped to False values. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + + Returns + ------- + DataFrame + Mask of bool values for each element in DataFrame that + indicates whether an element is not an NA value. + + See Also + -------- + DataFrame.isnull : Alias of isna. 
+ DataFrame.notna : Boolean inverse of isna. + DataFrame.dropna : Omit axes labels with missing values. + isna : Top-level isna. + + Examples + -------- + Show which entries in a DataFrame are NA. + + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame({'age': [5, 6, np.NaN], + ... 'born': [md.NaT, md.Timestamp('1939-05-27'), + ... md.Timestamp('1940-04-25')], + ... 'name': ['Alfred', 'Batman', ''], + ... 'toy': [None, 'Batmobile', 'Joker']}) + >>> df.execute() + age born name toy + 0 5.0 NaT Alfred None + 1 6.0 1939-05-27 Batman Batmobile + 2 NaN 1940-04-25 Joker + + >>> df.isna().execute() + age born name toy + 0 False True False True + 1 False False False False + 2 True False False False + + Show which entries in a Series are NA. + + >>> ser = md.Series([5, 6, np.NaN]) + >>> ser.execute() + 0 5.0 + 1 6.0 + 2 NaN + dtype: float64 + + >>> ser.isna().execute() + 0 False + 1 False + 2 True + dtype: bool + """ + if isinstance(obj, md.MultiIndex): + raise NotImplementedError("isna is not defined for MultiIndex") + elif isinstance(obj, ENTITY_TYPE): + if isinstance(obj, TENSOR_TYPE): + if options.dataframe.mode.use_inf_as_na: + return ~mt.isfinite(obj) + else: + return mt.isnan(obj) + else: + op = DataFrameCheckNA( + positive=True, use_inf_as_na=options.dataframe.mode.use_inf_as_na + ) + return op(obj) + else: + return _from_pandas(pd.isna(obj)) + + +def notna(obj): + """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to True. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + NA values, such as None or :attr:`numpy.NaN`, get mapped to False + values. + + Returns + ------- + DataFrame + Mask of bool values for each element in DataFrame that + indicates whether an element is not an NA value. + + See Also + -------- + DataFrame.notnull : Alias of notna. + DataFrame.isna : Boolean inverse of notna. + DataFrame.dropna : Omit axes labels with missing values. + notna : Top-level notna. + + Examples + -------- + Show which entries in a DataFrame are not NA. + + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame({'age': [5, 6, np.NaN], + ... 'born': [md.NaT, md.Timestamp('1939-05-27'), + ... md.Timestamp('1940-04-25')], + ... 'name': ['Alfred', 'Batman', ''], + ... 'toy': [None, 'Batmobile', 'Joker']}) + >>> df.execute() + age born name toy + 0 5.0 NaT Alfred None + 1 6.0 1939-05-27 Batman Batmobile + 2 NaN 1940-04-25 Joker + + >>> df.notna().execute() + age born name toy + 0 True False True False + 1 True True True True + 2 False True True True + + Show which entries in a Series are not NA. 
+ + >>> ser = md.Series([5, 6, np.NaN]) + >>> ser.execute() + 0 5.0 + 1 6.0 + 2 NaN + dtype: float64 + + >>> ser.notna().execute() + 0 True + 1 True + 2 False + dtype: bool + """ + if isinstance(obj, md.MultiIndex): + raise NotImplementedError("isna is not defined for MultiIndex") + elif isinstance(obj, ENTITY_TYPE): + if isinstance(obj, TENSOR_TYPE): + if options.dataframe.mode.use_inf_as_na: + return mt.isfinite(obj) + else: + return ~mt.isnan(obj) + else: + op = DataFrameCheckNA( + positive=False, use_inf_as_na=options.dataframe.mode.use_inf_as_na + ) + return op(obj) + else: + return _from_pandas(pd.notna(obj)) + + +isnull = isna +notnull = notna diff --git a/python/xorbits/_mars/dataframe/missing/dropna.py b/python/xorbits/_mars/dataframe/missing/dropna.py new file mode 100644 index 000000000..0e595c572 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/dropna.py @@ -0,0 +1,451 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...config import options +from ...core import OutputType, recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int32Field, StringField +from ...utils import no_default, pd_release_version +from ..align import align_dataframe_series +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, validate_axis + +_drop_na_enable_no_default = pd_release_version[:2] >= (1, 5) + + +class DataFrameDropNA(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.DROP_NA + + _axis = AnyField("axis") + _how = StringField("how") + _thresh = Int32Field("thresh") + _subset = AnyField("subset") + _use_inf_as_na = BoolField("use_inf_as_na") + + # when True, dropna will be called on the input, + # otherwise non-nan counts will be used + _drop_directly = BoolField("drop_directly") + # size of subset, used when how == 'any' + _subset_size = Int32Field("subset_size") + + def __init__( + self, + axis=None, + how=None, + thresh=None, + subset=None, + use_inf_as_na=None, + drop_directly=None, + subset_size=None, + sparse=None, + output_types=None, + **kw + ): + super().__init__( + _axis=axis, + _how=how, + _thresh=thresh, + _subset=subset, + _use_inf_as_na=use_inf_as_na, + _drop_directly=drop_directly, + _subset_size=subset_size, + _output_types=output_types, + sparse=sparse, + **kw + ) + + @property + def axis(self) -> int: + return self._axis + + @property + def how(self) -> str: + return self._how + + @property + def thresh(self) -> int: + return self._thresh + + @property + def subset(self) -> list: + return self._subset + + @property + def use_inf_as_na(self) -> bool: + return self._use_inf_as_na + + @property + def drop_directly(self) -> bool: + return self._drop_directly + + @property + def subset_size(self) -> int: + return self._subset_size + + def __call__(self, df): + new_shape = list(df.shape) + new_shape[0] = np.nan + + params = df.params.copy() + params["index_value"] = 
parse_index(None, df.key, df.index_value.key) + params["shape"] = tuple(new_shape) + return self.new_tileable([df], **params) + + @classmethod + def _tile_drop_directly(cls, op: "DataFrameDropNA"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in in_df.chunks: + new_shape = list(c.shape) + new_shape[0] = np.nan + + params = c.params.copy() + params["index_value"] = parse_index(None, c.key, c.index_value.key) + params["shape"] = tuple(new_shape) + + new_op = op.copy().reset_key() + new_op._drop_directly = True + chunks.append(new_op.new_chunk([c], **params)) + + new_nsplits = list(in_df.nsplits) + new_nsplits[0] = (np.nan,) * len(in_df.nsplits[0]) + + new_op = op.copy().reset_key() + params = out_df.params.copy() + params.update(dict(chunks=chunks, nsplits=new_nsplits)) + return new_op.new_tileables([in_df], **params) + + @classmethod + def tile(cls, op: "DataFrameDropNA"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + if not _drop_na_enable_no_default: + op._how = None if op.how is no_default else op.how + op._thresh = None if op.thresh is no_default else op.thresh + + # series tiling will go here + if len(in_df.chunk_shape) == 1 or in_df.chunk_shape[1] == 1: + return cls._tile_drop_directly(op) + + subset_df = in_df + if op.subset: + subset_df = in_df[op.subset] + count_series = yield from recursive_tile( + subset_df.agg("count", axis=1, _use_inf_as_na=op.use_inf_as_na) + ) + + nsplits, out_shape, left_chunks, right_chunks = align_dataframe_series( + in_df, count_series, axis=0 + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape)) + + out_chunks = [] + for out_idx, df_chunk in zip(out_chunk_indexes, left_chunks): + series_chunk = right_chunks[out_idx[0]] + kw = dict( + shape=(np.nan, nsplits[1][out_idx[1]]), + dtypes=df_chunk.dtypes, + index_value=df_chunk.index_value, + columns_value=df_chunk.columns_value, + ) + + new_op = op.copy().reset_key() + new_op._drop_directly = False + new_op._subset_size = len(op.subset) if op.subset else len(in_df.dtypes) + out_chunks.append( + new_op.new_chunk([df_chunk, series_chunk], index=out_idx, **kw) + ) + + new_op = op.copy().reset_key() + params = out_df.params.copy() + new_nsplits = list(tuple(ns) for ns in nsplits) + new_nsplits[0] = (np.nan,) * len(new_nsplits[0]) + params.update(dict(nsplits=tuple(new_nsplits), chunks=out_chunks)) + return new_op.new_tileables(op.inputs, **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameDropNA"): + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + + in_data = ctx[op.inputs[0].key] + if op.drop_directly: + if isinstance(in_data, pd.DataFrame): + result = in_data.dropna( + axis=op.axis, how=op.how, thresh=op.thresh, subset=op.subset + ) + elif isinstance(in_data, pd.Series): + result = in_data.dropna(axis=op.axis, how=op.how) + else: + result = in_data.dropna(how=op.how) + ctx[op.outputs[0].key] = result + return + + in_counts = ctx[op.inputs[1].key] + if op.how == "all": + in_counts = in_counts[in_counts > 0] + else: + if op.thresh is None or op.thresh is no_default: + thresh = op.subset_size + else: # pragma: no cover + thresh = op.thresh + in_counts = in_counts[in_counts >= thresh] + + ctx[op.outputs[0].key] = in_data.reindex(in_counts.index) + finally: + pd.reset_option("mode.use_inf_as_na") + + +def df_dropna( + df, axis=0, how=no_default, thresh=no_default, subset=None, inplace=False +): + """ + Remove missing values. 
+ + See the :ref:`User Guide ` for more on which values are + considered missing, and how to work with missing data. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + Determine if rows or columns which contain missing values are + removed. + + * 0, or 'index' : Drop rows which contain missing values. + * 1, or 'columns' : Drop columns which contain missing value. + + .. versionchanged:: 1.0.0 + + Pass tuple or list to drop on multiple axes. + Only a single axis is allowed. + + how : {'any', 'all'}, default 'any' + Determine if row or column is removed from DataFrame, when we have + at least one NA or all NA. + + * 'any' : If any NA values are present, drop that row or column. + * 'all' : If all values are NA, drop that row or column. + + thresh : int, optional + Require that many non-NA values. + subset : array-like, optional + Labels along other axis to consider, e.g. if you are dropping rows + these would be a list of columns to include. + inplace : bool, default False + If True, do operation inplace and return None. + + Returns + ------- + DataFrame + DataFrame with NA entries dropped from it. + + See Also + -------- + DataFrame.isna: Indicate missing values. + DataFrame.notna : Indicate existing (non-missing) values. + DataFrame.fillna : Replace missing values. + Series.dropna : Drop missing values. + Index.dropna : Drop missing indices. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], + ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], + ... "born": [md.NaT, md.Timestamp("1940-04-25"), + ... md.NaT]}) + >>> df.execute() + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Drop the rows where at least one element is missing. + + >>> df.dropna().execute() + name toy born + 1 Batman Batmobile 1940-04-25 + + Drop the rows where all elements are missing. + + >>> df.dropna(how='all').execute() + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Keep only the rows with at least 2 non-NA values. + + >>> df.dropna(thresh=2).execute() + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Define in which columns to look for missing values. + + >>> df.dropna(subset=['name', 'born']).execute() + name toy born + 1 Batman Batmobile 1940-04-25 + + Keep the DataFrame with valid entries in the same variable. + + >>> df.dropna(inplace=True) + >>> df.execute() + name toy born + 1 Batman Batmobile 1940-04-25 + """ + axis = validate_axis(axis, df) + if axis != 0: + raise NotImplementedError("Does not support dropna on DataFrame when axis=1") + if ( + _drop_na_enable_no_default + and (how is not no_default) + and (thresh is not no_default) + ): + raise TypeError( + "You cannot set both the how and thresh arguments at the same time." + ) + if thresh is no_default and how is no_default: + how = "any" + + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameDropNA( + axis=axis, + how=how, + thresh=thresh, + subset=subset, + output_types=[OutputType.dataframe], + use_inf_as_na=use_inf_as_na, + ) + out_df = op(df) + if inplace: + df.data = out_df.data + else: + return out_df + + +def series_dropna(series, axis=0, inplace=False, how=None): + """ + Return a new Series with missing values removed. + + See the :ref:`User Guide ` for more on which values are + considered missing, and how to work with missing data. 
+ + Parameters + ---------- + axis : {0 or 'index'}, default 0 + There is only one axis to drop values from. + inplace : bool, default False + If True, do operation inplace and return None. + how : str, optional + Not in use. Kept for compatibility. + + Returns + ------- + Series + Series with NA entries dropped from it. + + See Also + -------- + Series.isna: Indicate missing values. + Series.notna : Indicate existing (non-missing) values. + Series.fillna : Replace missing values. + DataFrame.dropna : Drop rows or columns which contain NA values. + Index.dropna : Drop missing indices. + + Examples + -------- + >>> import mars.dataframe as md + >>> ser = md.Series([1., 2., np.nan]) + >>> ser.execute() + 0 1.0 + 1 2.0 + 2 NaN + dtype: float64 + + Drop NA values from a Series. + + >>> ser.dropna().execute() + 0 1.0 + 1 2.0 + dtype: float64 + + Keep the Series with valid entries in the same variable. + + >>> ser.dropna(inplace=True) + >>> ser.execute() + 0 1.0 + 1 2.0 + dtype: float64 + + Empty strings are not considered NA values. ``None`` is considered an + NA value. + + >>> ser = md.Series([np.NaN, 2, md.NaT, '', None, 'I stay']) + >>> ser.execute() + 0 NaN + 1 2 + 2 NaT + 3 + 4 None + 5 I stay + dtype: object + >>> ser.dropna().execute() + 1 2 + 3 + 5 I stay + dtype: object + """ + axis = validate_axis(axis, series) + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameDropNA( + axis=axis, + how=how, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + ) + out_series = op(series) + if inplace: + series.data = out_series.data + else: + return out_series + + +def index_dropna(index, how="any"): + """ + Return Index without NA/NaN values. + + Parameters + ---------- + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. + + Returns + ------- + Index + """ + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameDropNA( + axis=0, how=how, output_types=[OutputType.index], use_inf_as_na=use_inf_as_na + ) + return op(index) diff --git a/python/xorbits/_mars/dataframe/missing/fillna.py b/python/xorbits/_mars/dataframe/missing/fillna.py new file mode 100644 index 000000000..73c46cbc9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/fillna.py @@ -0,0 +1,678 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...config import options +from ...core import ENTITY_TYPE, Entity, OutputType, get_output_types +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, BoolField, Int64Field, StringField +from ..align import ( + align_dataframe_dataframe, + align_dataframe_series, + align_series_series, +) +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import validate_axis + + +class FillNA(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.FILL_NA + + _value = AnyField( + "value", on_serialize=lambda x: x.data if isinstance(x, Entity) else x + ) + _method = StringField("method") + _axis = AnyField("axis") + _limit = Int64Field("limit") + _downcast = AnyField("downcast") + _use_inf_as_na = BoolField("use_inf_as_na") + + _output_limit = Int64Field("output_limit") + + def __init__( + self, + value=None, + method=None, + axis=None, + limit=None, + downcast=None, + use_inf_as_na=None, + output_types=None, + output_limit=None, + **kw + ): + super().__init__( + _value=value, + _method=method, + _axis=axis, + _limit=limit, + _downcast=downcast, + _use_inf_as_na=use_inf_as_na, + _output_types=output_types, + _output_limit=output_limit, + **kw + ) + + @property + def value(self): + return self._value + + @property + def method(self): + return self._method + + @property + def axis(self): + return self._axis + + @property + def limit(self): + return self._limit + + @property + def downcast(self): + return self._downcast + + @property + def use_inf_as_na(self): + return self._use_inf_as_na + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._method is None and len(inputs) > 1: + self._value = self._inputs[1] + + @property + def output_limit(self): + return self._output_limit or 1 + + @staticmethod + def _get_first_slice(op, df, end): + if op.method == "bfill": + if op.output_types[0] == OutputType.series: + return df.iloc[:end] + else: + if op.axis == 1: + return df.iloc[:, :end] + else: + return df.iloc[:end, :] + else: + if op.output_types[0] == OutputType.series: + return df.iloc[-end:] + else: + if op.axis == 1: + return df.iloc[:, -end:] + else: + return df.iloc[-end:, :] + + @classmethod + def _execute_map(cls, ctx, op): + input_data = ctx[op.inputs[0].key] + limit = op.limit + axis = op.axis + method = op.method + + filled = input_data.fillna( + method=method, axis=axis, limit=limit, downcast=op.downcast + ) + ctx[op.outputs[0].key] = cls._get_first_slice(op, filled, 1) + del filled + + @classmethod + def _execute_combine(cls, ctx, op): + axis = op.axis + method = op.method + limit = op.limit + + input_data = ctx[op.inputs[0].key] + if limit is not None: + n_summaries = (len(op.inputs) - 1) // 2 + summaries = [ctx[inp.key] for inp in op.inputs[1 : 1 + n_summaries]] + else: + summaries = [ctx[inp.key] for inp in op.inputs[1:]] + + if not summaries: + ctx[op.outputs[0].key] = input_data.fillna( + method=method, axis=axis, limit=limit, downcast=op.downcast + ) + return + + valid_summary = cls._get_first_slice( + op, pd.concat(summaries, axis=axis).fillna(method=method, axis=axis), 1 + ) + + if method == "bfill": + concat_df = pd.concat([input_data, valid_summary], axis=axis) + else: + concat_df = pd.concat([valid_summary, input_data], axis=axis) + + concat_df.fillna( + method=method, axis=axis, inplace=True, limit=limit, downcast=op.downcast + ) + ctx[op.outputs[0].key] = cls._get_first_slice(op, concat_df, -1) + + @classmethod + def 
execute(cls, ctx, op): + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + else: + input_data = ctx[op.inputs[0].key] + value = getattr(op, "value", None) + if isinstance(op.value, ENTITY_TYPE): + value = ctx[op.value.key] + if not isinstance(input_data, pd.Index): + ctx[op.outputs[0].key] = input_data.fillna( + value=value, + method=op.method, + axis=op.axis, + limit=op.limit, + downcast=op.downcast, + ) + else: + ctx[op.outputs[0].key] = input_data.fillna( + value=value, downcast=op.downcast + ) + finally: + pd.reset_option("mode.use_inf_as_na") + + @classmethod + def _tile_one_by_one(cls, op): + in_df = op.inputs[0] + in_value_df = op.value if isinstance(op.value, ENTITY_TYPE) else None + df = op.outputs[0] + + new_chunks = [] + for c in in_df.chunks: + inputs = [c] if in_value_df is None else [c, in_value_df.chunks[0]] + kw = c.params + new_op = op.copy().reset_key() + new_chunks.append(new_op.new_chunk(inputs, **kw)) + + kw = df.params.copy() + kw.update(dict(chunks=new_chunks, nsplits=in_df.nsplits)) + new_op = op.copy().reset_key() + return new_op.new_tileables(op.inputs, **kw) + + @classmethod + def _build_combine(cls, op, input_chunks, summary_chunks, idx, is_forward=True): + c = input_chunks[idx] + + summaries_to_concat = [] + + idx_range = list( + range(idx) if is_forward else range(idx + 1, len(summary_chunks)) + ) + for i in idx_range: + summaries_to_concat.append(summary_chunks[i]) + + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.combine + + chunks_to_concat = [c] + summaries_to_concat + return new_chunk_op.new_chunk(chunks_to_concat, **c.params) + + @classmethod + def _tile_directional_dataframe(cls, op): + in_df = op.inputs[0] + df = op.outputs[0] + is_forward = op.method == "ffill" + + n_rows, n_cols = in_df.chunk_shape + + # map to get individual results and summaries + src_chunks = np.empty(in_df.chunk_shape, dtype=object) + summary_chunks = np.empty(in_df.chunk_shape, dtype=object) + for c in in_df.chunks: + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.map + if op.axis == 1: + summary_shape = (c.shape[0], 1) + else: + summary_shape = (1, c.shape[1]) + src_chunks[c.index] = c + summary_chunks[c.index] = new_chunk_op.new_chunk( + [c], shape=summary_shape, dtypes=df.dtypes + ) + + # combine summaries into results + output_chunk_array = np.empty(in_df.chunk_shape, dtype=object) + if op.axis == 1: + for row in range(n_rows): + row_src = src_chunks[row, :] + row_summaries = summary_chunks[row, :] + for col in range(n_cols): + output_chunk_array[row, col] = cls._build_combine( + op, row_src, row_summaries, col, is_forward + ) + else: + for col in range(n_cols): + col_src = src_chunks[:, col] + col_summaries = summary_chunks[:, col] + for row in range(n_rows): + output_chunk_array[row, col] = cls._build_combine( + op, col_src, col_summaries, row, is_forward + ) + + output_chunks = list(output_chunk_array.reshape((n_rows * n_cols,))) + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, + shape=in_df.shape, + nsplits=in_df.nsplits, + chunks=output_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_directional_series(cls, op): + in_series = op.inputs[0] + series = op.outputs[0] + forward = op.method == "ffill" + + # map to get individual results and summaries + summary_chunks = 
np.empty(in_series.chunk_shape, dtype=object) + for c in in_series.chunks: + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.map + summary_chunks[c.index] = new_chunk_op.new_chunk( + [c], shape=(1,), dtype=series.dtype + ) + + # combine summaries into results + output_chunks = [ + cls._build_combine(op, in_series.chunks, summary_chunks, i, forward) + for i in range(len(in_series.chunks)) + ] + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, + shape=in_series.shape, + nsplits=in_series.nsplits, + chunks=output_chunks, + dtype=series.dtype, + index_value=series.index_value, + ) + + @classmethod + def _tile_both_dataframes(cls, op): + in_df = op.inputs[0] + in_value = op.inputs[1] + df = op.outputs[0] + + nsplits, out_shape, left_chunks, right_chunks = align_dataframe_dataframe( + in_df, in_value + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape[0])) + + out_chunks = [] + for idx, left_chunk, right_chunk in zip( + out_chunk_indexes, left_chunks, right_chunks + ): + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([left_chunk, right_chunk], shape=(np.nan, np.nan), index=idx) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_dataframe_series(cls, op): + left, right = op.inputs[0], op.inputs[1] + df = op.outputs[0] + + nsplits, out_shape, left_chunks, right_chunks = align_dataframe_series( + left, right, axis=1 + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape)) + + out_chunks = [] + for out_idx, df_chunk in zip(out_chunk_indexes, left_chunks): + series_chunk = right_chunks[out_idx[1]] + kw = dict( + shape=(nsplits[0][out_idx[0]], nsplits[1][out_idx[1]]), + index_value=df_chunk.index_value, + columns_value=df_chunk.columns_value, + ) + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([df_chunk, series_chunk], index=out_idx, **kw) + ) + out_chunks.append(out_chunk) + + new_op = op.copy().reset_key() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_both_series(cls, op): + left, right = op.inputs[0], op.inputs[1] + df = op.outputs[0] + + nsplits, out_shape, left_chunks, right_chunks = align_series_series(left, right) + + out_chunks = [] + for idx, left_chunk, right_chunk in zip( + range(out_shape[0]), left_chunks, right_chunks + ): + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + [left_chunk, right_chunk], + index_value=left_chunk.index_value, + shape=(np.nan,), + index=(idx,), + ) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + if len(in_df.chunks) == 1 and ( + not isinstance(op.value, ENTITY_TYPE) or len(op.value.chunks) == 1 + ): + return cls._tile_one_by_one(op) + elif op.method is not None: + if op.output_types[0] == OutputType.dataframe: + return cls._tile_directional_dataframe(op) + else: + return cls._tile_directional_series(op) + elif not isinstance(op.value, ENTITY_TYPE): + return 
cls._tile_one_by_one(op) + elif isinstance(op.value, DATAFRAME_TYPE): + return cls._tile_both_dataframes(op) + elif op.output_types[0] == OutputType.dataframe: + return cls._tile_dataframe_series(op) + else: + return cls._tile_both_series(op) + + def __call__(self, a, value_df=None): + method = getattr(self, "method", None) + if method == "backfill": + method = "bfill" + elif method == "pad": + method = "ffill" + self._method = method + axis = getattr(self, "axis", None) or 0 + self._axis = validate_axis(axis, a) + + inputs = [a] + if value_df is not None: + inputs.append(value_df) + if isinstance(a, DATAFRAME_TYPE): + return self.new_dataframe( + inputs, + shape=a.shape, + dtypes=a.dtypes, + index_value=a.index_value, + columns_value=a.columns_value, + ) + elif isinstance(a, SERIES_TYPE): + return self.new_series( + inputs, + shape=a.shape, + dtype=a.dtype, + index_value=a.index_value, + name=a.name, + ) + else: + return self.new_index( + inputs, + shape=a.shape, + dtype=a.dtype, + index_value=a.index_value, + name=a.name, + names=a.names, + ) + + +def fillna( + df, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None +): + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, dict, Series, or DataFrame + Value to use to fill holes (e.g. 0), alternately a + dict/Series/DataFrame of values specifying which value to use for + each index (for a Series) or column (for a DataFrame). Values not + in the dict/Series/DataFrame will not be filled. This value cannot + be a list. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use next valid observation to fill gap. + axis : {0 or 'index', 1 or 'columns'} + Axis along which to fill missing values. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + Returns + ------- + DataFrame or None + Object with missing values filled or None if ``inplace=True``. + + See Also + -------- + interpolate : Fill NaN values using interpolation. + reindex : Conform object to new index. + asfreq : Convert TimeSeries to specified frequency. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame([[mt.nan, 2, mt.nan, 0], + ... [3, 4, mt.nan, 1], + ... [mt.nan, mt.nan, mt.nan, 5], + ... [mt.nan, 3, mt.nan, 4]], + ... columns=list('ABCD')) + >>> df.execute() + A B C D + 0 NaN 2.0 NaN 0 + 1 3.0 4.0 NaN 1 + 2 NaN NaN NaN 5 + 3 NaN 3.0 NaN 4 + + Replace all NaN elements with 0s. + + >>> df.fillna(0).execute() + A B C D + 0 0.0 2.0 0.0 0 + 1 3.0 4.0 0.0 1 + 2 0.0 0.0 0.0 5 + 3 0.0 3.0 0.0 4 + + We can also propagate non-null values forward or backward. 
+ + >>> df.fillna(method='ffill').execute() + A B C D + 0 NaN 2.0 NaN 0 + 1 3.0 4.0 NaN 1 + 2 3.0 4.0 NaN 5 + 3 3.0 3.0 NaN 4 + + Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1, + 2, and 3 respectively. + + >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3} + >>> df.fillna(value=values).execute() + A B C D + 0 0.0 2.0 2.0 0 + 1 3.0 4.0 2.0 1 + 2 0.0 1.0 2.0 5 + 3 0.0 3.0 2.0 4 + """ + if value is None and method is None: + raise ValueError("Must specify a fill 'value' or 'method'.") + elif value is not None and method is not None: + raise ValueError("Cannot specify both 'value' and 'method'.") + + if isinstance(df, SERIES_TYPE) and isinstance( + value, (DATAFRAME_TYPE, pd.DataFrame) + ): + raise ValueError( + '"value" parameter must be a scalar, dict or Series, but you passed a "%s"' + % type(value).__name__ + ) + + if downcast is not None: + raise NotImplementedError( + 'Currently argument "downcast" is not implemented yet' + ) + if limit is not None: + raise NotImplementedError('Currently argument "limit" is not implemented yet') + + if isinstance(value, ENTITY_TYPE): + value, value_df = None, value + else: + value_df = None + + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = FillNA( + value=value, + method=method, + axis=axis, + limit=limit, + downcast=downcast, + use_inf_as_na=use_inf_as_na, + output_types=get_output_types(df), + ) + out_df = op(df, value_df=value_df) + if inplace: + df.data = out_df.data + else: + return out_df + + +def ffill(df, axis=None, inplace=False, limit=None, downcast=None): + """ + Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. + + Returns + ------- + {klass} or None + Object with missing values filled or None if ``inplace=True``. + """ + return fillna( + df, method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast + ) + + +def bfill(df, axis=None, inplace=False, limit=None, downcast=None): + """ + Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. + + Returns + ------- + {klass} or None + Object with missing values filled or None if ``inplace=True``. + """ + return fillna( + df, method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast + ) + + +def index_fillna(index, value=None, downcast=None): + """ + Fill NA/NaN values with the specified value. + + Parameters + ---------- + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + Returns + ------- + Index + + See Also + -------- + DataFrame.fillna : Fill NaN values of a DataFrame. + Series.fillna : Fill NaN Values of a Series. + """ + if isinstance(value, (list, pd.Series, SERIES_TYPE)): + raise ValueError("'value' must be a scalar, passed: %s" % type(value)) + + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = FillNA( + value=value, + downcast=downcast, + use_inf_as_na=use_inf_as_na, + output_types=get_output_types(index), + ) + return op(index) diff --git a/python/xorbits/_mars/dataframe/missing/replace.py b/python/xorbits/_mars/dataframe/missing/replace.py new file mode 100644 index 000000000..edac7bcbc --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/replace.py @@ -0,0 +1,637 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, FieldTypes, Int32Field, ListField +from ...utils import no_default +from ..operands import ( + SERIES_CHUNK_TYPE, + SERIES_TYPE, + DataFrameOperand, + DataFrameOperandMixin, +) +from ..utils import build_df, build_series, parse_index + + +class DataFrameReplace(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.REPLACE + + _to_replace = AnyField("to_replace") + _value = AnyField("value") + _limit = Int32Field("limit") + _regex = AnyField("regex") + _method = AnyField("method") + + _fill_chunks = ListField("fill_chunks", FieldTypes.key) + + def __init__( + self, + to_replace=None, + value=None, + limit=None, + regex=None, + method=None, + fill_chunks=None, + **kw + ): + super().__init__( + _to_replace=to_replace, + _value=value, + _limit=limit, + _regex=regex, + _method=method, + _fill_chunks=fill_chunks, + **kw + ) + + @property + def to_replace(self): + return self._to_replace + + @property + def value(self): + return self._value + + @property + def limit(self): + return self._limit + + @property + def regex(self): + return self._regex + + @property + def method(self): + return self._method + + @property + def fill_chunks(self): + return self._fill_chunks + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(inputs) + next(input_iter) + if isinstance(self.to_replace, (SERIES_TYPE, SERIES_CHUNK_TYPE)): + self._to_replace = next(input_iter) + if isinstance(self.value, (SERIES_TYPE, SERIES_CHUNK_TYPE)): + self._value = next(input_iter) + self._fill_chunks = list(input_iter) + + def __call__(self, df_or_series): + inputs = [df_or_series] + mock_obj = ( + build_df(df_or_series) + if df_or_series.ndim == 2 + else build_series(df_or_series) + ) + + if isinstance(self.to_replace, SERIES_TYPE): + mock_to_replace = build_series(self.to_replace) + inputs.append(self.to_replace) + else: + mock_to_replace = self.to_replace + + if isinstance(self.value, SERIES_TYPE): + mock_value = build_series(self.value) + inputs.append(self.value) + else: + mock_value = self.value + + mock_result = mock_obj.replace( + mock_to_replace, mock_value, regex=self.regex, method=self.method + ) + + if df_or_series.ndim == 2: + return self.new_dataframe( + inputs, + shape=df_or_series.shape, + dtypes=mock_result.dtypes, + index_value=df_or_series.index_value, + columns_value=df_or_series.columns_value, + ) + else: + return self.new_series( + inputs, + shape=df_or_series.shape, + dtype=mock_result.dtype, + index_value=df_or_series.index_value, + ) + + @classmethod + def _build_result_chunk( + cls, op: "DataFrameReplace", in_chunks, with_fill=False, stage=None + ): + in_obj = op.inputs[0] + out_obj = op.outputs[0] + in_chunk = in_chunks[0] + + kw = in_chunk.params + new_shape = list(in_chunk.shape) + if with_fill: + new_shape[0] = 1 + + if 
in_obj.ndim == 2: + new_dtypes = out_obj.dtypes[in_chunk.dtypes.index] + kw.update( + dict( + dtypes=new_dtypes, + shape=tuple(new_shape), + column_values=parse_index(new_dtypes.index), + ) + ) + else: + kw.update(dict(dtype=out_obj.dtype, shape=tuple(new_shape))) + + new_op = op.copy().reset_key() + new_op.stage = stage + return new_op.new_chunk(in_chunks, **kw) + + @classmethod + def tile(cls, op: "DataFrameReplace"): + in_obj = op.inputs[0] + out_obj = op.outputs[0] + + chunk_inputs_ex = [] + tileable_inputs_ex = [] + to_replace = op.to_replace + if isinstance(to_replace, SERIES_TYPE): + to_replace = yield from recursive_tile( + to_replace.rechunk((to_replace.shape[0],)) + ) + chunk_inputs_ex.append(to_replace.chunks[0]) + tileable_inputs_ex.append(to_replace) + value = op.value + if isinstance(value, SERIES_TYPE): + value = yield from recursive_tile(value.rechunk((value.shape[0],))) + chunk_inputs_ex.append(value.chunks[0]) + tileable_inputs_ex.append(value) + + # fill methods only available when `to_replace` is a scalar, list or tuple + # and `value` is no_default. + with_fill = ( + op.value is no_default + and not isinstance(op.to_replace, dict) + and op.method is not None + ) + + chunks = [] + if not with_fill: + for in_chunk in in_obj.chunks: + inputs = [in_chunk] + chunk_inputs_ex + chunks.append( + cls._build_result_chunk( + op, inputs, with_fill, OperandStage.map if with_fill else None + ) + ) + else: + map_array = np.empty(out_obj.shape, dtype=object) + for in_chunk in in_obj.chunks: + inputs = [in_chunk] + chunk_inputs_ex + map_array[in_chunk.index] = cls._build_result_chunk( + op, inputs, with_fill, OperandStage.map if with_fill else None + ) + + for in_chunk in in_obj.chunks: + if op.method in (no_default, "pad", "ffill"): + slc = slice(0, in_chunk.index[0]) + else: + slc = slice(in_chunk.index[0] + 1, in_obj.chunk_shape[0]) + + if in_chunk.ndim == 2: + append_chunks = list(map_array[slc, in_chunk.index[1]]) + else: + append_chunks = list(map_array[slc]) + + inputs = [in_chunk] + chunk_inputs_ex + append_chunks + chunks.append( + cls._build_result_chunk(op, inputs, False, OperandStage.combine) + ) + + inputs = [in_obj] + tileable_inputs_ex + new_op = op.copy().reset_key() + return new_op.new_tileables( + inputs, chunks=chunks, nsplits=in_obj.nsplits, **out_obj.params + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameReplace"): + in_data = ctx[op.inputs[0].key] + to_replace = op.to_replace + if isinstance(to_replace, SERIES_CHUNK_TYPE): + to_replace = ctx[to_replace.key] + value = op.value + if isinstance(value, SERIES_CHUNK_TYPE): + value = ctx[value.key] + + if not op.fill_chunks: + concat_data = in_data + else: + to_concat = [ctx[c.key] for c in op.fill_chunks] + if op.method in (no_default, "pad", "ffill"): + to_concat += [in_data] + else: + to_concat = [in_data] + to_concat + concat_data = pd.concat(to_concat) + + replace_args = (to_replace,) + if value is not no_default: + replace_args += (value,) + replace_kwargs = dict(regex=op.regex, method=op.method, limit=op.limit) + replace_kwargs = { + k: v for k, v in replace_kwargs.items() if v is not no_default + } + + result = concat_data.replace(*replace_args, **replace_kwargs) + del concat_data + + if op.stage == OperandStage.map: + to_slice = op.outputs[0].shape[0] + if op.method in (no_default, "pad", "ffill"): + result = result.iloc[-to_slice:] + else: + result = result.iloc[:to_slice] + else: + to_remove = len(result) - len(in_data) + if to_remove > 0: + if op.method in (no_default, "pad", "ffill"): + result = 
result.iloc[to_remove:] + else: + result = result.iloc[:-to_remove] + ctx[op.outputs[0].key] = result + + +_fun_doc = """ +Replace values given in `to_replace` with `value`. + +Values of the #obj_type# are replaced with other values dynamically. +This differs from updating with ``.loc`` or ``.iloc``, which require +you to specify a location to update with some value. + +Parameters +---------- +to_replace : str, regex, list, dict, Series, int, float, or None + How to find the values that will be replaced. + + * numeric, str or regex: + + - numeric: numeric values equal to `to_replace` will be + replaced with `value` + - str: string exactly matching `to_replace` will be replaced + with `value` + - regex: regexs matching `to_replace` will be replaced with + `value` + + * list of str, regex, or numeric: + + - First, if `to_replace` and `value` are both lists, they + **must** be the same length. + - Second, if ``regex=True`` then all of the strings in **both** + lists will be interpreted as regexs otherwise they will match + directly. This doesn't matter much for `value` since there + are only a few possible substitution regexes you can use. + - str, regex and numeric rules apply as above. + + * dict: + + - Dicts can be used to specify different replacement values + for different existing values. For example, + ``{'a': 'b', 'y': 'z'}`` replaces the value 'a' with 'b' and + 'y' with 'z'. To use a dict in this way the `value` + parameter should be `None`. + - For a DataFrame a dict can specify that different values + should be replaced in different columns. For example, + ``{'a': 1, 'b': 'z'}`` looks for the value 1 in column 'a' + and the value 'z' in column 'b' and replaces these values + with whatever is specified in `value`. The `value` parameter + should not be ``None`` in this case. You can treat this as a + special case of passing two lists except that you are + specifying the column to search in. + - For a DataFrame nested dictionaries, e.g., + ``{'a': {'b': np.nan}}``, are read as follows: look in column + 'a' for the value 'b' and replace it with NaN. The `value` + parameter should be ``None`` to use a nested dict in this + way. You can nest regular expressions as well. Note that + column names (the top-level dictionary keys in a nested + dictionary) **cannot** be regular expressions. + + * None: + + - This means that the `regex` argument must be a string, + compiled regular expression, or list, dict, ndarray or + Series of such elements. If `value` is also ``None`` then + this **must** be a nested dictionary or Series. + + See the examples section for examples of each of these. +value : scalar, dict, list, str, regex, default None + Value to replace any values matching `to_replace` with. + For a DataFrame a dict of values can be used to specify which + value to use for each column (columns not in the dict will not be + filled). Regular expressions, strings and lists or dicts of such + objects are also allowed. +inplace : bool, default False + If True, in place. Note: this will modify any + other views on this object (e.g. a column from a DataFrame). + Returns the caller if this is True. +limit : int, default None + Maximum size gap to forward or backward fill. +regex : bool or same types as `to_replace`, default False + Whether to interpret `to_replace` and/or `value` as regular + expressions. If this is ``True`` then `to_replace` *must* be a + string. 
Alternatively, this could be a regular expression or a + list, dict, or array of regular expressions in which case + `to_replace` must be ``None``. +method : {'pad', 'ffill', 'bfill', `None`} + The method to use when for replacement, when `to_replace` is a + scalar, list or tuple and `value` is ``None``. + +Returns +------- +#obj_type# + Object after replacement. + +Raises +------ +AssertionError + * If `regex` is not a ``bool`` and `to_replace` is not + ``None``. +TypeError + * If `to_replace` is a ``dict`` and `value` is not a ``list``, + ``dict``, ``ndarray``, or ``Series`` + * If `to_replace` is ``None`` and `regex` is not compilable + into a regular expression or is a list, dict, ndarray, or + Series. + * When replacing multiple ``bool`` or ``datetime64`` objects and + the arguments to `to_replace` does not match the type of the + value being replaced +ValueError + * If a ``list`` or an ``ndarray`` is passed to `to_replace` and + `value` but they are not the same length. + +See Also +-------- +#obj_type#.fillna : Fill NA values. +#obj_type#.where : Replace values based on boolean condition. +Series.str.replace : Simple string replacement. + +Notes +----- +* Regex substitution is performed under the hood with ``re.sub``. The + rules for substitution for ``re.sub`` are the same. +* Regular expressions will only substitute on strings, meaning you + cannot provide, for example, a regular expression matching floating + point numbers and expect the columns in your frame that have a + numeric dtype to be matched. However, if those floating point + numbers *are* strings, then you can do this. +* This method has *a lot* of options. You are encouraged to experiment + and play with this method to gain intuition about how it works. +* When dict is used as the `to_replace` value, it is like + key(s) in the dict are the to_replace part and + value(s) in the dict are the value parameter. + +Examples +-------- + +**Scalar `to_replace` and `value`** + +>>> import mars.tensor as mt +>>> import mars.dataframe as md +>>> s = md.Series([0, 1, 2, 3, 4]) +>>> s.replace(0, 5).execute() +0 5 +1 1 +2 2 +3 3 +4 4 +dtype: int64 + +>>> df = md.DataFrame({'A': [0, 1, 2, 3, 4], +... 'B': [5, 6, 7, 8, 9], +... 'C': ['a', 'b', 'c', 'd', 'e']}) +>>> df.replace(0, 5).execute() + A B C +0 5 5 a +1 1 6 b +2 2 7 c +3 3 8 d +4 4 9 e + +**List-like `to_replace`** + +>>> df.replace([0, 1, 2, 3], 4).execute() + A B C +0 4 5 a +1 4 6 b +2 4 7 c +3 4 8 d +4 4 9 e + +>>> df.replace([0, 1, 2, 3], [4, 3, 2, 1]).execute() + A B C +0 4 5 a +1 3 6 b +2 2 7 c +3 1 8 d +4 4 9 e + +>>> s.replace([1, 2], method='bfill').execute() +0 0 +1 3 +2 3 +3 3 +4 4 +dtype: int64 + +**dict-like `to_replace`** + +>>> df.replace({0: 10, 1: 100}).execute() + A B C +0 10 5 a +1 100 6 b +2 2 7 c +3 3 8 d +4 4 9 e + +>>> df.replace({'A': 0, 'B': 5}, 100).execute() + A B C +0 100 100 a +1 1 6 b +2 2 7 c +3 3 8 d +4 4 9 e + +>>> df.replace({'A': {0: 100, 4: 400}}).execute() + A B C +0 100 5 a +1 1 6 b +2 2 7 c +3 3 8 d +4 400 9 e + +**Regular expression `to_replace`** + +>>> df = md.DataFrame({'A': ['bat', 'foo', 'bait'], +... 
'B': ['abc', 'bar', 'xyz']}) +>>> df.replace(to_replace=r'^ba.$', value='new', regex=True).execute() + A B +0 new abc +1 foo new +2 bait xyz + +>>> df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True).execute() + A B +0 new abc +1 foo bar +2 bait xyz + +>>> df.replace(regex=r'^ba.$', value='new').execute() + A B +0 new abc +1 foo new +2 bait xyz + +>>> df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'}).execute() + A B +0 new abc +1 xyz new +2 bait xyz + +>>> df.replace(regex=[r'^ba.$', 'foo'], value='new').execute() + A B +0 new abc +1 new new +2 bait xyz + +Note that when replacing multiple ``bool`` or ``datetime64`` objects, +the data types in the `to_replace` parameter must match the data +type of the value being replaced: + +>>> df = md.DataFrame({'A': [True, False, True], +... 'B': [False, True, False]}) +>>> df.replace({'a string': 'new value', True: False}) # raises.execute() +Traceback (most recent call last): + ....execute() +TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' + +This raises a ``TypeError`` because one of the ``dict`` keys is not of +the correct type for replacement. + +Compare the behavior of ``s.replace({'a': None})`` and +``s.replace('a', None)`` to understand the peculiarities +of the `to_replace` parameter: + +>>> s = md.Series([10, 'a', 'a', 'b', 'a']) + +When one uses a dict as the `to_replace` value, it is like the +value(s) in the dict are equal to the `value` parameter. +``s.replace({'a': None})`` is equivalent to +``s.replace(to_replace={'a': None}, value=None, method=None)``: + +>>> s.replace({'a': None}).execute() +0 10 +1 None +2 None +3 b +4 None +dtype: object + +When ``value=None`` and `to_replace` is a scalar, list or +tuple, `replace` uses the method parameter (default 'pad') to do the +replacement. So this is why the 'a' values are being replaced by 10 +in rows 1 and 2 and 'b' in row 4 in this case. 
+The command ``s.replace('a', None)`` is actually equivalent to +``s.replace(to_replace='a', value=None, method='pad')``: + +>>> s.replace('a', None).execute() +0 10 +1 10 +2 10 +3 b +4 b +dtype: object +""" + + +def _replace( + df_or_series, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method=no_default, +): + if not isinstance(to_replace, dict) and value is no_default and limit is not None: + raise NotImplementedError("fill with limit not supported when value is None") + + if not isinstance(regex, bool): + to_replace = regex + regex = True + op = DataFrameReplace( + to_replace=to_replace, value=value, limit=limit, regex=regex, method=method + ) + ret = op(df_or_series) + if inplace: + df_or_series.data = ret.data + else: + return ret + + +def df_replace( + df, + to_replace=no_default, + value=no_default, + inplace=False, + limit=None, + regex=False, + method=no_default, +): + return _replace( + df, + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + +def series_replace( + series, + to_replace=no_default, + value=no_default, + inplace=False, + limit=None, + regex=False, + method=no_default, +): + return _replace( + series, + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + +df_replace.__doc__ = _fun_doc.replace("#obj_type#", "DataFrame") +series_replace.__doc__ = _fun_doc.replace("#obj_type#", "Series") diff --git a/python/xorbits/_mars/dataframe/missing/tests/__init__.py b/python/xorbits/_mars/dataframe/missing/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/missing/tests/test_missing.py b/python/xorbits/_mars/dataframe/missing/tests/test_missing.py new file mode 100644 index 000000000..0c4806192 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/tests/test_missing.py @@ -0,0 +1,438 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... 
import tensor as mt +from ....core import tile +from ....core.operand import OperandStage +from ....utils import pd_release_version + +_drop_na_enable_no_default = pd_release_version[:2] >= (1, 5) + + +def test_fill_na(): + df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(20): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + value_df_raw = pd.DataFrame( + np.random.randint(0, 100, (10, 7)).astype(np.float32), columns=list("ABCDEFG") + ) + series_raw = pd.Series(np.nan, index=range(20)) + for _ in range(3): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + value_series_raw = pd.Series( + np.random.randint(0, 100, (10,)).astype(np.float32), index=list("ABCDEFGHIJ") + ) + + df = md.DataFrame(df_raw) + series = md.Series(series_raw) + + # when nothing supplied, raise + with pytest.raises(ValueError): + df.fillna() + # when both values and methods supplied, raises + with pytest.raises(ValueError): + df.fillna(value=1, method="ffill") + # when call on series, cannot supply DataFrames + with pytest.raises(ValueError): + series.fillna(value=df) + with pytest.raises(ValueError): + series.fillna(value=df_raw) + with pytest.raises(NotImplementedError): + series.fillna(value=series_raw, downcast="infer") + with pytest.raises(NotImplementedError): + series.ffill(limit=1) + + df2 = tile(df.fillna(value_series_raw)) + assert len(df2.chunks) == 1 + assert df2.chunks[0].shape == df2.shape + assert df2.chunks[0].op.stage is None + + series2 = tile(series.fillna(value_series_raw)) + assert len(series2.chunks) == 1 + assert series2.chunks[0].shape == series2.shape + assert series2.chunks[0].op.stage is None + + df = md.DataFrame(df_raw, chunk_size=5) + df2 = tile(df.fillna(value_series_raw)) + assert len(df2.chunks) == 8 + assert df2.chunks[0].shape == (5, 5) + assert df2.chunks[0].op.stage is None + + series = md.Series(series_raw, chunk_size=5) + series2 = tile(series.fillna(value_series_raw)) + assert len(series2.chunks) == 4 + assert series2.chunks[0].shape == (5,) + assert series2.chunks[0].op.stage is None + + df2 = tile(df.ffill(axis="columns")) + assert len(df2.chunks) == 8 + assert df2.chunks[0].shape == (5, 5) + assert df2.chunks[0].op.axis == 1 + assert df2.chunks[0].op.stage == OperandStage.combine + assert df2.chunks[0].op.method == "ffill" + assert df2.chunks[0].op.limit is None + + series2 = tile(series.bfill()) + assert len(series2.chunks) == 4 + assert series2.chunks[0].shape == (5,) + assert series2.chunks[0].op.stage == OperandStage.combine + assert series2.chunks[0].op.method == "bfill" + assert series2.chunks[0].op.limit is None + + value_df = md.DataFrame(value_df_raw, chunk_size=7) + value_series = md.Series(value_series_raw, chunk_size=7) + + df2 = tile(df.fillna(value_df)) + assert df2.shape == df.shape + assert df2.chunks[0].op.stage is None + + df2 = tile(df.fillna(value_series)) + assert df2.shape == df.shape + assert df2.chunks[0].op.stage is None + + value_series_raw.index = list(range(10)) + value_series = md.Series(value_series_raw) + series2 = tile(series.fillna(value_series)) + assert series2.shape == series.shape + assert series2.chunks[0].op.stage is None + + +def test_drop_na(): + # dataframe cases + df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(30): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + for rowid in range(random.randint(1, 5)): + row = random.randint(0, 19) + for idx in range(0, 
10): + df_raw.iloc[row, idx] = random.randint(0, 99) + + # not supporting drop with axis=1 + with pytest.raises(NotImplementedError): + md.DataFrame(df_raw).dropna(axis=1) + + if _drop_na_enable_no_default: + with pytest.raises(TypeError): + md.DataFrame(df_raw).dropna(how="any", thresh=0) + + # only one chunk in columns, can run dropna directly + r = tile(md.DataFrame(df_raw, chunk_size=(4, 10)).dropna()) + assert r.shape == (np.nan, 10) + assert r.nsplits == ((np.nan,) * 5, (10,)) + for c in r.chunks: + assert isinstance(c.op, type(r.op)) + assert len(c.inputs) == 1 + assert len(c.inputs[0].inputs) == 0 + assert c.shape == (np.nan, 10) + + # multiple chunks in columns, count() will be called first + r = tile(md.DataFrame(df_raw, chunk_size=4).dropna()) + assert r.shape == (np.nan, 10) + assert r.nsplits == ((np.nan,) * 5, (4, 4, 2)) + for c in r.chunks: + assert isinstance(c.op, type(r.op)) + assert len(c.inputs) == 2 + assert len(c.inputs[0].inputs) == 0 + assert c.inputs[1].op.stage == OperandStage.agg + assert np.isnan(c.shape[0]) + + # series cases + series_raw = pd.Series(np.nan, index=range(20)) + for _ in range(10): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + + r = tile(md.Series(series_raw, chunk_size=4).dropna()) + assert r.shape == (np.nan,) + assert r.nsplits == ((np.nan,) * 5,) + for c in r.chunks: + assert isinstance(c.op, type(r.op)) + assert len(c.inputs) == 1 + assert len(c.inputs[0].inputs) == 0 + assert c.shape == (np.nan,) + + +def test_replace(): + # dataframe cases + df_raw = pd.DataFrame(-1, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(30): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + for rowid in range(random.randint(1, 5)): + row = random.randint(0, 19) + for idx in range(0, 10): + df_raw.iloc[row, idx] = random.randint(0, 99) + + # not supporting fill with limit + df = md.DataFrame(df_raw, chunk_size=4) + with pytest.raises(NotImplementedError): + df.replace(-1, method="ffill", limit=5) + + r = tile(df.replace(-1, method="ffill")) + assert len(r.chunks) == 15 + assert r.chunks[0].shape == (4, 4) + assert r.chunks[0].op.stage == OperandStage.combine + assert r.chunks[0].op.method == "ffill" + assert r.chunks[0].op.limit is None + assert r.chunks[-1].inputs[-1].shape == (1, 2) + assert r.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert r.chunks[-1].inputs[-1].op.method == "ffill" + assert r.chunks[-1].inputs[-1].op.limit is None + + r = tile(df.replace(-1, 99)) + assert len(r.chunks) == 15 + assert r.chunks[0].shape == (4, 4) + assert r.chunks[0].op.stage is None + assert r.chunks[0].op.limit is None + + # series cases + series_raw = pd.Series(-1, index=range(20)) + for _ in range(10): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + series = md.Series(series_raw, chunk_size=4) + + r = tile(series.replace(-1, method="ffill")) + assert len(r.chunks) == 5 + assert r.chunks[0].shape == (4,) + assert r.chunks[0].op.stage == OperandStage.combine + assert r.chunks[0].op.method == "ffill" + assert r.chunks[0].op.limit is None + assert r.chunks[-1].inputs[-1].shape == (1,) + assert r.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert r.chunks[-1].inputs[-1].op.method == "ffill" + assert r.chunks[-1].inputs[-1].op.limit is None + + r = tile(series.replace(-1, 99)) + assert len(r.chunks) == 5 + assert r.chunks[0].shape == (4,) + assert r.chunks[0].op.stage is None + assert r.chunks[0].op.limit is None + + +@pytest.mark.parametrize("inf_as_na", [True, 
False]) +def test_isna(setup, inf_as_na): + from ....config import options + from ..checkna import isna + + old_mars_inf_as_na = options.dataframe.mode.use_inf_as_na + options.dataframe.mode.use_inf_as_na = inf_as_na + # this option could be changed by mars execution. + old_pd_inf_as_na = pd.get_option("mode.use_inf_as_na") + pd.options.mode.use_inf_as_na = inf_as_na + + # scalars + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + assert isna("dog") == pd.isna("dog") + assert isna(None) == pd.isna(None) + assert isna(md.NA) == pd.isna(pd.NA) + assert isna(md.NaT) == pd.isna(pd.NaT) + assert isna(mt.NaN) == pd.isna(np.NaN) + assert isna(type) == pd.isna(type) + + # multi index + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + with pytest.raises(NotImplementedError): + midx = md.MultiIndex() + isna(midx) + + # list + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + l = [1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT] + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + actual = isna(l).execute().fetch() + expected = pd.isna(l) + np.testing.assert_array_equal(expected, actual) + + # tuple + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + t = (1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT) + assert not isna(t) + + # numpy ndarray + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + narr = np.array((1, 2, 3, np.Inf, np.NaN)) + actual = isna(narr).execute().fetch() + expected = pd.isna(narr) + np.testing.assert_array_equal(expected, actual) + + # pandas index + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + pi = pd.Index((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) + actual = isna(pi).execute().fetch() + expected = pd.isna(pi) + np.testing.assert_array_equal(expected, actual) + + # pandas series + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + ps = pd.Series((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) + actual = isna(ps).execute().fetch() + expected = pd.isna(ps) + pd.testing.assert_series_equal(expected, actual) + + # pandas dataframe + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + pdf = pd.DataFrame( + {"foo": (1, 2, 3, np.Inf, pd.NA), "bar": (4, 5, 6, np.NaN, pd.NaT)} + ) + actual = isna(pdf).execute().fetch() + expected = pd.isna(pdf) + pd.testing.assert_frame_equal(expected, actual) + + # mars tensor + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + marr = mt.tensor(narr) + actual = isna(marr).execute().fetch() + expected = pd.isna(narr) + np.testing.assert_array_equal(expected, actual) + + # mars index + from ...datasource.index import from_pandas as from_pandas_index + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + mi = from_pandas_index(pi) + actual = isna(mi).execute().fetch() + expected = pd.isna(pi) + np.testing.assert_array_equal(expected, actual) + + # mars series + from ...datasource.series import from_pandas as from_pandas_series + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + ms = from_pandas_series(ps) + actual = isna(ms).execute().fetch() + expected = pd.isna(ps) + pd.testing.assert_series_equal(expected, actual) + + # mars dataframe + from ...datasource.dataframe import from_pandas as from_pandas_df + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + mdf = from_pandas_df(pdf) + actual = isna(mdf).execute().fetch() + expected = pd.isna(pdf) + pd.testing.assert_frame_equal(expected, actual) + + options.dataframe.mode.use_inf_as_na = old_mars_inf_as_na + pd.options.mode.use_inf_as_na = old_pd_inf_as_na + + +@pytest.mark.parametrize("inf_as_na", [True, False]) +def 
test_notna(setup, inf_as_na): + from ....config import options + from ..checkna import notna + + old_mars_inf_as_na = options.dataframe.mode.use_inf_as_na + options.dataframe.mode.use_inf_as_na = inf_as_na + # this option could be changed by mars execution. + old_pd_inf_as_na = pd.get_option("mode.use_inf_as_na") + pd.options.mode.use_inf_as_na = inf_as_na + + # scalars + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + assert notna("dog") == pd.notna("dog") + assert notna(None) == pd.notna(None) + assert notna(md.NA) == pd.notna(pd.NA) + assert notna(md.NaT) == pd.notna(pd.NaT) + assert notna(mt.NaN) == pd.notna(np.NaN) + assert notna(type) == pd.notna(type) + + # multi index + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + with pytest.raises(NotImplementedError): + midx = md.MultiIndex() + notna(midx) + + # list + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + l = [1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT] + actual = notna(l).execute().fetch() + expected = pd.notna(l) + np.testing.assert_array_equal(expected, actual) + + # tuple + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + t = (1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT) + assert notna(t) + + # numpy ndarray + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + narr = np.array((1, 2, 3, np.Inf, np.NaN)) + actual = notna(narr).execute().fetch() + expected = pd.notna(narr) + np.testing.assert_array_equal(expected, actual) + + # pandas index + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + pi = pd.Index((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) + actual = notna(pi).execute().fetch() + expected = pd.notna(pi) + np.testing.assert_array_equal(expected, actual) + + # pandas series + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + ps = pd.Series((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) + actual = notna(ps).execute().fetch() + expected = pd.notna(ps) + pd.testing.assert_series_equal(expected, actual) + + # pandas dataframe + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + pdf = pd.DataFrame( + {"foo": (1, 2, 3, np.Inf, pd.NA), "bar": (4, 5, 6, np.NaN, pd.NaT)} + ) + actual = notna(pdf).execute().fetch() + expected = pd.notna(pdf) + pd.testing.assert_frame_equal(expected, actual) + + # mars tensor + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + marr = mt.tensor(narr) + actual = notna(marr).execute().fetch() + expected = pd.notna(narr) + np.testing.assert_array_equal(expected, actual) + + # mars index + from ...datasource.index import from_pandas as from_pandas_index + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + mi = from_pandas_index(pi) + actual = notna(mi).execute().fetch() + expected = pd.notna(pi) + np.testing.assert_array_equal(expected, actual) + + # mars series + from ...datasource.series import from_pandas as from_pandas_series + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + ms = from_pandas_series(ps) + actual = notna(ms).execute().fetch() + expected = pd.notna(ps) + pd.testing.assert_series_equal(expected, actual) + + # mars dataframe + from ...datasource.dataframe import from_pandas as from_pandas_df + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + mdf = from_pandas_df(pdf) + actual = notna(mdf).execute().fetch() + expected = pd.notna(pdf) + pd.testing.assert_frame_equal(expected, actual) + + options.dataframe.mode.use_inf_as_na = old_mars_inf_as_na + pd.options.mode.use_inf_as_na = old_pd_inf_as_na diff --git a/python/xorbits/_mars/dataframe/missing/tests/test_missing_execution.py 
b/python/xorbits/_mars/dataframe/missing/tests/test_missing_execution.py new file mode 100644 index 000000000..cc19fb284 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/tests/test_missing_execution.py @@ -0,0 +1,333 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import re +import string + +import numpy as np +import pandas as pd + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from .... import dataframe as md + + +def test_check_na_execution(setup): + df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(20): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + + df = md.DataFrame(df_raw, chunk_size=4) + + pd.testing.assert_frame_equal(df.isna().execute().fetch(), df_raw.isna()) + pd.testing.assert_frame_equal(df.notna().execute().fetch(), df_raw.notna()) + + series_raw = pd.Series(np.nan, index=range(20)) + for _ in range(3): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + + series = md.Series(series_raw, chunk_size=4) + + pd.testing.assert_series_equal(series.isna().execute().fetch(), series_raw.isna()) + pd.testing.assert_series_equal(series.notna().execute().fetch(), series_raw.notna()) + + idx_data = np.array([np.nan] * 20) + for _ in range(3): + idx_data[random.randint(0, 19)] = random.randint(0, 99) + idx_raw = pd.Index(idx_data) + + idx = md.Index(idx_raw, chunk_size=4) + + np.testing.assert_array_equal(idx.isna().execute().fetch(), idx_raw.isna()) + np.testing.assert_array_equal(idx.notna().execute().fetch(), idx_raw.notna()) + + +def test_dataframe_fill_na_execution(setup): + df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(20): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + value_df_raw = pd.DataFrame( + np.random.randint(0, 100, (10, 7)).astype(np.float32), columns=list("ABCDEFG") + ) + df = md.DataFrame(df_raw) + + # test DataFrame single chunk with numeric fill + r = df.fillna(1) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(1)) + + # test DataFrame single chunk with value as single chunk + value_df = md.DataFrame(value_df_raw) + r = df.fillna(value_df) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(value_df_raw)) + + df = md.DataFrame(df_raw, chunk_size=3) + + # test chunked with numeric fill + r = df.fillna(1) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(1)) + + # test forward fill in axis=0 without limit + r = df.fillna(method="pad") + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(method="pad")) + + # test backward fill in axis=0 without limit + r = df.fillna(method="backfill") + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(method="backfill")) + + # test forward fill in axis=1 without limit + r = df.ffill(axis=1) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.ffill(axis=1)) + + # test 
backward fill in axis=1 without limit + r = df.bfill(axis=1) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.bfill(axis=1)) + + # test fill with dataframe + value_df = md.DataFrame(value_df_raw, chunk_size=4) + r = df.fillna(value_df) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(value_df_raw)) + + # test fill with series + value_series_raw = pd.Series( + np.random.randint(0, 100, (10,)).astype(np.float32), index=list("ABCDEFGHIJ") + ) + value_series = md.Series(value_series_raw, chunk_size=4) + r = df.fillna(value_series) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(value_series_raw)) + + # test inplace tile + df.fillna(1, inplace=True) + pd.testing.assert_frame_equal(df.execute().fetch(), df_raw.fillna(1)) + + +def test_series_fill_na_execution(setup): + series_raw = pd.Series(np.nan, index=range(20)) + for _ in range(3): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + value_series_raw = pd.Series(np.random.randint(0, 100, (10,)).astype(np.float32)) + + # test single chunk + series = md.Series(series_raw) + + r = series.fillna(1) + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.fillna(1)) + + # test single chunk with value as single chunk + value_series = md.Series(value_series_raw) + r = series.fillna(value_series) + pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.fillna(value_series_raw) + ) + + series = md.Series(series_raw, chunk_size=3) + + # test chunked with numeric fill + r = series.fillna(1) + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.fillna(1)) + + # test forward fill in axis=0 without limit + r = series.fillna(method="pad") + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.fillna(method="pad")) + + # test backward fill in axis=0 without limit + r = series.fillna(method="backfill") + pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.fillna(method="backfill") + ) + + # test fill with series + value_df = md.Series(value_series_raw, chunk_size=4) + r = series.fillna(value_df) + pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.fillna(value_series_raw) + ) + + # test inplace tile + series.fillna(1, inplace=True) + pd.testing.assert_series_equal(series.execute().fetch(), series_raw.fillna(1)) + + +def test_index_fill_na_execution(setup): + idx_data = np.array([np.nan] * 20) + for _ in range(10): + idx_data[random.randint(0, 19)] = random.randint(0, 99) + idx_raw = pd.Index(idx_data) + + # test single chunk + idx = md.Index(idx_raw) + + r = idx.fillna(1) + pd.testing.assert_index_equal(r.execute().fetch(), idx_raw.fillna(1)) + + idx = md.Index(idx_raw, chunk_size=3) + + # test chunked with numeric fill + r = idx.fillna(1) + pd.testing.assert_index_equal(r.execute().fetch(), idx_raw.fillna(1)) + + +def test_drop_na_execution(setup): + # dataframe cases + df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(30): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + for rowid in range(random.randint(1, 5)): + row = random.randint(0, 19) + for idx in range(0, 10): + df_raw.iloc[row, idx] = random.randint(0, 99) + + # only one chunk in columns, can run dropna directly + r = md.DataFrame(df_raw, chunk_size=(4, 10)).dropna() + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.dropna()) + + # multiple chunks in columns, count() will be called first + r = md.DataFrame(df_raw, chunk_size=4).dropna() + 
pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.dropna()) + + r = md.DataFrame(df_raw, chunk_size=4).dropna(how="all") + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.dropna(how="all")) + + r = md.DataFrame(df_raw, chunk_size=4).dropna(subset=list("ABFI")) + pd.testing.assert_frame_equal( + r.execute().fetch(), df_raw.dropna(subset=list("ABFI")) + ) + + r = md.DataFrame(df_raw, chunk_size=4).dropna(how="all", subset=list("BDHJ")) + pd.testing.assert_frame_equal( + r.execute().fetch(), df_raw.dropna(how="all", subset=list("BDHJ")) + ) + + r = md.DataFrame(df_raw, chunk_size=4) + r.dropna(how="all", inplace=True) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.dropna(how="all")) + + # series cases + series_raw = pd.Series(np.nan, index=range(20)) + for _ in range(10): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + + r = md.Series(series_raw, chunk_size=4).dropna() + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.dropna()) + + r = md.Series(series_raw, chunk_size=4) + r.dropna(inplace=True) + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.dropna()) + + # index cases + idx_data = np.array([np.nan] * 20) + for _ in range(10): + idx_data[random.randint(0, 19)] = random.randint(0, 99) + idx_raw = pd.Index(idx_data) + + r = md.Index(idx_raw, chunk_size=4).dropna() + pd.testing.assert_index_equal(r.execute().fetch(), idx_raw.dropna()) + + +def test_replace_execution(setup): + # dataframe cases + df_raw = pd.DataFrame(-1, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(30): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + for rowid in range(random.randint(1, 5)): + row = random.randint(0, 19) + for idx in range(0, 10): + df_raw.iloc[row, idx] = random.randint(0, 99) + df = md.DataFrame(df_raw, chunk_size=4) + + r = df.replace(-1, method="ffill") + pd.testing.assert_frame_equal( + r.execute().fetch(), df_raw.replace(-1, method="ffill") + ) + + r = df.replace(-1, method="bfill") + pd.testing.assert_frame_equal( + r.execute().fetch(), df_raw.replace(-1, method="bfill") + ) + + r = df.replace(-1, 999) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.replace(-1, 999)) + + if pd.__version__ >= "1.4.4": + r = df.replace({-1: 999}) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.replace({-1: 999})) + + raw_to_replace = pd.Series([-1, 1, 2]) + to_replace_series = md.Series(raw_to_replace) + raw_value = pd.Series([2, 3, -1]) + value_series = md.Series(raw_value) + r = df.replace(to_replace_series, value_series) + pd.testing.assert_frame_equal( + r.execute().fetch(), df_raw.replace(raw_to_replace, raw_value) + ) + + df.replace({"A": -1}, {"A": 9}, inplace=True) + pd.testing.assert_frame_equal( + df.execute().fetch(), df_raw.replace({"A": -1}, {"A": 9}) + ) + + if pd.__version__ >= "1.4.4": + df.replace({"A": {-1: 9}}, inplace=True) + pd.testing.assert_frame_equal( + df.execute().fetch(), df_raw.replace({"A": {-1: 9}}) + ) + + # series cases + series_raw = pd.Series(-1, index=range(20)) + for _ in range(10): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + series = md.Series(series_raw, chunk_size=4) + + if pd.__version__ >= "1.4.4": + r = series.replace(-1) + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.replace(-1)) + + r = series.replace(-1, method="ffill") + pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.replace(-1, method="ffill") + ) + + r = series.replace(-1, method="bfill") + 
pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.replace(-1, method="bfill") + ) + + r = series.replace(-1, 999) + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.replace(-1, 999)) + + # str series cases + tmpl_chars = list(string.ascii_letters + string.digits) + random.shuffle(tmpl_chars) + + def _rand_slice(): + lb = random.randint(0, len(tmpl_chars) - 1) + rb = random.randint(lb, len(tmpl_chars) - 1) + return "".join(tmpl_chars[lb : rb + 1]) + + series_raw = pd.Series([_rand_slice() for _ in range(20)]) + series = md.Series(series_raw, chunk_size=4) + + regs = [ + re.compile(r".A.", flags=re.IGNORECASE), + re.compile(r".B.", flags=re.IGNORECASE), + re.compile(r".C.", flags=re.IGNORECASE), + re.compile(r".D.", flags=re.IGNORECASE), + ] + r = series.replace(regex=regs, value="new") + pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.replace(regex=regs, value="new") + ) diff --git a/python/xorbits/_mars/dataframe/operands.py b/python/xorbits/_mars/dataframe/operands.py new file mode 100644 index 000000000..760a8be44 --- /dev/null +++ b/python/xorbits/_mars/dataframe/operands.py @@ -0,0 +1,482 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict +from functools import reduce + +import numpy as np +import pandas as pd + +from ..core import ENTITY_TYPE, FuseChunk, FuseChunkData, OutputType +from ..core.operand import ( + Fuse, + FuseChunkMixin, + Operand, + ShuffleProxy, + TileableOperandMixin, +) +from ..tensor.core import TENSOR_TYPE +from ..tensor.datasource import tensor as astensor +from ..tensor.operands import TensorOperandMixin +from ..utils import calc_nsplits +from .core import ( + CATEGORICAL_TYPE, + DATAFRAME_CHUNK_TYPE, + DATAFRAME_GROUPBY_TYPE, + DATAFRAME_TYPE, + INDEX_CHUNK_TYPE, + INDEX_TYPE, + SERIES_CHUNK_TYPE, + SERIES_GROUPBY_TYPE, + SERIES_TYPE, +) +from .utils import parse_index + + +class DataFrameOperandMixin(TileableOperandMixin): + __slots__ = () + _op_module_ = "dataframe" + + def new_dataframes( + self, + inputs, + shape=None, + dtypes=None, + index_value=None, + columns_value=None, + chunks=None, + nsplits=None, + output_limit=None, + kws=None, + **kw + ): + setattr(self, "_output_types", [OutputType.dataframe]) + return self.new_tileables( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + chunks=chunks, + nsplits=nsplits, + output_limit=output_limit, + kws=kws, + **kw + ) + + def new_dataframe( + self, + inputs, + shape=None, + dtypes=None, + index_value=None, + columns_value=None, + **kw + ): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new DataFrame with more than 1 outputs") + + return self.new_dataframes( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + **kw + )[0] + + def new_seriess( + self, + inputs, + shape=None, + dtype=None, + index_value=None, + name=None, + chunks=None, + nsplits=None, + output_limit=None, + 
kws=None, + **kw + ): + setattr(self, "_output_types", [OutputType.series]) + return self.new_tileables( + inputs, + shape=shape, + dtype=dtype, + index_value=index_value, + name=name, + chunks=chunks, + nsplits=nsplits, + output_limit=output_limit, + kws=kws, + **kw + ) + + def new_series( + self, inputs, shape=None, dtype=None, index_value=None, name=None, **kw + ): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new Series with more than 1 outputs") + + return self.new_seriess( + inputs, shape=shape, dtype=dtype, index_value=index_value, name=name, **kw + )[0] + + def new_df_or_series(self, inputs, **kw): + setattr(self, "_output_types", [OutputType.df_or_series]) + return self.new_tileables(inputs, **kw)[0] + + def new_indexes( + self, + inputs, + shape=None, + dtype=None, + index_value=None, + name=None, + chunks=None, + nsplits=None, + output_limit=None, + kws=None, + **kw + ): + setattr(self, "_output_types", [OutputType.index]) + return self.new_tileables( + inputs, + shape=shape, + dtype=dtype, + index_value=index_value, + name=name, + chunks=chunks, + nsplits=nsplits, + output_limit=output_limit, + kws=kws, + **kw + ) + + def new_index( + self, inputs, shape=None, dtype=None, index_value=None, name=None, **kw + ): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new Index with more than 1 outputs") + + return self.new_indexes( + inputs, shape=shape, dtype=dtype, index_value=index_value, name=name, **kw + )[0] + + def new_scalars( + self, inputs, dtype=None, chunks=None, output_limit=None, kws=None, **kw + ): + setattr(self, "_output_types", [OutputType.scalar]) + return self.new_tileables( + inputs, + shape=(), + dtype=dtype, + chunks=chunks, + nsplits=(), + output_limit=output_limit, + kws=kws, + **kw + ) + + def new_scalar(self, inputs, dtype=None, **kw): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new tensor with more than 1 outputs") + + return self.new_scalars(inputs, dtype=dtype, **kw)[0] + + def new_categoricals( + self, + inputs, + shape=None, + dtype=None, + categories_value=None, + chunks=None, + nsplits=None, + output_limit=None, + kws=None, + **kw + ): + setattr(self, "_output_types", [OutputType.categorical]) + return self.new_tileables( + inputs, + shape=shape, + dtype=dtype, + categories_value=categories_value, + chunks=chunks, + nsplits=nsplits, + output_limit=output_limit, + kws=kws, + **kw + ) + + def new_categorical( + self, inputs, shape=None, dtype=None, categories_value=None, **kw + ): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new Categorical with more than 1 outputs") + + return self.new_categoricals( + inputs, shape=shape, dtype=dtype, categories_value=categories_value, **kw + )[0] + + @classmethod + def _process_groupby_params(cls, groupby_params): + new_groupby_params = groupby_params.copy() + if isinstance(groupby_params["by"], list): + by = [] + for v in groupby_params["by"]: + if isinstance(v, ENTITY_TYPE): + by.append(cls.concat_tileable_chunks(v).chunks[0]) + else: + by.append(v) + new_groupby_params["by"] = by + return new_groupby_params + + @classmethod + def _get_groupby_inputs(cls, groupby, groupby_params): + inputs = [groupby] + chunk_inputs = list(groupby.chunks) + if isinstance(groupby_params["by"], list): + for chunk_v, v in zip( + groupby_params["by"], groupby.op.groupby_params["by"] + ): + if isinstance(v, ENTITY_TYPE): + inputs.append(v) + chunk_inputs.append(chunk_v) + return inputs, chunk_inputs + + @classmethod + def concat_tileable_chunks(cls, 
tileable): + from .merge.concat import DataFrameConcat, GroupByConcat + + df = tileable + assert not df.is_coarse() + + if isinstance(df, DATAFRAME_TYPE): + chunk = DataFrameConcat(output_types=[OutputType.dataframe]).new_chunk( + df.chunks, + shape=df.shape, + index=(0, 0), + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + return DataFrameConcat(output_types=[OutputType.dataframe]).new_dataframe( + [df], + shape=df.shape, + chunks=[chunk], + nsplits=tuple((s,) for s in df.shape), + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + elif isinstance(df, SERIES_TYPE): + chunk = DataFrameConcat(output_types=[OutputType.series]).new_chunk( + df.chunks, + shape=df.shape, + index=(0,), + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + return DataFrameConcat(output_types=[OutputType.series]).new_series( + [df], + shape=df.shape, + chunks=[chunk], + nsplits=tuple((s,) for s in df.shape), + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + elif isinstance(df, INDEX_TYPE): + chunk = DataFrameConcat(output_types=[OutputType.index]).new_chunk( + df.chunks, + shape=df.shape, + index=(0,), + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + return DataFrameConcat(output_types=[OutputType.index]).new_index( + [df], + shape=df.shape, + chunks=[chunk], + nsplits=tuple((s,) for s in df.shape), + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + elif isinstance(df, (DATAFRAME_GROUPBY_TYPE, SERIES_GROUPBY_TYPE)): + output_type = ( + OutputType.dataframe_groupby + if isinstance(df, DATAFRAME_GROUPBY_TYPE) + else OutputType.series_groupby + ) + groupby_params = cls._process_groupby_params(df.op.groupby_params) + inputs, chunk_inputs = cls._get_groupby_inputs(df, groupby_params) + chunk = GroupByConcat( + groups=df.chunks, + groupby_params=groupby_params, + output_types=[output_type], + ).new_chunk(chunk_inputs, **df.params) + return GroupByConcat( + groups=[df], + groupby_params=df.op.groupby_params, + output_types=[output_type], + ).new_tileable(inputs, chunks=[chunk], **df.params) + elif isinstance(df, CATEGORICAL_TYPE): + chunk = DataFrameConcat(output_types=[OutputType.categorical]).new_chunk( + df.chunks, + shape=df.shape, + index=(0,), + dtype=df.dtype, + categories_value=df.categories_value, + ) + return DataFrameConcat( + output_types=[OutputType.categorical] + ).new_categorical( + [df], + shape=df.shape, + chunks=[chunk], + nsplits=tuple((s,) for s in df.shape), + dtype=df.dtype, + categories_value=df.categories_value, + ) + elif isinstance(df, TENSOR_TYPE): + return TensorOperandMixin.concat_tileable_chunks(tileable) + else: + raise NotImplementedError + + @classmethod + def create_tileable_from_chunks(cls, chunks, inputs=None, **kw): + ndim = chunks[0].ndim + index_min, index_max = [None] * ndim, [None] * ndim + for c in chunks: + for ax, i in enumerate(c.index): + if index_min[ax] is None: + index_min[ax] = i + else: + index_min[ax] = min(i, index_min[ax]) + if index_max[ax] is None: + index_max[ax] = i + else: + index_max[ax] = max(i, index_max[ax]) + + # gen {chunk index -> shape} + chunk_index_to_shape = OrderedDict() + chunk_index_to_chunk = dict() + for c in chunks: + new_index = [] + for ax, i in enumerate(c.index): + new_index.append(i - index_min[ax]) + chunk_index_to_shape[tuple(new_index)] = c.shape + chunk_index_to_chunk[tuple(new_index)] = c + + nsplits = calc_nsplits(chunk_index_to_shape) + shape = tuple(sum(ns) for ns in nsplits) + 
chunk_shape = tuple(len(ns) for ns in nsplits)
+        op = chunks[0].op.copy().reset_key()
+        if isinstance(chunks[0], DATAFRAME_CHUNK_TYPE):
+            params = cls._calc_dataframe_params(chunk_index_to_chunk, chunk_shape)
+            params.update(kw)
+            return op.new_dataframe(
+                inputs, shape=shape, chunks=chunks, nsplits=nsplits, **params
+            )
+        elif isinstance(chunks[0], SERIES_CHUNK_TYPE):
+            params = cls._calc_series_index_params(chunks)
+            params.update(kw)
+            return op.new_series(
+                inputs, shape=shape, chunks=chunks, nsplits=nsplits, **params
+            )
+        else:
+            assert isinstance(chunks[0], INDEX_CHUNK_TYPE)
+            params = cls._calc_series_index_params(chunks)
+            params.update(kw)
+            return op.new_index(
+                inputs, shape=shape, chunks=chunks, nsplits=nsplits, **params
+            )
+
+    @classmethod
+    def _calc_dataframe_params(cls, chunk_index_to_chunks, chunk_shape):
+        dtypes = pd.concat(
+            [
+                chunk_index_to_chunks[0, i].dtypes
+                for i in range(chunk_shape[1])
+                if (0, i) in chunk_index_to_chunks
+            ]
+        )
+        columns_value = parse_index(dtypes.index, store_data=True)
+        pd_indexes = [
+            chunk_index_to_chunks[i, 0].index_value.to_pandas()
+            for i in range(chunk_shape[0])
+            if (i, 0) in chunk_index_to_chunks
+        ]
+        pd_index = reduce(lambda x, y: x.append(y), pd_indexes)
+        index_value = parse_index(pd_index)
+        return {
+            "dtypes": dtypes,
+            "columns_value": columns_value,
+            "index_value": index_value,
+        }
+
+    @classmethod
+    def _calc_series_index_params(cls, chunks):
+        pd_indexes = [c.index_value.to_pandas() for c in chunks]
+        pd_index = reduce(lambda x, y: x.append(y), pd_indexes)
+        index_value = parse_index(pd_index)
+        return {"dtype": chunks[0].dtype, "index_value": index_value}
+
+    def get_fuse_op_cls(self, _):
+        return DataFrameFuseChunk
+
+    @staticmethod
+    def _process_input(x):
+        from .initializer import DataFrame, Series
+
+        if isinstance(x, (DATAFRAME_TYPE, SERIES_TYPE)) or pd.api.types.is_scalar(x):
+            return x
+        elif isinstance(x, pd.Series):
+            return Series(x)
+        elif isinstance(x, pd.DataFrame):
+            return DataFrame(x)
+        elif isinstance(x, (list, tuple, np.ndarray, TENSOR_TYPE)):
+            return astensor(x)
+        raise NotImplementedError
+
+
+DataFrameOperand = Operand
+
+
+class DataFrameShuffleProxy(ShuffleProxy, DataFrameOperandMixin):
+    def __init__(self, sparse=None, output_types=None, **kwargs):
+        super().__init__(sparse=sparse, _output_types=output_types, **kwargs)
+
+    @classmethod
+    def execute(cls, ctx, op):
+        pass
+
+
+class DataFrameFuseChunkMixin(FuseChunkMixin, DataFrameOperandMixin):
+    __slots__ = ()
+
+    def _create_chunk(self, output_idx, index, **kw):
+        data = FuseChunkData(_index=index, _shape=kw.pop("shape", None), _op=self, **kw)
+
+        return FuseChunk(data)
+
+
+class DataFrameFuseChunk(Fuse, DataFrameFuseChunkMixin):
+    @property
+    def output_types(self):
+        return self.outputs[-1].chunk.op.output_types
diff --git a/python/xorbits/_mars/dataframe/plotting/__init__.py b/python/xorbits/_mars/dataframe/plotting/__init__.py
new file mode 100644
index 000000000..29371e4f1
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/plotting/__init__.py
@@ -0,0 +1,34 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def _install():
+    import pandas as pd
+
+    from ..base.accessor import CachedAccessor
+    from ..core import DATAFRAME_TYPE, GROUPBY_TYPE, SERIES_TYPE
+    from .core import PlotAccessor
+
+    for t in DATAFRAME_TYPE + SERIES_TYPE + GROUPBY_TYPE:
+        t.plot = CachedAccessor("plot", PlotAccessor)
+
+    for method in dir(pd.DataFrame.plot):
+        if not method.startswith("_"):
+            PlotAccessor._register(method)
+
+    PlotAccessor.__doc__ = pd.DataFrame.plot.__doc__.replace("pd.", "md.")
+
+
+_install()
+del _install
diff --git a/python/xorbits/_mars/dataframe/plotting/core.py b/python/xorbits/_mars/dataframe/plotting/core.py
new file mode 100644
index 000000000..66c7acc59
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/plotting/core.py
@@ -0,0 +1,69 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+
+import pandas as pd
+
+from ...core import ENTITY_TYPE, ExecutableTuple
+from ...utils import adapt_mars_docstring
+
+
+class PlotAccessor:
+    def __init__(self, obj):
+        self._obj = obj
+
+    def __call__(self, kind="line", session=None, **kwargs):
+        to_executes = OrderedDict()
+        to_executes["__object__"] = self._obj
+
+        for k, v in kwargs.items():
+            if isinstance(v, ENTITY_TYPE):
+                to_executes[k] = v
+
+        result = dict()
+        executed = ExecutableTuple(to_executes.values()).execute().fetch()
+        for p, v in zip(to_executes, executed):
+            result[p] = v
+
+        data = result.pop("__object__")
+        pd_kwargs = kwargs.copy()
+        pd_kwargs["kind"] = kind
+        pd_kwargs.update(result)
+
+        return data.plot(**pd_kwargs)
+
+    @classmethod
+    def _gen_func(cls, name, doc):
+        def _inner(self, *args, **kwargs):
+            return self(kind=name, *args, **kwargs)
+
+        _inner.__name__ = name
+        _inner.__doc__ = doc
+
+        return _inner
+
+    @classmethod
+    def _register(cls, method):
+        doc = getattr(pd.DataFrame.plot, method).__doc__
+        new_doc = adapt_mars_docstring(doc)
+        if method == "hexbin":
+            # make doc pass
+            new_doc = new_doc.replace(
+                "reduce_C_function=mt.sum", "reduce_C_function=sum"
+            )
+        elif method == "line":
+            new_doc = new_doc.replace("s.plot.line().execute()", "s.plot.line()")
+            new_doc = new_doc.replace("type(axes).execute()", "type(axes)")
+        setattr(cls, method, cls._gen_func(method, new_doc))
diff --git a/python/xorbits/_mars/dataframe/plotting/tests/__init__.py b/python/xorbits/_mars/dataframe/plotting/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/plotting/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/dataframe/plotting/tests/test_plot.py b/python/xorbits/_mars/dataframe/plotting/tests/test_plot.py
new file mode 100644
index 000000000..9badb2fc9
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/plotting/tests/test_plot.py
@@ -0,0 +1,118 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tempfile
+import warnings
+
+import numpy as np
+import pandas as pd
+import pytest
+
+try:
+    import matplotlib
+except ImportError:  # pragma: no cover
+    matplotlib = None
+
+from .... import dataframe as md
+from .... import tensor as mt
+
+
+def close(fignum=None):  # pragma: no cover
+    from matplotlib.pyplot import close as _close
+    from matplotlib.pyplot import get_fignums
+
+    if fignum is None:
+        for fignum in get_fignums():
+            _close(fignum)
+    else:
+        _close(fignum)
+
+
+def assert_is_valid_plot_return_object(objs):  # pragma: no cover
+    import matplotlib.pyplot as plt
+
+    if isinstance(objs, (pd.Series, np.ndarray)):
+        for el in objs.ravel():
+            msg = (
+                "one of 'objs' is not a matplotlib Axes instance, "
+                f"type encountered {type(el).__name__}"
+            )
+            assert isinstance(el, (plt.Axes, dict)), msg
+    else:
+        msg = (
+            "objs is neither an ndarray of Artist instances nor a single "
+            f"Artist instance, tuple, or dict, 'objs' is a {type(objs).__name__}"
+        )
+        assert isinstance(objs, (plt.Artist, tuple, dict)), msg
+
+
+def _check_plot_works(f, filterwarnings="always", **kwargs):  # pragma: no cover
+    import matplotlib.pyplot as plt
+
+    ret = None
+    with warnings.catch_warnings():
+        warnings.simplefilter(filterwarnings)
+        try:
+            try:
+                fig = kwargs["figure"]
+            except KeyError:
+                fig = plt.gcf()
+
+            plt.clf()
+
+            kwargs.get("ax", fig.add_subplot(211))
+            ret = f(**kwargs)
+
+            assert_is_valid_plot_return_object(ret)
+
+            if f is pd.plotting.bootstrap_plot:
+                assert "ax" not in kwargs
+            else:
+                kwargs["ax"] = fig.add_subplot(212)
+
+            ret = f(**kwargs)
+            assert_is_valid_plot_return_object(ret)
+
+            with tempfile.TemporaryFile() as path:
+                plt.savefig(path)
+        finally:
+            close(fig)
+
+    return ret
+
+
+@pytest.mark.skipif(matplotlib is None, reason="matplotlib is not installed")
+def test_plot(setup):
+    raw = pd.DataFrame(
+        {
+            "a": ["s" + str(i) for i in range(10)],
+            "b": np.random.RandomState(0).randint(10, size=10),
+        }
+    )
+    df = md.DataFrame(raw, chunk_size=3)
+
+    _check_plot_works(df.plot, x="a", y="b")
+    _check_plot_works(df.plot, x="a", y=mt.tensor("b"))
+
_check_plot_works(df.plot.line) + + raw = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + df = md.DataFrame(raw, chunk_size=3) + _check_plot_works(df.groupby("A").plot) diff --git a/python/xorbits/_mars/dataframe/reduction/__init__.py b/python/xorbits/_mars/dataframe/reduction/__init__.py new file mode 100644 index 000000000..a8ccc76a5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/__init__.py @@ -0,0 +1,111 @@ +# isort: skip_file +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import CustomReduction +from .aggregation import DataFrameAggregate + +from .sum import DataFrameSum +from .prod import DataFrameProd +from .max import DataFrameMax +from .min import DataFrameMin +from .count import DataFrameCount +from .mean import DataFrameMean +from .var import DataFrameVar +from .all import DataFrameAll +from .any import DataFrameAny +from .skew import DataFrameSkew +from .kurtosis import DataFrameKurtosis +from .sem import DataFrameSem +from .reduction_size import DataFrameSize +from .str_concat import DataFrameStrConcat, build_str_concat_object +from .custom_reduction import DataFrameCustomReduction + +from .cummax import DataFrameCummax +from .cummin import DataFrameCummin +from .cumprod import DataFrameCumprod +from .cumsum import DataFrameCumsum + +from .nunique import DataFrameNunique +from .unique import DataFrameUnique, unique + + +def _install(): + from ..core import DATAFRAME_TYPE, SERIES_TYPE, INDEX_TYPE + from .aggregation import aggregate + from .sum import sum_series, sum_dataframe + from .prod import prod_series, prod_dataframe + from .max import max_series, max_dataframe, max_index + from .min import min_series, min_dataframe, min_index + from .count import count_series, count_dataframe + from .mean import mean_series, mean_dataframe + from .var import var_series, var_dataframe + from .std import std_series, std_dataframe + from .all import all_series, all_dataframe, all_index + from .any import any_series, any_dataframe, any_index + from .cummax import cummax + from .cummin import cummin + from .cumprod import cumprod + from .cumsum import cumsum + from .nunique import nunique_dataframe, nunique_series + from .sem import sem_dataframe, sem_series + from .skew import skew_dataframe, skew_series + from .kurtosis import kurt_dataframe, kurt_series + from .reduction_size import size_dataframe, size_series + + funcs = [ + ("sum", sum_series, sum_dataframe), + ("prod", prod_series, prod_dataframe), + ("product", prod_series, prod_dataframe), + ("max", max_series, max_dataframe), + ("min", min_series, min_dataframe), + ("count", count_series, count_dataframe), + ("mean", mean_series, mean_dataframe), + ("var", var_series, var_dataframe), + ("std", std_series, std_dataframe), + ("all", all_series, all_dataframe), + ("any", any_series, any_dataframe), + ("cummax", 
cummax, cummax), + ("cummin", cummin, cummin), + ("cumprod", cumprod, cumprod), + ("cumsum", cumsum, cumsum), + ("agg", aggregate, aggregate), + ("aggregate", aggregate, aggregate), + ("nunique", nunique_series, nunique_dataframe), + ("sem", sem_series, sem_dataframe), + ("skew", skew_series, skew_dataframe), + ("kurt", kurt_series, kurt_dataframe), + ("kurtosis", kurt_series, kurt_dataframe), + ("unique", unique, None), + ("_reduction_size", size_dataframe, size_series), + ] + for func_name, series_func, df_func in funcs: + if df_func is not None: # pragma: no branch + for t in DATAFRAME_TYPE: + setattr(t, func_name, df_func) + if series_func is not None: # pragma: no branch + for t in SERIES_TYPE: + setattr(t, func_name, series_func) + + for t in INDEX_TYPE: + setattr(t, "agg", aggregate) + setattr(t, "aggregate", aggregate) + setattr(t, "all", all_index) + setattr(t, "any", any_index) + setattr(t, "min", min_index) + setattr(t, "max", max_index) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/reduction/aggregation.py b/python/xorbits/_mars/dataframe/reduction/aggregation.py new file mode 100644 index 000000000..4a0882f41 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/aggregation.py @@ -0,0 +1,1037 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import functools +import itertools +from collections import OrderedDict +from collections.abc import Iterable +from typing import Dict, List + +import numpy as np +import pandas as pd + +from ... import opcodes +from ... 
import tensor as mars_tensor +from ...config import options +from ...core import ENTITY_TYPE, OutputType, enter_mode, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + ListField, +) +from ...utils import ceildiv, enter_current_session, lazy_import, pd_release_version +from ..core import INDEX_CHUNK_TYPE +from ..merge import DataFrameConcat +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_empty_df, build_series, parse_index, validate_axis +from .core import CustomReduction, ReductionAggStep, ReductionCompiler, ReductionSteps + +cp = lazy_import("cupy", rename="cp") +cudf = lazy_import("cudf") + +_agg_size_as_series = pd_release_version >= (1, 3, 0) + + +def where_function(cond, var1, var2): + if hasattr(var1, "ndim") and var1.ndim >= 1: + return var1.where(cond, var2) + elif isinstance(var1, ENTITY_TYPE): + return mars_tensor.where(cond, var1, var2) + else: + return np.where(cond, var1, var2).item() + + +_agg_functions = { + "sum": lambda x, skipna=True: x.sum(skipna=skipna), + "prod": lambda x, skipna=True: x.prod(skipna=skipna), + "product": lambda x, skipna=True: x.product(skipna=skipna), + "min": lambda x, skipna=True: x.min(skipna=skipna), + "max": lambda x, skipna=True: x.max(skipna=skipna), + "all": lambda x, skipna=True: x.all(skipna=skipna), + "any": lambda x, skipna=True: x.any(skipna=skipna), + "count": lambda x: x.count(), + "size": lambda x: x._reduction_size(), + "mean": lambda x, skipna=True: x.mean(skipna=skipna), + "var": lambda x, skipna=True, ddof=1: x.var(skipna=skipna, ddof=ddof), + "std": lambda x, skipna=True, ddof=1: x.std(skipna=skipna, ddof=ddof), + "sem": lambda x, skipna=True, ddof=1: x.sem(skipna=skipna, ddof=ddof), + "skew": lambda x, skipna=True, bias=False: x.skew(skipna=skipna, bias=bias), + "kurt": lambda x, skipna=True, bias=False: x.kurt(skipna=skipna, bias=bias), + "kurtosis": lambda x, skipna=True, bias=False: x.kurtosis(skipna=skipna, bias=bias), + "nunique": lambda x: x.nunique(), +} + + +class DataFrameAggregate(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.AGGREGATE + + raw_func = AnyField("raw_func") + raw_func_kw = DictField("raw_func_kw") + func = AnyField("func") + func_rename = ListField("func_rename") + axis = AnyField("axis") + numeric_only = BoolField("numeric_only") + bool_only = BoolField("bool_only") + use_inf_as_na = BoolField("use_inf_as_na") + + combine_size = Int32Field("combine_size") + pre_funcs = ListField("pre_funcs") + agg_funcs = ListField("agg_funcs") + post_funcs = ListField("post_funcs") + + @staticmethod + def _filter_dtypes(op: "DataFrameAggregate", dtypes): + if not op.numeric_only and not op.bool_only: + return dtypes + empty_df = build_empty_df(dtypes) + return empty_df.select_dtypes( + [np.number, np.bool_] if op.numeric_only else [np.bool_] + ).dtypes + + def _calc_result_shape(self, df): + if df.ndim == 2: + if self.numeric_only: + df = df.select_dtypes([np.number, np.bool_]) + elif self.bool_only: + df = df.select_dtypes([np.bool_]) + + if self.output_types[0] == OutputType.dataframe: + test_obj = build_df(df, size=[2, 2], fill_value=[1, 2], ensure_string=True) + else: + test_obj = build_series( + df, size=[2, 2], fill_value=[1, 2], name=df.name, ensure_string=True + ) + + result_df = test_obj.agg(self.raw_func, axis=self.axis, **self.raw_func_kw) + + if isinstance(result_df, pd.DataFrame): + 
self.output_types = [OutputType.dataframe] + return result_df.dtypes, result_df.index + elif isinstance(result_df, pd.Series): + self.output_types = [OutputType.series] + return pd.Series([result_df.dtype], index=[result_df.name]), result_df.index + else: + self.output_types = [OutputType.scalar] + return np.array(result_df).dtype, None + + def __call__(self, df, output_type=None, dtypes=None, index=None): + self._output_types = df.op.output_types + normalize_reduction_funcs(self, ndim=df.ndim) + if output_type is None or dtypes is None: + with enter_mode(kernel=False, build=False): + dtypes, index = self._calc_result_shape(df) + else: + self.output_types = [output_type] + + if self.output_types[0] == OutputType.dataframe: + if self.axis == 0: + new_shape = (len(index), len(dtypes)) + new_index = parse_index(index, store_data=True) + else: + new_shape = (df.shape[0], len(dtypes)) + new_index = df.index_value + return self.new_dataframe( + [df], + shape=new_shape, + dtypes=dtypes, + index_value=new_index, + columns_value=parse_index(dtypes.index, store_data=True), + ) + elif self.output_types[0] == OutputType.series: + if df.ndim == 1: + new_shape = (len(index),) + new_index = parse_index(index, store_data=True) + elif self.axis == 0: + new_shape = (len(index),) + new_index = parse_index(index, store_data=True) + else: + new_shape = (df.shape[0],) + new_index = df.index_value + return self.new_series( + [df], + shape=new_shape, + dtype=dtypes[0], + name=dtypes.index[0], + index_value=new_index, + ) + elif self.output_types[0] == OutputType.tensor: + return self.new_tileable([df], dtype=dtypes, shape=(np.nan,)) + else: + return self.new_scalar([df], dtype=dtypes) + + @staticmethod + def _safe_append(d, key, val): + if key not in d: + d[key] = [] + if val not in d[key]: + d[key].append(val) + + @classmethod + def _gen_map_chunks( + cls, + op, + in_df, + out_df, + func_infos: List[ReductionSteps], + input_index_to_output: Dict[int, int], + ): + axis = op.axis + + if axis == 0: + agg_chunks_shape = ( + (in_df.chunk_shape[0], len(func_infos)) + if len(in_df.chunk_shape) == 2 + else (in_df.chunk_shape[0], 1) + ) + else: + agg_chunks_shape = (len(func_infos), in_df.chunk_shape[1]) + + agg_chunks = np.empty(agg_chunks_shape, dtype=object) + dtypes_cache = dict() + for chunk in in_df.chunks: + input_index = chunk.index[1 - axis] if len(chunk.index) > 1 else 0 + if input_index not in input_index_to_output: + continue + map_op = op.copy().reset_key() # type: "DataFrameAggregate" + new_axis_index = input_index_to_output[input_index] + func_info = func_infos[new_axis_index] + # force as_index=True for map phase + map_op.output_types = ( + [OutputType.dataframe] if chunk.ndim == 2 else [OutputType.series] + ) + map_op.stage = OperandStage.map + map_op.pre_funcs = func_info.pre_funcs + map_op.agg_funcs = func_info.agg_funcs + + if axis == 0: + new_index = ( + (chunk.index[0], new_axis_index) + if len(chunk.index) == 2 + else (chunk.index[0], 0) + ) + else: + new_index = (new_axis_index, chunk.index[1]) + + if map_op.output_types[0] == OutputType.dataframe: + if axis == 0: + shape = (1, out_df.shape[-1]) + if out_df.ndim == 2: + columns_value = out_df.columns_value + index_value = out_df.index_value + else: + columns_value = out_df.index_value + index_value = parse_index(pd.Index([0]), out_df.key) + + try: + dtypes = dtypes_cache[chunk.index[1]] + except KeyError: + dtypes = chunk.dtypes.reindex( + columns_value.to_pandas() + ).dropna() + dtypes_cache[chunk.index[1]] = dtypes + + agg_chunk = 
map_op.new_chunk( + [chunk], + shape=shape, + index=new_index, + dtypes=dtypes, + columns_value=columns_value, + index_value=index_value, + ) + else: + shape = (out_df.shape[0], 1) + columns_value = parse_index( + pd.Index([0]), out_df.key, store_data=True + ) + index_value = out_df.index_value + + agg_chunk = map_op.new_chunk( + [chunk], + shape=shape, + index=new_index, + columns_value=columns_value, + index_value=index_value, + ) + else: + agg_chunk = map_op.new_chunk([chunk], shape=(1,), index=new_index) + agg_chunks[agg_chunk.index] = agg_chunk + return agg_chunks + + @classmethod + def _tile_single_chunk(cls, op: "DataFrameAggregate"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + chunk_op = op.copy().reset_key() + if op.output_types[0] == OutputType.dataframe: + chunk = chunk_op.new_chunk( + in_df.chunks, + index=(0, 0), + shape=out_df.shape, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + elif op.output_types[0] == OutputType.series: + chunk = chunk_op.new_chunk( + in_df.chunks, + index=(0,), + shape=out_df.shape, + dtype=out_df.dtype, + index_value=out_df.index_value, + name=out_df.name, + ) + elif op.output_types[0] == OutputType.tensor: + chunk = chunk_op.new_chunk( + in_df.chunks, index=(0,), dtype=out_df.dtype, shape=(np.nan,) + ) + else: + chunk = chunk_op.new_chunk( + in_df.chunks, dtype=out_df.dtype, index=(), shape=() + ) + + tileable_op = op.copy().reset_key() + kw = out_df.params.copy() + kw.update(dict(chunks=[chunk], nsplits=tuple((x,) for x in out_df.shape))) + return tileable_op.new_tileables([in_df], **kw) + + @classmethod + def _tile_size(cls, op: "DataFrameAggregate"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in in_df.chunks: + chunk_op = op.copy().reset_key() + chunks.append( + chunk_op.new_chunk( + [c], + index=c.index, + shape=(1,) * len(in_df.shape), + dtype=out_df.dtype, + ) + ) + + tileable_op = op.copy().reset_key() + nsplits = tuple((1,) * s for s in in_df.chunk_shape) + tileable = tileable_op.new_tileable( + out_df.inputs, + chunks=chunks, + nsplits=nsplits, + shape=in_df.chunk_shape, + dtype=out_df.dtype, + ) + ret = yield from recursive_tile(tileable.sum()) + return [ret] + + @staticmethod + def _add_functions( + op: "DataFrameAggregate", compiler: ReductionCompiler, cols=None + ): + if isinstance(op.func, list): + func_iter = ((None, f) for f in op.func) + cols_set = set(cols) if cols is not None else None + else: + assert cols is not None + cols_set = set(cols) & set(op.func.keys()) + if len(cols_set) == 0: + return False + func_iter = ((col, f) for col, funcs in op.func.items() for f in funcs) + + func_renames = ( + op.func_rename + if getattr(op, "func_rename", None) is not None + else itertools.repeat(None) + ) + for func_rename, (col, f) in zip(func_renames, func_iter): + if cols_set is not None and col is not None and col not in cols_set: + continue + func_name = None + if isinstance(f, str): + f, func_name = _agg_functions[f], f + if func_rename is not None: + func_name = func_rename + ndim = 1 if cols is None else 2 + func_cols = [col] if col is not None else None + compiler.add_function(f, ndim, cols=func_cols, func_name=func_name) + return True + + @classmethod + def _tile_tree(cls, op: "DataFrameAggregate"): + in_df = op.inputs[0] + out_df = op.outputs[0] + combine_size = op.combine_size + axis = op.axis + + input_index_to_output = dict() + output_index_to_input = [] + axis_func_infos = [] + dtypes_list = [] + if len(in_df.chunk_shape) > 1: + for 
col_idx in range(in_df.chunk_shape[1 - axis]): + compiler = ReductionCompiler(axis=op.axis) + idx_chunk = ( + in_df.cix[0, col_idx] if axis == 0 else in_df.cix[col_idx, 0] + ) + new_dtypes = cls._filter_dtypes(op, idx_chunk.dtypes) + if not cls._add_functions(op, compiler, cols=list(new_dtypes.index)): + continue + input_index_to_output[col_idx] = len(axis_func_infos) + output_index_to_input.append(col_idx) + axis_func_infos.append(compiler.compile()) + dtypes_list.append(new_dtypes) + else: + compiler = ReductionCompiler(axis=op.axis) + cls._add_functions(op, compiler) + input_index_to_output[0] = 0 + axis_func_infos.append(compiler.compile()) + + chunks = cls._gen_map_chunks( + op, in_df, out_df, axis_func_infos, input_index_to_output + ) + while chunks.shape[axis] > combine_size: + if axis == 0: + new_chunks_shape = ( + ceildiv(chunks.shape[0], combine_size), + chunks.shape[1], + ) + else: + new_chunks_shape = ( + chunks.shape[0], + ceildiv(chunks.shape[1], combine_size), + ) + + new_chunks = np.empty(new_chunks_shape, dtype=object) + for idx0, i in enumerate(range(0, chunks.shape[axis], combine_size)): + for idx1 in range(chunks.shape[1 - axis]): + func_info = axis_func_infos[idx1] + if axis == 0: + chks = chunks[i : i + combine_size, idx1] + chunk_index = (idx0, idx1) + if chks[0].ndim == 1: + concat_shape = (len(chks),) + agg_shape = (1,) + else: + concat_shape = (len(chks), chks[0].shape[1]) + agg_shape = (chks[0].shape[1], 1) + else: + chks = chunks[idx1, i : i + combine_size] + chunk_index = (idx1, idx0) + concat_shape = (chks[0].shape[0], len(chks)) + agg_shape = (chks[0].shape[0], 1) + + chks = chks.reshape((chks.shape[0],)).tolist() + if len(chks) == 1: + chk = chks[0] + else: + concat_op = DataFrameConcat( + output_types=[OutputType.dataframe], axis=axis + ) + # Change index for concatenate + for j, c in enumerate(chks): + c._index = (j, 0) if axis == 0 else (0, j) + chk = concat_op.new_chunk( + chks, + dtypes=dtypes_list[idx1] if dtypes_list else None, + shape=concat_shape, + index_value=chks[0].index_value, + ) + chunk_op = op.copy().reset_key() + chunk_op.output_types = [OutputType.dataframe] + chunk_op.stage = OperandStage.combine + chunk_op.agg_funcs = func_info.agg_funcs + + if axis == 0: + new_chunks[chunk_index] = chunk_op.new_chunk( + [chk], + index=chunk_index, + shape=agg_shape, + index_value=chks[0].index_value, + ) + else: + new_chunks[chunk_index] = chunk_op.new_chunk( + [chk], + index=chunk_index, + shape=agg_shape, + index_value=chks[0].columns_value, + ) + chunks = new_chunks + + agg_chunks = [] + for idx in range(chunks.shape[1 - axis]): + func_info = axis_func_infos[idx] + + concat_op = DataFrameConcat(output_types=[OutputType.dataframe], axis=axis) + if axis == 0: + chks = chunks[:, idx] + if chks[0].ndim == 1: + concat_shape = (len(chks),) + else: + concat_shape = (len(chks), chks[0].shape[1]) + else: + chks = chunks[idx, :] + concat_shape = (chks[0].shape[0], len(chks)) + chks = chks.reshape((chks.shape[0],)).tolist() + chk = concat_op.new_chunk( + chks, + dtypes=dtypes_list[idx] if dtypes_list else None, + shape=concat_shape, + index_value=chks[0].index_value, + ) + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.agg + chunk_op.agg_funcs = func_info.agg_funcs + chunk_op.post_funcs = func_info.post_funcs + + kw = out_df.params.copy() + if op.output_types[0] == OutputType.dataframe: + if axis == 0: + src_col_chunk = in_df.cix[0, output_index_to_input[idx]] + valid_cols = [ + c for pre in func_info.pre_funcs for c in pre.columns or () + ] + 
if not valid_cols: + columns_value = src_col_chunk.columns_value + shape_len = src_col_chunk.shape[1] + else: + col_index = pd.Index(valid_cols).unique() + columns_value = parse_index(col_index, store_data=True) + shape_len = len(col_index) + kw.update( + dict( + shape=(out_df.shape[0], shape_len), + columns_value=columns_value, + index=(0, idx), + dtypes=out_df.dtypes[columns_value.to_pandas()], + ) + ) + else: + src_col_chunk = in_df.cix[output_index_to_input[idx], 0] + kw.update( + dict( + index=(idx, 0), + index_value=src_col_chunk.index_value, + shape=(src_col_chunk.shape[0], out_df.shape[1]), + dtypes=out_df.dtypes, + ) + ) + else: + if op.output_types[0] == OutputType.series: + if in_df.ndim == 1: + index_value, shape = out_df.index_value, out_df.shape + elif axis == 0: + out_dtypes = dtypes_list[idx] + index_value = parse_index(out_dtypes.index, store_data=True) + shape = (len(out_dtypes),) + else: + src_chunk = in_df.cix[output_index_to_input[idx], 0] + index_value, shape = ( + src_chunk.index_value, + (src_chunk.shape[0],), + ) + kw.update( + dict( + name=out_df.name, + dtype=out_df.dtype, + index=(idx,), + index_value=index_value, + shape=shape, + ) + ) + elif op.output_types[0] == OutputType.tensor: + kw.update(dict(index=(0,), shape=(np.nan,), dtype=out_df.dtype)) + else: + kw.update(dict(index=(), shape=(), dtype=out_df.dtype)) + agg_chunks.append(chunk_op.new_chunk([chk], **kw)) + + new_op = op.copy() + if op.output_types[0] == OutputType.dataframe: + if axis == 0: + nsplits = ((out_df.shape[0],), tuple(c.shape[1] for c in agg_chunks)) + else: + nsplits = (tuple(c.shape[0] for c in agg_chunks), (out_df.shape[1],)) + return new_op.new_tileables( + op.inputs, + chunks=agg_chunks, + nsplits=nsplits, + dtypes=out_df.dtypes, + shape=out_df.shape, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + ) + elif op.output_types[0] == OutputType.series: + nsplits = (tuple(c.shape[0] for c in agg_chunks),) + return new_op.new_tileables( + op.inputs, + chunks=agg_chunks, + nsplits=nsplits, + dtype=out_df.dtype, + shape=out_df.shape, + index_value=out_df.index_value, + name=out_df.name, + ) + elif op.output_types[0] == OutputType.tensor: # unique + return new_op.new_tileables( + op.inputs, + chunks=agg_chunks, + dtype=out_df.dtype, + shape=out_df.shape, + nsplits=((np.nan,),), + ) + else: # scalar + return new_op.new_tileables( + op.inputs, chunks=agg_chunks, dtype=out_df.dtype, shape=(), nsplits=() + ) + + @classmethod + def tile(cls, op: "DataFrameAggregate"): + in_df = op.inputs[0] + + if len(in_df.chunks) == 1: + return cls._tile_single_chunk(op) + elif not _agg_size_as_series and in_df.ndim == 2 and op.raw_func == "size": + return (yield from cls._tile_size(op)) + else: + return cls._tile_tree(op) + + @classmethod + def _wrap_df(cls, op, value, index=None): + xdf = cudf if op.gpu else pd + axis = op.axis + ndim = op.inputs[0].ndim + + if ndim == 2: + dtype = None + if isinstance(value, (np.generic, int, float, complex)): + value = xdf.DataFrame([value], columns=index) + elif not isinstance(value, xdf.DataFrame): + new_index = None if not op.gpu else getattr(value, "index", None) + dtype = getattr(value, "dtype", None) + if xdf is pd: + value = xdf.DataFrame(value, columns=index, index=new_index) + else: # pragma: no cover + value = xdf.DataFrame(value) + value.index = new_index + value.columns = index + else: + return value + + value = value.T if axis == 0 else value + if ( + dtype == np.dtype("O") + and getattr(op.outputs[0], "dtypes", None) is not None + ): + 
value = value.astype(op.outputs[0].dtypes) + return value + else: + if isinstance(value, (np.generic, int, float, complex)): + value = xdf.Series([value], index=index) + elif isinstance(value, np.ndarray): + # assert value.ndim == 0 + value = xdf.Series(value.tolist(), index=index) + return value + + @staticmethod + def _pack_inputs(agg_funcs: List[ReductionAggStep], in_data): + pos = 0 + out_dict = dict() + for step in agg_funcs: + if step.custom_reduction is None: + out_dict[step.output_key] = in_data[pos] + else: + out_dict[step.output_key] = tuple( + in_data[pos : pos + step.output_limit] + ) + pos += step.output_limit + return out_dict + + @classmethod + def _do_predefined_agg(cls, op: "DataFrameAggregate", input_obj, func_name, kwds): + if func_name == "size": + return input_obj.agg(lambda x: x.size, axis=op.axis) + elif func_name == "str_concat": + ret = input_obj.agg(lambda x: x.str.cat(**kwds), axis=op.axis) + if isinstance(ret, str): + ret = pd.Series([ret]) + return ret + else: + if op.gpu: + if kwds.pop("numeric_only", None): + raise NotImplementedError("numeric_only not implemented under cudf") + if isinstance(input_obj, pd.Index): + kwds.pop("skipna", None) + return getattr(input_obj, func_name)(**kwds) + + @classmethod + def _select_dtypes(cls, in_data, op: "DataFrameAggregate"): + if in_data.ndim == 2: + if op.numeric_only: + in_data = in_data.select_dtypes([np.number, np.bool_]) + elif op.bool_only: + in_data = in_data.select_dtypes([np.bool_]) + return in_data + + @classmethod + def _execute_map(cls, ctx, op: "DataFrameAggregate"): + in_data = ctx[op.inputs[0].key] + axis_index = op.outputs[0].index[op.axis] + in_data = cls._select_dtypes(in_data, op) + + # map according to map groups + ret_map_dfs = dict() + in_cols_set = set(in_data.columns) if in_data.ndim == 2 else None + for input_key, output_key, cols, func in op.pre_funcs: + if cols and in_cols_set == set(cols): + cols = None + + src_df = in_data if cols is None else in_data[cols] + if input_key == output_key: + ret_map_dfs[output_key] = src_df + else: + ret_map_dfs[output_key] = func(src_df, gpu=op.is_gpu()) + + agg_dfs = [] + for ( + input_key, + _, + map_func_name, + _agg_func_name, + custom_reduction, + _output_key, + _output_limit, + kwds, + ) in op.agg_funcs: + input_obj = ret_map_dfs[input_key] + if map_func_name == "custom_reduction": + pre_result = custom_reduction.pre(input_obj) + if not isinstance(pre_result, tuple): + pre_result = (pre_result,) + + if custom_reduction.pre_with_agg: + # when custom_reduction.pre already aggregates, skip + agg_result = pre_result + else: + agg_result = custom_reduction.agg(*pre_result) + if not isinstance(agg_result, tuple): + agg_result = (agg_result,) + + agg_dfs.extend( + [cls._wrap_df(op, r, index=[axis_index]) for r in agg_result] + ) + else: + agg_dfs.append( + cls._wrap_df( + op, + cls._do_predefined_agg(op, input_obj, map_func_name, kwds), + index=[axis_index], + ) + ) + ctx[op.outputs[0].key] = tuple(agg_dfs) + + @classmethod + def _execute_combine(cls, ctx, op: "DataFrameAggregate"): + in_data = ctx[op.inputs[0].key] + in_data_dict = cls._pack_inputs(op.agg_funcs, in_data) + axis = op.axis + axis_index = op.outputs[0].index[axis] + + combines = [] + for ( + _input_key, + _, + _map_func_name, + agg_func_name, + custom_reduction, + output_key, + _output_limit, + kwds, + ) in op.agg_funcs: + input_obj = in_data_dict[output_key] + if agg_func_name == "custom_reduction": + agg_result = custom_reduction.agg(*input_obj) + if not isinstance(agg_result, tuple): + 
agg_result = (agg_result,) + combines.extend( + [cls._wrap_df(op, r, index=[axis_index]) for r in agg_result] + ) + else: + combines.append( + cls._wrap_df( + op, + cls._do_predefined_agg(op, input_obj, agg_func_name, kwds), + index=[axis_index], + ) + ) + ctx[op.outputs[0].key] = tuple(combines) + + @classmethod + def _execute_agg(cls, ctx, op: "DataFrameAggregate"): + xdf = cudf if op.gpu else pd + xp = cp if op.gpu else np + + out = op.outputs[0] + in_data = ctx[op.inputs[0].key] + in_data_dict = cls._pack_inputs(op.agg_funcs, in_data) + axis = op.axis + + # perform agg + for ( + _input_key, + _, + _map_func_name, + agg_func_name, + custom_reduction, + output_key, + _output_limit, + kwds, + ) in op.agg_funcs: + input_obj = in_data_dict[output_key] + if agg_func_name == "custom_reduction": + agg_result = custom_reduction.agg(*input_obj) + if not isinstance(agg_result, tuple): + agg_result = (agg_result,) + in_data_dict[output_key] = custom_reduction.post(*agg_result) + else: + in_data_dict[output_key] = cls._do_predefined_agg( + op, input_obj, agg_func_name, kwds + ) + + aggs = [] + # perform post op + for input_keys, _output_key, func_name, cols, func in op.post_funcs: + if cols is None: + func_inputs = [in_data_dict[k] for k in input_keys] + else: + func_inputs = [in_data_dict[k][cols] for k in input_keys] + + agg_series = func(*func_inputs, gpu=op.is_gpu()) + agg_series_ndim = getattr(agg_series, "ndim", 0) + + ser_index = None + if agg_series_ndim < out.ndim: + ser_index = [func_name] + aggs.append(cls._wrap_df(op, agg_series, index=ser_index)) + + # concatenate to produce final result + concat_df = xdf.concat(aggs, axis=axis) + if op.output_types[0] == OutputType.series: + if concat_df.ndim > 1: + if op.inputs[0].ndim == 2: + if axis == 0: + concat_df = concat_df.iloc[0, :] + else: + concat_df = concat_df.iloc[:, 0] + else: + concat_df = concat_df.iloc[:, 0] + concat_df.name = op.outputs[0].name + + concat_df = concat_df.astype(op.outputs[0].dtype, copy=False) + elif op.output_types[0] == OutputType.scalar: + concat_df = concat_df.iloc[0] + try: + concat_df = concat_df.astype(op.outputs[0].dtype) + except AttributeError: + # concat_df may be a string and has no `astype` method + pass + elif op.output_types[0] == OutputType.tensor: + concat_df = xp.array(concat_df).astype(dtype=out.dtype) + else: + if axis == 0: + concat_df = concat_df.reindex(op.outputs[0].index_value.to_pandas()) + else: + concat_df = concat_df[op.outputs[0].columns_value.to_pandas()] + + concat_df = concat_df.astype(op.outputs[0].dtypes, copy=False) + ctx[op.outputs[0].key] = concat_df + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op: "DataFrameAggregate"): + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + elif op.stage == OperandStage.agg: + cls._execute_agg(ctx, op) + elif not _agg_size_as_series and op.raw_func == "size": + xp = cp if op.gpu else np + ctx[op.outputs[0].key] = xp.array( + ctx[op.inputs[0].key].agg(op.raw_func, axis=op.axis) + ).reshape(op.outputs[0].shape) + else: + xp = cp if op.gpu else np + in_obj = op.inputs[0] + in_data = ctx[in_obj.key] + in_data = cls._select_dtypes(in_data, op) + if isinstance(in_obj, INDEX_CHUNK_TYPE): + result = op.func[0](in_data) + elif ( + op.output_types[0] == OutputType.scalar + and in_data.shape == (0,) + and callable(op.func[0]) + ): + result = op.func[0](in_data) + else: 
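`execute` above runs every stage inside a pandas option scope so that infinities can be treated as missing values while the reduction runs, and the option is always restored in the `finally` block. A small sketch of the effect (illustrative data; note that `mode.use_inf_as_na` is deprecated in recent pandas releases):

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.inf, 3.0])
try:
    pd.set_option("mode.use_inf_as_na", True)
    assert s.count() == 2                 # inf is now treated like NaN
    assert s.sum(skipna=True) == 4.0      # and skipped by skipna reductions
finally:
    pd.reset_option("mode.use_inf_as_na")
```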
+ result = in_data.agg(op.raw_func, axis=op.axis) + if op.outputs[0].ndim == 1: + result = result.astype(op.outputs[0].dtype, copy=False) + + if op.output_types[0] == OutputType.tensor: + result = xp.array(result) + ctx[op.outputs[0].key] = result + finally: + pd.reset_option("mode.use_inf_as_na") + + +def is_funcs_aggregate(func, func_kw=None, ndim=2): + func_kw = func_kw or dict() + if ndim == 1 and func is None: + func, func_kw = func_kw, dict() + + to_check = [] + if func is not None: + if isinstance(func, (list, tuple)): + to_check.extend(func) + elif isinstance(func, dict): + if ndim == 2: + for f in func.values(): + if isinstance(f, Iterable) and not isinstance(f, str): + to_check.extend(f) + else: + to_check.append(f) + else: + if any(isinstance(v, tuple) for v in func.values()): + raise TypeError("nested renamer is not supported") + to_check.extend(func.values()) + else: + to_check.append(func) + else: + for v in func_kw.values(): + if ( + not isinstance(v, tuple) + or len(v) != 2 + or (not isinstance(v[1], str) and not callable(v[1])) + ): + raise TypeError("Must provide 'func' or tuples of (column, aggfunc).") + else: + to_check.append(v[1]) + + compiler = ReductionCompiler() + for f in to_check: + if f in _agg_functions: + continue + elif callable(f): + try: + if ndim == 2: + compiler.add_function(f, 2, cols=["A", "B"]) + else: + compiler.add_function(f, 1) + except ValueError: + return False + else: + return False + return True + + +def normalize_reduction_funcs(op, ndim=None): + raw_func = op.raw_func + if ndim == 1 and raw_func is None: + raw_func = op.raw_func_kw + + if raw_func is not None: + if isinstance(raw_func, dict): + if ndim == 2: + new_func = OrderedDict() + for k, v in raw_func.items(): + if isinstance(v, str) or callable(v): + new_func[k] = [v] + else: + new_func[k] = v + op.func = new_func + else: + op.func = list(raw_func.values()) + op.func_rename = list(raw_func.keys()) + elif isinstance(raw_func, Iterable) and not isinstance(raw_func, str): + op.func = list(raw_func) + else: + op.func = [raw_func] + else: + new_func = OrderedDict() + new_func_names = OrderedDict() + for k, v in op.raw_func_kw.items(): + try: + col_funcs = new_func[v[0]] + col_func_names = new_func_names[v[0]] + except KeyError: + col_funcs = new_func[v[0]] = [] + col_func_names = new_func_names[v[0]] = [] + col_funcs.append(v[1]) + col_func_names.append(k) + op.func = new_func + op.func_rename = functools.reduce( + lambda a, b: a + b, new_func_names.values(), [] + ) + + custom_idx = 0 + if isinstance(op.func, list): + custom_iter = (f for f in op.func if isinstance(f, CustomReduction)) + else: + custom_iter = (f for f in op.func.values() if isinstance(f, CustomReduction)) + for r in custom_iter: + if r.name == "": + r.name = f"" + custom_idx += 1 + + +def aggregate(df, func=None, axis=0, **kw): + axis = validate_axis(axis, df) + use_inf_as_na = kw.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na) + if ( + df.ndim == 2 + and isinstance(func, dict) + and (df.op.output_types[0] == OutputType.series or axis == 1) + ): + raise NotImplementedError( + "Currently cannot aggregate dicts over axis=1 on %s" % type(df).__name__ + ) + combine_size = kw.pop("_combine_size", None) or options.combine_size + numeric_only = kw.pop("_numeric_only", None) + bool_only = kw.pop("_bool_only", None) + + output_type = kw.pop("_output_type", None) + dtypes = kw.pop("_dtypes", None) + index = kw.pop("_index", None) + + if not is_funcs_aggregate(func, func_kw=kw, ndim=df.ndim): + return df.transform(func, 
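`is_funcs_aggregate` and `normalize_reduction_funcs` above turn the many accepted `agg` spellings — a single name, a list, a per-column dict, or named-aggregation kwargs of `(column, func)` tuples — into one canonical per-column mapping of function lists. A rough illustration of that normalization in plain Python (not the operand attributes themselves):

```python
from collections import OrderedDict

# dict spec: expand every column's entry into a list of functions
raw_func = {"a": "sum", "b": ["min", "max"]}
new_func = OrderedDict(
    (col, [f] if isinstance(f, str) or callable(f) else list(f))
    for col, f in raw_func.items()
)
assert new_func == {"a": ["sum"], "b": ["min", "max"]}

# named-aggregation spec: kwargs of (column, func) tuples are regrouped per column
raw_func_kw = {"total": ("a", "sum"), "low": ("b", "min")}
by_col = OrderedDict()
for out_name, (col, f) in raw_func_kw.items():
    by_col.setdefault(col, []).append(f)
assert by_col == {"a": ["sum"], "b": ["min"]}
```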
axis=axis, _call_agg=True) + + op = DataFrameAggregate( + raw_func=copy.deepcopy(func), + raw_func_kw=copy.deepcopy(kw), + axis=axis, + combine_size=combine_size, + numeric_only=numeric_only, + bool_only=bool_only, + use_inf_as_na=use_inf_as_na, + ) + + return op(df, output_type=output_type, dtypes=dtypes, index=index) diff --git a/python/xorbits/_mars/dataframe/reduction/all.py b/python/xorbits/_mars/dataframe/reduction/all.py new file mode 100644 index 000000000..f01db9b81 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/all.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import ( + DATAFRAME_TYPE, + DataFrameReductionMixin, + DataFrameReductionOperand, + recursive_tile, +) + + +class DataFrameAll(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.ALL + _func_name = "all" + + @property + def is_atomic(self): + return True + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + if op.axis is None and isinstance(in_df, DATAFRAME_TYPE): + dtypes = pd.Series([out_df.dtype]) + index = in_df.dtypes.index + out_df = yield from recursive_tile( + in_df.agg( + cls.get_reduction_callable(op), + axis=0, + _numeric_only=op.numeric_only, + _bool_only=op.bool_only, + _combine_size=op.combine_size, + _output_type=OutputType.series, + _dtypes=dtypes, + _index=index, + ) + ) + out_df = yield from recursive_tile( + out_df.agg( + cls.get_reduction_callable(op), + axis=0, + _numeric_only=op.numeric_only, + _bool_only=op.bool_only, + _combine_size=op.combine_size, + _output_type=OutputType.scalar, + _dtypes=out_df.dtype, + _index=None, + ) + ) + return [out_df] + else: + return (yield from super().tile(op)) + + def __call__(self, df): + if self.axis is None and isinstance(df, DATAFRAME_TYPE): + return self.new_scalar([df], np.dtype("bool")) + else: + return super().__call__(df) + + +def all_series( + series, + axis=0, + bool_only=None, + skipna=True, + level=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameAll( + axis=axis, + skipna=skipna, + level=level, + bool_only=bool_only, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(series) + + +def all_dataframe( + df, + axis=0, + bool_only=None, + skipna=True, + level=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + output_types = [OutputType.series] if axis is not None else [OutputType.scalar] + op = DataFrameAll( + axis=axis, + skipna=skipna, + level=level, + bool_only=bool_only, + combine_size=combine_size, + output_types=output_types, + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def all_index(idx): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op 
= DataFrameAll(output_types=[OutputType.scalar], use_inf_as_na=use_inf_as_na) + return op(idx) diff --git a/python/xorbits/_mars/dataframe/reduction/any.py b/python/xorbits/_mars/dataframe/reduction/any.py new file mode 100644 index 000000000..36bece7dc --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/any.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import ( + DATAFRAME_TYPE, + DataFrameReductionMixin, + DataFrameReductionOperand, + recursive_tile, +) + + +class DataFrameAny(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.ANY + _func_name = "any" + + @property + def is_atomic(self): + return True + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + if op.axis is None and isinstance(in_df, DATAFRAME_TYPE): + dtypes = pd.Series([out_df.dtype]) + index = in_df.dtypes.index + out_df = yield from recursive_tile( + in_df.agg( + cls.get_reduction_callable(op), + axis=0, + _numeric_only=op.numeric_only, + _bool_only=op.bool_only, + _combine_size=op.combine_size, + _output_type=OutputType.series, + _dtypes=dtypes, + _index=index, + ) + ) + out_df = yield from recursive_tile( + out_df.agg( + cls.get_reduction_callable(op), + axis=0, + _numeric_only=op.numeric_only, + _bool_only=op.bool_only, + _combine_size=op.combine_size, + _output_type=OutputType.scalar, + _dtypes=out_df.dtype, + _index=None, + ) + ) + return [out_df] + else: + return (yield from super().tile(op)) + + def __call__(self, df): + if self.axis is None and isinstance(df, DATAFRAME_TYPE): + return self.new_scalar([df], np.dtype("bool")) + else: + return super().__call__(df) + + +def any_series( + series, + axis=0, + bool_only=None, + skipna=True, + level=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameAny( + axis=axis, + skipna=skipna, + level=level, + bool_only=bool_only, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(series) + + +def any_dataframe( + df, + axis=0, + bool_only=None, + skipna=True, + level=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + output_types = [OutputType.series] if axis is not None else [OutputType.scalar] + op = DataFrameAny( + axis=axis, + skipna=skipna, + level=level, + bool_only=bool_only, + combine_size=combine_size, + output_types=output_types, + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def any_index(index): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameAny(output_types=[OutputType.scalar], use_inf_as_na=use_inf_as_na) + return op(index) diff --git a/python/xorbits/_mars/dataframe/reduction/core.py b/python/xorbits/_mars/dataframe/reduction/core.py new file mode 
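For `all`/`any` on a DataFrame with `axis=None`, the `tile` implementations above chain two aggregations: a column-wise series reduction followed by a scalar reduction over that intermediate series. The equivalent in plain pandas (illustrative frame):

```python
import pandas as pd

df = pd.DataFrame({"a": [True, True], "b": [True, False]})

per_column = df.all(axis=0)    # first pass: OutputType.series
overall = per_column.all()     # second pass: OutputType.scalar
assert overall == df.values.all()
```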
100644 index 000000000..283b13538 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/core.py @@ -0,0 +1,1251 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import inspect +from collections import OrderedDict +from typing import Any, Callable, Dict, List, NamedTuple, Optional + +import numpy as np +import pandas as pd + +from ...core import ( + ENTITY_TYPE, + OutputType, + enter_mode, + is_build_mode, + is_kernel_mode, + recursive_tile, +) +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + DataTypeField, + Int32Field, + StringField, +) +from ...utils import pd_release_version, tokenize +from ..core import SERIES_TYPE +from ..operands import DATAFRAME_TYPE, DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_df, + build_empty_df, + build_empty_series, + build_series, + parse_index, + validate_axis, +) + +# in pandas<1.3, when aggregating with multiple levels and numeric_only is True, +# object cols not ignored with min-max funcs +_level_reduction_keep_object = pd_release_version[:2] < (1, 3) +# in pandas>=1.3, when dataframes are reduced into series, mixture of float and bool +# results in object. +_reduce_bool_as_object = pd_release_version[:2] != (1, 2) + + +class DataFrameReductionOperand(DataFrameOperand): + _axis = AnyField("axis") + _skipna = BoolField("skipna") + _level = AnyField("level") + _numeric_only = BoolField("numeric_only") + _bool_only = BoolField("bool_only") + _min_count = Int32Field("min_count") + _use_inf_as_na = BoolField("use_inf_as_na") + _method = StringField("method") + + _dtype = DataTypeField("dtype") + _combine_size = Int32Field("combine_size") + + def __init__( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + bool_only=None, + min_count=None, + dtype=None, + combine_size=None, + gpu=None, + sparse=None, + output_types=None, + use_inf_as_na=None, + method=None, + **kw, + ): + super().__init__( + _axis=axis, + _skipna=skipna, + _level=level, + _numeric_only=numeric_only, + _bool_only=bool_only, + _min_count=min_count, + _dtype=dtype, + _combine_size=combine_size, + gpu=gpu, + sparse=sparse, + _output_types=output_types, + _use_inf_as_na=use_inf_as_na, + _method=method, + **kw, + ) + + @property + def axis(self): + return self._axis + + @property + def skipna(self): + return self._skipna + + @property + def level(self): + return self._level + + @property + def numeric_only(self): + return self._numeric_only + + @property + def bool_only(self): + return self._bool_only + + @property + def min_count(self): + return self._min_count + + @property + def dtype(self): + return self._dtype + + @property + def combine_size(self): + return self._combine_size + + @property + def use_inf_as_na(self): + return self._use_inf_as_na + + @property + def is_atomic(self): + return False + + @property + def method(self): + return self._method + + def get_reduction_args(self, axis=None): + args = 
dict(skipna=self.skipna) + if self.inputs and self.inputs[0].ndim > 1: + args["axis"] = axis + if self.numeric_only is not None: + args["numeric_only"] = self.numeric_only + if self.bool_only is not None: + args["bool_only"] = self.bool_only + return {k: v for k, v in args.items() if v is not None} + + +class DataFrameCumReductionOperand(DataFrameOperand): + _axis = AnyField("axis") + _skipna = BoolField("skipna") + _use_inf_as_na = BoolField("use_inf_as_na") + + _dtype = DataTypeField("dtype") + + def __init__( + self, + axis=None, + skipna=None, + dtype=None, + gpu=None, + sparse=None, + output_types=None, + use_inf_as_na=None, + **kw, + ): + super().__init__( + _axis=axis, + _skipna=skipna, + _dtype=dtype, + gpu=gpu, + sparse=sparse, + _output_types=output_types, + _use_inf_as_na=use_inf_as_na, + **kw, + ) + + @property + def axis(self): + return self._axis + + @property + def skipna(self): + return self._skipna + + @property + def dtype(self): + return self._dtype + + @property + def use_inf_as_na(self): + return self._use_inf_as_na + + +def _default_agg_fun(value, func_name=None, **kw): + if value.ndim == 1: + kw.pop("bool_only", None) + kw.pop("numeric_only", None) + return getattr(value, func_name)(**kw) + else: + return getattr(value, func_name)(**kw) + + +@functools.lru_cache(100) +def _get_series_reduction_dtype( + dtype, + func_name, + axis=None, + bool_only=False, + skipna=True, + numeric_only=False, +): + test_series = build_series(dtype=dtype, ensure_string=True) + if func_name == "count": + reduced = test_series.count() + elif func_name == "nunique": + reduced = test_series.nunique() + elif func_name in ("all", "any"): + reduced = getattr(test_series, func_name)(axis=axis, bool_only=bool_only) + elif func_name == "size": + reduced = test_series.size + elif func_name == "str_concat": + reduced = pd.Series([test_series.str.cat()]) + else: + reduced = getattr(test_series, func_name)( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + return pd.Series(reduced).dtype + + +@functools.lru_cache(100) +def _get_df_reduction_dtype( + dtype, func_name, axis=None, bool_only=False, skipna=False, numeric_only=False +): + test_df = build_series(dtype=dtype, ensure_string=True).to_frame() + if func_name == "count": + reduced = getattr(test_df, func_name)(axis=axis, numeric_only=numeric_only) + elif func_name == "nunique": + reduced = getattr(test_df, func_name)(axis=axis) + elif func_name in ("all", "any"): + reduced = getattr(test_df, func_name)(axis=axis, bool_only=bool_only) + elif func_name == "str_concat": + reduced = test_df.apply(lambda s: s.str.cat(), axis=axis) + else: + reduced = getattr(test_df, func_name)( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + if len(reduced) == 0: + return None + return reduced.dtype + + +class DataFrameReductionMixin(DataFrameOperandMixin): + @classmethod + def get_reduction_callable(cls, op): + func_name = getattr(op, "_func_name") + kw = dict( + skipna=op.skipna, numeric_only=op.numeric_only, bool_only=op.bool_only + ) + kw = {k: v for k, v in kw.items() if v is not None} + fun = functools.partial(_default_agg_fun, func_name=func_name, **kw) + fun.__name__ = func_name + return fun + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + + if isinstance(out_df, SERIES_TYPE): + output_type = OutputType.series + dtypes = pd.Series([out_df.dtype], index=[out_df.name]) + index = out_df.index_value.to_pandas() + elif out_df.ndim == 1: + output_type = OutputType.tensor + dtypes, index = out_df.dtype, 
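The cached `_get_series_reduction_dtype`/`_get_df_reduction_dtype` helpers above infer the output dtype by running the reduction on a tiny probe object built from the input dtype, instead of touching real data. A simplified sketch of that trick (`reduction_dtype` is a hypothetical stand-in; the real helpers also thread `axis`, `skipna`, `bool_only` and `numeric_only` through):

```python
import functools

import numpy as np
import pandas as pd


@functools.lru_cache(maxsize=100)
def reduction_dtype(dtype_str: str, func_name: str) -> np.dtype:
    # probe a tiny series of the input dtype rather than the real (possibly huge) data
    probe = pd.Series([1, 2]).astype(dtype_str)
    reduced = getattr(probe, func_name)()
    return pd.Series([reduced]).dtype


assert reduction_dtype("int64", "mean") == np.dtype("float64")
assert reduction_dtype("int64", "sum") == np.dtype("int64")
```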
None + else: + output_type = OutputType.scalar + dtypes, index = out_df.dtype, None + + out_df = yield from recursive_tile( + in_df.agg( + cls.get_reduction_callable(op), + axis=op.axis or 0, + _numeric_only=op.numeric_only, + _bool_only=op.bool_only, + _combine_size=op.combine_size, + _output_type=output_type, + _dtypes=dtypes, + _index=index, + ) + ) + return [out_df] + + def _call_groupby_level(self, df, level): + return df.groupby(level=level).agg( + self.get_reduction_callable(self), method=self.method + ) + + def _call_dataframe(self, df): + axis = getattr(self, "axis", None) or 0 + level = getattr(self, "level", None) + skipna = getattr(self, "skipna", True) + numeric_only = getattr(self, "numeric_only", None) + bool_only = getattr(self, "bool_only", None) + self._axis = axis = validate_axis(axis, df) + func_name = getattr(self, "_func_name") + + if level is not None and axis == 1: + raise NotImplementedError("Not support specify level for axis==1") + + if func_name == "size": + reduced = pd.Series( + np.zeros(df.shape[1 - axis]), + index=df.dtypes.index if axis == 0 else None, + ) + reduced_cols = list(reduced.index) + reduced_dtype = reduced.dtype + elif func_name == "custom_reduction": + empty_df = build_df(df, ensure_string=True) + reduced = getattr(self, "custom_reduction").__call_agg__(empty_df) + reduced_cols = list(reduced.index) + reduced_dtype = reduced.dtype + else: + reduced_cols, dtypes = [], [] + for col, src_dt in df.dtypes.items(): + dt = _get_df_reduction_dtype( + src_dt, + func_name, + axis=axis, + bool_only=bool_only, + skipna=skipna, + numeric_only=numeric_only, + ) + if dt is not None: + reduced_cols.append(col) + dtypes.append(dt) + elif ( + _level_reduction_keep_object + and numeric_only + and level is not None + and func_name in ("min", "max") + and src_dt == np.dtype(object) + ): # pragma: no cover + reduced_cols.append(col) + dtypes.append(np.dtype(object)) + if len(dtypes) == 0: + reduced_dtype = np.dtype("O") + elif all(dt == dtypes[0] for dt in dtypes): + reduced_dtype = dtypes[0] + else: + # as we already bypassed dtypes with same values, + # when has_mixed_bool is True, there are other dtypes + # other than bool. + has_mixed_bool = any(dt == np.dtype(bool) for dt in dtypes) + if _reduce_bool_as_object and has_mixed_bool: + reduced_dtype = np.dtype("O") + elif not all(isinstance(dt, np.dtype) for dt in dtypes): + # todo currently we return mixed dtypes as np.dtype('O'). + # handle pandas Dtypes in the future more carefully. 
+ reduced_dtype = np.dtype("O") + else: + reduced_dtype = np.find_common_type(dtypes, []) + + if level is not None: + return self._call_groupby_level(df[reduced_cols], level) + + if axis == 0: + reduced_shape = (len(reduced_cols),) + reduced_index_value = parse_index(pd.Index(reduced_cols), store_data=True) + else: + reduced_shape = (df.shape[0],) + reduced_index_value = parse_index(pd.RangeIndex(-1)) + + return self.new_series( + [df], + shape=reduced_shape, + dtype=reduced_dtype, + index_value=reduced_index_value, + ) + + def _call_series(self, series): + level = getattr(self, "level", None) + axis = getattr(self, "axis", None) + skipna = getattr(self, "skipna", True) + numeric_only = getattr(self, "numeric_only", None) + bool_only = getattr(self, "bool_only", None) + self._axis = axis = validate_axis(axis or 0, series) + func_name = getattr(self, "_func_name") + + if level is not None: + return self._call_groupby_level(series, level) + + if func_name == "custom_reduction": + empty_series = build_series(series, ensure_string=True) + result_scalar = getattr(self, "custom_reduction").__call_agg__(empty_series) + if hasattr(result_scalar, "to_pandas"): # pragma: no cover + result_scalar = result_scalar.to_pandas() + result_dtype = pd.Series(result_scalar).dtype + else: + result_dtype = _get_series_reduction_dtype( + series.dtype, + func_name, + axis=axis, + bool_only=bool_only, + numeric_only=numeric_only, + skipna=skipna, + ) + return self.new_scalar([series], dtype=result_dtype) + + def __call__(self, a): + if is_kernel_mode() and not getattr(self, "is_atomic", False): + return self.get_reduction_callable(self)(a) + + if isinstance(a, DATAFRAME_TYPE): + return self._call_dataframe(a) + else: + return self._call_series(a) + + +class DataFrameCumReductionMixin(DataFrameOperandMixin): + @classmethod + def _tile_one_chunk(cls, op): + df = op.outputs[0] + params = df.params.copy() + + chk = op.inputs[0].chunks[0] + chunk_params = {k: v for k, v in chk.params.items() if k in df.params} + chunk_params["shape"] = df.shape + chunk_params["index"] = chk.index + new_chunk_op = op.copy().reset_key() + chunk = new_chunk_op.new_chunk(op.inputs[0].chunks, kws=[chunk_params]) + + new_op = op.copy() + nsplits = tuple((s,) for s in chunk.shape) + params["chunks"] = [chunk] + params["nsplits"] = nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _build_combine(cls, op, input_chunks, summary_chunks, idx): + c = input_chunks[idx] + to_concat_chunks = [c] + for j in range(idx): + to_concat_chunks.append(summary_chunks[j]) + + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.combine + return new_chunk_op.new_chunk(to_concat_chunks, **c.params) + + @classmethod + def _tile_dataframe(cls, op): + in_df = op.inputs[0] + df = op.outputs[0] + + n_rows, n_cols = in_df.chunk_shape + + # map to get individual results and summaries + src_chunks = np.empty(in_df.chunk_shape, dtype=object) + summary_chunks = np.empty(in_df.chunk_shape, dtype=object) + for c in in_df.chunks: + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.map + if op.axis == 1: + summary_shape = (c.shape[0], 1) + else: + summary_shape = (1, c.shape[1]) + src_chunks[c.index] = c + summary_chunks[c.index] = new_chunk_op.new_chunk( + [c], shape=summary_shape, dtypes=df.dtypes + ) + + # combine summaries into results + output_chunk_array = np.empty(in_df.chunk_shape, dtype=object) + if op.axis == 1: + for row in range(n_rows): + row_src = src_chunks[row, :] + row_summaries = 
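The dtype-merging logic above has to special-case mixtures of bool and other dtypes because, on the pandas versions covered by `_reduce_bool_as_object` (everything except 1.2.x), reducing a frame with both kinds of columns yields an object series rather than a common numeric dtype. A quick check of that behaviour (illustrative frame):

```python
import pandas as pd

df = pd.DataFrame({"flag": [True, False], "x": [1.5, 2.5]})
reduced = df.max()      # one bool column, one float column
print(reduced.dtype)    # object on the affected pandas versions; pandas 1.2 reported a numeric dtype
```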
summary_chunks[row, :] + for col in range(n_cols): + output_chunk_array[row, col] = cls._build_combine( + op, row_src, row_summaries, col + ) + else: + for col in range(n_cols): + col_src = src_chunks[:, col] + col_summaries = summary_chunks[:, col] + for row in range(n_rows): + output_chunk_array[row, col] = cls._build_combine( + op, col_src, col_summaries, row + ) + + output_chunks = list(output_chunk_array.reshape((n_rows * n_cols,))) + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, + shape=in_df.shape, + nsplits=in_df.nsplits, + chunks=output_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_series(cls, op): + in_series = op.inputs[0] + series = op.outputs[0] + + # map to get individual results and summaries + summary_chunks = np.empty(in_series.chunk_shape, dtype=object) + for c in in_series.chunks: + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.map + summary_chunks[c.index] = new_chunk_op.new_chunk( + [c], shape=(1,), dtype=series.dtype + ) + + # combine summaries into results + output_chunks = [ + cls._build_combine(op, in_series.chunks, summary_chunks, i) + for i in range(len(in_series.chunks)) + ] + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, + shape=in_series.shape, + nsplits=in_series.nsplits, + chunks=output_chunks, + dtype=series.dtype, + index_value=series.index_value, + name=series.name, + ) + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + if len(in_df.chunks) == 1: + return cls._tile_one_chunk(op) + if isinstance(in_df, DATAFRAME_TYPE): + return cls._tile_dataframe(op) + else: + return cls._tile_series(op) + + @staticmethod + def _get_last_slice(op, df, start): + if op.output_types[0] == OutputType.series: + return df.iloc[start:] + else: + if op.axis == 1: + return df.iloc[:, start:] + else: + return df.iloc[start:, :] + + @classmethod + def _execute_map(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + kwargs = dict() + if op.axis is not None: + kwargs["axis"] = op.axis + if op.skipna is not None: + kwargs["skipna"] = op.skipna + partial = getattr(in_data, getattr(cls, "_func_name"))(**kwargs) + if op.skipna: + partial.fillna(method="ffill", axis=op.axis, inplace=True) + ctx[op.outputs[0].key] = cls._get_last_slice(op, partial, -1) + + @classmethod + def _execute_combine(cls, ctx, op): + kwargs = dict() + if op.axis is not None: + kwargs["axis"] = op.axis + if op.skipna is not None: + kwargs["skipna"] = op.skipna + + if len(op.inputs) > 1: + ref_datas = [ctx[inp.key] for inp in op.inputs[1:]] + concat_df = getattr( + pd.concat(ref_datas, axis=op.axis), getattr(cls, "_func_name") + )(**kwargs) + if op.skipna: + concat_df.fillna(method="ffill", axis=op.axis, inplace=True) + + in_data = ctx[op.inputs[0].key] + concat_df = pd.concat( + [cls._get_last_slice(op, concat_df, -1), in_data], axis=op.axis + ) + result = getattr(concat_df, getattr(cls, "_func_name"))(**kwargs) + ctx[op.outputs[0].key] = cls._get_last_slice(op, result, 1) + else: + ctx[op.outputs[0].key] = getattr( + ctx[op.inputs[0].key], getattr(cls, "_func_name") + )(**kwargs) + + @classmethod + def execute(cls, ctx, op): + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + else: + return cls._execute_combine(ctx, op) + finally: + pd.reset_option("mode.use_inf_as_na") + + def _call_dataframe(self, df): + axis = getattr(self, "axis", None) or 0 + self._axis = 
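The cumulative-reduction tiling above gives every output chunk its own input chunk plus one-row "summary" chunks from all predecessors, so each chunk can finish its cumulation locally. A simplified, series-level sketch of the same idea for `cumsum` (plain pandas; the real combine stage works on the pre-cumulated summaries rather than re-deriving them):

```python
import pandas as pd

chunks = [pd.Series([1, 2]), pd.Series([3, 4]), pd.Series([5])]

# map stage: keep only the last value of each chunk's local cumsum as a summary
summaries = [c.cumsum().iloc[-1:] for c in chunks]

# combine stage: chunk i prepends the summaries of chunks 0..i-1, cumulates,
# then drops the borrowed summary rows again
outputs = []
for i, c in enumerate(chunks):
    combined = pd.concat(summaries[:i] + [c]).cumsum()
    outputs.append(combined.iloc[i:])          # i one-row summaries were prepended

assert pd.concat(outputs).tolist() == pd.concat(chunks).cumsum().tolist()
```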
axis = validate_axis(axis, df) + + empty_df = build_empty_df(df.dtypes) + reduced_df = getattr(empty_df, getattr(self, "_func_name"))(axis=axis) + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=reduced_df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + def _call_series(self, series): + axis = getattr(self, "axis", None) or 0 + if axis == "index": + axis = 0 + self._axis = axis + + return self.new_series( + [series], + shape=series.shape, + dtype=series.dtype, + name=series.name, + index_value=series.index_value, + ) + + def __call__(self, a): + if isinstance(a, DATAFRAME_TYPE): + return self._call_dataframe(a) + else: + return self._call_series(a) + + +class CustomReduction: + name: Optional[str] + output_limit: Optional[int] + kwds: Dict + + # set to True when pre() already performs aggregation + pre_with_agg = False + + def __init__(self, name=None, is_gpu=None): + self.name = name or "" + self.output_limit = 1 + self._is_gpu = is_gpu + + @property + def __name__(self): + return self.name + + def __call__(self, value): + if isinstance(value, ENTITY_TYPE): + from .custom_reduction import build_custom_reduction_result + + return build_custom_reduction_result(value, self) + return self.__call_agg__(value) + + def __call_agg__(self, value): + r = self.pre(value) + if not isinstance(r, tuple): + r = (r,) + # update output limit into actual size + self.output_limit = len(r) + + # only perform aggregation when pre() does not perform aggregation + if not self.pre_with_agg: + r = self.agg(*r) + if not isinstance(r, tuple): + r = (r,) + + r = self.post(*r) + return r + + def is_gpu(self): + return self._is_gpu if not is_build_mode() else False + + def pre(self, value): # noqa: R0201 # pylint: disable=no-self-use + return (value,) + + def agg(self, *values): # noqa: R0201 # pylint: disable=no-self-use + raise NotImplementedError + + def post(self, *value): # noqa: R0201 # pylint: disable=no-self-use + assert len(value) == 1 + return value[0] + + def __mars_tokenize__(self): + import cloudpickle + + return cloudpickle.dumps(self) + + +class ReductionPreStep(NamedTuple): + input_key: str + output_key: str + columns: Optional[List[str]] + func: Callable + + +class ReductionAggStep(NamedTuple): + input_key: str + raw_func_name: Optional[str] + map_func_name: Optional[str] + agg_func_name: Optional[str] + custom_reduction: Optional[CustomReduction] + output_key: str + output_limit: int + kwds: Dict[str, Any] + + +class ReductionPostStep(NamedTuple): + input_keys: List[str] + output_key: str + func_name: str + columns: Optional[List[str]] + func: Callable + + +class ReductionSteps(NamedTuple): + pre_funcs: List[ReductionPreStep] + agg_funcs: List[ReductionAggStep] + post_funcs: List[ReductionPostStep] + + +# lookup table for numpy arithmetic operands in pandas +_func_name_converts = dict( + greater="gt", + greater_equal="ge", + less="lt", + less_equal="le", + equal="eq", + not_equal="ne", + true_divide="truediv", + floor_divide="floordiv", + power="pow", +) +_func_name_to_op = dict( + greater=">", + gt=">", + greater_equal=">=", + ge=">", + less="<", + lt="<", + less_equal="<=", + le="<=", + equal="==", + eq="==", + not_equal="!=", + ne="!=", + bitwise_and="&", + __and__="&", + bitwise_or="|", + __or__="|", + bitwise_xor="^", + __xor__="^", + add="+", + subtract="-", + sub="-", + multiply="*", + mul="*", + true_divide="/", + truediv="/", + floor_divide="//", + floordiv="//", + power="**", + pow="**", + mod="%", +) +_func_compile_cache = dict() # type: 
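`CustomReduction` above splits a user-defined reduction into `pre` (per-chunk partials), `agg` (merge partials across chunks) and `post` (finalize). A hedged sketch of the protocol for a mean (`ChunkMean` is a hypothetical example; the exact container types `agg` receives depend on the surrounding map/combine stages):

```python
# CustomReduction is the base class defined in this core module
from xorbits._mars.dataframe.reduction.core import CustomReduction


class ChunkMean(CustomReduction):
    def pre(self, value):
        # per-chunk partial results
        return value.sum(), value.count()

    def agg(self, sums, counts):
        # merge partials coming from different chunks
        return sums.sum(), counts.sum()

    def post(self, total, count):
        # derive the final statistic from the merged partials
        return total / count
```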
Dict[str, ReductionSteps] + + +class ReductionCompiler: + def __init__(self, axis=0, store_source=False): + self._axis = axis + self._store_source = store_source + + self._key_to_tileable = dict() + self._output_tileables = [] + self._lambda_counter = 0 + self._custom_counter = 0 + self._func_cache = dict() + + self._compiled_funcs = [] + self._output_key_to_pre_steps = dict() + self._output_key_to_pre_cols = dict() + self._output_key_to_agg_steps = dict() + self._output_key_to_post_steps = dict() + self._output_key_to_post_cols = dict() + + @classmethod + def _check_function_valid(cls, func): + if isinstance(func, functools.partial): + return cls._check_function_valid(func.func) + elif isinstance(func, CustomReduction): + return + + func_code = func.__code__ + func_vars = {n: func.__globals__.get(n) for n in func_code.co_names} + if func.__closure__: + func_vars.update( + { + n: cell.cell_contents + for n, cell in zip(func_code.co_freevars, func.__closure__) + } + ) + # external Mars objects shall not be referenced + for var_name, val in func_vars.items(): + if isinstance(val, ENTITY_TYPE): + raise ValueError( + f"Variable {var_name} used by {func.__name__} " + "cannot be a Mars object" + ) + + @staticmethod + def _update_col_dict(col_dict: Dict, key: str, cols: List): + if key in col_dict: + existing_cols = col_dict[key] + if existing_cols is not None: + existing_col_set = set(existing_cols) + col_dict[key].extend([c for c in cols if c not in existing_col_set]) + else: + col_dict[key] = list(cols) if cols is not None else None + + def add_function(self, func, ndim, cols=None, func_name=None): + from .aggregation import _agg_functions + + cols = cols if cols is not None and self._axis == 0 else None + + func_name = func_name or getattr(func, "__name__", None) + if func_name == "" or func_name is None: + func_name = f"" + self._lambda_counter += 1 + if func_name == "" or func_name is None: + func_name = f"" + self._custom_counter += 1 + + if inspect.isbuiltin(func): + raw_func_name = getattr(func, "__name__", "N/A") + if raw_func_name in _agg_functions: + func = _agg_functions[raw_func_name] + else: + raise ValueError(f"Unexpected built-in function {raw_func_name}") + + compile_result = self._compile_function(func, func_name, ndim=ndim) + self._compiled_funcs.append(compile_result) + + for step in compile_result.pre_funcs: + self._output_key_to_pre_steps[step.output_key] = step + self._update_col_dict(self._output_key_to_pre_cols, step.output_key, cols) + + for step in compile_result.agg_funcs: + self._output_key_to_agg_steps[step.output_key] = step + + for step in compile_result.post_funcs: + self._output_key_to_post_steps[step.output_key] = step + self._update_col_dict(self._output_key_to_post_cols, step.output_key, cols) + + def _compile_expr_function(self, py_src: str, local_consts: dict): + from ... 
import dataframe, tensor + + result_store = dict() + global_vars = globals().copy() + global_vars.update(local_consts) + global_vars.update(dict(mt=tensor, md=dataframe, array=np.array, nan=np.nan)) + exec( + py_src, global_vars, result_store + ) # noqa: W0122 # nosec # pylint: disable=exec-used + fun = result_store["expr_function"] + if self._store_source: + fun.__source__ = py_src + return fun + + @staticmethod + def _build_mock_return_object(func, input_dtype, ndim): + from ..initializer import DataFrame as MarsDataFrame + from ..initializer import Series as MarsSeries + + if ndim == 1: + mock_series = build_empty_series(np.dtype(input_dtype)) + mock_obj = MarsSeries(mock_series) + else: + mock_df = build_empty_df( + pd.Series([np.dtype(input_dtype)] * 2, index=["A", "B"]) + ) + mock_obj = MarsDataFrame(mock_df) + + # calc target tileable to generate DAG + with enter_mode(kernel=True, build=False): + return func(mock_obj) + + @enter_mode(build=True) + def _compile_function(self, func, func_name=None, ndim=1) -> ReductionSteps: + from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp + from ...tensor.base import TensorWhere + from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp + from ..datasource.dataframe import DataFrameDataSource + from ..datasource.series import SeriesDataSource + from ..indexing.where import DataFrameWhere + + func_token = tokenize(func, self._axis, func_name, ndim) + if func_token in _func_compile_cache: + return _func_compile_cache[func_token] + custom_reduction = func if isinstance(func, CustomReduction) else None + + self._check_function_valid(func) + + try: + func_ret = self._build_mock_return_object(func, float, ndim=ndim) + except (TypeError, AttributeError): + # we may encounter lambda x: x.str.cat(...), use an object series to test + func_ret = self._build_mock_return_object(func, object, ndim=1) + output_limit = getattr(func, "output_limit", None) or 1 + + if not isinstance(func_ret, ENTITY_TYPE): + raise ValueError( + f"Custom function should return a Mars object, not {type(func_ret)}" + ) + if func_ret.ndim >= ndim: + raise ValueError("Function not a reduction") + + agg_graph = func_ret.build_graph() + agg_tileables = set(t for t in agg_graph if getattr(t.op, "is_atomic", False)) + # check operands before aggregation + for t in agg_graph.dfs( + list(agg_tileables), visit_predicate="all", reverse=True + ): + if t not in agg_tileables and not isinstance( + t.op, + ( + DataFrameUnaryOp, + DataFrameBinOp, + TensorUnaryOp, + TensorBinOp, + TensorWhere, + DataFrameWhere, + DataFrameDataSource, + SeriesDataSource, + ), + ): + raise ValueError(f"Cannot support operand {type(t.op)} in aggregation") + # check operands after aggregation + for t in agg_graph.dfs(list(agg_tileables), visit_predicate="all"): + if t not in agg_tileables and not isinstance( + t.op, + ( + DataFrameUnaryOp, + DataFrameBinOp, + TensorWhere, + DataFrameWhere, + TensorUnaryOp, + TensorBinOp, + ), + ): + raise ValueError(f"Cannot support operand {type(t.op)} in aggregation") + + pre_funcs, agg_funcs, post_funcs = [], [], [] + visited_inputs = set() + # collect aggregations and their inputs + for t in agg_tileables: + agg_input_key = t.inputs[0].key + + # collect agg names + step_func_name = getattr(t.op, "_func_name") + if step_func_name in ("count", "size"): + map_func_name, agg_func_name = step_func_name, "sum" + else: + map_func_name, agg_func_name = step_func_name, step_func_name + + # build agg description + agg_funcs.append( + ReductionAggStep( + agg_input_key, + 
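`_compile_expr_function` above turns generated source into a callable by `exec`-ing it against a controlled globals dict carrying numpy, the dataframe/tensor modules and any captured constants. A stripped-down stand-in showing the mechanism (the source string here is a hand-written example of the `invar*`/`var*`/`_const_*` naming used by the compiler, not actual compiler output):

```python
import numpy as np


def compile_expr_function(py_src: str, local_consts: dict):
    # exec the generated source in a controlled namespace and pull the function out
    result_store = {}
    global_vars = {"np": np, "nan": np.nan, **local_consts}
    exec(py_src, global_vars, result_store)  # nosec - source is generated, not user input
    return result_store["expr_function"]


src = (
    "def expr_function(invar0, gpu=None):\n"
    "    var0 = invar0 + _const_0\n"
    "    return var0\n"
)
fn = compile_expr_function(src, {"_const_0": 1})
assert fn(41) == 42
```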
func_name, + map_func_name, + agg_func_name, + custom_reduction, + t.key, + output_limit, + t.op.get_reduction_args(axis=self._axis), + ) + ) + # collect agg input and build function + if agg_input_key not in visited_inputs: + visited_inputs.add(agg_input_key) + initial_inputs = list(t.inputs[0].build_graph().iter_indep()) + assert len(initial_inputs) == 1 + input_key = initial_inputs[0].key + + func_str, _, local_consts = self._generate_function_str(t.inputs[0]) + pre_funcs.append( + ReductionPreStep( + input_key, + agg_input_key, + None, + self._compile_expr_function(func_str, local_consts), + ) + ) + # collect function output after agg + func_str, input_keys, local_consts = self._generate_function_str(func_ret) + post_funcs.append( + ReductionPostStep( + input_keys, + func_ret.key, + func_name, + None, + self._compile_expr_function(func_str, local_consts), + ) + ) + if len(_func_compile_cache) > 100: # pragma: no cover + _func_compile_cache.pop(next(iter(_func_compile_cache.keys()))) + result = _func_compile_cache[func_token] = ReductionSteps( + pre_funcs, agg_funcs, post_funcs + ) + return result + + def _generate_function_str(self, out_tileable): + """ + Generate python code from tileable DAG + """ + from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp + from ...tensor.base import TensorWhere + from ...tensor.datasource import Scalar + from ..arithmetic.core import ( + DataFrameBinOp, + DataFrameUnaryOp, + DataFrameUnaryUfunc, + ) + from ..datasource.dataframe import DataFrameDataSource + from ..datasource.series import SeriesDataSource + from ..indexing.where import DataFrameWhere + + input_key_to_var = OrderedDict() + local_key_to_var = dict() + local_consts_to_val = dict() + ref_counts = dict() + ref_visited = set() + local_lines = [] + + input_op_types = ( + DataFrameDataSource, + SeriesDataSource, + DataFrameReductionOperand, + ) + + def _calc_ref_counts(t): + # calculate object refcount for t, this reduces memory usage in functions + if t.key in ref_visited: + return + ref_visited.add(t.key) + for inp in t.inputs: + _calc_ref_counts(inp) + + if not isinstance(inp.op, input_op_types): + if inp.key not in ref_counts: + ref_counts[inp.key] = 0 + ref_counts[inp.key] += 1 + + def _gen_expr_str(t): + # generate code for t + if t.key in local_key_to_var: + return + + if isinstance(t.op, input_op_types): + # tileable is an input arg, build a function variable + if t.key not in input_key_to_var: # pragma: no branch + input_key_to_var[t.key] = local_key_to_var[ + t.key + ] = f"invar{len(input_key_to_var)}" + else: + keys_to_del = [] + for inp in t.inputs: + _gen_expr_str(inp) + + if inp.key in ref_counts: + ref_counts[inp.key] -= 1 + if ref_counts[inp.key] == 0: + # the input is no longer referenced, a del statement will be produced + keys_to_del.append(inp.key) + + var_name = local_key_to_var[t.key] = f"var{len(local_key_to_var)}" + keys_to_vars = {inp.key: local_key_to_var[inp.key] for inp in t.inputs} + + def _interpret_var(v): + # get representation for variables + if hasattr(v, "key"): + return keys_to_vars[v.key] + elif isinstance(v, (int, bool, str, bytes, np.integer, np.bool_)): + return repr(v) + else: + const_name = f"_const_{len(local_consts_to_val)}" + local_consts_to_val[const_name] = v + return const_name + + func_name = func_name_raw = getattr(t.op, "_func_name", None) + rfunc_name = getattr(t.op, "_rfunc_name", func_name) + + if func_name is None: + func_name = func_name_raw = getattr(t.op, "_bit_func_name", None) + rfunc_name = getattr(t.op, "_bit_rfunc_name", 
func_name) + + # handle function name differences between numpy and pandas arithmetic ops + if func_name in _func_name_converts: + func_name = _func_name_converts[func_name] + if rfunc_name in _func_name_converts: + rfunc_name = "r" + _func_name_converts[rfunc_name] + + # build given different op types + if isinstance(t.op, (DataFrameUnaryOp, TensorUnaryOp)): + val = _interpret_var(t.inputs[0]) + if isinstance(t.op, DataFrameUnaryUfunc): + statements = [f"{var_name} = np.{func_name_raw}({val})"] + else: + statements = [ + f"try:", + f" {var_name} = {val}.{func_name}()", + f"except AttributeError:", + f" {var_name} = np.{func_name_raw}({val})", + ] + elif isinstance(t.op, (DataFrameBinOp, TensorBinOp)): + lhs, rhs = t.op.lhs, t.op.rhs + op_axis = ( + 1 - self._axis + if hasattr(lhs, "ndim") + and hasattr(rhs, "ndim") + and lhs.ndim != rhs.ndim + else None + ) + lhs = _interpret_var(lhs) + rhs = _interpret_var(rhs) + axis_expr = f"axis={op_axis!r}, " if op_axis is not None else "" + op_str = _func_name_to_op[func_name] + if t.op.lhs is t.inputs[0]: + statements = [ + f"try:", + f" {var_name} = {lhs}.{func_name}({rhs}, {axis_expr})", + f"except AttributeError:", + f" {var_name} = {lhs} {op_str} {rhs}", + ] + else: + statements = [ + f"try:", + f" {var_name} = {rhs}.{rfunc_name}({lhs}, {axis_expr})", + f"except AttributeError:", + f" {var_name} = {rhs} {op_str} {lhs}", + ] + elif isinstance(t.op, TensorWhere): + cond = _interpret_var(t.op.condition) + x = _interpret_var(t.op.x) + y = _interpret_var(t.op.y) + statements = [ + f"if not gpu:", + f" {var_name} = np.where({cond}, {x}, {y})", + f"else:", # there is a bug with cudf.where + f" {var_name} = {x}", + ] + elif isinstance(t.op, DataFrameWhere): + func_name = "mask" if t.op.replace_true else "where" + inp = _interpret_var(t.op.input) + cond = _interpret_var(t.op.cond) + other = _interpret_var(t.op.other) + statements = [ + f"if not gpu:", + f" {var_name} = {inp}.{func_name}({cond}, {other}, " + f"axis={t.op.axis!r}, level={t.op.level!r})", + f"else:", # there is a bug with cudf.where + f" {var_name} = {inp}", + ] + elif isinstance(t.op, Scalar): + # for scalar inputs of other operands + data = _interpret_var(t.op.data) + statements = [f"{var_name} = {data}"] + else: # pragma: no cover + raise NotImplementedError( + f"Does not support aggregating on {type(t.op)}" + ) + + # append del statements for used inputs + for key in keys_to_del: + statements.append(f"del {local_key_to_var[key]}") + + local_lines.extend(statements) + + _calc_ref_counts(out_tileable) + _gen_expr_str(out_tileable) + + args_str = ", ".join(input_key_to_var.values()) + lines_str = "\n ".join(local_lines) + return ( + f"def expr_function({args_str}, gpu=None):\n" + f" {lines_str}\n" + f" return {local_key_to_var[out_tileable.key]}", + list(input_key_to_var.keys()), + local_consts_to_val, + ) + + def compile(self) -> ReductionSteps: + pre_funcs, agg_funcs, post_funcs = [], [], [] + referred_cols = set() + for key, step in self._output_key_to_pre_steps.items(): + cols = self._output_key_to_pre_cols[key] + if cols: + referred_cols.update(cols) + pre_funcs.append( + ReductionPreStep(step.input_key, step.output_key, cols, step.func) + ) + + for step in self._output_key_to_agg_steps.values(): + agg_funcs.append(step) + + for key, step in self._output_key_to_post_steps.items(): + cols = self._output_key_to_post_cols[key] + if cols and set(cols) == set(referred_cols): + post_cols = None + else: + post_cols = cols + + func_name = step.func_name + if self._lambda_counter == 1 and 
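The binary-op statements generated above wrap each pandas flex-method call in `try`/`except AttributeError`, so the same source still works when an operand has already degraded to a numpy array or scalar that only supports the plain operator. The pattern in isolation (hypothetical `greater` helper):

```python
import numpy as np
import pandas as pd


def greater(lhs, rhs, axis=0):
    # prefer the pandas method, which accepts an axis for broadcasting,
    # and fall back to the operator for numpy/scalar operands
    try:
        return lhs.gt(rhs, axis=axis)
    except AttributeError:
        return lhs > rhs


df = pd.DataFrame({"a": [1, 5], "b": [3, 2]})
print(greater(df, pd.Series([2, 4], index=df.index), axis=0))  # column-wise broadcast
print(greater(np.array([1, 5]), 3))                            # operator fallback
```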
step.func_name == "": + func_name = "" + if self._custom_counter == 1 and step.func_name == "": + func_name = "" + + post_funcs.append( + ReductionPostStep( + step.input_keys, + step.output_key, + func_name, + post_cols, + step.func, + ) + ) + + return ReductionSteps(pre_funcs, agg_funcs, post_funcs) diff --git a/python/xorbits/_mars/dataframe/reduction/count.py b/python/xorbits/_mars/dataframe/reduction/count.py new file mode 100644 index 000000000..3b4643fe2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/count.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameCount(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.COUNT + _func_name = "count" + + @property + def is_atomic(self): + return True + + @classmethod + def get_reduction_callable(cls, op): + skipna, numeric_only = op.skipna, op.numeric_only + + def count(value): + if value.ndim == 1: + return value.count() + return value.count(skipna=skipna, numeric_only=numeric_only) + + return count + + +def count_series(series, level=None, combine_size=None, **kw): + use_inf_as_na = kw.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na) + method = kw.pop("method", None) + op = DataFrameCount( + level=level, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(series) + + +def count_dataframe( + df, axis=0, level=None, numeric_only=False, combine_size=None, **kw +): + use_inf_as_na = kw.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na) + method = kw.pop("method", None) + op = DataFrameCount( + axis=axis, + level=level, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cummax.py b/python/xorbits/_mars/dataframe/reduction/cummax.py new file mode 100644 index 000000000..b48976c27 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/cummax.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand + + +class DataFrameCummax(DataFrameCumReductionOperand, DataFrameCumReductionMixin): + _op_type_ = OperandDef.CUMMAX + _func_name = "cummax" + + +def cummax(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameCummax( + axis=axis, + skipna=skipna, + output_types=df.op.output_types, + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cummin.py b/python/xorbits/_mars/dataframe/reduction/cummin.py new file mode 100644 index 000000000..502a59749 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/cummin.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...config import options +from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand + + +class DataFrameCummin(DataFrameCumReductionOperand, DataFrameCumReductionMixin): + _op_type_ = OperandDef.CUMMIN + _func_name = "cummin" + + +def cummin(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameCummin( + axis=axis, + skipna=skipna, + output_types=df.op.output_types, + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cumprod.py b/python/xorbits/_mars/dataframe/reduction/cumprod.py new file mode 100644 index 000000000..22b3d99f9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/cumprod.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand + + +class DataFrameCumprod(DataFrameCumReductionOperand, DataFrameCumReductionMixin): + _op_type_ = OperandDef.CUMPROD + _func_name = "cumprod" + + +def cumprod(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameCumprod( + axis=axis, + skipna=skipna, + output_types=df.op.output_types, + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cumsum.py b/python/xorbits/_mars/dataframe/reduction/cumsum.py new file mode 100644 index 000000000..964e721c3 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/cumsum.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...config import options +from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand + + +class DataFrameCumsum(DataFrameCumReductionOperand, DataFrameCumReductionMixin): + _op_type_ = OperandDef.CUMSUM + _func_name = "cumsum" + + +def cumsum(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameCumsum( + axis=axis, + skipna=skipna, + output_types=df.op.output_types, + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/custom_reduction.py b/python/xorbits/_mars/dataframe/reduction/custom_reduction.py new file mode 100644 index 000000000..e4e66afa9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/custom_reduction.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import AnyField +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameCustomReduction(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.CUSTOM_REDUCTION + _func_name = "custom_reduction" + + custom_reduction = AnyField("custom_reduction") + + @property + def is_atomic(self): + return True + + def get_reduction_args(self, axis=None): + return dict() + + +def build_custom_reduction_result(df, custom_reduction_obj, method=None): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + output_type = OutputType.series if df.ndim == 2 else OutputType.scalar + op = DataFrameCustomReduction( + custom_reduction=custom_reduction_obj, + output_types=[output_type], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/kurtosis.py b/python/xorbits/_mars/dataframe/reduction/kurtosis.py new file mode 100644 index 000000000..b8e66d02f --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/kurtosis.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes +from ...config import options +from ...core import ENTITY_TYPE, OutputType +from ...serialization.serializables import BoolField +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameKurtosis(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = opcodes.KURTOSIS + _func_name = "kurt" + + _bias = BoolField("bias") + _fisher = BoolField("fisher") + + def __init__(self, bias=None, fisher=None, **kw): + super().__init__(_bias=bias, _fisher=fisher, **kw) + + @property + def bias(self): + return self._bias + + @property + def fisher(self): + return self._fisher + + @classmethod + def get_reduction_callable(cls, op): + from .aggregation import where_function + + skipna, bias, fisher = op.skipna, op.bias, op.fisher + + def kurt(x): + cnt = x.count() + mean = x.mean(skipna=skipna) + divided = ( + (x**4).mean(skipna=skipna) + - 4 * (x**3).mean(skipna=skipna) * mean + + 6 * (x**2).mean(skipna=skipna) * mean**2 + - 3 * mean**4 + ) + var = x.var(skipna=skipna, ddof=0) + if isinstance(var, ENTITY_TYPE) or var > 0: + val = where_function(var > 0, divided / var**2, np.nan) + else: + val = np.nan + if not bias: + val = where_function( + (var > 0) & (cnt > 3), + (val * (cnt**2 - 1) - 3 * (cnt - 1) ** 2) / (cnt - 2) / (cnt - 3), + np.nan, + ) + if not fisher: + val += 3 + return val + + return kurt + + +def kurt_series( + df, + axis=None, + skipna=True, + level=None, + combine_size=None, + bias=False, + fisher=True, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameKurtosis( + axis=axis, + skipna=skipna, + level=level, + combine_size=combine_size, + bias=bias, + fisher=fisher, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def kurt_dataframe( + df, + axis=None, + skipna=True, + level=None, + numeric_only=None, + combine_size=None, + bias=False, + fisher=True, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameKurtosis( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + bias=bias, + fisher=fisher, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/max.py b/python/xorbits/_mars/dataframe/reduction/max.py new file mode 100644 index 000000000..6a6e38ad7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/max.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameMax(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.MAX + _func_name = "max" + + @property + def is_atomic(self): + return True + + +def max_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMax( + axis=axis, + skipna=skipna, + level=level, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def max_dataframe( + df, + axis=None, + skipna=True, + level=None, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMax( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def max_index(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMax( + axis=axis, + skipna=skipna, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/mean.py b/python/xorbits/_mars/dataframe/reduction/mean.py new file mode 100644 index 000000000..72a9196ba --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/mean.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameMean(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.MEAN + _func_name = "mean" + + @classmethod + def get_reduction_callable(cls, op): + skipna = op.skipna + + def mean(x): + return x.sum(skipna=skipna) / x.count() + + return mean + + +def mean_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMean( + axis=axis, + skipna=skipna, + level=level, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def mean_dataframe( + df, + axis=None, + skipna=True, + level=None, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMean( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/min.py b/python/xorbits/_mars/dataframe/reduction/min.py new file mode 100644 index 000000000..908b5a479 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/min.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameMin(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.MIN + _func_name = "min" + + @property + def is_atomic(self): + return True + + +def min_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMin( + axis=axis, + skipna=skipna, + level=level, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def min_dataframe( + df, + axis=None, + skipna=True, + level=None, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMin( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def min_index(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMin( + axis=axis, + skipna=skipna, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/nunique.py b/python/xorbits/_mars/dataframe/reduction/nunique.py new file mode 100644 index 000000000..0ccd6472a --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/nunique.py @@ -0,0 +1,240 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import BoolField +from ...utils import lazy_import +from ..arrays import ArrowListArray, ArrowListDtype +from .core import CustomReduction, DataFrameReductionMixin, DataFrameReductionOperand + +cudf = lazy_import("cudf") + + +class NuniqueReduction(CustomReduction): + pre_with_agg = True + + def __init__( + self, name="unique", axis=0, dropna=True, use_arrow_dtype=False, is_gpu=False + ): + super().__init__(name, is_gpu=is_gpu) + self._axis = axis + self._dropna = dropna + self._use_arrow_dtype = use_arrow_dtype + + @staticmethod + def _drop_duplicates_to_arrow(v, explode=False): + if explode: + v = v.explode() + try: + return ArrowListArray([v.drop_duplicates().to_numpy()]) + except pa.ArrowInvalid: + # fallback due to diverse dtypes + return [v.drop_duplicates().to_list()] + + def pre(self, in_data): # noqa: W0221 # pylint: disable=arguments-differ + xdf = cudf if self.is_gpu() else pd + if isinstance(in_data, xdf.Series): + unique_values = in_data.drop_duplicates() + return xdf.Series(unique_values, name=in_data.name) + else: + if self._axis == 0: + data = dict() + for d, v in in_data.iteritems(): + if not self._use_arrow_dtype or xdf is cudf: + data[d] = [v.drop_duplicates().to_list()] + else: + data[d] = self._drop_duplicates_to_arrow(v) + df = xdf.DataFrame(data) + else: + df = xdf.DataFrame(columns=[0]) + for d, v in in_data.iterrows(): + if not self._use_arrow_dtype or xdf is cudf: + df.loc[d] = [v.drop_duplicates().to_list()] + else: + df.loc[d] = self._drop_duplicates_to_arrow(v) + return df + + def agg(self, in_data): # noqa: W0221 # pylint: disable=arguments-differ + xdf = cudf if self.is_gpu() else pd + if isinstance(in_data, xdf.Series): + unique_values = in_data.explode().drop_duplicates() + return xdf.Series(unique_values, name=in_data.name) + else: + if self._axis == 0: + data = dict() + for d, v in in_data.iteritems(): + if not self._use_arrow_dtype or xdf is cudf: + data[d] = [v.explode().drop_duplicates().to_list()] + else: + v = pd.Series(v.to_numpy()) + data[d] = self._drop_duplicates_to_arrow(v, explode=True) + df = xdf.DataFrame(data) + else: + df = xdf.DataFrame(columns=[0]) + for d, v in in_data.iterrows(): + if not self._use_arrow_dtype or xdf is cudf: + df.loc[d] = [v.explode().drop_duplicates().to_list()] + else: + df.loc[d] = self._drop_duplicates_to_arrow(v, explode=True) + return df + + def post(self, in_data): # noqa: W0221 # pylint: disable=arguments-differ + xdf = cudf if self.is_gpu() else pd + if isinstance(in_data, xdf.Series): + return in_data.explode().nunique(dropna=self._dropna) + else: + in_data_iter = ( + in_data.iteritems() if self._axis == 0 else in_data.iterrows() + ) + data = dict() + for d, v in in_data_iter: + if isinstance(v.dtype, ArrowListDtype): + v = xdf.Series(v.to_numpy()) + data[d] = v.explode().nunique(dropna=self._dropna) + return xdf.Series(data) + + +class DataFrameNunique(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.NUNIQUE + _func_name = "nunique" + + _dropna = BoolField("dropna") + _use_arrow_dtype = BoolField("use_arrow_dtype") + + def __init__(self, dropna=None, use_arrow_dtype=None, **kw): + super().__init__(_dropna=dropna, _use_arrow_dtype=use_arrow_dtype, **kw) + + @property + def dropna(self): + return self._dropna + + @property + def use_arrow_dtype(self): + return self._use_arrow_dtype + + @classmethod + def get_reduction_callable(cls, op): + return 
NuniqueReduction( + name=cls._func_name, + axis=op.axis, + dropna=op.dropna, + use_arrow_dtype=op.use_arrow_dtype, + is_gpu=op.is_gpu(), + ) + + +def nunique_dataframe(df, axis=0, dropna=True, combine_size=None): + """ + Count distinct observations over requested axis. + + Return Series with number of distinct observations. Can ignore NaN + values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for + column-wise. + dropna : bool, default True + Don't include NaN in the counts. + combine_size : int, optional + The number of chunks to combine. + + Returns + ------- + Series + + See Also + -------- + Series.nunique: Method nunique for Series. + DataFrame.count: Count non-NA cells for each column or row. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]}) + >>> df.nunique().execute() + A 3 + B 1 + dtype: int64 + + >>> df.nunique(axis=1).execute() + 0 1 + 1 2 + 2 2 + dtype: int64 + """ + op = DataFrameNunique( + axis=axis, + dropna=dropna, + combine_size=combine_size, + output_types=[OutputType.series], + use_arrow_dtype=options.dataframe.use_arrow_dtype, + ) + return op(df) + + +def nunique_series(series, dropna=True, combine_size=None): + """ + Return number of unique elements in the object. + + Excludes NA values by default. + + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the count. + combine_size : int, optional + The number of chunks to combine. + + Returns + ------- + int + + See Also + -------- + DataFrame.nunique: Method nunique for DataFrame. + Series.count: Count non-NA/null observations in the Series. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 3, 5, 7, 7]) + >>> s.execute() + 0 1 + 1 3 + 2 5 + 3 7 + 4 7 + dtype: int64 + + >>> s.nunique().execute() + 4 + """ + op = DataFrameNunique( + dropna=dropna, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_arrow_dtype=options.dataframe.use_arrow_dtype, + ) + return op(series) diff --git a/python/xorbits/_mars/dataframe/reduction/prod.py b/python/xorbits/_mars/dataframe/reduction/prod.py new file mode 100644 index 000000000..ca1e9caee --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/prod.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes +from ...config import options +from ...core import OutputType +from .aggregation import where_function +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameProd(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = opcodes.PROD + _func_name = "prod" + + @property + def is_atomic(self): + return self.min_count == 0 + + @classmethod + def get_reduction_callable(cls, op): + skipna, min_count = op.skipna, op.min_count + + def prod(value): + if min_count == 0: + return value.prod(skipna=skipna) + else: + return where_function( + value.count() >= min_count, value.prod(skipna=skipna), np.nan + ) + + return prod + + +def prod_series( + df, axis=None, skipna=True, level=None, min_count=0, combine_size=None, method=None +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameProd( + axis=axis, + skipna=skipna, + level=level, + min_count=min_count, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def prod_dataframe( + df, + axis=None, + skipna=True, + level=None, + min_count=0, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameProd( + axis=axis, + skipna=skipna, + level=level, + min_count=min_count, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/reduction_size.py b/python/xorbits/_mars/dataframe/reduction/reduction_size.py new file mode 100644 index 000000000..3aa4dfbb5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/reduction_size.py @@ -0,0 +1,36 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameSize(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.REDUCTION_SIZE + _func_name = "size" + + @property + def is_atomic(self): + return True + + +def size_series(df): + op = DataFrameSize(output_types=[OutputType.scalar]) + return op(df) + + +def size_dataframe(df): + op = DataFrameSize(output_types=[OutputType.series]) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/sem.py b/python/xorbits/_mars/dataframe/reduction/sem.py new file mode 100644 index 000000000..a11c73105 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/sem.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import Int32Field +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameSem(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.SEM + _func_name = "sem" + + _ddof = Int32Field("ddof") + + def __init__(self, ddof=None, **kw): + super().__init__(_ddof=ddof, **kw) + + @property + def ddof(self): + return self._ddof + + @classmethod + def get_reduction_callable(cls, op): + skipna, ddof = op.skipna, op.ddof + + def sem(x): + var = x.var(skipna=skipna, ddof=ddof) + cnt = x.count() + return (var / cnt) ** 0.5 + + return sem + + +def sem_series( + series, axis=None, skipna=True, level=None, ddof=1, combine_size=None, method=None +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(series) + + +def sem_dataframe( + df, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/skew.py b/python/xorbits/_mars/dataframe/reduction/skew.py new file mode 100644 index 000000000..5609535ea --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/skew.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes +from ...config import options +from ...core import ENTITY_TYPE, OutputType +from ...serialization.serializables import BoolField +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameSkew(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = opcodes.SKEW + _func_name = "skew" + + _bias = BoolField("bias") + + def __init__(self, bias=None, **kw): + super().__init__(_bias=bias, **kw) + + @property + def bias(self): + return self._bias + + @classmethod + def get_reduction_callable(cls, op): + from .aggregation import where_function + + skipna, bias = op.skipna, op.bias + + def skew(x): + cnt = x.count() + mean = x.mean(skipna=skipna) + divided = ( + (x**3).mean(skipna=skipna) + - 3 * (x**2).mean(skipna=skipna) * mean + + 2 * mean**3 + ) + var = x.var(skipna=skipna, ddof=0) + if isinstance(var, ENTITY_TYPE) or var > 0: + val = where_function(var > 0, divided / var**1.5, np.nan) + else: + val = np.nan + if not bias: + val = where_function( + (var > 0) & (cnt > 2), + val * ((cnt * (cnt - 1)) ** 0.5 / (cnt - 2)), + np.nan, + ) + return val + + return skew + + +def skew_series( + df, axis=None, skipna=True, level=None, combine_size=None, bias=False, method=None +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSkew( + axis=axis, + skipna=skipna, + level=level, + combine_size=combine_size, + bias=bias, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def skew_dataframe( + df, + axis=None, + skipna=True, + level=None, + numeric_only=None, + combine_size=None, + bias=False, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSkew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + bias=bias, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/std.py b/python/xorbits/_mars/dataframe/reduction/std.py new file mode 100644 index 000000000..a2d446acf --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/std.py @@ -0,0 +1,58 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...tensor.arithmetic import sqrt +from .var import var_dataframe, var_series + + +def std_dataframe( + df, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + combine_size=None, + method=None, +): + ret = sqrt( + var_dataframe( + df, + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + combine_size=combine_size, + method=method, + ) + ) + return ret + + +def std_series( + series, axis=None, skipna=True, level=None, ddof=1, combine_size=None, method=None +): + ret = sqrt( + var_series( + series, + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + combine_size=combine_size, + method=method, + ) + ) + return ret diff --git a/python/xorbits/_mars/dataframe/reduction/str_concat.py b/python/xorbits/_mars/dataframe/reduction/str_concat.py new file mode 100644 index 000000000..2e2c82835 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/str_concat.py @@ -0,0 +1,59 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import StringField +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameStrConcat(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.STR_CONCAT + _func_name = "str_concat" + + _sep = StringField("sep") + _na_rep = StringField("na_rep") + + def __init__(self, sep=None, na_rep=None, **kw): + super().__init__(_sep=sep, _na_rep=na_rep, **kw) + + @property + def sep(self): + return self._sep + + @property + def na_rep(self): + return self._na_rep + + def get_reduction_args(self, axis=None): + return dict(sep=self._sep, na_rep=self._na_rep) + + @property + def is_atomic(self): + return True + + @classmethod + def get_reduction_callable(cls, op): + sep, na_rep = op.sep, op.na_rep + + def str_concat(obj): + return build_str_concat_object(obj, sep=sep, na_rep=na_rep) + + return str_concat + + +def build_str_concat_object(df, sep=None, na_rep=None): + output_type = OutputType.series if df.ndim == 2 else OutputType.scalar + op = DataFrameStrConcat(sep=sep, na_rep=na_rep, output_types=[output_type]) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/sum.py b/python/xorbits/_mars/dataframe/reduction/sum.py new file mode 100644 index 000000000..7dc431574 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/sum.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ...config import options +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameSum(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = opcodes.SUM + _func_name = "sum" + + @property + def is_atomic(self): + return self.min_count == 0 + + @classmethod + def get_reduction_callable(cls, op): + from .aggregation import where_function + + skipna, min_count = op.skipna, op.min_count + + def sum_(value): + if min_count == 0: + return value.sum(skipna=skipna) + else: + return where_function( + value.count() >= min_count, value.sum(skipna=skipna), np.nan + ) + + return sum_ + + +def sum_series( + df, axis=None, skipna=True, level=None, min_count=0, combine_size=None, method=None +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSum( + axis=axis, + skipna=skipna, + level=level, + min_count=min_count, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def sum_dataframe( + df, + axis=None, + skipna=True, + level=None, + min_count=0, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSum( + axis=axis, + skipna=skipna, + level=level, + min_count=min_count, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/tests/__init__.py b/python/xorbits/_mars/dataframe/reduction/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/reduction/tests/test_reduction.py b/python/xorbits/_mars/dataframe/reduction/tests/test_reduction.py new file mode 100644 index 000000000..4c5c92224 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/tests/test_reduction.py @@ -0,0 +1,625 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools +import operator +from functools import reduce +from typing import NamedTuple + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from ....core import tile +from ....core.operand import OperandStage +from ....tensor import Tensor +from ...core import DataFrame, IndexValue, OutputType, Series +from ...datasource.dataframe import from_pandas as from_pandas_df +from ...datasource.series import from_pandas as from_pandas_series +from ...merge import DataFrameConcat +from .. import ( + CustomReduction, + DataFrameAggregate, + DataFrameAll, + DataFrameAny, + DataFrameCount, + DataFrameCummax, + DataFrameCummin, + DataFrameCumprod, + DataFrameCumsum, + DataFrameKurtosis, + DataFrameMax, + DataFrameMean, + DataFrameMin, + DataFrameNunique, + DataFrameProd, + DataFrameSem, + DataFrameSkew, + DataFrameSum, + DataFrameVar, +) +from ..aggregation import where_function +from ..core import ReductionCompiler + +pytestmark = pytest.mark.pd_compat + + +class FunctionOptions(NamedTuple): + has_skipna: bool = True + has_numeric_only: bool = True + has_bool_only: bool = False + + +reduction_functions = [ + ("sum", DataFrameSum, FunctionOptions()), + ("prod", DataFrameProd, FunctionOptions()), + ("min", DataFrameMin, FunctionOptions()), + ("max", DataFrameMax, FunctionOptions()), + ("count", DataFrameCount, FunctionOptions(has_skipna=False)), + ("mean", DataFrameMean, FunctionOptions()), + ("var", DataFrameVar, FunctionOptions()), + ("skew", DataFrameSkew, FunctionOptions()), + ("kurt", DataFrameKurtosis, FunctionOptions()), + ("sem", DataFrameSem, FunctionOptions()), + ("all", DataFrameAll, FunctionOptions(has_numeric_only=False, has_bool_only=True)), + ("any", DataFrameAny, FunctionOptions(has_numeric_only=False, has_bool_only=True)), +] + + +@pytest.mark.parametrize("func_name,op,func_opts", reduction_functions) +def test_series_reduction(func_name, op, func_opts: FunctionOptions): + data = pd.Series(range(20), index=[str(i) for i in range(20)]) + series = getattr(from_pandas_series(data, chunk_size=3), func_name)() + + assert isinstance(series, Tensor) + assert isinstance(series.op, op) + assert series.shape == () + + series = tile(series) + + assert len(series.chunks) == 1 + assert isinstance(series.chunks[0].op, DataFrameAggregate) + assert isinstance(series.chunks[0].inputs[0].op, DataFrameConcat) + assert len(series.chunks[0].inputs[0].inputs) == 2 + + data = pd.Series(np.random.rand(25), name="a") + if func_opts.has_skipna: + kwargs = dict(axis="index", skipna=False) + else: + kwargs = dict() + series = getattr(from_pandas_series(data, chunk_size=7), func_name)(**kwargs) + + assert isinstance(series, Tensor) + assert series.shape == () + + series = tile(series) + + assert len(series.chunks) == 1 + assert isinstance(series.chunks[0].op, DataFrameAggregate) + assert isinstance(series.chunks[0].inputs[0].op, DataFrameConcat) + assert len(series.chunks[0].inputs[0].inputs) == 4 + + +@pytest.mark.parametrize("func_name,op,func_opts", reduction_functions) +def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions): + data = pd.DataFrame( + {"a": list(range(20)), "b": list(range(20, 0, -1))}, + index=[str(i) for i in range(20)], + ) + reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)() + + assert isinstance(reduction_df, Series) + assert isinstance(reduction_df.op, op) + assert isinstance(reduction_df.index_value._index_value, IndexValue.Index) + assert reduction_df.shape == (2,) + + reduction_df = 
tile(reduction_df) + + assert len(reduction_df.chunks) == 1 + assert isinstance(reduction_df.chunks[0].op, DataFrameAggregate) + assert isinstance(reduction_df.chunks[0].inputs[0].op, DataFrameConcat) + assert len(reduction_df.chunks[0].inputs[0].inputs) == 2 + + data = pd.DataFrame(np.random.rand(20, 10)) + reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)() + + assert isinstance(reduction_df, Series) + assert isinstance( + reduction_df.index_value._index_value, + (IndexValue.RangeIndex, IndexValue.Int64Index), + ) + assert reduction_df.shape == (10,) + + reduction_df = tile(reduction_df) + + assert len(reduction_df.chunks) == 4 + assert reduction_df.nsplits == ((3, 3, 3, 1),) + assert isinstance(reduction_df.chunks[0].op, DataFrameAggregate) + assert isinstance(reduction_df.chunks[0].inputs[0].op, DataFrameConcat) + assert len(reduction_df.chunks[0].inputs[0].inputs) == 2 + + data = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)]) + reduction_df = getattr(from_pandas_df(data, chunk_size=4), func_name)( + axis="columns" + ) + + assert reduction_df.shape == (20,) + + reduction_df = tile(reduction_df) + + assert len(reduction_df.chunks) == 5 + assert reduction_df.nsplits == ((4,) * 5,) + assert isinstance(reduction_df.chunks[0].op, DataFrameAggregate) + assert isinstance(reduction_df.chunks[0].inputs[0].op, DataFrameConcat) + assert len(reduction_df.chunks[0].inputs[0].inputs) == 2 + + with pytest.raises(NotImplementedError): + getattr(from_pandas_df(data, chunk_size=3), func_name)(level=0, axis=1) + + +cum_reduction_functions = [ + ("cummin", DataFrameCummin, FunctionOptions()), + ("cummax", DataFrameCummax, FunctionOptions()), + ("cumprod", DataFrameCumprod, FunctionOptions()), + ("cumsum", DataFrameCumsum, FunctionOptions()), +] + + +@pytest.mark.parametrize("func_name,op,func_opts", cum_reduction_functions) +def test_cum_series_reduction(func_name, op, func_opts: FunctionOptions): + data = pd.Series({"a": list(range(20))}, index=[str(i) for i in range(20)]) + series = getattr(from_pandas_series(data, chunk_size=3), func_name)() + + assert isinstance(series, Series) + assert series.shape == (20,) + + series = tile(series) + + assert len(series.chunks) == 7 + assert isinstance(series.chunks[0].op, op) + assert series.chunks[0].op.stage == OperandStage.combine + assert isinstance(series.chunks[-1].inputs[-1].op, op) + assert series.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert len(series.chunks[-1].inputs) == 7 + + data = pd.Series(np.random.rand(25), name="a") + if func_opts.has_skipna: + kwargs = dict(axis="index", skipna=False) + else: + kwargs = dict() + series = getattr(from_pandas_series(data, chunk_size=7), func_name)(**kwargs) + + assert isinstance(series, Series) + assert series.shape == (25,) + + series = tile(series) + + assert len(series.chunks) == 4 + assert isinstance(series.chunks[0].op, op) + assert series.chunks[0].op.stage == OperandStage.combine + assert isinstance(series.chunks[-1].inputs[-1].op, op) + assert series.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert len(series.chunks[-1].inputs) == 4 + + +@pytest.mark.parametrize("func_name,op,func_opts", cum_reduction_functions) +def test_cum_dataframe_reduction(func_name, op, func_opts: FunctionOptions): + data = pd.DataFrame( + {"a": list(range(20)), "b": list(range(20, 0, -1))}, + index=[str(i) for i in range(20)], + ) + reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)() + + assert isinstance(reduction_df, DataFrame) + assert 
isinstance(reduction_df.index_value._index_value, IndexValue.Index) + assert reduction_df.shape == (20, 2) + + reduction_df = tile(reduction_df) + + assert len(reduction_df.chunks) == 7 + assert isinstance(reduction_df.chunks[0].op, op) + assert reduction_df.chunks[0].op.stage == OperandStage.combine + assert isinstance(reduction_df.chunks[-1].inputs[-1].op, op) + assert reduction_df.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert len(reduction_df.chunks[-1].inputs) == 7 + + data = pd.DataFrame(np.random.rand(20, 10)) + reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)() + + assert isinstance(reduction_df, DataFrame) + assert isinstance(reduction_df.index_value._index_value, IndexValue.RangeIndex) + assert reduction_df.shape == (20, 10) + + reduction_df = tile(reduction_df) + + assert len(reduction_df.chunks) == 28 + assert reduction_df.nsplits == ((3, 3, 3, 3, 3, 3, 2), (3, 3, 3, 1)) + assert reduction_df.chunks[0].op.stage == OperandStage.combine + assert isinstance(reduction_df.chunks[-1].inputs[-1].op, op) + assert reduction_df.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert len(reduction_df.chunks[-1].inputs) == 7 + + +def test_nunique(): + data = pd.DataFrame( + np.random.randint(0, 6, size=(20, 10)), + columns=["c" + str(i) for i in range(10)], + ) + df = from_pandas_df(data, chunk_size=3) + result = df.nunique() + + assert result.shape == (10,) + assert result.op.output_types[0] == OutputType.series + assert isinstance(result.op, DataFrameNunique) + + tiled = tile(result) + assert tiled.shape == (10,) + assert len(tiled.chunks) == 4 + assert tiled.nsplits == ((3, 3, 3, 1),) + assert tiled.chunks[0].op.stage == OperandStage.agg + assert isinstance(tiled.chunks[0].op, DataFrameAggregate) + + data2 = data.copy() + df2 = from_pandas_df(data2, chunk_size=3) + result2 = df2.nunique(axis=1) + + assert result2.shape == (20,) + assert result2.op.output_types[0] == OutputType.series + assert isinstance(result2.op, DataFrameNunique) + + tiled = tile(result2) + assert tiled.shape == (20,) + assert len(tiled.chunks) == 7 + assert tiled.nsplits == ((3, 3, 3, 3, 3, 3, 2),) + assert tiled.chunks[0].op.stage == OperandStage.agg + assert isinstance(tiled.chunks[0].op, DataFrameAggregate) + + +def test_dataframe_aggregate(): + data = pd.DataFrame(np.random.rand(20, 19)) + agg_funcs = [ + "sum", + "min", + "max", + "mean", + "var", + "std", + "all", + "any", + "skew", + "kurt", + "sem", + ] + + df = from_pandas_df(data) + result = tile(df.agg(agg_funcs)) + assert len(result.chunks) == 1 + assert result.shape == (len(agg_funcs), data.shape[1]) + assert list(result.columns_value.to_pandas()) == list(range(19)) + assert list(result.index_value.to_pandas()) == agg_funcs + assert result.op.output_types[0] == OutputType.dataframe + assert result.op.func == agg_funcs + + df = from_pandas_df(data, chunk_size=(3, 4)) + + result = tile(df.agg("sum")) + assert len(result.chunks) == 5 + assert result.shape == (data.shape[1],) + assert list(result.index_value.to_pandas()) == list(range(data.shape[1])) + assert result.op.output_types[0] == OutputType.series + assert result.op.func == ["sum"] + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (4,) + assert list(agg_chunk.index_value.to_pandas()) == list(range(4)) + assert agg_chunk.op.stage == OperandStage.agg + + result = tile(df.agg("sum", axis=1)) + assert len(result.chunks) == 7 + assert result.shape == (data.shape[0],) + assert list(result.index_value.to_pandas()) == list(range(data.shape[0])) + assert 
result.op.output_types[0] == OutputType.series + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (3,) + assert list(agg_chunk.index_value.to_pandas()) == list(range(3)) + assert agg_chunk.op.stage == OperandStage.agg + + result = tile(df.agg("var", axis=1)) + assert len(result.chunks) == 7 + assert result.shape == (data.shape[0],) + assert list(result.index_value.to_pandas()) == list(range(data.shape[0])) + assert result.op.output_types[0] == OutputType.series + assert result.op.func == ["var"] + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (3,) + assert list(agg_chunk.index_value.to_pandas()) == list(range(3)) + assert agg_chunk.op.stage == OperandStage.agg + + result = tile(df.agg(agg_funcs)) + assert len(result.chunks) == 5 + assert result.shape == (len(agg_funcs), data.shape[1]) + assert list(result.columns_value.to_pandas()) == list(range(data.shape[1])) + assert list(result.index_value.to_pandas()) == agg_funcs + assert result.op.output_types[0] == OutputType.dataframe + assert result.op.func == agg_funcs + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (len(agg_funcs), 4) + assert list(agg_chunk.columns_value.to_pandas()) == list(range(4)) + assert list(agg_chunk.index_value.to_pandas()) == agg_funcs + assert agg_chunk.op.stage == OperandStage.agg + + result = tile(df.agg(agg_funcs, axis=1)) + assert len(result.chunks) == 7 + assert result.shape == (data.shape[0], len(agg_funcs)) + assert list(result.columns_value.to_pandas()) == agg_funcs + assert list(result.index_value.to_pandas()) == list(range(data.shape[0])) + assert result.op.output_types[0] == OutputType.dataframe + assert result.op.func == agg_funcs + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (3, len(agg_funcs)) + assert list(agg_chunk.columns_value.to_pandas()) == agg_funcs + assert list(agg_chunk.index_value.to_pandas()) == list(range(3)) + assert agg_chunk.op.stage == OperandStage.agg + + dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std"]} + all_cols = set( + reduce( + operator.add, [[v] if isinstance(v, str) else v for v in dict_fun.values()] + ) + ) + result = tile(df.agg(dict_fun)) + assert len(result.chunks) == 2 + assert result.shape == (len(all_cols), len(dict_fun)) + assert set(result.columns_value.to_pandas()) == set(dict_fun.keys()) + assert set(result.index_value.to_pandas()) == all_cols + assert result.op.output_types[0] == OutputType.dataframe + assert result.op.func[0] == [dict_fun[0]] + assert result.op.func[2] == dict_fun[2] + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (len(all_cols), 2) + assert list(agg_chunk.columns_value.to_pandas()) == [0, 2] + assert set(agg_chunk.index_value.to_pandas()) == all_cols + assert agg_chunk.op.stage == OperandStage.agg + + with pytest.raises(TypeError): + df.agg(sum_0="sum", mean_0="mean") + with pytest.raises(NotImplementedError): + df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}, axis=1) + + +def test_series_aggregate(): + data = pd.Series(np.random.rand(20), index=[str(i) for i in range(20)], name="a") + agg_funcs = [ + "sum", + "min", + "max", + "mean", + "var", + "std", + "all", + "any", + "skew", + "kurt", + "sem", + ] + + series = from_pandas_series(data) + + result = tile(series.agg(agg_funcs)) + assert len(result.chunks) == 1 + assert result.shape == (len(agg_funcs),) + assert list(result.index_value.to_pandas()) == agg_funcs + assert result.op.output_types[0] == OutputType.series + assert result.op.func == agg_funcs + + series = from_pandas_series(data, chunk_size=3) + + 
result = tile(series.agg("sum")) + assert len(result.chunks) == 1 + assert result.shape == () + assert result.op.output_types[0] == OutputType.scalar + agg_chunk = result.chunks[0] + assert agg_chunk.shape == () + assert agg_chunk.op.stage == OperandStage.agg + + result = tile(series.agg(agg_funcs)) + assert len(result.chunks) == 1 + assert result.shape == (len(agg_funcs),) + assert list(result.index_value.to_pandas()) == agg_funcs + assert result.op.output_types[0] == OutputType.series + assert result.op.func == agg_funcs + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (len(agg_funcs),) + assert list(agg_chunk.index_value.to_pandas()) == agg_funcs + assert agg_chunk.op.stage == OperandStage.agg + + with pytest.raises(TypeError): + series.agg(sum_0=(0, "sum"), mean_0=(0, "mean")) + + +def test_compile_function(): + compiler = ReductionCompiler() + ms = md.Series([1, 2, 3]) + # no Mars objects inside closures + with pytest.raises(ValueError): + compiler.add_function(functools.partial(lambda x: (x + ms).sum()), ndim=2) + # function should return a Mars object + with pytest.raises(ValueError): + compiler.add_function(lambda x: x is not None, ndim=2) + # function should perform some sort of reduction in dimensionality + with pytest.raises(ValueError): + compiler.add_function(lambda x: x, ndim=2) + # function should only contain acceptable operands + with pytest.raises(ValueError): + compiler.add_function(lambda x: x.sort_values().max(), ndim=1) + with pytest.raises(ValueError): + compiler.add_function(lambda x: x.max().shift(1), ndim=2) + + # test agg for all data + for ndim in [1, 2]: + compiler = ReductionCompiler(store_source=True) + compiler.add_function(lambda x: (x**2).count() + 1, ndim=ndim) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 1 + assert "pow" in result.pre_funcs[0].func.__source__ + # check agg_funcs + assert len(result.agg_funcs) == 1 + assert result.agg_funcs[0].map_func_name == "count" + assert result.agg_funcs[0].agg_func_name == "sum" + # check post_funcs + assert len(result.post_funcs) == 1 + assert result.post_funcs[0].func_name == "" + assert "add" in result.post_funcs[0].func.__source__ + + compiler.add_function( + lambda x: -x.prod() ** 2 + (1 + (x**2).count()), ndim=ndim + ) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 2 + assert ( + "pow" in result.pre_funcs[0].func.__source__ + or "pow" in result.pre_funcs[1].func.__source__ + ) + assert ( + "pow" not in result.pre_funcs[0].func.__source__ + or "pow" not in result.pre_funcs[1].func.__source__ + ) + # check agg_funcs + assert len(result.agg_funcs) == 2 + assert set(result.agg_funcs[i].map_func_name for i in range(2)) == { + "count", + "prod", + } + assert set(result.agg_funcs[i].agg_func_name for i in range(2)) == { + "sum", + "prod", + } + # check post_funcs + assert len(result.post_funcs) == 2 + assert result.post_funcs[0].func_name == "" + assert "add" in result.post_funcs[0].func.__source__ + assert "add" in result.post_funcs[1].func.__source__ + + compiler = ReductionCompiler(store_source=True) + compiler.add_function( + lambda x: where_function(x.all(), x.count(), 0), ndim=ndim + ) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 1 + assert result.pre_funcs[0].input_key == result.pre_funcs[0].output_key + # check agg_funcs + assert len(result.agg_funcs) == 2 + assert set(result.agg_funcs[i].map_func_name for i in range(2)) == { + "all", + "count", + } + assert 
set(result.agg_funcs[i].agg_func_name for i in range(2)) == { + "sum", + "all", + } + # check post_funcs + assert len(result.post_funcs) == 1 + if ndim == 1: + assert "np.where" in result.post_funcs[0].func.__source__ + else: + assert "np.where" not in result.post_funcs[0].func.__source__ + assert ".where" in result.post_funcs[0].func.__source__ + + # check boolean expressions + compiler = ReductionCompiler(store_source=True) + compiler.add_function(lambda x: (x == "1").sum(), ndim=ndim) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 1 + assert "eq" in result.pre_funcs[0].func.__source__ + # check agg_funcs + assert len(result.agg_funcs) == 1 + assert result.agg_funcs[0].map_func_name == "sum" + assert result.agg_funcs[0].agg_func_name == "sum" + + # test agg for specific columns + compiler = ReductionCompiler(store_source=True) + compiler.add_function(lambda x: 1 + x.sum(), ndim=2, cols=["a", "b"]) + compiler.add_function(lambda x: -1 + x.sum(), ndim=2, cols=["b", "c"]) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 1 + assert set(result.pre_funcs[0].columns) == set("abc") + # check agg_funcs + assert len(result.agg_funcs) == 1 + assert result.agg_funcs[0].map_func_name == "sum" + assert result.agg_funcs[0].agg_func_name == "sum" + # check post_funcs + assert len(result.post_funcs) == 2 + assert set("".join(sorted(result.post_funcs[i].columns)) for i in range(2)) == { + "ab", + "bc", + } + + # test agg for multiple columns + compiler = ReductionCompiler(store_source=True) + compiler.add_function(lambda x: x.sum(), ndim=2, cols=["a"]) + compiler.add_function(lambda x: x.sum(), ndim=2, cols=["b"]) + compiler.add_function(lambda x: x.min(), ndim=2, cols=["c"]) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 1 + assert set(result.pre_funcs[0].columns) == set("abc") + # check agg_funcs + assert len(result.agg_funcs) == 2 + assert result.agg_funcs[0].map_func_name == "sum" + assert result.agg_funcs[0].agg_func_name == "sum" + # check post_funcs + assert len(result.post_funcs) == 2 + assert set(result.post_funcs[0].columns) == set("ab") + + +def test_custom_aggregation(): + class MockReduction1(CustomReduction): + def agg(self, v1): + return v1.sum() + + class MockReduction2(CustomReduction): + def pre(self, value): + return value + 1, value**2 + + def agg(self, v1, v2): + return v1.sum(), v2.prod() + + def post(self, v1, v2): + return v1 + v2 + + for ndim in [1, 2]: + compiler = ReductionCompiler() + compiler.add_function(MockReduction1(), ndim=ndim) + result = compiler.compile() + # check agg_funcs + assert len(result.agg_funcs) == 1 + assert result.agg_funcs[0].map_func_name == "custom_reduction" + assert result.agg_funcs[0].agg_func_name == "custom_reduction" + assert isinstance(result.agg_funcs[0].custom_reduction, MockReduction1) + assert result.agg_funcs[0].output_limit == 1 + + compiler = ReductionCompiler() + compiler.add_function(MockReduction2(), ndim=ndim) + result = compiler.compile() + # check agg_funcs + assert len(result.agg_funcs) == 1 + assert result.agg_funcs[0].map_func_name == "custom_reduction" + assert result.agg_funcs[0].agg_func_name == "custom_reduction" + assert isinstance(result.agg_funcs[0].custom_reduction, MockReduction2) + assert result.agg_funcs[0].output_limit == 2 diff --git a/python/xorbits/_mars/dataframe/reduction/tests/test_reduction_execution.py b/python/xorbits/_mars/dataframe/reduction/tests/test_reduction_execution.py new file mode 100644 index 
000000000..3cedf6106 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/tests/test_reduction_execution.py @@ -0,0 +1,1062 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import NamedTuple + +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from .... import dataframe as md +from ....config import option_context +from ....deploy.oscar.session import get_default_session +from ....tests.core import require_cudf, require_cupy +from ....utils import lazy_import, pd_release_version +from ... import CustomReduction, NamedAgg +from ...base import to_gpu + +pytestmark = pytest.mark.pd_compat + +cp = lazy_import("cupy", rename="cp") +_agg_size_as_series = pd_release_version >= (1, 3) +_support_kw_agg = pd_release_version >= (1, 1) + + +@pytest.fixture +def check_ref_counts(): + yield + + import gc + + gc.collect() + + sess = get_default_session() + assert len(sess._get_ref_counts()) == 0 + + +class FunctionOptions(NamedTuple): + has_min_count: bool = False + + +reduction_functions = [ + ("sum", FunctionOptions(has_min_count=True)), + ("prod", FunctionOptions(has_min_count=True)), + ("min", FunctionOptions()), + ("max", FunctionOptions()), + ("mean", FunctionOptions()), + ("var", FunctionOptions()), + ("std", FunctionOptions()), + ("sem", FunctionOptions()), + ("skew", FunctionOptions()), + ("kurt", FunctionOptions()), +] + + +@pytest.mark.parametrize("func_name,func_opts", reduction_functions) +def test_series_reduction( + setup, check_ref_counts, func_name, func_opts: FunctionOptions +): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + data = pd.Series( + rs.randint(0, 8, (10,)), index=[str(i) for i in range(10)], name="a" + ) + r = compute(md.Series(data)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=6)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=3)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=4), axis="index") + assert pytest.approx(compute(data, axis="index")) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=4), axis="index") + assert pytest.approx(compute(data, axis="index")) == r.execute().fetch() + + data = pd.Series(rs.rand(20), name="a") + data[0] = 0.1 # make sure not all elements are NAN + data[data > 0.5] = np.nan + r = compute(md.Series(data, chunk_size=3)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=3), skipna=False) + assert np.isnan(r.execute().fetch()) + + if func_opts.has_min_count: + r = compute(md.Series(data, chunk_size=3), skipna=False, min_count=2) + assert np.isnan(r.execute().fetch()) + + r = compute(md.Series(data, chunk_size=3), min_count=1) + assert pytest.approx(compute(data, min_count=1)) == 
r.execute().fetch() + + reduction_df5 = compute(md.Series(data, chunk_size=3), min_count=21) + assert np.isnan(reduction_df5.execute().fetch()) + + # test reduction on empty series + data = pd.Series([], dtype=float, name="a") + r = compute(md.Series(data)) + np.testing.assert_equal(r.execute().fetch(), compute(data)) + + +@pytest.mark.parametrize("func_name,func_opts", reduction_functions) +def test_series_level_reduction(setup, func_name, func_opts: FunctionOptions): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + idx = pd.MultiIndex.from_arrays( + [[str(i) for i in range(100)], rs.choice(["A", "B"], size=(100,))], + names=["a", "b"], + ) + data = pd.Series(rs.randint(0, 8, size=(100,)), index=idx) + + r = compute(md.Series(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + # test null + data = pd.Series(rs.rand(100), name="a", index=idx) + idx_df = idx.to_frame() + data[data > 0.5] = np.nan + data[int(idx_df[idx_df.b == "A"].iloc[0, 0])] = 0.1 + data[int(idx_df[idx_df.b == "B"].iloc[0, 0])] = 0.1 + + r = compute(md.Series(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + r = compute(md.Series(data, chunk_size=13), level=1, skipna=False, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1, skipna=False).sort_index(), + r.execute().fetch().sort_index(), + ) + + if func_opts.has_min_count: + r = compute(md.Series(data, chunk_size=13), min_count=1, level=1, method="tree") + pd.testing.assert_series_equal( + compute(data, min_count=1, level=1).sort_index(), + r.execute().fetch().sort_index(), + ) + + +@pytest.mark.parametrize("func_name,func_opts", reduction_functions) +def test_dataframe_reduction( + setup, check_ref_counts, func_name, func_opts: FunctionOptions +): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + data = pd.DataFrame(rs.rand(20, 10)) + r = compute(md.DataFrame(data)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=6), axis="index", numeric_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", numeric_only=True), r.execute().fetch() + ) + + r = compute(md.DataFrame(data, chunk_size=3), axis=1) + pd.testing.assert_series_equal(compute(data, axis=1), r.execute().fetch()) + + # test null + np_data = rs.rand(20, 10) + np_data[np_data > 0.6] = np.nan + data = pd.DataFrame(np_data) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_series_equal(compute(data, skipna=False), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_series_equal(compute(data, skipna=False), r.execute().fetch()) + + if func_opts.has_min_count: + r = compute(md.DataFrame(data, chunk_size=3), min_count=15) + pd.testing.assert_series_equal(compute(data, min_count=15), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), min_count=3) + pd.testing.assert_series_equal(compute(data, min_count=3), r.execute().fetch()) + + r = 
compute(md.DataFrame(data, chunk_size=3), axis=1, min_count=3) + pd.testing.assert_series_equal( + compute(data, axis=1, min_count=3), r.execute().fetch() + ) + + r = compute(md.DataFrame(data, chunk_size=3), axis=1, min_count=8) + pd.testing.assert_series_equal( + compute(data, axis=1, min_count=8), r.execute().fetch() + ) + + # test numeric_only + data = pd.DataFrame( + rs.rand(10, 10), + index=rs.randint(-100, 100, size=(10,)), + columns=[rs.bytes(10) for _ in range(10)], + ) + r = compute(md.DataFrame(data, chunk_size=2)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=6), axis="index", numeric_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", numeric_only=True), r.execute().fetch() + ) + + r = compute(md.DataFrame(data, chunk_size=3), axis="columns") + pd.testing.assert_series_equal(compute(data, axis="columns"), r.execute().fetch()) + + data_dict = dict((str(i), rs.rand(10)) for i in range(10)) + data_dict["string"] = pd.Series([str(i) for i in range(10)]).radd("O") + data_dict["bool"] = rs.choice([True, False], (10,)) + data = pd.DataFrame(data_dict) + r = compute(md.DataFrame(data, chunk_size=3), axis="index", numeric_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", numeric_only=True), r.execute().fetch() + ) + r = compute(md.DataFrame(data), axis="index", numeric_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", numeric_only=True), r.execute().fetch() + ) + + data1 = pd.DataFrame(rs.rand(10, 10), columns=[str(i) for i in range(10)]) + data2 = pd.DataFrame(rs.rand(10, 10), columns=[str(i) for i in range(10)]) + df = md.DataFrame(data1, chunk_size=5) + md.DataFrame(data2, chunk_size=6) + r = compute(df) + pd.testing.assert_series_equal( + compute(data1 + data2).sort_index(), r.execute().fetch().sort_index() + ) + + +@pytest.mark.parametrize("func_name,func_opts", reduction_functions) +def test_dataframe_level_reduction( + setup, check_ref_counts, func_name, func_opts: FunctionOptions +): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + idx = pd.MultiIndex.from_arrays( + [[str(i) for i in range(100)], rs.choice(["A", "B"], size=(100,))], + names=["a", "b"], + ) + data = pd.DataFrame(rs.rand(100, 10), index=idx) + + r = compute(md.DataFrame(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + r = compute( + md.DataFrame(data, chunk_size=13), level=1, numeric_only=True, method="tree" + ) + pd.testing.assert_frame_equal( + compute(data, numeric_only=True, level=1).sort_index(), + r.execute().fetch().sort_index(), + ) + + # test null + data = pd.DataFrame(rs.rand(100, 10), index=idx) + data[data > 0.6] = np.nan + + r = compute(md.DataFrame(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + r = compute(md.DataFrame(data, chunk_size=13), level=1, skipna=False, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1, skipna=False).sort_index(), + r.execute().fetch().sort_index(), + ) + + if func_opts.has_min_count: + r = compute( + md.DataFrame(data, chunk_size=13), level=1, min_count=10, method="tree" + ) + pd.testing.assert_frame_equal( + compute(data, level=1, min_count=10).sort_index(), + r.execute().fetch().sort_index(), + ) + + # behavior of 'skew', 'kurt' 
differs for cases with and without level + skip_funcs = ("skew", "kurt") + if pd_release_version <= (1, 2, 0): + # fails under pandas 1.2. see pandas-dev/pandas#38774 for more details + skip_funcs += ("sem",) + + if func_name not in skip_funcs: + data_dict = dict((str(i), rs.rand(100)) for i in range(10)) + data_dict["string"] = ["O" + str(i) for i in range(100)] + data_dict["bool"] = rs.choice([True, False], (100,)) + data = pd.DataFrame(data_dict, index=idx) + + r = compute( + md.DataFrame(data, chunk_size=13), level=1, numeric_only=True, method="tree" + ) + pd.testing.assert_frame_equal( + compute(data, level=1, numeric_only=True).sort_index(), + r.execute().fetch().sort_index(), + ) + + +@require_cudf +@require_cupy +def test_gpu_execution(setup_gpu, check_ref_counts): + df_raw = pd.DataFrame(np.random.rand(30, 3), columns=list("abc")) + df = to_gpu(md.DataFrame(df_raw, chunk_size=6)) + + r = df.sum() + res = r.execute().fetch() + pd.testing.assert_series_equal(res.to_pandas(), df_raw.sum()) + + r = df.kurt() + res = r.execute().fetch() + pd.testing.assert_series_equal(res.to_pandas(), df_raw.kurt()) + + r = df.agg(["sum", "var"]) + res = r.execute().fetch() + pd.testing.assert_frame_equal(res.to_pandas(), df_raw.agg(["sum", "var"])) + + s_raw = pd.Series(np.random.rand(30)) + s = to_gpu(md.Series(s_raw, chunk_size=6)) + + r = s.sum() + res = r.execute().fetch() + assert pytest.approx(res) == s_raw.sum() + + r = s.kurt() + res = r.execute().fetch() + assert pytest.approx(res) == s_raw.kurt() + + r = s.agg(["sum", "var"]) + res = r.execute().fetch() + pd.testing.assert_series_equal(res.to_pandas(), s_raw.agg(["sum", "var"])) + + s_raw = pd.Series( + np.random.randint(0, 3, size=(30,)) * np.random.randint(0, 5, size=(30,)) + ) + s = to_gpu(md.Series(s_raw, chunk_size=6)) + + r = s.unique() + res = r.execute().fetch() + np.testing.assert_array_equal(cp.asnumpy(res).sort(), s_raw.unique().sort()) + + +bool_reduction_functions = ["all", "any"] + + +@pytest.mark.parametrize("func_name", bool_reduction_functions) +def test_series_bool_reduction(setup, check_ref_counts, func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + data = pd.Series(rs.rand(10) > 0.5, index=[str(i) for i in range(10)], name="a") + r = compute(md.Series(data)) + assert compute(data) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=6)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=3)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=4), axis="index") + assert pytest.approx(compute(data, axis="index")) == r.execute().fetch() + + # test null + data = pd.Series(rs.rand(20), name="a") + data[0] = 0.1 # make sure not all elements are NAN + data[data > 0.5] = np.nan + r = compute(md.Series(data, chunk_size=3)) + assert compute(data) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=3), skipna=False) + assert r.execute().fetch() is True + + +@pytest.mark.parametrize("func_name", bool_reduction_functions) +def test_series_bool_level_reduction(setup, check_ref_counts, func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + idx = pd.MultiIndex.from_arrays( + [[str(i) for i in range(100)], rs.choice(["A", "B"], size=(100,))], + names=["a", "b"], + ) + data = pd.Series(rs.randint(0, 8, size=(100,)), index=idx) + + r = compute(md.Series(data, chunk_size=13), 
level=1, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + # test null + data = pd.Series(rs.rand(100), name="a", index=idx) + idx_df = idx.to_frame() + data[data > 0.5] = np.nan + data[int(idx_df[idx_df.b == "A"].iloc[0, 0])] = 0.1 + data[int(idx_df[idx_df.b == "B"].iloc[0, 0])] = 0.1 + + r = compute(md.Series(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + r = compute(md.Series(data, chunk_size=13), level=1, skipna=False, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1, skipna=False).sort_index(), + r.execute().fetch().sort_index(), + ) + + +@pytest.mark.parametrize("func_name", bool_reduction_functions) +def test_dataframe_bool_reduction(setup, check_ref_counts, func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + data = pd.DataFrame(rs.rand(20, 10)) + data.iloc[:, :5] = data.iloc[:, :5] > 0.5 + r = compute(md.DataFrame(data)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute( + md.DataFrame(data, chunk_size=6), axis="index", bool_only=True, method="tree" + ) + pd.testing.assert_series_equal( + compute(data, axis="index", bool_only=True), + r.execute(extra_config={"check_all": False}).fetch(), + ) + + r = compute(md.DataFrame(data, chunk_size=3), axis=1) + pd.testing.assert_series_equal(compute(data, axis=1), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), axis=None) + assert compute(data, axis=None) == r.execute().fetch() + + # test null + np_data = rs.rand(20, 10) + np_data[np_data > 0.6] = np.nan + data = pd.DataFrame(np_data) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_series_equal(compute(data, skipna=False), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_series_equal(compute(data, skipna=False), r.execute().fetch()) + + # test bool_only + data = pd.DataFrame( + rs.rand(10, 10), + index=rs.randint(-100, 100, size=(10,)), + columns=[rs.bytes(10) for _ in range(10)], + ) + data.iloc[:, :5] = data.iloc[:, :5] > 0.5 + data.iloc[:5, 5:] = data.iloc[:5, 5:] > 0.5 + r = compute(md.DataFrame(data, chunk_size=2)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=6), axis="index", bool_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", bool_only=True), r.execute().fetch() + ) + + r = compute(md.DataFrame(data, chunk_size=3), axis="columns") + pd.testing.assert_series_equal(compute(data, axis="columns"), r.execute().fetch()) + + data_dict = dict((str(i), rs.rand(10)) for i in range(10)) + data_dict["string"] = [str(i) for i in range(10)] + data_dict["bool"] = rs.choice([True, False], (10,)) + data = pd.DataFrame(data_dict) + r = compute(md.DataFrame(data, chunk_size=3), axis="index", bool_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", bool_only=True), r.execute().fetch() + ) + + +@pytest.mark.parametrize("func_name", bool_reduction_functions) +def test_dataframe_bool_level_reduction(setup, check_ref_counts, 
func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + idx = pd.MultiIndex.from_arrays( + [[str(i) for i in range(100)], rs.choice(["A", "B"], size=(100,))], + names=["a", "b"], + ) + data = pd.DataFrame(rs.rand(100, 10), index=idx) + data.iloc[:, :5] = data.iloc[:, :5] > 0.5 + + r = compute(md.DataFrame(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + # test null + data = pd.DataFrame(rs.rand(100, 10), index=idx) + data[data > 0.6] = np.nan + + r = compute(md.DataFrame(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + r = compute(md.DataFrame(data, chunk_size=13), level=1, skipna=False, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1, skipna=False).sort_index(), + r.execute().fetch().sort_index(), + ) + + # test bool_only + # bool_only not supported when level specified + + +def test_series_count(setup, check_ref_counts): + array = np.random.rand(10) + array[[2, 7, 9]] = np.nan + data = pd.Series(array) + series = md.Series(data) + + result = series.count().execute().fetch() + expected = data.count() + assert result == expected + + series2 = md.Series(data, chunk_size=1) + + result = series2.count().execute().fetch() + expected = data.count() + assert result == expected + + series2 = md.Series(data, chunk_size=3) + + result = series2.count().execute().fetch() + expected = data.count() + assert result == expected + + +def test_dataframe_count(setup, check_ref_counts): + data = pd.DataFrame( + { + "Person": ["John", "Myla", "Lewis", "John", "Myla"], + "Age": [24.0, np.nan, 21.0, 33, 26], + "Single": [False, True, True, True, False], + } + ) + df = md.DataFrame(data) + + result = df.count().execute().fetch() + expected = data.count() + pd.testing.assert_series_equal(result, expected) + + result = df.count(axis="columns").execute().fetch() + expected = data.count(axis="columns") + pd.testing.assert_series_equal(result, expected) + + df2 = md.DataFrame(data, chunk_size=2) + + result = df2.count().execute().fetch() + expected = data.count() + pd.testing.assert_series_equal(result, expected) + + result = df2.count(axis="columns").execute().fetch() + expected = data.count(axis="columns") + pd.testing.assert_series_equal(result, expected) + + df3 = md.DataFrame(data, chunk_size=3) + + result = df3.count(numeric_only=True).execute().fetch() + expected = data.count(numeric_only=True) + pd.testing.assert_series_equal(result, expected) + + result = df3.count(axis="columns", numeric_only=True).execute().fetch() + expected = data.count(axis="columns", numeric_only=True) + pd.testing.assert_series_equal(result, expected) + + +def test_nunique(setup, check_ref_counts): + data1 = pd.Series(np.random.randint(0, 5, size=(20,))) + + series = md.Series(data1) + result = series.nunique().execute().fetch() + expected = data1.nunique() + assert result == expected + + series = md.Series(data1, chunk_size=6) + result = series.nunique().execute().fetch() + expected = data1.nunique() + assert result == expected + + # test dropna + data2 = data1.copy() + data2[[2, 9, 18]] = np.nan + + series = md.Series(data2) + result = series.nunique().execute().fetch() + expected = data2.nunique() + assert result == expected + + series = md.Series(data2, chunk_size=3) + result = series.nunique(dropna=False).execute().fetch() + 
expected = data2.nunique(dropna=False) + assert result == expected + + # test dataframe + data1 = pd.DataFrame( + np.random.randint(0, 6, size=(20, 20)), + columns=["c" + str(i) for i in range(20)], + ) + df = md.DataFrame(data1) + result = df.nunique().execute().fetch() + expected = data1.nunique() + pd.testing.assert_series_equal(result, expected) + + df = md.DataFrame(data1, chunk_size=6) + result = df.nunique().execute().fetch() + expected = data1.nunique() + pd.testing.assert_series_equal(result, expected) + + df = md.DataFrame(data1) + result = df.nunique(axis=1).execute().fetch() + expected = data1.nunique(axis=1) + pd.testing.assert_series_equal(result, expected) + + df = md.DataFrame(data1, chunk_size=3) + result = df.nunique(axis=1).execute().fetch() + expected = data1.nunique(axis=1) + pd.testing.assert_series_equal(result, expected) + + # test dropna + data2 = data1.copy() + data2.iloc[[2, 9, 18], [2, 9, 18]] = np.nan + + df = md.DataFrame(data2) + result = df.nunique().execute().fetch() + expected = data2.nunique() + pd.testing.assert_series_equal(result, expected) + + df = md.DataFrame(data2, chunk_size=3) + result = df.nunique(dropna=False).execute().fetch() + expected = data2.nunique(dropna=False) + pd.testing.assert_series_equal(result, expected) + + df = md.DataFrame(data1, chunk_size=3) + result = df.nunique(axis=1).execute().fetch() + expected = data1.nunique(axis=1) + pd.testing.assert_series_equal(result, expected) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_use_arrow_dtype_nunique(setup, check_ref_counts): + with option_context({"dataframe.use_arrow_dtype": True, "combine_size": 2}): + rs = np.random.RandomState(0) + data1 = pd.DataFrame( + {"a": rs.random(10), "b": [f"s{i}" for i in rs.randint(100, size=10)]} + ) + data1["c"] = data1["b"].copy() + data1["d"] = data1["b"].copy() + data1["e"] = data1["b"].copy() + + df = md.DataFrame(data1, chunk_size=(3, 2)) + r = df.nunique(axis=0) + result = r.execute().fetch() + expected = data1.nunique(axis=0) + pd.testing.assert_series_equal(result, expected) + + r = df.nunique(axis=1) + result = r.execute().fetch() + expected = data1.nunique(axis=1) + pd.testing.assert_series_equal(result, expected) + + +def test_unique(setup, check_ref_counts): + data1 = pd.Series(np.random.randint(0, 5, size=(20,))) + + series = md.Series(data1) + result = series.unique().execute().fetch() + expected = data1.unique() + np.testing.assert_array_equal(result, expected) + + series = md.Series(data1, chunk_size=6) + result = series.unique().execute().fetch() + expected = data1.unique() + np.testing.assert_array_equal(result, expected) + + data2 = pd.Series( + [pd.Timestamp("20200101", tz="US/Eastern")] * 5 + + [pd.Timestamp("20200202")] + + [pd.Timestamp("20020101")] * 9 + ) + series = md.Series(data2) + result = series.unique().execute().fetch() + expected = data2.unique() + np.testing.assert_array_equal(result, expected) + + series = md.Series(data2, chunk_size=6) + result = series.unique().execute().fetch() + expected = data2.unique() + np.testing.assert_array_equal(result, expected) + + # test md.unique + result = md.unique(data2).execute().fetch() + expected = pd.unique(data2) + np.testing.assert_array_equal(result, expected) + + raw_list = list("baabc") + result = md.unique(raw_list).execute().fetch() + expected = pd.unique(raw_list) + np.testing.assert_array_equal(result, expected) + + data1 = pd.Series(np.random.randint(0, 5, size=(20,))) + result = md.unique(data1).execute().fetch() + expected = 
pd.unique(data1) + np.testing.assert_array_equal(result, expected) + + +def test_index_reduction(setup, check_ref_counts): + rs = np.random.RandomState(0) + data = pd.Index(rs.randint(0, 5, (100,))) + data2 = pd.Index(rs.randint(1, 6, (100,))) + + for method in ["min", "max", "all", "any"]: + idx = md.Index(data) + result = getattr(idx, method)().execute().fetch() + assert result == getattr(data, method)() + + idx = md.Index(data, chunk_size=10) + result = getattr(idx, method)().execute().fetch() + assert result == getattr(data, method)() + + idx = md.Index(data2) + result = getattr(idx, method)().execute().fetch() + assert result == getattr(data2, method)() + + idx = md.Index(data2, chunk_size=10) + result = getattr(idx, method)().execute().fetch() + assert result == getattr(data2, method)() + + +cum_reduction_functions = ["cummax", "cummin", "cumprod", "cumsum"] + + +@pytest.mark.parametrize("func_name", cum_reduction_functions) +def test_series_cum_reduction(setup, check_ref_counts, func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + data = pd.Series(np.random.rand(20), index=[str(i) for i in range(20)], name="a") + r = compute(md.Series(data)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.Series(data, chunk_size=6)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.Series(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.Series(data, chunk_size=4), axis="index") + pd.testing.assert_series_equal(compute(data, axis="index"), r.execute().fetch()) + + data = pd.Series(np.random.rand(20), name="a") + data[0] = 0.1 # make sure not all elements are NAN + data[data > 0.5] = np.nan + r = compute(md.Series(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.Series(data, chunk_size=3), skipna=False) + pd.testing.assert_series_equal(compute(data, skipna=False), r.execute().fetch()) + + +@pytest.mark.parametrize("func_name", cum_reduction_functions) +def test_dataframe_cum_reduction(setup, check_ref_counts, func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + data = pd.DataFrame(np.random.rand(20, 10)) + r = compute(md.DataFrame(data)) + pd.testing.assert_frame_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_frame_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), axis=1) + pd.testing.assert_frame_equal(compute(data, axis=1), r.execute().fetch()) + + # test null + np_data = np.random.rand(20, 10) + np_data[np_data > 0.6] = np.nan + data = pd.DataFrame(np_data) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_frame_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_frame_equal(compute(data, skipna=False), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_frame_equal(compute(data, skipna=False), r.execute().fetch()) + + # test numeric_only + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + r = compute(md.DataFrame(data, chunk_size=2)) + pd.testing.assert_frame_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), axis="columns") + 
pd.testing.assert_frame_equal(compute(data, axis="columns"), r.execute().fetch()) + + +def test_dataframe_aggregate(setup, check_ref_counts): + all_aggs = [ + "sum", + "prod", + "min", + "max", + "count", + "size", + "mean", + "var", + "std", + "sem", + "skew", + "kurt", + ] + data = pd.DataFrame(np.random.rand(20, 20)) + + df = md.DataFrame(data) + result = df.agg(all_aggs) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg(all_aggs)) + + result = df.agg("size") + if _agg_size_as_series: + pd.testing.assert_series_equal(result.execute().fetch(), data.agg("size")) + else: + assert result.execute().fetch() == data.agg("size") + + for func in (a for a in all_aggs if a != "size"): + result = df.agg(func) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(func)) + + result = df.agg(func, axis=1) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(func, axis=1)) + + df = md.DataFrame(data, chunk_size=3) + + # will redirect to transform + result = df.agg(["cumsum", "cummax"]) + pd.testing.assert_frame_equal( + result.execute().fetch(), data.agg(["cumsum", "cummax"]) + ) + + result = df.agg("size") + if _agg_size_as_series: + pd.testing.assert_series_equal(result.execute().fetch(), data.agg("size")) + else: + assert result.execute().fetch() == data.agg("size") + + for func in (a for a in all_aggs if a != "size"): + result = df.agg(func) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(func)) + + result = df.agg(func, axis=1) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(func, axis=1)) + + result = df.agg(["sum"]) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg(["sum"])) + + result = df.agg([sum]) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg([sum])) + + result = df.agg(all_aggs) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg(all_aggs)) + + result = df.agg(all_aggs, axis=1) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg(all_aggs, axis=1)) + + result = df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}) + pd.testing.assert_frame_equal( + result.execute().fetch(), + data.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}), + ) + + result = df.agg({0: [sum, min, max]}) + pd.testing.assert_frame_equal( + result.execute().fetch(), + data.agg({0: [sum, min, max]}), + ) + + if _support_kw_agg: + agg_kw = dict( + sum_0=NamedAgg(0, "sum"), + min_0=NamedAgg(0, "min"), + mean_9=NamedAgg(9, "mean"), + ) + result = df.agg(**agg_kw) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg(**agg_kw)) + + +def test_series_aggregate(setup, check_ref_counts): + all_aggs = [ + "sum", + "prod", + "min", + "max", + "count", + "size", + "mean", + "var", + "std", + "sem", + "skew", + "kurt", + ] + data = pd.Series(np.random.rand(20), index=[str(i) for i in range(20)], name="a") + series = md.Series(data) + + result = series.agg(all_aggs) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(all_aggs)) + + for func in all_aggs: + result = series.agg(func) + assert pytest.approx(result.execute().fetch()) == data.agg(func) + + series = md.Series(data, chunk_size=3) + + for func in all_aggs: + result = series.agg(func) + assert pytest.approx(result.execute().fetch()) == data.agg(func) + + result = series.agg(all_aggs) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(all_aggs)) + + result = series.agg({"col_sum": "sum", "col_count": "count"}) + pd.testing.assert_series_equal( + 
result.execute().fetch(), data.agg({"col_sum": "sum", "col_count": "count"}) + ) + + result = series.agg({"col_sum": sum, "col_count": "count"}) + pd.testing.assert_series_equal( + result.execute().fetch(), data.agg({"col_sum": sum, "col_count": "count"}) + ) + + if _support_kw_agg: + result = series.agg(col_var="var", col_skew="skew") + pd.testing.assert_series_equal( + result.execute().fetch(), data.agg(col_var="var", col_skew="skew") + ) + + +def test_aggregate_str_cat(setup, check_ref_counts): + agg_fun = lambda x: x.str.cat(sep="_", na_rep="NA") + + rs = np.random.RandomState(0) + raw_df = pd.DataFrame( + { + "a": rs.choice(["A", "B", "C"], size=(100,)), + "b": rs.choice([None, "alfa", "bravo", "charlie"], size=(100,)), + } + ) + + mdf = md.DataFrame(raw_df, chunk_size=13) + + r = mdf.agg(agg_fun) + pd.testing.assert_series_equal(r.execute().fetch(), raw_df.agg(agg_fun)) + + raw_series = pd.Series(rs.choice([None, "alfa", "bravo", "charlie"], size=(100,))) + + ms = md.Series(raw_series, chunk_size=13) + + r = ms.agg(agg_fun) + assert r.execute().fetch() == raw_series.agg(agg_fun) + + +class MockReduction1(CustomReduction): + def agg(self, v1): + return v1.sum() + + +class MockReduction2(CustomReduction): + def pre(self, value): + return value + 1, value**2 + + def agg(self, v1, v2): + return v1.sum(), v2.prod() + + def post(self, v1, v2): + return v1 + v2 + + +def test_custom_dataframe_aggregate(setup, check_ref_counts): + rs = np.random.RandomState(0) + data = pd.DataFrame(rs.rand(30, 20)) + + df = md.DataFrame(data) + result = df.agg(MockReduction1()) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(MockReduction1())) + + result = df.agg(MockReduction2()) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(MockReduction2())) + + df = md.DataFrame(data, chunk_size=5) + result = df.agg(MockReduction2()) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(MockReduction2())) + + result = df.agg(MockReduction2()) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(MockReduction2())) + + +def test_custom_series_aggregate(setup, check_ref_counts): + rs = np.random.RandomState(0) + data = pd.Series(rs.rand(20)) + + s = md.Series(data) + result = s.agg(MockReduction1()) + assert result.execute().fetch() == data.agg(MockReduction1()) + + result = s.agg(MockReduction2()) + assert result.execute().fetch() == data.agg(MockReduction2()) + + s = md.Series(data, chunk_size=5) + result = s.agg(MockReduction2()) + assert pytest.approx(result.execute().fetch()) == data.agg(MockReduction2()) + + result = s.agg(MockReduction2()) + assert pytest.approx(result.execute().fetch()) == data.agg(MockReduction2()) diff --git a/python/xorbits/_mars/dataframe/reduction/unique.py b/python/xorbits/_mars/dataframe/reduction/unique.py new file mode 100644 index 000000000..790862ffc --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/unique.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
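+# Implementation note: ``md.unique`` is built on the reduction framework below.
+# Each chunk first reduces to its own distinct values (``UniqueReduction.agg``),
+# the per-chunk results are combined tree-wise, and ``UniqueReduction.post``
+# applies a final ``unique()`` to the concatenated partials, conceptually:
+#     pd.concat([pd.Series(c.unique()) for c in chunks]).unique()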
+ + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, OutputType +from ...tensor.core import TensorOrder +from ...utils import lazy_import +from ..initializer import Series as asseries +from .core import CustomReduction, DataFrameReductionMixin, DataFrameReductionOperand + +cudf = lazy_import("cudf") + + +class UniqueReduction(CustomReduction): + def agg(self, data): # noqa: W0221 # pylint: disable=arguments-differ + xdf = cudf if self.is_gpu() else pd + # convert to series data + return xdf.Series(data.unique()) + + def post(self, data): # noqa: W0221 # pylint: disable=arguments-differ + return data.unique() + + +class DataFrameUnique(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.UNIQUE + _func_name = "unique" + + @classmethod + def get_reduction_callable(cls, op): + return UniqueReduction(name=cls._func_name, is_gpu=op.is_gpu()) + + @classmethod + def tile(cls, op): + if op.method == "tree": + return (yield from super().tile(op)) + else: + raise NotImplementedError(f"Method {op.method} hasn't been supported") + + def __call__(self, a): + if not isinstance(a, ENTITY_TYPE): + a = asseries(a) + self.output_types = [OutputType.tensor] + return self.new_tileables( + [a], shape=(np.nan,), dtype=a.dtype, order=TensorOrder.C_ORDER + )[0] + + +def unique(values, method="tree"): + """ + Uniques are returned in order of appearance. This does NOT sort. + + Parameters + ---------- + values : 1d array-like + method : 'shuffle' or 'tree', 'tree' method provide a better performance, 'shuffle' + is recommended if the number of unique values is very large. + See Also + -------- + Index.unique + Series.unique + + Examples + -------- + >>> import mars.dataframe as md + >>> import pandas as pd + >>> md.unique(md.Series([2, 1, 3, 3])).execute() + array([2, 1, 3]) + + >>> md.unique(md.Series([2] + [1] * 5)).execute() + array([2, 1]) + + >>> md.unique(md.Series([pd.Timestamp('20160101'), + ... pd.Timestamp('20160101')])).execute() + array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + + >>> md.unique(md.Series([pd.Timestamp('20160101', tz='US/Eastern'), + ... pd.Timestamp('20160101', tz='US/Eastern')])).execute() + array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], + dtype=object) + """ + op = DataFrameUnique(method=method) + return op(values) diff --git a/python/xorbits/_mars/dataframe/reduction/var.py b/python/xorbits/_mars/dataframe/reduction/var.py new file mode 100644 index 000000000..35cc28ef5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/var.py @@ -0,0 +1,89 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
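+# Implementation note: variance is assembled from chunk-level aggregates rather
+# than computed directly.  With ``n = x.count()``, the callable returned by
+# ``get_reduction_callable`` evaluates
+#     var(x) = ((x**2).sum() - x.sum()**2 / n) / (n - ddof)
+# (and the mean-of-squares form ``(x**2).mean() - x.mean()**2`` when
+# ``ddof == 0``), which is algebraically equal to the usual
+# ``sum((x - x.mean())**2) / (n - ddof)`` definition used by pandas.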
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import Int32Field +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameVar(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.VAR + _func_name = "var" + + _ddof = Int32Field("ddof") + + def __init__(self, ddof=None, **kw): + super().__init__(_ddof=ddof, **kw) + + @property + def ddof(self): + return self._ddof + + @classmethod + def get_reduction_callable(cls, op): + skipna, ddof = op.skipna, op.ddof + + def var(x): + cnt = x.count() + if ddof == 0: + return (x**2).mean(skipna=skipna) - (x.mean(skipna=skipna)) ** 2 + return ((x**2).sum(skipna=skipna) - x.sum(skipna=skipna) ** 2 / cnt) / ( + cnt - ddof + ) + + return var + + +def var_series( + series, axis=None, skipna=True, level=None, ddof=1, combine_size=None, method=None +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameVar( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(series) + + +def var_dataframe( + df, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameVar( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/sort/__init__.py b/python/xorbits/_mars/dataframe/sort/__init__.py new file mode 100644 index 000000000..cc92657fa --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .sort_index import DataFrameSortIndex +from .sort_values import DataFrameSortValues + + +def _install(): + from ..core import DATAFRAME_TYPE, SERIES_TYPE + from .sort_index import sort_index + from .sort_values import dataframe_sort_values, series_sort_values + + for cls in DATAFRAME_TYPE: + setattr(cls, "sort_values", dataframe_sort_values) + setattr(cls, "sort_index", sort_index) + + for cls in SERIES_TYPE: + setattr(cls, "sort_values", series_sort_values) + setattr(cls, "sort_index", sort_index) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/sort/core.py b/python/xorbits/_mars/dataframe/sort/core.py new file mode 100644 index 000000000..a1f2aa879 --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/core.py @@ -0,0 +1,121 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ...config import options +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + Int32Field, + Int64Field, + ListField, + StringField, +) +from ...utils import ceildiv +from ..operands import DataFrameOperand +from ..utils import parse_index + + +class DataFrameSortOperand(DataFrameOperand): + axis = Int32Field("axis") + ascending = AnyField("ascending") + inplace = BoolField("inplace") + kind = StringField("kind") + na_position = StringField("na_position") + ignore_index = BoolField("ignore_index") + parallel_kind = StringField("parallel_kind") + psrs_kinds = ListField("psrs_kinds", FieldTypes.string) + nrows = Int64Field("nrows", default=None) + + @classmethod + def _tile_head(cls, op: "DataFrameSortOperand"): + from ..merge import DataFrameConcat + + inp = op.inputs[0] + out = op.outputs[0] + axis = op.axis + assert axis == 0 + pd_index = out.index_value.to_pandas() + combine_size = options.combine_size + + if inp.ndim == 2: + if inp.chunk_shape[1 - axis] > 1: # pragma: no cover + if any(pd.isna(s) for s in inp.nsplits[1 - axis]): + yield + inp = yield from recursive_tile( + inp.rechunk({1 - axis: inp.shape[1 - axis]}) + ) + + out_chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + chunk_params = c.params + chunk_params["index_value"] = parse_index(pd_index, c) + out_chunks.append(chunk_op.new_chunk([c], kws=[chunk_params])) + + while True: + chunk_size = ceildiv(len(out_chunks), combine_size) + combine_chunks = [] + for i in range(chunk_size): + chunk_index = (i,) if inp.ndim == 1 else (i, 0) + + to_combine_chunks = out_chunks[ + i * combine_size : (i + 1) * combine_size + ] + concat_params = to_combine_chunks[0].params + concat_params["index"] = chunk_index + shape = list(to_combine_chunks[0].shape) + shape[0] = sum(c.shape[0] for c in to_combine_chunks) + shape = tuple(shape) + concat_params["shape"] = shape + if len(to_combine_chunks) == 1: + c = to_combine_chunks[0].copy() + c._index = chunk_index + else: + c = DataFrameConcat( + axis=axis, output_types=op.output_types + ).new_chunk(to_combine_chunks, kws=[concat_params]) + chunk_op = op.copy().reset_key() + chunk_op.stage = ( + OperandStage.combine if chunk_size > 1 else OperandStage.agg + ) + chunk_params = c.params + chunk_params["index_value"] = parse_index(pd_index, c) + chunk_params["shape"] = (min(shape[0], op.nrows),) + shape[1:] + combine_chunks.append(chunk_op.new_chunk([c], kws=[chunk_params])) + out_chunks = combine_chunks + if chunk_size == 1: + break + + new_op = op.copy() + params = out.params + params["nsplits"] = tuple((s,) for s in out.shape) + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _tile(cls, op): # pragma: no cover + raise NotImplementedError + + @classmethod + def tile(cls, op: "DataFrameSortOperand"): + if op.nrows is not None: + return (yield from cls._tile_head(op)) + else: + return (yield from cls._tile(op)) diff --git 
a/python/xorbits/_mars/dataframe/sort/psrs.py b/python/xorbits/_mars/dataframe/sort/psrs.py new file mode 100644 index 000000000..b91c4762c --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/psrs.py @@ -0,0 +1,729 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core.operand import MapReduceOperand, OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + Int32Field, + ListField, + StringField, +) +from ...tensor.base.psrs import PSRSOperandMixin +from ...utils import calc_nsplits, lazy_import +from ..core import IndexValue, OutputType +from ..operands import DataFrameOperand, DataFrameOperandMixin, DataFrameShuffleProxy +from ..utils import is_cudf, parse_index, standardize_range_index + +cudf = lazy_import("cudf") + +_PSRS_DISTINCT_COL = "__PSRS_TMP_DISTINCT_COL" + + +class _Largest: + """ + This util class resolve TypeError when + comparing strings with None values + """ + + def __lt__(self, other): + return False + + def __gt__(self, other): + return self is not other + + +_largest = _Largest() + + +class _ReversedValue: + def __init__(self, value): + self._value = value + + def __lt__(self, other): + if type(other) is _ReversedValue: + # may happen when call searchsorted + return self._value >= other._value + return self._value >= other + + def __gt__(self, other): + return self._value <= other + + def __repr__(self): + return repr(self._value) + + +class DataFramePSRSOperandMixin(DataFrameOperandMixin, PSRSOperandMixin): + @classmethod + def _collect_op_properties(cls, op): + from .sort_values import DataFrameSortValues + + if isinstance(op, DataFrameSortValues): + properties = dict( + sort_type="sort_values", + axis=op.axis, + by=op.by, + ascending=op.ascending, + inplace=op.inplace, + na_position=op.na_position, + gpu=op.is_gpu(), + ) + else: + properties = dict( + sort_type="sort_index", + axis=op.axis, + level=op.level, + ascending=op.ascending, + inplace=op.inplace, + na_position=op.na_position, + sort_remaining=op.sort_remaining, + gpu=op.is_gpu(), + ) + return properties + + @classmethod + def local_sort_and_regular_sample( + cls, op, in_data, axis_chunk_shape, axis_offsets, out_idx + ): + # stage 1: local sort and regular samples collected + sorted_chunks, indices_chunks, sampled_chunks = [], [], [] + for i in range(axis_chunk_shape): + in_chunk = in_data.chunks[i] + kind = None if op.psrs_kinds is None else op.psrs_kinds[0] + chunk_op = DataFramePSRSSortRegularSample( + kind=kind, + n_partition=axis_chunk_shape, + output_types=op.output_types, + **cls._collect_op_properties(op) + ) + kws = [] + sort_shape = in_chunk.shape + kws.append( + { + "shape": sort_shape, + "index_value": in_chunk.index_value, + "index": in_chunk.index, + } + ) + if chunk_op.sort_type == "sort_values": + sampled_shape = ( + (axis_chunk_shape, len(op.by)) if op.by else (axis_chunk_shape,) + ) + else: + sampled_shape = ( + 
(axis_chunk_shape, sort_shape[1]) + if len(sort_shape) == 2 + else (axis_chunk_shape,) + ) + kws.append( + { + "shape": sampled_shape, + "index_value": in_chunk.index_value, + "index": (i,), + "type": "regular_sampled", + } + ) + if op.outputs[0].ndim == 2: + kws[0].update( + {"columns_value": in_chunk.columns_value, "dtypes": in_chunk.dtypes} + ) + kws[1].update( + {"columns_value": in_chunk.columns_value, "dtypes": in_chunk.dtypes} + ) + else: + kws[0].update(({"dtype": in_chunk.dtype, "name": in_chunk.name})) + kws[1].update({"dtype": in_chunk.dtype}) + + chunks = chunk_op.new_chunks([in_chunk], kws=kws, output_limit=len(kws)) + sort_chunk, sampled_chunk = chunks + sorted_chunks.append(sort_chunk) + sampled_chunks.append(sampled_chunk) + return sorted_chunks, indices_chunks, sampled_chunks + + @classmethod + def concat_and_pivot( + cls, op, axis_chunk_shape, out_idx, sorted_chunks, sampled_chunks + ): + from .sort_values import DataFrameSortValues + + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + kind = None if op.psrs_kinds is None else op.psrs_kinds[1] + if isinstance(op, DataFrameSortValues): + output_types = op.output_types + else: + output_types = [OutputType.index] + concat_pivot_op = DataFramePSRSConcatPivot( + kind=kind, + n_partition=axis_chunk_shape, + output_types=output_types, + **cls._collect_op_properties(op) + ) + concat_pivot_shape = ( + sorted_chunks[0].shape[: op.axis] + + (axis_chunk_shape - 1,) + + sorted_chunks[0].shape[op.axis + 1 :] + ) + concat_pivot_index = out_idx[: op.axis] + (0,) + out_idx[op.axis :] + concat_pivot_chunk = concat_pivot_op.new_chunk( + sampled_chunks, + shape=concat_pivot_shape, + index=concat_pivot_index, + ) + return concat_pivot_chunk + + @classmethod + def partition_local_data( + cls, op, axis_chunk_shape, sorted_chunks, indices_chunks, concat_pivot_chunk + ): + # stage 3: Local data is partitioned + partition_chunks = [] + length = len(sorted_chunks) + for i in range(length): + chunk_inputs = [sorted_chunks[i], concat_pivot_chunk] + partition_shuffle_map = DataFramePSRSShuffle( + n_partition=axis_chunk_shape, + stage=OperandStage.map, + output_types=op.output_types, + **cls._collect_op_properties(op) + ) + if isinstance(chunk_inputs[0].index_value.value, IndexValue.RangeIndex): + index_value = parse_index(pd.Index([], dtype=np.int64)) + else: + index_value = chunk_inputs[0].index_value + kw = dict( + shape=chunk_inputs[0].shape, + index=chunk_inputs[0].index, + index_value=index_value, + ) + if op.outputs[0].ndim == 2: + kw.update( + dict( + columns_value=chunk_inputs[0].columns_value, + dtypes=chunk_inputs[0].dtypes, + ) + ) + else: + kw.update(dict(dtype=chunk_inputs[0].dtype, name=chunk_inputs[0].name)) + partition_chunk = partition_shuffle_map.new_chunk(chunk_inputs, **kw) + partition_chunks.append(partition_chunk) + return partition_chunks + + @classmethod + def partition_merge_data( + cls, op, need_align, return_value, partition_chunks, proxy_chunk + ): + # stage 4: all *ith* classes are gathered and merged + partition_sort_chunks, partition_indices_chunks, sort_info_chunks = [], [], [] + for i, partition_chunk in enumerate(partition_chunks): + kind = None if op.psrs_kinds is None else op.psrs_kinds[2] + partition_shuffle_reduce = DataFramePSRSShuffle( + stage=OperandStage.reduce, + kind=kind, + reducer_index=(i,), + n_reducers=len(partition_chunks), + output_types=op.output_types, + **cls._collect_op_properties(op) + ) + chunk_shape = list(partition_chunk.shape) + chunk_shape[op.axis] = np.nan + + kw = 
dict( + shape=tuple(chunk_shape), + index=partition_chunk.index, + index_value=partition_chunk.index_value, + ) + if op.outputs[0].ndim == 2: + kw.update( + dict( + columns_value=partition_chunk.columns_value, + dtypes=partition_chunk.dtypes, + ) + ) + else: + kw.update(dict(dtype=partition_chunk.dtype, name=partition_chunk.name)) + cs = partition_shuffle_reduce.new_chunks([proxy_chunk], **kw) + + partition_sort_chunks.append(cs[0]) + return partition_sort_chunks, partition_indices_chunks, sort_info_chunks + + @classmethod + def _tile_psrs(cls, op, in_data): + out = op.outputs[0] + in_df, axis_chunk_shape, _, _ = yield from cls.preprocess(op, in_data=in_data) + + # stage 1: local sort and regular samples collected + sorted_chunks, _, sampled_chunks = cls.local_sort_and_regular_sample( + op, in_df, axis_chunk_shape, None, None + ) + + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + concat_pivot_chunk = cls.concat_and_pivot( + op, + axis_chunk_shape, + (0,) if in_df.ndim == 2 else (), + sorted_chunks, + sampled_chunks, + ) + + # stage 3: Local data is partitioned + partition_chunks = cls.partition_local_data( + op, axis_chunk_shape, sorted_chunks, None, concat_pivot_chunk + ) + + proxy_chunk = DataFrameShuffleProxy(output_types=op.output_types).new_chunk( + partition_chunks, shape=() + ) + + # stage 4: all *ith* classes are gathered and merged + partition_sort_chunks = cls.partition_merge_data( + op, False, None, partition_chunks, proxy_chunk + )[0] + + if op.ignore_index: + yield partition_sort_chunks + chunks = standardize_range_index(partition_sort_chunks, axis=op.axis) + else: + chunks = partition_sort_chunks + + nsplits = calc_nsplits({c.index: c.shape for c in chunks}) + if op.outputs[0].ndim == 2: + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + shape=out.shape, + chunks=chunks, + nsplits=nsplits, + index_value=out.index_value, + columns_value=out.columns_value, + dtypes=out.dtypes, + ) + else: + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + shape=out.shape, + chunks=chunks, + nsplits=nsplits, + index_value=out.index_value, + dtype=out.dtype, + name=out.name, + ) + + +def execute_sort_values(data, op, inplace=None, by=None): + if inplace is None: + inplace = op.inplace + # ignore_index is new in Pandas version 1.0.0. + ignore_index = getattr(op, "ignore_index", False) + if isinstance(data, (pd.DataFrame, pd.Series)): + kwargs = dict( + axis=op.axis, + ascending=op.ascending, + ignore_index=ignore_index, + na_position=op.na_position, + kind=op.kind, + ) + if isinstance(data, pd.DataFrame): + kwargs["by"] = by if by is not None else op.by + if inplace: + kwargs["inplace"] = True + try: + data.sort_values(**kwargs) + except TypeError: # pragma: no cover + kwargs.pop("ignore_index", None) + data.sort_values(**kwargs) + return data + else: + try: + return data.sort_values(**kwargs) + except TypeError: # pragma: no cover + kwargs.pop("ignore_index", None) + return data.sort_values(**kwargs) + + else: # pragma: no cover + # cudf doesn't support axis and kind + if isinstance(data, cudf.DataFrame): + return data.sort_values( + op.by, ascending=op.ascending, na_position=op.na_position + ) + else: + return data.sort_values(ascending=op.ascending, na_position=op.na_position) + + +def execute_sort_index(data, op, inplace=None): + if inplace is None: + inplace = op.inplace + # ignore_index is new in Pandas version 1.0.0. 
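+    # On older pandas releases the keyword is rejected with a TypeError, which
+    # is why the calls below retry after dropping ``ignore_index`` from kwargs.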
+ ignore_index = getattr(op, "ignore_index", False) + if isinstance(data, (pd.DataFrame, pd.Series)): + kwargs = dict( + level=op.level, + ascending=op.ascending, + ignore_index=ignore_index, + na_position=op.na_position, + kind=op.kind, + sort_remaining=op.sort_remaining, + ) + if inplace: + kwargs["inplace"] = True + try: + data.sort_index(**kwargs) + except TypeError: # pragma: no cover + kwargs.pop("ignore_index", None) + data.sort_index(**kwargs) + return data + else: + try: + return data.sort_index(**kwargs) + except TypeError: # pragma: no cover + kwargs.pop("ignore_index", None) + return data.sort_index(**kwargs) + + else: # pragma: no cover + # cudf only support ascending + return data.sort_index(ascending=op.ascending) + + +class DataFramePSRSChunkOperand(DataFrameOperand): + # sort type could be 'sort_values' or 'sort_index' + sort_type = StringField("sort_type") + + axis = Int32Field("axis") + by = ListField("by", default=None) + ascending = AnyField("ascending") + inplace = BoolField("inplace") + kind = StringField("kind") + na_position = StringField("na_position") + + # for sort_index + level = ListField("level") + sort_remaining = BoolField("sort_remaining") + + n_partition = Int32Field("n_partition") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + +class DataFramePSRSSortRegularSample(DataFramePSRSChunkOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.PSRS_SORT_REGULAR_SMAPLE + + @property + def output_limit(self): + return 2 + + @classmethod + def execute(cls, ctx, op): + a = ctx[op.inputs[0].key] + xdf = pd if isinstance(a, (pd.DataFrame, pd.Series)) else cudf + + if len(a) == 0: + # when chunk is empty, return the empty chunk itself + ctx[op.outputs[0].key] = ctx[op.outputs[-1].key] = a + return + + if op.sort_type == "sort_values": + ctx[op.outputs[0].key] = res = execute_sort_values(a, op) + else: + ctx[op.outputs[0].key] = res = execute_sort_index(a, op) + + by = op.by + add_distinct_col = bool(int(os.environ.get("PSRS_DISTINCT_COL", "0"))) + if ( + add_distinct_col + and isinstance(a, xdf.DataFrame) + and op.sort_type == "sort_values" + ): + # when running under distributed mode, we introduce an extra column + # to make sure pivots are distinct + chunk_idx = op.inputs[0].index[0] + distinct_col = ( + _PSRS_DISTINCT_COL + if a.columns.nlevels == 1 + else (_PSRS_DISTINCT_COL,) + ("",) * (a.columns.nlevels - 1) + ) + res[distinct_col] = np.arange( + chunk_idx << 32, (chunk_idx << 32) + len(a), dtype=np.int64 + ) + by = list(by) + [distinct_col] + + n = op.n_partition + if op.sort_type == "sort_values" and a.shape[op.axis] < n: + num = n // a.shape[op.axis] + 1 + res = execute_sort_values(xdf.concat([res] * num), op, by=by) + + w = res.shape[op.axis] * 1.0 / (n + 1) + slc = np.linspace( + max(w - 1, 0), res.shape[op.axis] - 1, num=n, endpoint=False + ).astype(int) + if op.axis == 1: + slc = (slice(None), slc) + if op.sort_type == "sort_values": + # do regular sample + if op.by is not None: + ctx[op.outputs[-1].key] = res[by].iloc[slc] + else: + ctx[op.outputs[-1].key] = res.iloc[slc] + else: + # do regular sample + ctx[op.outputs[-1].key] = res.iloc[slc] + + +class DataFramePSRSConcatPivot(DataFramePSRSChunkOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.PSRS_CONCAT_PIVOT + + @property + def output_limit(self): + return 1 + + @classmethod + def execute(cls, ctx, op): + inputs = [ctx[c.key] for c in op.inputs if len(ctx[c.key]) > 0] + if len(inputs) == 0: + # corner case: nothing sampled, we need 
to do nothing + ctx[op.outputs[-1].key] = ctx[op.inputs[0].key] + return + + xdf = pd if isinstance(inputs[0], (pd.DataFrame, pd.Series)) else cudf + + a = xdf.concat(inputs, axis=op.axis) + p = len(inputs) + assert a.shape[op.axis] == p * len(op.inputs) + + slc = np.linspace( + p - 1, a.shape[op.axis] - 1, num=len(op.inputs) - 1, endpoint=False + ).astype(int) + if op.axis == 1: + slc = (slice(None), slc) + if op.sort_type == "sort_values": + a = execute_sort_values(a, op, inplace=False) + ctx[op.outputs[-1].key] = a.iloc[slc] + else: + a = execute_sort_index(a, op, inplace=False) + ctx[op.outputs[-1].key] = a.index[slc] + + +class DataFramePSRSShuffle(MapReduceOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.PSRS_SHUFFLE + + sort_type = StringField("sort_type") + + # for shuffle map + axis = Int32Field("axis") + by = ListField("by") + ascending = AnyField("ascending") + inplace = BoolField("inplace") + na_position = StringField("na_position") + n_partition = Int32Field("n_partition") + + # for sort_index + level = ListField("level") + sort_remaining = BoolField("sort_remaining") + + # for shuffle reduce + kind = StringField("kind") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + @property + def output_limit(self): + return 1 + + @staticmethod + def _calc_poses(src_cols, pivots, ascending=True): + # The pivots are immutable if it is got from shared memory, e.g. Ray object store. + # Pandas < 1.4 has item setting bug and pandas >= 1.4 has fixed it. + # + # Here, almost all the cases that the pivots are got from shared memory. + # + # `pivots[col] = -pivots[col]` will automatically replace the col with a new copy + # `-pivots[col]` in pandas >= 1.4, but it will try to inplace set col in pandas < 1.4 + # + # So, we use assign here to walk around incorrect inplace set item bug in pandas < 1.4. + # Please refer to: https://github.com/mars-project/mars/issues/3215 + # related issue: https://github.com/pandas-dev/pandas/pull/43406 + copy_cols = {} + if isinstance(ascending, list): + for asc, col in zip(ascending, pivots.columns): + # Make pivots available to use ascending order when mixed order specified + if not asc: + if pd.api.types.is_numeric_dtype(pivots.dtypes[col]): + # for numeric dtypes, convert to negative is more efficient + copy_cols[col] = -pivots[col] + src_cols[col] = -src_cols[col] + else: + # for other types, convert to ReversedValue + copy_cols[col] = pivots[col].map( + lambda x: x + if type(x) is _ReversedValue + else _ReversedValue(x) + ) + ascending = True + + if copy_cols: + pivots = pivots.assign(**copy_cols) + + records = src_cols.to_records(index=False) + p_records = pivots.to_records(index=False) + if ascending: + poses = records.searchsorted(p_records, side="right") + else: + poses = len(records) - records[::-1].searchsorted(p_records, side="right") + del records, p_records + return poses + + @classmethod + def _execute_dataframe_map(cls, ctx, op): + a, pivots = [ctx[c.key] for c in op.inputs] + out = op.outputs[0] + + if len(a) == 0: + # when the chunk is empty, no slices can be produced + for i in range(op.n_partition): + ctx[out.key, (i,)] = a + return + + # use numpy.searchsorted to find split positions. 
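+        # Illustrative example (assumed values, not part of the operator): with
+        # sorted keys ``records = np.array([1, 3, 3, 7, 9])`` and pivots
+        # ``np.array([3, 8])``, ``records.searchsorted(pivots, side="right")``
+        # returns ``array([3, 4])``, so rows ``[0:3]``, ``[3:4]`` and ``[4:]``
+        # are routed to reducers 0, 1 and 2 respectively.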
+ by = op.by + + distinct_col = ( + _PSRS_DISTINCT_COL + if a.columns.nlevels == 1 + else (_PSRS_DISTINCT_COL,) + ("",) * (a.columns.nlevels - 1) + ) + if distinct_col in a.columns: + by = list(by) + [distinct_col] + + try: + poses = cls._calc_poses(a[by], pivots, op.ascending) + except TypeError: + poses = cls._calc_poses( + a[by].fillna(_largest), pivots.fillna(_largest), op.ascending + ) + + poses = (None,) + tuple(poses) + (None,) + for i in range(op.n_partition): + values = a.iloc[poses[i] : poses[i + 1]] + if is_cudf(values): # pragma: no cover + values = values.copy() + ctx[out.key, (i,)] = values + + @classmethod + def _calc_series_poses(cls, s, pivots, ascending=True): + if ascending: + poses = s.searchsorted(pivots, side="right") + else: + poses = len(s) - s.iloc[::-1].searchsorted(pivots, side="right") + return poses + + @classmethod + def _execute_series_map(cls, ctx, op): + a, pivots = [ctx[c.key] for c in op.inputs] + out = op.outputs[0] + + if len(a) == 0: + # when the chunk is empty, no slices can be produced + for i in range(op.n_partition): + ctx[out.key, (i,)] = a + return + + if isinstance(a, pd.Series): + try: + poses = cls._calc_series_poses(a, pivots, ascending=op.ascending) + except TypeError: + filled_a = a.fillna(_largest) + filled_pivots = pivots.fillna(_largest) + poses = cls._calc_series_poses( + filled_a, filled_pivots, ascending=op.ascending + ) + poses = (None,) + tuple(poses) + (None,) + for i in range(op.n_partition): + values = a.iloc[poses[i] : poses[i + 1]] + ctx[out.key, (i,)] = values + + @classmethod + def _execute_sort_index_map(cls, ctx, op): + a, pivots = [ctx[c.key] for c in op.inputs] + out = op.outputs[0] + + if op.ascending: + poses = a.index.searchsorted(list(pivots), side="right") + else: + poses = len(a) - a.index[::-1].searchsorted(list(pivots), side="right") + poses = (None,) + tuple(poses) + (None,) + for i in range(op.n_partition): + values = a.iloc[poses[i] : poses[i + 1]] + ctx[out.key, (i,)] = values + + @classmethod + def _execute_map(cls, ctx, op): + a = [ctx[c.key] for c in op.inputs][0] + if op.sort_type == "sort_values": + if len(a.shape) == 2: + # DataFrame type + cls._execute_dataframe_map(ctx, op) + else: + # Series type + cls._execute_series_map(ctx, op) + else: + cls._execute_sort_index_map(ctx, op) + + @classmethod + def _execute_reduce(cls, ctx, op: "DataFramePSRSShuffle"): + out_chunk = op.outputs[0] + raw_inputs = list(op.iter_mapper_data(ctx, pop=False)) + + xdf = pd if isinstance(raw_inputs[0], (pd.DataFrame, pd.Series)) else cudf + if xdf is pd: + concat_values = xdf.concat(raw_inputs, axis=op.axis, copy=False) + else: + concat_values = xdf.concat(raw_inputs, axis=op.axis) + del raw_inputs[:] + + if isinstance(concat_values, xdf.DataFrame): + concat_values.drop( + _PSRS_DISTINCT_COL, axis=1, inplace=True, errors="ignore" + ) + + col_index_dtype = out_chunk.columns_value.to_pandas().dtype + if concat_values.columns.dtype != col_index_dtype: + concat_values.columns = concat_values.columns.astype(col_index_dtype) + + if op.sort_type == "sort_values": + ctx[op.outputs[0].key] = execute_sort_values(concat_values, op) + else: + ctx[op.outputs[0].key] = execute_sort_index(concat_values, op) + + @classmethod + def estimate_size(cls, ctx, op): + super().estimate_size(ctx, op) + result = ctx[op.outputs[0].key] + if op.stage == OperandStage.reduce: + ctx[op.outputs[0].key] = (result[0], result[1] * 1.5) + else: + ctx[op.outputs[0].key] = result + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + 
cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) diff --git a/python/xorbits/_mars/dataframe/sort/sort_index.py b/python/xorbits/_mars/dataframe/sort/sort_index.py new file mode 100644 index 000000000..f7f0874be --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/sort_index.py @@ -0,0 +1,245 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType, recursive_tile +from ...serialization.serializables import BoolField, ListField +from ...tensor.base.sort import _validate_sort_psrs_kinds +from ...utils import calc_nsplits +from ..operands import DATAFRAME_TYPE +from ..utils import ( + build_concatenated_rows_frame, + parse_index, + standardize_range_index, + validate_axis, +) +from .core import DataFrameSortOperand +from .psrs import DataFramePSRSOperandMixin, execute_sort_index + + +class DataFrameSortIndex(DataFrameSortOperand, DataFramePSRSOperandMixin): + _op_type_ = OperandDef.SORT_INDEX + + level = ListField("level", default=None) + sort_remaining = BoolField("sort_remaining", default=None) + + @classmethod + def _tile(cls, op): + df = op.inputs[0] + + if op.axis == 0: + if df.chunk_shape[op.axis] == 1: + if op.output_types[0] == OutputType.dataframe: + df = build_concatenated_rows_frame(df) + out_chunks = [] + for chunk in df.chunks: + chunk_op = op.copy().reset_key() + out_chunks.append( + chunk_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + index_value=chunk.index_value, + columns_value=chunk.columns_value, + dtypes=chunk.dtypes, + ) + ) + new_op = op.copy() + kws = op.outputs[0].params.copy() + kws["nsplits"] = df.nsplits + kws["chunks"] = out_chunks + return new_op.new_dataframes(op.inputs, **kws) + else: + out_chunks = [] + for chunk in df.chunks: + chunk_op = op.copy().reset_key() + out_chunks.append( + chunk_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + index_value=chunk.index_value, + name=chunk.name, + dtype=chunk.dtype, + ) + ) + new_op = op.copy() + kws = op.outputs[0].params.copy() + kws["nsplits"] = df.nsplits + kws["chunks"] = out_chunks + return new_op.new_seriess(op.inputs, **kws) + else: + if op.output_types[0] == OutputType.dataframe: + df = build_concatenated_rows_frame(df) + if op.na_position != "last": # pragma: no cover + raise NotImplementedError("Only support puts NaNs at the end.") + # use parallel sorting by regular sampling + return (yield from cls._tile_psrs(op, df)) + else: + assert op.axis == 1 + + sorted_columns = list( + df.columns_value.to_pandas().sort_values(ascending=op.ascending) + ) + r = [(yield from recursive_tile(df[sorted_columns]))] + if op.ignore_index: + chunks = r[0].chunks + yield chunks + out = op.outputs[0] + chunks = standardize_range_index(chunks, axis=0) + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + shape=out.shape, + chunks=chunks, + nsplits=calc_nsplits({c.index: c.shape for c in chunks}), + index_value=out.index_value, + 
columns_value=out.columns_value, + dtypes=out.dtypes, + ) + return r + + @classmethod + def execute(cls, ctx, op: "DataFrameSortIndex"): + in_data = ctx[op.inputs[0].key] + result = execute_sort_index(in_data, op) + if op.nrows is not None: + result = result.head(op.nrows) + ctx[op.outputs[0].key] = result + + def _call_dataframe(self, df): + if self.ignore_index: + index_value = parse_index(pd.RangeIndex(df.shape[0])) + else: + index_value = df.index_value + if self.axis == 0: + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=df.dtypes, + index_value=index_value, + columns_value=df.columns_value, + ) + else: + dtypes = df.dtypes.sort_index(ascending=self.ascending) + columns_value = parse_index(dtypes.index, store_data=True) + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + def _call_series(self, series): + if self.axis != 0: # pragma: no cover + raise TypeError(f"Invalid axis: {self.axis}") + if self.ignore_index: + index_value = parse_index(pd.RangeIndex(series.shape[0])) + else: + index_value = series.index_value + + return self.new_series( + [series], + shape=series.shape, + dtype=series.dtype, + index_value=index_value, + name=series.name, + ) + + def __call__(self, a): + if isinstance(a, DATAFRAME_TYPE): + self.output_types = [OutputType.dataframe] + return self._call_dataframe(a) + else: + self.output_types = [OutputType.series] + return self._call_series(a) + + +def sort_index( + a, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + ignore_index: bool = False, + parallel_kind="PSRS", + psrs_kinds=None, +): + """ + Sort object by labels (along an axis). + + Parameters + ---------- + a : Input DataFrame or Series. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + ascending : bool, default True + Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also ndarray.np.sort for more + information. `mergesort` is the only stable algorithm. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. + Not implemented for MultiIndex. + sort_remaining : bool, default True + If True and sorting by level and index is multilevel, sort by other + levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + parallel_kind: {'PSRS'}, optional. + Parallel sorting algorithm, for the details, refer to: + http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html + psrs_kinds: Sorting algorithms during PSRS algorithm. + + Returns + ------- + sorted_obj : DataFrame or None + DataFrame with sorted index if inplace=False, None otherwise. 
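+
+    Examples
+    --------
+    A minimal illustration; the semantics follow ``pandas.DataFrame.sort_index``:
+
+    >>> import mars.dataframe as md
+    >>> df = md.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], columns=['A'])
+    >>> df.sort_index().execute()
+         A
+    1    4
+    29   2
+    100  1
+    150  5
+    234  3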
+ """ + if na_position not in ["last", "first"]: # pragma: no cover + raise TypeError(f"Invalid na_position: {na_position}") + psrs_kinds = _validate_sort_psrs_kinds(psrs_kinds) + axis = validate_axis(axis, a) + level = level if isinstance(level, (list, tuple)) else [level] + op = DataFrameSortIndex( + level=level, + axis=axis, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + parallel_kind=parallel_kind, + psrs_kinds=psrs_kinds, + gpu=a.op.is_gpu(), + ) + sorted_a = op(a) + if inplace: + a.data = sorted_a.data + else: + return sorted_a diff --git a/python/xorbits/_mars/dataframe/sort/sort_values.py b/python/xorbits/_mars/dataframe/sort/sort_values.py new file mode 100644 index 000000000..49174c836 --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/sort_values.py @@ -0,0 +1,387 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import ListField +from ...tensor.base.sort import _validate_sort_psrs_kinds +from ..core import IndexValue +from ..utils import build_concatenated_rows_frame, parse_index, validate_axis +from .core import DataFrameSortOperand +from .psrs import DataFramePSRSOperandMixin, execute_sort_values + + +class DataFrameSortValues(DataFrameSortOperand, DataFramePSRSOperandMixin): + _op_type_ = OperandDef.SORT_VALUES + + by = ListField("by", default=None) + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + @classmethod + def _tile_dataframe(cls, op): + df = build_concatenated_rows_frame(op.inputs[0]) + + if df.chunk_shape[op.axis] == 1: + out_chunks = [] + for chunk in df.chunks: + chunk_op = op.copy().reset_key() + out_chunks.append( + chunk_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + index_value=op.outputs[0].index_value, + columns_value=chunk.columns_value, + dtypes=chunk.dtypes, + ) + ) + new_op = op.copy() + kws = op.outputs[0].params.copy() + kws["nsplits"] = df.nsplits + kws["chunks"] = out_chunks + return new_op.new_dataframes(op.inputs, **kws) + else: + if op.na_position != "last": # pragma: no cover + raise NotImplementedError("Only support puts NaNs at the end.") + # use parallel sorting by regular sampling + return (yield from cls._tile_psrs(op, df)) + + @classmethod + def _tile_series(cls, op): + series = op.inputs[0] + if len(series.chunks) == 1: + chunk = series.chunks[0] + chunk_op = op.copy().reset_key() + out_chunks = [ + chunk_op.new_chunk( + series.chunks, + shape=chunk.shape, + index=chunk.index, + index_value=op.outputs[0].index_value, + dtype=chunk.dtype, + name=chunk.name, + ) + ] + new_op = op.copy() + kws = op.outputs[0].params.copy() + kws["nsplits"] = series.nsplits + kws["chunks"] = out_chunks + return new_op.new_seriess(op.inputs, **kws) + else: + if op.na_position != "last": # pragma: no 
cover + raise NotImplementedError("Only support puts NaNs at the end.") + # use parallel sorting by regular sampling + return (yield from cls._tile_psrs(op, series)) + + @classmethod + def _tile(cls, op): + inp = op.inputs[0] + if inp.shape[op.axis] == 0: + # if the length is zero, return input directly + return inp + if inp.ndim == 2: + return (yield from cls._tile_dataframe(op)) + else: + return (yield from cls._tile_series(op)) + + @classmethod + def execute(cls, ctx, op: "DataFrameSortValues"): + in_data = ctx[op.inputs[0].key] + result = execute_sort_values(in_data, op) + if op.nrows is not None: + result = result.head(op.nrows) + ctx[op.outputs[0].key] = result + + def __call__(self, a): + assert self.axis == 0 + if self.ignore_index: + index_value = parse_index(pd.RangeIndex(a.shape[0])) + else: + if isinstance(a.index_value.value, IndexValue.RangeIndex): + index_value = parse_index(pd.Index([], dtype=np.int64)) + else: + index_value = a.index_value + if a.ndim == 2: + return self.new_dataframe( + [a], + shape=a.shape, + dtypes=a.dtypes, + index_value=index_value, + columns_value=a.columns_value, + ) + else: + return self.new_series( + [a], shape=a.shape, dtype=a.dtype, index_value=index_value, name=a.name + ) + + +def dataframe_sort_values( + df, + by, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + parallel_kind="PSRS", + psrs_kinds=None, +): + """ + Sort by the values along either axis. + + Parameters + ---------- + df : Mars DataFrame + Input dataframe. + by : str + Name or list of names to sort by. + axis : %(axes_single_arg)s, default 0 + Axis to be sorted. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of + the by. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also ndarray.np.sort for more + information. `mergesort` is the only stable algorithm. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the + end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + parallel_kind : {'PSRS'}, default 'PSRS' + Parallel sorting algorithm, for the details, refer to: + http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html + + Returns + ------- + sorted_obj : DataFrame or None + DataFrame with sorted values if inplace=False, None otherwise. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({ + ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], + ... 'col2': [2, 1, 9, 8, 7, 4], + ... 'col3': [0, 1, 9, 4, 2, 3], + ... 
}) + >>> df.execute() + col1 col2 col3 + 0 A 2 0 + 1 A 1 1 + 2 B 9 9 + 3 NaN 8 4 + 4 D 7 2 + 5 C 4 3 + + Sort by col1 + + >>> df.sort_values(by=['col1']).execute() + col1 col2 col3 + 0 A 2 0 + 1 A 1 1 + 2 B 9 9 + 5 C 4 3 + 4 D 7 2 + 3 NaN 8 4 + + Sort by multiple columns + + >>> df.sort_values(by=['col1', 'col2']).execute() + col1 col2 col3 + 1 A 1 1 + 0 A 2 0 + 2 B 9 9 + 5 C 4 3 + 4 D 7 2 + 3 NaN 8 4 + + Sort Descending + + >>> df.sort_values(by='col1', ascending=False).execute() + col1 col2 col3 + 4 D 7 2 + 5 C 4 3 + 2 B 9 9 + 0 A 2 0 + 1 A 1 1 + 3 NaN 8 4 + + Putting NAs first + + >>> df.sort_values(by='col1', ascending=False, na_position='first').execute() + col1 col2 col3 + 3 NaN 8 4 + 4 D 7 2 + 5 C 4 3 + 2 B 9 9 + 0 A 2 0 + 1 A 1 1 + """ + + if na_position not in ["last", "first"]: # pragma: no cover + raise TypeError(f"invalid na_position: {na_position}") + axis = validate_axis(axis, df) + if axis != 0: + raise NotImplementedError("Only support sort on axis 0") + psrs_kinds = _validate_sort_psrs_kinds(psrs_kinds) + by = by if isinstance(by, (list, tuple)) else [by] + if isinstance(ascending, list): # pragma: no cover + if all(ascending): + # all are True, convert to True + ascending = True + elif not any(ascending): + # all are False, convert to False + ascending = False + op = DataFrameSortValues( + by=by, + axis=axis, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + ignore_index=ignore_index, + parallel_kind=parallel_kind, + psrs_kinds=psrs_kinds, + gpu=df.op.is_gpu(), + output_types=[OutputType.dataframe], + ) + sorted_df = op(df) + if inplace: + df.data = sorted_df.data + else: + return sorted_df + + +def series_sort_values( + series, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + parallel_kind="PSRS", + psrs_kinds=None, +): + """ + Sort by the values. + + Sort a Series in ascending or descending order by some + criterion. + + Parameters + ---------- + series : input Series. + axis : {0 or 'index'}, default 0 + Axis to direct sorting. The value 'index' is accepted for + compatibility with DataFrame.sort_values. + ascending : bool, default True + If True, sort values in ascending order, otherwise descending. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort' or 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also :func:`numpy.sort` for more + information. 'mergesort' is the only stable algorithm. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + Returns + ------- + Series + Series ordered by values. 
+ + Examples + -------- + >>> import mars.dataframe as md + >>> raw = pd.Series([np.nan, 1, 3, 10, 5]) + >>> s = md.Series(raw) + >>> s.execute() + 0 NaN + 1 1.0 + 2 3.0 + 3 10.0 + 4 5.0 + dtype: float64 + + Sort values ascending order (default behaviour) + + >>> s.sort_values(ascending=True).execute() + 1 1.0 + 2 3.0 + 4 5.0 + 3 10.0 + 0 NaN + dtype: float64 + + Sort values descending order + + >>> s.sort_values(ascending=False).execute() + 3 10.0 + 4 5.0 + 2 3.0 + 1 1.0 + 0 NaN + dtype: float64 + + Sort values inplace + + >>> s.sort_values(ascending=False, inplace=True) + >>> s.execute() + 3 10.0 + 4 5.0 + 2 3.0 + 1 1.0 + 0 NaN + dtype: float64 + + Sort values putting NAs first + """ + if na_position not in ["last", "first"]: # pragma: no cover + raise TypeError(f"invalid na_position: {na_position}") + axis = validate_axis(axis, series) + if axis != 0: + raise NotImplementedError("Only support sort on axis 0") + psrs_kinds = _validate_sort_psrs_kinds(psrs_kinds) + op = DataFrameSortValues( + axis=axis, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + ignore_index=ignore_index, + parallel_kind=parallel_kind, + psrs_kinds=psrs_kinds, + output_types=[OutputType.series], + gpu=series.op.is_gpu(), + ) + sorted_series = op(series) + if inplace: + series.data = sorted_series.data + else: + return sorted_series diff --git a/python/xorbits/_mars/dataframe/sort/tests/__init__.py b/python/xorbits/_mars/dataframe/sort/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/sort/tests/test_sort.py b/python/xorbits/_mars/dataframe/sort/tests/test_sort.py new file mode 100644 index 000000000..f66b33cde --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/tests/test_sort.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + +from ....core import tile +from ....core.operand import OperandStage +from ...indexing.getitem import DataFrameIndex +from ...initializer import DataFrame +from ..sort_index import DataFrameSortIndex, sort_index +from ..sort_values import DataFrameSortValues, dataframe_sort_values + + +def test_sort_values(): + raw = pd.DataFrame( + { + "a": np.random.rand(10), + "b": np.random.randint(1000, size=10), + "c": np.random.rand(10), + "d": [np.random.bytes(10) for _ in range(10)], + "e": [pd.Timestamp(f"201{i}") for i in range(10)], + "f": [pd.Timedelta(f"{i} days") for i in range(10)], + }, + ) + df = DataFrame(raw) + sorted_df = dataframe_sort_values(df, by="c") + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortValues) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 1 + assert isinstance(tiled.chunks[0].op, DataFrameSortValues) + + df = DataFrame(raw, chunk_size=6) + sorted_df = dataframe_sort_values(df, by="c") + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortValues) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 2 + assert tiled.chunks[0].op.stage == OperandStage.reduce + + df = DataFrame(raw, chunk_size=3) + sorted_df = dataframe_sort_values(df, by=["a", "c"]) + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortValues) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 3 + assert tiled.chunks[0].op.stage == OperandStage.reduce + pd.testing.assert_series_equal(tiled.chunks[0].dtypes, raw.dtypes) + assert tiled.chunks[1].op.stage == OperandStage.reduce + pd.testing.assert_series_equal(tiled.chunks[1].dtypes, raw.dtypes) + assert tiled.chunks[2].op.stage == OperandStage.reduce + pd.testing.assert_series_equal(tiled.chunks[2].dtypes, raw.dtypes) + + +def test_sort_index(): + raw = pd.DataFrame( + np.random.rand(10, 10), columns=np.random.rand(10), index=np.random.rand(10) + ) + df = DataFrame(raw) + sorted_df = sort_index(df) + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortIndex) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 1 + assert isinstance(tiled.chunks[0].op, DataFrameSortIndex) + + df = DataFrame(raw, chunk_size=6) + sorted_df = sort_index(df) + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortIndex) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 2 + assert tiled.chunks[0].op.stage == OperandStage.reduce + + df = DataFrame(raw, chunk_size=3) + sorted_df = sort_index(df) + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortIndex) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 3 + assert tiled.chunks[0].op.stage == OperandStage.reduce + assert tiled.chunks[1].op.stage == OperandStage.reduce + assert tiled.chunks[2].op.stage == OperandStage.reduce + + # support on axis 1 + df = DataFrame(raw, chunk_size=4) + sorted_df = sort_index(df, axis=1) + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortIndex) + + tiled = tile(sorted_df) + + assert all(isinstance(c.op, DataFrameIndex) for c in tiled.chunks) is True diff --git a/python/xorbits/_mars/dataframe/sort/tests/test_sort_execution.py b/python/xorbits/_mars/dataframe/sort/tests/test_sort_execution.py new file mode 100644 index 000000000..8c04fe8de --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/tests/test_sort_execution.py @@ -0,0 +1,423 @@ +# Copyright 2022-2023 
XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +import numpy as np +import pandas as pd +import pytest + +from ....tests.core import require_cudf +from ... import ArrowStringDtype, DataFrame, Series + + +@pytest.mark.parametrize( + "distinct_opt", ["0"] if sys.platform.lower().startswith("win") else ["0", "1"] +) +def test_sort_values_execution(setup, distinct_opt): + ns = np.random.RandomState(0) + os.environ["PSRS_DISTINCT_COL"] = distinct_opt + df = pd.DataFrame(ns.rand(100, 10), columns=["a" + str(i) for i in range(10)]) + + # test one chunk + mdf = DataFrame(df) + result = mdf.sort_values("a0").execute().fetch() + expected = df.sort_values("a0") + + pd.testing.assert_frame_equal(result, expected) + + result = mdf.sort_values(["a6", "a7"], ascending=False).execute().fetch() + expected = df.sort_values(["a6", "a7"], ascending=False) + + pd.testing.assert_frame_equal(result, expected) + + # test psrs + mdf = DataFrame(df, chunk_size=10) + result = mdf.sort_values("a0").execute().fetch() + expected = df.sort_values("a0") + + pd.testing.assert_frame_equal(result, expected) + + result = mdf.sort_values(["a3", "a4"]).execute().fetch() + expected = df.sort_values(["a3", "a4"]) + + pd.testing.assert_frame_equal(result, expected) + + # test ascending=False + result = mdf.sort_values(["a0", "a1"], ascending=False).execute().fetch() + expected = df.sort_values(["a0", "a1"], ascending=False) + + pd.testing.assert_frame_equal(result, expected) + + result = mdf.sort_values(["a7"], ascending=False).execute().fetch() + expected = df.sort_values(["a7"], ascending=False) + + pd.testing.assert_frame_equal(result, expected) + + # test ascending is a list + result = ( + mdf.sort_values(["a3", "a4", "a5", "a6"], ascending=[False, True, True, False]) + .execute() + .fetch() + ) + expected = df.sort_values( + ["a3", "a4", "a5", "a6"], ascending=[False, True, True, False] + ) + pd.testing.assert_frame_equal(result, expected) + + in_df = pd.DataFrame( + { + "col1": ns.choice([f"a{i}" for i in range(5)], size=(100,)), + "col2": ns.choice([f"b{i}" for i in range(5)], size=(100,)), + "col3": ns.choice([f"c{i}" for i in range(5)], size=(100,)), + "col4": ns.randint(10, 20, size=(100,)), + } + ) + mdf = DataFrame(in_df, chunk_size=10) + result = ( + mdf.sort_values( + ["col1", "col4", "col3", "col2"], ascending=[False, False, True, False] + ) + .execute() + .fetch() + ) + expected = in_df.sort_values( + ["col1", "col4", "col3", "col2"], ascending=[False, False, True, False] + ) + pd.testing.assert_frame_equal(result, expected) + + # test multiindex + df2 = df.copy(deep=True) + df2.columns = pd.MultiIndex.from_product([list("AB"), list("CDEFG")]) + mdf = DataFrame(df2, chunk_size=5) + + result = mdf.sort_values([("A", "C")]).execute().fetch() + expected = df2.sort_values([("A", "C")]) + + pd.testing.assert_frame_equal(result, expected) + + # test rechunk + mdf = DataFrame(df, chunk_size=3) + result = mdf.sort_values("a0").execute().fetch() + expected = df.sort_values("a0") + + 
pd.testing.assert_frame_equal(result, expected) + + result = mdf.sort_values(["a3", "a4"]).execute().fetch() + expected = df.sort_values(["a3", "a4"]) + + pd.testing.assert_frame_equal(result, expected) + + # test other types + raw = pd.DataFrame( + { + "a": np.random.rand(10), + "b": np.random.randint(1000, size=10), + "c": np.random.rand(10), + "d": [np.random.bytes(10) for _ in range(10)], + "e": [pd.Timestamp(f"201{i}") for i in range(10)], + "f": [pd.Timedelta(f"{i} days") for i in range(10)], + }, + ) + mdf = DataFrame(raw, chunk_size=3) + + for label in raw.columns: + result = mdf.sort_values(label).execute().fetch() + expected = raw.sort_values(label) + pd.testing.assert_frame_equal(result, expected) + + result = mdf.sort_values(["a", "b", "e"], ascending=False).execute().fetch() + expected = raw.sort_values(["a", "b", "e"], ascending=False) + + pd.testing.assert_frame_equal(result, expected) + + # test nan + df = pd.DataFrame( + { + "col1": ["A", "A", "B", "B", "D", "C"], + "col2": [2, 1, 9, np.nan, 7, 4], + "col3": [0, 1, 9, 4, 2, 3], + } + ) + mdf = DataFrame(df) + result = mdf.sort_values(["col2"]).execute().fetch() + expected = df.sort_values(["col2"]) + + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(df, chunk_size=3) + result = mdf.sort_values(["col2"]).execute().fetch() + expected = df.sort_values(["col2"]) + + pd.testing.assert_frame_equal(result, expected) + + # test None (issue #1885) + df = pd.DataFrame(np.random.rand(1000, 10)) + + df[0][df[0] < 0.5] = "A" + df[0][df[0] != "A"] = None + + mdf = DataFrame(df) + result = mdf.sort_values([0, 1]).execute().fetch() + expected = df.sort_values([0, 1]) + + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(df, chunk_size=100) + result = mdf.sort_values([0, 1]).execute().fetch() + expected = df.sort_values([0, 1]) + + pd.testing.assert_frame_equal(result, expected) + + # test ignore_index + df = pd.DataFrame(np.random.rand(10, 3), columns=["a" + str(i) for i in range(3)]) + + mdf = DataFrame(df, chunk_size=3) + result = mdf.sort_values(["a0", "a1"], ignore_index=True).execute().fetch() + try: # for python3.5 + expected = df.sort_values(["a0", "a1"], ignore_index=True) + except TypeError: + expected = df.sort_values(["a0", "a1"]) + expected.index = pd.RangeIndex(len(expected)) + + pd.testing.assert_frame_equal(result, expected) + + # test inplace + mdf = DataFrame(df) + mdf.sort_values("a0", inplace=True) + result = mdf.execute().fetch() + df.sort_values("a0", inplace=True) + + pd.testing.assert_frame_equal(result, df) + + # test unknown shape + df = pd.DataFrame({"a": list(range(10)), "b": np.random.random(10)}) + mdf = DataFrame(df, chunk_size=4) + filtered = mdf[mdf["a"] > 2] + result = filtered.sort_values(by="b").execute().fetch() + + pd.testing.assert_frame_equal(result, df[df["a"] > 2].sort_values(by="b")) + + # test empty dataframe + df = pd.DataFrame({"a": list(range(10)), "b": np.random.random(10)}) + mdf = DataFrame(df, chunk_size=4) + filtered = mdf[mdf["b"] > 100] + result = filtered.sort_values(by="b").execute().fetch() + + pd.testing.assert_frame_equal(result, df[df["b"] > 100].sort_values(by="b")) + + # test chunks with zero length + df = pd.DataFrame({"a": list(range(10)), "b": np.random.random(10)}) + df.iloc[4:8, 1] = 0 + + mdf = DataFrame(df, chunk_size=4) + filtered = mdf[mdf["b"] != 0] + result = filtered.sort_values(by="b").execute().fetch() + + pd.testing.assert_frame_equal(result, df[df["b"] != 0].sort_values(by="b")) + + # test Series.sort_values + raw = 
pd.Series(np.random.rand(10)) + series = Series(raw) + result = series.sort_values().execute().fetch() + expected = raw.sort_values() + + pd.testing.assert_series_equal(result, expected) + + series = Series(raw, chunk_size=3) + result = series.sort_values().execute().fetch() + expected = raw.sort_values() + + pd.testing.assert_series_equal(result, expected) + + series = Series(raw, chunk_size=2) + result = series.sort_values(ascending=False).execute().fetch() + expected = raw.sort_values(ascending=False) + + pd.testing.assert_series_equal(result, expected) + + # test empty series + series = pd.Series(list(range(10)), name="a") + mseries = Series(series, chunk_size=4) + filtered = mseries[mseries > 100] + result = filtered.sort_values().execute().fetch() + + pd.testing.assert_series_equal(result, series[series > 100].sort_values()) + + # test series with None + series = pd.Series(np.arange(1000)) + + series[series < 500] = "A" + series[series != "A"] = None + + mseries = Series(series, chunk_size=100) + result = mseries.sort_values().execute().fetch() + expected = series.sort_values() + pd.testing.assert_series_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) + + # test for empty input(#GH 2649) + pd_df = pd.DataFrame(np.random.rand(10, 3), columns=["col1", "col2", "col3"]) + df = DataFrame(pd_df, chunk_size=4) + df = df[df["col2"] > 1].execute() + result = df.sort_values(by="col1").execute().fetch() + expected = pd_df[pd_df["col2"] > 1].sort_values(by="col1") + pd.testing.assert_frame_equal(result, expected) + + pd_s = pd.Series(np.random.rand(10)) + s = Series(pd_s, chunk_size=4) + s = s[s > 1].execute() + result = s.sort_values().execute().fetch() + expected = pd_s[pd_s > 1].sort_values() + pd.testing.assert_series_equal(result, expected) + + +def test_sort_index_execution(setup): + raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100)) + + mdf = DataFrame(raw) + result = mdf.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw) + mdf.sort_index(inplace=True) + result = mdf.execute().fetch() + expected = raw.sort_index() + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw, chunk_size=30) + result = mdf.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw, chunk_size=20) + result = mdf.sort_index(ascending=False).execute().fetch() + expected = raw.sort_index(ascending=False) + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw, chunk_size=10) + result = mdf.sort_index(ignore_index=True).execute().fetch() + try: # for python3.5 + expected = raw.sort_index(ignore_index=True) + except TypeError: + expected = raw.sort_index() + expected.index = pd.RangeIndex(len(expected)) + pd.testing.assert_frame_equal(result, expected) + + # test axis=1 + raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10)) + + mdf = DataFrame(raw) + result = mdf.sort_index(axis=1).execute().fetch() + expected = raw.sort_index(axis=1) + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw, chunk_size=3) + result = mdf.sort_index(axis=1).execute().fetch() + expected = raw.sort_index(axis=1) + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw, chunk_size=4) + result = mdf.sort_index(axis=1, ascending=False).execute().fetch() + expected = raw.sort_index(axis=1, ascending=False) + pd.testing.assert_frame_equal(result, 
expected) + + mdf = DataFrame(raw, chunk_size=4) + + result = mdf.sort_index(axis=1, ignore_index=True).execute().fetch() + try: # for python3.5 + expected = raw.sort_index(axis=1, ignore_index=True) + except TypeError: + expected = raw.sort_index(axis=1) + expected.index = pd.RangeIndex(len(expected)) + pd.testing.assert_frame_equal(result, expected) + + # test series + raw = pd.Series(np.random.rand(10), index=np.random.rand(10)) + + series = Series(raw) + result = series.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_series_equal(result, expected) + + series = Series(raw, chunk_size=2) + result = series.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_series_equal(result, expected) + + series = Series(raw, chunk_size=3) + result = series.sort_index(ascending=False).execute().fetch() + expected = raw.sort_index(ascending=False) + pd.testing.assert_series_equal(result, expected) + + +def test_arrow_string_sort_values(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + {"a": rs.rand(10), "b": [f"s{rs.randint(1000)}" for _ in range(10)]} + ) + raw["b"] = raw["b"].astype(ArrowStringDtype()) + mdf = DataFrame(raw, chunk_size=3) + + df = mdf.sort_values(by="b") + result = df.execute().fetch() + expected = raw.sort_values(by="b") + pd.testing.assert_frame_equal(result, expected) + + +@require_cudf +def test_gpu_execution(setup_gpu): + # test sort_values + rs = np.random.RandomState(0) + distinct_opts = ["0"] if sys.platform.lower().startswith("win") else ["0", "1"] + for add_distinct in distinct_opts: + os.environ["PSRS_DISTINCT_COL"] = add_distinct + + # test dataframe + raw = pd.DataFrame(rs.rand(100, 10), columns=["a" + str(i) for i in range(10)]) + mdf = DataFrame(raw, chunk_size=30).to_gpu() + + result = mdf.sort_values(by="a0").execute().fetch() + expected = raw.sort_values(by="a0") + pd.testing.assert_frame_equal(result.to_pandas(), expected) + + # test series + raw = pd.Series(rs.rand(10)) + series = Series(raw).to_gpu() + + result = series.sort_values().execute().fetch() + expected = raw.sort_values() + pd.testing.assert_series_equal(result.to_pandas(), expected) + + # test DataFrame.sort_index + raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10)) + mdf = DataFrame(raw).to_gpu() + + result = mdf.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_frame_equal(result.to_pandas(), expected) + + # test Series.sort_index + raw = pd.Series( + np.random.rand(10), + index=np.random.rand(10), + ) + series = Series(raw).to_gpu() + + result = series.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_series_equal(result.to_pandas(), expected) diff --git a/python/xorbits/_mars/dataframe/statistics/__init__.py b/python/xorbits/_mars/dataframe/statistics/__init__.py new file mode 100644 index 000000000..c994509f9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/statistics/__init__.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .quantile import DataFrameQuantile + + +def _install(): + from ..core import DATAFRAME_TYPE, SERIES_TYPE + from .corr import df_corr, df_corrwith, series_autocorr, series_corr + from .quantile import quantile_dataframe, quantile_series + + for t in SERIES_TYPE: + t.quantile = quantile_series + t.corr = series_corr + t.autocorr = series_autocorr + + for t in DATAFRAME_TYPE: + t.quantile = quantile_dataframe + t.corr = df_corr + t.corrwith = df_corrwith + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/statistics/corr.py b/python/xorbits/_mars/dataframe/statistics/corr.py new file mode 100644 index 000000000..accf53bd1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/statistics/corr.py @@ -0,0 +1,423 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int32Field, KeyField +from ...tensor.utils import filter_inputs +from ...utils import has_unknown_shape +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, parse_index, validate_axis + + +class DataFrameCorr(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.CORR + + other = KeyField("other", default=None) + method = AnyField("method", default=None) + min_periods = Int32Field("min_periods", default=None) + axis = Int32Field("axis", default=None) + drop = BoolField("drop", default=None) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + next(inputs_iter) + if isinstance(self.other, ENTITY_TYPE): + self.other = next(inputs_iter) + + def __call__(self, df_or_series): + if isinstance(df_or_series, SERIES_TYPE): + inputs = filter_inputs([df_or_series, self.other]) + return self.new_scalar(inputs, dtype=np.dtype(np.float_)) + else: + + def _filter_numeric(obj): + if not isinstance(obj, DATAFRAME_TYPE): + return obj + num_dtypes = build_empty_df(obj.dtypes)._get_numeric_data().dtypes + if len(num_dtypes) != len(obj.dtypes): + return obj[list(num_dtypes.index)] + return obj + + df_or_series = _filter_numeric(df_or_series) + self.other = _filter_numeric(self.other) + + inputs = filter_inputs([df_or_series, self.other]) + if self.axis is None: + dtypes = pd.Series( + [np.dtype(np.float_)] * len(df_or_series.dtypes), + index=df_or_series.dtypes.index, + ) + return self.new_dataframe( + inputs, + shape=(df_or_series.shape[1],) * 2, + dtypes=dtypes, + index_value=df_or_series.columns_value, + columns_value=df_or_series.columns_value, + ) + else: + new_index_value = df_or_series.axes[1 - self.axis].index_value + if isinstance(self.other, DATAFRAME_TYPE): + align_dtypes = pd.concat( + [self.other.dtypes, df_or_series.dtypes], axis=1 + ) + align_shape = 
(np.nan, align_dtypes.shape[0]) + new_index_value = parse_index(align_dtypes.index) + else: + align_shape = df_or_series.shape + + shape = (np.nan,) if self.drop else (align_shape[1 - self.axis],) + return self.new_series( + inputs, + shape=shape, + dtype=np.dtype(np.float_), + index_value=new_index_value, + ) + + @classmethod + def _tile_single(cls, op: "DataFrameCorr"): + out = op.outputs[0] + + new_op = op.copy().reset_key() + chunk = new_op.new_chunk( + [inp.chunks[0] for inp in op.inputs], + index=(0,) * len(out.shape), + **out.params, + ) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, chunks=[chunk], nsplits=((s,) for s in out.shape), **out.params + ) + + @staticmethod + def _tile_pearson_cross(left, right, min_periods): + left_tensor, right_tensor = ( + left.fillna(0).to_tensor(), + right.fillna(0).to_tensor(), + ) + + nna_left = left.notna().to_tensor().astype(np.float_) + nna_right = right.notna().to_tensor().astype(np.float_) + + sum_left = left_tensor.T.dot(nna_right) + sum_right = right_tensor.T.dot(nna_left) + sum_left2 = (left_tensor.T**2).dot(nna_right) + sum_right2 = (right_tensor.T**2).dot(nna_left) + sum_mul = left_tensor.T.dot(right_tensor) + data_count = nna_left.T.dot(nna_right) + + divisor = np.sqrt(data_count * sum_left2 - sum_left * sum_left).T * np.sqrt( + data_count * sum_right2 - sum_right * sum_right + ) + + result = (data_count * sum_mul - sum_left * sum_right.T) / divisor + if min_periods is not None: + result = np.where(data_count >= min_periods, result, np.nan) + return result + + @classmethod + def _tile_pearson_align(cls, left, right, axis): + if left.ndim == right.ndim: + left, right = yield from recursive_tile(left.align(right)) + else: + left, right = yield from recursive_tile(left.align(right, axis=axis)) + if has_unknown_shape(left, right): + yield left.chunks + right.chunks + [left, right] + + nna_left = left.notna().astype(np.float_) + nna_right = right.notna().astype(np.float_) + + left, right = left.fillna(0), right.fillna(0) + + sum_left = left.mul(nna_right, axis=axis).sum(axis=axis) + sum_right = nna_left.mul(right, axis=axis).sum(axis=axis) + sum_left2 = (left**2).mul(nna_right, axis=axis).sum(axis=axis) + sum_right2 = nna_left.mul(right**2, axis=axis).sum(axis=axis) + sum_mul = left.mul(right, axis=axis).sum(axis=axis) + data_count = nna_left.mul(nna_right, axis=axis).sum(axis=axis) + + divisor = np.sqrt(data_count * sum_left2 - sum_left * sum_left) * np.sqrt( + data_count * sum_right2 - sum_right * sum_right + ) + return (data_count * sum_mul - sum_left * sum_right) / divisor + + @classmethod + def _tile_series(cls, op: "DataFrameCorr"): + left = op.inputs[0] + right = op.other + + _check_supported_methods(op.method) + return [ + ( + yield from recursive_tile( + cls._tile_pearson_cross(left, right, min_periods=op.min_periods) + ) + ) + ] + + @classmethod + def _tile_dataframe_cross(cls, op: "DataFrameCorr"): + from ..initializer import DataFrame as MarsDataFrame + + left = op.inputs[0] + right = op.other if op.other is not None else op.inputs[0] + + _check_supported_methods(op.method) + + result = cls._tile_pearson_cross(left, right, min_periods=op.min_periods) + result = MarsDataFrame( + result, index=left.dtypes.index, columns=right.dtypes.index + ) + return [(yield from recursive_tile(result))] + + @classmethod + def _tile_dataframe_align(cls, op: "DataFrameCorr"): + left = op.inputs[0] + right = op.other + + _check_supported_methods(op.method) + result = yield from cls._tile_pearson_align(left, right, 
axis=op.axis) + if op.drop: + result = result.dropna(axis=op.axis) + return [(yield from recursive_tile(result))] + + @classmethod + def tile(cls, op: "DataFrameCorr"): + inp = op.inputs[0] + if len(inp.chunks) == 1 and (op.other is None or len(op.other.chunks) == 1): + return cls._tile_single(op) + elif isinstance(inp, SERIES_TYPE): + return (yield from cls._tile_series(op)) + elif op.axis is None: + return (yield from cls._tile_dataframe_cross(op)) + else: + return (yield from cls._tile_dataframe_align(op)) + + @classmethod + def execute(cls, ctx, op: "DataFrameCorr"): + inp = op.inputs[0] + out = op.outputs[0] + inp_data = ctx[inp.key] + + if inp.ndim == 1: + ctx[out.key] = inp_data.corr( + ctx[op.other.key], method=op.method, min_periods=op.min_periods + ) + elif op.axis is None: + ctx[out.key] = inp_data.corr(method=op.method, min_periods=op.min_periods) + else: + ctx[out.key] = inp_data.corrwith( + ctx[op.other.key], method=op.method, axis=op.axis, drop=op.drop + ) + + +def _check_supported_methods(method): + if method != "pearson": + raise NotImplementedError(f"Correlation method {method!r} not supported") + + +def df_corr(df, method="pearson", min_periods=1): + """ + Compute pairwise correlation of columns, excluding NA/null values. + + Parameters + ---------- + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. Note that the returned matrix from corr + will have 1 along the diagonals and will be symmetric + regardless of the callable's behavior. + + .. note:: + kendall, spearman and callables not supported on multiple chunks yet. + + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. Currently only available for Pearson + and Spearman correlation. + + Returns + ------- + DataFrame + Correlation matrix. + + See Also + -------- + DataFrame.corrwith : Compute pairwise correlation with another + DataFrame or Series. + Series.corr : Compute the correlation between two Series. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], + ... columns=['dogs', 'cats']) + >>> df.corr(method='pearson').execute() + dogs cats + dogs 1.000000 -0.851064 + cats -0.851064 1.000000 + """ + op = DataFrameCorr(method=method, min_periods=min_periods) + return op(df) + + +def df_corrwith(df, other, axis=0, drop=False, method="pearson"): + """ + Compute pairwise correlation. + + Pairwise correlation is computed between rows or columns of + DataFrame with rows or columns of Series or DataFrame. DataFrames + are first aligned along both axes before computing the + correlations. + + Parameters + ---------- + other : DataFrame, Series + Object with which to compute correlations. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for + row-wise. + drop : bool, default False + Drop missing indices from result. + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. + + .. 
note::
+            kendall, spearman and callables not supported on multiple chunks yet.
+
+    Returns
+    -------
+    Series
+        Pairwise correlations.
+
+    See Also
+    --------
+    DataFrame.corr : Compute pairwise correlation of columns.
+    """
+    axis = validate_axis(axis, df)
+    if drop:
+        # TODO implement with df.align(method='inner')
+        raise NotImplementedError("drop=True not implemented")
+    op = DataFrameCorr(other=other, method=method, axis=axis, drop=drop)
+    return op(df)
+
+
+def series_corr(series, other, method="pearson", min_periods=None):
+    """
+    Compute correlation with `other` Series, excluding missing values.
+
+    Parameters
+    ----------
+    other : Series
+        Series with which to compute the correlation.
+    method : {'pearson', 'kendall', 'spearman'} or callable
+        Method used to compute correlation:
+
+        - pearson : Standard correlation coefficient
+        - kendall : Kendall Tau correlation coefficient
+        - spearman : Spearman rank correlation
+        - callable: Callable with input two 1d ndarrays and returning a float.
+
+        .. note::
+            kendall, spearman and callables not supported on multiple chunks yet.
+
+    min_periods : int, optional
+        Minimum number of observations needed to have a valid result.
+
+    Returns
+    -------
+    float
+        Correlation with other.
+
+    See Also
+    --------
+    DataFrame.corr : Compute pairwise correlation between columns.
+    DataFrame.corrwith : Compute pairwise correlation with another
+        DataFrame or Series.
+
+    Examples
+    --------
+    >>> import mars.dataframe as md
+    >>> s1 = md.Series([.2, .0, .6, .2])
+    >>> s2 = md.Series([.3, .6, .0, .1])
+    >>> s1.corr(s2, method='pearson').execute()
+    -0.8510644963469898
+    """
+    op = DataFrameCorr(other=other, method=method, min_periods=min_periods)
+    return op(series)
+
+
+def series_autocorr(series, lag=1):
+    """
+    Compute the lag-N autocorrelation.
+
+    This method computes the Pearson correlation between
+    the Series and its shifted self.
+
+    Parameters
+    ----------
+    lag : int, default 1
+        Number of lags to apply before performing autocorrelation.
+
+    Returns
+    -------
+    float
+        The Pearson correlation between self and self.shift(lag).
+
+    See Also
+    --------
+    Series.corr : Compute the correlation between two Series.
+    Series.shift : Shift index by desired number of periods.
+    DataFrame.corr : Compute pairwise correlation of columns.
+    DataFrame.corrwith : Compute pairwise correlation between rows or
+        columns of two DataFrame objects.
+
+    Notes
+    -----
+    If the Pearson correlation is not well defined, return 'NaN'.
+
+    Examples
+    --------
+    >>> import mars.dataframe as md
+    >>> s = md.Series([0.25, 0.5, 0.2, -0.05])
+    >>> s.autocorr().execute()  # doctest: +ELLIPSIS
+    0.10355...
+    >>> s.autocorr(lag=2).execute()  # doctest: +ELLIPSIS
+    -0.99999...
+
+    If the Pearson correlation is not well defined, then 'NaN' is returned.
+
+    >>> s = md.Series([1, 0, 0, 0])
+    >>> s.autocorr().execute()
+    nan
+    """
+    op = DataFrameCorr(other=series.shift(lag), method="pearson")
+    return op(series)
diff --git a/python/xorbits/_mars/dataframe/statistics/quantile.py b/python/xorbits/_mars/dataframe/statistics/quantile.py
new file mode 100644
index 000000000..4caa0fc26
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/statistics/quantile.py
@@ -0,0 +1,483 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + DataTypeField, + Int32Field, + KeyField, + StringField, +) +from ...tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ...tensor.datasource import empty +from ...tensor.datasource import from_dataframe as tensor_from_dataframe +from ...tensor.datasource import from_series as tensor_from_series +from ...tensor.datasource import tensor as astensor +from ...tensor.statistics.quantile import quantile as tensor_quantile +from ..core import DATAFRAME_TYPE +from ..datasource.from_tensor import dataframe_from_tensor, series_from_tensor +from ..initializer import DataFrame as create_df +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, find_common_type, parse_index, validate_axis + + +class DataFrameQuantile(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.QUANTILE + + _input = KeyField("input") + _q = AnyField("q") + _axis = Int32Field("axis") + _numeric_only = BoolField("numeric_only") + _interpolation = StringField("interpolation") + + _dtype = DataTypeField("dtype") + + def __init__( + self, + q=None, + interpolation=None, + axis=None, + numeric_only=None, + dtype=None, + gpu=None, + output_types=None, + **kw + ): + super().__init__( + _q=q, + _interpolation=interpolation, + _axis=axis, + _numeric_only=numeric_only, + _dtype=dtype, + _output_types=output_types, + gpu=gpu, + **kw + ) + + @property + def input(self): + return self._input + + @property + def q(self): + return self._q + + @property + def interpolation(self): + return self._interpolation + + @property + def axis(self): + return self._axis + + @property + def numeric_only(self): + return self._numeric_only + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if isinstance(self._q, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self._q = self._inputs[-1] + + def _calc_dtype_on_axis_1(self, a, dtypes): + quantile_dtypes = [] + for name in dtypes.index: + dt = tensor_quantile( + tensor_from_series(a[name]), + self._q, + interpolation=self._interpolation, + handle_non_numeric=not self._numeric_only, + ).dtype + quantile_dtypes.append(dt) + return find_common_type(quantile_dtypes) + + def _call_dataframe(self, a, inputs): + if self._numeric_only: + empty_df = build_empty_df(a.dtypes) + dtypes = empty_df._get_numeric_data().dtypes + else: + dtypes = a.dtypes + if isinstance(self._q, TENSOR_TYPE): + q_val = self._q + pd_index = pd.Index([], dtype=q_val.dtype) + name = None + store_index_value = False + else: + q_val = np.asanyarray(self._q) + pd_index = pd.Index(q_val) + name = self._q if q_val.size == 1 else None + store_index_value = True + tokenize_objects = (a, q_val, self._interpolation, type(self).__name__) + + if q_val.ndim == 0 and self._axis == 0: + index_value = parse_index(dtypes.index, store_data=store_index_value) + shape = (len(dtypes),) + # calc dtype + dtype = 
self._calc_dtype_on_axis_1(a, dtypes) + return self.new_series( + inputs, + shape=shape, + dtype=dtype, + index_value=index_value, + name=name or dtypes.index.name, + ) + elif q_val.ndim == 0 and self._axis == 1: + index_value = a.index_value + shape = (len(a),) + # calc dtype + dt = tensor_quantile( + empty(a.shape[1], dtype=find_common_type(list(dtypes))), + self._q, + interpolation=self._interpolation, + handle_non_numeric=not self._numeric_only, + ).dtype + return self.new_series( + inputs, + shape=shape, + dtype=dt, + index_value=index_value, + name=name or index_value.name, + ) + elif q_val.ndim == 1 and self._axis == 0: + shape = (len(q_val), len(dtypes)) + index_value = parse_index( + pd_index, *tokenize_objects, store_data=store_index_value + ) + dtype_list = [] + for name in dtypes.index: + dtype_list.append( + tensor_quantile( + tensor_from_series(a[name]), + self._q, + interpolation=self._interpolation, + handle_non_numeric=not self._numeric_only, + ).dtype + ) + dtypes = pd.Series(dtype_list, index=dtypes.index) + return self.new_dataframe( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=parse_index(dtypes.index, store_data=True), + ) + else: + assert q_val.ndim == 1 and self._axis == 1 + shape = (len(q_val), a.shape[0]) + index_value = parse_index( + pd_index, *tokenize_objects, store_data=store_index_value + ) + pd_columns = a.index_value.to_pandas() + dtype_list = np.full(len(pd_columns), self._calc_dtype_on_axis_1(a, dtypes)) + dtypes = pd.Series(dtype_list, index=pd_columns) + return self.new_dataframe( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=parse_index( + dtypes.index, store_data=True, key=a.index_value.key + ), + ) + + def _call_series(self, a, inputs): + if isinstance(self._q, TENSOR_TYPE): + q_val = self._q + index_val = pd.Index([], dtype=q_val.dtype) + store_index_value = False + else: + q_val = np.asanyarray(self._q) + index_val = pd.Index(q_val) + store_index_value = True + + # get dtype by tensor + a_t = astensor(a) + self._dtype = dtype = tensor_quantile( + a_t, + self._q, + interpolation=self._interpolation, + handle_non_numeric=not self._numeric_only, + ).dtype + + if q_val.ndim == 0: + return self.new_scalar(inputs, dtype=dtype) + else: + return self.new_series( + inputs, + shape=q_val.shape, + dtype=dtype, + index_value=parse_index( + index_val, + a, + q_val, + self._interpolation, + type(self).__name__, + store_data=store_index_value, + ), + name=a.name, + ) + + def __call__(self, a, q_input=None): + inputs = [a] + if q_input is not None: + inputs.append(q_input) + if isinstance(a, DATAFRAME_TYPE): + return self._call_dataframe(a, inputs) + else: + return self._call_series(a, inputs) + + @classmethod + def _tile_dataframe(cls, op): + from ...tensor.merge.stack import TensorStack + + df = op.outputs[0] + if df.ndim == 1: + if op.axis == 0: + ts = [] + for name in df.index_value.to_pandas(): + a = tensor_from_series(op.input[name]) + t = tensor_quantile( + a, + op.q, + interpolation=op.interpolation, + handle_non_numeric=not op.numeric_only, + ) + ts.append(t) + try: + dtype = np.result_type(*[it.dtype for it in ts]) + except TypeError: + dtype = np.dtype(object) + stack_op = TensorStack(axis=0, dtype=dtype) + tr = stack_op(ts) + r = series_from_tensor( + tr, index=df.index_value.to_pandas(), name=ts[0].op.q.item() + ) + else: + assert op.axis == 1 + empty_df = build_empty_df(op.input.dtypes) + fields = empty_df._get_numeric_data().columns.tolist() + t = 
tensor_from_dataframe(op.input[fields]) + tr = tensor_quantile( + t, + op.q, + axis=1, + interpolation=op.interpolation, + handle_non_numeric=not op.numeric_only, + ) + r = series_from_tensor(tr, index=op.input.index, name=tr.op.q.item()) + else: + assert df.ndim == 2 + if op.axis == 0: + d = OrderedDict() + for name in df.dtypes.index: + a = tensor_from_series(op.input[name]) + t = tensor_quantile( + a, + op.q, + interpolation=op.interpolation, + handle_non_numeric=not op.numeric_only, + ) + d[name] = t + r = create_df(d, index=op.q) + else: + assert op.axis == 1 + empty_df = build_empty_df(op.input.dtypes) + fields = empty_df._get_numeric_data().columns.tolist() + t = tensor_from_dataframe(op.input[fields]) + tr = tensor_quantile( + t, + op.q, + axis=1, + interpolation=op.interpolation, + handle_non_numeric=not op.numeric_only, + ) + if not op.input.index_value.has_value(): + raise NotImplementedError + # TODO(xuye.qin): use index=op.input.index when we support DataFrame.index + r = dataframe_from_tensor( + tr, index=op.q, columns=op.input.index_value.to_pandas() + ) + + return (yield from recursive_tile(r)) + + @classmethod + def _tile_series(cls, op): + a = tensor_from_series(op.input) + t = tensor_quantile( + a, + op.q, + interpolation=op.interpolation, + handle_non_numeric=not op.numeric_only, + ) + if isinstance(op.outputs[0], TENSOR_TYPE): + r = t + else: + r = series_from_tensor(t, index=op.q, name=op.outputs[0].name) + r = yield from recursive_tile(r) + return [r] + + @classmethod + def tile(cls, op): + if isinstance(op.input, DATAFRAME_TYPE): + tiled = yield from cls._tile_dataframe(op) + else: + tiled = yield from cls._tile_series(op) + return tiled + + +def quantile_series(series, q=0.5, interpolation="linear"): + """ + Return value at the given quantile. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + 0 <= q <= 1, the quantile(s) to compute. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + + Returns + ------- + float or Series + If ``q`` is an array or a tensor, a Series will be returned where the + index is ``q`` and the values are the quantiles, otherwise + a float will be returned. + + See Also + -------- + core.window.Rolling.quantile + numpy.percentile + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3, 4]) + >>> s.quantile(.5).execute() + 2.5 + >>> s.quantile([.25, .5, .75]).execute() + 0.25 1.75 + 0.50 2.50 + 0.75 3.25 + dtype: float64 + """ + + if isinstance(q, ENTITY_TYPE): + q = astensor(q) + q_input = q + else: + q_input = None + + op = DataFrameQuantile(q=q, interpolation=interpolation, gpu=series.op.gpu) + return op(series, q_input=q_input) + + +def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="linear"): + """ + Return values at the given quantile over requested axis. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + Value between 0 <= q <= 1, the quantile(s) to compute. + axis : {0, 1, 'index', 'columns'} (default 0) + Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 
+ numeric_only : bool, default True + If False, the quantile of datetime and timedelta data will be + computed as well. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + + Returns + ------- + Series or DataFrame + If ``q`` is an array or a tensor, a DataFrame will be returned where the + index is ``q``, the columns are the columns of self, and the + values are the quantiles. + If ``q`` is a float, a Series will be returned where the + index is the columns of self and the values are the quantiles. + + See Also + -------- + core.window.Rolling.quantile: Rolling quantile. + numpy.percentile: Numpy function to compute the percentile. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=['a', 'b']) + >>> df.quantile(.1).execute() + a 1.3 + b 3.7 + Name: 0.1, dtype: float64 + + >>> df.quantile([.1, .5]).execute() + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 + + Specifying `numeric_only=False` will also compute the quantile of + datetime and timedelta data. + + >>> df = md.DataFrame({'A': [1, 2], + ... 'B': [md.Timestamp('2010'), + ... md.Timestamp('2011')], + ... 'C': [md.Timedelta('1 days'), + ... md.Timedelta('2 days')]}) + >>> df.quantile(0.5, numeric_only=False).execute() + A 1.5 + B 2010-07-02 12:00:00 + C 1 days 12:00:00 + Name: 0.5, dtype: object + """ + if isinstance(q, ENTITY_TYPE): + q = astensor(q) + q_input = q + else: + q_input = None + axis = validate_axis(axis, df) + + op = DataFrameQuantile( + q=q, + interpolation=interpolation, + axis=axis, + numeric_only=numeric_only, + gpu=df.op.gpu, + ) + return op(df, q_input=q_input) diff --git a/python/xorbits/_mars/dataframe/statistics/tests/__init__.py b/python/xorbits/_mars/dataframe/statistics/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/statistics/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/statistics/tests/test_statistics.py b/python/xorbits/_mars/dataframe/statistics/tests/test_statistics.py new file mode 100644 index 000000000..d183f09ce --- /dev/null +++ b/python/xorbits/_mars/dataframe/statistics/tests/test_statistics.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ....core import tile +from ....tensor import Tensor +from ...core import DataFrame, Series +from ...datasource.dataframe import from_pandas as df_from_pandas +from ...datasource.series import from_pandas as series_from_pandas + + +def test_series_quantile(): + raw = pd.Series(np.random.rand(10)) + s = series_from_pandas(raw, chunk_size=3) + + r = s.quantile() + assert isinstance(r, Tensor) + tile(r) + + s = series_from_pandas(raw, chunk_size=3) + + r = s.quantile([0.3, 0.7]) + assert isinstance(r, Series) + assert r.shape == (2,) + pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.Index([0.3, 0.7])) + tile(r) + + +def test_dataframe_quantile(): + raw = pd.DataFrame( + { + "a": np.random.rand(10), + "b": np.random.randint(1000, size=10), + "c": [np.random.bytes(5) for _ in range(10)], + } + ) + s = df_from_pandas(raw, chunk_size=7) + + # q = 0.3, axis = 0 + r = s.quantile(0.3) + e = raw.quantile(0.3) + assert isinstance(r, Series) + assert r.shape == (2,) + assert r.dtype == e.dtype + pd.testing.assert_index_equal(r.index_value.to_pandas(), e.index) + + tile(r) + + # q = 0.3, axis = 1 + r = s.quantile(0.3, axis=1) + e = raw.quantile(0.3, axis=1) + assert isinstance(r, Series) + assert r.shape == e.shape + assert r.dtype == e.dtype + pd.testing.assert_index_equal(r.index_value.to_pandas(), e.index) + + tile(r) + + # q = [0.3, 0.7], axis = 0 + r = s.quantile([0.3, 0.7]) + e = raw.quantile([0.3, 0.7]) + assert isinstance(r, DataFrame) + assert r.shape == e.shape + pd.testing.assert_series_equal(r.dtypes, e.dtypes) + pd.testing.assert_index_equal(r.index_value.to_pandas(), e.index) + pd.testing.assert_index_equal(r.columns_value.to_pandas(), e.columns) + + tile(r) + + # q = [0.3, 0.7], axis = 1 + r = s.quantile([0.3, 0.7], axis=1) + e = raw.quantile([0.3, 0.7], axis=1) + assert isinstance(r, DataFrame) + assert r.shape == e.shape + pd.testing.assert_series_equal(r.dtypes, e.dtypes) + pd.testing.assert_index_equal(r.index_value.to_pandas(), e.index) + pd.testing.assert_index_equal(r.columns_value.to_pandas(), e.columns) + + tile(r) diff --git a/python/xorbits/_mars/dataframe/statistics/tests/test_statistics_execution.py b/python/xorbits/_mars/dataframe/statistics/tests/test_statistics_execution.py new file mode 100644 index 000000000..c6a6bc02b --- /dev/null +++ b/python/xorbits/_mars/dataframe/statistics/tests/test_statistics_execution.py @@ -0,0 +1,262 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
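+
+# An illustrative sketch of the pattern the execution tests below follow:
+# build a chunked Mars object from raw pandas data, compute the same
+# statistic eagerly with pandas, then execute, fetch and compare. The
+# `setup` fixture used by each test is assumed to come from the shared
+# test configuration.
+#
+#     raw = pd.Series(np.random.rand(10))
+#     s = Series(raw, chunk_size=3)                  # Mars series in 3-row chunks
+#     result = s.quantile(0.3).execute().fetch()     # distributed result
+#     assert np.isclose(result, raw.quantile(0.3))   # compare against pandas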
+ +import numpy as np +import pandas as pd +import pytest + +from ....tensor import tensor +from ... import DataFrame, Series + + +def test_series_quantile_execution(setup): + raw = pd.Series(np.random.rand(10), name="a") + a = Series(raw, chunk_size=3) + + # q = 0.5, scalar + r = a.quantile() + result = r.execute().fetch() + expected = raw.quantile() + + assert result == expected + + # q is a list + r = a.quantile([0.3, 0.7]) + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7]) + + pd.testing.assert_series_equal(result, expected) + + # test interpolation + r = a.quantile([0.3, 0.7], interpolation="midpoint") + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7], interpolation="midpoint") + + pd.testing.assert_series_equal(result, expected) + + q = tensor([0.3, 0.7]) + + # q is a tensor + r = a.quantile(q) + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7]) + + pd.testing.assert_series_equal(result, expected) + + +def test_dataframe_quantile_execution(setup): + raw = pd.DataFrame( + { + "a": np.random.rand(10), + "b": np.random.randint(1000, size=10), + "c": np.random.rand(10), + "d": [np.random.bytes(10) for _ in range(10)], + "e": [pd.Timestamp(f"201{i}") for i in range(10)], + "f": [pd.Timedelta(f"{i} days") for i in range(10)], + }, + index=pd.RangeIndex(1, 11), + ) + df = DataFrame(raw, chunk_size=3) + + # q = 0.5, axis = 0, series + r = df.quantile() + result = r.execute().fetch() + expected = raw.quantile() + + pd.testing.assert_series_equal(result, expected) + + # q = 0.5, axis = 1, series + r = df.quantile(axis=1) + result = r.execute().fetch() + expected = raw.quantile(axis=1) + + pd.testing.assert_series_equal(result, expected) + + # q is a list, axis = 0, dataframe + r = df.quantile([0.3, 0.7]) + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7]) + + pd.testing.assert_frame_equal(result, expected) + + # q is a list, axis = 1, dataframe + r = df.quantile([0.3, 0.7], axis=1) + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7], axis=1) + + pd.testing.assert_frame_equal(result, expected) + + # test interpolation + r = df.quantile([0.3, 0.7], interpolation="midpoint") + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7], interpolation="midpoint") + + pd.testing.assert_frame_equal(result, expected) + + q = tensor([0.3, 0.7]) + + # q is a tensor + r = df.quantile(q) + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7]) + + pd.testing.assert_frame_equal(result, expected) + + # test numeric_only + raw2 = pd.DataFrame( + { + "a": np.random.rand(10), + "b": np.random.randint(1000, size=10), + "c": np.random.rand(10), + "d": [pd.Timestamp(f"201{i}") for i in range(10)], + }, + index=pd.RangeIndex(1, 11), + ) + df2 = DataFrame(raw2, chunk_size=3) + + r = df2.quantile([0.3, 0.7], numeric_only=False) + result = r.execute().fetch() + expected = raw2.quantile([0.3, 0.7], numeric_only=False) + + pd.testing.assert_frame_equal(result, expected) + + r = df2.quantile(numeric_only=False) + result = r.execute().fetch() + expected = raw2.quantile(numeric_only=False) + + pd.testing.assert_series_equal(result, expected) + + +def test_dataframe_corr(setup): + rs = np.random.RandomState(0) + raw = rs.rand(20, 10) + raw = pd.DataFrame(np.where(raw > 0.4, raw, np.nan), columns=list("ABCDEFGHIJ")) + raw["k"] = pd.Series(["aaa"] * 20) + + df = DataFrame(raw) + + result = df.corr() + pd.testing.assert_frame_equal(result.execute().fetch(), raw.corr()) + + result = df.corr(method="kendall") + 
pd.testing.assert_frame_equal(result.execute().fetch(), raw.corr(method="kendall")) + + df = DataFrame(raw, chunk_size=6) + + with pytest.raises(Exception): + df.corr(method="kendall").execute() + + result = df.corr() + pd.testing.assert_frame_equal(result.execute().fetch(), raw.corr()) + + result = df.corr(min_periods=7) + pd.testing.assert_frame_equal(result.execute().fetch(), raw.corr(min_periods=7)) + + +@pytest.mark.skip_ray_dag # https://github.com/mars-project/mars/issues/3247 +def test_dataframe_corr_with(setup): + rs = np.random.RandomState(0) + raw_df = rs.rand(20, 10) + raw_df = pd.DataFrame( + np.where(raw_df > 0.4, raw_df, np.nan), columns=list("ABCDEFGHIJ") + ) + raw_df2 = rs.rand(20, 10) + raw_df2 = pd.DataFrame( + np.where(raw_df2 > 0.4, raw_df2, np.nan), columns=list("ACDEGHIJKL") + ) + raw_s = rs.rand(20) + raw_s = pd.Series(np.where(raw_s > 0.4, raw_s, np.nan)) + raw_s2 = rs.rand(10) + raw_s2 = pd.Series(np.where(raw_s2 > 0.4, raw_s2, np.nan), index=raw_df2.columns) + + df = DataFrame(raw_df) + df2 = DataFrame(raw_df2) + + result = df.corrwith(df2) + pd.testing.assert_series_equal(result.execute().fetch(), raw_df.corrwith(raw_df2)) + + result = df.corrwith(df2, axis=1) + pd.testing.assert_series_equal( + result.execute().fetch(), raw_df.corrwith(raw_df2, axis=1) + ) + + result = df.corrwith(df2, method="kendall") + pd.testing.assert_series_equal( + result.execute().fetch(), raw_df.corrwith(raw_df2, method="kendall") + ) + + df = DataFrame(raw_df, chunk_size=4) + df2 = DataFrame(raw_df2, chunk_size=6) + s = Series(raw_s, chunk_size=5) + s2 = Series(raw_s2, chunk_size=5) + + with pytest.raises(Exception): + df.corrwith(df2, method="kendall").execute() + + result = df.corrwith(df2) + pd.testing.assert_series_equal( + result.execute().fetch().sort_index(), raw_df.corrwith(raw_df2).sort_index() + ) + + result = df.corrwith(df2, axis=1) + pd.testing.assert_series_equal( + result.execute().fetch().sort_index(), + raw_df.corrwith(raw_df2, axis=1).sort_index(), + ) + + result = df.corrwith(s) + pd.testing.assert_series_equal( + result.execute().fetch().sort_index(), raw_df.corrwith(raw_s).sort_index() + ) + + result = df.corrwith(s2, axis=1) + pd.testing.assert_series_equal( + result.execute().fetch().sort_index(), + raw_df.corrwith(raw_s2, axis=1).sort_index(), + ) + + +def test_series_corr(setup): + rs = np.random.RandomState(0) + raw = rs.rand(20) + raw = pd.Series(np.where(raw > 0.4, raw, np.nan)) + raw2 = rs.rand(20) + raw2 = pd.Series(np.where(raw2 > 0.4, raw2, np.nan)) + + s = Series(raw) + s2 = Series(raw2) + + result = s.corr(s2) + assert result.execute().fetch() == raw.corr(raw2) + + result = s.corr(s2, method="kendall") + assert result.execute().fetch() == raw.corr(raw2, method="kendall") + + result = s.autocorr(2) + assert result.execute().fetch() == raw.autocorr(2) + + s = Series(raw, chunk_size=6) + s2 = Series(raw2, chunk_size=4) + + with pytest.raises(Exception): + s.corr(s2, method="kendall").execute() + + result = s.corr(s2) + assert pytest.approx(result.execute().fetch()) == raw.corr(raw2) + + result = s.corr(s2, min_periods=7) + assert pytest.approx(result.execute().fetch()) == raw.corr(raw2, min_periods=7) + + result = s.autocorr(2) + assert pytest.approx(result.execute().fetch()) == raw.autocorr(2) diff --git a/python/xorbits/_mars/dataframe/tests/__init__.py b/python/xorbits/_mars/dataframe/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 
2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/tests/test_arrays.py b/python/xorbits/_mars/dataframe/tests/test_arrays.py new file mode 100644 index 000000000..fdd521054 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tests/test_arrays.py @@ -0,0 +1,482 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: + pa = None + +from ...config import option_context +from ...core import enter_mode +from .. import ArrowListArray, ArrowListDtype, ArrowStringArray, ArrowStringDtype +from ..arrays import _use_bool_any_all +from ..utils import arrow_table_to_pandas_dataframe + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_arrow_dtype(): + s = pa.array(["a", "b"]) + assert list(ArrowStringDtype().__from_arrow__(s)) == list(ArrowStringArray(s)) + + assert ArrowStringDtype() == ArrowStringDtype.construct_from_string("Arrow[string]") + + assert ArrowListDtype( + ArrowListDtype("string") + ) == ArrowListDtype.construct_from_string("Arrow[List[string]]") + + assert repr(ArrowListDtype(np.int8)) == "Arrow[List[int8]]" + + with pytest.raises(TypeError): + ArrowListDtype.construct_from_string("Arrow[string]") + + assert ArrowListDtype.is_dtype("Arrow[List[uint8]]") is True + assert ArrowListDtype.is_dtype("List[int8]") is False + assert ArrowListDtype.is_dtype(ArrowStringDtype()) is False + + assert ArrowListDtype(np.int8) != ArrowStringDtype() + assert ArrowListDtype(np.int8).kind == np.dtype(object).kind + + assert ArrowListDtype(np.int8).arrow_type == pa.list_(pa.int8()) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_arrow_string_array_creation(): + # create from pandas Series + series = pd.Series(["a", "bc", "de"]) + array = ArrowStringArray(series) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + if pd.__version__ >= "1.0.0": + # test create from StringArray which occurs in pandas 1.0 + s = pd.arrays.StringArray(np.array(["a", "bc", "de"], dtype=object)) + array = ArrowStringArray(s) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # create from list + lst = ["a", "bc", "de"] + array = ArrowStringArray(lst) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # create from pyarrow Array + a = pa.array(["a", "bc", "de"]) + array = ArrowStringArray(a) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # 
create from ArrowStringArray + array2 = ArrowStringArray(array) + assert isinstance(array2._arrow_array, pa.ChunkedArray) + + # test copy + arrow_array = array2._arrow_array + array3 = ArrowStringArray(arrow_array, copy=True) + assert array3._arrow_array is not arrow_array + + # test from_scalars + array = ArrowStringArray.from_scalars([1, 2]) + assert isinstance(array._arrow_array, pa.ChunkedArray) + assert isinstance(array._arrow_array.chunks[0], pa.StringArray) + + # test _from_sequence + array = ArrowStringArray._from_sequence(["a", "b", "cc"]) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # test _from_sequence_of_strings + array = ArrowStringArray._from_sequence_of_strings(["a", "b"]) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_arrow_list_array_creation(): + # create from pandas Series + series = pd.Series([["a", "b"], ["c"], ["d", "e"]]) + array = ArrowListArray(series) + assert isinstance(array.dtype, ArrowListDtype) + assert isinstance(array.dtype.value_type, ArrowStringDtype) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # create from list + lst = [["a"], ["b", "c"], ["d", "e"]] + array = ArrowListArray(lst) + assert isinstance(array.dtype, ArrowListDtype) + assert isinstance(array.dtype.value_type, ArrowStringDtype) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # create from pyarrow Array + a = pa.array([[1.0], [2.0, 3.0], [4.0]]) + array = ArrowListArray(a) + assert isinstance(array.dtype, ArrowListDtype) + assert array.dtype.value_type == np.float64 + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # create from ArrowListArray + array2 = ArrowListArray(array) + assert isinstance(array2._arrow_array, pa.ChunkedArray) + + # test _from_sequence + array = ArrowListArray._from_sequence([[1, 2], [3, 4], [5]]) + assert isinstance(array.dtype, ArrowListDtype) + assert array.dtype.value_type == np.int64 + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # test pandas_only + with option_context({"dataframe.arrow_array.pandas_only": True}): + array = ArrowListArray._from_sequence([[1, 2], [3, 4], [5]]) + assert isinstance(array.dtype, ArrowListDtype) + assert isinstance(array._ndarray, np.ndarray) + + # test pandas_only and in kernel mode + with enter_mode(kernel=True), option_context( + {"dataframe.arrow_array.pandas_only": True} + ), pytest.raises(ImportError): + ArrowListArray._from_sequence([[1, 2], [3, 4], [5]]) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_arrow_string_array_functions(): + lst = np.array(["abc", "de", "eee", "中文"], dtype=object) + # leverage string array to get the right answer + string_array = pd.arrays.StringArray(lst) + has_na_arrow_array = ArrowStringArray(["abc", None, "eee", "中文"]) + has_na_string_array = pd.arrays.StringArray( + np.array(["abc", pd.NA, "eee", "中文"], dtype=object) + ) + + for pandas_only in [False, True]: + with option_context({"dataframe.arrow_array.pandas_only": pandas_only}): + arrow_array = ArrowStringArray(lst) + + # getitem, scalar + assert arrow_array[1] == string_array[1] + assert arrow_array[-1] == string_array[-1] + # getitem, slice + assert list(arrow_array[:2]) == list(string_array[:2]) + assert list(arrow_array[1:-1]) == list(string_array[1:-1]) + assert list(arrow_array[::2]) == list(string_array[::2]) + # getitem, boolean index + cond = np.array([len(c) > 2 for c in lst]) + assert list(arrow_array[cond]) == list(string_array[cond]) + # 
getitem, fancy index + selection = [3, 1, 2] + assert list(arrow_array[selection]) == list(string_array[selection]) + selection = [3, -1, 2, -4] + assert list(arrow_array[selection]) == list(string_array[selection]) + selection = np.array([3, -1, 2, -4]) + assert list(arrow_array[selection]) == list(string_array[selection]) + + # setitem + arrow_array2 = arrow_array.copy() + string_array2 = string_array.copy() + arrow_array2[0] = "ss" + string_array2[0] = "ss" + assert list(arrow_array2) == list(string_array2) + arrow_array2[1:3] = ["ss1", "ss2"] + string_array2[1:3] = ["ss1", "ss2"] + assert list(arrow_array2) == list(string_array2) + arrow_array2[1:3] = arrow_array2[2:4] + string_array2[1:3] = string_array2[2:4] + assert list(arrow_array2) == list(string_array2) + arrow_array2[2:] = pd.Series(["ss3", "ss4"]) + string_array2[2:] = pd.Series(["ss3", "ss4"]) + assert list(arrow_array2) == list(string_array2) + with pytest.raises(ValueError): + arrow_array2[0] = ["a", "b"] + arrow_array2[-1] = None + string_array2[-1] = None + assert list(arrow_array2)[:-1] == list(string_array2)[:-1] + assert pd.isna(list(arrow_array2)[-1]) is True + with pytest.raises(ValueError): + arrow_array2[0] = 2 + with pytest.raises(ValueError): + arrow_array2[:2] = [1, 2] + + # test to_numpy + np.testing.assert_array_equal( + arrow_array.to_numpy(), string_array.to_numpy() + ) + np.testing.assert_array_equal( + arrow_array.to_numpy(copy=True), string_array.to_numpy(copy=True) + ) + np.testing.assert_array_equal( + has_na_arrow_array.to_numpy(copy=True, na_value="ss"), + has_na_string_array.to_numpy(copy=True, na_value="ss"), + ) + + # test fillna + arrow_array3 = has_na_arrow_array.fillna("filled") + string_array3 = has_na_string_array.fillna("filled") + assert list(arrow_array3) == list(string_array3) + + # test astype + arrow_array4 = ArrowStringArray(["1", "10", "100"]) + # leverage string array to get the right answer + string_array4 = pd.arrays.StringArray( + np.array(["1", "10", "100"], dtype=object) + ) + np.testing.assert_array_equal( + arrow_array4.astype(np.int64), string_array4.astype(np.int64) + ) + np.testing.assert_almost_equal( + arrow_array4.astype(float), string_array4.astype(float) + ) + assert list(arrow_array4.astype(ArrowStringDtype(), copy=False)) == list( + string_array4.astype(pd.StringDtype(), copy=False) + ) + assert list(arrow_array4.astype(ArrowStringDtype(), copy=True)) == list( + string_array4.astype(pd.StringDtype(), copy=True) + ) + + # test factorize + codes, unique = arrow_array.factorize() + codes2, unique2 = string_array.factorize() + assert list(codes) == list(codes2) + assert list(unique) == list(unique2) + + # test nbytes + assert arrow_array.nbytes < pd.Series( + string_array.astype(object) + ).memory_usage(deep=True, index=False) + + # test memory_usage + if pandas_only: + assert arrow_array.memory_usage(deep=False) == pd.Series( + string_array + ).memory_usage(index=False) + else: + assert arrow_array.memory_usage(deep=True) == arrow_array.nbytes + + # test unique + assert arrow_array.unique() == pd.Series(string_array).unique() + arrow_array2 = arrow_array.copy() + arrow_array2._force_use_pandas = True + assert arrow_array2.unique() == pd.Series(string_array).unique() + + # test isna + np.testing.assert_array_equal( + has_na_arrow_array.isna(), has_na_string_array.isna() + ) + has_na_arrow_array2 = has_na_arrow_array.copy() + has_na_arrow_array2._force_use_pandas = True + np.testing.assert_array_equal( + has_na_arrow_array2.isna(), has_na_string_array.isna() + ) + + # test 
take + assert list(arrow_array.take([1, 2, -1])) == list( + string_array.take([1, 2, -1]) + ) + assert list( + arrow_array.take([1, 2, -1], allow_fill=True).fillna("aa") + ) == list(string_array.take([1, 2, -1], allow_fill=True).fillna("aa")) + assert list( + arrow_array.take([1, 2, -1], allow_fill=True, fill_value="aa") + ) == list(string_array.take([1, 2, -1], allow_fill=True, fill_value="aa")) + + # test shift + assert list(arrow_array.shift(2, fill_value="aa")) == list( + string_array.shift(2, fill_value="aa") + ) + + # test value_counts + assert list(arrow_array.value_counts()) == list(string_array.value_counts()) + assert list(has_na_arrow_array.value_counts(dropna=True)) == list( + has_na_string_array.value_counts(dropna=True) + ) + + # test all any + assert arrow_array.all() == string_array.all() + assert arrow_array.any() == string_array.any() + + # test arithmetic + assert list(arrow_array + "s") == list(string_array + "s") + assert list((arrow_array + has_na_arrow_array).fillna("ss")) == list( + (string_array + has_na_string_array).fillna("ss") + ) + + # test comparison + np.testing.assert_array_equal(arrow_array < "s", string_array < "s") + pd.testing.assert_series_equal( + pd.Series(arrow_array < has_na_arrow_array), + pd.Series(string_array < has_na_string_array), + ) + + # test repr + assert "ArrowStringArray" in repr(arrow_array) + + # test concat empty + arrow_array5 = ArrowStringArray(pa.chunked_array([], type=pa.string())) + concatenated = ArrowStringArray._concat_same_type( + [arrow_array5, arrow_array5] + ) + if not pandas_only: + assert len(concatenated._arrow_array.chunks) == 1 + pd.testing.assert_series_equal( + pd.Series(arrow_array5), pd.Series(concatenated) + ) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_arrow_list_functions(): + lst = np.array([["a, bc"], ["de"], ["e", "ee"], ["中文", "中文2"]], dtype=object) + has_na_lst = lst.copy() + has_na_lst[1] = None + + for pandas_only in [False, True]: + with option_context({"dataframe.arrow_array.pandas_only": pandas_only}): + arrow_array = ArrowListArray(lst) + has_na_arrow_array = ArrowListArray(has_na_lst) + + # getitem, scalar + assert arrow_array[1] == lst[1] + assert list(arrow_array[-1]) == lst[-1] + # getitem, slice + np.testing.assert_array_equal(arrow_array[:2].to_numpy(), lst[:2]) + + # setitem + arrow_array2 = arrow_array.copy() + lst2 = lst.copy() + for s in [["ss"], pd.Series(["ss"])]: + arrow_array2[0] = s + lst2[0] = ["ss"] + np.testing.assert_array_equal(arrow_array2.to_numpy(), lst2) + arrow_array2[0] = None + lst2[0] = None + np.testing.assert_array_equal(arrow_array2.to_numpy(), lst2) + with pytest.raises(ValueError): + # must set list like object + arrow_array2[0] = "ss" + + # test to_numpy + np.testing.assert_array_equal(arrow_array.to_numpy(), lst) + np.testing.assert_array_equal(arrow_array.to_numpy(copy=True), lst) + np.testing.assert_array_equal( + has_na_arrow_array.to_numpy(na_value=1), + pd.Series(has_na_lst).fillna(1).to_numpy(), + ) + + # test fillna + if not pandas_only: + arrow_array3 = has_na_arrow_array.fillna(lst[1]) + np.testing.assert_array_equal(arrow_array3.to_numpy(), lst) + + # test astype + with pytest.raises(TypeError): + arrow_array.astype(np.int64) + with pytest.raises(TypeError): + arrow_array.astype(ArrowListDtype(np.int64)) + arrow_array4 = ArrowListArray([[1, 2], [3]]) + expected = np.array([["1", "2"], ["3"]], dtype=object) + np.testing.assert_array_equal( + arrow_array4.astype(ArrowListDtype(str)), expected + ) + 
np.testing.assert_array_equal( + arrow_array4.astype(ArrowListDtype(arrow_array4.dtype)), arrow_array4 + ) + np.testing.assert_array_equal( + arrow_array4.astype(ArrowListDtype(arrow_array4.dtype), copy=False), + arrow_array4, + ) + + # test nbytes + assert arrow_array.nbytes < pd.Series(lst).memory_usage(deep=True) + + # test memory_usage + if not pandas_only: + assert arrow_array.memory_usage(deep=True) == arrow_array.nbytes + + # test isna + np.testing.assert_array_equal( + has_na_arrow_array.isna(), pd.Series(has_na_lst).isna() + ) + + # test take + assert list(arrow_array.take([1, 2, -1])) == list( + pd.Series(lst).take([1, 2, -1]) + ) + + # test shift + assert ( + list(arrow_array.shift(2, fill_value=["aa"])) + == [["aa"]] * 2 + lst[:-2].tolist() + ) + + # test all any + if _use_bool_any_all: + assert arrow_array.all() == pd.array(lst).all() + assert arrow_array.any() == pd.array(lst).any() + else: + assert arrow_array.all() == lst.all() + assert arrow_array.any() == lst.any() + + # test repr + assert "ArrowListArray" in repr(arrow_array) + + # test concat empty + arrow_array5 = ArrowListArray( + pa.chunked_array([], type=pa.list_(pa.string())) + ) + concatenated = ArrowListArray._concat_same_type( + [arrow_array5, arrow_array5] + ) + if not pandas_only: + assert len(concatenated._arrow_array.chunks) == 1 + pd.testing.assert_series_equal( + pd.Series(arrow_array5), pd.Series(concatenated) + ) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_to_pandas(): + rs = np.random.RandomState(0) + df = pd.DataFrame( + { + "a": rs.rand(100), + "b": ["s" + str(i) for i in rs.randint(100, size=100)], + "c": [["ss0" + str(i), "ss1" + str(i)] for i in rs.randint(100, size=100)], + } + ) + + batch_size = 15 + n_batch = len(df) // 15 + 1 + batches = [ + pa.RecordBatch.from_pandas(df[i * batch_size : (i + 1) * batch_size]) + for i in range(n_batch) + ] + table = pa.Table.from_batches(batches) + + df1 = arrow_table_to_pandas_dataframe(table, use_arrow_dtype=False) + assert df1.dtypes.iloc[1] == np.dtype("O") + assert df1.dtypes.iloc[2] == np.dtype("O") + + df2 = arrow_table_to_pandas_dataframe(table) + assert df2.dtypes.iloc[1] == ArrowStringDtype() + assert df2.dtypes.iloc[2] == ArrowListDtype(str) + assert df2.memory_usage(deep=True).sum() < df.memory_usage(deep=True).sum() + + # test df method + df4 = df2.groupby("b").sum() + df4.index = df4.index.astype(object) + expected = df.groupby("b").sum() + pd.testing.assert_frame_equal(df4, expected) + + s = ("s" + df2["b"]).astype("string") + expected = ("s" + df["b"]).astype("string") + pd.testing.assert_series_equal(s, expected) + + s2 = df2["b"].str[:2] + expected = df["b"].astype("string").str[:2] + pd.testing.assert_series_equal(s2, expected) diff --git a/python/xorbits/_mars/dataframe/tests/test_core.py b/python/xorbits/_mars/dataframe/tests/test_core.py new file mode 100644 index 000000000..53c01789c --- /dev/null +++ b/python/xorbits/_mars/dataframe/tests/test_core.py @@ -0,0 +1,394 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ...core import tile +from ...lib.groupby_wrapper import wrapped_groupby +from ...utils import pd_release_version +from .. import cut +from ..initializer import DataFrame, Index, Series + +_with_inclusive_bounds = pd_release_version >= (1, 3, 0) + + +def test_dataframe_params(): + raw = pd.DataFrame({"a": [1, 2, 3]}) + df = DataFrame(raw) + df = df[df["a"] < 2] + df = tile(df) + c = df.chunks[0] + + assert any(np.isnan(s) for s in c.params["shape"]) + assert np.isnan(c.params["index_value"].min_val) + c.params = c.get_params_from_data(raw[raw["a"] < 2]) + # shape and index_value updated + assert not any(np.isnan(s) for s in c.params["shape"]) + assert not np.isnan(c.params["index_value"].min_val) + + params = c.params.copy() + params.pop("index", None) + df.params = params + assert np.prod(df.shape) > 0 + df.refresh_params() + + +def test_series_params(): + raw = pd.Series([1, 2, 3], name="a") + series = Series(raw) + series = series[series < 2] + series = tile(series) + c = series.chunks[0] + + assert series.T is series + + assert any(np.isnan(s) for s in c.params["shape"]) + assert np.isnan(c.params["index_value"].min_val) + c.params = c.get_params_from_data(raw[raw < 2]) + # shape and index_value updated + assert not any(np.isnan(s) for s in c.params["shape"]) + assert not np.isnan(c.params["index_value"].min_val) + + params = c.params.copy() + params.pop("index", None) + series.params = params + assert np.prod(series.shape) > 0 + series.refresh_params() + + +def test_index_params(): + raw = pd.Series([1, 2, 3], name="a") + raw.index.name = "b" + series = Series(raw) + series = series[series < 2] + index = series.index + index = tile(index) + c = index.chunks[0] + + assert index.T is index + + assert any(np.isnan(s) for s in c.params["shape"]) + assert np.isnan(c.params["index_value"].min_val) + c.params = c.get_params_from_data(raw[raw < 2].index) + # shape and index_value updated + assert not any(np.isnan(s) for s in c.params["shape"]) + assert not np.isnan(c.params["index_value"].min_val) + + params = c.params.copy() + params.pop("index", None) + index.params = params + assert np.prod(index.shape) > 0 + index.refresh_params() + + +def test_categorical_params(): + raw = np.random.rand(10) + cate = cut(raw, [0.3, 0.5, 0.7]) + cate = tile(cate) + c = cate.chunks[0] + + c.params = c.get_params_from_data(pd.cut(raw, [0.3, 0.5, 0.7])) + assert len(c.params["categories_value"].to_pandas()) > 0 + + params = c.params.copy() + params.pop("index", None) + cate.params = params + assert len(cate.params["categories_value"].to_pandas()) > 0 + cate.refresh_params() + + +def test_groupby_params(): + raw = pd.DataFrame({"a": [1, 2, 3]}) + df = DataFrame(raw) + grouped = df.groupby("a") + grouped = tile(grouped) + c = grouped.chunks[0] + + c.params = c.get_params_from_data(wrapped_groupby(raw, by="a")) + params = c.params.copy() + params.pop("index", None) + grouped.params = params + + raw = pd.Series([1, 2, 3], name="a") + series = Series(raw) + grouped = series.groupby(level=0) + grouped = tile(grouped) + c = grouped.chunks[0] + + c.params = c.get_params_from_data(wrapped_groupby(raw, level=0)) + params = c.params.copy() + params.pop("index", None) + grouped.params = params + grouped.refresh_params() + + +def test_dataframe_dir(): + df = DataFrame(pd.DataFrame(np.random.rand(4, 3), columns=list("ABC"))) + dir_result = set(dir(df)) + 
for c in df.dtypes.index: + assert c in dir_result + + +def test_to_frame_or_series(setup): + raw = pd.Series(np.random.rand(10), name="col") + series = Series(raw) + + r = series.to_frame() + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(), result) + + r = series.to_frame(name="new_name") + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(name="new_name"), result) + + series = series[series > 0.1] + r = series.to_frame(name="new_name") + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw[raw > 0.1].to_frame(name="new_name"), result) + + raw = pd.Index(np.random.rand(10), name="col") + index = Index(raw) + + r = index.to_frame() + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(), result) + + r = index.to_frame(index=False) + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(index=False), result) + + r = index.to_frame(name="new_name") + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(name="new_name"), result) + + r = index.to_series() + result = r.execute().fetch() + pd.testing.assert_series_equal(raw.to_series(), result) + + r = index.to_series(index=pd.RangeIndex(0, 10)) + result = r.execute().fetch() + pd.testing.assert_series_equal(raw.to_series(index=pd.RangeIndex(0, 10)), result) + + r = index.to_series(name="new_name") + result = r.execute().fetch() + pd.testing.assert_series_equal(raw.to_series(name="new_name"), result) + + raw = pd.MultiIndex.from_tuples([("A", "E"), ("B", "F"), ("C", "G")]) + index = Index(raw, tupleize_cols=True) + + r = index.to_frame() + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(), result) + + with pytest.raises(TypeError): + index.to_frame(name="XY") + + with pytest.raises(ValueError): + index.to_frame(name=["X", "Y", "Z"]) + + r = index.to_frame(name=["X", "Y"]) + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(name=["X", "Y"]), result) + + r = index.to_series(name="new_name") + result = r.execute().fetch() + pd.testing.assert_series_equal(raw.to_series(name="new_name"), result) + + +def test_to_frame_or_series_apply(setup): + df1 = DataFrame(pd.DataFrame([[0, 1], [2, 3]], columns=["col1", "col2"])) + df2 = df1.append(DataFrame(pd.DataFrame(columns=["col1", "col2"]))) + pd_df2 = df2.apply( + lambda row: pd.Series([1, 2], index=["c", "d"]), axis=1 + ).to_pandas() + assert pd_df2.columns.tolist() == ["c", "d"] + + def f(df): + df["col3"] = df["col2"] + return df + + pd_df3 = df2.groupby(["col1"]).apply(f).to_pandas() + assert pd_df3.columns.tolist() == ["col1", "col2", "col3"] + + pd_df4 = df2.map_chunk( + lambda chunk_df: chunk_df.apply( + lambda row: pd.Series([1, 2], index=["c", "d"]), axis=1 + ) + ).to_pandas() + assert pd_df4.columns.tolist() == ["c", "d"] + + ser1 = Series(pd.Series(data={"a": 1, "b": 2, "c": 3}, index=["a", "b", "c"])) + ser2 = ser1.append(Series(pd.Series(dtype=np.int64))) + pd_ser2 = ser2.apply(lambda v: str(v)).execute() + assert pd_ser2.dtype == object + + ser3 = ser2.map_chunk( + lambda chunk_series: chunk_series.apply(lambda x: float(x)) + ).execute() + + def check_dtype(s): + assert s.dtypes == np.float64 + return s + + ser3.map_chunk(check_dtype).execute() + + +def test_assign(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame({"A": rs.rand(10), "B": rs.rand(10)}) + + df = DataFrame(raw, chunk_size=5) + result = df.assign(C=df.B / df.A).execute().fetch() + expected = raw.assign(C=raw.B / raw.A) + 
pd.testing.assert_frame_equal(result, expected) + + # lambda syntax + result = df.assign(C=lambda x: x.B / x.A).execute().fetch() + expected = raw.assign(C=lambda x: x.B / x.A) + pd.testing.assert_frame_equal(result, expected) + + # Non-Series array-like + row_list = rs.rand(10).tolist() + result = df.assign(C=row_list).execute().fetch() + expected = raw.assign(C=row_list) + pd.testing.assert_frame_equal(result, expected) + + # multiple + row_list = rs.rand(10).tolist() + result = df.assign(C=row_list, D=df.A, E=lambda x: x.B) + result["C"] = result["C"].astype("int64") + expected = raw.assign(C=row_list, D=raw.A, E=lambda x: x.B) + expected["C"] = expected["C"].astype("int64") + pd.testing.assert_frame_equal(result.execute().fetch(), expected) + + +def test_key_value(setup): + raw = pd.DataFrame(np.random.rand(4, 3), columns=list("ABC")) + df = DataFrame(raw) + + result = df.values.execute().fetch() + np.testing.assert_array_equal(result, raw.values) + + result = df.keys().execute().fetch() + pd.testing.assert_index_equal(result, raw.keys()) + + raw = pd.Series(np.random.rand(10)) + s = Series(raw) + + result = s.values.execute().fetch() + np.testing.assert_array_equal(result, raw.values) + + result = s.keys().execute().fetch() + pd.testing.assert_index_equal(result, raw.keys()) + + raw = pd.Index(np.random.rand(10)) + idx = Index(raw) + + result = idx.values.execute().fetch() + np.testing.assert_array_equal(result, raw.values) + + +@pytest.mark.pd_compat +def test_between(setup): + pd_series = pd.Series(pd.date_range("1/1/2000", periods=10)) + pd_left, pd_right = pd_series[3], pd_series[7] + series = Series(pd_series, chunk_size=5) + left, right = series.iloc[3], series.iloc[7] + + result = series.between(left, right).execute().fetch() + expected = pd_series.between(pd_left, pd_right) + pd.testing.assert_series_equal(result, expected) + + if _with_inclusive_bounds: + result = series.between(left, right, inclusive="both").execute().fetch() + expected = pd_series.between(pd_left, pd_right, inclusive="both") + pd.testing.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="left").execute().fetch() + expected = pd_series.between(pd_left, pd_right, inclusive="left") + pd.testing.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="right").execute().fetch() + expected = pd_series.between(pd_left, pd_right, inclusive="right") + pd.testing.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="neither").execute().fetch() + expected = pd_series.between(pd_left, pd_right, inclusive="neither") + pd.testing.assert_series_equal(result, expected) + + with pytest.raises(ValueError): + series = Series(pd.date_range("1/1/2000", periods=10), chunk_size=5) + series.between(left, right, inclusive="yes").execute().fetch() + + # test_between_datetime_values + pd_series = pd.Series(pd.bdate_range("1/1/2000", periods=20).astype(object)) + pd_series[::2] = np.nan + + series = Series(pd_series, chunk_size=5) + result = series[series.between(series[3], series[17])].execute().fetch() + expected = pd_series[3:18].dropna() + pd.testing.assert_series_equal(result, expected) + + result = ( + series[series.between(series[3], series[17], inclusive="neither")] + .execute() + .fetch() + ) + expected = pd_series[5:16].dropna() + pd.testing.assert_series_equal(result, expected) + + # test_between_period_values + pd_series = pd.Series(pd.period_range("2000-01-01", periods=10, freq="D")) + pd_left, pd_right = 
pd_series[2], pd_series[7] + + series = Series(pd_series, chunk_size=5) + left, right = series[2], series[7] + + result = series.between(left, right).execute().fetch() + expected = pd_series.between(pd_left, pd_right) + pd.testing.assert_series_equal(result, expected) + + +def test_series_median(setup): + raw = pd.Series(np.random.rand(10), name="col") + series = Series(raw) + + r = series.median() + result = r.execute().fetch() + assert np.isclose(raw.median(), result) + + raw = pd.Series(np.random.rand(100), name="col") + series = Series(raw) + + r = series.median() + result = r.execute().fetch() + assert np.isclose(raw.median(), result) + + raw = pd.Series(np.random.rand(10), name="col") + raw[np.random.randint(0, 10)] = None + series = Series(raw) + + r = series.median() + result = r.execute().fetch() + assert np.isclose(raw.median(), result) + + raw = pd.Series(np.random.rand(10), name="col") + raw[np.random.randint(0, 10)] = None + series = Series(raw) + + r = series.median(skipna=False) + result = r.execute().fetch() + assert np.isnan(raw.median(skipna=False)) and np.isnan(result) diff --git a/python/xorbits/_mars/dataframe/tests/test_initializer.py b/python/xorbits/_mars/dataframe/tests/test_initializer.py new file mode 100644 index 000000000..2117acc40 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tests/test_initializer.py @@ -0,0 +1,209 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import dataframe as md +from ... 
import tensor as mt +from ...tests.core import require_cudf, require_cupy +from ...utils import lazy_import + +cupy = lazy_import("cupy") +cudf = lazy_import("cudf") + + +def test_dataframe_initializer(setup): + # from tensor + raw = np.random.rand(100, 10) + tensor = mt.tensor(raw, chunk_size=7) + r = md.DataFrame(tensor) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(raw)) + + r = md.DataFrame(tensor, chunk_size=13) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(raw)) + + # from Mars dataframe + raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ")) + df = md.DataFrame(raw, chunk_size=15) * 2 + r = md.DataFrame(df, num_partitions=11) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, raw * 2) + + # from tileable dict + raw_dict = { + "C": np.random.choice(["u", "v", "w"], size=(100,)), + "A": pd.Series(np.random.rand(100)), + "B": np.random.randint(0, 10, size=(100,)), + } + m_dict = raw_dict.copy() + m_dict["A"] = md.Series(m_dict["A"]) + m_dict["B"] = mt.tensor(m_dict["B"]) + r = md.DataFrame(m_dict, columns=list("ABC")) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(raw_dict, columns=list("ABC"))) + + r = md.DataFrame({"a": [mt.tensor([1, 2, 3]).sum() + 1]}) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame({"a": [7]})) + + # from tileable list + raw_list = [ + np.random.choice(["u", "v", "w"], size=(3,)), + pd.Series(np.random.rand(3)), + np.random.randint(0, 10, size=(3,)), + ] + m_list = raw_list.copy() + m_list[1] = md.Series(m_list[1]) + m_list[2] = mt.tensor(m_list[2]) + r = md.DataFrame(m_list, columns=list("ABC")) + result = r.execute(extra_config={"check_dtypes": False}).fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(raw_list, columns=list("ABC"))) + + # from raw pandas initializer + raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ")) + r = md.DataFrame(raw, num_partitions=10) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, raw) + + # from mars series + raw_s = np.random.rand(100) + s = md.Series(raw_s, chunk_size=20) + r = md.DataFrame(s, num_partitions=10) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(raw_s)) + + # test check instance + r = r * 2 + assert isinstance(r, md.DataFrame) + + +@require_cudf +@require_cupy +def test_dataframe_gpu_initializer(setup_gpu): + # from raw cudf initializer + raw = cudf.DataFrame(cupy.random.rand(100, 10), columns=list("ABCDEFGHIJ")) + r = md.DataFrame(raw, chunk_size=13) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result.to_pandas(), raw.to_pandas()) + + raw = cupy.random.rand(100, 10) + r = md.DataFrame(raw, columns=list("ABCDEFGHIJ"), chunk_size=13) + result = r.execute().fetch() + expected = cudf.DataFrame(raw, columns=list("ABCDEFGHIJ")) + pd.testing.assert_frame_equal(result.to_pandas(), expected.to_pandas()) + + +def test_series_initializer(setup): + # from tensor + raw = np.random.rand(100) + tensor = mt.tensor(raw, chunk_size=7) + r = md.Series(tensor) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, pd.Series(raw)) + + r = md.Series(tensor, chunk_size=13) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, pd.Series(raw)) + + # from index + raw = np.arange(100) + np.random.shuffle(raw) + raw = pd.Index(raw, name="idx_name") + idx = md.Index(raw, chunk_size=7) + r = md.Series(idx) + result = 
r.execute().fetch() + pd.testing.assert_series_equal(result, pd.Series(raw)) + + # from Mars series + raw = pd.Series(np.random.rand(100), name="series_name") + ms = md.Series(raw, chunk_size=15) * 2 + r = md.Series(ms, num_partitions=11) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, raw * 2) + + # from raw pandas initializer + raw = pd.Series(np.random.rand(100), name="series_name") + r = md.Series(raw, num_partitions=10) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, raw) + + # test check instance + r = r * 2 + assert isinstance(r, md.Series) + + +@require_cudf +@require_cupy +def test_series_gpu_initializer(setup_gpu): + # from raw cudf initializer + raw = cudf.Series(cupy.random.rand(100), name="a") + r = md.Series(raw, chunk_size=13) + result = r.execute().fetch() + pd.testing.assert_series_equal(result.to_pandas(), raw.to_pandas()) + + raw = cupy.random.rand(100) + r = md.Series(raw, name="a", chunk_size=13) + result = r.execute().fetch() + expected = cudf.Series(raw, name="a") + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_index_initializer(setup): + # from tensor + raw = np.arange(100) + np.random.shuffle(raw) + tensor = mt.tensor(raw) + r = md.Index(tensor, chunk_size=7) + result = r.execute().fetch() + pd.testing.assert_index_equal(result, pd.Index(raw)) + + # from Mars index + raw = np.arange(100) + np.random.shuffle(raw) + idx = md.Index(raw, chunk_size=7) + r = md.Index(idx, num_partitions=11) + result = r.execute().fetch() + pd.testing.assert_index_equal(result, pd.Index(raw)) + + # from pandas initializer + raw = np.arange(100) + np.random.shuffle(raw) + raw_ser = pd.Series(raw, name="series_name") + r = md.Index(raw_ser, chunk_size=7) + result = r.execute().fetch() + pd.testing.assert_index_equal(result, pd.Index(raw_ser)) + + raw_idx = pd.Index(raw, name="idx_name") + r = md.Index(raw_idx, num_partitions=10) + result = r.execute().fetch() + pd.testing.assert_index_equal(result, pd.Index(raw_idx)) + + +@require_cudf +@require_cupy +def test_index_gpu_initializer(setup_gpu): + # from raw cudf initializer + raw = cudf.Index(cupy.random.rand(100), name="a") + r = md.Index(raw, chunk_size=13) + result = r.execute().fetch() + pd.testing.assert_index_equal(result.to_pandas(), raw.to_pandas()) + + raw = cupy.random.rand(100) + r = md.Index(raw, name="a", chunk_size=13) + result = r.execute().fetch() + expected = cudf.Index(raw, name="a") + pd.testing.assert_index_equal(result.to_pandas(), expected.to_pandas()) diff --git a/python/xorbits/_mars/dataframe/tests/test_utils.py b/python/xorbits/_mars/dataframe/tests/test_utils.py new file mode 100644 index 000000000..58df53090 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tests/test_utils.py @@ -0,0 +1,689 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
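+
+# An illustrative sketch of the invariant the chunk-size tests below assert,
+# using the same inputs as the first case: the concrete split values are up
+# to the planner, so only the per-axis sums are checked.
+#
+#     memory_usage = pd.Series([8, 22.2, 4, 2, 11.2], index=list("abcde"))
+#     nsplit = decide_dataframe_chunk_sizes((10, 5), None, memory_usage)
+#     # one tuple of split sizes per axis; each axis must sum back to the shape
+#     assert tuple(sum(ns) for ns in nsplit) == (10, 5)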
+ +import operator +from collections import OrderedDict +from numbers import Integral +from typing import Dict, List + +import numpy as np +import pandas as pd +import pytest + +from ...config import option_context +from ...core import tile +from ...utils import Timer +from ..core import IndexValue +from ..initializer import DataFrame, Index, Series +from ..utils import ( + auto_merge_chunks, + build_concatenated_rows_frame, + build_split_idx_to_origin_idx, + decide_dataframe_chunk_sizes, + decide_series_chunk_size, + fetch_corner_data, + filter_index_value, + infer_dtypes, + infer_index_value, + make_dtypes, + merge_index_value, + parse_index, + split_monotonic_index_min_max, + validate_axis, + whether_to_clean_up, +) + + +def test_decide_dataframe_chunks(): + with option_context() as options: + options.chunk_store_limit = 64 + + memory_usage = pd.Series([8, 22.2, 4, 2, 11.2], index=list("abcde")) + + shape = (10, 5) + nsplit = decide_dataframe_chunk_sizes(shape, None, memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, {0: 4}, memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, (2, 3), memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, (10, 3), memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + options.chunk_store_limit = 20 + + shape = (10, 5) + nsplit = decide_dataframe_chunk_sizes(shape, None, memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, {1: 3}, memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, (2, 3), memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, (10, 3), memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + +def test_decide_series_chunks(): + with option_context() as options: + options.chunk_store_limit = 64 + + s = pd.Series(np.empty(50, dtype=np.int64)) + nsplit = decide_series_chunk_size( + s.shape, None, s.memory_usage(index=False, deep=True) + ) + assert len(nsplit) == 1 + assert sum(nsplit[0]) == 50 + assert nsplit[0][0] == 8 + + +def test_parse_index(): + index = pd.Index([], dtype=np.int64) + parsed_index = parse_index(index) + assert isinstance(parsed_index.value, IndexValue.Int64Index) + pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + index = pd.Index([1, 2], dtype=np.int64) + parsed_index = parse_index(index) # not parse data + assert isinstance(parsed_index.value, IndexValue.Int64Index) + with pytest.raises(AssertionError): + pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + parsed_index = parse_index(index, store_data=True) # parse data + assert isinstance(parsed_index.value, IndexValue.Int64Index) + 
pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + index = pd.RangeIndex(0, 10, 3) + parsed_index = parse_index(index) + assert isinstance(parsed_index.value, IndexValue.RangeIndex) + pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + index = pd.MultiIndex.from_arrays([[0, 1], ["a", "b"], ["X", "Y"]]) + parsed_index = parse_index(index) # not parse data + assert isinstance(parsed_index.value, IndexValue.MultiIndex) + with pytest.raises(AssertionError): + pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + parsed_index = parse_index(index, store_data=True) # parse data + assert isinstance(parsed_index.value, IndexValue.MultiIndex) + pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + +def test_split_monotonic_index_min_max(): + left_min_max = [[0, True, 3, True], [3, False, 5, False]] + right_min_max = [[1, False, 3, True], [4, False, 6, True]] + left_splits, right_splits = split_monotonic_index_min_max( + left_min_max, True, right_min_max, True + ) + assert left_splits == [ + [(0, True, 1, True), (1, False, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + ] + assert right_splits == [ + [(0, True, 1, True), (1, False, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + ] + left_splits, right_splits = split_monotonic_index_min_max( + right_min_max, False, left_min_max, False + ) + assert list(reversed(left_splits)) == [ + [(0, True, 1, True), (1, False, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + ] + assert list(reversed(right_splits)) == [ + [(0, True, 1, True), (1, False, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + ] + + left_min_max = [[2, True, 4, True], [8, True, 9, False]] + right_min_max = [[1, False, 3, True], [4, False, 6, True]] + left_splits, right_splits = split_monotonic_index_min_max( + left_min_max, True, right_min_max, True + ) + assert left_splits == [ + [(1, False, 2, False), (2, True, 3, True), (3, False, 4, True)], + [(4, False, 6, True), (8, True, 9, False)], + ] + assert right_splits == [ + [(1, False, 2, False), (2, True, 3, True)], + [(3, False, 4, True), (4, False, 6, True), (8, True, 9, False)], + ] + + left_min_max = [ + [1, False, 3, True], + [4, False, 6, True], + [10, True, 12, False], + [13, True, 14, False], + ] + right_min_max = [[2, True, 4, True], [5, True, 7, False]] + left_splits, right_splits = split_monotonic_index_min_max( + left_min_max, True, right_min_max, True + ) + assert left_splits == [ + [(1, False, 2, False), (2, True, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + [(6, False, 7, False), (10, True, 12, False)], + [(13, True, 14, False)], + ] + assert right_splits == [ + [(1, False, 2, False), (2, True, 3, True), (3, False, 4, True)], + [ + (4, False, 5, False), + (5, True, 6, True), + (6, False, 7, False), + (10, True, 12, False), + (13, True, 14, False), + ], + ] + left_splits, right_splits = split_monotonic_index_min_max( + right_min_max, True, left_min_max, True + ) + assert left_splits == [ + [(1, False, 2, False), (2, True, 3, True), (3, False, 4, True)], + [ + (4, False, 5, False), + (5, True, 6, True), + (6, False, 7, False), + (10, True, 12, False), + (13, True, 14, False), + ], + ] + assert right_splits == [ + [(1, False, 2, False), (2, True, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + [(6, False, 7, False), (10, True, 12, False)], + [(13, True, 14, False)], + ] + + # left 
min_max like ([.., .., 4 True], [4, False, ..., ...] + # right min_max like ([..., ..., 4 False], [4, True, ..., ...] + left_min_max = [[1, False, 4, True], [4, False, 6, True]] + right_min_max = [[1, False, 4, False], [4, True, 6, True]] + left_splits, right_splits = split_monotonic_index_min_max( + left_min_max, True, right_min_max, True + ) + assert left_splits == [ + [(1, False, 4, False), (4, True, 4, True)], + [(4, False, 6, True)], + ] + assert right_splits == [ + [(1, False, 4, False)], + [(4, True, 4, True), (4, False, 6, True)], + ] + + # identical index + left_min_max = [[1, False, 3, True], [4, False, 6, True]] + right_min_max = [[1, False, 3, True], [4, False, 6, True]] + left_splits, right_splits = split_monotonic_index_min_max( + left_min_max, True, right_min_max, True + ) + assert left_splits == [[tuple(it)] for it in left_min_max] + assert right_splits == [[tuple(it)] for it in left_min_max] + + +def test_build_split_idx_to_origin_idx(): + splits = [[(1, False, 2, False), (2, True, 3, True)], [(5, False, 6, True)]] + res = build_split_idx_to_origin_idx(splits) + + assert res == {0: (0, 0), 1: (0, 1), 2: (1, 0)} + + splits = [[(5, False, 6, True)], [(1, False, 2, False), (2, True, 3, True)]] + res = build_split_idx_to_origin_idx(splits, increase=False) + + assert res == {0: (1, 0), 1: (1, 1), 2: (0, 0)} + + +def test_filter_index_value(): + pd_index = pd.RangeIndex(10) + index_value = parse_index(pd_index) + + min_max = (0, True, 9, True) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist() + ) + + min_max = (0, False, 9, False) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index > 0) & (pd_index < 9)].tolist() + ) + + pd_index = pd.RangeIndex(1, 11, 3) + index_value = parse_index(pd_index) + + min_max = (2, True, 10, True) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist() + ) + + min_max = (2, False, 10, False) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index > 2) & (pd_index < 10)].tolist() + ) + + pd_index = pd.RangeIndex(9, -1, -1) + index_value = parse_index(pd_index) + + min_max = (0, True, 9, True) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist() + ) + + min_max = (0, False, 9, False) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index > 0) & (pd_index < 9)].tolist() + ) + + pd_index = pd.RangeIndex(10, 0, -3) + index_value = parse_index(pd_index, store_data=False) + + min_max = (2, True, 10, True) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist() + ) + + min_max = (2, False, 10, False) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index > 2) & (pd_index < 10)].tolist() + ) + + pd_index = pd.Index([0, 3, 8], dtype=np.int64) + index_value = parse_index(pd_index, store_data=True) + + min_max = (2, True, 8, False) + assert ( + filter_index_value(index_value, min_max, store_data=True).to_pandas().tolist() + == pd_index[(pd_index >= 2) & (pd_index < 8)].tolist() + ) + + index_value = parse_index(pd_index) + + min_max = (2, True, 8, False) + filtered = filter_index_value(index_value, min_max) + assert len(filtered.to_pandas().tolist()) == 0 + assert 
isinstance(filtered.value, IndexValue.Int64Index) + + +def test_merge_index_value(): + with Timer() as timer: + index_values = {i: parse_index(pd.RangeIndex(1e7)) for i in range(20)} + index_value = merge_index_value(index_values) + pd.testing.assert_index_equal( + index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert index_value.min_val == 0 + assert index_value.max_val == 1e7 - 1 + + # range indexes that are continuous + index_values = { + i: parse_index(pd.RangeIndex(i * 1e7, (i + 1) * 1e7)) for i in range(20) + } + index_value = merge_index_value(index_values) + pd.testing.assert_index_equal(index_value.to_pandas(), pd.RangeIndex(1e7 * 20)) + assert index_value.min_val == 0 + assert index_value.max_val == 1e7 * 20 - 1 + assert timer.duration < 1 + + +def test_infer_dtypes(): + data1 = pd.DataFrame([[1, "a", False]], columns=[2.0, 3.0, 4.0]) + data2 = pd.DataFrame([[1, 3.0, "b"]], columns=[1, 2, 3]) + + pd.testing.assert_series_equal( + infer_dtypes(data1.dtypes, data2.dtypes, operator.add), (data1 + data2).dtypes + ) + + +def test_infer_index_value(): + # same range index + index1 = pd.RangeIndex(1, 3) + index2 = pd.RangeIndex(1, 3) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert oival.key == ival1.key + assert oival.key == ival2.key + + # different range index + index1 = pd.RangeIndex(1, 3) + index2 = pd.RangeIndex(2, 4) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Int64Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + # same int64 index, all unique + index1 = pd.Index([1, 2], dtype=np.int64) + index2 = pd.Index([1, 2], dtype=np.int64) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Int64Index) + assert oival.key == ival1.key + assert oival.key == ival2.key + + # same int64 index, not all unique + index1 = pd.Index([1, 2, 2], dtype=np.int64) + index2 = pd.Index([1, 2, 2], dtype=np.int64) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Int64Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + # different int64 index + index1 = pd.Index([1, 2], dtype=np.int64) + index2 = pd.Index([2, 3], dtype=np.int64) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Int64Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + # different index type + index1 = pd.Index([1, 2], dtype=np.int64) + index2 = pd.Index([2.0, 3.0], dtype=np.float64) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Float64Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + # range index and other index + index1 = pd.RangeIndex(1, 4) + index2 = pd.Index([2, 3, 4], dtype=np.float64) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Float64Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + index1 = pd.DatetimeIndex([]) + index2 = pd.RangeIndex(2) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, 
ival2) + + assert isinstance(oival.value, IndexValue.Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + +def test_index_inferred_type(): + assert Index(pd.Index([1, 2, 3, 4])).inferred_type == "integer" + assert Index(pd.Index([1, 2, 3, 4]).astype("uint32")).inferred_type == "integer" + assert Index(pd.Index([1.2, 2.3, 4.5])).inferred_type == "floating" + assert ( + Index(pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])).inferred_type + == "interval" + ) + assert ( + Index(pd.MultiIndex.from_tuples([("a", 1), ("b", 2)])).inferred_type == "mixed" + ) + + +def test_validate_axis(): + df = DataFrame(pd.DataFrame(np.random.rand(4, 3))) + + assert validate_axis(0, df) == 0 + assert validate_axis("index", df) == 0 + assert validate_axis(1, df) == 1 + assert validate_axis("columns", df) == 1 + + with pytest.raises(ValueError): + validate_axis("unknown index", df) + + with pytest.raises(ValueError): + validate_axis(object(), df) + + with pytest.raises(ValueError): + validate_axis(-1, df) + + with pytest.raises(ValueError): + validate_axis(2, df) + + df2 = df[df[0] < 0.5] # create unknown shape + assert validate_axis(0, df2) == 0 + + +def test_dataframe_dir(): + df = DataFrame(pd.DataFrame(np.random.rand(4, 3), columns=list("ABC"))) + dir_result = set(dir(df)) + for c in df.dtypes.index: + assert c in dir_result + + +def test_fetch_dataframe_corner_data(setup): + max_rows = pd.get_option("display.max_rows") + try: + min_rows = pd.get_option("display.min_rows") + except KeyError: # pragma: no cover + min_rows = max_rows + + for row in ( + 5, + max_rows - 2, + max_rows - 1, + max_rows, + max_rows + 1, + max_rows + 2, + max_rows + 3, + ): + pdf = pd.DataFrame(np.random.rand(row, 5)) + df = DataFrame(pdf, chunk_size=max_rows // 2) + df.execute() + + corner = fetch_corner_data(df) + assert corner.shape[0] <= max_rows + 2 + corner_max_rows = max_rows if row <= max_rows else corner.shape[0] - 1 + assert corner.to_string( + max_rows=corner_max_rows, min_rows=min_rows + ) == pdf.to_string(max_rows=max_rows, min_rows=min_rows) + + +def test_make_dtypes(): + s = make_dtypes([int, float, np.dtype(int)]) + pd.testing.assert_series_equal( + s, pd.Series([np.dtype(int), np.dtype(float), np.dtype(int)]) + ) + + s = make_dtypes(OrderedDict([("a", int), ("b", float), ("c", np.dtype(int))])) + pd.testing.assert_series_equal( + s, pd.Series([np.dtype(int), np.dtype(float), np.dtype(int)], index=list("abc")) + ) + + s = make_dtypes(pd.Series([int, float, np.dtype(int)])) + pd.testing.assert_series_equal( + s, pd.Series([np.dtype(int), np.dtype(float), np.dtype(int)]) + ) + + assert make_dtypes(None) is None + + +@pytest.mark.parametrize( + "columns", + [ + pd.RangeIndex(8), + pd.MultiIndex.from_product([list("AB"), list("CDEF")]), + ], +) +def test_build_concatenated_rows_frame(setup, columns): + df = pd.DataFrame(np.random.rand(16, 8), columns=columns) + + # single chunk + mdf = tile(DataFrame(df, chunk_size=8)) + concatenated = build_concatenated_rows_frame(mdf) + assert len(concatenated.chunks) == 2 + pd.testing.assert_frame_equal(concatenated.execute().fetch(), df) + + # multiple chunks + mdf = tile(DataFrame(df, chunk_size=5)) + concatenated = build_concatenated_rows_frame(mdf) + assert len(concatenated.chunks) == 4 + for i in range(4): + pd.testing.assert_index_equal( + concatenated.chunks[i].columns_value.to_pandas(), df.columns + ) + pd.testing.assert_frame_equal(concatenated.execute().fetch(), df) + + +def test_auto_merge_chunks(): + from ..merge import DataFrameConcat + + pdf = 
pd.DataFrame(np.random.rand(16, 4), columns=list("abcd")) + memory_size = pdf.iloc[:4].memory_usage().sum() + + class FakeContext: + def __init__(self, retval=True): + self._retval = retval + + def get_chunks_meta(self, data_keys: List[str], **_) -> List[Dict]: + if self._retval: + return [{"memory_size": memory_size}] * len(data_keys) + else: + return [None] * len(data_keys) + + df = tile(DataFrame(pdf, chunk_size=4)) + df2 = auto_merge_chunks(FakeContext(), df, 2 * memory_size) + assert len(df2.chunks) == 2 + assert isinstance(df2.chunks[0].op, DataFrameConcat) + assert len(df2.chunks[0].op.inputs) == 2 + assert isinstance(df2.chunks[1].op, DataFrameConcat) + assert len(df2.chunks[1].op.inputs) == 2 + + df2 = auto_merge_chunks(FakeContext(), df, 3 * memory_size) + assert len(df2.chunks) == 2 + assert isinstance(df2.chunks[0].op, DataFrameConcat) + assert len(df2.chunks[0].op.inputs) == 3 + assert not isinstance(df2.chunks[1].op, DataFrameConcat) + assert len(df2.chunks[1].op.inputs) == 0 + assert df2.chunks[1].shape == df.chunks[-1].shape + assert df2.chunks[1].index == (1, 0) + + # mock situation that df not executed + df2 = auto_merge_chunks(FakeContext(False), df, 3 * memory_size) + assert df2 is df + + # number of chunks on columns > 1 + df3 = tile(DataFrame(pdf, chunk_size=2)) + df4 = auto_merge_chunks(FakeContext(), df3, 2 * memory_size) + assert df4 is df3 + + # each chunk's size is greater than limit + df5 = auto_merge_chunks(FakeContext(), df, memory_size / 5) + assert all((c1.shape == c2.shape) for c1, c2 in zip(df.chunks, df5.chunks)) + + # test series + ps = pdf.loc[:, "a"] + memory_size = ps.iloc[:4].memory_usage() + s = tile(Series(ps, chunk_size=4)) + s2 = auto_merge_chunks(FakeContext(), s, 2 * memory_size) + assert len(s2.chunks) == 2 + assert isinstance(s2.chunks[0].op, DataFrameConcat) + assert s2.chunks[0].name == "a" + assert len(s2.chunks[0].op.inputs) == 2 + assert isinstance(s2.chunks[1].op, DataFrameConcat) + assert s2.chunks[1].name == "a" + assert len(s2.chunks[1].op.inputs) == 2 + + +@pytest.mark.parametrize("multiplier_and_expected", [(1, False), (3, True), (4, True)]) +def test_whether_to_clean_up(multiplier_and_expected): + threshold = 10**4 + multiplier, expected = multiplier_and_expected + + class FakeOperandwithClosure: + def __init__(self, func): + self.func = func + self.need_clean_up_func = False + + @property + def need_clean_up_func(self): + return self._need_clean_up_func + + @need_clean_up_func.setter + def need_clean_up_func(self, need_clean_up_func: bool): + self._need_clean_up_func = need_clean_up_func + + class FakeCallable: + __slots__ = "df", "__dict__" + + def __init__(self, multiplier): + self.list = [ + ["This is a string.", 1.2, range(10)], + [ + bytes("This is a byte message.", "utf-8"), + bytearray("This is a byte array.", "utf-8"), + ], + ] + self.dic = {"one": pd.Series([i for i in range(10**multiplier)])} + self.df = pd.DataFrame(self.dic) + self.ds = pd.Series([i for i in range(10**multiplier)]) + + def __call__(self, z): + pass + + op = FakeOperandwithClosure(func=FakeCallable(multiplier=multiplier)) + result = whether_to_clean_up(op=op, threshold=threshold) + assert result is expected + assert op.need_clean_up_func is expected diff --git a/python/xorbits/_mars/dataframe/tseries/__init__.py b/python/xorbits/_mars/dataframe/tseries/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tseries/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/tseries/tests/__init__.py b/python/xorbits/_mars/dataframe/tseries/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tseries/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/tseries/tests/test_tseries.py b/python/xorbits/_mars/dataframe/tseries/tests/test_tseries.py new file mode 100644 index 000000000..3f857faa9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tseries/tests/test_tseries.py @@ -0,0 +1,30 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +from .... import dataframe as md + + +def test_to_datetime(): + wrong_args = [pd.DataFrame({"a": [1, 2]}), {"a": [1, 2]}] + + for arg in wrong_args: + with pytest.raises(ValueError) as cm: + md.to_datetime(arg) + assert "[year, month, day]" in str(cm.value) + + with pytest.raises(TypeError): + md.to_datetime([[1, 2], [3, 4]]) diff --git a/python/xorbits/_mars/dataframe/tseries/tests/test_tseries_execution.py b/python/xorbits/_mars/dataframe/tseries/tests/test_tseries_execution.py new file mode 100644 index 000000000..0be314ac7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tseries/tests/test_tseries_execution.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from .... import dataframe as md +from ....tensor import tensor +from ....tests.core import require_cudf +from ... import DataFrame, Index, Series, to_datetime + + +def test_to_datetime_execution(setup): + # scalar + r = to_datetime(1490195805, unit="s") + + result = r.execute().fetch( + extra_config={"check_dtypes": False, "check_shape": False} + ) + expected = pd.to_datetime(1490195805, unit="s") + assert pd.to_datetime(result) == expected + + # test list like + raw = ["3/11/2000", "3/12/2000", "3/13/2000"] + t = tensor(raw, chunk_size=2) + r = to_datetime(t, infer_datetime_format=True) + + result = r.execute().fetch() + expected = pd.to_datetime(raw, infer_datetime_format=True) + pd.testing.assert_index_equal(result, expected) + + # test series + raw_series = pd.Series(raw) + s = Series(raw_series, chunk_size=2) + r = to_datetime(s) + + result = r.execute().fetch() + expected = pd.to_datetime(raw_series) + pd.testing.assert_series_equal(result, expected) + + # test DataFrame + raw_df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) + df = DataFrame(raw_df, chunk_size=(1, 2)) + r = to_datetime(df) + + result = r.execute().fetch() + expected = pd.to_datetime(raw_df) + pd.testing.assert_series_equal(result, expected) + + # test Index + raw_index = pd.Index([1, 2, 3]) + s = Index(raw_index, chunk_size=2) + r = to_datetime(s) + + result = r.execute().fetch() + expected = pd.to_datetime(raw_index) + pd.testing.assert_index_equal(result, expected) + + # test raises == 'ignore' + raw = ["13000101"] + r = to_datetime(raw, format="%Y%m%d", errors="ignore") + result = r.execute().fetch() + expected = pd.to_datetime(raw, format="%Y%m%d", errors="ignore") + pd.testing.assert_index_equal(result, expected) + + # test unit + r = to_datetime([1490195805], unit="s") + result = r.execute().fetch() + expected = pd.to_datetime([1490195805], unit="s") + pd.testing.assert_index_equal(result, expected) + + # test origin + r = to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")) + result = r.execute().fetch() + expected = pd.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")) + pd.testing.assert_index_equal(result, expected) + + +@require_cudf +def test_to_datetime_gpu_execution(setup_gpu): + s = md.Series(["3/11/2000", "3/12/2000", "3/13/2000"]).to_gpu() + r = to_datetime(s, format="%m/%d/%Y") + result = r.execute().fetch().to_pandas() + expected = pd.to_datetime( + pd.Series(["3/11/2000", "3/12/2000", "3/13/2000"]), format="%m/%d/%Y" + ) + pd.testing.assert_series_equal(result, expected) diff --git a/python/xorbits/_mars/dataframe/tseries/to_datetime.py b/python/xorbits/_mars/dataframe/tseries/to_datetime.py new file mode 100644 index 000000000..8ff68c9d4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tseries/to_datetime.py @@ -0,0 +1,370 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
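(Editorial aside, not part of the diff.) The execution tests above mirror pandas semantics: md.to_datetime accepts scalars, list-likes, Series, Index objects and DataFrames with year/month/day columns, and the fetched result should match pd.to_datetime on the same input. A small sketch of the DataFrame-assembly case, assuming the relative imports in the tests resolve to the vendored xorbits._mars package:

import pandas as pd

from xorbits._mars import dataframe as md  # assumed import path for the vendored Mars

raw = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
df = md.DataFrame(raw, chunk_size=1)             # split into row chunks; columns are re-merged at tile time
r = md.to_datetime(df)                           # lazily assembles one datetime per row
result = r.execute().fetch()
pd.testing.assert_series_equal(result, pd.to_datetime(raw))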
+ +from functools import partial +from typing import Any + +import numpy as np +import pandas as pd +from pandas.api.types import is_dict_like, is_scalar + +from ... import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, BoolField, KeyField, StringField +from ...tensor import tensor as astensor +from ...tensor.core import TENSOR_CHUNK_TYPE +from ..core import DATAFRAME_TYPE, INDEX_CHUNK_TYPE, INDEX_TYPE, SERIES_TYPE +from ..initializer import DataFrame as asdataframe +from ..initializer import Index as asindex +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import lazy_import, parse_index + +cudf = lazy_import("cudf") + + +class DataFrameToDatetime(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.TO_DATETIME + + arg = KeyField("arg") + errors = StringField("errors", default=None) + dayfirst = BoolField("dayfirst", default=None) + yearfirst = BoolField("yearfirst", default=None) + utc = BoolField("utc", default=None) + format = StringField("format", default=None) + exact = BoolField("exact", default=None) + unit = StringField("unit", default=None) + infer_datetime_format = BoolField("infer_datetime_format", default=None) + origin = AnyField("origin", default=None) + cache = BoolField("cache", default=None) + + @property + def _params(self): + return tuple( + getattr(self, k) + for k in self._keys_ + if k not in self._no_copy_attrs_ and k != "arg" and hasattr(self, k) + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.arg = self._inputs[0] + + def __call__(self, arg): + if is_scalar(arg): + ret = pd.to_datetime( + arg, + errors=self.errors, + dayfirst=self.dayfirst, + yearfirst=self.yearfirst, + utc=self.utc, + format=self.format, + exact=self.exact, + unit=self.unit, + infer_datetime_format=self.infer_datetime_format, + origin=self.origin, + cache=self.cache, + ) + return astensor(ret) + + dtype = np.datetime64(1, "ns").dtype + if isinstance(arg, (pd.Series, SERIES_TYPE)): + arg = asseries(arg) + return self.new_series( + [arg], + shape=arg.shape, + dtype=dtype, + index_value=arg.index_value, + name=arg.name, + ) + if is_dict_like(arg) or isinstance(arg, DATAFRAME_TYPE): + arg = asdataframe(arg) + columns = arg.columns_value.to_pandas().tolist() + if sorted(columns) != sorted(["year", "month", "day"]): + missing = ",".join( + c for c in ["day", "month", "year"] if c not in columns + ) + raise ValueError( + "to assemble mappings requires at least " + f"that [year, month, day] be specified: [{missing}] is missing" + ) + return self.new_series( + [arg], shape=(arg.shape[0],), dtype=dtype, index_value=arg.index_value + ) + elif isinstance(arg, (pd.Index, INDEX_TYPE)): + arg = asindex(arg) + return self.new_index( + [arg], + shape=arg.shape, + dtype=dtype, + index_value=parse_index(pd.Index([], dtype=dtype), self._params, arg), + name=arg.name, + ) + else: + arg = astensor(arg) + if arg.ndim != 1: + raise TypeError( + "arg must be a string, datetime, " + "list, tuple, 1-d tensor, or Series" + ) + return self.new_index( + [arg], + shape=arg.shape, + dtype=dtype, + index_value=parse_index(pd.Index([], dtype=dtype), self._params, arg), + ) + + @classmethod + def tile(cls, op: "DataFrameToDatetime"): + out = op.outputs[0] + arg = op.arg + + if isinstance(arg, DATAFRAME_TYPE): + if np.isnan(arg.shape[0]) or any( + np.isnan(s) for s in arg.nsplits[1] + ): # pragma: no cover + yield + + arg = yield from recursive_tile(arg.rechunk({1: 
arg.shape[1]})) + + out_chunks = [] + for chunk in arg.chunks: + chunk_op = op.copy().reset_key() + if isinstance(chunk, (TENSOR_CHUNK_TYPE, INDEX_CHUNK_TYPE)): + chunk_index_value = parse_index( + pd.Index([], dtype=out.dtype), op._params, chunk + ) + else: + chunk_index_value = chunk.index_value + + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(chunk.shape[0],), + dtype=out.dtype, + index_value=chunk_index_value, + name=out.name, + index=(chunk.index[0],), + ) + out_chunks.append(out_chunk) + + params = out.params + params["nsplits"] = (arg.nsplits[0],) + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameToDatetime"): + arg = ctx[op.arg.key] + + unit = op.unit + if cudf and op.gpu: + func = cudf.to_datetime + if unit is None: + unit = "ns" + else: + func = pd.to_datetime + + call = partial( + func, + errors=op.errors, + dayfirst=op.dayfirst, + yearfirst=op.yearfirst, + utc=op.utc, + format=op.format, + exact=op.exact, + unit=unit, + infer_datetime_format=op.infer_datetime_format, + origin=op.origin, + cache=op.cache, + ) + + try: + ctx[op.outputs[0].key] = call(arg) + except ValueError: # pragma: no cover + ctx[op.outputs[0].key] = call(arg.copy()) + + +def to_datetime( + arg, + errors: str = "raise", + dayfirst: bool = False, + yearfirst: bool = False, + utc: bool = None, + format: str = None, + exact: bool = True, + unit: str = None, + infer_datetime_format: bool = False, + origin: Any = "unix", + cache: bool = True, +): + """ + Convert argument to datetime. + + Parameters + ---------- + arg : int, float, str, datetime, list, tuple, 1-d array, Series DataFrame/dict-like + The object to convert to a datetime. + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaT. + - If 'ignore', then invalid parsing will return the input. + dayfirst : bool, default False + Specify a date parse order if `arg` is str or its list-likes. + If True, parses dates with the day first, eg 10/11/12 is parsed as + 2012-11-10. + Warning: dayfirst=True is not strict, but will prefer to parse + with day first (this is a known bug, based on dateutil behavior). + yearfirst : bool, default False + Specify a date parse order if `arg` is str or its list-likes. + + - If True parses dates with the year first, eg 10/11/12 is parsed as + 2010-11-12. + - If both dayfirst and yearfirst are True, yearfirst is preceded (same + as dateutil). + + Warning: yearfirst=True is not strict, but will prefer to parse + with year first (this is a known bug, based on dateutil behavior). + utc : bool, default None + Return UTC DatetimeIndex if True (converting any tz-aware + datetime.datetime objects as well). + format : str, default None + The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse + all the way up to nanoseconds. + See strftime documentation for more information on choices: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + exact : bool, True by default + Behaves as: + - If True, require an exact format match. + - If False, allow the format to match anywhere in the target string. + + unit : str, default 'ns' + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an + integer or float number. This will be based off the origin. 
+ Example, with unit='ms' and origin='unix' (the default), this + would calculate the number of milliseconds to the unix epoch start. + infer_datetime_format : bool, default False + If True and no `format` is given, attempt to infer the format of the + datetime strings, and if it can be inferred, switch to a faster + method of parsing them. In some cases this can increase the parsing + speed by ~5-10x. + origin : scalar, default 'unix' + Define the reference date. The numeric values would be parsed as number + of units (defined by `unit`) since this reference date. + + - If 'unix' (or POSIX) time; origin is set to 1970-01-01. + - If 'julian', unit must be 'D', and origin is set to beginning of + Julian Calendar. Julian day number 0 is assigned to the day starting + at noon on January 1, 4713 BC. + - If Timestamp convertible, origin is set to Timestamp identified by + origin. + cache : bool, default True + If True, use a cache of unique, converted dates to apply the datetime + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. The cache is only + used when there are at least 50 values. The presence of out-of-bounds + values will render the cache unusable and may slow down parsing. + + Returns + ------- + datetime + If parsing succeeded. + Return type depends on input: + + - list-like: DatetimeIndex + - Series: Series of datetime64 dtype + - scalar: Timestamp + + In case when it is not possible to return designated types (e.g. when + any element of input is before Timestamp.min or after Timestamp.max) + return will have datetime.datetime type (or corresponding + array/Series). + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_timedelta : Convert argument to timedelta. + convert_dtypes : Convert dtypes. + + Examples + -------- + Assembling a datetime from multiple columns of a DataFrame. The keys can be + common abbreviations like ['year', 'month', 'day', 'minute', 'second', + 'ms', 'us', 'ns']) or plurals of the same + + >>> import mars.dataframe as md + + >>> df = md.DataFrame({'year': [2015, 2016], + ... 'month': [2, 3], + ... 'day': [4, 5]}) + >>> md.to_datetime(df).execute() + 0 2015-02-04 + 1 2016-03-05 + dtype: datetime64[ns] + + If a date does not meet the `timestamp limitations + `_, passing errors='ignore' + will return the original input instead of raising any exception. + + Passing errors='coerce' will force an out-of-bounds date to NaT, + in addition to forcing non-dates (or non-parseable dates) to NaT. + + >>> md.to_datetime('13000101', format='%Y%m%d', errors='ignore').execute() + datetime.datetime(1300, 1, 1, 0, 0) + >>> md.to_datetime('13000101', format='%Y%m%d', errors='coerce').execute() + NaT + + Passing infer_datetime_format=True can often-times speedup a parsing + if its not an ISO8601 format exactly, but in a regular format. + + >>> s = md.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000) + >>> s.head().execute() + 0 3/11/2000 + 1 3/12/2000 + 2 3/13/2000 + 3 3/11/2000 + 4 3/12/2000 + dtype: object + + Using a unix epoch time + + >>> md.to_datetime(1490195805, unit='s').execute() + Timestamp('2017-03-22 15:16:45') + >>> md.to_datetime(1490195805433502912, unit='ns').execute() + Timestamp('2017-03-22 15:16:45.433502912') + + .. warning:: For float arg, precision rounding might happen. To prevent + unexpected behavior use a fixed-width exact type. + + Using a non-unix epoch origin + + >>> md.to_datetime([1, 2, 3], unit='D', + ... 
origin=md.Timestamp('1960-01-01')).execute() + DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], \ +dtype='datetime64[ns]', freq=None) + """ + op = DataFrameToDatetime( + errors=errors, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + format=format, + exact=exact, + unit=unit, + infer_datetime_format=infer_datetime_format, + origin=origin, + cache=cache, + ) + return op(arg) diff --git a/python/xorbits/_mars/dataframe/ufunc/__init__.py b/python/xorbits/_mars/dataframe/ufunc/__init__.py new file mode 100644 index 000000000..d49856b7c --- /dev/null +++ b/python/xorbits/_mars/dataframe/ufunc/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def _install(): + from ..core import DataFrame, Series + from .tensor import _tensor_ufunc + from .ufunc import _array_ufunc + + for Entity in (DataFrame, Series): + Entity.__array_ufunc__ = _array_ufunc + Entity.__tensor_ufunc__ = _tensor_ufunc + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/ufunc/tensor.py b/python/xorbits/_mars/dataframe/ufunc/tensor.py new file mode 100644 index 000000000..2d91fa942 --- /dev/null +++ b/python/xorbits/_mars/dataframe/ufunc/tensor.py @@ -0,0 +1,54 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils import classproperty + +_tensor_op_to_df_op = dict() + + +def register_tensor_ufunc(op): + _tensor_op_to_df_op[op.tensor_op_type] = op + + +def get_tensor_ufunc_implementation(tensor_op): + if tensor_op in _tensor_op_to_df_op: + return _tensor_op_to_df_op[tensor_op] + + +class TensorUfuncMixin: + __slots__ = () + + @classproperty + def tensor_op_type(self): + raise NotImplementedError + + @classmethod + def ufunc_call(cls, tensor_op, inputs, out, where, **kw): + if out is not None: + return NotImplemented + if where is not None: + raise NotImplementedError + + try: + op = _tensor_op_to_df_op[tensor_op](**kw) + return op(*inputs) + except (KeyError, TypeError): + return NotImplemented + + +def _tensor_ufunc(_, tensor_op, inputs, out, where, **kw): + op = get_tensor_ufunc_implementation(tensor_op) + if op is not None: + return op.ufunc_call(tensor_op, inputs, out, where, **kw) + return NotImplemented diff --git a/python/xorbits/_mars/dataframe/ufunc/ufunc.py b/python/xorbits/_mars/dataframe/ufunc/ufunc.py new file mode 100644 index 000000000..7f4a7fe61 --- /dev/null +++ b/python/xorbits/_mars/dataframe/ufunc/ufunc.py @@ -0,0 +1,53 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Number + +from ...tensor import tensor as astensor +from ...tensor.ufunc.ufunc import UFUNC_TO_TENSOR_FUNCS +from ..core import DATAFRAME_TYPE, SERIES_TYPE + + +def _check_arg(arg): + if isinstance(arg, Number): + return True + + if isinstance(arg, (DATAFRAME_TYPE, SERIES_TYPE)): + return True + + try: + astensor(arg) + return True + except ValueError: + return False + + +def _array_ufunc(_, ufunc, method, *inputs, **kwargs): + out = kwargs.get("out", tuple()) + for x in inputs + out: + if not _check_arg(x): + return NotImplemented + + if ufunc.signature is not None: + return NotImplemented + if ufunc not in UFUNC_TO_TENSOR_FUNCS: + return NotImplemented + + # we delegate numpy ufunc to tensor ufunc, + # tensor ufunc will handle Mars DataFrame properly. + try: + tensor_func = getattr(UFUNC_TO_TENSOR_FUNCS[ufunc], method) + return tensor_func(*inputs, **kwargs) + except (AttributeError, NotImplementedError): + return NotImplemented diff --git a/python/xorbits/_mars/dataframe/utils.py b/python/xorbits/_mars/dataframe/utils.py new file mode 100644 index 000000000..4a4bdfb8a --- /dev/null +++ b/python/xorbits/_mars/dataframe/utils.py @@ -0,0 +1,1579 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
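(Editorial aside, not part of the diff.) The ufunc package added above wires NumPy's __array_ufunc__ protocol into Mars DataFrames and Series: _install() attaches _array_ufunc, which looks the ufunc up in UFUNC_TO_TENSOR_FUNCS and delegates to the Mars tensor implementation, returning NotImplemented for anything it cannot handle (generalized ufuncs, out= arguments, unregistered ufuncs). A sketch of the effect, assuming np.sqrt is among the registered ufuncs and the vendored package imports as xorbits._mars:

import numpy as np
import pandas as pd

from xorbits._mars import dataframe as md  # assumed import path for the vendored Mars

raw = pd.DataFrame(np.random.rand(4, 3), columns=list("abc"))
df = md.DataFrame(raw, chunk_size=2)

# NumPy cannot operate on df directly, so it calls df.__array_ufunc__ (the hook
# installed by _install above); the hook maps np.sqrt to the Mars tensor ufunc
# and returns a lazy Mars DataFrame instead of an ndarray.
r = np.sqrt(df)
pd.testing.assert_frame_equal(r.execute().fetch(), np.sqrt(raw))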
+ +import functools +import inspect +import itertools +import logging +import operator +import os +import sys +from contextlib import contextmanager +from numbers import Integral +from typing import Any, List, Union + +import cloudpickle +import numpy as np +import pandas as pd +from pandas.api.extensions import ExtensionDtype +from pandas.api.types import is_string_dtype +from pandas.core.dtypes.cast import find_common_type + +from ..config import options +from ..core import Entity, ExecutableTuple +from ..core.context import Context, get_context +from ..lib.mmh3 import hash as mmh_hash +from ..tensor.utils import dictify_chunk_size, normalize_chunk_sizes +from ..typing import ChunkType, TileableType +from ..utils import ( + ModulePlaceholder, + is_full_slice, + is_on_ray, + lazy_import, + parse_readable_size, + parse_version, + sbytes, + tokenize, +) + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = ModulePlaceholder("pyarrow") + +cudf = lazy_import("cudf", rename="cudf") +vineyard = lazy_import("vineyard") +try: + import ray + + ray_release_version = parse_version(ray.__version__).release + ray_deprecate_ml_dataset = ray_release_version[:2] >= (2, 0) +except ImportError: + ray_release_version = None + ray_deprecate_ml_dataset = None +logger = logging.getLogger(__name__) + + +def hash_index(index, size): + def func(x, size): + return mmh_hash(sbytes(x)) % size + + f = functools.partial(func, size=size) + idx_to_grouped = index.groupby(index.map(f)) + return [idx_to_grouped.get(i, list()) for i in range(size)] + + +def hash_dataframe_on(df, on, size, level=None): + if on is None: + idx = df.index + if level is not None: + idx = idx.to_frame(False)[level] + if cudf and isinstance(idx, cudf.Index): # pragma: no cover + idx = idx.to_pandas() + hashed_label = pd.util.hash_pandas_object(idx, categorize=False) + elif callable(on): + # todo optimization can be added, if ``on`` is a numpy ufunc or sth can be vectorized + hashed_label = pd.util.hash_pandas_object(df.index.map(on), categorize=False) + else: + if isinstance(on, list): + to_concat = [] + for v in on: + if isinstance(v, pd.Series): + to_concat.append(v) + else: + to_concat.append(df[v]) + data = pd.concat(to_concat, axis=1) + else: + data = df[on] + hashed_label = pd.util.hash_pandas_object(data, index=False, categorize=False) + idx_to_grouped = pd.RangeIndex(0, len(hashed_label)).groupby(hashed_label % size) + return [idx_to_grouped.get(i, pd.Index([])) for i in range(size)] + + +def hash_dtypes(dtypes, size): + hashed_indexes = hash_index(dtypes.index, size) + return [dtypes[index] for index in hashed_indexes] + + +def sort_dataframe_inplace(df, *axis): + for ax in axis: + df.sort_index(axis=ax, inplace=True) + return df + + +@functools.lru_cache(1) +def _get_range_index_type(): + if cudf is not None: + return pd.RangeIndex, cudf.RangeIndex + else: + return pd.RangeIndex + + +@functools.lru_cache(1) +def _get_multi_index_type(): + if cudf is not None: + return pd.MultiIndex, cudf.MultiIndex + else: + return pd.MultiIndex + + +def _get_range_index_start(pd_range_index): + try: + return pd_range_index.start + except AttributeError: # pragma: no cover + return pd_range_index._start + + +def _get_range_index_stop(pd_range_index): + try: + return pd_range_index.stop + except AttributeError: # pragma: no cover + return pd_range_index._stop + + +def _get_range_index_step(pd_range_index): + try: + return pd_range_index.step + except AttributeError: # pragma: no cover + pass + try: # pragma: no cover + return 
pd_range_index._step + except AttributeError: # pragma: no cover + return 1 # cudf does not support step arg + + +def is_pd_range_empty(pd_range_index): + start, stop, step = ( + _get_range_index_start(pd_range_index), + _get_range_index_stop(pd_range_index), + _get_range_index_step(pd_range_index), + ) + return (start >= stop and step >= 0) or (start <= stop and step < 0) + + +def decide_dataframe_chunk_sizes(shape, chunk_size, memory_usage): + """ + Decide how a given DataFrame can be split into chunk. + + :param shape: DataFrame's shape + :param chunk_size: if dict provided, it's dimension id to chunk size; + if provided, it's the chunk size for each dimension. + :param memory_usage: pandas Series in which each column's memory usage + :type memory_usage: pandas.Series + :return: the calculated chunk size for each dimension + :rtype: tuple + """ + chunk_size = dictify_chunk_size(shape, chunk_size) + average_memory_usage = memory_usage / shape[0] + + nleft = len(shape) - len(chunk_size) + if nleft < 0: + raise ValueError("chunks have more than two dimensions") + if nleft == 0: + return normalize_chunk_sizes( + shape, tuple(chunk_size[j] for j in range(len(shape))) + ) + + max_chunk_size = options.chunk_store_limit + + # for the row side, along axis 0 + if 0 not in chunk_size: + row_chunk_size = [] + row_left_size = shape[0] + else: + row_chunk_size = normalize_chunk_sizes((shape[0],), (chunk_size[0],))[0] + row_left_size = -1 + # for the column side, along axis 1 + if 1 not in chunk_size: + col_chunk_size = [] + col_chunk_store = [] + col_left_size = shape[1] + else: + col_chunk_size = normalize_chunk_sizes((shape[1],), (chunk_size[1],))[0] + acc = [0] + np.cumsum(col_chunk_size).tolist() + col_chunk_store = [ + average_memory_usage[acc[i] : acc[i + 1]].sum() + for i in range(len(col_chunk_size)) + ] + col_left_size = -1 + + while True: + nbytes_occupied = np.prod( + [max(it) for it in (row_chunk_size, col_chunk_store) if it] + ) + dim_size = np.maximum( + int(np.power(max_chunk_size / nbytes_occupied, 1 / float(nleft))), 1 + ) + + if col_left_size == 0 and not col_chunk_size: + col_chunk_size.append(0) + + if row_left_size == 0 and not row_chunk_size: + row_chunk_size.append(0) + + # check col first + if col_left_size > 0: + cs = min(col_left_size, dim_size) + col_chunk_size.append(cs) + start = int(np.sum(col_chunk_size[:-1])) + col_chunk_store.append(average_memory_usage.iloc[start : start + cs].sum()) + col_left_size -= cs + if row_left_size > 0: + if col_chunk_store: + max_col_chunk_store = max(col_chunk_store) + cs = min(row_left_size, int(max_chunk_size / max_col_chunk_store)) + else: + cs = row_left_size + row_chunk_size.append(cs) + row_left_size -= cs + + if col_left_size <= 0 and row_left_size <= 0: + break + + return tuple(row_chunk_size), tuple(col_chunk_size) + + +def decide_series_chunk_size(shape, chunk_size, memory_usage): + chunk_size = dictify_chunk_size(shape, chunk_size) + average_memory_usage = memory_usage / shape[0] if shape[0] != 0 else memory_usage + + if len(chunk_size) == len(shape): + return normalize_chunk_sizes(shape, chunk_size[0]) + + if all(s == 0 for s in shape): + # skip when shape is 0 + return tuple((s,) for s in shape) + + max_chunk_size = options.chunk_store_limit + series_chunk_size = max_chunk_size / average_memory_usage + return normalize_chunk_sizes(shape, int(series_chunk_size)) + + +def parse_index(index_value, *args, store_data=False, key=None): + from .core import IndexValue + + def _extract_property(index, tp, ret_data): + kw = { + 
"_min_val": _get_index_min(index), + "_max_val": _get_index_max(index), + "_min_val_close": True, + "_max_val_close": True, + "_key": key or _tokenize_index(index, *args), + } + if ret_data: + kw["_data"] = index.values + for field in tp._FIELDS: + if field in kw or field == "_data": + continue + val = getattr(index, field.lstrip("_"), None) + if val is not None: + kw[field] = val + return kw + + def _tokenize_index(index, *token_objects): + if not index.empty: + return tokenize(index) + else: + return tokenize(index, *token_objects) + + def _get_index_min(index): + try: + return index.min() + except (ValueError, AttributeError): + if isinstance(index, pd.IntervalIndex): + return None + raise + except TypeError: + return None + + def _get_index_max(index): + try: + return index.max() + except (ValueError, AttributeError): + if isinstance(index, pd.IntervalIndex): + return None + raise + except TypeError: + return None + + def _serialize_index(index): + tp = getattr(IndexValue, type(index).__name__) + properties = _extract_property(index, tp, store_data) + properties["_name"] = index.name + return tp(**properties) + + def _serialize_range_index(index): + if is_pd_range_empty(index): + properties = { + "_is_monotonic_increasing": True, + "_is_monotonic_decreasing": False, + "_is_unique": True, + "_min_val": _get_index_min(index), + "_max_val": _get_index_max(index), + "_min_val_close": True, + "_max_val_close": False, + "_key": key or _tokenize_index(index, *args), + "_name": index.name, + "_dtype": index.dtype, + } + else: + properties = _extract_property(index, IndexValue.RangeIndex, False) + return IndexValue.RangeIndex( + _slice=slice( + _get_range_index_start(index), + _get_range_index_stop(index), + _get_range_index_step(index), + ), + **properties, + ) + + def _serialize_multi_index(index): + kw = _extract_property(index, IndexValue.MultiIndex, store_data) + kw["_sortorder"] = index.sortorder + kw["_dtypes"] = [lev.dtype for lev in index.levels] + return IndexValue.MultiIndex(**kw) + + if index_value is None: + return IndexValue( + _index_value=IndexValue.Index( + _is_monotonic_increasing=False, + _is_monotonic_decreasing=False, + _is_unique=False, + _min_val=None, + _max_val=None, + _min_val_close=True, + _max_val_close=True, + _key=key or tokenize(*args), + ) + ) + if hasattr(index_value, "to_pandas"): # pragma: no cover + # convert cudf.Index to pandas + index_value = index_value.to_pandas() + + if isinstance(index_value, _get_range_index_type()): + return IndexValue(_index_value=_serialize_range_index(index_value)) + elif isinstance(index_value, _get_multi_index_type()): + return IndexValue(_index_value=_serialize_multi_index(index_value)) + else: + return IndexValue(_index_value=_serialize_index(index_value)) + + +def gen_unknown_index_value(index_value, *args): + pd_index = index_value.to_pandas() + if isinstance(pd_index, pd.RangeIndex): + return parse_index(pd.RangeIndex(-1), *args) + elif not isinstance(pd_index, pd.MultiIndex): + return parse_index(pd.Index([], dtype=pd_index.dtype), *args) + else: + i = pd.MultiIndex.from_arrays( + [c[:0] for c in pd_index.levels], names=pd_index.names + ) + return parse_index(i, *args) + + +def split_monotonic_index_min_max( + left_min_max, left_increase, right_min_max, right_increase +): + """ + Split the original two min_max into new min_max. Each min_max should be a list + in which each item should be a 4-tuple indicates that this chunk's min value, + whether the min value is close, the max value, and whether the max value is close. 
+ The return value would be a nested list, each item is a list + indicates that how this chunk should be split into. + + :param left_min_max: the left min_max + :param left_increase: if the original data of left is increased + :param right_min_max: the right min_max + :param right_increase: if the original data of right is increased + :return: nested list in which each item indicates how min_max is split + + >>> left_min_max = [(0, True, 3, True), (4, True, 8, True), (12, True, 18, True), + ... (20, True, 22, True)] + >>> right_min_max = [(2, True, 6, True), (7, True, 9, True), (10, True, 14, True), + ... (18, True, 19, True)] + >>> l, r = split_monotonic_index_min_max(left_min_max, True, right_min_max, True) + >>> l + [[(0, True, 2, False), (2, True, 3, True)], [(3, False, 4, False), (4, True, 6, True), (6, False, 7, False), + (7, True, 8, True)], [(8, False, 9, True), (10, True, 12, False), (12, True, 14, True), (14, False, 18, False), + (18, True, 18, True)], [(18, False, 19, True), [20, True, 22, True]]] + >>> r + [[(0, True, 2, False), (2, True, 3, True), (3, False, 4, False), (4, True, 6, True)], + [(6, False, 7, False), (7, True, 8, True), (8, False, 9, True)], [(10, True, 12, False), (12, True, 14, True)], + [(14, False, 18, False), (18, True, 18, True), (18, False, 19, True), [20, True, 22, True]]] + """ + left_idx_to_min_max = [[] for _ in left_min_max] + right_idx_to_min_max = [[] for _ in right_min_max] + left_curr_min_max = list(left_min_max[0]) + right_curr_min_max = list(right_min_max[0]) + left_curr_idx = right_curr_idx = 0 + left_terminate = right_terminate = False + + while not left_terminate or not right_terminate: + if left_terminate: + left_idx_to_min_max[left_curr_idx].append(tuple(right_curr_min_max)) + right_idx_to_min_max[right_curr_idx].append(tuple(right_curr_min_max)) + if right_curr_idx + 1 >= len(right_min_max): + right_terminate = True + else: + right_curr_idx += 1 + right_curr_min_max = list(right_min_max[right_curr_idx]) + elif right_terminate: + right_idx_to_min_max[right_curr_idx].append(tuple(left_curr_min_max)) + left_idx_to_min_max[left_curr_idx].append(tuple(left_curr_min_max)) + if left_curr_idx + 1 >= len(left_min_max): + left_terminate = True + else: + left_curr_idx += 1 + left_curr_min_max = list(left_min_max[left_curr_idx]) + elif left_curr_min_max[0] < right_curr_min_max[0]: + # left min < right min + right_min = [right_curr_min_max[0], not right_curr_min_max[1]] + max_val = min(left_curr_min_max[2:], right_min) + assert len(max_val) == 2 + min_max = ( + left_curr_min_max[0], + left_curr_min_max[1], + max_val[0], + max_val[1], + ) + left_idx_to_min_max[left_curr_idx].append(min_max) + right_idx_to_min_max[right_curr_idx].append(min_max) + if left_curr_min_max[2:] == max_val: + # left max < right min + if left_curr_idx + 1 >= len(left_min_max): + left_terminate = True + else: + left_curr_idx += 1 + left_curr_min_max = list(left_min_max[left_curr_idx]) + else: + # from left min(left min close) to right min(exclude right min close) + left_curr_min_max[:2] = right_curr_min_max[:2] + elif left_curr_min_max[0] > right_curr_min_max[0]: + # left min > right min + left_min = [left_curr_min_max[0], not left_curr_min_max[1]] + max_val = min(right_curr_min_max[2:], left_min) + min_max = ( + right_curr_min_max[0], + right_curr_min_max[1], + max_val[0], + max_val[1], + ) + left_idx_to_min_max[left_curr_idx].append(min_max) + right_idx_to_min_max[right_curr_idx].append(min_max) + if right_curr_min_max[2:] == max_val: + # right max < left min + if 
right_curr_idx + 1 >= len(right_min_max): + right_terminate = True + else: + right_curr_idx += 1 + right_curr_min_max = list(right_min_max[right_curr_idx]) + else: + # from left min(left min close) to right min(exclude right min close) + right_curr_min_max[:2] = left_curr_min_max[:2] + else: + # left min == right min + max_val = min(left_curr_min_max[2:], right_curr_min_max[2:]) + assert len(max_val) == 2 + min_max = ( + left_curr_min_max[0], + left_curr_min_max[1], + max_val[0], + max_val[1], + ) + left_idx_to_min_max[left_curr_idx].append(min_max) + right_idx_to_min_max[right_curr_idx].append(min_max) + if max_val == left_curr_min_max[2:]: + if left_curr_idx + 1 >= len(left_min_max): + left_terminate = True + else: + left_curr_idx += 1 + left_curr_min_max = list(left_min_max[left_curr_idx]) + else: + left_curr_min_max[:2] = max_val[0], not max_val[1] + if max_val == right_curr_min_max[2:]: + if right_curr_idx + 1 >= len(right_min_max): + right_terminate = True + else: + right_curr_idx += 1 + right_curr_min_max = list(right_min_max[right_curr_idx]) + else: + right_curr_min_max[:2] = max_val[0], not max_val[1] + + if left_increase is False: + left_idx_to_min_max = list(reversed(left_idx_to_min_max)) + if right_increase is False: + right_idx_to_min_max = list(reversed(right_idx_to_min_max)) + + return left_idx_to_min_max, right_idx_to_min_max + + +def build_split_idx_to_origin_idx(splits, increase=True): + # splits' len is equal to the original chunk size on a specified axis, + # splits is sth like [[(0, True, 2, True), (2, False, 3, True)]] + # which means there is one input chunk, and will be split into 2 out chunks + # in this function, we want to build a new dict from the out chunk index to + # the original chunk index and the inner position, like {0: (0, 0), 1: (0, 1)} + if increase is False: + splits = list(reversed(splits)) + out_idx = itertools.count(0) + res = dict() + for origin_idx, _ in enumerate(splits): + for pos in range(len(splits[origin_idx])): + if increase is False: + o_idx = len(splits) - origin_idx - 1 + else: + o_idx = origin_idx + res[next(out_idx)] = o_idx, pos + return res + + +def _generate_value(dtype, fill_value): + # special handle for datetime64 and timedelta64 + dispatch = { + np.datetime64: pd.Timestamp, + np.timedelta64: pd.Timedelta, + pd.CategoricalDtype.type: lambda x: pd.CategoricalDtype([x]), + # for object, we do not know the actual dtype, + # just convert to str for common usage + np.object_: lambda x: str(fill_value), + } + # otherwise, just use dtype.type itself to convert + convert = dispatch.get(dtype.type, dtype.type) + return convert(fill_value) + + +def build_empty_df(dtypes, index=None): + columns = dtypes.index + length = len(index) if index is not None else 0 + record = [[_generate_value(dtype, 1) for dtype in dtypes]] * max(1, length) + + # duplicate column may exist, + # so use RangeIndex first + df = pd.DataFrame(record, columns=range(len(dtypes)), index=index) + for i, dtype in enumerate(dtypes): + s = df.iloc[:, i] + if not pd.api.types.is_dtype_equal(s.dtype, dtype): + df.iloc[:, i] = s.astype(dtype) + + df.columns = columns + return df[:length] if len(df) > length else df + + +def build_df(df_obj, fill_value=1, size=1, ensure_string=False): + dfs = [] + if not isinstance(size, (list, tuple)): + sizes = [size] + else: + sizes = size + + if not isinstance(fill_value, (list, tuple)): + fill_values = [fill_value] + else: + fill_values = fill_value + + for size, fill_value in zip(sizes, fill_values): + dtypes = df_obj.dtypes + record = 
[[_generate_value(dtype, fill_value) for dtype in dtypes]] * size + df = pd.DataFrame(record) + df.columns = dtypes.index + + if len(record) != 0: # columns is empty in some cases + target_index = df_obj.index_value.to_pandas() + if isinstance(target_index, pd.MultiIndex): + index_val = tuple( + _generate_value(level.dtype, fill_value) + for level in target_index.levels + ) + df.index = pd.MultiIndex.from_tuples( + [index_val] * size, names=target_index.names + ) + else: + index_val = _generate_value(target_index.dtype, fill_value) + df.index = pd.Index([index_val] * size, name=target_index.name) + + # make sure dtypes correct + for i, dtype in enumerate(dtypes): + s = df.iloc[:, i] + if not pd.api.types.is_dtype_equal(s.dtype, dtype): + df[df.columns[i]] = s.astype(dtype) + dfs.append(df) + if len(dfs) == 1: + ret_df = dfs[0] + else: + ret_df = pd.concat(dfs) + + if ensure_string: + obj_dtypes = df_obj.dtypes[df_obj.dtypes == np.dtype("O")] + ret_df[obj_dtypes.index] = ret_df[obj_dtypes.index].radd("O") + return ret_df + + +def build_empty_series(dtype, index=None, name=None): + length = len(index) if index is not None else 0 + return pd.Series( + [_generate_value(dtype, 1) for _ in range(length)], + dtype=dtype, + index=index, + name=name, + ) + + +def build_series( + series_obj=None, + fill_value=1, + size=1, + name=None, + ensure_string=False, + dtype=None, + index=None, +): + seriess = [] + if not isinstance(size, (list, tuple)): + sizes = [size] + else: + sizes = size + + if not isinstance(fill_value, (list, tuple)): + fill_values = [fill_value] + else: + fill_values = fill_value + + if series_obj is not None: + dtype = series_obj.dtype + try: + series_index = series_obj.index_value.to_pandas()[:0] + except AttributeError: + series_index = series_obj.index[:0] + else: + series_index = index[:0] if index is not None else None + + for size, fill_value in zip(sizes, fill_values): + empty_series = build_empty_series(dtype, name=name, index=series_index) + record = _generate_value(dtype, fill_value) + if isinstance(empty_series.index, pd.MultiIndex): + index = tuple( + _generate_value(level.dtype, fill_value) + for level in empty_series.index.levels + ) + empty_series = empty_series.reindex( + index=pd.MultiIndex.from_tuples([index], names=empty_series.index.names) + ) + empty_series.iloc[0] = record + else: + if isinstance(empty_series.index.dtype, pd.CategoricalDtype): + index = None + else: + index = _generate_value(empty_series.index.dtype, fill_value) + empty_series.loc[index] = record + + empty_series = pd.concat([empty_series] * size) + # make sure dtype correct for MultiIndex + empty_series = empty_series.astype(dtype, copy=False) + seriess.append(empty_series) + + if len(seriess) == 1: + ret_series = seriess[0] + else: + ret_series = pd.concat(seriess) + + if ensure_string and dtype == np.dtype("O"): + ret_series = ret_series.radd("O") + return ret_series + + +def concat_index_value(index_values, store_data=False): + if not isinstance(index_values, (list, tuple)): + index_values = [index_values] + result = index_values[0] + if not isinstance(result, pd.Index): + result = result.to_pandas() + for index_value in index_values[1:]: + if isinstance(index_value, pd.Index): + result = result.append(index_value) + else: + result = result.append(index_value.to_pandas()) + return parse_index(result, store_data=store_data) + + +def build_concatenated_rows_frame(df): + from ..core import OutputType + from .merge.concat import DataFrameConcat + + # When the df isn't split along the column 
axis, return the df directly. + if df.chunk_shape[1] == 1: + return df + + columns = concat_index_value( + [df.cix[0, idx].columns_value for idx in range(df.chunk_shape[1])], + store_data=True, + ) + columns_size = columns.to_pandas().size + + out_chunks = [] + for idx in range(df.chunk_shape[0]): + out_chunk = DataFrameConcat( + axis=1, output_types=[OutputType.dataframe] + ).new_chunk( + [df.cix[idx, k] for k in range(df.chunk_shape[1])], + index=(idx, 0), + shape=(df.cix[idx, 0].shape[0], columns_size), + dtypes=df.dtypes, + index_value=df.cix[idx, 0].index_value, + columns_value=columns, + ) + out_chunks.append(out_chunk) + + return DataFrameConcat(axis=1, output_types=[OutputType.dataframe]).new_dataframe( + [df], + chunks=out_chunks, + nsplits=(tuple(chunk.shape[0] for chunk in out_chunks), (df.shape[1],)), + shape=df.shape, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + +def is_index_value_identical(left: TileableType, right: TileableType) -> bool: + if ( + left.index_value.key == right.index_value.key + and not np.isnan(sum(left.nsplits[0])) + and not np.isnan(sum(right.nsplits[0])) + and left.nsplits[0] == right.nsplits[0] + ): + is_identical = True + else: + target_chunk_index_values = [ + c.index_value for c in left.chunks if len(c.index) <= 1 or c.index[1] == 0 + ] + value_chunk_index_values = [v.index_value for v in right.chunks] + is_identical = len(target_chunk_index_values) == len( + value_chunk_index_values + ) and all( + c.key == v.key + for c, v in zip(target_chunk_index_values, value_chunk_index_values) + ) + return is_identical + + +def _filter_range_index(pd_range_index, min_val, min_val_close, max_val, max_val_close): + if is_pd_range_empty(pd_range_index): + return pd_range_index + + raw_min, raw_max, step = ( + pd_range_index.min(), + pd_range_index.max(), + _get_range_index_step(pd_range_index), + ) + + # seek min range + greater_func = operator.gt if min_val_close else operator.ge + actual_min = raw_min + while greater_func(min_val, actual_min): + actual_min += abs(step) + if step < 0: + actual_min += step # on the right side + + # seek max range + less_func = operator.lt if max_val_close else operator.le + actual_max = raw_max + while less_func(max_val, actual_max): + actual_max -= abs(step) + if step > 0: + actual_max += step # on the right side + + if step > 0: + return pd.RangeIndex(actual_min, actual_max, step) + return pd.RangeIndex(actual_max, actual_min, step) + + +def infer_index_value(left_index_value, right_index_value): + from .core import IndexValue + + if isinstance(left_index_value.value, IndexValue.RangeIndex) and isinstance( + right_index_value.value, IndexValue.RangeIndex + ): + if left_index_value.value.slice == right_index_value.value.slice: + return left_index_value + return parse_index( + pd.Index([], dtype=np.int64), left_index_value, right_index_value + ) + + # when left index and right index is identical, and both of them are elements unique, + # we can infer that the out index should be identical also + if ( + left_index_value.is_unique + and right_index_value.is_unique + and left_index_value.key == right_index_value.key + ): + return left_index_value + + left_index = left_index_value.to_pandas() + right_index = right_index_value.to_pandas() + out_index = pd.Index( + [], dtype=find_common_type([left_index.dtype, right_index.dtype]) + ) + return parse_index(out_index, left_index_value, right_index_value) + + +def filter_index_value(index_value, min_max, store_data=False): + from .core import 
IndexValue + + min_val, min_val_close, max_val, max_val_close = min_max + + pd_index = index_value.to_pandas() + + if isinstance(index_value.value, IndexValue.RangeIndex): + pd_filtered_index = _filter_range_index( + pd_index, min_val, min_val_close, max_val, max_val_close + ) + return parse_index(pd_filtered_index, store_data=store_data) + + if min_val_close: + f = pd_index >= min_val + else: + f = pd_index > min_val + if max_val_close: + f = f & (pd_index <= max_val) + else: + f = f & (pd_index < max_val) + + return parse_index(pd_index[f], store_data=store_data) + + +def indexing_index_value(index_value, indexes, store_data=False, rechunk=False): + pd_index = index_value.to_pandas() + # when rechunk is True, the output index shall be treated + # different from the input one + if not rechunk and isinstance(indexes, slice) and is_full_slice(indexes): + return index_value + elif not index_value.has_value(): + new_index_value = parse_index(pd_index, indexes, store_data=store_data) + new_index_value._index_value._min_val = index_value.min_val + new_index_value._index_value._min_val_close = index_value.min_val_close + new_index_value._index_value._max_val = index_value.max_val + new_index_value._index_value._max_val_close = index_value.max_val_close + return new_index_value + else: + if isinstance(indexes, Integral): + return parse_index(pd_index[[indexes]], store_data=store_data) + elif isinstance(indexes, Entity): + if isinstance(pd_index, pd.RangeIndex): + return parse_index( + pd.RangeIndex(-1), indexes, index_value, store_data=False + ) + else: + return parse_index( + type(pd_index)([]), indexes, index_value, store_data=False + ) + if isinstance(indexes, tuple): + return parse_index(pd_index[list(indexes)], store_data=store_data) + else: + return parse_index(pd_index[indexes], store_data=store_data) + + +def merge_index_value(to_merge_index_values: dict, store_data: bool = False): + """ + Merge index value according to their chunk index. 
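+    Index values are visited in ascending chunk-index order; their pandas
+    indexes are appended when data is stored or when both are contiguous
+    ``RangeIndex`` pieces, otherwise only the overall min/max bounds are
+    carried over to the merged index value.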
+ + Parameters + ---------- + to_merge_index_values : dict + index to index_value + store_data : bool + store data in index_value + + Returns + ------- + merged_index_value + """ + + pd_index = None + min_val, min_val_close, max_val, max_val_close = None, None, None, None + for _, chunk_index_value in sorted(to_merge_index_values.items()): + if pd_index is None: + pd_index = chunk_index_value.to_pandas() + min_val, min_val_close, max_val, max_val_close = ( + chunk_index_value.min_val, + chunk_index_value.min_val_close, + chunk_index_value.max_val, + chunk_index_value.max_val_close, + ) + else: + cur_pd_index = chunk_index_value.to_pandas() + if store_data or ( + isinstance(pd_index, pd.RangeIndex) + and isinstance(cur_pd_index, pd.RangeIndex) + and cur_pd_index.step == pd_index.step + and cur_pd_index.start == pd_index.stop + ): + # range index that is continuous + pd_index = pd_index.append(cur_pd_index) + else: + pd_index = pd.Index([], dtype=pd_index.dtype) + if chunk_index_value.min_val is not None: + try: + if min_val is None or min_val > chunk_index_value.min_val: + min_val = chunk_index_value.min_val + min_val_close = chunk_index_value.min_val_close + except TypeError: + # min_value has different types that cannot compare + # just stop compare + continue + if chunk_index_value.max_val is not None: + if max_val is None or max_val < chunk_index_value.max_val: + max_val = chunk_index_value.max_val + max_val_close = chunk_index_value.max_val_close + + index_value = parse_index(pd_index, store_data=store_data) + if not index_value.has_value(): + index_value._index_value._min_val = min_val + index_value._index_value._min_val_close = min_val_close + index_value._index_value._max_val = max_val + index_value._index_value._max_val_close = max_val_close + return index_value + + +def infer_dtypes(left_dtypes, right_dtypes, operator): + left = build_empty_df(left_dtypes) + right = build_empty_df(right_dtypes) + return operator(left, right).dtypes + + +@functools.lru_cache(100) +def infer_dtype(left_dtype, right_dtype, operator): + left = build_empty_series(left_dtype) + right = build_empty_series(right_dtype) + return operator(left, right).dtype + + +def filter_dtypes(dtypes, column_min_max): + left_filter = operator.ge if column_min_max[1] else operator.gt + left = left_filter(dtypes.index, column_min_max[0]) + right_filter = operator.le if column_min_max[3] else operator.lt + right = right_filter(dtypes.index, column_min_max[2]) + return dtypes[left & right] + + +def in_range_index(i, pd_range_index): + """ + Check whether the input `i` is within `pd_range_index` which is a pd.RangeIndex. 
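+    # Illustrative example (hypothetical values): for pd.RangeIndex(0, 10, 2)
+    # the members are 0, 2, 4, 6 and 8, so ``in_range_index(4, idx)`` is True
+    # while ``in_range_index(5, idx)`` and ``in_range_index(10, idx)`` are False.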
+ """ + start, stop, step = ( + _get_range_index_start(pd_range_index), + _get_range_index_stop(pd_range_index), + _get_range_index_step(pd_range_index), + ) + if step > 0 and start <= i < stop and (i - start) % step == 0: + return True + if step < 0 and start >= i > stop and (start - i) % step == 0: + return True + return False + + +def wrap_notimplemented_exception(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except NotImplementedError: + return NotImplemented + + return wrapper + + +def validate_axis(axis, tileable=None): + if axis == "index": + axis = 0 + elif axis == "columns": + axis = 1 + + illegal = False + try: + axis = operator.index(axis) + if axis < 0 or (tileable is not None and axis >= tileable.ndim): + illegal = True + except TypeError: + illegal = True + + if illegal: + raise ValueError(f"No axis named {axis} for object type {type(tileable)}") + return axis + + +def validate_axis_style_args( + data, args, kwargs, arg_name, method_name +): # pragma: no cover + """Argument handler for mixed index, columns / axis functions + + In an attempt to handle both `.method(index, columns)`, and + `.method(arg, axis=.)`, we have to do some bad things to argument + parsing. This translates all arguments to `{index=., columns=.}` style. + + Parameters + ---------- + data : DataFrame + args : tuple + All positional arguments from the user + kwargs : dict + All keyword arguments from the user + arg_name, method_name : str + Used for better error messages + + Returns + ------- + kwargs : dict + A dictionary of keyword arguments. Doesn't modify ``kwargs`` + inplace, so update them with the return value here. + """ + out = {} + # Goal: fill 'out' with index/columns-style arguments + # like out = {'index': foo, 'columns': bar} + + # Start by validating for consistency + axes_names = ["index"] if data.ndim == 1 else ["index", "columns"] + if "axis" in kwargs and any(x in kwargs for x in axes_names): + msg = "Cannot specify both 'axis' and any of 'index' or 'columns'." + raise TypeError(msg) + + # First fill with explicit values provided by the user... + if arg_name in kwargs: + if args: + msg = f"{method_name} got multiple values for argument '{arg_name}'" + raise TypeError(msg) + + axis = axes_names[validate_axis(kwargs.get("axis", 0), data)] + out[axis] = kwargs[arg_name] + + # More user-provided arguments, now from kwargs + for k, v in kwargs.items(): + try: + ax = axes_names[validate_axis(k, data)] + except ValueError: + pass + else: + out[ax] = v + + # All user-provided kwargs have been handled now. + # Now we supplement with positional arguments, emitting warnings + # when there's ambiguity and raising when there's conflicts + + if len(args) == 0: + pass # It's up to the function to decide if this is valid + elif len(args) == 1: + axis = axes_names[validate_axis(kwargs.get("axis", 0), data)] + out[axis] = args[0] + elif len(args) == 2: + if "axis" in kwargs: + # Unambiguously wrong + msg = "Cannot specify both 'axis' and any of 'index' or 'columns'" + raise TypeError(msg) + + msg = ( + "Interpreting call\n\t'.{method_name}(a, b)' as " + "\n\t'.{method_name}(index=a, columns=b)'.\nUse named " + "arguments to remove any ambiguity." + ) + raise TypeError(msg.format(method_name=method_name)) + else: + msg = f"Cannot specify all of '{arg_name}', 'index', 'columns'." 
+ raise TypeError(msg) + return out + + +def validate_output_types(**kwargs): + from ..core import OutputType + + output_type = kwargs.pop("object_type", None) or kwargs.pop("output_type", None) + output_types = kwargs.pop("output_types", None) or ( + [output_type] if output_type is not None else None + ) + return ( + [ + getattr(OutputType, v.lower()) if isinstance(v, str) else v + for v in output_types + ] + if output_types + else None + ) + + +def standardize_range_index(chunks: List[ChunkType], axis: int = 0): + from .base.standardize_range_index import ChunkStandardizeRangeIndex + + row_chunks = dict( + (k, next(v)) for k, v in itertools.groupby(chunks, key=lambda x: x.index[axis]) + ) + row_chunks = [row_chunks[i] for i in range(len(row_chunks))] + + out_chunks = [] + for c in chunks: + prev_chunks = row_chunks[: c.index[axis]] + op = ChunkStandardizeRangeIndex( + prev_shapes=[p.shape for p in prev_chunks], axis=axis + ) + op.output_types = c.op.output_types + params = c.params.copy() + start_pos = sum(p.shape[axis] for p in prev_chunks) + end_pos = start_pos + c.shape[axis] + index = pd.RangeIndex(start_pos, end_pos) + if axis == 0: + params["index_value"] = parse_index(index) + else: + dtypes = params["dtypes"] + dtypes.index = index + params["dtypes"] = dtypes + params["columns_value"] = parse_index(dtypes.index, store_data=True) + out_chunks.append(op.new_chunk([c], kws=[params])) + + return out_chunks + + +def fetch_corner_data(df_or_series, session=None) -> pd.DataFrame: + """ + Fetch corner DataFrame or Series for repr usage. + + :param df_or_series: DataFrame or Series + :return: corner DataFrame + """ + from .indexing.iloc import iloc + + max_rows = pd.get_option("display.max_rows") + try: + min_rows = pd.get_option("display.min_rows") + min_rows = min(min_rows, max_rows) + except KeyError: # pragma: no cover + # display.min_rows is introduced in pandas 0.25 + min_rows = max_rows + + index_size = None + if ( + df_or_series.shape[0] > max_rows + and df_or_series.shape[0] > min_rows // 2 * 2 + 2 + ): + # for pandas, greater than max_rows + # will display min_rows + # thus we fetch min_rows + 2 lines + index_size = min_rows // 2 + 1 + + if index_size is None: + return df_or_series._fetch(session=session) + else: + head = iloc(df_or_series)[:index_size] + tail = iloc(df_or_series)[-index_size:] + head_data, tail_data = ExecutableTuple([head, tail]).fetch(session=session) + xdf = cudf if head.op.is_gpu() else pd + return xdf.concat([head_data, tail_data], axis="index") + + +class ReprSeries(pd.Series): + def __init__(self, corner_data, real_shape): + super().__init__(corner_data) + self._real_shape = real_shape + + def __len__(self): + # As we only fetch corner data to repr, + # the length would be wrong and we have no way to control, + # thus we just overwrite the length to show the real one + return self._real_shape[0] + + +def filter_dtypes_by_index(dtypes, index): + try: + new_dtypes = dtypes.loc[index].dropna() + except KeyError: + dtypes_idx = ( + dtypes.index.to_frame() + .merge(index.to_frame()) + .set_index(list(range(dtypes.index.nlevels))) + .index + ) + new_dtypes = dtypes.loc[dtypes_idx] + new_dtypes.index.names = dtypes.index.names + return new_dtypes + + +@contextmanager +def create_sa_connection(con, **kwargs): + import sqlalchemy as sa + from sqlalchemy.engine import Connection, Engine + + # process con + engine = None + if isinstance(con, Connection): + # connection create by user + close = False + dispose = False + elif isinstance(con, Engine): + con = 
con.connect() + close = True + dispose = False + else: + engine = sa.create_engine(con, **kwargs) + con = engine.connect() + close = True + dispose = True + + try: + yield con + finally: + if close: + con.close() + if dispose: + engine.dispose() + + +def arrow_table_to_pandas_dataframe(arrow_table, use_arrow_dtype=True, **kw): + if not use_arrow_dtype: + # if not use arrow string, just return + return arrow_table.to_pandas(**kw) + + from .arrays import ArrowListArray, ArrowStringArray + + table: pa.Table = arrow_table + schema: pa.Schema = arrow_table.schema + + arrow_field_names = list() + arrow_arrays = list() + arrow_indexes = list() + other_field_names = list() + other_arrays = list() + for i, arrow_type in enumerate(schema.types): + if arrow_type == pa.string() or isinstance(arrow_type, pa.ListType): + arrow_field_names.append(schema.names[i]) + arrow_indexes.append(i) + arrow_arrays.append(table.columns[i]) + else: + other_field_names.append(schema.names[i]) + other_arrays.append(table.columns[i]) + + df: pd.DataFrame = pa.Table.from_arrays( + other_arrays, names=other_field_names + ).to_pandas(**kw) + for arrow_index, arrow_name, arrow_array in zip( + arrow_indexes, arrow_field_names, arrow_arrays + ): + if arrow_array.type == pa.string(): + series = pd.Series(ArrowStringArray(arrow_array)) + else: + assert isinstance(arrow_array.type, pa.ListType) + series = pd.Series(ArrowListArray(arrow_array)) + df.insert(arrow_index, arrow_name, series) + + return df + + +def contain_arrow_dtype(dtypes): + from .arrays import ArrowStringDtype + + return any(isinstance(dtype, ArrowStringDtype) for dtype in dtypes) + + +def to_arrow_dtypes(dtypes, test_df=None): + from .arrays import ArrowStringDtype + + new_dtypes = dtypes.copy() + for i in range(len(dtypes)): + dtype = dtypes.iloc[i] + if is_string_dtype(dtype): + if test_df is not None: + series = test_df.iloc[:, i] + # check value + non_na_series = series[series.notna()] + if len(non_na_series) > 0: + first_value = non_na_series.iloc[0] + if isinstance(first_value, str): + new_dtypes.iloc[i] = ArrowStringDtype() + else: # pragma: no cover + # empty, set arrow string dtype + new_dtypes.iloc[i] = ArrowStringDtype() + else: + # empty, set arrow string dtype + new_dtypes.iloc[i] = ArrowStringDtype() + return new_dtypes + + +def make_dtype(dtype): + if isinstance(dtype, (np.dtype, ExtensionDtype)): + return dtype + return np.dtype(dtype) if dtype is not None else None + + +def make_dtypes(dtypes): + if dtypes is None: + return None + if not isinstance(dtypes, pd.Series): + dtypes = pd.Series(dtypes) + return dtypes.apply(make_dtype) + + +def is_dataframe(x): + if cudf is not None: # pragma: no cover + if isinstance(x, cudf.DataFrame): + return True + return isinstance(x, pd.DataFrame) + + +def is_series(x): + if cudf is not None: # pragma: no cover + if isinstance(x, cudf.Series): + return True + return isinstance(x, pd.Series) + + +def is_index(x): + if cudf is not None: # pragma: no cover + if isinstance(x, cudf.Index): + return True + return isinstance(x, pd.Index) + + +def get_xdf(x): + if cudf is not None: # pragma: no cover + if isinstance(x, (cudf.DataFrame, cudf.Series, cudf.Index)): + return cudf + return pd + + +def is_cudf(x): + if cudf is not None: # pragma: no cover + if isinstance(x, (cudf.DataFrame, cudf.Series, cudf.Index)): + return True + return False + + +def auto_merge_chunks( + ctx: Context, + df_or_series: TileableType, + merged_file_size: Union[int, float, str] = None, +) -> TileableType: + from .merge import DataFrameConcat 
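+    # Merge strategy (summary of the code below): per-chunk memory sizes are read
+    # from the chunk metas; chunks are then scanned in order and packed into groups
+    # by accumulated size against ``merged_file_size`` (``options.chunk_store_limit``
+    # when not given), and each group is fused into a single chunk via DataFrameConcat.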
+ + if df_or_series.ndim == 2 and df_or_series.chunk_shape[1] > 1: + # skip auto merge optimization for DataFrame + # that has more than 1 chunks on columns axis + return df_or_series + + metas = ctx.get_chunks_meta( + [c.key for c in df_or_series.chunks], fields=["memory_size"], error="ignore" + ) + memory_sizes = [meta["memory_size"] if meta is not None else None for meta in metas] + if any(size is None for size in memory_sizes): + # has not been executed before, cannot get accurate memory size, skip auto merge + return df_or_series + + def _concat_chunks(merge_chunks: List[ChunkType], output_index: int): + chunk_size = sum(c.shape[0] for c in merge_chunks) + concat_op = DataFrameConcat(output_types=df_or_series.op.output_types) + if df_or_series.ndim == 1: + kw = dict( + dtype=df_or_series.dtype, + index_value=merge_index_value( + {c.index: c.index_value for c in merge_chunks} + ), + shape=(chunk_size,), + index=(output_index,), + name=df_or_series.name, + ) + else: + kw = dict( + dtypes=merge_chunks[0].dtypes, + index_value=merge_index_value( + {c.index: c.index_value for c in merge_chunks} + ), + columns_value=merge_chunks[0].columns_value, + shape=(chunk_size, merge_chunks[0].shape[1]), + index=(output_index, 0), + ) + return concat_op.new_chunk(merge_chunks, **kw) + + to_merge_size = ( + parse_readable_size(merged_file_size)[0] + if merged_file_size is not None + else options.chunk_store_limit + ) + to_merge_chunks = [] + acc_memory_size = 0 + n_split = [] + out_chunks = [] + last_idx = len(memory_sizes) - 1 + for idx, (chunk, chunk_memory_size) in enumerate( + zip(df_or_series.chunks, memory_sizes) + ): + to_merge_chunks.append(chunk) + acc_memory_size += chunk_memory_size + if ( + acc_memory_size + chunk_memory_size > to_merge_size + and len(to_merge_chunks) > 0 + ) or idx == last_idx: + # adding current chunk would exceed the maximum, + # concat previous chunks + if len(to_merge_chunks) == 1: + # do not generate concat op for 1 input. + c = to_merge_chunks[0].copy() + c._index = ( + (len(n_split),) if df_or_series.ndim == 1 else (len(n_split), 0) + ) + out_chunks.append(c) + n_split.append(c.shape[0]) + else: + merged_chunk = _concat_chunks(to_merge_chunks, len(n_split)) + out_chunks.append(merged_chunk) + n_split.append(merged_chunk.shape[0]) + # reset + acc_memory_size = 0 + to_merge_chunks = [] + # process the last chunk + assert len(to_merge_chunks) == 0 + new_op = df_or_series.op.copy() + params = df_or_series.params.copy() + params["chunks"] = out_chunks + if df_or_series.ndim == 1: + params["nsplits"] = (tuple(n_split),) + else: + params["nsplits"] = (tuple(n_split), df_or_series.nsplits[1]) + return new_op.new_tileable(df_or_series.op.inputs, kws=[params]) + + +# TODO: clean_up_func, is_on_ray and restore_func functions may be +# removed or refactored in the future to calculate func size +# with more accuracy as well as address some serialization issues. +def clean_up_func(op): + threshold = int(os.getenv("MARS_CLOSURE_CLEAN_UP_BYTES_THRESHOLD", 10**4)) + if threshold == -1: # pragma: no cover + return + ctx = get_context() + if ctx is None: + return + + # Note: op.func_key is set only when func was put into storage. + # Under ray backend, func will be put into storage. + # While under mars backend, since storage service is empty on supervisor, + # func won't be put into storage but serialized in advance to reduce upcoming + # expenses brought by serializations and deserializations during subtask transmission. 
+ if whether_to_clean_up(op, threshold) is True: + assert ( + op.logic_key is not None + ), f"Logic key of {op} wasn't calculated before cleaning up func." + logger.info("%s is cleaning up func %s.", op, op.func) + if is_on_ray(ctx): + import ray + + op.func_key = ray.put(op.func) + logger.info("%s func %s is replaced by %s.", op, op.func, op.func_key) + op.func = None + else: + op.func = cloudpickle.dumps(op.func) + + +def whether_to_clean_up(op, threshold): + func = op.func + counted_bytes = 0 + max_recursion_depth = 2 + + from collections import deque + from numbers import Number + + BYPASS_CLASSES = (str, bytes, Number, range, bytearray, pd.DataFrame, pd.Series) + + class GetSizeEarlyStopException(Exception): + pass + + def check_exceed_threshold(): + nonlocal threshold, counted_bytes + if counted_bytes >= threshold: + raise GetSizeEarlyStopException() + + def getsize(obj_outer): + _seen_obj_ids = set() + + def inner_count(obj, recursion_depth): + obj_id = id(obj) + if obj_id in _seen_obj_ids or recursion_depth > max_recursion_depth: + return 0 + _seen_obj_ids.add(obj_id) + recursion_depth += 1 + size = sys.getsizeof(obj) + if isinstance(obj, BYPASS_CLASSES): + return size + elif isinstance(obj, (tuple, list, set, deque)): + size += sum(inner_count(i, recursion_depth) for i in obj) + elif hasattr(obj, "items"): + size += sum( + inner_count(k, recursion_depth) + inner_count(v, recursion_depth) + for k, v in getattr(obj, "items")() + ) + if hasattr(obj, "__dict__"): + size += inner_count(vars(obj), recursion_depth) + if hasattr(obj, "__slots__"): + size += sum( + inner_count(getattr(obj, s), recursion_depth) + for s in obj.__slots__ + if hasattr(obj, s) + ) + return size + + return inner_count(obj_outer, 0) + + try: + # Note: In most cases, func is just a function with closure, while chances are that + # func is a callable that doesn't have __closure__ attribute. + if inspect.isclass(func): + pass + elif hasattr(func, "__closure__") and func.__closure__ is not None: + for cell in func.__closure__: + counted_bytes += getsize(cell.cell_contents) + check_exceed_threshold() + elif callable(func): + if hasattr(func, "__dict__"): + for k, v in func.__dict__.items(): + counted_bytes += sum([getsize(k), getsize(v)]) + check_exceed_threshold() + if hasattr(func, "__slots__"): + for slot in func.__slots__: + counted_bytes += ( + getsize(getattr(func, slot)) if hasattr(func, slot) else 0 + ) + check_exceed_threshold() + except GetSizeEarlyStopException: + logger.debug("Func needs cleanup.") + op.need_clean_up_func = True + else: + assert op.need_clean_up_func is False + logger.debug("Func doesn't need cleanup.") + + return op.need_clean_up_func + + +def restore_func(ctx: Context, op): + if op.need_clean_up_func and ctx is not None: + logger.info("%s is restoring func from %s.", op, op.func_key) + if is_on_ray(ctx): + import ray + + op.func = ray.get(op.func_key) + logger.info("%s func %s is restored.", op, op.func) + else: + op.func = cloudpickle.loads(op.func) + + +def concat_on_columns(objs: List) -> Any: + xdf = get_xdf(objs[0]) + # In cudf, concat with axis=1 and ignore_index=False by default behaves opposite to pandas. + # Cudf would reset the index when axis=1 and ignore_index=False, which does not match with its document. + # Therefore, we deal with this case specially. 
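+    # e.g. for two aligned inputs, pandas keeps the shared index after the
+    # axis=1 concat, while for cudf the original index is re-assigned below.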
+ result = xdf.concat(objs, axis=1) + if xdf is cudf: + result.index = objs[0].index + return result diff --git a/python/xorbits/_mars/dataframe/window/__init__.py b/python/xorbits/_mars/dataframe/window/__init__.py new file mode 100644 index 000000000..9be1d5efa --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def _install(): + from ..core import DATAFRAME_TYPE, SERIES_TYPE + from .ewm.aggregation import DataFrameEwmAgg + from .ewm.core import ewm + from .expanding.aggregation import DataFrameExpandingAgg + from .expanding.core import expanding + from .rolling.aggregation import DataFrameRollingAgg + from .rolling.core import rolling + + for t in DATAFRAME_TYPE + SERIES_TYPE: + t.rolling = rolling + t.expanding = expanding + t.ewm = ewm + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/window/aggregation.py b/python/xorbits/_mars/dataframe/window/aggregation.py new file mode 100644 index 000000000..1019ac6bc --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/aggregation.py @@ -0,0 +1,632 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict, namedtuple +from collections.abc import Iterable +from typing import Dict + +import numpy as np +import pandas as pd + +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + Int64Field, + StringField, +) +from ...utils import tokenize +from ..core import DATAFRAME_TYPE +from ..merge import DataFrameConcat +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_empty_series, filter_dtypes_by_index, parse_index + +_stage_info = namedtuple( + "_stage_info", + ( + "map_groups", + "map_sources", + "combine_sources", + "combine_columns", + "combine_funcs", + "key_to_funcs", + "valid_columns", + "min_periods_func_name", + ), +) + + +class BaseDataFrameExpandingAgg(DataFrameOperand, DataFrameOperandMixin): + _min_periods = Int64Field("min_periods") + _axis = Int32Field("axis") + _func = AnyField("func") + + # always treat count as valid. 
this behavior is cancelled in pandas 1.0 + _count_always_valid = BoolField("count_always_valid") + # True if function name is treated as new index + _append_index = BoolField("append_index") + + # chunk params + _output_agg = BoolField("output_agg") + + _map_groups = DictField("map_groups") + _map_sources = DictField("map_sources") + _combine_sources = DictField("combine_sources") + _combine_columns = DictField("combine_columns") + _combine_funcs = DictField("combine_funcs") + _key_to_funcs = DictField("keys_to_funcs") + + _min_periods_func_name = StringField("min_periods_func_name") + + def __init__( + self, + min_periods=None, + axis=None, + func=None, + count_always_valid=None, + append_index=None, + output_agg=False, + map_groups=None, + map_sources=None, + combine_sources=None, + combine_columns=None, + combine_funcs=None, + key_to_funcs=None, + min_periods_func_name=None, + **kw + ): + super().__init__( + _min_periods=min_periods, + _axis=axis, + _func=func, + _count_always_valid=count_always_valid, + _append_index=append_index, + _output_agg=output_agg, + _map_groups=map_groups, + _map_sources=map_sources, + _combine_sources=combine_sources, + _combine_columns=combine_columns, + _combine_funcs=combine_funcs, + _key_to_funcs=key_to_funcs, + _min_periods_func_name=min_periods_func_name, + **kw + ) + + @property + def min_periods(self) -> int: + return self._min_periods + + @property + def axis(self) -> int: + return self._axis + + @property + def func(self): + return self._func + + @property + def count_always_valid(self): + return self._count_always_valid + + @property + def append_index(self): + return self._append_index + + @property + def output_agg(self): + return self._output_agg + + @property + def map_groups(self) -> Dict: + return self._map_groups + + @property + def map_sources(self) -> Dict: + return self._map_sources + + @property + def combine_sources(self) -> Dict: + return self._combine_sources + + @property + def combine_columns(self) -> Dict: + return self._combine_columns + + @property + def combine_funcs(self) -> Dict: + return self._combine_funcs + + @property + def key_to_funcs(self) -> Dict: + return self._key_to_funcs + + @property + def min_periods_func_name(self) -> str: + return self._min_periods_func_name + + @property + def output_limit(self): + return 2 if self.output_agg else 1 + + def __call__(self, expanding): + inp = expanding.input + raw_func = self.func + self._normalize_funcs() + + if isinstance(inp, DATAFRAME_TYPE): + empty_df = build_df(inp) + for c, t in empty_df.dtypes.items(): + if t == np.dtype("O"): + empty_df[c] = "O" + + test_df = expanding(empty_df).agg(raw_func) + if self._axis == 0: + index_value = inp.index_value + else: + index_value = parse_index( + test_df.index, expanding.params, inp, store_data=False + ) + self._append_index = test_df.columns.nlevels != empty_df.columns.nlevels + return self.new_dataframe( + [inp], + shape=(inp.shape[0], test_df.shape[1]), + dtypes=test_df.dtypes, + index_value=index_value, + columns_value=parse_index(test_df.columns, store_data=True), + ) + else: + pd_index = inp.index_value.to_pandas() + empty_series = build_empty_series( + inp.dtype, index=pd_index[:0], name=inp.name + ) + test_obj = expanding(empty_series).agg(raw_func) + if isinstance(test_obj, pd.DataFrame): + return self.new_dataframe( + [inp], + shape=(inp.shape[0], test_obj.shape[1]), + dtypes=test_obj.dtypes, + index_value=inp.index_value, + columns_value=parse_index(test_obj.dtypes.index, store_data=True), + ) + else: + return 
self.new_series( + [inp], + shape=inp.shape, + dtype=test_obj.dtype, + index_value=inp.index_value, + name=test_obj.name, + ) + + def _normalize_funcs(self): + if isinstance(self._func, dict): + new_func = OrderedDict() + for k, v in self._func.items(): + if isinstance(v, str) or callable(v): + new_func[k] = [v] + else: + new_func[k] = v + self._func = new_func + elif isinstance(self._func, Iterable) and not isinstance(self._func, str): + self._func = list(self._func) + + @staticmethod + def _safe_append(d, key, val): + if key not in d: + d[key] = [] + if val not in d[key]: + d[key].append(val) + + @classmethod + def _get_stage_functions(cls, op: "BaseDataFrameExpandingAgg", func): + raise NotImplementedError + + @classmethod + def _gen_chunk_stage_info( + cls, op: "BaseDataFrameExpandingAgg", chunk_cols=None, min_periods=1 + ): + map_groups = OrderedDict() + map_sources = OrderedDict() + combine_sources = OrderedDict() + combine_columns = OrderedDict() + combine_funcs = OrderedDict() + key_to_funcs = OrderedDict() + valid_columns = [] + min_periods_func_name = None + + def _clean_dict(d): + return OrderedDict( + (k, sorted(v) if v != [None] else None) for k, v in d.items() + ) + + def _fun_to_str(fun): + if isinstance(fun, str): + return fun + fun_str = tokenize(fun) + key_to_funcs[fun_str] = fun + return fun if isinstance(fun, str) else tokenize(fun) + + def _add_column_to_functions(col, fun_name, mappers, aggregator): + sources = [] + for mapper in mappers: + mapper_str = _fun_to_str(mapper) + cls._safe_append(map_groups, mapper_str, col) + sources.append(mapper_str) + + combine_sources[fun_name] = sources + cls._safe_append(combine_columns, fun_name, col) + combine_funcs[fun_name] = _fun_to_str(aggregator) + + chunk_cols = set(chunk_cols) if chunk_cols is not None else None + if isinstance(op.func, list): + op_func = {None: op.func} + elif isinstance(op.func, str): + op_func = {None: [op.func]} + else: + op_func = op.func + + for col, funcs in op_func.items(): + if col is not None: + if chunk_cols is not None and col not in chunk_cols: + continue + valid_columns.append(col) + + if min_periods > 1: + min_periods_func_name = tokenize(chunk_cols, "min_periods") + _add_column_to_functions( + col, + min_periods_func_name, + *cls._get_stage_functions(op, "_data_count") + ) + + for func in funcs: + mapper_funcs, combine_func = cls._get_stage_functions(op, func) + _add_column_to_functions(col, func, mapper_funcs, combine_func) + + return _stage_info( + map_groups=_clean_dict(map_groups), + map_sources=map_sources, + combine_sources=combine_sources, + combine_columns=_clean_dict(combine_columns), + combine_funcs=combine_funcs, + key_to_funcs=key_to_funcs, + valid_columns=valid_columns or None, + min_periods_func_name=min_periods_func_name, + ) + + @classmethod + def _remap_dtypes(cls, in_df, out_df): + if in_df.ndim == 1: + if out_df.ndim == 2: + return ( + {0: (0, out_df.dtypes)}, + (in_df.nsplits[0], (len(out_df.dtypes),)), + ) + return None, in_df.nsplits + + axis = out_df.op.axis + chunk_idx_to_dtypes = dict() + new_dtypes_sizes = [] + for c in in_df.cix[0, :]: + columns = c.columns_value.to_pandas() + dtypes = filter_dtypes_by_index(out_df.dtypes, columns) + + if len(dtypes): + chunk_idx_to_dtypes[c.index[1]] = (len(chunk_idx_to_dtypes), dtypes) + new_dtypes_sizes.append(len(dtypes)) + new_nsplits = list(in_df.nsplits) + new_nsplits[1 - axis] = tuple(new_dtypes_sizes) + return chunk_idx_to_dtypes, tuple(new_nsplits) + + @classmethod + def _tile_single(cls, op: "BaseDataFrameExpandingAgg"): 
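+        # Single-chunk case along the aggregation axis: dtypes are remapped per
+        # input chunk and each chunk is rewritten with a copied op, so no
+        # map/combine stages are generated.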
+ in_df = op.inputs[0] + out_df = op.outputs[0] + + chunk_idx_to_dtypes, new_nsplits = cls._remap_dtypes(in_df, out_df) + + chunks = [] + for c in in_df.chunks: + try: + if out_df.ndim == 2: + new_axis_idx, new_dtypes = chunk_idx_to_dtypes[ + c.index[1] if c.ndim > 1 else 0 + ] + else: + new_axis_idx, new_dtypes = None, None + except KeyError: + continue + + chunk_op = op.copy().reset_key() + + if out_df.ndim == 2: + chunks.append( + chunk_op.new_chunk( + [in_df.chunks[0]], + dtypes=new_dtypes, + index=(c.index[0], new_axis_idx), + shape=(c.shape[0], len(new_dtypes)), + index_value=c.index_value, + columns_value=parse_index(new_dtypes.index, store_data=True), + ) + ) + else: + params = c.params.copy() + params["dtype"] = out_df.dtype + chunks.append(chunk_op.new_chunk([in_df.chunks[0]], **params)) + + tileable_op = op.copy().reset_key() + params = out_df.params.copy() + params["chunks"] = chunks + if new_nsplits: + params["nsplits"] = new_nsplits + return tileable_op.new_tileables([in_df], **params) + + @classmethod + def tile(cls, op: "BaseDataFrameExpandingAgg"): + axis = op.axis + + in_df = op.inputs[0] + out_df = op.outputs[0] + + if in_df.chunk_shape[op.axis] == 1: + return cls._tile_single(op) + + dtypes_mapping, new_nsplits = cls._remap_dtypes(in_df, out_df) + new_chunk_shape = tuple(len(split) for split in new_nsplits) + + data_chunks = [] + summary_chunks = np.empty(new_chunk_shape, dtype=object) + stage_info_dict = dict() + for c in in_df.chunks: + try: + if out_df.ndim == 2: + new_axis_idx, new_dtypes = dtypes_mapping[ + c.index[1] if c.ndim > 1 else 0 + ] + else: + new_axis_idx, new_dtypes = None, None + except KeyError: + continue + + new_index = (c.index[0], new_axis_idx) + + try: + stage_info = stage_info_dict[new_index[1]] + except KeyError: + cols = c.dtypes.index if c.ndim == 2 else None + stage_info = stage_info_dict[new_index[1]] = cls._gen_chunk_stage_info( + op, cols, min_periods=op.min_periods + ) + + chunk_op = op.copy().reset_key() + chunk_op._output_agg = c.index[axis] != in_df.chunk_shape[axis] - 1 + chunk_op.stage = OperandStage.map + chunk_op._map_sources = stage_info.map_sources + chunk_op._map_groups = stage_info.map_groups + chunk_op._key_to_funcs = stage_info.key_to_funcs + + if out_df.ndim == 2: + kw0 = dict( + dtypes=new_dtypes, + index=new_index, + shape=(c.shape[0], len(new_dtypes)), + index_value=c.index_value, + columns_value=parse_index(new_dtypes.index, store_data=True), + ) + kw1 = kw0.copy() + kw1["shape"] = (1, len(new_dtypes)) if axis == 0 else (c.shape[0], 1) + else: + kw0 = dict( + dtype=out_df.dtype, + index=c.index, + shape=c.shape, + name=c.name, + index_value=c.index_value, + ) + kw1 = kw0.copy() + kw1["shape"] = (1,) + out_chunks = chunk_op.new_chunks([c], [kw0, kw1]) + data_chunks.append(out_chunks[0]) + if chunk_op.output_agg: + summary_chunks[new_index] = out_chunks[1] + + chunks = [] + for c in data_chunks: + stage_info = stage_info_dict[c.index[1] if c.ndim > 1 else None] + + chunk_op = op.copy().reset_key() + chunk_op._output_agg = False + chunk_op.stage = OperandStage.combine + chunk_op._map_groups = stage_info.map_groups + chunk_op._combine_sources = stage_info.combine_sources + chunk_op._combine_columns = stage_info.combine_columns + chunk_op._combine_funcs = stage_info.combine_funcs + chunk_op._key_to_funcs = stage_info.key_to_funcs + chunk_op._min_periods_func_name = stage_info.min_periods_func_name + + params = c.params.copy() + if c.ndim == 2: + summary_inputs = list(summary_chunks[: c.index[0], c.index[1]]) + else: + 
summary_inputs = list(summary_chunks[: c.index[0]]) + + if len(summary_inputs) > 1: + concat_op = DataFrameConcat( + output_types=out_df.op.output_types, axis=op.axis + ) + concat_summary = concat_op.new_chunk(summary_inputs) + chunks.append(chunk_op.new_chunk([c, concat_summary], **params)) + elif len(summary_inputs) == 1: + chunks.append(chunk_op.new_chunk([c, summary_inputs[0]], **params)) + else: + chunks.append(chunk_op.new_chunk([c], **params)) + + df_op = op.copy().reset_key() + params = out_df.params.copy() + params.update(dict(chunks=chunks, nsplits=new_nsplits)) + return df_op.new_tileables([in_df], **params) + + @classmethod + def _execute_map_function(cls, op: "BaseDataFrameExpandingAgg", func, in_data): + raise NotImplementedError + + @classmethod + def _execute_map(cls, ctx, op: "BaseDataFrameExpandingAgg"): + in_data = ctx[op.inputs[0].key] + + # map according to map groups + map_results = [] + summary_results = [] + for map_func_str, cols in op.map_groups.items(): + if cols is None: + src_df = in_data + else: + src_df = in_data[cols] + + result, summary = cls._execute_map_function(op, map_func_str, src_df) + map_results.append(result) + if op.output_agg: + summary_results.append(summary) + + if op.output_agg: + summary_results.append( + pd.Series([len(in_data)], index=summary_results[0].index) + ) + + ctx[op.outputs[0].key] = tuple(map_results) + if op.output_agg: + ctx[op.outputs[1].key] = tuple(summary_results) + + @classmethod + def _append_func_name_index(cls, op: "BaseDataFrameExpandingAgg", df, func_name): + if not op.append_index: + return + + col_frame = df.columns.to_frame().copy() + col_frame[len(col_frame.columns)] = func_name + df.columns = pd.MultiIndex.from_frame( + col_frame, names=tuple(df.columns.names) + (None,) + ) + + @classmethod + def _execute_combine_function( + cls, op: "BaseDataFrameExpandingAgg", func, pred_inputs, local_inputs, func_cols + ): + raise NotImplementedError + + @classmethod + def _execute_combine(cls, ctx, op: "BaseDataFrameExpandingAgg"): + out_df = op.outputs[0] + local_data = ctx[op.inputs[0].key] + local_data_dict = dict(zip(op.map_groups.keys(), local_data)) + + func_to_aggs = OrderedDict() + + if len(op.inputs) == 1: + pred_record_count = 0 + for func_name, func_sources in op.combine_sources.items(): + func_str = op.combine_funcs[func_name] + func_cols = op.combine_columns[func_name] + if func_cols is None: + local_inputs = [local_data_dict[src] for src in func_sources] + else: + local_inputs = [ + local_data_dict[src][func_cols] for src in func_sources + ] + + func = op.key_to_funcs[func_str] + func_to_aggs[func_name] = cls._execute_combine_function( + op, func, None, local_inputs, func_cols + ) + else: + pred_data = ctx[op.inputs[1].key] + pred_record_count = pred_data[-1].sum() + pred_data_dict = dict(zip(op.map_groups.keys(), pred_data)) + + for func_name, func_sources in op.combine_sources.items(): + func_str = op.combine_funcs[func_name] + func_cols = op.combine_columns[func_name] + if func_cols is None: + local_inputs = [local_data_dict[src] for src in func_sources] + pred_inputs = [pred_data_dict[src] for src in func_sources] + else: + local_inputs = [ + local_data_dict[src][func_cols] for src in func_sources + ] + pred_inputs = [ + pred_data_dict[src][func_cols] for src in func_sources + ] + + func = op.key_to_funcs[func_str] + func_to_aggs[func_name] = cls._execute_combine_function( + op, func, pred_inputs, local_inputs, func_cols + ) + + if op.min_periods_func_name is not None: + valid_counts = 
func_to_aggs.pop(op.min_periods_func_name) + invalid_poses = valid_counts < op.min_periods + for func_name in func_to_aggs.keys(): + if func_name == "count": + if ( + not op.count_always_valid + and pred_record_count < op.min_periods - 1 + ): + try: + func_to_aggs[func_name].iloc[ + : op.min_periods - pred_record_count - 1 + ] = np.nan + except ValueError: + func_to_aggs[func_name] = func_to_aggs[func_name].copy() + func_to_aggs[func_name].iloc[ + : op.min_periods - pred_record_count - 1 + ] = np.nan + else: + func_to_aggs[func_name][invalid_poses] = np.nan + + for func_name, agg_df in func_to_aggs.items(): + if out_df.ndim == 2 and agg_df.ndim == 1: + agg_df.name = func_name + agg_df = func_to_aggs[func_name] = pd.DataFrame(agg_df) + cls._append_func_name_index(op, agg_df, func_name) + + if len(func_to_aggs) == 1: + val = list(func_to_aggs.values())[0] + else: + out_df = op.outputs[0] + val = pd.concat(list(func_to_aggs.values()), axis=1 - op.axis) + + if out_df.ndim > 1: + val = val.reindex( + out_df.columns_value.to_pandas(), axis=1 - op.axis, copy=False + ) + else: + val.name = out_df.name + ctx[op.outputs[0].key] = val + + @classmethod + def _execute_raw_function(cls, op: "BaseDataFrameExpandingAgg", in_data): + raise NotImplementedError + + @classmethod + def execute(cls, ctx, op: "BaseDataFrameExpandingAgg"): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + else: + in_data = ctx[op.inputs[0].key] + r = cls._execute_raw_function(op, in_data) + ctx[op.outputs[0].key] = r diff --git a/python/xorbits/_mars/dataframe/window/core.py b/python/xorbits/_mars/dataframe/window/core.py new file mode 100644 index 000000000..5ab6aaee6 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/core.py @@ -0,0 +1,76 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...serialization.serializables import KeyField, Serializable + + +class Window(Serializable): + _input = KeyField("input") + + def __init__(self, input=None, **kw): # pylint: disable=redefined-builtin + super().__init__(_input=input, **kw) + + @property + def input(self): + return self._input + + @property + def params(self): + raise NotImplementedError + + def _repr(self, params): + kvs = [f"{k}={v}" for k, v in params.items() if v is not None] + return "{} [{}]".format(self._repr_name(), ",".join(kvs)) + + def _repr_name(self): + return type(self).__name__ + + def __repr__(self): + return self._repr(self.params) + + def __getitem__(self, item): + columns = self.input.dtypes.index + if isinstance(item, (list, tuple)): + item = list(item) + for col in item: + if col not in columns: + raise KeyError(f"Column not found: {col}") + else: + if item not in columns: + raise KeyError(f"Column not found: {item}") + + return type(self)(input=self.input[item], **self.params) + + def __getattr__(self, item): + try: + return super().__getattribute__(item) + except AttributeError: + if self.input.ndim == 2 and item in self.input.dtypes: + return self[item] + else: + raise + + def __dir__(self): + result = list(super().__dir__()) + if self.input.ndim == 1: + return result + else: + return sorted( + result + + [ + k + for k in self.input.dtypes.index + if isinstance(k, str) and k.isidentifier() + ] + ) diff --git a/python/xorbits/_mars/dataframe/window/ewm/__init__.py b/python/xorbits/_mars/dataframe/window/ewm/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/ewm/aggregation.py b/python/xorbits/_mars/dataframe/window/ewm/aggregation.py new file mode 100644 index 000000000..31122b2a3 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/aggregation.py @@ -0,0 +1,491 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple + +import numpy as np +import pandas as pd + +from .... 
import opcodes +from ....serialization.serializables import BoolField, Float64Field +from ..aggregation import BaseDataFrameExpandingAgg + +_stage_info = namedtuple( + "_stage_info", + ( + "map_groups", + "map_sources", + "combine_sources", + "combine_columns", + "combine_funcs", + "key_to_funcs", + "valid_columns", + "min_periods_func_name", + ), +) + +_cum_alpha_coeff_func = "_cum_alpha_coeff" +_cum_square_alpha_coeff_func = "_cum_square_alpha_coeff" + + +def _add_pred_results( + pred_results, + local_results, + axis=0, + alpha=None, + order=1, + alpha_ignore_na=False, + pred_exponent=None, + alpha_data=None, +): + if pred_results[0].ndim == 1: + df_filler = 0 + else: + df_filler = pred_results[0].iloc[-1, :].dropna() + df_filler[:] = 0 + + new_locals = [] + combine_axis = pred_results[0].ndim - axis - 1 + weight = (1 - alpha) ** order + pred_coeff = weight**pred_exponent + for idx, (pred_result, local_result) in enumerate(zip(pred_results, local_results)): + local_result.fillna(df_filler, inplace=True) + pred_result = pred_result.mul(pred_coeff).sum(axis=axis) + + if alpha_ignore_na: + pred_df = pred_result * weight ** alpha_data.notna().cumsum() + else: + weights = np.arange(1, len(local_result) + 1) + if local_result.ndim == 2: + weights_df = pd.DataFrame( + np.repeat( + weights.reshape((len(local_result), 1)), + len(local_result.columns), + axis=1, + ), + columns=local_result.columns, + index=local_result.index, + ) + else: + weights_df = pd.Series(weights, index=local_result.index) + weights_df[alpha_data.isna()] = np.nan + weights_df.ffill(inplace=True) + weights_df.fillna(0, inplace=True) + + weights_df = weight**weights_df + pred_df = weights_df.mul(pred_result, axis=combine_axis) + + new_locals.append(local_result.add(pred_df, axis=combine_axis)) + return new_locals + + +def _combine_mean( + pred_results, + local_results, + axis=0, + alpha=None, + alpha_ignore_na=False, + pred_exponent=None, +): + if pred_results is None: + return (local_results[0] / local_results[1]).ffill() + + alpha_data = local_results[1] + local_results[0] = local_results[0].ffill() + local_results[1] = alpha_data.ffill() + + local_sum_data, local_count_data = local_results + + if pred_results is not None: + local_sum_data, local_count_data = _add_pred_results( + pred_results, + local_results, + axis=axis, + alpha=alpha, + alpha_ignore_na=alpha_ignore_na, + pred_exponent=pred_exponent, + alpha_data=alpha_data, + ) + return local_sum_data / local_count_data + + +def _combine_var( + pred_results, + local_results, + axis=0, + alpha=None, + alpha_ignore_na=False, + pred_exponent=None, +): + local_results[0] = local_results[0].ffill() + alpha_data = local_results[1] + local_results[1] = alpha_data.ffill() + + local_results[2] = local_results[2].ffill() + alpha2_data = local_results[3] + local_results[3] = alpha2_data.ffill() + + ( + local_sum_data, + local_count_data, + local_sum_square, + local_count2_data, + ) = local_results + if pred_results is None: + return (local_sum_square - local_sum_data**2 / local_count_data) / ( + local_count_data - local_count2_data / local_count_data + ) + + pred_sum_data, pred_count_data, pred_sum_square, pred_count2_data = pred_results + + (local_count2_data,) = _add_pred_results( + [pred_count2_data], + [local_count2_data], + axis=axis, + alpha=alpha, + order=2, + alpha_ignore_na=alpha_ignore_na, + pred_exponent=pred_exponent, + alpha_data=alpha_data, + ) + + local_sum_square, local_sum_data, local_count_data = _add_pred_results( + [pred_sum_square, pred_sum_data, 
pred_count_data], + [local_sum_square, local_sum_data, local_count_data], + axis=axis, + alpha=alpha, + alpha_ignore_na=alpha_ignore_na, + pred_exponent=pred_exponent, + alpha_data=alpha_data, + ) + + return (local_sum_square - local_sum_data**2 / local_count_data) / ( + local_count_data - local_count2_data / local_count_data + ) + + +def _combine_std( + pred_results, + local_results, + axis=0, + alpha=None, + alpha_ignore_na=False, + pred_exponent=None, +): + return np.sqrt( + _combine_var( + pred_results, + local_results, + axis=axis, + alpha=alpha, + alpha_ignore_na=alpha_ignore_na, + pred_exponent=pred_exponent, + ) + ) + + +def _combine_data_count(pred_results, local_results, axis=0, **__): + if pred_results is None: + return local_results[0] + return local_results[0].add( + pred_results[0].sum(), axis=pred_results[0].ndim - axis - 1 + ) + + +class DataFrameEwmAgg(BaseDataFrameExpandingAgg): + _op_type_ = opcodes.EWM_AGG + + _alpha = Float64Field("alpha") + _adjust = BoolField("adjust") + _alpha_ignore_na = BoolField("alpha_ignore_na") + + _validate_columns = BoolField("_validate_columns") + + _exec_cache = dict() + + def __init__( + self, alpha=None, adjust=None, alpha_ignore_na=None, validate_columns=None, **kw + ): + super().__init__( + _alpha=alpha, + _adjust=adjust, + _alpha_ignore_na=alpha_ignore_na, + _validate_columns=validate_columns, + **kw + ) + + @property + def alpha(self) -> float: + return self._alpha + + @property + def adjust(self) -> bool: + return self._adjust + + @property + def alpha_ignore_na(self) -> bool: + return self._alpha_ignore_na + + @property + def validate_columns(self) -> bool: + return self._validate_columns + + @classmethod + def _get_stage_functions(cls, op: "DataFrameEwmAgg", func): + if func == "_data_count": + return ["_data_count"], _combine_data_count + elif func == "mean": + return ["cumsum", _cum_alpha_coeff_func], _combine_mean + elif func in {"var", "std"}: + return ( + [ + "cumsum", + _cum_alpha_coeff_func, + "cumsum2", + _cum_square_alpha_coeff_func, + ], + _combine_var if func == "var" else _combine_std, + ) + else: # pragma: no cover + raise NotImplementedError + + @classmethod + def _calc_data_alphas(cls, op: "DataFrameEwmAgg", in_data, order): + exec_cache = cls._exec_cache[op.key] + cache_key = ("_calc_data_alphas", order, id(in_data)) + try: + return exec_cache[cache_key] + except KeyError: + pass + + cum_df = in_data.copy() + cum_df[cum_df.notna()] = 1 + if not op.alpha_ignore_na: + cum_df.ffill(inplace=True) + cum_df = cum_df.cumsum(axis=op.axis) - 1 + if not op.alpha_ignore_na: + cum_df[in_data.isna()] = np.nan + + result = exec_cache[cache_key] = (1 - op.alpha) ** (order * cum_df) + return result + + @classmethod + def _execute_cum_alpha_coeff( + cls, op: "DataFrameEwmAgg", in_data, order, final=True + ): + exec_cache = cls._exec_cache[op.key] + cache_key = ("cum_alpha_coeff", order, id(in_data)) + summary = None + + try: + result = exec_cache[cache_key] + except KeyError: + alphas = cls._calc_data_alphas(op, in_data, order) + result = alphas.cumsum() + exec_cache[cache_key] = result + + if final: + if op.output_agg: + summary = result.ffill()[-1:] + return result, summary + + @classmethod + def _execute_cumsum(cls, op: "DataFrameEwmAgg", in_data): + exec_cache = cls._exec_cache[op.key] + cache_key = ("cumsum", id(in_data)) + summary = None + + try: + result = exec_cache[cache_key] + except KeyError: + min_periods = 1 if op.min_periods > 0 else 0 + + try: + data = in_data.ewm( + alpha=op.alpha, + ignore_na=op.alpha_ignore_na, + 
adjust=op.adjust, + min_periods=min_periods, + ).mean() + except ValueError: + in_data = in_data.copy() + data = in_data.ewm( + alpha=op.alpha, + ignore_na=op.alpha_ignore_na, + adjust=op.adjust, + min_periods=min_periods, + ).mean() + + alpha_sum, _ = op._execute_cum_alpha_coeff(op, in_data, 1, final=False) + result = exec_cache[cache_key] = data * alpha_sum + + if op.output_agg: + summary = result.ffill()[-1:] + return result, summary + + @classmethod + def _execute_cumsum2(cls, op: "DataFrameEwmAgg", in_data): + summary = None + min_periods = 1 if op.min_periods > 0 else 0 + + try: + data = in_data.ewm( + alpha=op.alpha, + ignore_na=op.alpha_ignore_na, + adjust=op.adjust, + min_periods=min_periods, + ).var(bias=True) + except ValueError: + in_data = in_data.copy() + data = in_data.ewm( + alpha=op.alpha, + ignore_na=op.alpha_ignore_na, + adjust=op.adjust, + min_periods=min_periods, + ).var(bias=True) + + alpha_sum, _ = op._execute_cum_alpha_coeff(op, in_data, 1) + cumsum, _ = op._execute_cumsum(op, in_data) + result = alpha_sum * data + cumsum**2 / alpha_sum + + if op.output_agg: + summary = result.ffill()[-1:] + + return result, summary + + @classmethod + def _execute_map_function(cls, op: "DataFrameEwmAgg", func, in_data): + in_data = in_data._get_numeric_data() + + summary = None + min_periods = 1 if op.min_periods > 0 else 0 + if func == "_data_count": + result = in_data.expanding(min_periods=min_periods).count() + elif func in (_cum_alpha_coeff_func, _cum_square_alpha_coeff_func): + order = 1 if func == _cum_alpha_coeff_func else 2 + result, summary = cls._execute_cum_alpha_coeff(op, in_data, order) + elif func == "cumsum": + result, summary = cls._execute_cumsum(op, in_data) + elif func == "cumsum2": + result, summary = cls._execute_cumsum2(op, in_data) + else: # pragma: no cover + raise ValueError("Map function %s not supported") + + if op.output_agg: + summary = summary if summary is not None else result.iloc[-1:] + else: + summary = None + return result, summary + + @classmethod + def _execute_map(cls, ctx, op: "DataFrameEwmAgg"): + try: + cls._exec_cache[op.key] = dict() + + super()._execute_map(ctx, op) + if op.output_agg: + in_data = ctx[op.inputs[0].key] + summaries = list(ctx[op.outputs[1].key]) + + if op.alpha_ignore_na: + in_count = in_data.count() + if not isinstance(in_count, pd.Series): + in_count = pd.Series([in_count]) + summary = in_count + if in_data.ndim == 2: + summary = in_count.to_frame().T + summary.index = summaries[-1].index + else: + remain_counts = in_data.notna()[::-1].to_numpy().argmax(axis=0) + if in_data.ndim > 1: + remain_counts = remain_counts.reshape((1, len(in_data.columns))) + summary = pd.DataFrame( + remain_counts, + columns=in_data.columns, + index=summaries[-1].index, + ) + else: + summary = pd.Series(remain_counts, index=summaries[-1].index) + summaries.insert(-1, summary) + + ctx[op.outputs[1].key] = tuple(summaries) + finally: + cls._exec_cache.pop(op.key, None) + + @classmethod + def _execute_combine_function( + cls, op: "DataFrameEwmAgg", func, prev_inputs, local_inputs, func_cols + ): + exec_cache = cls._exec_cache[op.key] + pred_exponent = exec_cache.get("pred_exponent") + if func_cols and pred_exponent is not None: + pred_exponent = ( + pred_exponent[func_cols] if pred_exponent is not None else None + ) + return func( + prev_inputs, + local_inputs, + axis=op.axis, + alpha=op.alpha, + alpha_ignore_na=op.alpha_ignore_na, + pred_exponent=pred_exponent, + ) + + @classmethod + def _execute_combine(cls, ctx, op: "DataFrameEwmAgg"): + try: + 
cls._exec_cache[op.key] = dict() + + if len(op.inputs) != 1: + pred_data = ctx[op.inputs[1].key] + + if op.alpha_ignore_na: + pred_exponent = ( + pred_data[-2].shift(-1)[::-1].cumsum()[::-1].fillna(0) + ) + else: + succ_counts = pred_data[-1].shift(-1) + succ_counts.iloc[-1] = 0 + pred_exponent = pred_data[-2].add( + succ_counts[::-1].cumsum()[::-1], axis=op.axis + ) + + cls._exec_cache[op.key]["pred_exponent"] = pred_exponent + + super()._execute_combine(ctx, op) + finally: + cls._exec_cache.pop(op.key, None) + + @classmethod + def _execute_raw_function(cls, op: "DataFrameEwmAgg", in_data): + for _ in range(2): + ewm = in_data.ewm( + alpha=op.alpha, + min_periods=op.min_periods, + adjust=op.adjust, + ignore_na=op.alpha_ignore_na, + ) + try: + val = ewm.agg(op.func) + if ( + in_data.ndim == 2 + and op.validate_columns + and len(val.columns) != len(op.outputs[0].columns_value.to_pandas()) + ): + raise ValueError("Columns not consistent") + return val + except ValueError: + in_data = in_data.copy() + else: # pragma: no cover + raise ValueError diff --git a/python/xorbits/_mars/dataframe/window/ewm/core.py b/python/xorbits/_mars/dataframe/window/ewm/core.py new file mode 100644 index 000000000..ce56da10c --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/core.py @@ -0,0 +1,288 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
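For reference, a minimal standalone sketch (separate from this patch) of the chunk-combine idea implemented by DataFrameEwmAgg above: each chunk's map step keeps a running weighted sum and a running weight total, and the combine step folds the previous chunk's final totals in after scaling them by (1 - alpha) raised to the number of local observations seen so far. The helper name below is made up for the illustration, and it assumes adjust=True, min_periods=1 and no missing values.

import numpy as np
import pandas as pd

def chunk_state(values, alpha):
    # Map step: running weighted sum and running weight total, where each earlier
    # observation is discounted by (1 - alpha) once per later observation in the chunk.
    sums = np.empty(len(values))
    weights = np.empty(len(values))
    s = w = 0.0
    for j, v in enumerate(values):
        s = (1 - alpha) * s + v
        w = (1 - alpha) * w + 1.0
        sums[j] = s
        weights[j] = w
    return sums, weights

alpha = 0.3
data = np.random.RandomState(0).rand(10)
left, right = data[:6], data[6:]

l_sums, l_weights = chunk_state(left, alpha)
r_sums, r_weights = chunk_state(right, alpha)

# Combine step: discount the left chunk's final totals by (1 - alpha) ** k,
# where k is the number of observations already seen in the right chunk.
decay = (1 - alpha) ** np.arange(1, len(right) + 1)
combined = (r_sums + decay * l_sums[-1]) / (r_weights + decay * l_weights[-1])

expected = pd.Series(data).ewm(alpha=alpha, adjust=True).mean().to_numpy()[6:]
assert np.allclose(combined, expected)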
+ +import math +from collections import OrderedDict + +from ....serialization.serializables import ( + BoolField, + Float64Field, + Int32Field, + Int64Field, + StringField, +) +from ....utils import pd_release_version +from ...utils import validate_axis +from ..core import Window + +_default_min_period_1 = pd_release_version >= (1, 1, 0) +_pd_1_3_repr = pd_release_version >= (1, 3, 0) +_window_has_method = pd_release_version >= (1, 4, 0) + + +class EWM(Window): + _alpha = Float64Field("alpha") + _min_periods = Int64Field("min_periods") + _adjust = BoolField("adjust") + _ignore_na = BoolField("ignore_na") + _axis = Int32Field("axis") + _method = StringField("method") + + def __init__( + self, + alpha=None, + min_periods=None, + adjust=None, + ignore_na=None, + axis=None, + method=None, + **kw + ): + super().__init__( + _alpha=alpha, + _min_periods=min_periods, + _adjust=adjust, + _ignore_na=ignore_na, + _axis=axis, + _method=method or "single", + **kw + ) + + @property + def alpha(self): + return self._alpha + + @property + def min_periods(self): + return self._min_periods + + @property + def adjust(self): + return self._adjust + + @property + def ignore_na(self): + return self._ignore_na + + @property + def axis(self): + return self._axis + + @property + def method(self): + return self._method + + @property + def params(self): + p = OrderedDict() + + if not _window_has_method: # pragma: no cover + args = ["alpha", "min_periods", "adjust", "ignore_na", "axis"] + else: + args = ["alpha", "min_periods", "adjust", "ignore_na", "axis", "method"] + + for k in args: + p[k] = getattr(self, k) + return p + + def __call__(self, df): + return df.ewm(**self.params) + + def _repr(self, params): + com = 1.0 / params.pop("alpha") - 1 + params["com"] = int(com) if _pd_1_3_repr and com == math.floor(com) else com + try: + params.move_to_end("com", last=False) + except AttributeError: # pragma: no cover + pass + return super()._repr(params) + + def _repr_name(self): + try: + from pandas.core.window import ExponentialMovingWindow # noqa: F401 + + return "ExponentialMovingWindow" + except ImportError: # pragma: no cover + return "EWM" + + def aggregate(self, func): + from .aggregation import DataFrameEwmAgg + + params = self.params + params["alpha_ignore_na"] = params.pop("ignore_na", False) + params["validate_columns"] = False + op = DataFrameEwmAgg(func=func, **params) + return op(self) + + agg = aggregate + + def mean(self): + return self.aggregate("mean") + + def var(self): + return self.aggregate("var") + + def std(self): + return self.aggregate("std") + + +def ewm( + obj, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, +): + r""" + Provide exponential weighted functions. + + Parameters + ---------- + com : float, optional + Specify decay in terms of center of mass, + :math:`\alpha = 1 / (1 + com),\text{ for } com \geq 0`. + span : float, optional + Specify decay in terms of span, + :math:`\alpha = 2 / (span + 1),\text{ for } span \geq 1`. + halflife : float, optional + Specify decay in terms of half-life, + :math:`\alpha = 1 - exp(log(0.5) / halflife),\text{for} halflife > 0`. + alpha : float, optional + Specify smoothing factor :math:`\alpha` directly, + :math:`0 < \alpha \leq 1`. + min_periods : int, default 0 + Minimum number of observations in window required to have a value + (otherwise result is NA). 
+ adjust : bool, default True + Divide by decaying adjustment factor in beginning periods to account + for imbalance in relative weightings + (viewing EWMA as a moving average). + ignore_na : bool, default False + Ignore missing values when calculating weights; + specify True to reproduce pre-0.15.0 behavior. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. The value 0 identifies the rows, and 1 + identifies the columns. + + Returns + ------- + DataFrame + A Window sub-classed for the particular operation. + + See Also + -------- + rolling : Provides rolling window calculations. + expanding : Provides expanding transformations. + + Notes + ----- + Exactly one of center of mass, span, half-life, and alpha must be provided. + + Allowed values and relationship between the parameters are specified in the + parameter descriptions above; see the link at the end of this section for + a detailed explanation. + + When adjust is True (default), weighted averages are calculated using + weights (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1. + + When adjust is False, weighted averages are calculated recursively as: + + weighted_average[0] = arg[0]; + weighted_average[i] = (1-alpha)*weighted_average[i-1] + alpha*arg[i]. + + When ignore_na is False (default), weights are based on absolute positions. + For example, the weights of x and y used in calculating the final weighted + average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and + (1-alpha)**2 and alpha (if adjust is False). + + When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based + on relative positions. For example, the weights of x and y used in + calculating the final weighted average of [x, None, y] are 1-alpha and 1 + (if adjust is True), and 1-alpha and alpha (if adjust is False). 
+ + More details can be found at + https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows + + Examples + -------- + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df.execute() + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + >>> df.ewm(com=0.5).mean().execute() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + """ + axis = validate_axis(axis, obj) + + decay_count = 0 + for arg in (com, span, halflife, alpha): + if arg is not None: + decay_count += 1 + + if decay_count == 0: + raise ValueError("Must pass one of comass, span, halflife, or alpha") + if decay_count > 1: + raise ValueError("comass, span, halflife, and alpha are mutually exclusive") + + if com is not None: + if com < 0: + raise ValueError("comass must satisfy: comass >= 0") + alpha = 1.0 / (1 + com) + elif span is not None: + if span < 1: + raise ValueError("span must satisfy: span >= 1") + alpha = 2.0 / (1 + span) + elif halflife is not None: + if halflife <= 0: + raise ValueError("halflife must satisfy: halflife > 0") + alpha = 1.0 - math.exp(math.log(0.5) / halflife) + if alpha <= 0 or alpha > 1: + raise ValueError("alpha must satisfy: 0 < alpha <= 1") + + if not adjust and not ignore_na: + raise NotImplementedError( + "adjust == False when ignore_na == False not implemented" + ) + if axis == 1: + raise NotImplementedError("axis other than 0 is not supported") + + if alpha == 1: + return obj.expanding(min_periods=min_periods, axis=axis) + + if _default_min_period_1: + min_periods = min_periods or 1 + + return EWM( + input=obj, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + ) diff --git a/python/xorbits/_mars/dataframe/window/ewm/tests/__init__.py b/python/xorbits/_mars/dataframe/window/ewm/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm.py b/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm.py new file mode 100644 index 000000000..4ecd0b5c8 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
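As a quick standalone check (separate from this patch) of the decay-parameter conversions performed by ewm() above, the alpha derived from com, span or halflife should give the same result as passing the equivalent parameter to pandas directly; the values below are arbitrary examples.

import math

import numpy as np
import pandas as pd

com, span, halflife = 2.0, 5.0, 3.0
alpha_from_com = 1.0 / (1.0 + com)                              # requires com >= 0
alpha_from_span = 2.0 / (1.0 + span)                            # requires span >= 1
alpha_from_halflife = 1.0 - math.exp(math.log(0.5) / halflife)  # requires halflife > 0

s = pd.Series(np.arange(8, dtype=float))
pd.testing.assert_series_equal(
    s.ewm(com=com).mean(), s.ewm(alpha=alpha_from_com).mean()
)
pd.testing.assert_series_equal(
    s.ewm(span=span).mean(), s.ewm(alpha=alpha_from_span).mean()
)
pd.testing.assert_series_equal(
    s.ewm(halflife=halflife).mean(), s.ewm(alpha=alpha_from_halflife).mean()
)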
+ +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from .....core import tile + + +def test_ewm(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df) + + with pytest.raises(NotImplementedError): + _ = df2.ewm(2, adjust=False, ignore_na=False) + + with pytest.raises(ValueError): + _ = df2.ewm() + + with pytest.raises(ValueError): + _ = df2.ewm(com=2, alpha=0.3) + + assert pytest.approx(df2.ewm(com=1).alpha) == 0.5 + with pytest.raises(ValueError): + _ = df2.ewm(com=-1) + + assert pytest.approx(df2.ewm(span=3).alpha) == 0.5 + with pytest.raises(ValueError): + _ = df2.ewm(span=0) + + assert pytest.approx(df2.ewm(halflife=1).alpha) == 0.5 + with pytest.raises(ValueError): + _ = df2.ewm(halflife=-1) + + with pytest.raises(ValueError): + _ = df2.ewm(alpha=2) + + r = df2.ewm(3) + expected = df.ewm(3) + assert repr(r) == repr(expected) + + r = df2.ewm(alpha=1) + assert type(r).__name__ == "Expanding" + + +def test_ewm_agg(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df, chunk_size=3) + + with pytest.raises(NotImplementedError): + _ = df2.ewm(span=3, axis=1).agg("mean") + + r = df2.ewm(span=3).agg("mean") + expected = df.ewm(span=3).agg("mean") + + assert r.shape == df.shape + assert r.index_value is df2.index_value + pd.testing.assert_index_equal(r.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(r.dtypes, df2.dtypes) + + r = tile(r) + for c in r.chunks: + assert c.shape == c.inputs[0].shape + assert c.index_value is c.inputs[0].index_value + pd.testing.assert_index_equal(c.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(c.dtypes, expected.dtypes) + + aggs = ["mean", "var", "std"] + for a in aggs: + r = getattr(df2.ewm(span=3), a)() + assert r.op.func == a diff --git a/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm_execution.py b/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm_execution.py new file mode 100644 index 000000000..97147f6a3 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm_execution.py @@ -0,0 +1,137 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +import numpy as np +import pandas as pd + +from ..... 
import dataframe as md + + +def test_dataframe_ewm_agg(setup): + np.random.seed(0) + + raw = pd.DataFrame( + { + "a": np.random.randint(100, size=(10,)), + "b": np.random.rand(10), + "c": np.random.randint(100, size=(10,)), + "d": ["c" * i for i in np.random.randint(4, size=10)], + } + ) + raw.b[0:3] = np.nan + raw.b[5:7] = np.nan + raw.b[9] = np.nan + + df = md.DataFrame(raw, chunk_size=(10, 3)) + + r = df.ewm(alpha=0.5).agg("mean") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.5).agg("mean")) + + r = df.ewm(alpha=0.5).agg(["mean"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.5).agg(["mean"])) + + df = md.DataFrame(raw, chunk_size=(3, 3)) + + aggs = ["mean", "var", "std"] + + for fun_name in aggs: + r = df.ewm(alpha=0.3).agg(fun_name) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3).agg(fun_name) + ) + + r = df.ewm(alpha=0.3, ignore_na=True).agg(fun_name) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, ignore_na=True).agg(fun_name) + ) + + r = df.ewm(alpha=0.3).agg("mean") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg("mean")) + + r = df.ewm(alpha=0.3).agg(["mean"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(["mean"])) + + r = df.ewm(alpha=0.3).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(aggs)) + + agg_dict = {"c": "mean"} + r = df.ewm(alpha=0.3).agg(agg_dict) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(agg_dict)) + + agg_dict = OrderedDict([("a", ["mean", "var"]), ("b", "var")]) + r = df.ewm(alpha=0.3).agg(agg_dict) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(agg_dict)) + + r = df.ewm(alpha=0.3, min_periods=0).agg(aggs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, min_periods=0).agg(aggs) + ) + + r = df.ewm(alpha=0.3, min_periods=2).agg(aggs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, min_periods=2).agg(aggs) + ) + + agg_dict = OrderedDict([("a", ["mean", "var"]), ("b", "var"), ("c", "mean")]) + r = df.ewm(alpha=0.3, min_periods=2).agg(agg_dict) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, min_periods=2).agg(agg_dict) + ) + + +def test_series_expanding_agg(setup): + raw = pd.Series(np.random.rand(10), name="a") + raw[:3] = np.nan + raw[5:10:2] = np.nan + + series = md.Series(raw, chunk_size=10) + + r = series.ewm(alpha=0.3).agg(["mean"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(["mean"])) + + r = series.ewm(alpha=0.3).agg("mean") + pd.testing.assert_series_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg("mean")) + + series = md.Series(raw, chunk_size=3) + + aggs = ["mean", "var", "std"] + + for fun_name in aggs: + r = series.ewm(alpha=0.3).agg(fun_name) + pd.testing.assert_series_equal( + r.execute().fetch(), raw.ewm(alpha=0.3).agg(fun_name) + ) + + r = series.ewm(alpha=0.3, ignore_na=True).agg(fun_name) + pd.testing.assert_series_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, ignore_na=True).agg(fun_name) + ) + + r = series.ewm(alpha=0.3).agg(["mean"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(["mean"])) + + r = series.ewm(alpha=0.3).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(aggs)) + + r = series.ewm(alpha=0.3, min_periods=0).agg(aggs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, 
min_periods=0).agg(aggs) + ) + + r = series.ewm(alpha=0.3, min_periods=2).agg(aggs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, min_periods=2).agg(aggs) + ) diff --git a/python/xorbits/_mars/dataframe/window/expanding/__init__.py b/python/xorbits/_mars/dataframe/window/expanding/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/expanding/aggregation.py b/python/xorbits/_mars/dataframe/window/expanding/aggregation.py new file mode 100644 index 000000000..502ef3c3c --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/aggregation.py @@ -0,0 +1,177 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple +from functools import partial + +import numpy as np +import pandas as pd + +from .... 
import opcodes +from ....serialization.serializables import BoolField +from ..aggregation import BaseDataFrameExpandingAgg + +_stage_info = namedtuple( + "_stage_info", + ( + "map_groups", + "map_sources", + "combine_sources", + "combine_columns", + "combine_funcs", + "key_to_funcs", + "valid_columns", + "min_periods_func_name", + ), +) + +_cum_alpha_coeff_func = "_cum_alpha_coeff" +_cum_square_alpha_coeff_func = "_cum_square_alpha_coeff" + + +def _add_pred_results(pred_results, local_results, axis=0): + if pred_results[0].ndim == 1: + df_filler = 0 + else: + df_filler = pred_results[0].iloc[-1, :].dropna() + df_filler[:] = 0 + + new_locals = [] + combine_axis = pred_results[0].ndim - axis - 1 + for pred_result, local_result in zip(pred_results, local_results): + local_result = local_result.fillna(df_filler, axis=axis) + new_locals.append( + local_result.add(pred_result.sum(axis=axis), axis=combine_axis) + ) + return new_locals + + +def _combine_arithmetic(pred_results, local_results, axis=0): + if pred_results is None: + return local_results[0] + return _add_pred_results(pred_results, local_results, axis=axis)[0] + + +def _combine_minmax(pred_results, local_results, axis=0, fun_name=None): + if pred_results is None: + return local_results[0] + + pred_size = len(pred_results[0]) + con = pd.concat([pred_results[0], local_results[0]], axis=axis) + result = con.expanding(axis=axis).agg(fun_name) + if result.ndim == 2: + return result.iloc[pred_size:, :] if axis == 0 else result.iloc[:, pred_size:] + else: + return result.iloc[pred_size:] + + +def _combine_mean(pred_results, local_results, axis=0): + local_sum_data, local_count_data = local_results + + if pred_results is not None: + local_sum_data, local_count_data = _add_pred_results( + pred_results, local_results, axis=axis + ) + return local_sum_data / local_count_data + + +def _combine_var(pred_results, local_results, axis=0): + local_sum_data, local_count_data, local_var_data = local_results + if pred_results is None: + return local_var_data * local_count_data / (local_count_data - 1) + + pred_sum_data, pred_count_data, pred_var_data = pred_results + + local_sum_square = ( + local_count_data * local_var_data + local_sum_data**2 / local_count_data + ) + pred_sum_square = ( + pred_count_data * pred_var_data + pred_sum_data**2 / pred_count_data + ) + + local_sum_square, local_sum_data, local_count_data = _add_pred_results( + [pred_sum_square, pred_sum_data, pred_count_data], + [local_sum_square, local_sum_data, local_count_data], + axis=axis, + ) + + return (local_sum_square - local_sum_data**2 / local_count_data) / ( + local_count_data - 1 + ) + + +def _combine_std(pred_results, local_results, axis=0): + return np.sqrt(_combine_var(pred_results, local_results, axis=axis)) + + +class DataFrameExpandingAgg(BaseDataFrameExpandingAgg): + _op_type_ = opcodes.EXPANDING_AGG + + _center = BoolField("center") + + def __init__(self, center=None, **kw): + super().__init__(_center=center, **kw) + + @property + def center(self): + return self._center + + @classmethod + def _get_stage_functions(cls, op: "DataFrameExpandingAgg", func): + if func == "_data_count": + return ["count"], _combine_arithmetic + elif func in ("sum", "prod", "count"): + return [func], _combine_arithmetic + elif func in ("min", "max"): + return [func], partial(_combine_minmax, fun_name=func) + elif func == "mean": + return ["sum", "count"], _combine_mean + elif func in {"var", "std"}: + return ( + ["sum", "count", "var"], + _combine_var if func == "var" else _combine_std, + ) + 
else: # pragma: no cover + raise NotImplementedError + + @classmethod + def _execute_map_function(cls, op: "DataFrameExpandingAgg", func, in_data): + min_periods = 1 if op.min_periods > 0 else 0 + + expanding = in_data.expanding( + min_periods=min_periods, center=op.center, axis=op.axis + ) + if func == "var": + result = expanding.var(ddof=0) + else: + result = expanding.agg(func) + + if op.output_agg: + summary = result.iloc[len(result) - 1 : len(result)] + else: + summary = None + return result, summary + + @classmethod + def _execute_combine_function( + cls, op: "DataFrameExpandingAgg", func, pred_inputs, local_inputs, func_cols + ): + return func(pred_inputs, local_inputs, axis=op.axis) + + @classmethod + def _execute_raw_function(cls, op: "DataFrameExpandingAgg", in_data): + expanding = in_data.expanding( + min_periods=op.min_periods, center=op.center, axis=op.axis + ) + return expanding.agg(op.func) diff --git a/python/xorbits/_mars/dataframe/window/expanding/core.py b/python/xorbits/_mars/dataframe/window/expanding/core.py new file mode 100644 index 000000000..e74c3e692 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/core.py @@ -0,0 +1,161 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +from ....serialization.serializables import ( + BoolField, + Int32Field, + Int64Field, + StringField, +) +from ....utils import pd_release_version +from ...utils import validate_axis +from ..core import Window + +_window_has_method = pd_release_version >= (1, 3, 0) + + +class Expanding(Window): + _min_periods = Int64Field("min_periods") + _axis = Int32Field("axis") + _center = BoolField("center") + _method = StringField("method") + + def __init__(self, min_periods=None, axis=None, center=None, method=None, **kw): + super().__init__( + _min_periods=min_periods, _axis=axis, _center=center, _method=method, **kw + ) + + @property + def min_periods(self): + return self._min_periods + + @property + def axis(self): + return self._axis + + @property + def center(self): + return self._center + + @property + def method(self): + return self._method or "single" + + def __call__(self, df): + return df.expanding(**self.params) + + @property + def params(self): + p = OrderedDict() + + if not _window_has_method: # pragma: no cover + args = ["min_periods", "center", "axis"] + else: + args = ["min_periods", "center", "axis", "method"] + + for k in args: + p[k] = getattr(self, k) + return p + + def aggregate(self, func, **kwargs): + from .aggregation import DataFrameExpandingAgg + + count_always_valid = kwargs.pop("_count_always_valid", False) + + op = DataFrameExpandingAgg( + func=func, count_always_valid=count_always_valid, **self.params + ) + return op(self) + + agg = aggregate + + def sum(self): + return self.aggregate("sum") + + def count(self): + return self.aggregate("count") + + def min(self): + return self.aggregate("min") + + def max(self): + return self.aggregate("max") + + def mean(self): + return 
self.aggregate("mean") + + def var(self): + return self.aggregate("var") + + def std(self): + return self.aggregate("std") + + +def expanding(obj, min_periods=1, center=False, axis=0): + """ + Provide expanding transformations. + + Parameters + ---------- + min_periods : int, default 1 + Minimum number of observations in window required to have a value + (otherwise result is NA). + center : bool, default False + Set the labels at the center of the window. + axis : int or str, default 0 + + Returns + ------- + a Window sub-classed for the particular operation + + See Also + -------- + rolling : Provides rolling window calculations. + ewm : Provides exponential weighted functions. + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + Examples + -------- + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df.execute() + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + >>> df.expanding(2).sum().execute() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 3.0 + 4 7.0 + """ + axis = validate_axis(axis, obj) + + if center: + raise NotImplementedError("center == True is not supported") + if axis == 1: + raise NotImplementedError("axis other than 0 is not supported") + + return Expanding(input=obj, min_periods=min_periods, center=center, axis=axis) diff --git a/python/xorbits/_mars/dataframe/window/expanding/tests/__init__.py b/python/xorbits/_mars/dataframe/window/expanding/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding.py b/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding.py new file mode 100644 index 000000000..a1ddedf5c --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ..... 
import dataframe as md +from .....core import tile + + +def test_expanding(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df) + + with pytest.raises(NotImplementedError): + _ = df2.expanding(3, center=True) + + with pytest.raises(NotImplementedError): + _ = df2.expanding(3, axis=1) + + r = df2.expanding(3, center=False) + expected = df.expanding(3, center=False) + assert repr(r) == repr(expected) + + assert "b" in dir(r) + + with pytest.raises(AttributeError): + _ = r.d + + with pytest.raises(KeyError): + _ = r["d"] + + with pytest.raises(KeyError): + _ = r["a", "d"] + + assert "a" not in dir(r.a) + assert "c" not in dir(r["a", "b"]) + + +def test_expanding_agg(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df, chunk_size=3) + + r = df2.expanding(3).agg("max") + expected = df.expanding(3).agg("max") + + assert r.shape == df.shape + assert r.index_value is df2.index_value + pd.testing.assert_index_equal(r.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(r.dtypes, df2.dtypes) + + r = tile(r) + for c in r.chunks: + assert c.shape == c.inputs[0].shape + assert c.index_value is c.inputs[0].index_value + pd.testing.assert_index_equal(c.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(c.dtypes, expected.dtypes) + + aggs = ["sum", "count", "min", "max", "mean", "var", "std"] + for a in aggs: + r = getattr(df2.expanding(3), a)() + assert r.op.func == a diff --git a/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding_execution.py b/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding_execution.py new file mode 100644 index 000000000..e1cf46529 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding_execution.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +import numpy as np +import pandas as pd + +from ..... 
import dataframe as md + + +def test_dataframe_expanding_agg(setup): + raw = pd.DataFrame( + { + "a": np.random.randint(100, size=(10,)), + "b": np.random.rand(10), + "c": np.random.randint(100, size=(10,)), + "d": ["c" * i for i in np.random.randint(4, size=10)], + } + ) + raw.b[:3] = np.nan + raw.b[5:7] = np.nan + + df = md.DataFrame(raw, chunk_size=(10, 3)) + + r = df.expanding().agg(["sum"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(["sum"])) + + df = md.DataFrame(raw, chunk_size=(3, 2)) + + aggs = ["sum", "count", "min", "max", "mean", "var", "std"] + + for fun_name in aggs: + r = df.expanding().agg(fun_name) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.expanding().agg(fun_name) + ) + + r = df.expanding().agg(["sum"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(["sum"])) + + r = df.expanding().agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(aggs)) + + agg_dict = {"c": "sum"} + r = df.expanding().agg(agg_dict) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(agg_dict)) + + agg_dict = OrderedDict([("a", ["sum", "var"]), ("b", "var")]) + r = df.expanding().agg(agg_dict) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(agg_dict)) + + r = df.expanding(0).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding(0).agg(aggs)) + + r = df.expanding(2).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding(2).agg(aggs)) + + agg_dict = OrderedDict([("a", ["min", "max"]), ("b", "max"), ("c", "sum")]) + r = df.expanding(2).agg(agg_dict) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding(2).agg(agg_dict)) + + +def test_series_expanding_agg(setup): + raw = pd.Series(np.random.rand(10), name="a") + raw[:3] = np.nan + raw[5:7] = np.nan + + series = md.Series(raw, chunk_size=10) + + r = series.expanding().agg(["sum"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(["sum"])) + + r = series.expanding().agg("sum") + pd.testing.assert_series_equal(r.execute().fetch(), raw.expanding().agg("sum")) + + series = md.Series(raw, chunk_size=3) + + aggs = ["sum", "count", "min", "max", "mean", "var", "std"] + + for fun_name in aggs: + r = series.expanding().agg(fun_name) + pd.testing.assert_series_equal( + r.execute().fetch(), raw.expanding().agg(fun_name) + ) + + r = series.expanding().agg(["sum"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(["sum"])) + + r = series.expanding().agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(aggs)) + + r = series.expanding(2).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding(2).agg(aggs)) + + r = series.expanding(0).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding(0).agg(aggs)) diff --git a/python/xorbits/_mars/dataframe/window/rolling/__init__.py b/python/xorbits/_mars/dataframe/window/rolling/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/rolling/aggregation.py b/python/xorbits/_mars/dataframe/window/rolling/aggregation.py new file mode 100644 index 000000000..e8d93f285 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/aggregation.py @@ -0,0 +1,488 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from .... import opcodes +from ....core import recursive_tile +from ....serialization.serializables import ( + AnyField, + BoolField, + DictField, + FieldTypes, + Int32Field, + Int64Field, + KeyField, + ListField, + StringField, + TupleField, +) +from ....utils import calc_nsplits, has_unknown_shape, lazy_import, pd_release_version +from ...core import DATAFRAME_TYPE +from ...operands import DataFrameOperand, DataFrameOperandMixin +from ...utils import build_empty_df, build_empty_series, parse_index + +cudf = lazy_import("cudf") +_with_pandas_issue_38908 = pd_release_version == (1, 2, 0) + + +class DataFrameRollingAgg(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.ROLLING_AGG + + _input = KeyField("input") + _window = AnyField("window") + _min_periods = Int64Field("min_periods") + _center = BoolField("center") + _win_type = StringField("win_type") + _on = StringField("on") + _axis = Int32Field("axis") + _closed = StringField("closed") + _func = AnyField("func") + _func_args = TupleField("func_args") + _func_kwargs = DictField("func_kwargs") + # for chunks + _preds = ListField("preds", FieldTypes.key) + _succs = ListField("succs", FieldTypes.key) + + def __init__( + self, + input=None, + window=None, + min_periods=None, + center=None, # pylint: disable=redefined-builtin + win_type=None, + on=None, + axis=None, + closed=None, + func=None, + func_args=None, + func_kwargs=None, + output_types=None, + preds=None, + succs=None, + **kw + ): + super().__init__( + _input=input, + _window=window, + _min_periods=min_periods, + _center=center, + _win_type=win_type, + _on=on, + _axis=axis, + _closed=closed, + _func=func, + _func_args=func_args, + _func_kwargs=func_kwargs, + _output_types=output_types, + _preds=preds, + _succs=succs, + **kw + ) + + @property + def input(self): + return self._input + + @property + def window(self): + return self._window + + @property + def min_periods(self): + return self._min_periods + + @property + def center(self): + return self._center + + @property + def win_type(self): + return self._win_type + + @property + def on(self): + return self._on + + @property + def axis(self): + return self._axis + + @property + def closed(self): + return 
self._closed + + @property + def func(self): + return self._func + + @property + def func_args(self): + return self._func_args + + @property + def func_kwargs(self): + return self._func_kwargs + + @property + def preds(self): + return self._preds if self._preds is not None else [] + + @property + def succs(self): + return self._succs if self._succs is not None else [] + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(self._inputs) + self._input = next(input_iter) + if self._preds is not None: + self._preds = [next(input_iter) for _ in self._preds] + if self._succs is not None: + self._succs = [next(input_iter) for _ in self._succs] + + def __call__(self, rolling): + inp = rolling.input + + if isinstance(inp, DATAFRAME_TYPE): + pd_index = inp.index_value.to_pandas() + empty_df = build_empty_df(inp.dtypes, index=pd_index[:0]) + params = rolling.params.copy() + if params["win_type"] == "freq": + params["win_type"] = None + if self._func != "count": + empty_df = empty_df._get_numeric_data() + test_df = empty_df.rolling(**params).agg(self._func) + if self._axis == 0: + index_value = inp.index_value + else: + index_value = parse_index( + test_df.index, rolling.params, inp, store_data=False + ) + return self.new_dataframe( + [inp], + shape=(inp.shape[0], test_df.shape[1]), + dtypes=test_df.dtypes, + index_value=index_value, + columns_value=parse_index(test_df.columns, store_data=True), + ) + else: + pd_index = inp.index_value.to_pandas() + empty_series = build_empty_series( + inp.dtype, index=pd_index[:0], name=inp.name + ) + test_obj = empty_series.rolling(**rolling.params).agg(self._func) + if isinstance(test_obj, pd.DataFrame): + return self.new_dataframe( + [inp], + shape=(inp.shape[0], test_obj.shape[1]), + dtypes=test_obj.dtypes, + index_value=inp.index_value, + columns_value=parse_index(test_obj.dtypes.index, store_data=True), + ) + else: + return self.new_series( + [inp], + shape=inp.shape, + dtype=test_obj.dtype, + index_value=inp.index_value, + name=test_obj.name, + ) + + @classmethod + def _check_can_be_tiled(cls, op, is_window_int): + inp = op.input + axis = op.axis + + if axis == 0 and inp.ndim == 2: + if has_unknown_shape(inp): + yield + inp = yield from recursive_tile(inp.rechunk({1: inp.shape[1]})) + + if is_window_int: + # if window is integer + if any(np.isnan(ns) for ns in inp.nsplits[op.axis]): + yield + else: + # if window is offset + # must be aware of index's meta including min and max + for i in range(inp.chunk_shape[axis]): + chunk_index = [0, 0] + chunk_index[axis] = i + chunk = inp.cix[tuple(chunk_index)] + + if axis == 0: + index_value = chunk.index_value + else: + index_value = chunk.columns_value + if pd.isnull(index_value.min_val) or pd.isnull(index_value.max_val): + yield + + return inp + + @classmethod + def _find_extra_chunks_for_int_window(cls, op, inp, cur_chunk_index): + from ...indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem + + axis = op.axis + window = op.window + center = op.center + + # find prev chunks + i = cur_chunk_index[axis] + rest = window if not center else window // 2 + prev_chunks = [] + while i > 0 and rest > 0: + prev_chunk_index = list(cur_chunk_index) + prev_chunk_index[axis] = i - 1 + prev_chunk_index = tuple(prev_chunk_index) + + prev_chunk = inp.cix[prev_chunk_index] + size = prev_chunk.shape[axis] + if size <= rest: + prev_chunks.insert(0, prev_chunk) + rest -= size + else: + if prev_chunk.ndim == 1: + slice_prev_chunk_op = SeriesIlocGetItem( + indexes=[slice(-rest, None)] + ) + else: + 
slices = [slice(None)] * 2 + slices[axis] = slice(-rest, None) + slice_prev_chunk_op = DataFrameIlocGetItem(indexes=slices) + slice_prev_chunk = slice_prev_chunk_op.new_chunk([prev_chunk]) + prev_chunks.insert(0, slice_prev_chunk) + rest = 0 + + i -= 1 + + # find succ chunks + j = cur_chunk_index[axis] + rest = 0 if not center else window - window // 2 - 1 + chunk_size = inp.chunk_shape[axis] + succ_chunks = [] + while j < chunk_size - 1 and rest > 0: + succ_chunk_index = list(cur_chunk_index) + succ_chunk_index[axis] = j + 1 + succ_chunk_index = tuple(succ_chunk_index) + + succ_chunk = inp.cix[succ_chunk_index] + size = succ_chunk.shape[axis] + if size <= rest: + succ_chunks.append(succ_chunk) + rest -= size + else: + if succ_chunk.ndim == 1: + slice_succ_chunk_op = SeriesIlocGetItem(indexes=[slice(rest)]) + else: + slices = [slice(None)] * 2 + slices[axis] = slice(rest) + slice_succ_chunk_op = DataFrameIlocGetItem(indexes=slices) + slice_succ_chunk = slice_succ_chunk_op.new_chunk([succ_chunk]) + succ_chunks.append(slice_succ_chunk) + rest = 0 + + j += 1 + + return prev_chunks, succ_chunks + + @classmethod + def _find_extra_chunks_for_offset_window(cls, op, inp, cur_chunk_index): + from ...indexing.loc import DataFrameLocGetItem + + # when window is offset, center=True is not supported + assert not op.center + + axis = op.axis + window = pd.Timedelta(op.window) + ndim = inp.ndim + + # find prev chunks + i = cur_chunk_index[axis] + prev_chunks = [] + cur_index_min = inp.cix[cur_chunk_index].index_value.min_val + start = cur_index_min - window + assert cur_chunk_index is not None + while i > 0: + prev_chunk_index = list(cur_chunk_index) + prev_chunk_index[axis] = i - 1 + prev_chunk_index = tuple(prev_chunk_index) + + prev_chunk = inp.cix[prev_chunk_index] + prev_index_max = prev_chunk.index_value.max_val + if prev_index_max >= start: + slices = [slice(None)] * ndim + slices[axis] = slice(start, None) + prev_chunk_op = DataFrameLocGetItem( + indexes=slices, output_types=prev_chunk.op.output_types + ) + slice_prev_chunk = prev_chunk_op.new_chunk([prev_chunk]) + prev_chunks.insert(0, slice_prev_chunk) + else: + # index max < start, break + break + + i -= 1 + + return prev_chunks, [] + + @classmethod + def tile(cls, op): + inp = op.input + out = op.outputs[0] + is_window_int = op.win_type != "freq" + axis = op.axis + input_ndim = inp.ndim + output_ndim = out.ndim + + # check if can be tiled + inp = yield from cls._check_can_be_tiled(op, is_window_int) + + if inp.ndim == 1 and out.ndim == 1: + # input series, output series + other_iter = [None] + elif inp.ndim == 1: + # input series, output dataframe + other_iter = [0] + else: + other_iter = range(inp.chunk_shape[1 - axis]) + + out_chunks = [] + for i in other_iter: + for j in range(inp.chunk_shape[axis]): + chunk_op = op.copy().reset_key() + + if inp.ndim == 1: + chunk_index = (j,) + else: + chunk_index = [None, None] + chunk_index[1 - axis] = i + chunk_index[axis] = j + chunk_index = tuple(chunk_index) + + inp_chunk = inp.cix[chunk_index] + if is_window_int: + pred_chunks, succ_chunks = cls._find_extra_chunks_for_int_window( + op, inp, chunk_index + ) + else: + pred_chunks, succ_chunks = cls._find_extra_chunks_for_offset_window( + op, inp, chunk_index + ) + + out_chunk_index = [None] * output_ndim + out_chunk_index[axis] = j + if output_ndim == 2: + out_chunk_index[1 - axis] = i + out_chunk_index = tuple(out_chunk_index) + + chunk_params = {"index": out_chunk_index} + if input_ndim == 1 and output_ndim == 1: + chunk_params["shape"] = 
inp_chunk.shape + chunk_params["dtype"] = out.dtype + chunk_params["index_value"] = inp_chunk.index_value + chunk_params["name"] = inp_chunk.name + elif input_ndim == 1 and output_ndim == 2: + chunk_params["shape"] = (inp_chunk.shape[0], out.shape[1]) + chunk_params["dtypes"] = out.dtypes + chunk_params["index_value"] = inp_chunk.index_value + chunk_params["columns_value"] = out.columns_value + else: + if axis == 0: + out_shape = list(out.shape) + out_shape[axis] = inp_chunk.shape[axis] + chunk_params["shape"] = tuple(out_shape) + else: + chunk_params["shape"] = inp_chunk.shape + chunk_params["index_value"] = ( + inp_chunk.index_value if axis == 0 else out.index_value + ) + chunk_params["dtypes"] = ( + out.dtypes if axis == 0 else inp_chunk.dtypes + ) + chunk_params["columns_value"] = ( + out.columns_value if axis == 0 else inp_chunk.columns_value + ) + + if len(pred_chunks) > 0: + chunk_op._preds = pred_chunks + if len(succ_chunks) > 0: + chunk_op._succs = succ_chunks + out_chunk = chunk_op.new_chunk( + [inp_chunk] + pred_chunks + succ_chunks, kws=[chunk_params] + ) + out_chunks.append(out_chunk) + + params = out.params + params["chunks"] = out_chunks + if out.ndim == 1: + params["shape"] = (inp.shape[0],) + else: + params["shape"] = (inp.shape[0], params["shape"][1]) + params["nsplits"] = calc_nsplits({c.index: c.shape for c in out_chunks}) + new_op = op.copy() + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.input.key] + axis = op.axis + win_type = op.win_type + window = op.window + if win_type == "freq": + win_type = None + window = pd.Timedelta(window) + + preds = [ctx[pred.key] for pred in op.preds] + pred_size = sum(pred.shape[axis] for pred in preds) + succs = [ctx[succ.key] for succ in op.succs] + succ_size = sum(succ.shape[axis] for succ in succs) + + xdf = pd if isinstance(inp, (pd.DataFrame, pd.Series)) else cudf + + if pred_size > 0 or succ_size > 0: + data = xdf.concat(preds + [inp] + succs, axis=axis) + else: + data = inp + + # fix for pandas 1.2.0 + # see: https://github.com/pandas-dev/pandas/issues/38908 + # df.rolling().aggregate('skew') modified original data + # so we copy it first for skew only + if ( + _with_pandas_issue_38908 + and op.func in ["skew", "kurt"] + and op.outputs[0].index[0] == 0 + ): + data = data.copy() + + r = data.rolling( + window=window, + min_periods=op.min_periods, + center=op.center, + win_type=win_type, + on=op.on, + axis=axis, + closed=op.closed, + ) + result = r.aggregate(op.func, *op.func_args, **op.func_kwargs) + + if pred_size > 0 or succ_size > 0: + slc = [slice(None)] * result.ndim + slc[axis] = slice(pred_size, result.shape[axis] - succ_size) + result = result.iloc[tuple(slc)] + + ctx[op.outputs[0].key] = result diff --git a/python/xorbits/_mars/dataframe/window/rolling/core.py b/python/xorbits/_mars/dataframe/window/rolling/core.py new file mode 100644 index 000000000..05933f1a9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/core.py @@ -0,0 +1,354 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +from ....serialization.serializables import ( + AnyField, + BoolField, + Int32Field, + Int64Field, + StringField, +) +from ....utils import pd_release_version +from ...core import DATAFRAME_TYPE +from ...utils import build_empty_df, build_empty_series, validate_axis +from ..core import Window + +_window_has_method = pd_release_version >= (1, 3, 0) + + +class Rolling(Window): + _window = AnyField("window") + _min_periods = Int64Field("min_periods") + _center = BoolField("center") + _win_type = StringField("win_type") + _on = StringField("on") + _axis = Int32Field("axis") + _closed = StringField("closed") + _method = StringField("method") + + def __init__( + self, + window=None, + min_periods=None, + center=None, + win_type=None, + on=None, + axis=None, + closed=None, + method=None, + **kw + ): + super().__init__( + _window=window, + _min_periods=min_periods, + _center=center, + _win_type=win_type, + _on=on, + _axis=axis, + _closed=closed, + _method=method, + **kw + ) + + @property + def window(self): + return self._window + + @property + def min_periods(self): + return self._min_periods + + @property + def center(self): + return self._center + + @property + def win_type(self): + return self._win_type + + @property + def on(self): + return self._on + + @property + def axis(self): + return self._axis + + @property + def closed(self): + return self._closed + + @property + def method(self): + return self._method or "single" + + @property + def params(self): + p = OrderedDict() + + if not _window_has_method: # pragma: no cover + args = [ + "window", + "min_periods", + "center", + "win_type", + "axis", + "on", + "closed", + ] + else: + args = [ + "window", + "min_periods", + "center", + "win_type", + "axis", + "on", + "closed", + "method", + ] + + for attr in args: + p[attr] = getattr(self, attr) + return p + + def _repr_name(self): + return "Rolling" if self.win_type is None else "Window" + + def validate(self): + # leverage pandas itself to do validation + pd_index = self._input.index_value.to_pandas() + if isinstance(self._input, DATAFRAME_TYPE): + empty_obj = build_empty_df(self._input.dtypes, index=pd_index[:0]) + else: + empty_obj = build_empty_series( + self._input.dtype, index=pd_index[:0], name=self._input.name + ) + pd_rolling = empty_obj.rolling(**self.params) + for k in self.params: + # update value according to pandas rolling + setattr(self, "_" + k, getattr(pd_rolling, k)) + + def aggregate(self, func, *args, **kwargs): + from .aggregation import DataFrameRollingAgg + + op = DataFrameRollingAgg( + func=func, func_args=args, func_kwargs=kwargs, **self.params + ) + return op(self) + + def agg(self, func, *args, **kwargs): + return self.aggregate(func, *args, **kwargs) + + def count(self): + return self.aggregate("count") + + def sum(self, *args, **kwargs): + return self.aggregate("sum", *args, **kwargs) + + def mean(self, *args, **kwargs): + return self.aggregate("mean", *args, **kwargs) + + def median(self, **kwargs): + return self.aggregate("median", **kwargs) + + def var(self, ddof=1, *args, **kwargs): + return self.aggregate("var", ddof=ddof, *args, **kwargs) + + def std(self, ddof=1, *args, **kwargs): + return self.aggregate("std", ddof=ddof, *args, **kwargs) + + def min(self, *args, **kwargs): + return self.aggregate("min", *args, **kwargs) + + def max(self, *args, **kwargs): + return self.aggregate("max", *args, **kwargs) + + def 
skew(self, **kwargs): + return self.aggregate("skew", **kwargs) + + def kurt(self, **kwargs): + return self.aggregate("kurt", **kwargs) + + +def rolling( + obj, + window, + min_periods=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None, +): + """ + Provide rolling window calculations. + + Parameters + ---------- + window : int, or offset + Size of the moving window. This is the number of observations used for + calculating the statistic. Each window will be a fixed size. + If its an offset then this will be the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetimelike indexes. This is + new in 0.19.0 + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). For a window that is specified by an offset, + `min_periods` will default to 1. Otherwise, `min_periods` will default + to the size of the window. + center : bool, default False + Set the labels at the center of the window. + win_type : str, default None + Provide a window type. If ``None``, all points are evenly weighted. + See the notes below for further information. + on : str, optional + For a DataFrame, a datetime-like column on which to calculate the rolling + window, rather than the DataFrame's index. Provided integer column is + ignored and excluded from result since an integer index is not used to + calculate the rolling window. + axis : int or str, default 0 + closed : str, default None + Make the interval closed on the 'right', 'left', 'both' or + 'neither' endpoints. + For offset-based windows, it defaults to 'right'. + For fixed windows, defaults to 'both'. Remaining cases not implemented + for fixed windows. + + Returns + ------- + a Window or Rolling sub-classed for the particular operation + + See Also + -------- + expanding : Provides expanding transformations. + ewm : Provides exponential weighted functions. + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + To learn more about the offsets & frequency strings, please see `this link + `__. + + The recognized win_types are: + * ``boxcar`` + * ``triang`` + * ``blackman`` + * ``hamming`` + * ``bartlett`` + * ``parzen`` + * ``bohman`` + * ``blackmanharris`` + * ``nuttall`` + * ``barthann`` + * ``kaiser`` (needs beta) + * ``gaussian`` (needs std) + * ``general_gaussian`` (needs power, width) + * ``slepian`` (needs width) + * ``exponential`` (needs tau), center is set to None. + + If ``win_type=None`` all points are evenly weighted. To learn more about + different window types see `scipy.signal window functions + `__. + + Examples + -------- + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df.execute() + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + Rolling sum with a window length of 2, using the 'triang' + window type. + + >>> df.rolling(2, win_type='triang').sum().execute() + B + 0 NaN + 1 0.5 + 2 1.5 + 3 NaN + 4 NaN + + Rolling sum with a window length of 2, min_periods defaults + to the window length. 
+ + >>> df.rolling(2).sum().execute() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 NaN + 4 NaN + + Same as above, but explicitly set the min_periods + + >>> df.rolling(2, min_periods=1).sum().execute() + B + 0 0.0 + 1 1.0 + 2 3.0 + 3 2.0 + 4 4.0 + + A ragged (meaning not-a-regular frequency), time-indexed DataFrame + + >>> df = md.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + >>> index = [md.Timestamp('20130101 09:00:00'), + >>> md.Timestamp('20130101 09:00:02'), + >>> md.Timestamp('20130101 09:00:03'), + >>> md.Timestamp('20130101 09:00:05'), + >>> md.Timestamp('20130101 09:00:06')]) + >>> df.execute() + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 2.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + + Contrasting to an integer rolling window, this will roll a variable + length window corresponding to the time period. + The default for min_periods is 1. + + >>> df.rolling('2s').sum().execute() + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 3.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + """ + axis = validate_axis(axis, obj) + r = Rolling( + input=obj, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) + r.validate() + return r diff --git a/python/xorbits/_mars/dataframe/window/rolling/tests/__init__.py b/python/xorbits/_mars/dataframe/window/rolling/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling.py b/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling.py new file mode 100644 index 000000000..31535c1fe --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ..... 
import dataframe as md +from .....core import tile + + +def test_rolling(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df) + + r = df2.rolling(3, min_periods=1, center=True, win_type="triang", closed="both") + expected = df.rolling( + 3, min_periods=1, center=True, win_type="triang", closed="both" + ) + assert repr(r) == repr(expected) + + assert "b" in dir(r) + + with pytest.raises(AttributeError): + _ = r.d + + with pytest.raises(KeyError): + _ = r["d"] + + with pytest.raises(KeyError): + _ = r["a", "d"] + + assert "a" not in dir(r.a) + assert "c" not in dir(r["a", "b"]) + + +def test_rolling_agg(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df, chunk_size=3) + + r = df2.rolling(3).agg("max") + expected = df.rolling(3).agg("max") + + assert r.shape == df.shape + assert r.index_value is df2.index_value + pd.testing.assert_index_equal(r.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(r.dtypes, df2.dtypes) + + r = tile(r) + for c in r.chunks: + assert c.shape == c.inputs[0].shape + assert c.index_value is c.inputs[0].index_value + pd.testing.assert_index_equal(c.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(c.dtypes, expected.dtypes) diff --git a/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling_execution.py b/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling_execution.py new file mode 100644 index 000000000..ac1bb79ac --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling_execution.py @@ -0,0 +1,145 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ..... 
import dataframe as md + + +def test_rolling_agg_execution(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "a": rs.randint(100, size=(10,)), + "b": rs.rand(10), + "c": rs.randint(100, size=(10,)), + "d": ["c" * i for i in rs.randint(4, size=10)], + } + ) + raw.iloc[1, ::4] = np.nan + s = raw.iloc[:, 1] + + dfs = [ + md.DataFrame(raw, chunk_size=10), # 1 chunk + md.DataFrame(raw, chunk_size=3), # multiple chunks on each axis + ] + funcs = ["min", ["max", "mean"], {"c": ["std"], "b": ["count", "min"]}] + + df2 = dfs[0].rolling(3).agg(funcs[2]) + + # test 1 chunk + result = df2.execute().fetch() + expected = raw.rolling(3).agg(funcs[2]) + pd.testing.assert_frame_equal(result, expected) + + for window in [2, 5]: + for center in [True, False]: + for func in funcs: + df2 = dfs[1].rolling(window, center=center).agg(func) + + result = df2.execute().fetch() + expected = raw.rolling(window, center=center).agg(func) + pd.testing.assert_frame_equal(result, expected) + + # test min_periods and win_type + df2 = dfs[1].rolling(3, min_periods=1, win_type="triang").agg("sum") + + result = df2.execute().fetch() + expected = raw.rolling(3, min_periods=1, win_type="triang").agg("sum") + pd.testing.assert_frame_equal(result, expected) + + # test rolling getitem, series + df2 = dfs[1].rolling(3)["b"].agg("sum") + + result = df2.execute().fetch() + expected = raw.rolling(3)["b"].agg("sum") + pd.testing.assert_series_equal(result, expected) + + # test rolling getitem, dataframe + df2 = dfs[1].rolling(3)["c", "b"].agg("sum") + + result = df2.execute().fetch() + expected = raw.rolling(3)["c", "b"].agg("sum") + pd.testing.assert_frame_equal(result, expected) + + # test axis=1 + df2 = dfs[1].rolling(3, axis=1).agg("sum") + + result = df2.execute( + extra_config=dict(check_all=False, check_nsplits=False) + ).fetch() + expected = raw.rolling(3, axis=1).agg("sum") + pd.testing.assert_frame_equal(result, expected) + + # test window which is offset + raw2 = raw.copy() + raw2.reset_index(inplace=True, drop=True) + raw2.index = pd.date_range("2020-2-25", periods=10) + + df = md.DataFrame(raw2, chunk_size=3) + for func in funcs: + df2 = df.rolling("2d").agg(func) + + result = df2.execute().fetch() + expected = raw2.rolling("2d").agg(func) + pd.testing.assert_frame_equal(result, expected) + + series = [md.Series(s, chunk_size=10), md.Series(s, chunk_size=4)] + + funcs = ["min", ["max", "mean"], {"c": "std", "b": "count"}] + + for series in series: + for window in [2, 3, 5]: + for center in [True, False]: + for func in funcs: + series2 = series.rolling(window, center=center).agg(func) + + result = series2.execute().fetch() + expected = s.rolling(window, center=center).agg(func) + if isinstance(expected, pd.Series): + pd.testing.assert_series_equal(result, expected) + else: + pd.testing.assert_frame_equal(result, expected) + + df = md.DataFrame(raw, chunk_size=3) + df = df[df.a > 0.5] + r = df.rolling(3).agg("max") + + result = r.execute().fetch() + expected = raw[raw.a > 0.5].rolling(3).agg("max") + pd.testing.assert_frame_equal(result, expected) + + series = md.Series(s, chunk_size=3) + series = series[series > 0.5] + r = series.rolling(3).agg("max") + + result = r.execute().fetch() + expected = s[s > 0.5].rolling(3).agg("max") + pd.testing.assert_series_equal(result, expected) + + # test agg functions + df = md.DataFrame(raw, chunk_size=3) + for func in ["count", "sum", "mean", "median", "min", "max", "skew", "kurt"]: + r = getattr(df.rolling(4), func)() + + result = r.execute().fetch() + expected = 
getattr(raw.rolling(4), func)() + pd.testing.assert_frame_equal(result, expected) + for func in ["std", "var"]: + r = getattr(df.rolling(4), func)(ddof=0) + + result = r.execute().fetch() + expected = getattr(raw.rolling(4), func)(ddof=0) + pd.testing.assert_frame_equal(result, expected) diff --git a/python/xorbits/_mars/deploy/__init__.py b/python/xorbits/_mars/deploy/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/deploy/kubedl/__init__.py b/python/xorbits/_mars/deploy/kubedl/__init__.py new file mode 100644 index 000000000..4f8329168 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubedl/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .client import KubeDLClusterClient, new_cluster diff --git a/python/xorbits/_mars/deploy/kubedl/client.py b/python/xorbits/_mars/deploy/kubedl/client.py new file mode 100644 index 000000000..e1ec944ac --- /dev/null +++ b/python/xorbits/_mars/deploy/kubedl/client.py @@ -0,0 +1,372 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
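+
+# Minimal usage sketch for this module (the image, SLB endpoint and worker
+# count below are placeholder values; the kubernetes API client is assumed to
+# come from ``kubernetes.config.new_client_from_config()``):
+#
+#     from kubernetes import config as kube_config
+#     from xorbits._mars.deploy.kubedl import new_cluster
+#
+#     client = new_cluster(
+#         kube_config.new_client_from_config(),
+#         image="registry.example.com/mars:latest",
+#         slb_endpoint="https://slb.example.com",
+#         worker_num=2,
+#     )
+#     print(client.endpoint)    # Mars web endpoint exposed behind the SLB
+#     client.stop(wait=True)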
+ +import logging +import time +import warnings + +import requests + +from ...session import new_session +from .config import ( + MarsJobConfig, + MarsSchedulerSpecConfig, + MarsWebSpecConfig, + MarsWorkerSpecConfig, +) + +try: + from kubernetes.client.rest import ApiException as K8SApiException +except ImportError: # pragma: no cover + K8SApiException = None + +KUBEDL_API_VERSION = "kubedl.io/v1alpha1" +KUBEDL_MARS_PLURAL = "marsjobs" + + +logger = logging.getLogger(__name__) + + +class KubeDLClusterClient: + def __init__(self, cluster): + self._cluster = cluster + self._endpoint = None + self._session = None + + @property + def endpoint(self): + return self._endpoint + + @property + def namespace(self): + return self._cluster.namespace + + @property + def session(self): + return self._session + + def start(self): + self._endpoint = self._cluster.start() + self._session = new_session(self._endpoint, verify_ssl=self._cluster.verify_ssl) + + def stop(self, wait=False, timeout=0): + self._cluster.stop(wait=wait, timeout=timeout) + + +class KubeDLCluster: + def __init__( + self, + kube_api_client=None, + image=None, + job_name=None, + namespace=None, + scheduler_num=1, + scheduler_cpu=None, + scheduler_mem=None, + worker_num=1, + worker_cpu=None, + worker_mem=None, + worker_spill_paths=None, + worker_cache_mem=None, + min_worker_num=None, + web_num=1, + web_cpu=None, + web_mem=None, + slb_endpoint=None, + verify_ssl=True, + timeout=None, + **kwargs, + ): + from kubernetes import client as kube_client + + self._kube_api_client = kube_api_client + self._custom_api = kube_client.CustomObjectsApi(kube_api_client) + + self._slb_endpoint = slb_endpoint.rstrip("/") + self._verify_ssl = verify_ssl + + self._job_name = job_name + self._mars_endpoint = None + self._namespace = namespace or "default" + self._image = image + self._timeout = timeout + self._extra_volumes = kwargs.pop("extra_volumes", ()) + self._pre_stop_command = kwargs.pop("pre_stop_command", None) + self._log_when_fail = kwargs.pop("log_when_fail", False) + self._node_selectors = kwargs.pop("node_selectors", None) + + extra_modules = kwargs.pop("extra_modules", None) or [] + extra_modules = ( + extra_modules.split(",") + if isinstance(extra_modules, str) + else extra_modules + ) + extra_envs = kwargs.pop("extra_env", None) or dict() + + if not verify_ssl: + extra_envs["KUBE_VERIFY_SSL"] = "0" + + def _override_modules(updates): + modules = set(extra_modules) + updates = updates.split(",") if isinstance(updates, str) else updates + modules.update(updates) + return sorted(modules) + + def _override_envs(updates): + ret = extra_envs.copy() + ret.update(updates) + return ret + + self._scheduler_num = scheduler_num + self._scheduler_cpu = scheduler_cpu + self._scheduler_mem = scheduler_mem + self._scheduler_extra_modules = _override_modules( + kwargs.pop("scheduler_extra_modules", []) + ) + self._scheduler_extra_env = _override_envs( + kwargs.pop("scheduler_extra_env", None) or dict() + ) + + self._worker_num = worker_num + self._worker_cpu = worker_cpu + self._worker_mem = worker_mem + self._worker_spill_paths = worker_spill_paths + self._worker_cache_mem = worker_cache_mem + self._min_worker_num = min_worker_num or worker_num + self._worker_extra_modules = _override_modules( + kwargs.pop("worker_extra_modules", []) + ) + self._worker_extra_env = _override_envs( + kwargs.pop("worker_extra_env", None) or dict() + ) + + self._web_num = web_num + self._web_cpu = web_cpu + self._web_mem = web_mem + self._web_extra_modules = 
_override_modules(kwargs.pop("web_extra_modules", [])) + self._web_extra_env = _override_envs( + kwargs.pop("web_extra_env", None) or dict() + ) + + @property + def verify_ssl(self): + return self._verify_ssl + + def _check_if_exist(self): + if self._job_name is None: + return False + try: + api, version = KUBEDL_API_VERSION.rsplit("/", 1) + service_obj = self._custom_api.get_namespaced_custom_object_status( + api, version, self._namespace, KUBEDL_MARS_PLURAL, self._job_name + ) + if len(service_obj.get("status", dict()).get("conditions", [])) > 0: + status = service_obj["status"]["conditions"][-1]["type"] + if status == "Running" or status == "Created": + logger.warning(f"Reusing cluster: {self._job_name}") + return True + else: + return False + else: + return False + except K8SApiException: + return False + + def _create_service(self): + scheduler_cfg = MarsSchedulerSpecConfig( + self._image, + self._scheduler_num, + cpu=self._scheduler_cpu, + memory=self._scheduler_mem, + node_selectors=self._node_selectors, + modules=self._scheduler_extra_modules, + ) + scheduler_cfg.add_simple_envs(self._scheduler_extra_env) + + worker_cfg = MarsWorkerSpecConfig( + self._image, + self._worker_num, + cpu=self._worker_cpu, + memory=self._worker_mem, + cache_mem=self._worker_cache_mem, + spill_dirs=self._worker_spill_paths, + node_selectors=self._node_selectors, + modules=self._worker_extra_modules, + ) + worker_cfg.add_simple_envs(self._worker_extra_env) + + web_cfg = MarsWebSpecConfig( + self._image, + self._web_num, + cpu=self._web_cpu, + memory=self._web_mem, + node_selectors=self._node_selectors, + modules=self._web_extra_modules, + ) + web_cfg.add_simple_envs(self._web_extra_env) + + job_cfg = MarsJobConfig( + job_name=self._job_name, + scheduler_config=scheduler_cfg, + worker_config=worker_cfg, + web_config=web_cfg, + web_host=self._slb_endpoint, + ) + + api, version = KUBEDL_API_VERSION.rsplit("/", 1) + + cfg_json = job_cfg.build() + cfg_json["apiVersion"] = KUBEDL_API_VERSION + + response = self._custom_api.create_namespaced_custom_object( + api, version, self._namespace, KUBEDL_MARS_PLURAL, cfg_json + ) + self._job_name = response["metadata"]["name"] + + def _wait_service_ready(self): + self._mars_endpoint = ( + f"{self._slb_endpoint}/mars/{self._namespace}/{self._job_name}-webservice-0" + ) + logger.warning(f"Kubedl job name: {self._job_name}") + check_start_time = time.time() + worker_count_url = self._mars_endpoint + "/api/worker?action=count" + while True: + try: + if self._timeout and time.time() - check_start_time > self._timeout: + raise TimeoutError("Check Mars service start timeout") + + if not self._verify_ssl: + try: + import urllib3 + + urllib3.disable_warnings( + urllib3.exceptions.InsecureRequestWarning + ) + except ImportError: # pragma: no cover + pass + + api, version = KUBEDL_API_VERSION.rsplit("/", 1) + service_obj = self._custom_api.get_namespaced_custom_object_status( + api, version, self._namespace, KUBEDL_MARS_PLURAL, self._job_name + ) + if len(service_obj.get("status", dict()).get("conditions", [])) > 0: + if service_obj["status"]["conditions"][-1]["type"] == "Failed": + raise SystemError( + service_obj["status"]["conditions"][-1]["message"] + ) + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message="Unverified HTTPS request" + ) + resp = requests.get( + worker_count_url, timeout=1, verify=self._verify_ssl + ) + + if int(resp.text) >= self._min_worker_num: + logger.warning(f"Web endpoint started at {self._mars_endpoint}") + break + except 
(requests.Timeout, ValueError) as ex: + if not isinstance(ex, requests.Timeout): + time.sleep(0.1) + pass + + def start(self): + try: + if not self._check_if_exist(): + self._create_service() + self._wait_service_ready() + return self._mars_endpoint + except: # noqa: E722 + self.stop() + raise + + def stop(self, wait=False, timeout=0): + from kubernetes import client as kube_client + + custom_api = kube_client.CustomObjectsApi(self._kube_api_client) + api, version = KUBEDL_API_VERSION.rsplit("/", 1) + custom_api.delete_namespaced_custom_object( + api, version, self._namespace, KUBEDL_MARS_PLURAL, self._job_name + ) + + if wait: + start_time = time.time() + while True: + try: + custom_api.get_namespaced_custom_object( + api, + version, + self._namespace, + KUBEDL_MARS_PLURAL, + self._job_name, + ) + except K8SApiException as ex: + if ex.status != 404: # pragma: no cover + raise + break + else: + time.sleep(1) + if ( + timeout and time.time() - start_time > timeout + ): # pragma: no cover + raise TimeoutError("Check Mars service stop timeout") + + +def new_cluster( + kube_api_client=None, + image=None, + scheduler_num=1, + scheduler_cpu=2, + scheduler_mem=4 * 1024**3, + worker_num=1, + worker_cpu=8, + worker_mem=32 * 1024**3, + worker_spill_paths=None, + worker_cache_mem="45%", + min_worker_num=None, + web_num=1, + web_cpu=1, + web_mem=4 * 1024**3, + slb_endpoint=None, + verify_ssl=True, + job_name=None, + timeout=None, + **kwargs, +): + worker_spill_paths = worker_spill_paths or ["/tmp/spill-dir"] + cluster = KubeDLCluster( + kube_api_client, + image=image, + scheduler_num=scheduler_num, + scheduler_cpu=scheduler_cpu, + scheduler_mem=scheduler_mem, + worker_num=worker_num, + worker_cpu=worker_cpu, + worker_mem=worker_mem, + worker_spill_paths=worker_spill_paths, + worker_cache_mem=worker_cache_mem, + min_worker_num=min_worker_num, + web_num=web_num, + web_cpu=web_cpu, + web_mem=web_mem, + slb_endpoint=slb_endpoint, + verify_ssl=verify_ssl, + job_name=job_name, + timeout=timeout, + **kwargs, + ) + client = KubeDLClusterClient(cluster) + client.start() + return client diff --git a/python/xorbits/_mars/deploy/kubedl/config.py b/python/xorbits/_mars/deploy/kubedl/config.py new file mode 100644 index 000000000..73425709f --- /dev/null +++ b/python/xorbits/_mars/deploy/kubedl/config.py @@ -0,0 +1,268 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
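+
+# The builders in this module produce the ``MarsJob`` custom-resource body
+# submitted by ``KubeDLCluster._create_service`` in client.py. A minimal
+# sketch of how they compose (image name and resource figures are
+# placeholders):
+#
+#     scheduler = MarsSchedulerSpecConfig("mars:latest", 1, cpu=2, memory=4 * 1024**3)
+#     worker = MarsWorkerSpecConfig(
+#         "mars:latest", 2, cpu=8, memory=32 * 1024**3, cache_mem="45%"
+#     )
+#     web = MarsWebSpecConfig("mars:latest", 1, cpu=1, memory=4 * 1024**3)
+#     job = MarsJobConfig(
+#         job_name=None,                  # None lets KubeDL generate a job name
+#         scheduler_config=scheduler,
+#         worker_config=worker,
+#         web_config=web,
+#         web_host="https://slb.example.com",
+#     )
+#     body = job.build()                  # plain dict for the custom objects API
+#     body["apiVersion"] = "kubedl.io/v1alpha1"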
+ +from urllib.parse import urlparse + +from ...utils import calc_size_by_str, parse_readable_size +from ..kubernetes.config import ContainerEnvConfig + +DEFAULT_SERVICE_ACCOUNT_NAME = "kubedl-sa" + + +def _remove_nones(cfg): + return dict((k, v) for k, v in cfg.items() if v is not None) + + +class ResourceConfig: + """ + Configuration builder for Kubernetes computation resources + """ + + def __init__(self, cpu, memory): + self._cpu = cpu + self._memory, ratio = ( + parse_readable_size(memory) if memory is not None else (None, False) + ) + assert not ratio + + def build(self): + return { + "cpu": str(self._cpu), + "memory": str(int(self._memory)), + } + + +class ReplicaSpecConfig: + """ + Base configuration builder for Kubernetes replication controllers + """ + + container_name = "mars" + + def __init__( + self, + name, + image, + replicas, + resource_request=None, + resource_limit=None, + node_selectors=None, + ): + self._name = name + self._image = image + self._replicas = replicas + self._envs = dict() + self._node_selectors = node_selectors + + self.add_default_envs() + + self._resource_request = resource_request + self._resource_limit = resource_limit + + def add_env(self, name, value=None, field_path=None): + self._envs[name] = ContainerEnvConfig(name, value=value, field_path=field_path) + + def add_simple_envs(self, envs): + for k, v in envs.items() or (): + self.add_env(k, v) + + def add_default_envs(self): + pass # pragma: no cover + + def build_container_command(self): + raise NotImplementedError + + def build_container(self): + resources_dict = { + "requests": self._resource_request.build() + if self._resource_request + else None, + "limits": self._resource_limit.build() if self._resource_limit else None, + } + return _remove_nones( + { + "imagePullPolicy": "Always", + "command": self.build_container_command(), + "env": [env.build() for env in self._envs.values()] or None, + "image": self._image, + "name": self.container_name, + "resources": dict((k, v) for k, v in resources_dict.items() if v) + or None, + } + ) + + def build_template_spec(self): + return _remove_nones( + { + "serviceAccountName": DEFAULT_SERVICE_ACCOUNT_NAME, + "nodeSelector": self._node_selectors, + "containers": [self.build_container()], + } + ) + + def build(self): + return { + "replicas": int(self._replicas), + "restartPolicy": "Never", + "template": { + "metadata": { + "labels": {"mars/service-type": self._name}, + }, + "spec": self.build_template_spec(), + }, + } + + +class MarsReplicaSpecConfig(ReplicaSpecConfig): + service_name = None + service_label = None + + def __init__( + self, + image, + replicas, + cpu=None, + memory=None, + limit_resources_ratio=1.2, + memory_limit_ratio=2, + modules=None, + node_selectors=None, + ): + self._cpu = cpu + self._memory, ratio = ( + parse_readable_size(memory) if memory is not None else (None, False) + ) + assert not ratio + + if isinstance(modules, str): + self._modules = modules.split(",") + else: + self._modules = modules + + res_request = ResourceConfig(cpu, memory) if cpu or memory else None + memory_limit_ratio = ( + memory_limit_ratio + if memory_limit_ratio is not None + else limit_resources_ratio + ) + res_limit = ( + ResourceConfig(cpu * limit_resources_ratio, memory * memory_limit_ratio) + if cpu or memory + else None + ) + super().__init__( + self.service_label, + image, + replicas, + resource_request=res_request, + resource_limit=res_limit, + node_selectors=node_selectors, + ) + + def build_container_command(self): + cmd = [ + "/srv/entrypoint.sh", + 
f"mars.deploy.kubernetes.{self.service_name}", + ] + return cmd + + def add_default_envs(self): + if self._cpu: + self.add_env("MARS_CPU_TOTAL", str(self._cpu)) + + if self._memory: + self.add_env("MARS_MEMORY_TOTAL", str(int(self._memory))) + + if self._modules: + self.add_env("MARS_LOAD_MODULES", ",".join(self._modules)) + + +class MarsSchedulerSpecConfig(MarsReplicaSpecConfig): + service_name = "scheduler" + service_label = "marsscheduler" + + +class MarsWorkerSpecConfig(MarsReplicaSpecConfig): + service_name = "worker" + service_label = "marsworker" + + def __init__(self, *args, **kwargs): + cache_mem = kwargs.pop("cache_mem", None) + self._spill_dirs = kwargs.pop("spill_dirs", None) or () + # set limits as 2*requests for worker replica defaulted. + kwargs["limit_resources_ratio"] = kwargs.get("limit_resources_ratio", 1.2) + super().__init__(*args, **kwargs) + self._cache_mem = calc_size_by_str(cache_mem, self._memory) + self.add_env("MARS_CACHE_MEM_SIZE", self._cache_mem) + + @property + def spill_dirs(self): + return self._spill_dirs + + @property + def cache_mem(self): + return self._cache_mem + + def add_default_envs(self): + super().add_default_envs() + if self._spill_dirs: + self.add_env("MARS_SPILL_DIRS", ":".join(self._spill_dirs)) + + +class MarsWebSpecConfig(MarsReplicaSpecConfig): + service_name = "web" + service_label = "marsweb" + + +class MarsJobConfig: + def __init__( + self, job_name, scheduler_config, worker_config, web_config, web_host=None + ): + self._job_name = job_name + self._scheduler_config = scheduler_config + self._worker_config = worker_config + self._web_config = web_config + self._web_host = web_host + + def build(self): + if self._job_name is None: + metadata = {"generateName": "mars-job-"} + else: + metadata = {"name": self._job_name} + + web_host = self._web_host + if web_host is not None and "://" in web_host: + web_host = urlparse(web_host).netloc + + return { + "kind": "MarsJob", + "metadata": metadata, + "spec": _remove_nones( + { + "workerMemoryTuningPolicy": _remove_nones( + { + "spillDirs": self._worker_config.spill_dirs, + "workerCacheSize": self._worker_config.cache_mem, + } + ), + "cleanPodPolicy": "None", + "webHost": web_host, + "marsReplicaSpecs": { + "Worker": self._worker_config.build(), + "Scheduler": self._scheduler_config.build(), + "WebService": self._web_config.build(), + }, + } + ), + } diff --git a/python/xorbits/_mars/deploy/kubernetes/__init__.py b/python/xorbits/_mars/deploy/kubernetes/__init__.py new file mode 100644 index 000000000..346d3e035 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .client import KubernetesClusterClient, new_cluster diff --git a/python/xorbits/_mars/deploy/kubernetes/client.py b/python/xorbits/_mars/deploy/kubernetes/client.py new file mode 100644 index 000000000..19e35698b --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/client.py @@ -0,0 +1,480 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import functools +import logging +import random +import time +import uuid +from urllib.parse import urlparse + +from ...lib.aio import new_isolation, stop_isolation +from ...services.cluster.api import WebClusterAPI +from ...session import new_session +from ...utils import calc_size_by_str +from ..utils import wait_services_ready +from .config import ( + MarsSupervisorsConfig, + MarsWorkersConfig, + NamespaceConfig, + RoleBindingConfig, + RoleConfig, + ServiceConfig, +) + +try: + from kubernetes.client.rest import ApiException as K8SApiException +except ImportError: # pragma: no cover + K8SApiException = None + +logger = logging.getLogger(__name__) + + +class KubernetesClusterClient: + def __init__(self, cluster): + self._cluster = cluster + self._endpoint = None + self._session = None + + @property + def endpoint(self): + return self._endpoint + + @property + def namespace(self): + return self._cluster.namespace + + @property + def session(self): + return self._session + + def start(self): + try: + self._endpoint = self._cluster.start() + self._session = new_session(self._endpoint) + except: # noqa: E722 # nosec # pylint: disable=bare-except + self.stop() + raise + + def stop(self, wait=False, timeout=0): + self._cluster.stop(wait=wait, timeout=timeout) + + +class KubernetesCluster: + _supervisor_config_cls = MarsSupervisorsConfig + _worker_config_cls = MarsWorkersConfig + _default_service_port = 7103 + _default_web_port = 7104 + + def __init__( + self, + kube_api_client=None, + image=None, + namespace=None, + supervisor_num=1, + supervisor_cpu=1, + supervisor_mem="4G", + supervisor_mem_limit_ratio=None, + worker_num=1, + worker_cpu=None, + worker_mem=None, + worker_spill_paths=None, + worker_cache_mem=None, + min_worker_num=None, + worker_min_cache_mem=None, + worker_mem_limit_ratio=None, + web_port=None, + service_name=None, + service_type=None, + timeout=None, + **kwargs, + ): + from kubernetes import client as kube_client + + if worker_cpu is None or worker_mem is None: # pragma: no cover + raise TypeError("`worker_cpu` and `worker_mem` must be specified") + + self._api_client = kube_api_client + self._core_api = kube_client.CoreV1Api(kube_api_client) + + self._namespace = namespace + self._image = image + self._timeout = timeout + self._service_name = service_name or "marsservice" + self._service_type = service_type or "NodePort" + self._extra_volumes = kwargs.pop("extra_volumes", ()) + self._pre_stop_command = kwargs.pop("pre_stop_command", None) + self._log_when_fail = kwargs.pop("log_when_fail", False) + + extra_modules = kwargs.pop("extra_modules", None) or [] + extra_modules = ( + 
extra_modules.split(",") + if isinstance(extra_modules, str) + else extra_modules + ) + extra_envs = kwargs.pop("extra_env", None) or dict() + extra_labels = kwargs.pop("extra_labels", None) or dict() + service_port = kwargs.pop("service_port", None) or self._default_service_port + + def _override_modules(updates): + modules = set(extra_modules) + updates = updates.split(",") if isinstance(updates, str) else updates + modules.update(updates) + return sorted(modules) + + def _override_dict(d, updates): + updates = updates or dict() + ret = d.copy() + ret.update(updates) + return ret + + _override_envs = functools.partial(_override_dict, extra_envs) + _override_labels = functools.partial(_override_dict, extra_labels) + + self._supervisor_num = supervisor_num + self._supervisor_cpu = supervisor_cpu + self._supervisor_mem = calc_size_by_str(supervisor_mem, None) + self._supervisor_mem_limit_ratio = supervisor_mem_limit_ratio + self._supervisor_extra_modules = _override_modules( + kwargs.pop("supervisor_extra_modules", []) + ) + self._supervisor_extra_env = _override_envs( + kwargs.pop("supervisor_extra_env", None) + ) + self._supervisor_extra_labels = _override_labels( + kwargs.pop("supervisor_extra_labels", None) + ) + self._supervisor_service_port = ( + kwargs.pop("supervisor_service_port", None) or service_port + ) + self._web_port = web_port or self._default_web_port + self._external_web_endpoint = None + + self._worker_num = worker_num + self._worker_cpu = worker_cpu + self._worker_mem = calc_size_by_str(worker_mem, None) + self._worker_mem_limit_ratio = worker_mem_limit_ratio + self._worker_spill_paths = worker_spill_paths + self._worker_cache_mem = worker_cache_mem + self._worker_min_cache_men = worker_min_cache_mem + self._min_worker_num = min_worker_num + self._worker_extra_modules = _override_modules( + kwargs.pop("worker_extra_modules", []) + ) + self._worker_extra_env = _override_envs(kwargs.pop("worker_extra_env", None)) + self._worker_extra_labels = _override_labels( + kwargs.pop("worker_extra_labels", None) + ) + self._worker_service_port = ( + kwargs.pop("worker_service_port", None) or service_port + ) + + @property + def namespace(self): + return self._namespace + + def _get_free_namespace(self): + while True: + namespace = "mars-ns-" + str(uuid.uuid4().hex) + try: + self._core_api.read_namespace(namespace) + except K8SApiException as ex: + if ex.status != 404: # pragma: no cover + raise + return namespace + + def _create_kube_service(self): + if self._service_type != "NodePort": # pragma: no cover + raise NotImplementedError( + f"Service type {self._service_type} not supported" + ) + + service_config = ServiceConfig( + self._service_name, + service_type="NodePort", + port=self._web_port, + selector={"mars/service-type": MarsSupervisorsConfig.rc_name}, + ) + self._core_api.create_namespaced_service( + self._namespace, service_config.build() + ) + + def _get_ready_pod_count(self, label_selector): + query = self._core_api.list_namespaced_pod( + namespace=self._namespace, label_selector=label_selector + ).to_dict() + cnt = 0 + for el in query["items"]: + if el["status"]["phase"] in ("Error", "Failed"): + logger.warning( + "Error in starting pod, message: %s", el["status"]["message"] + ) + continue + if "status" not in el or "conditions" not in el["status"]: + cnt += 1 + elif any( + cond["type"] == "Ready" and cond["status"] == "True" + for cond in el["status"].get("conditions") or () + ): + cnt += 1 + return cnt + + def _create_namespace(self): + if self._namespace is None: + 
namespace = self._namespace = self._get_free_namespace() + else: + namespace = self._namespace + + self._core_api.create_namespace(NamespaceConfig(namespace).build()) + + def _create_roles_and_bindings(self): + # create role and binding + role_config = RoleConfig( + "mars-pod-operator", + self._namespace, + api_groups="", + resources="pods,endpoints,services", + verbs="get,watch,list,patch", + ) + role_config.create_namespaced(self._api_client, self._namespace) + role_binding_config = RoleBindingConfig( + "mars-pod-operator-binding", self._namespace, "mars-pod-operator", "default" + ) + role_binding_config.create_namespaced(self._api_client, self._namespace) + + def _create_supervisors(self): + supervisors_config = self._supervisor_config_cls( + self._supervisor_num, + image=self._image, + cpu=self._supervisor_cpu, + memory=self._supervisor_mem, + memory_limit_ratio=self._supervisor_mem_limit_ratio, + modules=self._supervisor_extra_modules, + volumes=self._extra_volumes, + service_name=self._service_name, + service_port=self._supervisor_service_port, + web_port=self._web_port, + pre_stop_command=self._pre_stop_command, + ) + supervisors_config.add_simple_envs(self._supervisor_extra_env) + supervisors_config.add_labels(self._supervisor_extra_labels) + supervisors_config.create_namespaced(self._api_client, self._namespace) + + def _create_workers(self): + workers_config = self._worker_config_cls( + self._worker_num, + image=self._image, + cpu=self._worker_cpu, + memory=self._worker_mem, + memory_limit_ratio=self._worker_mem_limit_ratio, + spill_volumes=self._worker_spill_paths, + modules=self._worker_extra_modules, + volumes=self._extra_volumes, + worker_cache_mem=self._worker_cache_mem, + min_cache_mem=self._worker_min_cache_men, + service_name=self._service_name, + service_port=self._worker_service_port, + pre_stop_command=self._pre_stop_command, + supervisor_web_port=self._web_port, + ) + workers_config.add_simple_envs(self._worker_extra_env) + workers_config.add_labels(self._worker_extra_labels) + workers_config.create_namespaced(self._api_client, self._namespace) + + def _create_services(self): + self._create_supervisors() + self._create_workers() + + def _wait_services_ready(self): + min_worker_num = int(self._min_worker_num or self._worker_num) + limits = [self._supervisor_num, min_worker_num] + selectors = [ + "mars/service-type=" + MarsSupervisorsConfig.rc_name, + "mars/service-type=" + MarsWorkersConfig.rc_name, + ] + start_time = time.time() + logger.debug("Start waiting pods to be ready") + wait_services_ready( + selectors, + limits, + lambda sel: self._get_ready_pod_count(sel), + timeout=self._timeout, + ) + logger.info("All service pods ready.") + if self._timeout is not None: # pragma: no branch + self._timeout -= time.time() - start_time + + def _get_web_address(self): + svc_data = self._core_api.read_namespaced_service( + "marsservice", self._namespace + ).to_dict() + node_port = svc_data["spec"]["ports"][0]["node_port"] + + # docker desktop use a VM to hold docker processes, hence + # we need to use API address instead + desktop_nodes = self._core_api.list_node( + field_selector="metadata.name=docker-desktop" + ).to_dict() + if desktop_nodes["items"]: # pragma: no cover + host_ip = urlparse( + self._core_api.api_client.configuration.host + ).netloc.split(":", 1)[0] + else: + web_pods = self._core_api.list_namespaced_pod( + self._namespace, + label_selector="mars/service-type=" + MarsSupervisorsConfig.rc_name, + ).to_dict() + host_ip = 
random.choice(web_pods["items"])["status"]["host_ip"] + return f"http://{host_ip}:{node_port}" + + def _wait_web_ready(self): + loop = new_isolation().loop + + async def get_supervisors(): + start_time = time.time() + while True: + try: + cluster_api = WebClusterAPI(self._external_web_endpoint) + supervisors = await cluster_api.get_supervisors() + + if len(supervisors) == self._supervisor_num: + break + except: # noqa: E722 # nosec # pylint: disable=bare-except # pragma: no cover + if ( + self._timeout is not None + and time.time() - start_time > self._timeout + ): + logger.exception("Error when fetching supervisors") + raise TimeoutError( + "Wait for kubernetes cluster timed out" + ) from None + + asyncio.run_coroutine_threadsafe(get_supervisors(), loop).result() + + def _load_cluster_logs(self): + log_dict = dict() + pod_items = self._core_api.list_namespaced_pod(self._namespace).to_dict() + for item in pod_items["items"]: + log_dict[item["metadata"]["name"]] = self._core_api.read_namespaced_pod_log( + name=item["metadata"]["name"], namespace=self._namespace + ) + return log_dict + + def start(self): + try: + self._create_namespace() + self._create_roles_and_bindings() + + self._create_services() + self._create_kube_service() + + self._wait_services_ready() + + self._external_web_endpoint = self._get_web_address() + self._wait_web_ready() + return self._external_web_endpoint + except: # noqa: E722 + if self._log_when_fail: # pargma: no cover + logger.error("Error when creating cluster") + for name, log in self._load_cluster_logs().items(): + logger.error("Error logs for %s:\n%s", name, log) + self.stop() + raise + + def stop(self, wait=False, timeout=0): + # stop isolation + stop_isolation() + + from kubernetes.client import CoreV1Api + + api = CoreV1Api(self._api_client) + api.delete_namespace(self._namespace) + if wait: + start_time = time.time() + while True: + try: + api.read_namespace(self._namespace) + except K8SApiException as ex: + if ex.status != 404: # pragma: no cover + raise + break + else: + time.sleep(1) + if ( + timeout and time.time() - start_time > timeout + ): # pragma: no cover + raise TimeoutError + + +def new_cluster( + kube_api_client=None, + image=None, + supervisor_num=1, + supervisor_cpu=None, + supervisor_mem=None, + worker_num=1, + worker_cpu=None, + worker_mem=None, + worker_spill_paths=None, + worker_cache_mem=None, + min_worker_num=None, + web_num=1, + web_cpu=None, + web_mem=None, + service_type=None, + timeout=None, + **kwargs, +): + """ + :param kube_api_client: Kubernetes API client, can be created with ``new_client_from_config`` + :param image: Docker image to use, ``marsproject/mars:`` by default + :param supervisor_num: Number of supervisors in the cluster, 1 by default + :param supervisor_cpu: Number of CPUs for every supervisor + :param supervisor_mem: Memory size for every supervisor + :param worker_num: Number of workers in the cluster, 1 by default + :param worker_cpu: Number of CPUs for every worker + :param worker_mem: Memory size for every worker + :param worker_spill_paths: Spill paths for worker pods on hosts + :param worker_cache_mem: Size or ratio of cache memory for every worker + :param min_worker_num: Minimal ready workers + :param web_num: Number of web services in the cluster, 1 by default + :param web_cpu: Number of CPUs for every web service + :param web_mem: Memory size for every web service + :param service_type: Type of Kubernetes Service, currently only ``NodePort`` supported + :param timeout: Timeout when creating clusters + 
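+    :return: a ``KubernetesClusterClient`` whose cluster has already been started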
""" + cluster_cls = kwargs.pop("cluster_cls", KubernetesCluster) + cluster = cluster_cls( + kube_api_client, + image=image, + supervisor_num=supervisor_num, + supervisor_cpu=supervisor_cpu, + supervisor_mem=supervisor_mem, + worker_num=worker_num, + worker_cpu=worker_cpu, + worker_mem=worker_mem, + worker_spill_paths=worker_spill_paths, + worker_cache_mem=worker_cache_mem, + min_worker_num=min_worker_num, + web_num=web_num, + web_cpu=web_cpu, + web_mem=web_mem, + service_type=service_type, + timeout=timeout, + **kwargs, + ) + client = KubernetesClusterClient(cluster) + client.start() + return client diff --git a/python/xorbits/_mars/deploy/kubernetes/config.py b/python/xorbits/_mars/deploy/kubernetes/config.py new file mode 100644 index 000000000..34a8f8717 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/config.py @@ -0,0 +1,673 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc +import functools +import math +import re + +from ... import __version__ as mars_version +from ...utils import calc_size_by_str, parse_readable_size + +DEFAULT_IMAGE = "marsproject/mars:v" + mars_version +DEFAULT_WORKER_CACHE_MEM = "40%" + + +def _remove_nones(cfg): + return dict((k, v) for k, v in cfg.items() if v is not None) + + +_kube_api_mapping = { + "v1": "CoreV1Api", + "apps/v1": "AppsV1Api", + "rbac.authorization.k8s.io/v1": "RbacAuthorizationV1Api", +} + + +@functools.lru_cache(10) +def _get_k8s_api(api_version, k8s_api_client): + from kubernetes import client as kube_client + + return getattr(kube_client, _kube_api_mapping[api_version])(k8s_api_client) + + +@functools.lru_cache(10) +def _camel_to_underline(name): + s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower() + + +class KubeConfig(abc.ABC): + api_version = "v1" + + def create_namespaced(self, k8s_api_client, namespace): + api = _get_k8s_api(self.api_version, k8s_api_client) + config = self.build() + method_name = f'create_namespaced_{_camel_to_underline(config["kind"])}' + return getattr(api, method_name)(namespace, config) + + @abc.abstractmethod + def build(self): + """Build config dict of the object""" + + +class RoleConfig(KubeConfig): + """ + Configuration builder for Kubernetes RBAC roles + """ + + api_version = "rbac.authorization.k8s.io/v1" + + def __init__(self, name, namespace, api_groups, resources, verbs): + self._name = name + self._namespace = namespace + self._api_groups = api_groups.split(",") + self._resources = resources.split(",") + self._verbs = verbs.split(",") + + def build(self): + return { + "kind": "Role", + "metadata": {"name": self._name, "namespace": self._namespace}, + "rules": [ + { + "apiGroups": self._api_groups, + "resources": self._resources, + "verbs": self._verbs, + } + ], + } + + +class RoleBindingConfig(KubeConfig): + """ + Configuration builder for Kubernetes RBAC role bindings + """ + + api_version = "rbac.authorization.k8s.io/v1" + + def __init__(self, name, namespace, role_name, 
service_account_name): + self._name = name + self._namespace = namespace + self._role_name = role_name + self._service_account_name = service_account_name + + def build(self): + return { + "kind": "RoleBinding", + "metadata": {"name": self._name, "namespace": self._namespace}, + "roleRef": { + "apiGroup": "rbac.authorization.k8s.io", + "kind": "Role", + "name": self._role_name, + }, + "subjects": [ + { + "kind": "ServiceAccount", + "name": self._service_account_name, + "namespace": self._namespace, + } + ], + } + + +class NamespaceConfig(KubeConfig): + """ + Configuration builder for Kubernetes namespaces + """ + + def __init__(self, name): + self._name = name + + def build(self): + return { + "kind": "Namespace", + "metadata": { + "name": self._name, + "labels": { + "name": self._name, + }, + }, + } + + +class ServiceConfig(KubeConfig): + """ + Configuration builder for Kubernetes services + """ + + def __init__( + self, name, service_type, selector, port, target_port=None, protocol=None + ): + self._name = name + self._type = service_type + self._protocol = protocol or "TCP" + self._selector = selector + self._port = port + self._target_port = target_port + + def build(self): + return { + "kind": "Service", + "metadata": { + "name": self._name, + "labels": { + "mars/service-name": self._name, + }, + }, + "spec": _remove_nones( + { + "type": self._type, + "selector": self._selector, + "ports": [ + _remove_nones( + { + "protocol": self._protocol, + "port": self._port, + "targetPort": self._target_port, + } + ), + ], + } + ), + } + + +class ResourceConfig: + """ + Configuration builder for Kubernetes computation resources + """ + + def __init__(self, cpu, memory): + self._cpu = cpu + self._memory, ratio = ( + parse_readable_size(memory) if memory is not None else (None, False) + ) + assert not ratio + + @property + def cpu(self): + return self._cpu + + @property + def memory(self): + return self._memory + + def build(self): + return _remove_nones( + { + "cpu": f"{int(self._cpu * 1000)}m" if self._cpu else None, + "memory": str(int(self._memory)) if self._memory else None, + } + ) + + +class PortConfig: + """ + Configuration builder for Kubernetes ports definition for containers + """ + + def __init__(self, container_port): + self._container_port = int(container_port) + + def build(self): + return { + "containerPort": self._container_port, + } + + +class VolumeConfig(abc.ABC): + """ + Base configuration builder for Kubernetes volumes + """ + + def __init__(self, name, mount_path): + self.name = name + self.mount_path = mount_path + + @abc.abstractmethod + def build(self): + """Build volume config""" + + def build_mount(self): + return { + "name": self.name, + "mountPath": self.mount_path, + } + + +class HostPathVolumeConfig(VolumeConfig): + """ + Configuration builder for Kubernetes host volumes + """ + + def __init__(self, name, mount_path, host_path, volume_type=None): + super().__init__(name, mount_path) + self._host_path = host_path + self._volume_type = volume_type or "DirectoryOrCreate" + + def build(self): + return { + "name": self.name, + "hostPath": {"path": self._host_path, "type": self._volume_type}, + } + + +class EmptyDirVolumeConfig(VolumeConfig): + """ + Configuration builder for Kubernetes empty-dir volumes + """ + + def __init__(self, name, mount_path, use_memory=True, size_limit=None): + super().__init__(name, mount_path) + self._medium = "Memory" if use_memory else None + self._size_limit = size_limit + + def build(self): + result = {"name": self.name, "emptyDir": {}} + if 
self._medium: + result["emptyDir"]["medium"] = self._medium + if self._size_limit: + result["emptyDir"]["sizeLimit"] = str(int(self._size_limit)) + return result + + +class ContainerEnvConfig: + """ + Configuration builder for Kubernetes container environments + """ + + def __init__(self, name, value=None, field_path=None): + self._name = name + self._value = value + self._field_path = field_path + + def build(self): + result = dict(name=self._name) + if self._value is not None: + result["value"] = str(self._value) + elif self._field_path is not None: # pragma: no branch + result["valueFrom"] = {"fieldRef": {"fieldPath": self._field_path}} + return result + + +class ProbeConfig: + """ + Base configuration builder for Kubernetes liveness and readiness probes + """ + + def __init__( + self, + initial_delay=5, + period=1, + timeout=None, + success_thresh=None, + failure_thresh=None, + ): + self._initial_delay = initial_delay + self._period = period + self._timeout = timeout + self._success_thresh = success_thresh + self._failure_thresh = failure_thresh + + def build(self): + return _remove_nones( + { + "initialDelaySeconds": self._initial_delay, + "periodSeconds": self._period, + "timeoutSeconds": self._timeout, + "successThreshold": self._success_thresh, + "failureThreshold": self._failure_thresh, + } + ) + + +class TcpSocketProbeConfig(ProbeConfig): + """ + Configuration builder for TCP liveness and readiness probes + """ + + def __init__(self, port: int, **kwargs): + super().__init__(**kwargs) + self._port = port + + def build(self): + ret = super().build() + ret["tcpSocket"] = {"port": self._port} + return ret + + +class ReplicationConfig(KubeConfig): + """ + Base configuration builder for Kubernetes replication controllers + """ + + _default_kind = "Deployment" + + def __init__( + self, + name, + image, + replicas, + resource_request=None, + resource_limit=None, + liveness_probe=None, + readiness_probe=None, + pre_stop_command=None, + kind=None, + ): + self._name = name + self._kind = kind or self._default_kind + self._image = image + self._replicas = replicas + self._ports = [] + self._volumes = [] + self._envs = dict() + self._labels = dict() + + self.add_default_envs() + + self._resource_request = resource_request + self._resource_limit = resource_limit + + self._liveness_probe = liveness_probe + self._readiness_probe = readiness_probe + + self._pre_stop_command = pre_stop_command + + @property + def api_version(self): + return "apps/v1" if self._kind in ("Deployment", "ReplicaSet") else "v1" + + def add_env(self, name, value=None, field_path=None): + self._envs[name] = ContainerEnvConfig(name, value=value, field_path=field_path) + + def remove_env(self, name): # pragma: no cover + self._envs.pop(name, None) + + def add_simple_envs(self, envs): + for k, v in envs.items() or (): + self.add_env(k, v) + + def add_labels(self, labels): + self._labels.update(labels) + + def add_port(self, container_port): + self._ports.append(PortConfig(container_port)) + + def add_default_envs(self): + pass # pragma: no cover + + def add_volume(self, vol): + self._volumes.append(vol) + + @abc.abstractmethod + def build_container_command(self): + """Output container command""" + + def build_container(self): + resources_dict = { + "requests": self._resource_request.build() + if self._resource_request + else None, + "limits": self._resource_limit.build() if self._resource_limit else None, + } + lifecycle_dict = _remove_nones( + { + "preStop": { + "exec": {"command": self._pre_stop_command}, + } + if 
self._pre_stop_command + else None, + } + ) + return _remove_nones( + { + "command": self.build_container_command(), + "env": [env.build() for env in self._envs.values()] or None, + "image": self._image, + "name": self._name, + "resources": dict((k, v) for k, v in resources_dict.items() if v) + or None, + "ports": [p.build() for p in self._ports] or None, + "volumeMounts": [vol.build_mount() for vol in self._volumes] or None, + "livenessProbe": self._liveness_probe.build() + if self._liveness_probe + else None, + "readinessProbe": self._readiness_probe.build() + if self._readiness_probe + else None, + "lifecycle": lifecycle_dict or None, + } + ) + + def build_template_spec(self): + result = { + "containers": [self.build_container()], + "volumes": [vol.build() for vol in self._volumes], + } + return dict((k, v) for k, v in result.items() if v) + + def build(self): + return { + "kind": self._kind, + "metadata": { + "name": self._name, + }, + "spec": { + "replicas": int(self._replicas), + "template": { + "metadata": { + "labels": _remove_nones(self._labels) or None, + }, + "spec": self.build_template_spec(), + }, + }, + } + + +class MarsReplicationConfig(ReplicationConfig, abc.ABC): + """ + Base configuration builder for replication controllers for Mars + """ + + rc_name = None + default_readiness_port = 15031 + + def __init__( + self, + replicas, + cpu=None, + memory=None, + limit_resources=False, + memory_limit_ratio=None, + image=None, + modules=None, + volumes=None, + service_name=None, + service_port=None, + **kwargs, + ): + self._cpu = cpu + self._memory, ratio = ( + parse_readable_size(memory) if memory is not None else (None, False) + ) + assert not ratio + + if isinstance(modules, str): + self._modules = modules.split(",") + else: + self._modules = modules + + req_res = ResourceConfig(cpu, memory) if cpu or memory else None + limit_res = ( + ResourceConfig(req_res.cpu, req_res.memory * (memory_limit_ratio or 1)) + if req_res and memory + else None + ) + + self._service_name = service_name + self._service_port = service_port + + super().__init__( + self.rc_name, + image or DEFAULT_IMAGE, + replicas, + resource_request=req_res, + resource_limit=limit_res if limit_resources else None, + readiness_probe=self.config_readiness_probe(), + **kwargs, + ) + if service_port: + self.add_port(service_port) + + for vol in volumes or (): + self.add_volume(vol) + + self.add_labels({"mars/service-type": self.rc_name}) + + def add_default_envs(self): + self.add_env("MARS_K8S_POD_NAME", field_path="metadata.name") + self.add_env("MARS_K8S_POD_NAMESPACE", field_path="metadata.namespace") + self.add_env("MARS_K8S_POD_IP", field_path="status.podIP") + + if self._service_name: + self.add_env("MARS_K8S_SERVICE_NAME", str(self._service_name)) + if self._service_port: + self.add_env("MARS_K8S_SERVICE_PORT", str(self._service_port)) + + self.add_env("MARS_CONTAINER_IP", field_path="status.podIP") + + if self._cpu: + self.add_env("MKL_NUM_THREADS", str(self._cpu)) + self.add_env("MARS_CPU_TOTAL", str(self._cpu)) + if getattr(self, "stat_type", "cgroup") == "cgroup": + self.add_env("MARS_USE_CGROUP_STAT", "1") + + if self._memory: + self.add_env("MARS_MEMORY_TOTAL", str(int(self._memory))) + + if self._modules: + self.add_env("MARS_LOAD_MODULES", ",".join(self._modules)) + + def config_readiness_probe(self): + raise NotImplementedError + + @staticmethod + def get_local_app_module(mod_name): + return __name__.rsplit(".", 1)[0] + "." 
+ mod_name + + def build(self): + result = super().build() + if self._kind in ("Deployment", "ReplicaSet"): + result["spec"]["selector"] = { + "matchLabels": {"mars/service-type": self.rc_name} + } + else: + result["spec"]["selector"] = {"mars/service-type": self.rc_name} + return result + + +class MarsSupervisorsConfig(MarsReplicationConfig): + """ + Configuration builder for Mars supervisor service + """ + + rc_name = "marssupervisor" + + def __init__(self, *args, **kwargs): + self._web_port = kwargs.pop("web_port", None) + self._readiness_port = kwargs.pop("readiness_port", self.default_readiness_port) + super().__init__(*args, **kwargs) + if self._web_port: + self.add_port(self._web_port) + + def config_readiness_probe(self): + return TcpSocketProbeConfig(self._readiness_port, timeout=60, failure_thresh=10) + + def build_container_command(self): + cmd = [ + "/srv/entrypoint.sh", + self.get_local_app_module("supervisor"), + ] + if self._service_port: + cmd += ["-p", str(self._service_port)] + if self._web_port: + cmd += ["-w", str(self._web_port)] + if self._cpu: + cmd += ["--n-process", str(int(math.ceil(self._cpu)))] + return cmd + + +class MarsWorkersConfig(MarsReplicationConfig): + """ + Configuration builder for Mars worker service + """ + + rc_name = "marsworker" + + def __init__(self, *args, **kwargs): + spill_volumes = kwargs.pop("spill_volumes", None) or () + mount_shm = kwargs.pop("mount_shm", True) + self._limit_resources = kwargs["limit_resources"] = kwargs.get( + "limit_resources", True + ) + worker_cache_mem = ( + kwargs.pop("worker_cache_mem", None) or DEFAULT_WORKER_CACHE_MEM + ) + min_cache_mem = kwargs.pop("min_cache_mem", None) + self._readiness_port = kwargs.pop("readiness_port", self.default_readiness_port) + supervisor_web_port = kwargs.pop("supervisor_web_port", None) + + super().__init__(*args, **kwargs) + + self._spill_volumes = [] + for idx, vol in enumerate(spill_volumes): + if isinstance(vol, str): + path = f"/mnt/hostpath{idx}" + self.add_volume(HostPathVolumeConfig(f"host-path-vol-{idx}", path, vol)) + self._spill_volumes.append(path) + else: + self.add_volume(vol) + self._spill_volumes.append(vol.mount_path) + if self._spill_volumes: + self.add_env("MARS_SPILL_DIRS", ":".join(self._spill_volumes)) + + if self._memory: + size_limit = calc_size_by_str(worker_cache_mem, self._memory) + self.add_env("MARS_CACHE_MEM_SIZE", worker_cache_mem) + else: + size_limit = None + + if mount_shm: + self.add_volume( + EmptyDirVolumeConfig("mars-shared", "/dev/shm", size_limit=size_limit) + ) + + if min_cache_mem: + self.add_env("MARS_MIN_CACHE_MEM_SIZE", min_cache_mem) + if supervisor_web_port: + self.add_env("MARS_K8S_SUPERVISOR_WEB_PORT", supervisor_web_port) + + def config_readiness_probe(self): + return TcpSocketProbeConfig(self._readiness_port, timeout=60, failure_thresh=10) + + def build_container_command(self): + cmd = [ + "/srv/entrypoint.sh", + self.get_local_app_module("worker"), + ] + if self._service_port: + cmd += ["-p", str(self._service_port)] + return cmd diff --git a/python/xorbits/_mars/deploy/kubernetes/config.yml b/python/xorbits/_mars/deploy/kubernetes/config.yml new file mode 100644 index 000000000..127ff0ec7 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/config.yml @@ -0,0 +1,7 @@ +"@inherits": ../oscar/base_config.yml +cluster: + backend: k8s +storage: + backends: [plasma] + plasma: + store_memory: 20% diff --git a/python/xorbits/_mars/deploy/kubernetes/core.py b/python/xorbits/_mars/deploy/kubernetes/core.py new file mode 100644 index 
000000000..0eaca00fd --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/core.py @@ -0,0 +1,222 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import os +from typing import AsyncGenerator, Dict, List, Optional, TypeVar + +from ...services.cluster import WebClusterAPI +from ...services.cluster.backends import ( + AbstractClusterBackend, + register_cluster_backend, +) +from ...services.cluster.core import NodeRole +from ..utils import next_in_thread, wait_all_supervisors_ready +from .config import MarsReplicationConfig + +logger = logging.getLogger(__name__) +RetType = TypeVar("RetType") + + +@register_cluster_backend +class K8SClusterBackend(AbstractClusterBackend): + name = "k8s" + + def __init__( + self, node_role=None, pool_address=None, k8s_config=None, k8s_namespace=None + ): + from kubernetes import client + + self._node_role = node_role + self._pool_address = pool_address + self._k8s_config = k8s_config + + verify_ssl = bool(int(os.environ.get("KUBE_VERIFY_SSL", "1"))) + if not verify_ssl: + c = client.Configuration() + c.verify_ssl = False + client.Configuration.set_default(c) + + self._k8s_namespace = ( + k8s_namespace or os.environ.get("MARS_K8S_POD_NAMESPACE") or "default" + ) + self._service_name = os.environ.get("MARS_K8S_SERVICE_NAME") + self._full_label_selector = None + self._client = client.CoreV1Api(client.ApiClient(self._k8s_config)) + + self._pod_to_ep = dict() + + @classmethod + async def create( + cls, node_role: NodeRole, lookup_address: Optional[str], pool_address: str + ) -> "AbstractClusterBackend": + from kubernetes import client, config + + if lookup_address is None: + k8s_namespace = None + k8s_config = config.load_incluster_config() + else: + address_parts = lookup_address.rsplit("?", 1) + k8s_namespace = None if len(address_parts) == 1 else address_parts[1] + + k8s_config = client.Configuration() + if "://" in address_parts[0]: + k8s_config.host = address_parts[0] + else: + config.load_kube_config( + address_parts[0], client_configuration=k8s_config + ) + return cls(node_role, pool_address, k8s_config, k8s_namespace) + + def __reduce__(self): + return ( + type(self), + ( + self._node_role, + self._pool_address, + self._k8s_config, + self._k8s_namespace, + ), + ) + + @staticmethod + def _format_endpoint_query_result(result: Dict, filter_ready: bool = True): + port = os.environ["MARS_K8S_SERVICE_PORT"] + endpoints = [ + f"{addr['ip']}:{port}" for addr in result["subsets"][0]["addresses"] or [] + ] + if not filter_ready: + endpoints = [ + f"{addr['ip']}:{port}" + for addr in result["subsets"][0]["not_ready_addresses"] or [] + ] + return endpoints + + def _get_web_cluster_api(self): + supervisor_web_port = os.environ["MARS_K8S_SUPERVISOR_WEB_PORT"] + web_url = ( + f"http://{self._service_name}.{self._k8s_namespace}:{supervisor_web_port}" + ) + api = WebClusterAPI(web_url) + return api + + async def _watch_supervisors_by_service_api( + self, + ) -> AsyncGenerator[List[str], None]: + from 
kubernetes.watch import Watch as K8SWatch + from urllib3.exceptions import ReadTimeoutError + + w = K8SWatch() + + while True: + streamer = w.stream( + self._client.list_namespaced_endpoints, + namespace=self._k8s_namespace, + label_selector=f"mars/service-name={self._service_name}", + timeout_seconds=60, + ) + while True: + try: + event = await next_in_thread(streamer) + obj_dict = event["object"].to_dict() + yield self._format_endpoint_query_result(obj_dict) + except (ReadTimeoutError, StopAsyncIteration): + break + except: # noqa: E722 # pragma: no cover # pylint: disable=bare-except + logger.exception("Unexpected error when watching on kubernetes") + break + + async def _watch_supervisors_by_cluster_web_api(self): + while True: + try: + api = self._get_web_cluster_api() + async for supervisors in api.watch_supervisors(): + yield supervisors + except (OSError, asyncio.TimeoutError): + pass + + async def _get_supervisors_by_service_api( + self, filter_ready: bool = True + ) -> List[str]: + result = ( + await asyncio.to_thread( + self._client.read_namespaced_endpoints, + name=self._service_name, + namespace=self._k8s_namespace, + ) + ).to_dict() + return self._format_endpoint_query_result(result, filter_ready=filter_ready) + + async def _get_supervisors_by_cluster_web_api(self, filter_ready: bool = True): + api = self._get_web_cluster_api() + try: + supervisors = await api.get_supervisors(filter_ready=filter_ready) + return supervisors + except (OSError, asyncio.TimeoutError): # pragma: no cover + return [] + + async def get_supervisors(self, filter_ready: bool = True) -> List[str]: + if self._node_role == NodeRole.SUPERVISOR: + return await self._get_supervisors_by_service_api(filter_ready) + else: + return await self._get_supervisors_by_cluster_web_api(filter_ready) + + async def watch_supervisors(self) -> AsyncGenerator[List[str], None]: + if self._node_role == NodeRole.SUPERVISOR: + watch_fun = self._watch_supervisors_by_service_api + else: + watch_fun = self._watch_supervisors_by_cluster_web_api + + try: + async for supervisors in watch_fun(): + yield supervisors + except asyncio.CancelledError: + pass + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> str: + raise NotImplementedError + + async def release_worker(self, address: str): + raise NotImplementedError + + async def reconstruct_worker(self, address: str): + raise NotImplementedError + + +class K8SServiceMixin: + @staticmethod + def write_pid_file(): + with open("/tmp/mars-service.pid", "w") as pid_file: + pid_file.write(str(os.getpid())) + + async def wait_all_supervisors_ready(self): + """ + Wait till all containers are ready + """ + await wait_all_supervisors_ready(self.args.endpoint) + + async def start_readiness_server(self): + readiness_port = os.environ.get( + "MARS_K8S_READINESS_PORT", MarsReplicationConfig.default_readiness_port + ) + self._readiness_server = await asyncio.start_server( + lambda r, w: None, port=readiness_port + ) + + async def stop_readiness_server(self): + self._readiness_server.close() + await self._readiness_server.wait_closed() diff --git a/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile b/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile new file mode 100644 index 000000000..2675538ab --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile @@ -0,0 +1,23 @@ +ARG BASE_CONTAINER=marsproject/mars-base +FROM ${BASE_CONTAINER} + +COPY . 
/opt/mars/ + +RUN apt-get -yq update --allow-releaseinfo-change \ + && apt-get -yq install gcc g++ \ + && curl -fsSL https://deb.nodesource.com/setup_14.x | sudo -E bash - \ + && sudo apt-get install -y nodejs \ + && /opt/conda/bin/pip install -e /opt/mars \ + && apt-get -yq remove gcc g++ nodejs \ + && apt-get -yq autoremove \ + && apt-get -yq clean \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /usr/local/lib/node_modules +RUN mkdir -p /srv +WORKDIR /srv + +RUN cp /opt/mars/mars/deploy/oscar/file-logging.conf /srv/logging.conf \ + && cp /opt/mars/mars/deploy/kubernetes/docker/entrypoint.sh /srv/entrypoint.sh \ + && chmod a+x /srv/*.sh + +ENTRYPOINT [ "/srv/entrypoint.sh" ] diff --git a/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile.base b/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile.base new file mode 100644 index 000000000..2e43d968d --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile.base @@ -0,0 +1,32 @@ +ARG BASE_CONTAINER=continuumio/miniconda3:4.9.2 +FROM ${BASE_CONTAINER} + +COPY retry.sh /srv/retry.sh + +RUN /srv/retry.sh 3 /opt/conda/bin/conda install \ + cloudpickle \ + cython \ + greenlet \ + mkl \ + numba \ + numexpr \ + numpy\>=1.14.0 \ + pandas\>=1.0.0 \ + psutil \ + scikit-learn \ + scipy \ + sqlalchemy \ + tornado \ + lz4 \ + && /srv/retry.sh 3 /opt/conda/bin/conda install -c conda-forge \ + libiconv \ + pyarrow\>=1.0 \ + tiledb-py \ + python-kubernetes \ + uvloop \ + && /opt/conda/bin/conda clean --all -f -y + +RUN apt-get -yq update --allow-releaseinfo-change \ + && apt-get -yq install curl sudo procps \ + && apt-get -yq clean \ + && rm -rf /var/lib/apt/lists/* \ diff --git a/python/xorbits/_mars/deploy/kubernetes/docker/entrypoint.sh b/python/xorbits/_mars/deploy/kubernetes/docker/entrypoint.sh new file mode 100755 index 000000000..97eb6b0cb --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/docker/entrypoint.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +if [[ "$1" == *"/"* ]]; then + $@ +else + /opt/conda/bin/python -m "$1" ${@:2} +fi diff --git a/python/xorbits/_mars/deploy/kubernetes/docker/retry.sh b/python/xorbits/_mars/deploy/kubernetes/docker/retry.sh new file mode 100755 index 000000000..ca87a335e --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/docker/retry.sh @@ -0,0 +1,13 @@ +#!/bin/bash +RETRIES=$1 +shift +for (( RETRY=1; RETRY <= $RETRIES ; RETRY++ )); do + "$@" + EXIT=$? + if [[ $EXIT != 0 ]]; then + echo "Command attempt $RETRY failed" + else + exit 0 + fi +done +exit $EXIT diff --git a/python/xorbits/_mars/deploy/kubernetes/supervisor.py b/python/xorbits/_mars/deploy/kubernetes/supervisor.py new file mode 100644 index 000000000..de15b3e66 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/supervisor.py @@ -0,0 +1,32 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
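+# Kubernetes-specific supervisor entrypoint: extends the generic supervisor command runner and serves the TCP readiness probe provided by K8SServiceMixin.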
+ +from ..oscar.supervisor import SupervisorCommandRunner +from .core import K8SServiceMixin + + +class K8SSupervisorCommandRunner(K8SServiceMixin, SupervisorCommandRunner): + async def start_services(self): + await super().start_services() + await self.start_readiness_server() + + async def stop_services(self): + await self.stop_readiness_server() + await super().stop_services() + + +main = K8SSupervisorCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/Dockerfile.test b/python/xorbits/_mars/deploy/kubernetes/tests/Dockerfile.test new file mode 100644 index 000000000..2634fd3a7 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/Dockerfile.test @@ -0,0 +1,18 @@ +ARG BASE_CONTAINER=marsproject/mars-base +FROM ${BASE_CONTAINER} + +RUN /srv/retry.sh 3 /opt/conda/bin/conda install -c pkgs/main \ + coverage\>=5.0 cloudpickle \ + && conda clean --all -f -y + +RUN apt-get -yq update --allow-releaseinfo-change +RUN apt-get -yq install git gcc g++ + +COPY docker-logging.conf /srv/logging.conf +COPY build_ext.sh /srv/build_ext.sh +COPY entrypoint.sh /srv/entrypoint.sh +COPY graceful_stop.sh /srv/graceful_stop.sh + +RUN echo "import coverage; coverage.process_startup()" > \ + $(/opt/conda/bin/python -c "import site; print(site.getsitepackages()[-1])")/coverage.pth +RUN chmod a+x /srv/*.sh diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/__init__.py b/python/xorbits/_mars/deploy/kubernetes/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/build_ext.sh b/python/xorbits/_mars/deploy/kubernetes/tests/build_ext.sh new file mode 100644 index 000000000..073b2b3ad --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/build_ext.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +cd /mnt/mars +/opt/conda/bin/python setup.py build_ext -i diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/docker-logging.conf b/python/xorbits/_mars/deploy/kubernetes/tests/docker-logging.conf new file mode 100644 index 000000000..320ca6cb3 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/docker-logging.conf @@ -0,0 +1,50 @@ +[loggers] +keys=root,main,deploy,services,oscar,tornado + +[handlers] +keys=stream_handler + +[formatters] +keys=formatter + +[logger_root] +level=WARN +handlers=stream_handler + +[logger_main] +level=DEBUG +handlers=stream_handler +qualname=__main__ +propagate=0 + +[logger_deploy] +level=DEBUG +handlers=stream_handler +qualname=mars.deploy +propagate=0 + +[logger_oscar] +level=DEBUG +handlers=stream_handler +qualname=mars.oscar +propagate=0 + +[logger_services] +level=DEBUG +handlers=stream_handler +qualname=mars.services +propagate=0 + +[logger_tornado] +level=WARN +handlers=stream_handler +qualname=tornado +propagate=0 + +[handler_stream_handler] +class=StreamHandler +formatter=formatter +args=(sys.stderr,) + +[formatter_formatter] +format=%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/entrypoint.sh b/python/xorbits/_mars/deploy/kubernetes/tests/entrypoint.sh new file mode 100755 index 000000000..918d87bfd --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/entrypoint.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e +cd /mnt/mars +/opt/conda/bin/pip install -e ".[dev,extra]" + +mkdir -p .dist-coverage +export COVERAGE_FILE=.dist-coverage/.coverage + +COV_RUNNER="/opt/conda/bin/coverage run" + +if [[ $1 == *"supervisor"* ]]; then + $COV_RUNNER -m "$1" --log-conf /srv/logging.conf ${@:2} +elif [[ $1 == *"worker"* ]]; then + $COV_RUNNER -m "$1" --log-conf /srv/logging.conf ${@:2} +else + $COV_RUNNER -m "$1" --log-conf /srv/logging.conf ${@:2} +fi +while [[ -f /tmp/stopping.tmp ]]; do + sleep 1 +done diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/graceful_stop.sh b/python/xorbits/_mars/deploy/kubernetes/tests/graceful_stop.sh new file mode 100644 index 000000000..5153fc3c2 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/graceful_stop.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -e +touch /tmp/stopping.tmp +if [[ -f /tmp/mars-service.pid ]]; then + SERVICE_PID="$(cat /tmp/mars-service.pid)" + kill -INT "$SERVICE_PID" || true + CNT=0 + while kill -0 "$SERVICE_PID"; do + sleep 0.5 + CNT=$((CNT+1)) + if [[ $CNT -gt 10 ]]; then + break + fi + done + kill -INT "$SERVICE_PID" || true +fi diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/test_config.py b/python/xorbits/_mars/deploy/kubernetes/tests/test_config.py new file mode 100644 index 000000000..c4494fca6 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/test_config.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..config import ( + EmptyDirVolumeConfig, + MarsSupervisorsConfig, + MarsWorkersConfig, + NamespaceConfig, + RoleBindingConfig, + RoleConfig, + ServiceConfig, +) + + +def test_simple_objects(): + ns_config_dict = NamespaceConfig("ns_name").build() + assert ns_config_dict["metadata"]["name"] == "ns_name" + + role_config_dict = RoleConfig( + "mars-pod-reader", "ns_name", "", "pods", "get,watch,list" + ).build() + assert role_config_dict["metadata"]["name"] == "mars-pod-reader" + assert "get" in role_config_dict["rules"][0]["verbs"] + + role_binding_config_dict = RoleBindingConfig( + "mars-pod-reader-binding", "ns_name", "mars-pod-reader", "default" + ).build() + assert role_binding_config_dict["metadata"]["name"] == "mars-pod-reader-binding" + + service_config_dict = ServiceConfig( + "mars-test-service", "NodePort", "mars/service-type=marssupervisor", 7103, 7103 + ).build() + assert service_config_dict["metadata"]["name"] == "mars-test-service" + + +def test_supervisor_object(): + supervisor_config = MarsSupervisorsConfig( + 1, cpu=2, memory="10g", limit_resources=False, modules=["mars.test_mod"] + ) + supervisor_config.add_simple_envs(dict(TEST_ENV="test_val")) + + supervisor_config_dict = supervisor_config.build() + assert supervisor_config_dict["metadata"]["name"] == "marssupervisor" + assert supervisor_config_dict["spec"]["replicas"] == 1 + + container_dict = supervisor_config_dict["spec"]["template"]["spec"]["containers"][0] + assert int(container_dict["resources"]["requests"]["memory"]) == 10 * 1024**3 + + container_envs = dict((p["name"], p) for p in container_dict["env"]) + assert container_envs["TEST_ENV"]["value"] == "test_val" + assert container_envs["MKL_NUM_THREADS"]["value"] == "2" + assert container_envs["MARS_CPU_TOTAL"]["value"] == "2" + assert int(container_envs["MARS_MEMORY_TOTAL"]["value"]) == 10 * 1024**3 + assert container_envs["MARS_LOAD_MODULES"]["value"] == "mars.test_mod" + + +def test_worker_object(): + worker_config_dict = MarsWorkersConfig( + 4, + cpu=2, + memory=10 * 1024**3, + limit_resources=True, + memory_limit_ratio=2, + spill_volumes=[ + "/tmp/spill_vol", + EmptyDirVolumeConfig("empty-dir", "/tmp/empty"), + ], + worker_cache_mem="20%", + min_cache_mem="10%", + modules="mars.test_mod", + mount_shm=True, + ).build() + assert worker_config_dict["metadata"]["name"] == "marsworker" + assert worker_config_dict["spec"]["replicas"] == 4 + + container_dict = worker_config_dict["spec"]["template"]["spec"]["containers"][0] + assert int(container_dict["resources"]["requests"]["memory"]) == 10 * 1024**3 + assert int(container_dict["resources"]["limits"]["memory"]) == 20 * 1024**3 + + container_envs = dict((p["name"], p) for p in container_dict["env"]) + assert container_envs["MKL_NUM_THREADS"]["value"] == "2" + assert container_envs["MARS_CPU_TOTAL"]["value"] == "2" + assert int(container_envs["MARS_MEMORY_TOTAL"]["value"]) == 10 * 1024**3 + assert container_envs["MARS_LOAD_MODULES"]["value"] == "mars.test_mod" + assert set(container_envs["MARS_SPILL_DIRS"]["value"].split(":")) == { + "/tmp/empty", + "/mnt/hostpath0", + } + assert 
container_envs["MARS_CACHE_MEM_SIZE"]["value"] == "20%" + + volume_list = worker_config_dict["spec"]["template"]["spec"]["volumes"] + volume_envs = dict((v["name"], v) for v in volume_list) + assert "empty-dir" in volume_envs + assert volume_envs["host-path-vol-0"]["hostPath"]["path"] == "/tmp/spill_vol" + + volume_mounts = dict((v["name"], v) for v in container_dict["volumeMounts"]) + assert volume_mounts["empty-dir"]["mountPath"] == "/tmp/empty" + assert volume_mounts["host-path-vol-0"]["mountPath"] == "/mnt/hostpath0" + + worker_config_dict = MarsWorkersConfig( + 4, + cpu=2, + memory=10 * 1024**3, + limit_resources=False, + spill_volumes=[ + "/tmp/spill_vol", + EmptyDirVolumeConfig("empty-dir", "/tmp/empty"), + ], + modules="mars.test_mod", + mount_shm=False, + ).build() + + volume_list = worker_config_dict["spec"]["template"]["spec"]["volumes"] + assert "shm-volume" not in volume_list + + container_dict = worker_config_dict["spec"]["template"]["spec"]["containers"][0] + volume_mounts = dict((v["name"], v) for v in container_dict["volumeMounts"]) + assert "shm-volume" not in volume_mounts diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/test_kubernetes.py b/python/xorbits/_mars/deploy/kubernetes/tests/test_kubernetes.py new file mode 100644 index 000000000..03eee880d --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/test_kubernetes.py @@ -0,0 +1,284 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import os +import shutil +import subprocess +import tempfile +import uuid +from contextlib import contextmanager +from distutils.spawn import find_executable + +import numpy as np +import pytest + +from .... import tensor as mt +from ....tests.core import mock +from .. 
import new_cluster +from ..config import HostPathVolumeConfig + +try: + from kubernetes import client as k8s_client + from kubernetes import config as k8s_config +except ImportError: + k8s_client = k8s_config = None + +MARS_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(mt.__file__))) +TEST_ROOT = os.path.dirname(os.path.abspath(__file__)) +DOCKER_ROOT = os.path.join(os.path.dirname(TEST_ROOT), "docker") + +kube_available = ( + find_executable("kubectl") is not None + and find_executable("docker") is not None + and k8s_config is not None +) + + +def _collect_coverage(): + dist_coverage_path = os.path.join(MARS_ROOT, ".dist-coverage") + if os.path.exists(dist_coverage_path): + # change ownership of coverage files + if find_executable("sudo"): + proc = subprocess.Popen( + [ + "sudo", + "-n", + "chown", + "-R", + f"{os.geteuid()}:{os.getegid()}", + dist_coverage_path, + ], + shell=False, + ) + proc.wait() + + # rewrite paths in coverage result files + for fn in glob.glob(os.path.join(dist_coverage_path, ".coverage.*")): + if "COVERAGE_FILE" in os.environ: + new_cov_file = os.environ["COVERAGE_FILE"] + os.path.basename( + fn + ).replace(".coverage", "") + else: + new_cov_file = fn.replace(".dist-coverage" + os.sep, "") + shutil.copyfile(fn, new_cov_file) + shutil.rmtree(dist_coverage_path) + + +def _build_docker_images(use_test_docker_file=True): + image_name = "mars-test-image:" + uuid.uuid1().hex + try: + if use_test_docker_file: + proc = subprocess.Popen( + ["docker", "build", "-f", "Dockerfile.test", "-t", image_name, "."], + cwd=TEST_ROOT, + ) + else: + proc = subprocess.Popen( + [ + "docker", + "build", + "-f", + os.path.join(DOCKER_ROOT, "Dockerfile"), + "-t", + image_name, + ".", + ], + cwd=MARS_ROOT, + ) + if proc.wait() != 0: + raise SystemError("Executing docker build failed.") + + if use_test_docker_file: + proc = subprocess.Popen( + [ + "docker", + "run", + "-v", + MARS_ROOT + ":/mnt/mars", + image_name, + "/srv/build_ext.sh", + ] + ) + if proc.wait() != 0: + raise SystemError("Executing docker run failed.") + except: # noqa: E722 + _remove_docker_image(image_name) + raise + return image_name + + +def _remove_docker_image(image_name, raises=True): + if "CI" not in os.environ: + # delete image iff in CI environment + return + proc = subprocess.Popen(["docker", "rmi", "-f", image_name]) + if proc.wait() != 0 and raises: + raise SystemError("Executing docker rmi failed.") + + +def _load_docker_env(): + if os.path.exists("/var/run/docker.sock") or not shutil.which("minikube"): + return + + proc = subprocess.Popen(["minikube", "docker-env"], stdout=subprocess.PIPE) + proc.wait(30) + for line in proc.stdout: + line = line.decode().split("#", 1)[0] + line = line.strip() # type: str | bytes + export_pos = line.find("export") + if export_pos < 0: + continue + line = line[export_pos + 6 :].strip() + var, value = line.split("=", 1) + os.environ[var] = value.strip('"') + + +@contextmanager +def _start_kube_cluster(use_test_docker_file=True, **kwargs): + _load_docker_env() + image_name = _build_docker_images(use_test_docker_file=use_test_docker_file) + + temp_spill_dir = tempfile.mkdtemp(prefix="test-mars-k8s-") + api_client = k8s_config.new_client_from_config() + kube_api = k8s_client.CoreV1Api(api_client) + + cluster_client = None + try: + if use_test_docker_file: + extra_volumes = [ + HostPathVolumeConfig("mars-src-path", "/mnt/mars", MARS_ROOT) + ] + pre_stop_command = ["rm", "/tmp/stopping.tmp"] + else: + extra_volumes = [] + pre_stop_command = None + + cluster_client = 
new_cluster( + api_client, + image=image_name, + worker_spill_paths=[temp_spill_dir], + extra_volumes=extra_volumes, + pre_stop_command=pre_stop_command, + timeout=600, + log_when_fail=True, + **kwargs, + ) + + assert cluster_client.endpoint is not None + + pod_items = kube_api.list_namespaced_pod(cluster_client.namespace).to_dict() + + log_processes = [] + for item in pod_items["items"]: + log_processes.append( + subprocess.Popen( + [ + "kubectl", + "logs", + "-f", + "-n", + cluster_client.namespace, + item["metadata"]["name"], + ] + ) + ) + + yield + + if use_test_docker_file: + # turn off service processes with grace to get coverage data + procs = [] + pod_items = kube_api.list_namespaced_pod(cluster_client.namespace).to_dict() + for item in pod_items["items"]: + p = subprocess.Popen( + [ + "kubectl", + "exec", + "-n", + cluster_client.namespace, + item["metadata"]["name"], + "--", + "/srv/graceful_stop.sh", + ] + ) + procs.append(p) + for p in procs: + p.wait() + + [p.terminate() for p in log_processes] + finally: + shutil.rmtree(temp_spill_dir) + if cluster_client: + try: + cluster_client.stop(wait=True, timeout=20) + except TimeoutError: + pass + _collect_coverage() + _remove_docker_image(image_name, False) + + +@pytest.mark.parametrize("use_test_docker_file", [True, False]) +@pytest.mark.skipif(not kube_available, reason="Cannot run without kubernetes") +def test_run_in_kubernetes(use_test_docker_file): + with _start_kube_cluster( + supervisor_cpu=0.5, + supervisor_mem="1G", + worker_cpu=0.5, + worker_mem="1G", + worker_cache_mem="64m", + extra_labels={"mars-test/group": "test-label-name"}, + extra_env={"MARS_K8S_GROUP_LABELS": "mars-test/group"}, + use_test_docker_file=use_test_docker_file, + ): + a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1 + b = mt.ones((100, 100), chunk_size=20) * 2 * 1 + 1 + c = (a * b * 2 + 1).sum() + r = c.execute().fetch() + + expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1 + np.testing.assert_array_equal(r, expected.sum()) + + +@pytest.mark.skipif(not kube_available, reason="Cannot run without kubernetes") +@mock.patch( + "kubernetes.client.CoreV1Api.create_namespaced_replication_controller", + new=lambda *_, **__: None, +) +@mock.patch( + "kubernetes.client.AppsV1Api.create_namespaced_deployment", + new=lambda *_, **__: None, +) +def test_create_timeout(): + _load_docker_env() + api_client = k8s_config.new_client_from_config() + + cluster = None + try: + extra_vol_config = HostPathVolumeConfig("mars-src-path", "/mnt/mars", MARS_ROOT) + with pytest.raises(TimeoutError): + cluster = new_cluster( + api_client, + image="pseudo_image", + supervisor_cpu=0.5, + supervisor_mem="1G", + worker_cpu=0.5, + worker_mem="1G", + extra_volumes=[extra_vol_config], + timeout=1, + ) + finally: + if cluster: + cluster.stop(wait=True) + _collect_coverage() diff --git a/python/xorbits/_mars/deploy/kubernetes/worker.py b/python/xorbits/_mars/deploy/kubernetes/worker.py new file mode 100644 index 000000000..f2e41cb05 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/worker.py @@ -0,0 +1,55 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +from ..oscar.worker import WorkerCommandRunner +from .core import K8SServiceMixin + +logger = logging.getLogger(__name__) + + +class K8SWorkerCommandRunner(K8SServiceMixin, WorkerCommandRunner): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + async def start_services(self): + from ...services.cluster import ClusterAPI + from ..oscar.worker import start_worker + + self.write_pid_file() + await start_worker( + self.pool.external_address, + self.args.supervisors, + self.band_to_resource, + list(self.args.load_modules), + self.config, + mark_ready=False, + ) + await self.wait_all_supervisors_ready() + + cluster_api = await ClusterAPI.create(self.args.endpoint) + await cluster_api.mark_node_ready() + + await self.start_readiness_server() + + async def stop_services(self): + await self.stop_readiness_server() + await super().stop_services() + + +main = K8SWorkerCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/deploy/oscar/__init__.py b/python/xorbits/_mars/deploy/oscar/__init__.py new file mode 100644 index 000000000..775a30345 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .ray import RayClusterBackend, new_cluster_in_ray, new_ray_session diff --git a/python/xorbits/_mars/deploy/oscar/base_config.yml b/python/xorbits/_mars/deploy/oscar/base_config.yml new file mode 100644 index 000000000..21f7a2c0d --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/base_config.yml @@ -0,0 +1,86 @@ +services: + - cluster + - session + - storage + - meta + - lifecycle + - scheduling + - subtask + - task + - mutable +cluster: + backend: fixed + node_timeout: 120 + node_check_interval: 1 + log_dir: null +session: + custom_log_dir: null +storage: + default_config: + transfer_block_size: 5 * 1024 ** 2 + plasma: + store_memory: 20% + "@overriding_fields": ["backends"] +meta: + store: dict +task: + default_config: + optimize_tileable_graph: yes + optimize_chunk_graph: yes + fuse_enabled: yes + initial_same_color_num: null + as_broadcaster_successor_num: null + execution_config: + backend: mars +scheduling: + autoscale: + enabled: false + min_workers: 1 # Must >=1, mars need at least 1 worker to fetch data + max_workers: 100 + scheduler_backlog_timeout: 60 + worker_idle_timeout: 120 + speculation: + # Enables (yes) or disables (no) speculative execution of subtasks. 
+ # If enabled, `initial_same_color_num` will be set to 1 to ensure there are enough homogeneous + # subtasks for calculating statistics + enabled: no + # If yes, detect slow subtasks but do not actually submit speculative runs for them + dry: no + # The interval, in seconds, between checks for speculative subtasks. + interval: 5 + # The percentage of subtasks that have not finished yet at which to start speculation. + threshold: 75% + # Minimum amount of time, in seconds, a task must run before being considered for speculation. + # This can be used to avoid launching speculative copies of tasks that are very short. + min_task_runtime: 3 + # How many times slower a task is than the median to be considered for speculation. + multiplier: 1.5 + # Max number of concurrent speculative runs for a subtask. + max_concurrent_run: 3 + subtask_cancel_timeout: 5 +metrics: + backend: console + # If the backend is prometheus, prometheus config can be added as follows: + # prometheus: + # port: 8988 +oscar: + numa: + # external address scheme, default null, + # available values: null, ucx + external_addr_scheme: null + # enable internal address for in-process communication + enable_internal_addr: yes + gpu: + # external address scheme, default null, + # available values: null, ucx + external_addr_scheme: null + # enable internal address for in-process communication + enable_internal_addr: yes + extra_conf: + ucx: + tcp: null + nvlink: null + infiniband: null + rdmacm: null + cuda-copy: null + create-cuda-contex: null diff --git a/python/xorbits/_mars/deploy/oscar/cmdline.py b/python/xorbits/_mars/deploy/oscar/cmdline.py new file mode 100644 index 000000000..10965fe07 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/cmdline.py @@ -0,0 +1,259 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
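+# Shared command-line runner for Mars services: parses common arguments, sets up logging and the event loop, and drives the actor pool; concrete supervisor/worker entrypoints implement create_actor_pool, start_services and stop_services.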
+ +import argparse +import asyncio +import faulthandler +import glob +import importlib +import json +import logging.config +import os +import sys +import tempfile +from typing import List + +import psutil + +from ...utils import ensure_coverage +from ..utils import get_third_party_modules_from_config, load_service_config_file + +logger = logging.getLogger(__name__) +_is_windows: bool = sys.platform.startswith("win") +ensure_coverage() + + +class OscarCommandRunner: + command_description = None + node_role = None + _port_file_prefix = "mars_service_process" + + def __init__(self): + faulthandler.enable() + + self.args = None + self.ports = None + self.config = {} + self.pool = None + + self.logging_conf = {} + + self._running = False + + def config_args(self, parser): + parser.add_argument("-e", "--endpoint", help="endpoint of the service") + parser.add_argument("-H", "--host", help="host name of the service") + parser.add_argument( + "-p", + "--ports", + help="ports of the service, must equal to num of processes", + ) + parser.add_argument("-c", "--config", help="service configuration") + parser.add_argument( + "-f", "--config-file", help="configuration file of the service" + ) + parser.add_argument( + "-s", + "--supervisors", + help="endpoint of supervisors, needed for workers and webs " + "when kv-store argument is not available, or when you " + "need to use multiple supervisors without kv-store", + ) + parser.add_argument("--log-level", help="log level") + parser.add_argument("--log-format", help="log format") + parser.add_argument( + "--log-conf", help="log config file, logging.conf by default" + ) + parser.add_argument("--load-modules", nargs="*", help="modules to import") + parser.add_argument( + "--use-uvloop", help="use uvloop, 'auto' by default. 
Use 'no' to disable" + ) + + def _set_log_dir(self): + cluster_config: dict = self.config.get("cluster") + if cluster_config is None: + raise KeyError('"cluster" key is missing!') + log_dir = cluster_config.get("log_dir") + self.logging_conf["log_dir"] = log_dir + self.logging_conf["from_cmd"] = True + + def _get_logging_config_paths(self): + log_conf = self.args.log_conf or "logging.conf" + + return [ + log_conf, + os.path.join(os.path.abspath("."), log_conf), + os.path.join( + os.path.dirname(os.path.abspath(__file__)), "file-logging.conf" + ), + ] + + def config_logging(self): + self._set_log_dir() + + # get level and format cmd line config + log_level = self.args.log_level + level = log_level.upper() if log_level else None + self.logging_conf["level"] = level + formatter = self.args.log_format + if formatter: + self.logging_conf["format"] = formatter + + config_paths = self._get_logging_config_paths() + for i, conf_path in enumerate(config_paths): + if os.path.exists(conf_path): + self.logging_conf["file"] = conf_path + break + + @classmethod + def _build_endpoint_file_path(cls, pid: int = None, asterisk: bool = False): + pid = pid or os.getpid() + return os.path.join( + tempfile.gettempdir(), f'{cls._port_file_prefix}.{"*" if asterisk else pid}' + ) + + def _write_supervisor_endpoint_file(self, args): + file_name = self._build_endpoint_file_path() + with open(file_name, "w") as port_file: + port_file.write(args.endpoint) + return file_name + + def _collect_supervisors_from_dir(self): + endpoints = [] + for fn in glob.glob(self._build_endpoint_file_path(asterisk=True)): + _, pid_str = os.path.basename(fn).rsplit(".", 1) + # detect if process exists + if pid_str.isdigit() and not psutil.pid_exists(int(pid_str)): + continue + with open(fn, "r") as ep_file: + endpoints.append(ep_file.read().strip()) + return endpoints + + @classmethod + def get_default_config_file(cls): + mod_file_path = os.path.dirname( + importlib.import_module(cls.__module__).__file__ + ) + return os.path.join(mod_file_path, "config.yml") + + def parse_args(self, parser, argv, environ=None): + environ = environ or os.environ + args = parser.parse_args(argv) + + if args.endpoint is not None and args.host is not None: # pragma: no cover + raise ValueError("Cannot specify host and endpoint at the same time") + + if "MARS_TASK_DETAIL" in environ: + task_detail = json.loads(environ["MARS_TASK_DETAIL"]) + task_type, task_index = ( + task_detail["task"]["type"], + task_detail["task"]["index"], + ) + + args.host = args.host or task_detail["cluster"][task_type][task_index] + args.supervisors = args.supervisors or ",".join( + task_detail["cluster"]["supervisor"] + ) + + default_host = "0.0.0.0" if not _is_windows else "127.0.0.1" + env_host = os.environ.get( + "MARS_BIND_HOST", os.environ.get("MARS_CONTAINER_IP", default_host) + ) + args.host = args.host or env_host + + args.ports = args.ports or os.environ.get("MARS_BIND_PORT") + if args.ports is not None: + self.ports = [int(p) for p in args.ports.split(",")] + + if args.endpoint is None and len(self.ports or []) == 1: + args.endpoint = f"{args.host}:{self.ports[0]}" + self.ports = None + + args.use_uvloop = args.use_uvloop or "auto" + + if args.config is not None: + self.config = json.loads(args.config) + else: + if args.config_file is None: + args.config_file = self.get_default_config_file() + self.config = load_service_config_file(args.config_file) + + load_modules = [] + for mods in list(args.load_modules or ()) + get_third_party_modules_from_config( + self.config, 
self.node_role, environ + ): + load_modules.extend(mods.split(",") if mods else []) + args.load_modules = tuple(load_modules) + + if args.supervisors is None: + args.supervisors = ",".join(self._collect_supervisors_from_dir()) + + return args + + async def _main(self, argv): + self.config_logging() + + try: + pool = self.pool = await self.create_actor_pool() + + await self.start_services() + self._running = True + await pool.join() + except asyncio.CancelledError: + if self._running: # pragma: no branch + await self.stop_services() + if self.pool: # pragma: no branch + await self.pool.stop() + + async def create_actor_pool(self): + raise NotImplementedError + + async def start_services(self): + raise NotImplementedError + + async def stop_services(self): + raise NotImplementedError + + def create_loop(self): + use_uvloop = self.args.use_uvloop.strip() + if use_uvloop in ("0", "no"): + loop = asyncio.get_event_loop() + else: + try: + import uvloop + + loop = uvloop.new_event_loop() + asyncio.set_event_loop(loop) + except ImportError: + if use_uvloop == "auto": + loop = asyncio.get_event_loop() + else: # pragma: no cover + raise + return loop + + def __call__(self, argv: List[str] = None): + parser = argparse.ArgumentParser(description=self.command_description) + self.config_args(parser) + self.args = self.parse_args(parser, argv) + + loop = self.create_loop() + task = loop.create_task(self._main(argv)) + + try: + loop.run_until_complete(task) + except KeyboardInterrupt: + task.cancel() + loop.run_until_complete(task) + # avoid displaying exception-unhandled warnings + task.exception() diff --git a/python/xorbits/_mars/deploy/oscar/config.yml b/python/xorbits/_mars/deploy/oscar/config.yml new file mode 100644 index 000000000..fe6918a14 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/config.yml @@ -0,0 +1,3 @@ +"@inherits": base_config.yml +storage: + backends: [shared_memory] diff --git a/python/xorbits/_mars/deploy/oscar/file-logging.conf b/python/xorbits/_mars/deploy/oscar/file-logging.conf new file mode 100644 index 000000000..12b952719 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/file-logging.conf @@ -0,0 +1,74 @@ +[loggers] +keys=root,main,deploy,services,oscar,tornado,dataframe,learn,tensor + +[handlers] +keys=stream_handler,file_handler + +[formatters] +keys=formatter + +[logger_root] +level=WARN +handlers=stream_handler,file_handler + +[logger_main] +level=DEBUG +handlers=stream_handler,file_handler +qualname=__main__ +propagate=0 + +[logger_deploy] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.deploy +propagate=0 + +[logger_oscar] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.oscar +propagate=0 + +[logger_services] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.services +propagate=0 + +[logger_dataframe] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.dataframe +propagate=0 + +[logger_learn] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.learn +propagate=0 + +[logger_tensor] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.tensor +propagate=0 + +[logger_tornado] +level=WARN +handlers=stream_handler,file_handler +qualname=tornado +propagate=0 + +[handler_stream_handler] +class=StreamHandler +formatter=formatter +level=DEBUG +args=(sys.stderr,) + +[handler_file_handler] +class=FileHandler +formatter=formatter +level=DEBUG + +[formatter_formatter] +format=%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s diff --git 
a/python/xorbits/_mars/deploy/oscar/local.py b/python/xorbits/_mars/deploy/oscar/local.py new file mode 100644 index 000000000..8c256509a --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/local.py @@ -0,0 +1,433 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import atexit +import logging +import os +import sys +from concurrent.futures import Future as SyncFuture +from typing import Dict, List, Union + +import numpy as np + +from ... import oscar as mo +from ...core.entrypoints import init_extension_entrypoints +from ...lib.aio import get_isolation +from ...metrics import init_metrics +from ...oscar.backends.router import Router +from ...resource import cpu_count, cuda_count, mem_total +from ...services import NodeRole +from ...services.task.execution.api import ExecutionConfig +from ...typing import ClientType, ClusterType +from ..utils import get_third_party_modules_from_config, load_config +from .pool import create_supervisor_actor_pool, create_worker_actor_pool +from .service import start_supervisor, start_worker, stop_supervisor, stop_worker +from .session import AbstractSession, _new_session, ensure_isolation_created + +logger = logging.getLogger(__name__) + +_is_exiting_future = SyncFuture() +atexit.register( + lambda: _is_exiting_future.set_result(0) if not _is_exiting_future.done() else None +) + +# The default config file. +DEFAULT_CONFIG_FILE = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "config.yml" +) + +# the default times to retry subtask. +DEFAULT_SUBTASK_MAX_RETRIES = 3 +# the default time to cancel a subtask. 
+DEFAULT_SUBTASK_CANCEL_TIMEOUT = 5 + + +def _load_config(config: Union[str, Dict] = None): + return load_config(config, default_config_file=DEFAULT_CONFIG_FILE) + + +async def new_cluster_in_isolation( + address: str = "0.0.0.0", + n_worker: int = 1, + n_cpu: Union[int, str] = "auto", + mem_bytes: Union[int, str] = "auto", + cuda_devices: Union[List[int], str] = "auto", + subprocess_start_method: str = None, + backend: str = None, + config: Union[str, Dict] = None, + web: bool = True, + timeout: float = None, + n_supervisor_process: int = 0, + numa_external_addr_scheme: str = None, + numa_enable_internal_addr: bool = None, + gpu_external_addr_scheme: str = None, + gpu_enable_internal_addr: bool = None, + external_addr_scheme: str = None, + enable_internal_addr: bool = None, + oscar_extra_conf: dict = None, + log_config: dict = None, +) -> ClientType: + cluster = LocalCluster( + address, + n_worker, + n_cpu, + mem_bytes, + cuda_devices, + subprocess_start_method, + backend, + config, + web, + n_supervisor_process, + numa_external_addr_scheme=numa_external_addr_scheme, + numa_enable_internal_addr=numa_enable_internal_addr, + gpu_external_addr_scheme=gpu_external_addr_scheme, + gpu_enable_internal_addr=gpu_enable_internal_addr, + external_addr_scheme=external_addr_scheme, + enable_internal_addr=enable_internal_addr, + oscar_extra_conf=oscar_extra_conf, + log_config=log_config, + ) + await cluster.start() + return await LocalClient.create(cluster, timeout) + + +async def new_cluster( + address: str = "0.0.0.0", + n_worker: int = 1, + n_cpu: Union[int, str] = "auto", + mem_bytes: Union[int, str] = "auto", + cuda_devices: Union[List[int], str] = "auto", + subprocess_start_method: str = None, + backend: str = None, + config: Union[str, Dict] = None, + web: bool = True, + loop: asyncio.AbstractEventLoop = None, + use_uvloop: Union[bool, str] = "auto", + n_supervisor_process: int = 0, + numa_external_addr_scheme: str = None, + numa_enable_internal_addr: bool = None, + gpu_external_addr_scheme: str = None, + gpu_enable_internal_addr: bool = None, + external_addr_scheme: str = None, + enable_internal_addr: bool = None, + oscar_extra_conf: dict = None, +) -> ClientType: + coro = new_cluster_in_isolation( + address, + n_worker=n_worker, + n_cpu=n_cpu, + mem_bytes=mem_bytes, + cuda_devices=cuda_devices, + subprocess_start_method=subprocess_start_method, + backend=backend, + config=config, + web=web, + n_supervisor_process=n_supervisor_process, + numa_external_addr_scheme=numa_external_addr_scheme, + numa_enable_internal_addr=numa_enable_internal_addr, + gpu_external_addr_scheme=gpu_external_addr_scheme, + gpu_enable_internal_addr=gpu_enable_internal_addr, + external_addr_scheme=external_addr_scheme, + enable_internal_addr=enable_internal_addr, + oscar_extra_conf=oscar_extra_conf, + ) + isolation = ensure_isolation_created(dict(loop=loop, use_uvloop=use_uvloop)) + fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop) + client = await asyncio.wrap_future(fut) + client.session.as_default() + return client + + +async def stop_cluster(cluster: ClusterType): + isolation = get_isolation() + coro = cluster.stop() + await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(coro, isolation.loop)) + Router.set_instance(None) + + +class LocalCluster: + def __init__( + self: ClusterType, + address: str = "0.0.0.0", + n_worker: int = 1, + n_cpu: Union[int, str] = "auto", + mem_bytes: Union[int, str] = "auto", + cuda_devices: Union[List[int], List[List[int]], str] = "auto", + subprocess_start_method: str = 
None, + backend: str = None, + config: Union[str, Dict] = None, + web: Union[bool, str] = "auto", + n_supervisor_process: int = 0, + numa_external_addr_scheme: str = None, + numa_enable_internal_addr: bool = None, + gpu_external_addr_scheme: str = None, + gpu_enable_internal_addr: bool = None, + external_addr_scheme: str = None, + enable_internal_addr: str = None, + oscar_extra_conf: dict = None, + log_config: dict = None, + ): + # load third party extensions. + init_extension_entrypoints() + # auto choose the subprocess_start_method. + if subprocess_start_method is None: + subprocess_start_method = ( + "spawn" if sys.platform == "win32" else "forkserver" + ) + self._address = address + self._n_worker = n_worker + self._n_cpu = cpu_count() if n_cpu == "auto" else n_cpu + self._mem_bytes = mem_total() if mem_bytes == "auto" else mem_bytes + self._cuda_devices = self._get_cuda_devices(cuda_devices, n_worker) + self._subprocess_start_method = subprocess_start_method + self._config = load_config(config, default_config_file=DEFAULT_CONFIG_FILE) + execution_config = ExecutionConfig.from_config(self._config, backend=backend) + self._log_config = log_config + self._backend = execution_config.backend + self._web = web + self._n_supervisor_process = n_supervisor_process + + execution_config.merge_from( + ExecutionConfig.from_params( + backend=self._backend, + n_worker=self._n_worker, + n_cpu=self._n_cpu, + mem_bytes=self._mem_bytes, + cuda_devices=self._cuda_devices, + subtask_cancel_timeout=self._config.get("scheduling", {}).get( + "subtask_cancel_timeout", DEFAULT_SUBTASK_CANCEL_TIMEOUT + ), + subtask_max_retries=self._config.get("scheduling", {}).get( + "subtask_max_retries", DEFAULT_SUBTASK_MAX_RETRIES + ), + ) + ) + + # process oscar config + self._process_oscar_config( + numa_external_addr_scheme=numa_external_addr_scheme, + numa_enable_internal_addr=numa_enable_internal_addr, + gpu_external_addr_scheme=gpu_external_addr_scheme, + gpu_enable_internal_addr=gpu_enable_internal_addr, + external_addr_scheme=external_addr_scheme, + enable_internal_addr=enable_internal_addr, + oscar_extra_conf=oscar_extra_conf, + ) + + self._bands_to_resource = execution_config.get_deploy_band_resources() + self._supervisor_pool = None + self._worker_pools = [] + self._exiting_check_task = None + + self.supervisor_address = None + self.web_address = None + + def _process_oscar_config( + self, + numa_external_addr_scheme: str = None, + numa_enable_internal_addr: bool = None, + gpu_external_addr_scheme: str = None, + gpu_enable_internal_addr: bool = None, + external_addr_scheme: str = None, + enable_internal_addr: str = None, + oscar_extra_conf: dict = None, + ): + # process oscar config + assert "oscar" in self._config + oscar_config = self._config["oscar"] + numa_config = oscar_config["numa"] + numa_external_addr_scheme = ( + numa_external_addr_scheme + if numa_external_addr_scheme is not None + else external_addr_scheme + ) + if numa_external_addr_scheme: + numa_config["external_addr_scheme"] = numa_external_addr_scheme + numa_enable_internal_addr = ( + numa_enable_internal_addr + if numa_enable_internal_addr is not None + else enable_internal_addr + ) + if numa_enable_internal_addr is not None: + numa_config["enable_internal_addr"] = numa_enable_internal_addr + gpu_config = oscar_config["gpu"] + gpu_external_addr_scheme = ( + gpu_external_addr_scheme + if gpu_external_addr_scheme is not None + else external_addr_scheme + ) + if gpu_external_addr_scheme: + gpu_config["external_addr_scheme"] = 
gpu_external_addr_scheme + gpu_enable_internal_addr = ( + gpu_enable_internal_addr + if gpu_enable_internal_addr is not None + else enable_internal_addr + ) + if gpu_enable_internal_addr is not None: + gpu_config["enable_internal_addr"] = gpu_enable_internal_addr + if oscar_extra_conf is not None: + oscar_config["extra_conf"] = oscar_extra_conf + + @staticmethod + def _get_cuda_devices(cuda_devices, n_worker): + if cuda_devices == "auto": + total = cuda_count() + all_devices = np.arange(total) + return [list(arr) for arr in np.array_split(all_devices, n_worker)] + + else: + if not cuda_devices: + return [] + elif isinstance(cuda_devices[0], int): + assert n_worker == 1 + return [cuda_devices] + else: + assert len(cuda_devices) == n_worker + return cuda_devices + + @property + def backend(self): + return self._backend + + @property + def external_address(self): + return self._supervisor_pool.external_address + + async def start(self): + await self._start_supervisor_pool() + await self._start_worker_pools() + # start service + await self._start_service() + + # init metrics to guarantee metrics use in driver + metric_configs = self._config.get("metrics", {}) + metric_backend = metric_configs.get("backend") + init_metrics(metric_backend, config=metric_configs.get(metric_backend)) + + if self._web: + from ...services.web.supervisor import WebActor + + web_actor = await mo.actor_ref( + WebActor.default_uid(), address=self.supervisor_address + ) + self.web_address = await web_actor.get_web_address() + logger.warning("Web service started at %s", self.web_address) + + self._exiting_check_task = asyncio.create_task(self._check_exiting()) + + async def _check_exiting(self): + await asyncio.wrap_future(_is_exiting_future) + await self.stop() + + async def _start_supervisor_pool(self): + supervisor_modules = get_third_party_modules_from_config( + self._config, NodeRole.SUPERVISOR + ) + self._supervisor_pool = await create_supervisor_actor_pool( + self._address, + n_process=self._n_supervisor_process, + modules=supervisor_modules, + subprocess_start_method=self._subprocess_start_method, + metrics=self._config.get("metrics", {}), + web=self._web, + # passing logging conf to config logging when create pools + logging_conf=self._log_config, + oscar_config=self._config.get("oscar"), + ) + self.supervisor_address = self._supervisor_pool.external_address + + async def _start_worker_pools(self): + worker_modules = get_third_party_modules_from_config( + self._config, NodeRole.WORKER + ) + for band_to_resource in self._bands_to_resource: + worker_pool = await create_worker_actor_pool( + self._address, + band_to_resource, + modules=worker_modules, + subprocess_start_method=self._subprocess_start_method, + metrics=self._config.get("metrics", {}), + web=self._web, + # passing logging conf to config logging when create pools + logging_conf=self._log_config, + oscar_config=self._config.get("oscar"), + ) + self._worker_pools.append(worker_pool) + + async def _start_service(self): + self._web = await start_supervisor( + self.supervisor_address, config=self._config, web=self._web + ) + for worker_pool, band_to_resource in zip( + self._worker_pools, self._bands_to_resource + ): + await start_worker( + worker_pool.external_address, + self.supervisor_address, + band_to_resource, + config=self._config, + ) + + async def stop(self): + from .session import SessionAPI + + # delete all sessions + session_api = await SessionAPI.create(self._supervisor_pool.external_address) + await session_api.delete_all_sessions() + + for 
worker_pool in self._worker_pools: + await stop_worker(worker_pool.external_address, self._config) + await stop_supervisor(self._supervisor_pool.external_address, self._config) + for worker_pool in self._worker_pools: + await worker_pool.stop() + await self._supervisor_pool.stop() + AbstractSession.reset_default() + self._exiting_check_task.cancel() + Router.set_instance(None) + + +class LocalClient: + def __init__(self: ClientType, cluster: ClusterType, session: AbstractSession): + self._cluster = cluster + self.session = session + + @classmethod + async def create( + cls, + cluster: LocalCluster, + timeout: float = None, + ) -> ClientType: + session = await _new_session( + cluster.external_address, + backend=cluster.backend, + default=True, + timeout=timeout, + ) + client = LocalClient(cluster, session) + session.client = client + return client + + @property + def web_address(self): + return self._cluster.web_address + + async def __aenter__(self): + return self + + async def __aexit__(self, *_): + await self.stop() + + async def stop(self): + await stop_cluster(self._cluster) diff --git a/python/xorbits/_mars/deploy/oscar/pool.py b/python/xorbits/_mars/deploy/oscar/pool.py new file mode 100644 index 000000000..67cfb0f7c --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/pool.py @@ -0,0 +1,271 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import configparser +import logging +import os +import sys +import tempfile +from typing import Dict, List, Optional, Tuple + +from ... import oscar as mo +from ...constants import MARS_LOG_PATH_KEY, MARS_LOG_PREFIX, MARS_TMP_DIR_PREFIX +from ...resource import Resource, cuda_count + +logger = logging.getLogger(__name__) + + +def _need_suspend_sigint() -> bool: + try: + from IPython import get_ipython + + return get_ipython() is not None + except ImportError: + return False + + +def _get_root_logger_level_and_format() -> Tuple[str, Optional[str]]: + root = logging.getLogger() + level = logging.getLevelName(root.getEffectiveLevel()) + if level.startswith("WARN"): + level = "WARN" + handler = root.handlers[0] if root.handlers else None + fmt = handler.formatter._fmt if handler else None + return level, fmt + + +def _parse_file_logging_config( + file_path: str, + log_path: str, + level: Optional[str], + formatter: Optional[str] = None, + from_cmd: bool = False, +) -> configparser.RawConfigParser: + """ + If env is ipython (from_cmd=False), the log level and format on the web follow our default configuration file, + and the level and format on the console use the user's configuration (logging.basicConfig) or keep the default. + + If env is cmd (from_cmd=True, e.g. user invokes `python -m mars.worker`), + the log level and format on the web and console follow user's config (--log-level and --log-format) + or our default configuration file. 
+ """ + config = configparser.RawConfigParser() + config.read(file_path) + logger_sections = [ + "logger_root", + "logger_main", + "logger_deploy", + "logger_oscar", + "logger_services", + "logger_dataframe", + "logger_learn", + "logger_tensor", + "handler_stream_handler", + "handler_file_handler", + ] + all_sections = config.sections() + for section in logger_sections: + if level and section in all_sections: + config[section]["level"] = level.upper() + + if "handler_file_handler" in config: + if sys.platform.startswith("win"): + log_path = log_path.replace("\\", "/") + config["handler_file_handler"]["args"] = rf"('{log_path}',)" + if formatter: + format_section = "formatter_formatter" + config[format_section]["format"] = formatter + + stream_handler_sec = "handler_stream_handler" + file_handler_sec = "handler_file_handler" + root_sec = "logger_root" + # If not from cmd (like ipython) and user uses its own config file, + # need to judge that whether handler_stream_handler section is in the config. + if not from_cmd and stream_handler_sec in all_sections: + # console and web log keeps the default config as root logger + root_level, root_fmt = _get_root_logger_level_and_format() + config[file_handler_sec]["level"] = root_level or "WARN" + config[stream_handler_sec]["level"] = root_level or "WARN" + config[root_sec]["level"] = root_level or "WARN" + if root_fmt: + config.add_section("formatter_console") + config["formatter_console"]["format"] = root_fmt + config["formatters"]["keys"] += ",console" + config[stream_handler_sec]["formatter"] = "console" + return config + + +def _config_logging(**kwargs) -> Optional[configparser.RawConfigParser]: + web: bool = kwargs.get("web", True) + # web=False usually means it is a test environment. + if not web: + return + if kwargs.get("logging_conf", None) is None: + return + config = kwargs["logging_conf"] + from_cmd = config.get("from_cmd", False) + log_dir = config.get("log_dir", None) + log_conf_file = config.get("file", None) + level = config.get("level", None) + formatter = config.get("formatter", None) + logging_config_path = log_conf_file or os.path.join( + os.path.dirname(os.path.abspath(__file__)), "file-logging.conf" + ) + # default config, then create a temp file + if (os.environ.get(MARS_LOG_PATH_KEY, None)) is None or ( + not os.path.exists(os.environ[MARS_LOG_PATH_KEY]) + ): + if log_dir is None: + mars_tmp_dir = tempfile.mkdtemp(prefix=MARS_TMP_DIR_PREFIX) + else: + mars_tmp_dir = os.path.join(log_dir, MARS_TMP_DIR_PREFIX) + os.makedirs(mars_tmp_dir, exist_ok=True) + _, file_path = tempfile.mkstemp(prefix=MARS_LOG_PREFIX, dir=mars_tmp_dir) + os.environ[MARS_LOG_PATH_KEY] = file_path + logging_conf = _parse_file_logging_config( + logging_config_path, file_path, level, formatter, from_cmd + ) + # bind user's level and format when using default log conf + logging.config.fileConfig( + logging_conf, + disable_existing_loggers=False, + ) + logger.debug("Use logging config file at %s", logging_config_path) + return logging_conf + else: + logging_conf = _parse_file_logging_config( + logging_config_path, + os.environ[MARS_LOG_PATH_KEY], + level, + formatter, + from_cmd, + ) + logging.config.fileConfig( + logging_conf, + os.environ[MARS_LOG_PATH_KEY], + disable_existing_loggers=False, + ) + logger.debug("Use logging config file at %s", logging_config_path) + return logging_conf + + +async def create_supervisor_actor_pool( + address: str, + n_process: int, + modules: List[str] = None, + ports: List[int] = None, + subprocess_start_method: str = None, + 
oscar_config: dict = None, + **kwargs, +): + logging_conf = _config_logging(**kwargs) + kwargs["logging_conf"] = logging_conf + if oscar_config: + numa_config = oscar_config.get("numa", dict()) + numa_external_address_scheme = numa_config.get("external_addr_scheme", None) + numa_enable_internal_address = numa_config.get("enable_internal_addr", True) + external_address_schemes = [numa_external_address_scheme] * (n_process + 1) + enable_internal_addresses = [numa_enable_internal_address] * (n_process + 1) + extra_conf = oscar_config["extra_conf"] + else: + external_address_schemes = enable_internal_addresses = extra_conf = None + return await mo.create_actor_pool( + address, + n_process=n_process, + ports=ports, + external_address_schemes=external_address_schemes, + enable_internal_addresses=enable_internal_addresses, + modules=modules, + subprocess_start_method=subprocess_start_method, + suspend_sigint=_need_suspend_sigint(), + extra_conf=extra_conf, + **kwargs, + ) + + +async def create_worker_actor_pool( + address: str, + band_to_resource: Dict[str, Resource], + n_io_process: int = 1, + modules: List[str] = None, + ports: List[int] = None, + cuda_devices: List[int] = None, + subprocess_start_method: str = None, + oscar_config: dict = None, + **kwargs, +): + logging_conf = _config_logging(**kwargs) + kwargs["logging_conf"] = logging_conf + # TODO: support NUMA when ready + n_process = sum( + int(resource.num_cpus) or int(resource.num_gpus) + for resource in band_to_resource.values() + ) + envs = [] + labels = ["main"] + + oscar_config = oscar_config or dict() + numa_config = oscar_config.get("numa", dict()) + numa_external_address_scheme = numa_config.get("external_addr_scheme") + numa_enable_internal_address = numa_config.get("enable_internal_addr") + gpu_config = oscar_config.get("gpu", dict()) + gpu_external_address_scheme = gpu_config.get("external_addr_scheme") + gpu_enable_internal_address = gpu_config.get("enable_internal_addr") + extra_conf = oscar_config.get("extra_conf", dict()) + + if cuda_devices is None: # pragma: no cover + env_devices = os.environ.get("CUDA_VISIBLE_DEVICES") + if not env_devices: + cuda_devices = list(range(cuda_count())) + else: + cuda_devices = [int(i) for i in env_devices.split(",")] + + external_address_schemes = [numa_external_address_scheme] + enable_internal_addresses = [numa_enable_internal_address] + i_gpu = iter(sorted(cuda_devices)) + for band, resource in band_to_resource.items(): + if band.startswith("gpu"): + idx = str(next(i_gpu)) + envs.append({"CUDA_VISIBLE_DEVICES": idx}) + labels.append(f"gpu-{idx}") + external_address_schemes.append(gpu_external_address_scheme) + enable_internal_addresses.append(gpu_enable_internal_address) + else: + assert band.startswith("numa") + num_cpus = int(resource.num_cpus) + if cuda_devices: + # if has cuda device, disable all cuda devices for numa processes + envs.extend([{"CUDA_VISIBLE_DEVICES": "-1"} for _ in range(num_cpus)]) + labels.extend([band] * num_cpus) + external_address_schemes.extend( + [numa_external_address_scheme for _ in range(num_cpus)] + ) + enable_internal_addresses.extend( + [numa_enable_internal_address for _ in range(num_cpus)] + ) + + return await mo.create_actor_pool( + address, + n_process=n_process, + ports=ports, + n_io_process=n_io_process, + labels=labels, + envs=envs, + modules=modules, + subprocess_start_method=subprocess_start_method, + suspend_sigint=_need_suspend_sigint(), + external_address_schemes=external_address_schemes, + 
enable_internal_addresses=enable_internal_addresses, + extra_conf=extra_conf, + **kwargs, + ) diff --git a/python/xorbits/_mars/deploy/oscar/ray.py b/python/xorbits/_mars/deploy/oscar/ray.py new file mode 100644 index 000000000..1dbcabba8 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/ray.py @@ -0,0 +1,680 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import itertools +import logging +import os +import time +from typing import AsyncGenerator, Dict, List, Optional, Union + +from ... import oscar as mo +from ...core.entrypoints import init_extension_entrypoints +from ...metrics import init_metrics +from ...oscar.backends.ray.driver import RayActorDriver +from ...oscar.backends.ray.pool import RayPoolState +from ...oscar.backends.ray.utils import ( + node_placement_to_address, + process_address_to_placement, + process_placement_to_address, +) +from ...oscar.backends.router import Router +from ...oscar.errors import ReconstructWorkerError +from ...resource import Resource +from ...services import NodeRole +from ...services.cluster.backends.base import ( + AbstractClusterBackend, + register_cluster_backend, +) +from ...services.task.execution.api import ExecutionConfig +from ...utils import lazy_import, retry_callable +from ..utils import get_third_party_modules_from_config, load_config +from .pool import create_supervisor_actor_pool, create_worker_actor_pool +from .service import start_supervisor, start_worker, stop_supervisor, stop_worker +from .session import ( + AbstractSession, + _new_session, + ensure_isolation_created, + new_session, +) + +ray = lazy_import("ray") +logger = logging.getLogger(__name__) + +# The default config file. +DEFAULT_CONFIG_FILE = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "rayconfig.yml" +) +# The default value for supervisor standalone (not share node with worker). +DEFAULT_SUPERVISOR_STANDALONE = False +# The default value for supervisor sub pool count. 
+DEFAULT_SUPERVISOR_SUB_POOL_NUM = 0 + + +def _load_config(config: Union[str, Dict] = None): + return load_config(config, default_config_file=DEFAULT_CONFIG_FILE) + + +@register_cluster_backend +class RayClusterBackend(AbstractClusterBackend): + name = "ray" + + def __init__(self, lookup_address: str, cluster_state_ref): + self._supervisors = [n.strip() for n in lookup_address.split(",")] + self._cluster_state_ref = cluster_state_ref + + @classmethod + async def create( + cls, node_role: NodeRole, lookup_address: str, pool_address: str + ) -> "RayClusterBackend": + try: + ref = await mo.create_actor( + ClusterStateActor, + uid=ClusterStateActor.default_uid(), + address=lookup_address, + ) + except mo.ActorAlreadyExist: # pragma: no cover + ref = await mo.actor_ref( + ClusterStateActor.default_uid(), address=lookup_address + ) + return cls(lookup_address, ref) + + async def watch_supervisors(self) -> AsyncGenerator[List[str], None]: + yield self._supervisors + + async def get_supervisors(self, filter_ready: bool = True) -> List[str]: + return self._supervisors + + async def new_worker(self, worker_address): + return await self._cluster_state_ref.new_worker(worker_address) + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> str: + return await self._cluster_state_ref.request_worker( + worker_cpu, worker_mem, timeout + ) + + async def release_worker(self, address: str): + return await self._cluster_state_ref.release_worker(address) + + async def reconstruct_worker(self, address: str): + return await self._cluster_state_ref.reconstruct_worker(address) + + def get_cluster_state_ref(self): + return self._cluster_state_ref + + +class ClusterStateActor(mo.StatelessActor): + def __init__(self): + self._worker_cpu, self._worker_mem, self._config = None, None, None + self._pg_name, self._band_to_resource, self._worker_modules = None, None, None + self._pg_counter = itertools.count() + self._worker_count = 0 + self._workers = {} + self._releasing_tasks = {} + self._reconstructing_tasks = {} + + async def __post_create__(self): + self._pg_name, _, _ = process_address_to_placement(self.address) + + def set_config(self, worker_cpu, worker_mem, config): + self._worker_cpu, self._worker_mem, self._config = ( + worker_cpu, + worker_mem, + config, + ) + # TODO(chaokunyang) Support gpu + self._band_to_resource = { + "numa-0": Resource(num_cpus=self._worker_cpu, mem_bytes=self._worker_mem) + } + self._worker_modules = get_third_party_modules_from_config( + self._config, NodeRole.WORKER + ) + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> Optional[str]: + worker_cpu = worker_cpu or self._worker_cpu + worker_mem = worker_mem or self._worker_mem + bundle = { + "CPU": worker_cpu, + # "memory": worker_mem or self._worker_mem + } + band_to_resource = { + "numa-0": Resource(num_cpus=worker_cpu, mem_bytes=worker_mem) + } + start_time = time.time() + logger.info("Start to request worker with resource %s.", bundle) + # TODO rescale ray placement group instead of creating new placement group + pg_name = f"{self._pg_name}_{next(self._pg_counter)}" + pg = ray.util.placement_group(name=pg_name, bundles=[bundle], strategy="SPREAD") + create_pg_timeout = timeout or 120 + try: + await asyncio.wait_for(pg.ready(), timeout=create_pg_timeout) + except asyncio.CancelledError: # pragma: no cover + logger.warning( + "Request worker with placement group %s in %s seconds canceled.", + pg.bundle_specs, + 
create_pg_timeout, + ) + ray.util.remove_placement_group(pg) + return None + except asyncio.TimeoutError: + logger.warning( + "Request worker failed, " + "can not create placement group %s in %s seconds.", + pg.bundle_specs, + create_pg_timeout, + ) + ray.util.remove_placement_group(pg) + return None + logger.info( + "Creating placement group %s took %.4f seconds", + pg.bundle_specs, + time.time() - start_time, + ) + worker_address = process_placement_to_address(pg_name, 0, 0) + worker_pool = await self.create_worker(worker_address) + await self.start_worker(worker_address, band_to_resource=band_to_resource) + logger.info( + "Request worker %s succeeds in %.4f seconds", + worker_address, + time.time() - start_time, + ) + self._workers[worker_address] = (worker_pool, pg) + return worker_address + + async def create_worker(self, worker_address): + start_time = time.time() + worker_pool = await create_worker_actor_pool( + worker_address, + self._band_to_resource, + modules=self._worker_modules, + metrics=self._config.get("metrics", {}), + ) + logger.info( + "Create worker node %s succeeds in %.4f seconds.", + worker_address, + time.time() - start_time, + ) + return worker_pool + + async def start_worker(self, worker_address, band_to_resource=None): + self._worker_count += 1 + start_time = time.time() + band_to_resource = band_to_resource or self._band_to_resource + await start_worker( + worker_address, self.address, band_to_resource, config=self._config + ) + worker_pool = ray.get_actor(worker_address) + await worker_pool.mark_service_ready.remote() + logger.info( + "Start services on worker %s succeeds in %.4f seconds.", + worker_address, + time.time() - start_time, + ) + return worker_pool + + async def release_worker(self, address: str): + logger.info("Start to release worker %s", address) + task = self._reconstructing_tasks.get(address) + if task is not None: + task.cancel() + + task = self._releasing_tasks.get(address) + if task is not None: + logger.info("Waiting for releasing worker %s", address) + return await task + + async def _release_worker(): + await stop_worker(address, self._config) + pool, pg = self._workers.pop(address) + await pool.actor_pool.remote("stop") + if "COV_CORE_SOURCE" in os.environ: # pragma: no cover + try: + # must clean up first, or coverage info lost + await pool.cleanup.remote() + except: # noqa: E722 # nosec # pylint: disable=bare-except + pass + ray.kill(pool.main_pool) + ray.util.remove_placement_group(pg) + logger.info("Released worker %s", address) + + task = asyncio.create_task(_release_worker()) + task.add_done_callback(lambda _: self._releasing_tasks.pop(address, None)) + self._releasing_tasks[address] = task + return await task + + async def reconstruct_worker(self, address: str): + task = self._releasing_tasks.get(address) + if task is not None: + raise ReconstructWorkerError( + f"Can't reconstruct releasing worker {address}" + ) + + task = self._reconstructing_tasks.get(address) + if task is not None: + logger.info("Waiting for reconstruct worker %s", address) + return await task + + async def _reconstruct_worker(): + logger.info("Reconstruct worker %s", address) + actor = ray.get_actor(address) + # ray call will error when actor is restarting + state = await retry_callable( + actor.state.remote, ex_type=ray.exceptions.RayActorError, sync=False + )() + if state == RayPoolState.SERVICE_READY: + logger.info("Worker %s is service ready.") + return + + if state == RayPoolState.INIT: + await actor.start.remote() + else: + assert state == 
RayPoolState.POOL_READY + + start_time = time.time() + await start_worker( + address, self.address, self._band_to_resource, config=self._config + ) + await actor.mark_service_ready.remote() + logger.info( + "Start services on worker %s succeeds in %.4f seconds.", + address, + time.time() - start_time, + ) + + task = asyncio.create_task(_reconstruct_worker()) + task.add_done_callback(lambda _: self._reconstructing_tasks.pop(address, None)) + self._reconstructing_tasks[address] = task + return await task + + +async def new_cluster( + cluster_name: str = None, + supervisor_cpu: int = 1, + supervisor_mem: int = 1 * 1024**3, + worker_num: int = 1, + worker_cpu: int = 2, + worker_mem: int = 2 * 1024**3, + backend: str = None, + config: Union[str, Dict] = None, + **kwargs, +): + cluster_name = cluster_name or f"ray-cluster-{int(time.time())}" + if not ray.is_initialized(): + logger.warning("Ray is not started, start the local ray cluster by `ray.init`.") + # add 16 logical cpus for other computing in ray. + ray.init(num_cpus=16 + worker_num * worker_cpu) + ensure_isolation_created(kwargs) + if kwargs: # pragma: no cover + raise TypeError(f"new_cluster got unexpected arguments: {list(kwargs)}") + n_supervisor_process = kwargs.get( + "n_supervisor_process", DEFAULT_SUPERVISOR_SUB_POOL_NUM + ) + cluster = RayCluster( + cluster_name, + supervisor_cpu, + supervisor_mem, + worker_num, + worker_cpu, + worker_mem, + backend, + config, + n_supervisor_process=n_supervisor_process, + ) + try: + await cluster.start() + return await RayClient.create(cluster) + except Exception as ex: # pragma: no cover + # cleanup the cluster if failed. + try: + await cluster.stop() + except Exception as stop_ex: + raise stop_ex from ex + raise ex + + +def new_cluster_in_ray(**kwargs): + isolation = ensure_isolation_created(kwargs) + coro = new_cluster(**kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop) + client = fut.result() + client.session.as_default() + return client + + +new_cluster_in_ray.__doc__ = new_cluster.__doc__ + + +def new_ray_session( + address: str = None, + session_id: str = None, + backend: str = "mars", + default: bool = True, + **new_cluster_kwargs, +) -> AbstractSession: + """ + + Parameters + ---------- + address: str + mars web server address. + session_id: str + session id. If not specified, will be generated automatically. + backend: str + The executor backend. Available values are "mars" and "ray", default is "mars". + default: bool + whether set the session as default session. + new_cluster_kwargs: + See `new_cluster` arguments. + """ + client = None + if not address: + client = new_cluster_in_ray(backend=backend, **new_cluster_kwargs) + session_id = session_id or client.session.session_id + address = client.address + session = new_session( + address=address, session_id=session_id, backend=backend, default=default + ) + session.client = client + if default: + # SyncSession set isolated_session as default session instead. + AbstractSession.default.client = client + return session + + +class RayCluster: + _supervisor_pool: "ray.actor.ActorHandle" + _worker_pools: List["ray.actor.ActorHandle"] + + def __init__( + self, + cluster_name: str, + supervisor_cpu: Union[int, float] = 1, + supervisor_mem: int = 1 * 1024**3, + worker_num: int = 1, + worker_cpu: Union[int, float] = 2, + worker_mem: int = 2 * 1024**3, + backend: str = None, + config: Union[str, Dict] = None, + n_supervisor_process: int = DEFAULT_SUPERVISOR_SUB_POOL_NUM, + ): + # load third party extensions. 
+ init_extension_entrypoints() + self._cluster_name = cluster_name + self._supervisor_cpu = supervisor_cpu + self._supervisor_mem = supervisor_mem + self._n_supervisor_process = n_supervisor_process + self._worker_num = worker_num + self._worker_cpu = worker_cpu + self._worker_mem = worker_mem + self.backend = backend + # load config file to dict. + self._config = load_config(config, default_config_file=DEFAULT_CONFIG_FILE) + self.supervisor_address = None + # Hold actor handles to avoid being freed + self._supervisor_pool = None + self._worker_addresses = [] + self._worker_pools = [] + self._stopped = False + self._cluster_backend = None + self.web_address = None + + async def start(self): + try: + # Python 3.8 support force argument. + logging.basicConfig( + format=ray.ray_constants.LOGGER_FORMAT, level=logging.INFO, force=True + ) + except ValueError: # pragma: no cover + logging.basicConfig( + format=ray.ray_constants.LOGGER_FORMAT, level=logging.INFO + ) + execution_config = ExecutionConfig.from_config( + self._config, backend=self.backend + ) + self.backend = execution_config.backend + if self.backend == "mars": + await self.start_oscar( + self._n_supervisor_process, + self._supervisor_cpu, + self._supervisor_mem, + self._worker_num, + self._worker_cpu, + self._worker_mem, + ) + elif self.backend == "ray": + execution_config.merge_from( + ExecutionConfig.from_params( + backend=self.backend, + n_worker=self._worker_num, + n_cpu=self._worker_num * self._worker_cpu, + mem_bytes=self._worker_mem, + subtask_num_cpus=self._worker_cpu, + subtask_memory=self._worker_mem, + ) + ) + assert self._n_supervisor_process == 0, self._n_supervisor_process + await self.start_oscar( + self._n_supervisor_process, + self._supervisor_cpu, + self._supervisor_mem, + 0, + 0, + 0, + ) + else: + raise ValueError(f"Unsupported backend type: {self.backend}.") + + async def start_oscar( + self, + n_supervisor_process, + supervisor_cpu, + supervisor_mem, + worker_num, + worker_cpu, + worker_mem, + ): + logger.info("Start cluster with config %s", self._config) + # init metrics to guarantee metrics use in driver + metric_configs = self._config.get("metrics", {}) + metric_backend = metric_configs.get("backend") + init_metrics(metric_backend, config=metric_configs.get(metric_backend)) + address_to_resources = dict() + supervisor_standalone = ( + self._config.get("cluster", {}) + .get("ray", {}) + .get("supervisor", {}) + .get("standalone", DEFAULT_SUPERVISOR_STANDALONE) + ) + supervisor_sub_pool_num = ( + self._config.get("cluster", {}) + .get("ray", {}) + .get("supervisor", {}) + .get("sub_pool_num", n_supervisor_process) + ) + from ...storage.ray import support_specify_owner + + if not support_specify_owner(): # pragma: no cover + logger.warning( + "Current installed ray version does not support specify owner, " + "autoscale may not work." 
+ ) + # config['scheduling']['autoscale']['enabled'] = False + self.supervisor_address = process_placement_to_address(self._cluster_name, 0, 0) + if "cluster" not in self._config: # pragma: no cover + self._config["cluster"] = dict() + self._config["cluster"]["lookup_address"] = self.supervisor_address + address_to_resources[node_placement_to_address(self._cluster_name, 0)] = { + "CPU": supervisor_cpu, + # "memory": supervisor_mem, + } + worker_addresses = [] + if supervisor_standalone or worker_num == 0: + for worker_index in range(1, worker_num + 1): + worker_address = process_placement_to_address( + self._cluster_name, worker_index, 0 + ) + worker_addresses.append(worker_address) + worker_node_address = node_placement_to_address( + self._cluster_name, worker_index + ) + address_to_resources[worker_node_address] = { + "CPU": worker_cpu, + # "memory": self._worker_mem, + } + else: + for worker_index in range(worker_num): + worker_process_index = ( + supervisor_sub_pool_num + 1 if worker_index == 0 else 0 + ) + worker_address = process_placement_to_address( + self._cluster_name, worker_index, worker_process_index + ) + worker_addresses.append(worker_address) + worker_node_address = node_placement_to_address( + self._cluster_name, worker_index + ) + address_to_resources[worker_node_address] = { + "CPU": worker_cpu, + # "memory": self._worker_mem, + } + mo.setup_cluster(address_to_resources) + + # third party modules from config + supervisor_modules = get_third_party_modules_from_config( + self._config, NodeRole.SUPERVISOR + ) + + # set global router an empty one. + Router.set_instance(Router(list(), None)) + + # create supervisor actor pool + supervisor_pool_coro = asyncio.create_task( + create_supervisor_actor_pool( + self.supervisor_address, + n_process=supervisor_sub_pool_num, + main_pool_cpus=0, + sub_pool_cpus=0, + modules=supervisor_modules, + metrics=self._config.get("metrics", {}), + ) + ) + worker_pools = [ + asyncio.create_task( + create_worker_actor_pool( + addr, + { + "numa-0": Resource( + num_cpus=worker_cpu, mem_bytes=self._worker_mem + ) + }, + modules=get_third_party_modules_from_config( + self._config, NodeRole.WORKER + ), + metrics=self._config.get("metrics", {}), + ) + ) + for addr in worker_addresses + ] + self._supervisor_pool = await supervisor_pool_coro + logger.info("Create supervisor on node %s succeeds.", self.supervisor_address) + self._cluster_backend = await RayClusterBackend.create( + NodeRole.WORKER, self.supervisor_address, self.supervisor_address + ) + cluster_state_ref = self._cluster_backend.get_cluster_state_ref() + await self._cluster_backend.get_cluster_state_ref().set_config( + worker_cpu, self._worker_mem, self._config + ) + # start service + await start_supervisor(self.supervisor_address, config=self._config) + logger.info( + "Start services on supervisor %s succeeds.", self.supervisor_address + ) + await self._supervisor_pool.mark_service_ready.remote() + worker_pools = await asyncio.gather(*worker_pools) + logger.info("Create %s workers succeeds.", len(worker_pools)) + await asyncio.gather( + *[cluster_state_ref.start_worker(addr) for addr in worker_addresses] + ) + logger.info("Start services on %s workers succeeds.", len(worker_addresses)) + for worker_address, worker_pool in zip(worker_addresses, worker_pools): + self._worker_addresses.append(worker_address) + self._worker_pools.append(worker_pool) + + from ...services.web.supervisor import WebActor + + web_actor = await mo.actor_ref( + WebActor.default_uid(), address=self.supervisor_address + ) 
+ self.web_address = await web_actor.get_web_address() + logger.warning("Web service started at %s", self.web_address) + + async def stop(self): + if not self._stopped: + try: + for worker_address in self._worker_addresses: + await stop_worker(worker_address, self._config) + for pool in self._worker_pools: + await pool.actor_pool.remote("stop") + if self._supervisor_pool is not None: + await stop_supervisor(self.supervisor_address, self._config) + await self._supervisor_pool.actor_pool.remote("stop") + finally: + AbstractSession.reset_default() + RayActorDriver.stop_cluster() + Router.set_instance(None) + self._stopped = True + + +class RayClient: + def __init__(self, cluster: RayCluster, session: AbstractSession): + self._cluster = cluster + self._address = cluster.supervisor_address + self._session = session + # hold ray cluster by client to avoid actor handle out-of-scope + session.client = self + + @classmethod + async def create(cls, cluster: RayCluster) -> "RayClient": + session = await _new_session( + cluster.supervisor_address, default=True, backend=cluster.backend + ) + client = RayClient(cluster, session) + AbstractSession.default.client = client + return client + + @property + def address(self): + return self._session.address + + @property + def session(self): + return self._session + + @property + def web_address(self): + return self._cluster.web_address + + async def __aenter__(self): + return self + + async def __aexit__(self, *_): + await self.stop() + + async def stop(self): + await self._cluster.stop() + AbstractSession.reset_default() diff --git a/python/xorbits/_mars/deploy/oscar/rayconfig.yml b/python/xorbits/_mars/deploy/oscar/rayconfig.yml new file mode 100644 index 000000000..6530466af --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/rayconfig.yml @@ -0,0 +1,20 @@ +"@inherits": base_config.yml +cluster: + backend: ray + ray: + supervisor: + standalone: no + sub_pool_num: 0 +session: + custom_log_dir: null +storage: + backends: [ray] +scheduling: + autoscale: + enabled: false + scheduler_backlog_timeout: 20 + worker_idle_timeout: 40 + subtask_max_retries: 3 + subtask_max_reschedules: 2 +metrics: + backend: ray diff --git a/python/xorbits/_mars/deploy/oscar/service.py b/python/xorbits/_mars/deploy/oscar/service.py new file mode 100644 index 000000000..fc50f9c2d --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/service.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
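
Aside (not part of the patch): the snippet below sketches how the Ray entry points added in `ray.py` above are typically driven, using only names that appear in this diff (`new_ray_session`, the client's `web_address`). The worker counts and memory sizes are placeholder values, not recommendations.

```python
# Minimal usage sketch for the Ray deployment added above; values are examples.
from xorbits._mars.deploy.oscar.ray import new_ray_session

# With no `address`, new_ray_session() builds a RayCluster via new_cluster_in_ray()
# (ray.init is called internally if Ray is not yet initialized) and registers
# the resulting session as the default one.
session = new_ray_session(
    backend="mars",          # execution backend: "mars" or "ray"
    worker_num=2,            # forwarded to new_cluster(...)
    worker_cpu=2,
    worker_mem=2 * 1024**3,
)

# The RayClient created for the cluster is attached to the session and exposes
# the supervisor's web UI address.
print(session.client.web_address)
```
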
+ +import logging +from typing import Dict, List, Union + +from ...oscar import ServerClosed +from ...resource import Resource +from ...services import NodeRole, start_services, stop_services + +logger = logging.getLogger(__name__) + + +async def start_supervisor( + address: str, + lookup_address: str = None, + modules: Union[List, str, None] = None, + config: Dict = None, + web: Union[str, bool] = "auto", +): + logger.debug("Starting Mars supervisor at %s", address) + lookup_address = lookup_address or address + backend = config["cluster"].get("backend", "fixed") + if backend == "fixed" and config["cluster"].get("lookup_address") is None: + config["cluster"]["lookup_address"] = lookup_address + if web: + # try to append web to services + config["services"].append("web") + if modules: + config["modules"] = modules + try: + await start_services(NodeRole.SUPERVISOR, config, address=address) + logger.debug("Mars supervisor started at %s", address) + except ImportError: + if web == "auto": + config["services"] = [ + service for service in config["services"] if service != "web" + ] + await start_services(NodeRole.SUPERVISOR, config, address=address) + logger.debug("Mars supervisor started at %s", address) + return False + else: # pragma: no cover + raise + else: + return bool(web) + + +async def stop_supervisor(address: str, config: Dict = None): + try: + await stop_services(NodeRole.SUPERVISOR, address=address, config=config) + except (ConnectionRefusedError, ServerClosed): # pragma: no cover + pass + + +async def start_worker( + address: str, + lookup_address: str, + band_to_resource: Dict[str, Resource], + modules: Union[List, str, None] = None, + config: Dict = None, + mark_ready: bool = True, +): + logger.debug("Starting Mars worker at %s", address) + backend = config["cluster"].get("backend", "fixed") + if backend == "fixed" and config["cluster"].get("lookup_address") is None: + config["cluster"]["lookup_address"] = lookup_address + if config["cluster"].get("resource") is None: + config["cluster"]["resource"] = band_to_resource + if any( + band_name.startswith("gpu-") for band_name in band_to_resource + ): # pragma: no cover + if "cuda" not in config["storage"]["backends"]: + config["storage"]["backends"].append("cuda") + if modules: + config["modules"] = modules + await start_services( + NodeRole.WORKER, config, address=address, mark_ready=mark_ready + ) + logger.debug("Mars worker started at %s", address) + + +async def stop_worker(address: str, config: Dict = None): + try: + await stop_services(NodeRole.WORKER, address=address, config=config) + except (ConnectionRefusedError, ServerClosed): # pragma: no cover + pass diff --git a/python/xorbits/_mars/deploy/oscar/session.py b/python/xorbits/_mars/deploy/oscar/session.py new file mode 100644 index 000000000..4950477e2 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/session.py @@ -0,0 +1,2076 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
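
Aside (not part of the patch): `service.py` above only wires service start-up and shutdown; the sketch below mirrors the order used by the cluster classes in `local.py` and `ray.py` (supervisor first, then workers; shutdown in reverse). The addresses, the single `numa-0` band, and the `config` dict are assumed inputs for illustration.

```python
# Illustrative orchestration of the helpers defined in service.py; `config` is
# assumed to be a cluster config dict (with "cluster"/"services" sections) such
# as the one produced by load_config() in the deploy utilities.
from xorbits._mars.deploy.oscar.service import (
    start_supervisor,
    start_worker,
    stop_supervisor,
    stop_worker,
)
from xorbits._mars.resource import Resource


async def bring_up(supervisor_addr: str, worker_addr: str, config: dict) -> bool:
    band_to_resource = {"numa-0": Resource(num_cpus=4)}
    # Supervisor services come up first; returns whether the web service started.
    web_started = await start_supervisor(supervisor_addr, config=config, web="auto")
    # Workers register against the supervisor through its lookup address.
    await start_worker(worker_addr, supervisor_addr, band_to_resource, config=config)
    return web_started


async def tear_down(supervisor_addr: str, worker_addr: str, config: dict) -> None:
    # Shutdown runs in the reverse direction: workers first, then the supervisor.
    await stop_worker(worker_addr, config)
    await stop_supervisor(supervisor_addr, config)
```
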
+ +import asyncio +import concurrent.futures +import itertools +import json +import logging +import random +import string +import threading +import time +import warnings +from abc import ABC, ABCMeta, abstractmethod +from collections import defaultdict +from dataclasses import dataclass +from functools import wraps +from numbers import Integral +from typing import Any, Callable, Coroutine, Dict, List, Optional, Tuple, Union +from urllib.parse import urlparse +from weakref import WeakKeyDictionary, WeakSet, ref + +import numpy as np + +from ... import oscar as mo +from ...config import options +from ...core import ChunkType, TileableGraph, TileableType, enter_mode +from ...core.entrypoints import init_extension_entrypoints +from ...core.operand import Fetch +from ...lib.aio import ( + Isolation, + alru_cache, + get_isolation, + new_isolation, + stop_isolation, +) +from ...metrics import Metrics +from ...services.cluster import AbstractClusterAPI, ClusterAPI +from ...services.lifecycle import AbstractLifecycleAPI, LifecycleAPI +from ...services.meta import AbstractMetaAPI, MetaAPI +from ...services.mutable import MutableAPI, MutableTensor +from ...services.session import AbstractSessionAPI, SessionAPI +from ...services.storage import StorageAPI +from ...services.task import AbstractTaskAPI, TaskAPI, TaskResult +from ...services.task.execution.api import Fetcher +from ...services.web import OscarWebAPI +from ...typing import BandType, ClientType +from ...utils import ( + Timer, + build_fetch, + classproperty, + copy_tileables, + implements, + merge_chunks, + merged_chunk_as_tileable_type, + register_asyncio_task_timeout_detector, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class Progress: + value: float = 0.0 + + +@dataclass +class Profiling: + result: dict = None + + +class ExecutionInfo: + def __init__( + self, + aio_task: asyncio.Task, + progress: Progress, + profiling: Profiling, + loop: asyncio.AbstractEventLoop, + to_execute_tileables: List[TileableType], + ): + self._aio_task = aio_task + self._progress = progress + self._profiling = profiling + self._loop = loop + self._to_execute_tileables = [ref(t) for t in to_execute_tileables] + + self._future_local = threading.local() + + def _ensure_future(self): + try: + self._future_local.future + except AttributeError: + + async def wait(): + return await self._aio_task + + self._future_local.future = fut = asyncio.run_coroutine_threadsafe( + wait(), self._loop + ) + self._future_local.aio_future = asyncio.wrap_future(fut) + + @property + def loop(self): + return self._loop + + @property + def aio_task(self): + return self._aio_task + + def progress(self) -> float: + return self._progress.value + + @property + def to_execute_tileables(self) -> List[TileableType]: + return [t() for t in self._to_execute_tileables] + + def profiling_result(self) -> dict: + return self._profiling.result + + def result(self, timeout=None): + self._ensure_future() + return self._future_local.future.result(timeout=timeout) + + def cancel(self): + self._aio_task.cancel() + + def __getattr__(self, attr): + self._ensure_future() + return getattr(self._future_local.aio_future, attr) + + def __await__(self): + self._ensure_future() + return self._future_local.aio_future.__await__() + + def get_future(self): + self._ensure_future() + return self._future_local.aio_future + + +warning_msg = """ +No session found, local session \ +will be created in background, \ +it may take a while before execution. 
\ +If you want to new a local session by yourself, \ +run code below: + +``` +import mars + +mars.new_session() +``` +""" + + +class AbstractSession(ABC): + name = None + _default = None + _lock = threading.Lock() + + def __init__(self, address: str, session_id: str): + self._address = address + self._session_id = session_id + self._closed = False + + @property + def address(self): + return self._address + + @property + def session_id(self): + return self._session_id + + def __eq__(self, other): + return ( + isinstance(other, AbstractSession) + and self._address == other.address + and self._session_id == other.session_id + ) + + def __hash__(self): + return hash((AbstractSession, self._address, self._session_id)) + + def as_default(self) -> "AbstractSession": + """ + Mark current session as default session. + """ + AbstractSession._default = self + return self + + @classmethod + def reset_default(cls): + AbstractSession._default = None + + @classproperty + def default(self): + return AbstractSession._default + + +class AbstractAsyncSession(AbstractSession, metaclass=ABCMeta): + @classmethod + @abstractmethod + async def init( + cls, address: str, session_id: str, new: bool = True, **kwargs + ) -> "AbstractSession": + """ + Init a new session. + + Parameters + ---------- + address : str + Address. + session_id : str + Session ID. + new : bool + New a session. + kwargs + + Returns + ------- + session + """ + + async def destroy(self): + """ + Destroy a session. + """ + self.reset_default() + self._closed = True + + @abstractmethod + async def execute(self, *tileables, **kwargs) -> ExecutionInfo: + """ + Execute tileables. + + Parameters + ---------- + tileables + Tileables. + kwargs + """ + + @abstractmethod + async def fetch(self, *tileables, **kwargs) -> list: + """ + Fetch tileables' data. + + Parameters + ---------- + tileables + Tileables. + + Returns + ------- + data + """ + + @abstractmethod + async def _get_ref_counts(self) -> Dict[str, int]: + """ + Get all ref counts + + Returns + ------- + ref_counts + """ + + @abstractmethod + async def fetch_tileable_op_logs( + self, + tileable_op_key: str, + offsets: Union[Dict[str, List[int]], str, int], + sizes: Union[Dict[str, List[int]], str, int], + ) -> Dict: + """ + Fetch logs given tileable op key. + + Parameters + ---------- + tileable_op_key : str + Tileable op key. + offsets + Chunk op key to offsets. + sizes + Chunk op key to sizes. + + Returns + ------- + chunk_key_to_logs + """ + + @abstractmethod + async def get_total_n_cpu(self): + """ + Get number of cluster cpus. + + Returns + ------- + number_of_cpu: int + """ + + @abstractmethod + async def get_cluster_versions(self) -> List[str]: + """ + Get versions used in current Mars cluster + + Returns + ------- + version_list : list + List of versions + """ + + @abstractmethod + async def get_web_endpoint(self) -> Optional[str]: + """ + Get web endpoint of current session + + Returns + ------- + web_endpoint : str + web endpoint + """ + + @abstractmethod + async def create_remote_object( + self, session_id: str, name: str, object_cls, *args, **kwargs + ): + """ + Create remote object + + Parameters + ---------- + session_id : str + Session ID. + name : str + object_cls + args + kwargs + + Returns + ------- + actor_ref + """ + + @abstractmethod + async def get_remote_object(self, session_id: str, name: str): + """ + Get remote object. + + Parameters + ---------- + session_id : str + Session ID. 
+ name : str + + Returns + ------- + actor_ref + """ + + @abstractmethod + async def destroy_remote_object(self, session_id: str, name: str): + """ + Destroy remote object. + + Parameters + ---------- + session_id : str + Session ID. + name : str + """ + + @abstractmethod + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + """ + Create a mutable tensor. + + Parameters + ---------- + shape: tuple + Shape of the mutable tensor. + + dtype: np.dtype or str + Data type of the mutable tensor. + + name: str, optional + Name of the mutable tensor, a random name will be used if not specified. + + default_value: optional + Default value of the mutable tensor. Default is 0. + + chunk_size: int or tuple, optional + Chunk size of the mutable tensor. + + Returns + ------- + MutableTensor + """ + + @abstractmethod + async def get_mutable_tensor(self, name: str): + """ + Get a mutable tensor by name. + + Parameters + ---------- + name: str + Name of the mutable tensor to get. + + Returns + ------- + MutableTensor + """ + + async def stop_server(self): + """ + Stop server. + """ + + +class AbstractSyncSession(AbstractSession, metaclass=ABCMeta): + @classmethod + @abstractmethod + def init( + cls, + address: str, + session_id: str, + backend: str = "mars", + new: bool = True, + **kwargs, + ) -> "AbstractSession": + """ + Init a new session. + + Parameters + ---------- + address : str + Address. + session_id : str + Session ID. + backend : str + Backend. + new : bool + New a session. + kwargs + + Returns + ------- + session + """ + + @abstractmethod + def execute( + self, tileable, *tileables, show_progress: Union[bool, str] = None, **kwargs + ) -> Union[List[TileableType], TileableType, ExecutionInfo]: + """ + Execute tileables. + + Parameters + ---------- + tileable + Tileable. + tileables + Tileables. + show_progress + If show progress. + kwargs + + Returns + ------- + result + """ + + @abstractmethod + def fetch(self, *tileables, **kwargs) -> list: + """ + Fetch tileables. + + Parameters + ---------- + tileables + Tileables. + kwargs + + Returns + ------- + fetched_data : list + """ + + @abstractmethod + def fetch_infos(self, *tileables, fields, **kwargs) -> list: + """ + Fetch infos of tileables. + + Parameters + ---------- + tileables + Tileables. + fields + List of fields + kwargs + + Returns + ------- + fetched_infos : list + """ + + @abstractmethod + def decref(self, *tileables_keys): + """ + Decref tileables. + + Parameters + ---------- + tileables_keys : list + Tileables' keys + """ + + @abstractmethod + def _get_ref_counts(self) -> Dict[str, int]: + """ + Get all ref counts + + Returns + ------- + ref_counts + """ + + @abstractmethod + def fetch_tileable_op_logs( + self, + tileable_op_key: str, + offsets: Union[Dict[str, List[int]], str, int], + sizes: Union[Dict[str, List[int]], str, int], + ) -> Dict: + """ + Fetch logs given tileable op key. + + Parameters + ---------- + tileable_op_key : str + Tileable op key. + offsets + Chunk op key to offsets. + sizes + Chunk op key to sizes. + + Returns + ------- + chunk_key_to_logs + """ + + @abstractmethod + def get_total_n_cpu(self): + """ + Get number of cluster cpus. 
+ + Returns + ------- + number_of_cpu: int + """ + + @abstractmethod + def get_cluster_versions(self) -> List[str]: + """ + Get versions used in current Mars cluster + + Returns + ------- + version_list : list + List of versions + """ + + @abstractmethod + def get_web_endpoint(self) -> Optional[str]: + """ + Get web endpoint of current session + + Returns + ------- + web_endpoint : str + web endpoint + """ + + @abstractmethod + def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + """ + Create a mutable tensor. + + Parameters + ---------- + shape: tuple + Shape of the mutable tensor. + + dtype: np.dtype or str + Data type of the mutable tensor. + + name: str, optional + Name of the mutable tensor, a random name will be used if not specified. + + default_value: optional + Default value of the mutable tensor. Default is 0. + + chunk_size: int or tuple, optional + Chunk size of the mutable tensor. + + Returns + ------- + MutableTensor + """ + + @abstractmethod + def get_mutable_tensor(self, name: str): + """ + Get a mutable tensor by name. + + Parameters + ---------- + name: str + Name of the mutable tensor to get. + + Returns + ------- + MutableTensor + """ + + def fetch_log( + self, + tileables: List[TileableType], + offsets: List[int] = None, + sizes: List[int] = None, + ): + from ...core.custom_log import fetch + + return fetch(tileables, self, offsets=offsets, sizes=sizes) + + +@dataclass +class ChunkFetchInfo: + tileable: TileableType + chunk: ChunkType + indexes: List[Union[int, slice]] + data: Any = None + + +_submitted_tileables = WeakSet() + + +@enter_mode(build=True, kernel=True) +def gen_submit_tileable_graph( + session: "AbstractSession", + result_tileables: List[TileableType], + warn_duplicated_execution: bool = False, +) -> Tuple[TileableGraph, List[TileableType]]: + tileable_to_copied = dict() + indexer = itertools.count() + result_to_index = {t: i for t, i in zip(result_tileables, indexer)} + result = list() + to_execute_tileables = list() + graph = TileableGraph(result) + + q = list(result_tileables) + while q: + tileable = q.pop() + if tileable in tileable_to_copied: + continue + if tileable.cache and tileable not in result_to_index: + result_to_index[tileable] = next(indexer) + outputs = tileable.op.outputs + inputs = tileable.inputs if session not in tileable._executed_sessions else [] + new_inputs = [] + all_inputs_processed = True + for inp in inputs: + if inp in tileable_to_copied: + new_inputs.append(tileable_to_copied[inp]) + elif session in inp._executed_sessions: + # executed, gen fetch + fetch_input = build_fetch(inp).data + tileable_to_copied[inp] = fetch_input + graph.add_node(fetch_input) + new_inputs.append(fetch_input) + else: + # some input not processed before + all_inputs_processed = False + # put back tileable + q.append(tileable) + q.append(inp) + break + if all_inputs_processed: + if isinstance(tileable.op, Fetch): + new_outputs = [tileable] + elif session in tileable._executed_sessions: + new_outputs = [] + for out in outputs: + fetch_out = tileable_to_copied.get(out, build_fetch(out).data) + new_outputs.append(fetch_out) + else: + new_outputs = [ + t.data for t in copy_tileables(outputs, inputs=new_inputs) + ] + for out, new_out in zip(outputs, new_outputs): + tileable_to_copied[out] = new_out + graph.add_node(new_out) + for new_inp in new_inputs: + graph.add_edge(new_inp, new_out) + + # process results + 
result.extend([None] * len(result_to_index)) + for t, i in result_to_index.items(): + result[i] = tileable_to_copied[t] + to_execute_tileables.append(t) + + if warn_duplicated_execution: + for n, c in tileable_to_copied.items(): + if not isinstance(c.op, Fetch) and n in _submitted_tileables: + warnings.warn( + f"Tileable {repr(n)} has been submitted before", RuntimeWarning + ) + # add all nodes into submitted tileables + _submitted_tileables.update( + n for n, c in tileable_to_copied.items() if not isinstance(c.op, Fetch) + ) + + return graph, to_execute_tileables + + +class _IsolatedSession(AbstractAsyncSession): + def __init__( + self, + address: str, + session_id: str, + backend: str, + session_api: AbstractSessionAPI, + meta_api: AbstractMetaAPI, + lifecycle_api: AbstractLifecycleAPI, + task_api: AbstractTaskAPI, + mutable_api: MutableAPI, + cluster_api: AbstractClusterAPI, + web_api: Optional[OscarWebAPI], + client: ClientType = None, + timeout: float = None, + request_rewriter: Callable = None, + ): + super().__init__(address, session_id) + self._backend = backend + self._session_api = session_api + self._task_api = task_api + self._meta_api = meta_api + self._lifecycle_api = lifecycle_api + self._mutable_api = mutable_api + self._cluster_api = cluster_api + self._web_api = web_api + self.client = client + self.timeout = timeout + self._request_rewriter = request_rewriter + + self._tileable_to_fetch = WeakKeyDictionary() + self._asyncio_task_timeout_detector_task = ( + register_asyncio_task_timeout_detector() + ) + + # add metrics + self._tileable_graph_gen_time = Metrics.gauge( + "mars.tileable_graph_gen_time_secs", + "Time consuming in seconds to generate a tileable graph", + ("address", "session_id"), + ) + + @classmethod + async def _init( + cls, + address: str, + session_id: str, + backend: str, + new: bool = True, + timeout: float = None, + ): + session_api = await SessionAPI.create(address) + if new: + # create new session + session_address = await session_api.create_session(session_id) + else: + session_address = await session_api.get_session_address(session_id) + lifecycle_api = await LifecycleAPI.create(session_id, session_address) + meta_api = await MetaAPI.create(session_id, session_address) + task_api = await TaskAPI.create(session_id, session_address) + mutable_api = await MutableAPI.create(session_id, session_address) + cluster_api = await ClusterAPI.create(session_address) + try: + web_api = await OscarWebAPI.create(session_address) + except mo.ActorNotExist: + web_api = None + return cls( + address, + session_id, + backend, + session_api, + meta_api, + lifecycle_api, + task_api, + mutable_api, + cluster_api, + web_api, + timeout=timeout, + ) + + @classmethod + @implements(AbstractAsyncSession.init) + async def init( + cls, + address: str, + session_id: str, + backend: str, + new: bool = True, + timeout: float = None, + **kwargs, + ) -> "AbstractAsyncSession": + init_local = kwargs.pop("init_local", False) + request_rewriter = kwargs.pop("request_rewriter", None) + if init_local: + from .local import new_cluster_in_isolation + + return ( + await new_cluster_in_isolation( + address, timeout=timeout, backend=backend, **kwargs + ) + ).session + + if kwargs: # pragma: no cover + unexpected_keys = ", ".join(list(kwargs.keys())) + raise TypeError( + f"Oscar session got unexpected arguments: {unexpected_keys}" + ) + + if urlparse(address).scheme == "http": + return await _IsolatedWebSession._init( + address, + session_id, + backend, + new=new, + timeout=timeout, + 
request_rewriter=request_rewriter, + ) + else: + return await cls._init( + address, + session_id, + backend, + new=new, + timeout=timeout, + ) + + async def _update_progress(self, task_id: str, progress: Progress): + zero_acc_time = 0 + delay = 0.5 + while True: + try: + last_progress_value = progress.value + progress.value = await self._task_api.get_task_progress(task_id) + if abs(progress.value - last_progress_value) < 1e-4: + # if percentage does not change, we add delay time by 0.5 seconds every time + zero_acc_time = min(5, zero_acc_time + 0.5) + delay = zero_acc_time + else: + # percentage changes, we use percentage speed to calc progress time + zero_acc_time = 0 + speed = abs(progress.value - last_progress_value) / delay + # one percent for one second + delay = 0.01 / speed + delay = max(0.5, min(delay, 5.0)) + await asyncio.sleep(delay) + except asyncio.CancelledError: + break + + async def _run_in_background( + self, + tileables: list, + task_id: str, + progress: Progress, + profiling: Profiling, + ): + with enter_mode(build=True, kernel=True): + # wait for task to finish + cancelled = False + progress_task = asyncio.create_task( + self._update_progress(task_id, progress) + ) + start_time = time.time() + task_result: Optional[TaskResult] = None + try: + if self.timeout is None: + check_interval = 30 + else: + elapsed = time.time() - start_time + check_interval = min(self.timeout - elapsed, 30) + + while True: + task_result = await self._task_api.wait_task( + task_id, timeout=check_interval + ) + if task_result is not None: + break + elif ( + self.timeout is not None + and time.time() - start_time > self.timeout + ): + raise TimeoutError( + f"Task({task_id}) running time > {self.timeout}" + ) + except asyncio.CancelledError: + # cancelled + cancelled = True + await self._task_api.cancel_task(task_id) + finally: + progress_task.cancel() + if task_result is not None: + progress.value = 1.0 + else: + # not finished, set progress + progress.value = await self._task_api.get_task_progress(task_id) + if task_result is not None: + profiling.result = task_result.profiling + if task_result.profiling: + logger.warning( + "Profile task %s execution result:\n%s", + task_id, + json.dumps(task_result.profiling, indent=4), + ) + if task_result.error: + raise task_result.error.with_traceback(task_result.traceback) + if cancelled: + return + fetch_tileables = await self._task_api.get_fetch_tileables(task_id) + assert len(tileables) == len(fetch_tileables) + + for tileable, fetch_tileable in zip(tileables, fetch_tileables): + self._tileable_to_fetch[tileable] = fetch_tileable + # update meta, e.g. 
unknown shape + tileable.params = fetch_tileable.params + + async def execute(self, *tileables, **kwargs) -> ExecutionInfo: + if self._closed: + raise RuntimeError("Session closed already") + fuse_enabled: bool = kwargs.pop("fuse_enabled", None) + extra_config: dict = kwargs.pop("extra_config", None) + warn_duplicated_execution: bool = kwargs.pop("warn_duplicated_execution", False) + if kwargs: # pragma: no cover + raise TypeError(f"run got unexpected key arguments {list(kwargs)!r}") + + tileables = [ + tileable.data if hasattr(tileable, "data") else tileable + for tileable in tileables + ] + + # build tileable graph + with Timer() as timer: + tileable_graph, to_execute_tileables = gen_submit_tileable_graph( + self, tileables, warn_duplicated_execution=warn_duplicated_execution + ) + + logger.info( + "Time consuming to generate a tileable graph is %ss with address %s, session id %s", + timer.duration, + self.address, + self._session_id, + ) + self._tileable_graph_gen_time.record( + timer.duration, {"address": self.address, "session_id": self._session_id} + ) + + # submit task + task_id = await self._task_api.submit_tileable_graph( + tileable_graph, + fuse_enabled=fuse_enabled, + extra_config=extra_config, + ) + + progress = Progress() + profiling = Profiling() + # create asyncio.Task + aio_task = asyncio.create_task( + self._run_in_background(to_execute_tileables, task_id, progress, profiling) + ) + return ExecutionInfo( + aio_task, + progress, + profiling, + asyncio.get_running_loop(), + to_execute_tileables, + ) + + def _get_to_fetch_tileable( + self, tileable: TileableType + ) -> Tuple[TileableType, List[Union[slice, Integral]]]: + from ...dataframe.indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem + from ...tensor.indexing import TensorIndex + + slice_op_types = TensorIndex, DataFrameIlocGetItem, SeriesIlocGetItem + + if hasattr(tileable, "data"): + tileable = tileable.data + + indexes = None + while tileable not in self._tileable_to_fetch: + # if tileable's op is slice, try to check input + if isinstance(tileable.op, slice_op_types): + indexes = tileable.op.indexes + tileable = tileable.inputs[0] + if not all(isinstance(index, (slice, Integral)) for index in indexes): + raise ValueError("Only support fetch data slices") + elif isinstance(tileable.op, Fetch): + break + else: + raise ValueError(f"Cannot fetch unexecuted tileable: {tileable!r}") + + if isinstance(tileable.op, Fetch): + return tileable, indexes + else: + return self._tileable_to_fetch[tileable], indexes + + @classmethod + def _calc_chunk_indexes( + cls, fetch_tileable: TileableType, indexes: List[Union[slice, Integral]] + ) -> Dict[ChunkType, List[Union[slice, int]]]: + from ...tensor.utils import slice_split + + axis_to_slices = { + axis: slice_split(ind, fetch_tileable.nsplits[axis]) + for axis, ind in enumerate(indexes) + } + result = dict() + for chunk_index in itertools.product( + *[v.keys() for v in axis_to_slices.values()] + ): + # slice_obj: use tuple, since numpy complains + # + # FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use + # `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array + # index, `arr[np.array(seq)]`, which will result either in an error or a different result. 
+ slice_obj = [ + axis_to_slices[axis][chunk_idx] + for axis, chunk_idx in enumerate(chunk_index) + ] + chunk = fetch_tileable.cix[chunk_index] + result[chunk] = slice_obj + return result + + def _process_result(self, tileable, result): # pylint: disable=no-self-use + return result + + @alru_cache(cache_exceptions=False) + async def _get_storage_api(self, band: BandType): + if urlparse(self.address).scheme == "http": + from ...services.storage.api import WebStorageAPI + + storage_api = WebStorageAPI( + self._session_id, self.address, band[1], self._request_rewriter + ) + else: + storage_api = await StorageAPI.create(self._session_id, band[0], band[1]) + return storage_api + + async def fetch(self, *tileables, **kwargs) -> list: + if kwargs: # pragma: no cover + unexpected_keys = ", ".join(list(kwargs.keys())) + raise TypeError(f"`fetch` got unexpected arguments: {unexpected_keys}") + + fetcher = Fetcher.create(self._backend, get_storage_api=self._get_storage_api) + + with enter_mode(build=True): + chunks = [] + get_chunk_metas = [] + fetch_infos_list = [] + for tileable in tileables: + fetch_tileable, indexes = self._get_to_fetch_tileable(tileable) + chunk_to_slice = None + if indexes is not None: + chunk_to_slice = self._calc_chunk_indexes(fetch_tileable, indexes) + fetch_infos = [] + for chunk in fetch_tileable.chunks: + if indexes and chunk not in chunk_to_slice: + continue + chunks.append(chunk) + get_chunk_metas.append( + self._meta_api.get_chunk_meta.delay( + chunk.key, + fields=fetcher.required_meta_keys, + ) + ) + indexes = ( + chunk_to_slice[chunk] if chunk_to_slice is not None else None + ) + fetch_infos.append( + ChunkFetchInfo(tileable=tileable, chunk=chunk, indexes=indexes) + ) + fetch_infos_list.append(fetch_infos) + + chunk_metas = await self._meta_api.get_chunk_meta.batch(*get_chunk_metas) + for chunk, meta, fetch_info in zip( + chunks, chunk_metas, itertools.chain(*fetch_infos_list) + ): + await fetcher.append(chunk.key, meta, fetch_info.indexes) + fetched_data = await fetcher.get() + for fetch_info, data in zip( + itertools.chain(*fetch_infos_list), fetched_data + ): + fetch_info.data = data + + result = [] + for tileable, fetch_infos in zip(tileables, fetch_infos_list): + index_to_data = [ + (fetch_info.chunk.index, fetch_info.data) + for fetch_info in fetch_infos + ] + merged = merge_chunks(index_to_data) + merged = merged_chunk_as_tileable_type(merged, tileable) + result.append(self._process_result(tileable, merged)) + return result + + async def fetch_infos(self, *tileables, fields, **kwargs) -> list: + available_fields = { + "data_key", + "object_id", + "object_refs", + "level", + "memory_size", + "store_size", + "bands", + } + if fields is None: + fields = available_fields + else: + for field_name in fields: + if field_name not in available_fields: # pragma: no cover + raise TypeError( + f"`fetch_infos` got unexpected field name: {field_name}" + ) + fields = set(fields) + + if kwargs: # pragma: no cover + unexpected_keys = ", ".join(list(kwargs.keys())) + raise TypeError(f"`fetch` got unexpected arguments: {unexpected_keys}") + # following fields needs to access storage API to get the meta. 
+ _need_query_storage_fields = {"level", "memory_size", "store_size"} + _need_query_storage = bool(_need_query_storage_fields & fields) + with enter_mode(build=True): + chunk_to_bands, fetch_infos_list, result = await self._query_meta_service( + tileables, fields, _need_query_storage + ) + if not _need_query_storage: + assert result is not None + return result + storage_api_to_gets = defaultdict(list) + storage_api_to_fetch_infos = defaultdict(list) + for fetch_info in itertools.chain(*fetch_infos_list): + chunk = fetch_info.chunk + bands = chunk_to_bands[chunk] + storage_api = await self._get_storage_api(bands[0]) + storage_api_to_gets[storage_api].append( + storage_api.get_infos.delay(chunk.key) + ) + storage_api_to_fetch_infos[storage_api].append(fetch_info) + for storage_api in storage_api_to_gets: + fetched_data = await storage_api.get_infos.batch( + *storage_api_to_gets[storage_api] + ) + infos = storage_api_to_fetch_infos[storage_api] + for info, data in zip(infos, fetched_data): + info.data = data + + result = [] + for fetch_infos in fetch_infos_list: + fetched = defaultdict(list) + for fetch_info in fetch_infos: + bands = chunk_to_bands[fetch_info.chunk] + # Currently there's only one item in the returned List from storage_api.get_infos() + data = fetch_info.data[0] + if "data_key" in fields: + fetched["data_key"].append(fetch_info.chunk.key) + if "object_id" in fields: + fetched["object_id"].append(data.object_id) + if "level" in fields: + fetched["level"].append(data.level) + if "memory_size" in fields: + fetched["memory_size"].append(data.memory_size) + if "store_size" in fields: + fetched["store_size"].append(data.store_size) + # data.band misses ip info, e.g. 'numa-0' + # while band doesn't, e.g. (address0, 'numa-0') + if "bands" in fields: + fetched["bands"].append(bands) + result.append(fetched) + + return result + + async def _query_meta_service(self, tileables, fields, query_storage): + chunks = [] + get_chunk_metas = [] + fetch_infos_list = [] + for tileable in tileables: + fetch_tileable, _ = self._get_to_fetch_tileable(tileable) + fetch_infos = [] + for chunk in fetch_tileable.chunks: + chunks.append(chunk) + get_chunk_metas.append( + self._meta_api.get_chunk_meta.delay( + chunk.key, + fields=["bands"] if query_storage else fields - {"data_key"}, + ) + ) + fetch_infos.append( + ChunkFetchInfo(tileable=tileable, chunk=chunk, indexes=None) + ) + fetch_infos_list.append(fetch_infos) + chunk_metas = await self._meta_api.get_chunk_meta.batch(*get_chunk_metas) + if not query_storage: + result = [] + chunk_to_meta = dict(zip(chunks, chunk_metas)) + for fetch_infos in fetch_infos_list: + fetched = defaultdict(list) + for fetch_info in fetch_infos: + if "data_key" in fields: + fetched["data_key"].append(fetch_info.chunk.key) + for field in fields - {"data_key"}: + fetched[field].append(chunk_to_meta[fetch_info.chunk][field]) + result.append(fetched) + return {}, fetch_infos_list, result + chunk_to_bands = { + chunk: meta["bands"] for chunk, meta in zip(chunks, chunk_metas) + } + return chunk_to_bands, fetch_infos_list, None + + async def decref(self, *tileable_keys): + logger.debug("Decref tileables on client: %s", tileable_keys) + return await self._lifecycle_api.decref_tileables(list(tileable_keys)) + + async def _get_ref_counts(self) -> Dict[str, int]: + return await self._lifecycle_api.get_all_chunk_ref_counts() + + async def fetch_tileable_op_logs( + self, + tileable_op_key: str, + offsets: Union[Dict[str, List[int]], str, int], + sizes: Union[Dict[str, List[int]], str, 
int], + ) -> Dict: + return await self._session_api.fetch_tileable_op_logs( + self.session_id, tileable_op_key, offsets, sizes + ) + + async def get_total_n_cpu(self): + all_bands = await self._cluster_api.get_all_bands() + n_cpu = 0 + for band, resource in all_bands.items(): + _, band_name = band + if band_name.startswith("numa-"): + n_cpu += resource.num_cpus + return n_cpu + + async def get_cluster_versions(self) -> List[str]: + return list(await self._cluster_api.get_mars_versions()) + + async def get_web_endpoint(self) -> Optional[str]: + if self._web_api is None: + return None + return await self._web_api.get_web_address() + + async def destroy(self): + await super().destroy() + await self._session_api.delete_session(self._session_id) + self._tileable_to_fetch.clear() + if self._asyncio_task_timeout_detector_task: # pragma: no cover + self._asyncio_task_timeout_detector_task.cancel() + + async def create_remote_object( + self, session_id: str, name: str, object_cls, *args, **kwargs + ): + return await self._session_api.create_remote_object( + session_id, name, object_cls, *args, **kwargs + ) + + async def get_remote_object(self, session_id: str, name: str): + return await self._session_api.get_remote_object(session_id, name) + + async def destroy_remote_object(self, session_id: str, name: str): + return await self._session_api.destroy_remote_object(session_id, name) + + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + tensor_info = await self._mutable_api.create_mutable_tensor( + shape, dtype, name, default_value, chunk_size + ) + return tensor_info, self._mutable_api + + async def get_mutable_tensor(self, name: str): + tensor_info = await self._mutable_api.get_mutable_tensor(name) + return tensor_info, self._mutable_api + + async def stop_server(self): + if self.client: + await self.client.stop() + + +class _IsolatedWebSession(_IsolatedSession): + @classmethod + async def _init( + cls, + address: str, + session_id: str, + backend: str, + new: bool = True, + timeout: float = None, + request_rewriter: Callable = None, + ): + from ...services.cluster import WebClusterAPI + from ...services.lifecycle import WebLifecycleAPI + from ...services.meta import WebMetaAPI + from ...services.mutable import WebMutableAPI + from ...services.session import WebSessionAPI + from ...services.task import WebTaskAPI + + session_api = WebSessionAPI(address, request_rewriter) + if new: + # create new session + await session_api.create_session(session_id) + lifecycle_api = WebLifecycleAPI(session_id, address, request_rewriter) + meta_api = WebMetaAPI(session_id, address, request_rewriter) + task_api = WebTaskAPI(session_id, address, request_rewriter) + mutable_api = WebMutableAPI(session_id, address, request_rewriter) + cluster_api = WebClusterAPI(address, request_rewriter) + + return cls( + address, + session_id, + backend, + session_api, + meta_api, + lifecycle_api, + task_api, + mutable_api, + cluster_api, + None, + timeout=timeout, + request_rewriter=request_rewriter, + ) + + async def get_web_endpoint(self) -> Optional[str]: + return self.address + + +def _delegate_to_isolated_session(func: Union[Callable, Coroutine]): + if asyncio.iscoroutinefunction(func): + + @wraps(func) + async def inner(session: "AsyncSession", *args, **kwargs): + coro = getattr(session._isolated_session, func.__name__)(*args, **kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, 
session._loop) + return await asyncio.wrap_future(fut) + + else: + + @wraps(func) + def inner(session: "SyncSession", *args, **kwargs): + coro = getattr(session._isolated_session, func.__name__)(*args, **kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, session._loop) + return fut.result() + + return inner + + +class AsyncSession(AbstractAsyncSession): + def __init__( + self, + address: str, + session_id: str, + isolated_session: _IsolatedSession, + isolation: Isolation, + ): + super().__init__(address, session_id) + + self._isolated_session = _get_isolated_session(isolated_session) + self._isolation = isolation + self._loop = isolation.loop + + @classmethod + def from_isolated_session( + cls, isolated_session: _IsolatedSession + ) -> "AsyncSession": + return cls( + isolated_session.address, + isolated_session.session_id, + isolated_session, + get_isolation(), + ) + + @property + def client(self): + return self._isolated_session.client + + @client.setter + def client(self, client: ClientType): + self._isolated_session.client = client + + @classmethod + @implements(AbstractAsyncSession.init) + async def init( + cls, + address: str, + session_id: str, + backend: str = "mars", + new: bool = True, + **kwargs, + ) -> "AbstractSession": + isolation = ensure_isolation_created(kwargs) + coro = _IsolatedSession.init(address, session_id, backend, new=new, **kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop) + isolated_session = await asyncio.wrap_future(fut) + return AsyncSession(address, session_id, isolated_session, isolation) + + def as_default(self) -> AbstractSession: + AbstractSession._default = self._isolated_session + return self + + @implements(AbstractAsyncSession.destroy) + async def destroy(self): + coro = self._isolated_session.destroy() + await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(coro, self._loop)) + self.reset_default() + + @implements(AbstractAsyncSession.execute) + @_delegate_to_isolated_session + async def execute(self, *tileables, **kwargs) -> ExecutionInfo: + pass # pragma: no cover + + @implements(AbstractAsyncSession.fetch) + async def fetch(self, *tileables, **kwargs) -> list: + coro = _fetch(*tileables, session=self._isolated_session, **kwargs) + return await asyncio.wrap_future( + asyncio.run_coroutine_threadsafe(coro, self._loop) + ) + + @implements(AbstractAsyncSession._get_ref_counts) + @_delegate_to_isolated_session + async def _get_ref_counts(self) -> Dict[str, int]: + pass # pragma: no cover + + @implements(AbstractAsyncSession.fetch_tileable_op_logs) + @_delegate_to_isolated_session + async def fetch_tileable_op_logs( + self, + tileable_op_key: str, + offsets: Union[Dict[str, List[int]], str, int], + sizes: Union[Dict[str, List[int]], str, int], + ) -> Dict: + pass # pragma: no cover + + @implements(AbstractAsyncSession.get_total_n_cpu) + @_delegate_to_isolated_session + async def get_total_n_cpu(self): + pass # pragma: no cover + + @implements(AbstractAsyncSession.get_cluster_versions) + @_delegate_to_isolated_session + async def get_cluster_versions(self) -> List[str]: + pass # pragma: no cover + + @implements(AbstractAsyncSession.create_remote_object) + @_delegate_to_isolated_session + async def create_remote_object( + self, session_id: str, name: str, object_cls, *args, **kwargs + ): + pass # pragma: no cover + + @implements(AbstractAsyncSession.get_remote_object) + @_delegate_to_isolated_session + async def get_remote_object(self, session_id: str, name: str): + pass # pragma: no cover + + 
@implements(AbstractAsyncSession.destroy_remote_object) + @_delegate_to_isolated_session + async def destroy_remote_object(self, session_id: str, name: str): + pass # pragma: no cover + + @implements(AbstractAsyncSession.create_mutable_tensor) + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + tensor_info, mutable_api = await self._isolated_session.create_mutable_tensor( + shape, dtype, name, default_value, chunk_size + ) + return MutableTensor.create(tensor_info, mutable_api, self._loop) + + @implements(AbstractAsyncSession.get_mutable_tensor) + async def get_mutable_tensor(self, name: str): + tensor_info, mutable_api = await self._isolated_session.get_mutable_tensor(name) + return MutableTensor.create(tensor_info, mutable_api, self._loop) + + @implements(AbstractAsyncSession.get_web_endpoint) + @_delegate_to_isolated_session + async def get_web_endpoint(self) -> Optional[str]: + pass # pragma: no cover + + @implements(AbstractAsyncSession.stop_server) + async def stop_server(self): + coro = self._isolated_session.stop_server() + await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(coro, self._loop)) + stop_isolation() + + +class ProgressBar: + def __init__(self, show_progress): + if not show_progress: + self.progress_bar = None + else: + try: + from tqdm.auto import tqdm + except ImportError: + if show_progress != "auto": # pragma: no cover + raise ImportError("tqdm is required to show progress") + else: + self.progress_bar = None + else: + self.progress_bar = tqdm( + total=100, + bar_format="{l_bar}{bar}| {n:6.2f}/{total_fmt} " + "[{elapsed}<{remaining}, {rate_fmt}{postfix}]", + ) + + self.last_progress: float = 0.0 + + @property + def show_progress(self) -> bool: + return self.progress_bar is not None + + def __enter__(self): + self.progress_bar.__enter__() + + def __exit__(self, *_): + self.progress_bar.__exit__(*_) + + def update(self, progress: float): + progress = min(progress, 100) + last_progress = self.last_progress + if self.progress_bar: + incr = max(progress - last_progress, 0) + self.progress_bar.update(incr) + self.last_progress = max(last_progress, progress) + + +class SyncSession(AbstractSyncSession): + _execution_pool = concurrent.futures.ThreadPoolExecutor(1) + + def __init__( + self, + address: str, + session_id: str, + isolated_session: _IsolatedSession, + isolation: Isolation, + ): + super().__init__(address, session_id) + + self._isolated_session = _get_isolated_session(isolated_session) + self._isolation = isolation + self._loop = isolation.loop + + @classmethod + def from_isolated_session(cls, isolated_session: _IsolatedSession) -> "SyncSession": + return cls( + isolated_session.address, + isolated_session.session_id, + isolated_session, + get_isolation(), + ) + + @classmethod + def init( + cls, + address: str, + session_id: str, + backend: str = "mars", + new: bool = True, + **kwargs, + ) -> "AbstractSession": + isolation = ensure_isolation_created(kwargs) + coro = _IsolatedSession.init(address, session_id, backend, new=new, **kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop) + isolated_session = fut.result() + return SyncSession(address, session_id, isolated_session, isolation) + + def as_default(self) -> AbstractSession: + AbstractSession._default = self._isolated_session + return self + + @property + def _session(self): + return self._isolated_session + + def _new_cancel_event(self): + 
async def new_event(): + return asyncio.Event() + + return asyncio.run_coroutine_threadsafe(new_event(), self._loop).result() + + @implements(AbstractSyncSession.execute) + def execute( + self, + tileable, + *tileables, + show_progress: Union[bool, str] = None, + warn_duplicated_execution: bool = None, + **kwargs, + ) -> Union[List[TileableType], TileableType, ExecutionInfo]: + wait = kwargs.get("wait", True) + if show_progress is None: + show_progress = options.show_progress + if warn_duplicated_execution is None: + warn_duplicated_execution = options.warn_duplicated_execution + to_execute_tileables = [] + for t in (tileable,) + tileables: + to_execute_tileables.extend(t.op.outputs) + + cancelled = kwargs.get("cancelled") + if cancelled is None: + cancelled = kwargs["cancelled"] = self._new_cancel_event() + + coro = _execute( + *set(to_execute_tileables), + session=self._isolated_session, + show_progress=show_progress, + warn_duplicated_execution=warn_duplicated_execution, + **kwargs, + ) + fut = asyncio.run_coroutine_threadsafe(coro, self._loop) + try: + execution_info: ExecutionInfo = fut.result( + timeout=self._isolated_session.timeout + ) + except KeyboardInterrupt: # pragma: no cover + logger.warning("Cancelling running task") + cancelled.set() + fut.result() + logger.warning("Cancel finished") + + if wait: + return tileable if len(tileables) == 0 else [tileable] + list(tileables) + else: + aio_task = execution_info.aio_task + + async def run(): + await aio_task + return tileable if len(tileables) == 0 else [tileable] + list(tileables) + + async def driver(): + return asyncio.create_task(run()) + + new_aio_task = asyncio.run_coroutine_threadsafe( + driver(), execution_info.loop + ).result() + new_execution_info = ExecutionInfo( + new_aio_task, + execution_info._progress, + execution_info._profiling, + execution_info.loop, + to_execute_tileables, + ) + return new_execution_info + + @implements(AbstractSyncSession.fetch) + def fetch(self, *tileables, **kwargs) -> list: + coro = _fetch(*tileables, session=self._isolated_session, **kwargs) + return asyncio.run_coroutine_threadsafe(coro, self._loop).result() + + @implements(AbstractSyncSession.fetch_infos) + def fetch_infos(self, *tileables, fields, **kwargs) -> list: + coro = _fetch_infos( + *tileables, fields=fields, session=self._isolated_session, **kwargs + ) + return asyncio.run_coroutine_threadsafe(coro, self._loop).result() + + @implements(AbstractSyncSession.decref) + @_delegate_to_isolated_session + def decref(self, *tileables_keys): + pass # pragma: no cover + + @implements(AbstractSyncSession._get_ref_counts) + @_delegate_to_isolated_session + def _get_ref_counts(self) -> Dict[str, int]: + pass # pragma: no cover + + @implements(AbstractSyncSession.fetch_tileable_op_logs) + @_delegate_to_isolated_session + def fetch_tileable_op_logs( + self, + tileable_op_key: str, + offsets: Union[Dict[str, List[int]], str, int], + sizes: Union[Dict[str, List[int]], str, int], + ) -> Dict: + pass # pragma: no cover + + @implements(AbstractSyncSession.get_total_n_cpu) + @_delegate_to_isolated_session + def get_total_n_cpu(self): + pass # pragma: no cover + + @implements(AbstractSyncSession.get_web_endpoint) + @_delegate_to_isolated_session + def get_web_endpoint(self) -> Optional[str]: + pass # pragma: no cover + + @implements(AbstractSyncSession.get_cluster_versions) + @_delegate_to_isolated_session + def get_cluster_versions(self) -> List[str]: + pass # pragma: no cover + + @implements(AbstractSyncSession.create_mutable_tensor) + def 
create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + coro = self._isolated_session.create_mutable_tensor( + shape, dtype, name, default_value, chunk_size + ) + fut = asyncio.run_coroutine_threadsafe(coro, self._loop) + tensor_info, mutable_api = fut.result() + return MutableTensor.create(tensor_info, mutable_api, self._loop) + + @implements(AbstractSyncSession.get_mutable_tensor) + def get_mutable_tensor(self, name: str): + coro = self._isolated_session.get_mutable_tensor(name) + fut = asyncio.run_coroutine_threadsafe(coro, self._loop) + tensor_info, mutable_api = fut.result() + return MutableTensor.create(tensor_info, mutable_api, self._loop) + + def destroy(self): + coro = self._isolated_session.destroy() + asyncio.run_coroutine_threadsafe(coro, self._loop).result() + self.reset_default() + + def stop_server(self, isolation=True): + try: + coro = self._isolated_session.stop_server() + future = asyncio.run_coroutine_threadsafe(coro, self._loop) + future.result(timeout=5) + finally: + self.reset_default() + if isolation: + stop_isolation() + + def close(self): + self.destroy() + + def __enter__(self): + return self + + def __exit__(self, *_): + self.close() + + +async def _execute_with_progress( + execution_info: ExecutionInfo, + progress_bar: ProgressBar, + progress_update_interval: Union[int, float], + cancelled: asyncio.Event, +): + with progress_bar: + while not cancelled.is_set(): + done, _pending = await asyncio.wait( + [execution_info.get_future()], timeout=progress_update_interval + ) + if not done: + if not cancelled.is_set() and execution_info.progress() is not None: + progress_bar.update(execution_info.progress() * 100) + else: + # done + if not cancelled.is_set(): + progress_bar.update(100) + break + + +async def _execute( + *tileables: Tuple[TileableType], + session: _IsolatedSession = None, + wait: bool = True, + show_progress: Union[bool, str] = "auto", + progress_update_interval: Union[int, float] = 1, + cancelled: asyncio.Event = None, + **kwargs, +): + execution_info = await session.execute(*tileables, **kwargs) + + def _attach_session(future: asyncio.Future): + if future.exception() is None: + for t in execution_info.to_execute_tileables: + t._attach_session(session) + + execution_info.add_done_callback(_attach_session) + cancelled = cancelled or asyncio.Event() + + if wait: + progress_bar = ProgressBar(show_progress) + if progress_bar.show_progress: + await _execute_with_progress( + execution_info, progress_bar, progress_update_interval, cancelled + ) + else: + exec_task = asyncio.ensure_future(execution_info) + cancel_task = asyncio.ensure_future(cancelled.wait()) + await asyncio.wait( + [exec_task, cancel_task], return_when=asyncio.FIRST_COMPLETED + ) + if cancelled.is_set(): + execution_info.remove_done_callback(_attach_session) + execution_info.cancel() + else: + # set cancelled to avoid wait task leak + cancelled.set() + await execution_info + else: + return execution_info + + +def execute( + tileable: TileableType, + *tileables: Tuple[TileableType], + session: SyncSession = None, + wait: bool = True, + new_session_kwargs: dict = None, + show_progress: Union[bool, str] = None, + progress_update_interval=1, + **kwargs, +): + if isinstance(tileable, (tuple, list)) and len(tileables) == 0: + tileable, tileables = tileable[0], tileable[1:] + if session is None: + session = get_default_or_create(**(new_session_kwargs or dict())) + session = 
_ensure_sync(session) + return session.execute( + tileable, + *tileables, + wait=wait, + show_progress=show_progress, + progress_update_interval=progress_update_interval, + **kwargs, + ) + + +async def _fetch( + tileable: TileableType, + *tileables: Tuple[TileableType], + session: _IsolatedSession = None, + **kwargs, +): + if isinstance(tileable, tuple) and len(tileables) == 0: + tileable, tileables = tileable[0], tileable[1:] + session = _get_isolated_session(session) + data = await session.fetch(tileable, *tileables, **kwargs) + return data[0] if len(tileables) == 0 else data + + +async def _fetch_infos( + tileable: TileableType, + *tileables: Tuple[TileableType], + session: _IsolatedSession = None, + fields: List[str] = None, + **kwargs, +): + if isinstance(tileable, tuple) and len(tileables) == 0: + tileable, tileables = tileable[0], tileable[1:] + session = _get_isolated_session(session) + data = await session.fetch_infos(tileable, *tileables, fields=fields, **kwargs) + return data[0] if len(tileables) == 0 else data + + +def fetch( + tileable: TileableType, + *tileables: Tuple[TileableType], + session: SyncSession = None, + **kwargs, +): + if isinstance(tileable, (tuple, list)) and len(tileables) == 0: + tileable, tileables = tileable[0], tileable[1:] + if session is None: + session = get_default_session() + if session is None: # pragma: no cover + raise ValueError("No session found") + + session = _ensure_sync(session) + return session.fetch(tileable, *tileables, **kwargs) + + +def fetch_infos( + tileable: TileableType, + *tileables: Tuple[TileableType], + fields: List[str], + session: SyncSession = None, + **kwargs, +): + if isinstance(tileable, tuple) and len(tileables) == 0: + tileable, tileables = tileable[0], tileable[1:] + if session is None: + session = get_default_session() + if session is None: # pragma: no cover + raise ValueError("No session found") + session = _ensure_sync(session) + return session.fetch_infos(tileable, *tileables, fields=fields, **kwargs) + + +def fetch_log(*tileables: TileableType, session: SyncSession = None, **kwargs): + if len(tileables) == 1 and isinstance(tileables[0], (list, tuple)): + tileables = tileables[0] + if session is None: + session = get_default_session() + if session is None: # pragma: no cover + raise ValueError("No session found") + session = _ensure_sync(session) + return session.fetch_log(list(tileables), **kwargs) + + +def ensure_isolation_created(kwargs): + loop = kwargs.pop("loop", None) + use_uvloop = kwargs.pop("use_uvloop", "auto") + + try: + return get_isolation() + except KeyError: + if loop is None: + if not use_uvloop: + loop = asyncio.new_event_loop() + else: + try: + import uvloop + + loop = uvloop.new_event_loop() + except ImportError: + if use_uvloop == "auto": + loop = asyncio.new_event_loop() + else: # pragma: no cover + raise + return new_isolation(loop=loop) + + +def _new_session_id(): + return "".join( + random.choice(string.ascii_letters + string.digits) for _ in range(24) + ) + + +async def _new_session( + address: str, + session_id: str = None, + backend: str = "mars", + default: bool = False, + **kwargs, +) -> AbstractSession: + if session_id is None: + session_id = _new_session_id() + + session = await AsyncSession.init( + address, session_id=session_id, backend=backend, new=True, **kwargs + ) + if default: + session.as_default() + return session + + +def new_session( + address: str = None, + session_id: str = None, + backend: str = "mars", + default: bool = True, + new: bool = True, + **kwargs, +) -> 
AbstractSession: + # load third party extensions. + init_extension_entrypoints() + ensure_isolation_created(kwargs) + + if address is None: + address = "127.0.0.1" + if "init_local" not in kwargs: + kwargs["init_local"] = True + + if session_id is None: + session_id = _new_session_id() + + session = SyncSession.init( + address, session_id=session_id, backend=backend, new=new, **kwargs + ) + if default: + session.as_default() + return session + + +def get_default_session() -> Optional[SyncSession]: + if AbstractSession.default is None: + return + return SyncSession.from_isolated_session(AbstractSession.default) + + +def clear_default_session(): + AbstractSession.reset_default() + + +def get_default_async_session() -> Optional[AsyncSession]: + if AbstractSession.default is None: + return + return AsyncSession.from_isolated_session(AbstractSession.default) + + +def get_default_or_create(**kwargs): + with AbstractSession._lock: + session = AbstractSession.default + if session is None: + # no session attached, try to create one + warnings.warn(warning_msg) + session = new_session("127.0.0.1", init_local=True, **kwargs) + session.as_default() + if isinstance(session, _IsolatedSession): + session = SyncSession.from_isolated_session(session) + return _ensure_sync(session) + + +def stop_server(): + if AbstractSession.default: + SyncSession.from_isolated_session(AbstractSession.default).stop_server() + + +def _get_isolated_session(session: AbstractSession) -> _IsolatedSession: + if hasattr(session, "_isolated_session"): + return session._isolated_session + return session + + +def _ensure_sync(session: AbstractSession) -> SyncSession: + if isinstance(session, SyncSession): + return session + isolated_session = _get_isolated_session(session) + return SyncSession.from_isolated_session(isolated_session) diff --git a/python/xorbits/_mars/deploy/oscar/supervisor.py b/python/xorbits/_mars/deploy/oscar/supervisor.py new file mode 100644 index 000000000..0e5c121c0 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/supervisor.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os + +from ... 
import oscar as mo +from ...services import NodeRole +from ...utils import get_next_port +from .cmdline import OscarCommandRunner +from .local import start_supervisor, stop_supervisor +from .pool import create_supervisor_actor_pool + +logger = logging.getLogger(__name__) + + +class SupervisorCommandRunner(OscarCommandRunner): + command_description = "Mars Supervisor" + node_role = NodeRole.SUPERVISOR + + def __init__(self): + super().__init__() + self._endpoint_file_name = None + + def config_args(self, parser): + super().config_args(parser) + parser.add_argument("-w", "--web-port", help="web port of the service") + parser.add_argument( + "--n-process", help="number of supervisor processes", default="1" + ) + + def parse_args(self, parser, argv, environ=None): + args = super().parse_args(parser, argv, environ=environ) + + if args.endpoint is None: + args.endpoint = f"{args.host}:{get_next_port()}" + self._endpoint_file_name = self._write_supervisor_endpoint_file(args) + + args.supervisors = f"{args.supervisors},{args.endpoint}".strip(",") + + web_config = self.config.get("web", {}) + if args.web_port is not None: + web_config["host"] = args.endpoint.split(":", 1)[0] + web_config["port"] = int(args.web_port) + self.config["web"] = web_config + + return args + + async def create_actor_pool(self): + return await create_supervisor_actor_pool( + self.args.endpoint, + n_process=int(self.args.n_process), + ports=self.ports, + modules=self.args.load_modules, + logging_conf=self.logging_conf, + subprocess_start_method="forkserver" if os.name != "nt" else "spawn", + metrics=self.config.get("metrics", {}), + oscar_config=self.config.get("oscar"), + ) + + async def start_services(self): + start_web = await start_supervisor( + self.pool.external_address, + self.args.supervisors, + self.args.load_modules, + self.config, + ) + if start_web: + from ...services.web.supervisor import WebActor + + web_actor = await mo.actor_ref( + WebActor.default_uid(), address=self.pool.external_address + ) + web_address = await web_actor.get_web_address() + else: # pragma: no cover + web_address = "" + logger.warning( + "Supervisor started at %s, web address: %s", + self.pool.external_address, + web_address, + ) + + async def stop_services(self): + if self._endpoint_file_name is not None: # pragma: no branch + try: + os.unlink(self._endpoint_file_name) + except OSError: # pragma: no cover + pass + return await stop_supervisor(self.pool.external_address, self.config) + + +main = SupervisorCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/deploy/oscar/tests/__init__.py b/python/xorbits/_mars/deploy/oscar/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
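As a quick way to exercise the new supervisor and worker entry points, the sketch below mirrors the command lines driven by test_cmdline_run later in this diff. The module paths, the flags, and the new_session call are copied from that test; the concrete port numbers, the single worker, and the sleep-based wait are illustrative assumptions, not values fixed by this change.

import subprocess
import sys
import time

from xorbits._mars.session import new_session  # top-level re-export assumed from this diff's layout

supervisor_ep = "127.0.0.1:7103"  # assumed free port
web_port = "7104"                 # assumed free port

# One supervisor exposing a web UI, plus one worker attached to it.
sv_proc = subprocess.Popen(
    [
        sys.executable, "-m", "mars.deploy.oscar.supervisor",
        "-e", supervisor_ep,
        "-w", web_port,
        "--n-process=2",
        "--log-level=DEBUG",
    ]
)
worker_proc = subprocess.Popen(
    [
        sys.executable, "-m", "mars.deploy.oscar.worker",
        "-s", supervisor_ep,
        "--log-level=DEBUG",
    ]
)

time.sleep(5)  # crude wait, good enough for a manual check
session = new_session(f"http://127.0.0.1:{web_port}")

The tests themselves avoid the fixed sleep: _wait_supervisor_ready reads the endpoint file written by the supervisor process, and _wait_worker_ready polls ClusterAPI.get_nodes_info until both roles are registered.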
diff --git a/python/xorbits/_mars/deploy/oscar/tests/check_enabled_config.yml b/python/xorbits/_mars/deploy/oscar/tests/check_enabled_config.yml new file mode 100644 index 000000000..8f2c42f84 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/check_enabled_config.yml @@ -0,0 +1,7 @@ +"@inherits": '@default' +task: + default_config: + reserved_finish_tasks: 0 + task_preprocessor_cls: mars.services.task.supervisor.tests.CheckedTaskPreprocessor +subtask: + subtask_processor_cls: mars.services.subtask.worker.tests.CheckedSubtaskProcessor diff --git a/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config.yml b/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config.yml new file mode 100644 index 000000000..efa1aafd1 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config.yml @@ -0,0 +1,3 @@ +"@inherits": '@default' +third_party_modules: + - mars.services.tests.fault_injection_patch diff --git a/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config_with_rerun.yml b/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config_with_rerun.yml new file mode 100644 index 000000000..e65836240 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config_with_rerun.yml @@ -0,0 +1,9 @@ +"@inherits": '@default' +third_party_modules: + - mars.services.tests.fault_injection_patch +scheduling: + subtask_max_retries: 2 + subtask_max_reschedules: 2 +storage: + # shared-memory38 may lose object if the process crash after put success. + backends: [plasma] diff --git a/python/xorbits/_mars/deploy/oscar/tests/local_test_config.yml b/python/xorbits/_mars/deploy/oscar/tests/local_test_config.yml new file mode 100644 index 000000000..7a16779da --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/local_test_config.yml @@ -0,0 +1,7 @@ +"@inherits": '@default' +session: + custom_log_dir: auto + plasma: + store_memory: 32M +scheduling: + mem_hard_limit: 0 diff --git a/python/xorbits/_mars/deploy/oscar/tests/local_test_with_ray_config.yml b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_ray_config.yml new file mode 100644 index 000000000..3b7a9646e --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_ray_config.yml @@ -0,0 +1,6 @@ +"@inherits": '@mars/deploy/oscar/rayconfig.yml' +session: + custom_log_dir: auto +scheduling: + subtask_max_retries: 0 + subtask_max_reschedules: 0 diff --git a/python/xorbits/_mars/deploy/oscar/tests/local_test_with_third_parity_modules_config.yml b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_third_parity_modules_config.yml new file mode 100644 index 000000000..be0d7ba38 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_third_parity_modules_config.yml @@ -0,0 +1,6 @@ +"@inherits": '@default' +third_party_modules: + supervisor: + - mars.deploy.oscar.tests.modules.output_pid + worker: + - mars.deploy.oscar.tests.modules.output_pid diff --git a/python/xorbits/_mars/deploy/oscar/tests/local_test_with_vineyard_config.yml b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_vineyard_config.yml new file mode 100644 index 000000000..ee9e80890 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_vineyard_config.yml @@ -0,0 +1,6 @@ +"@inherits": '@mars/deploy/oscar/base_config.yml' +session: + custom_log_dir: auto + +storage: + backends: [vineyard] diff --git a/python/xorbits/_mars/deploy/oscar/tests/modules/__init__.py b/python/xorbits/_mars/deploy/oscar/tests/modules/__init__.py new file mode 100644 index 
000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/modules/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/deploy/oscar/tests/modules/check_ray_remote_function_options.py b/python/xorbits/_mars/deploy/oscar/tests/modules/check_ray_remote_function_options.py new file mode 100644 index 000000000..2e24192b9 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/modules/check_ray_remote_function_options.py @@ -0,0 +1,25 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ray + +original_remote_function_options = ray.remote_function.RemoteFunction.options + + +def _wrap_original_remote_function_options(*args, **kwargs): + assert kwargs["num_cpus"] == 5, "expect num_cpus==5" + return original_remote_function_options(*args, **kwargs) + + +ray.remote_function.RemoteFunction.options = _wrap_original_remote_function_options diff --git a/python/xorbits/_mars/deploy/oscar/tests/modules/output_pid.py b/python/xorbits/_mars/deploy/oscar/tests/modules/output_pid.py new file mode 100644 index 000000000..0ec15a47d --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/modules/output_pid.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +output_dir = os.path.join(tempfile.gettempdir(), "test_inject_module_output") + +os.makedirs(output_dir, exist_ok=True) + +with open(os.path.join(output_dir, f"{os.getpid()}"), "w") as f: + f.write("") diff --git a/python/xorbits/_mars/deploy/oscar/tests/modules/replace_op.py b/python/xorbits/_mars/deploy/oscar/tests/modules/replace_op.py new file mode 100644 index 000000000..b52e1cc32 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/modules/replace_op.py @@ -0,0 +1,25 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .....tensor.arithmetic.add import TensorAdd + + +def _replace_op(ctx, op): + # change the op from TensorAdd to TensorSubtract. + type(op)._func_name = "subtract" + executor = type(op).execute + return executor(ctx, op) + + +TensorAdd.register_executor(_replace_op) diff --git a/python/xorbits/_mars/deploy/oscar/tests/modules/utils.py b/python/xorbits/_mars/deploy/oscar/tests/modules/utils.py new file mode 100644 index 000000000..371f4fd6e --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/modules/utils.py @@ -0,0 +1,31 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile + +import pytest + + +@pytest.fixture +def cleanup_third_party_modules_output(): + output_dir = os.path.join(tempfile.gettempdir(), "test_inject_module_output") + shutil.rmtree(output_dir, ignore_errors=True) + yield + shutil.rmtree(output_dir, ignore_errors=True) + + +def get_output_filenames(): + return os.listdir(os.path.join(tempfile.gettempdir(), "test_inject_module_output")) diff --git a/python/xorbits/_mars/deploy/oscar/tests/ray_test_with_third_parity_modules_config.yml b/python/xorbits/_mars/deploy/oscar/tests/ray_test_with_third_parity_modules_config.yml new file mode 100644 index 000000000..0795a1b67 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/ray_test_with_third_parity_modules_config.yml @@ -0,0 +1,6 @@ +"@inherits": '@mars/deploy/oscar/rayconfig.yml' +third_party_modules: + supervisor: + - mars.deploy.oscar.tests.modules.output_pid + worker: + - mars.deploy.oscar.tests.modules.output_pid diff --git a/python/xorbits/_mars/deploy/oscar/tests/session.py b/python/xorbits/_mars/deploy/oscar/tests/session.py new file mode 100644 index 000000000..2226db362 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/session.py @@ -0,0 +1,154 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
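+# Helpers for result-checked test sessions:
+#  - CheckedSession extends _IsolatedSession and, unless disabled through
+#    extra_config, verifies each fetched result against its tileable's
+#    metadata via ObjectCheckMixin.assert_object_consistent.
+#  - new_test_session synchronously creates such a session, optionally
+#    starting a LocalCluster configured with check_enabled_config.yml so
+#    the checked task preprocessor and subtask processor are enabled.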
+ +import asyncio +import inspect +import os +import uuid + +from ....core import OBJECT_TYPE +from ....deploy.oscar.local import LocalClient, LocalCluster +from ....tests.core import ObjectCheckMixin, _check_args +from ..session import ( + AbstractSession, + AsyncSession, + _ensure_sync, + _IsolatedSession, + ensure_isolation_created, +) + +CONFIG_FILE = os.path.join(os.path.dirname(__file__), "check_enabled_config.yml") + + +class CheckedSession(ObjectCheckMixin, _IsolatedSession): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._tileable_checked = dict() + + check_options = dict() + for key in _check_args: + check_options[key] = kwargs.get(key, True) + self._check_options = check_options + + @staticmethod + def _extract_check_options(extra_config): + check_options = dict() + for key in _check_args: + check_options[key] = extra_config.pop(key, True) + return check_options + + def _process_result(self, tileable, result): + if self._check_options.get("check_all", True): + if ( + not isinstance(tileable, OBJECT_TYPE) + and tileable.key not in self._tileable_checked + ): + self.assert_object_consistent(tileable, result) + return super()._process_result(tileable, result) + + async def fetch(self, *tileables, **kwargs): + extra_config = kwargs.pop("extra_config", dict()) + if kwargs: + unexpected_keys = ", ".join(list(kwargs.keys())) + raise TypeError(f"`fetch` got unexpected arguments: {unexpected_keys}") + + self._check_options = self._extract_check_options(extra_config) + results = await super().fetch(*tileables) + return results + + +async def _new_test_session( + address: str, + session_id: str = None, + backend: str = None, + default: bool = False, + new: bool = True, + timeout: float = None, + **kwargs, +) -> AbstractSession: + if session_id is None: + session_id = str(uuid.uuid4()) + + async def _get_checked_session(_address): + session = AsyncSession.from_isolated_session( + await CheckedSession.init( + _address, + session_id=session_id, + backend=backend, + new=new, + timeout=timeout, + **kwargs, + ) + ) + if default: + session.as_default() + return session + + async def _new_test_cluster_in_isolation(**new_cluster_kwargs): + cluster = LocalCluster(**new_cluster_kwargs) + await cluster.start() + session = await _get_checked_session(cluster.external_address) + client = LocalClient(cluster, session) + session.client = client + return client + + init_local = kwargs.pop("init_local", False) + if init_local: + if "n_cpu" not in kwargs: + # limit to 2 cpu each worker + kwargs["n_cpu"] = 2 * kwargs.get("n_worker", 1) + if "config" not in kwargs: + # enable check for task and subtask processor + kwargs["config"] = CONFIG_FILE + + sig = inspect.signature(LocalCluster) + new_cluster_params = {} + for k in sig.parameters: + if k in kwargs: + new_cluster_params[k] = kwargs.pop(k) + return ( + await _new_test_cluster_in_isolation( + address=address, backend=backend, **new_cluster_params + ) + ).session + return await _get_checked_session(address) + + +def new_test_session( + address: str = None, + session_id: str = None, + backend: str = None, + default: bool = False, + new: bool = True, + **kwargs, +): + isolation = ensure_isolation_created(kwargs) + if address is None: + address = "127.0.0.1" + if "init_local" not in kwargs: + kwargs["init_local"] = True + if "web" not in kwargs: + kwargs["web"] = False + backend = backend or os.environ.get("MARS_CI_BACKEND", "mars") + coro = _new_test_session( + address, + session_id=session_id, + backend=backend, + 
default=default, + new=new, + **kwargs, + ) + return _ensure_sync( + asyncio.run_coroutine_threadsafe(coro, isolation.loop).result(120) + ) diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_checked_session.py b/python/xorbits/_mars/deploy/oscar/tests/test_checked_session.py new file mode 100644 index 000000000..0e141d36f --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_checked_session.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict + +import numpy as np +import pytest + +from .... import tensor as mt +from ....config import option_context +from ....core import OperandType, TileableType +from ....services.subtask.worker.tests import CheckedSubtaskProcessor +from ....services.task.supervisor.tests import CheckedTaskPreprocessor +from ..local import _load_config +from ..tests.session import CONFIG_FILE, new_test_session + + +class FakeCheckedTaskPreprocessor(CheckedTaskPreprocessor): + def _check_nsplits(self, tiled: TileableType): + raise RuntimeError("Premeditated") + + +class FakeCheckedSubtaskProcessor(CheckedSubtaskProcessor): + def _execute_operand(self, ctx: Dict[str, Any], op: OperandType): + if self._check_options.get("check_all", True): + raise RuntimeError("Premeditated") + else: + return super()._execute_operand(ctx, op) + + +@pytest.fixture(scope="module") +def setup(): + with option_context({"show_progress": False}): + yield + + +def test_checked_session(setup): + sess = new_test_session(default=True) + + a = mt.ones((10, 10)) + b = a + 1 + b.execute() + + np.testing.assert_array_equal(sess.fetch(b), np.ones((10, 10)) + 1) + + sess.stop_server() + + +def test_check_task_preprocessor(setup): + config = _load_config(CONFIG_FILE) + config["task"][ + "task_preprocessor_cls" + ] = "mars.deploy.oscar.tests.test_checked_session.FakeCheckedTaskPreprocessor" + + sess = new_test_session(default=True, config=config) + + a = mt.ones((10, 10)) + b = a + 1 + + with pytest.raises(RuntimeError, match="Premeditated"): + b.execute() + + # test test config + b.execute(extra_config={"check_nsplits": False}) + + sess.stop_server() + + +def test_check_subtask_processor(setup): + config = _load_config(CONFIG_FILE) + config["subtask"][ + "subtask_processor_cls" + ] = "mars.deploy.oscar.tests.test_checked_session.FakeCheckedSubtaskProcessor" + + sess = new_test_session(default=True, config=config) + + a = mt.ones((10, 10)) + b = a + 1 + + with pytest.raises(RuntimeError, match="Premeditated"): + b.execute() + + # test test config + b.execute(extra_config={"check_all": False}) + + sess.stop_server() diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_clean_up_and_restore_func.py b/python/xorbits/_mars/deploy/oscar/tests/test_clean_up_and_restore_func.py new file mode 100644 index 000000000..8a47d2ce5 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_clean_up_and_restore_func.py @@ -0,0 +1,182 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict + +import pandas as pd +import pytest + +from .... import dataframe as md +from ....config import option_context +from ....core import OperandType, TileableGraph +from ....dataframe.base.apply import ApplyOperand +from ....services.subtask.worker.processor import SubtaskProcessor +from ....services.subtask.worker.tests import CheckedSubtaskProcessor +from ....services.task.supervisor.preprocessor import TaskPreprocessor +from ....services.task.supervisor.tests import CheckedTaskPreprocessor +from ....utils import lazy_import +from ..local import _load_config as _load_mars_config +from ..tests.session import CONFIG_FILE, new_test_session + +ray = lazy_import("ray") + + +class MarsBackendFuncCheckedTaskPreprocessor(CheckedTaskPreprocessor): + def tile(self, tileable_graph: TileableGraph): + ops = [t.op for t in tileable_graph if isinstance(t.op, ApplyOperand)] + for op in ops: + assert hasattr(op, "func_key") + assert op.func_key is None + assert op.func is not None + assert callable(op.func) + assert op.need_clean_up_func is False + result = super().tile(tileable_graph) + for op in ops: + assert hasattr(op, "func_key") + assert op.func_key is None + if op.need_clean_up_func: + assert isinstance(op.func, bytes) + else: + assert callable(op.func) + return result + + +class MarsBackendFuncCheckedSubtaskProcessor(CheckedSubtaskProcessor): + def _execute_operand(self, ctx: Dict[str, Any], op: OperandType): + if isinstance(op, ApplyOperand): + assert hasattr(op, "func_key") + assert op.func_key is None + if op.need_clean_up_func: + assert isinstance(op.func, bytes) + else: + assert callable(op.func) + result = super()._execute_operand(ctx, op) + assert op.func is not None + assert callable(op.func) + return result + else: + return super()._execute_operand(ctx, op) + + +class RayBackendFuncTaskPreprocessor(TaskPreprocessor): + def tile(self, tileable_graph: TileableGraph): + ops = [t.op for t in tileable_graph if isinstance(t.op, ApplyOperand)] + for op in ops: + assert hasattr(op, "func_key") + assert op.func_key is None + assert op.func is not None + assert callable(op.func) + assert op.need_clean_up_func is False + result = super().tile(tileable_graph) + for op in ops: + assert hasattr(op, "func_key") + if op.need_clean_up_func: + assert op.func is None + assert isinstance(op.func_key, ray.ObjectRef) + else: + assert callable(op.func) + assert op.func_key is None + return result + + +class RayBackendFuncSubtaskProcessor(SubtaskProcessor): + def _execute_operand(self, ctx: Dict[str, Any], op: OperandType): + if isinstance(op, ApplyOperand): + assert hasattr(op, "func_key") + if op.need_clean_up_func: + assert op.func is None + assert isinstance(op.func_key, ray.ObjectRef) + else: + assert callable(op.func) + assert op.func_key is None + result = super()._execute_operand(ctx, op) + assert op.func is not None + assert callable(op.func) + return result + else: + return super()._execute_operand(ctx, op) + + 
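+# The checker classes above assert the clean-up/restore behaviour for
+# DataFrame.apply closures: with the Mars backend, an operand whose
+# need_clean_up_func flag is set has its func serialized to bytes after
+# tiling and restored to a callable inside the subtask; with the Ray
+# backend, the func is instead offloaded to the Ray object store and
+# referenced through func_key.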
+@pytest.fixture(scope="module") +def setup(): + with option_context({"show_progress": False}): + yield + + +def test_mars_backend_clean_up_and_restore_func(setup): + config = _load_mars_config(CONFIG_FILE) + config["task"][ + "task_preprocessor_cls" + ] = "mars.deploy.oscar.tests.test_clean_up_and_restore_func.MarsBackendFuncCheckedTaskPreprocessor" + config["subtask"][ + "subtask_processor_cls" + ] = "mars.deploy.oscar.tests.test_clean_up_and_restore_func.MarsBackendFuncCheckedSubtaskProcessor" + + sess = new_test_session(default=True, config=config) + + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = md.DataFrame(df_raw, chunk_size=5) + + x_small = pd.Series([i for i in range(10)]) + y_small = pd.Series([i for i in range(10)]) + x_large = pd.Series([i for i in range(10**4)]) + y_large = pd.Series([i for i in range(10**4)]) + + def closure_small(z): + return pd.concat([x_small, y_small], ignore_index=True) + + def closure_large(z): + return pd.concat([x_large, y_large], ignore_index=True) + + r_small = df.apply(closure_small, axis=1) + r_small.execute() + r_large = df.apply(closure_large, axis=1) + r_large.execute() + + sess.stop_server() + + +@pytest.mark.parametrize("multiplier", [1, 3, 4]) +def test_clean_up_and_restore_callable(setup, multiplier): + config = _load_mars_config(CONFIG_FILE) + config["task"][ + "task_preprocessor_cls" + ] = "mars.deploy.oscar.tests.test_clean_up_and_restore_func.MarsBackendFuncCheckedTaskPreprocessor" + config["subtask"][ + "subtask_processor_cls" + ] = "mars.deploy.oscar.tests.test_clean_up_and_restore_func.MarsBackendFuncCheckedSubtaskProcessor" + + sess = new_test_session(default=True, config=config) + + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = md.DataFrame(df_raw, chunk_size=5) + + class callable_df: + __slots__ = "x", "__dict__" + + def __init__(self, multiplier: int = 1): + self.x = pd.Series([i for i in range(10**multiplier)]) + self.y = pd.Series([i for i in range(10**multiplier)]) + + def __call__(self, pdf): + return pd.concat([self.x, self.y], ignore_index=True) + + cdf = callable_df(multiplier=multiplier) + + r_callable = df.apply(cdf, axis=1) + r_callable.execute() + + sess.stop_server() diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_cmdline.py b/python/xorbits/_mars/deploy/oscar/tests/test_cmdline.py new file mode 100644 index 000000000..4aba6b9f2 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_cmdline.py @@ -0,0 +1,403 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import asyncio +import glob +import json +import logging +import os +import subprocess +import sys +import tempfile +import time +from concurrent import futures +from typing import List + +import numpy as np +import psutil +import pytest + +from .... 
import tensor as mt +from ....lib.aio import get_isolation, new_isolation, stop_isolation +from ....services import NodeRole +from ....services.cluster import ClusterAPI +from ....session import new_session +from ....tests import flaky +from ....utils import clean_mars_tmp_dir, get_next_port +from ..cmdline import OscarCommandRunner +from ..supervisor import SupervisorCommandRunner +from ..worker import WorkerCommandRunner + +logger = logging.getLogger(__name__) + + +class _ProcessExitedException(Exception): + pass + + +def _wait_supervisor_ready(supervisor_proc: subprocess.Popen, timeout=120): + start_time = time.time() + supervisor_pid = supervisor_proc.pid + while True: + if supervisor_proc.poll() is not None: + raise _ProcessExitedException + + try: + ep_file_name = OscarCommandRunner._build_endpoint_file_path( + pid=supervisor_pid + ) + with open(ep_file_name, "r") as ep_file: + return ep_file.read().strip() + except: # noqa: E722 # pylint: disable=bare-except + if time.time() - start_time > timeout: + raise + pass + finally: + time.sleep(0.1) + + +def _wait_worker_ready( + supervisor_addr, worker_procs: List[subprocess.Popen], n_supervisors=1, timeout=30 +): + async def wait_for_workers(): + start_time = time.time() + while True: + if any(proc.poll() is not None for proc in worker_procs): + raise _ProcessExitedException + + try: + cluster_api = await ClusterAPI.create(supervisor_addr) + sv_info = await cluster_api.get_nodes_info( + role=NodeRole.SUPERVISOR, resource=True + ) + worker_info = await cluster_api.get_nodes_info( + role=NodeRole.WORKER, resource=True + ) + if len(sv_info) >= n_supervisors and len(worker_info) >= len( + worker_procs + ): + break + + logger.info( + "Cluster not satisfied. sv_num=%s worker_num=%s", + len(sv_info), + len(worker_info), + ) + except: # noqa: E722 # pylint: disable=bare-except + logger.exception("Error when waiting for workers to start") + if time.time() - start_time > timeout: + raise + pass + finally: + await asyncio.sleep(0.5) + + isolation = get_isolation() + asyncio.run_coroutine_threadsafe(wait_for_workers(), isolation.loop).result(timeout) + + +_test_port_cache = dict() + + +def _get_labelled_port(label=None, create=True): + test_name = os.environ["PYTEST_CURRENT_TEST"] + if (test_name, label) not in _test_port_cache: + if create: + _test_port_cache[(test_name, label)] = get_next_port(occupy=True) + else: + return None + return _test_port_cache[(test_name, label)] + + +def _stop_processes(procs: List[subprocess.Popen]): + sub_ps_procs = [] + for proc in procs: + if not proc: + continue + + sub_ps_procs.extend(psutil.Process(proc.pid).children(recursive=True)) + proc.terminate() + + for proc in procs: + try: + proc.wait(10) + except subprocess.TimeoutExpired: + pass + + for ps_proc in sub_ps_procs + procs: + try: + ps_proc.kill() + except psutil.NoSuchProcess: + pass + + +supervisor_cmd_start = [sys.executable, "-m", "mars.deploy.oscar.supervisor"] +worker_cmd_start = [sys.executable, "-m", "mars.deploy.oscar.worker"] + + +def _reload_args(args): + return [arg if not callable(arg) else arg() for arg in args] + + +_rerun_errors = ( + _ProcessExitedException, + asyncio.TimeoutError, + futures.TimeoutError, + OSError, + TimeoutError, +) + + +@flaky(max_runs=10, rerun_filter=lambda err, *_: issubclass(err[0], _rerun_errors)) +@pytest.mark.parametrize( + "supervisor_args,worker_args,use_web_addr", + [ + pytest.param( + supervisor_cmd_start, + worker_cmd_start + + [ + "--config-file", + os.path.join(os.path.dirname(__file__), 
"local_test_config.yml"), + ], + False, + id="bare_start", + ), + pytest.param( + supervisor_cmd_start + + [ + "-e", + lambda: f'127.0.0.1:{_get_labelled_port("supervisor")}', + "-w", + lambda: str(_get_labelled_port("web")), + "--n-process=2", + "--log-level=DEBUG", + ], + worker_cmd_start + + [ + "-e", + lambda: f"127.0.0.1:{get_next_port(occupy=True)}", + "-s", + lambda: f'127.0.0.1:{_get_labelled_port("supervisor")}', + "--config-file", + os.path.join(os.path.dirname(__file__), "local_test_config.yml"), + "--log-level=DEBUG", + "--log-format=%(asctime)s %(message)s", + "--use-uvloop=no", + ], + True, + id="with_supervisors", + ), + ], +) +def test_cmdline_run(supervisor_args, worker_args, use_web_addr): + new_isolation() + sv_proc = w_procs = None + restart_trial = 5 + try: + env = os.environ.copy() + env["MARS_CPU_TOTAL"] = "2" + + for trial in range(restart_trial): + logger.warning("Cluster start attempt %d / %d", trial + 1, restart_trial) + _test_port_cache.clear() + + sv_args = _reload_args(supervisor_args) + sv_proc = subprocess.Popen(sv_args, env=env) + + oscar_port = _get_labelled_port("supervisor", create=False) + if not oscar_port: + oscar_ep = _wait_supervisor_ready(sv_proc) + else: + oscar_ep = f"127.0.0.1:{oscar_port}" + + if use_web_addr: + host = oscar_ep.rsplit(":", 1)[0] + api_ep = f'http://{host}:{_get_labelled_port("web", create=False)}' + else: + api_ep = oscar_ep + + w_procs = [] + for idx in range(2): + proc = subprocess.Popen(_reload_args(worker_args), env=env) + w_procs.append(proc) + # make sure worker ports does not collide + time.sleep(2) + + try: + _wait_worker_ready(oscar_ep, w_procs) + break + except (asyncio.TimeoutError, futures.TimeoutError, TimeoutError): + if trial == restart_trial - 1: + raise + else: + _stop_processes(w_procs + [sv_proc]) + + new_session(api_ep) + data = np.random.rand(10, 10) + res = mt.tensor(data, chunk_size=5).sum().execute().fetch() + np.testing.assert_almost_equal(res, data.sum()) + finally: + stop_isolation() + + ep_file_name = OscarCommandRunner._build_endpoint_file_path(pid=sv_proc.pid) + try: + os.unlink(ep_file_name) + except OSError: + pass + + _stop_processes((w_procs or []) + [sv_proc]) + + port_prefix = os.path.join( + tempfile.gettempdir(), OscarCommandRunner._port_file_prefix + ) + for fn in glob.glob(port_prefix + "*"): + os.unlink(fn) + + +def test_parse_args(): + parser = argparse.ArgumentParser(description="TestService") + app = WorkerCommandRunner() + app.config_args(parser) + + task_detail = """ + { + "cluster": { + "supervisor": ["sv1", "sv2"], + "worker": ["worker1", "worker2"] + }, + "task": { + "type": "worker", + "index": 0 + } + } + """ + + env = { + "MARS_LOAD_MODULES": "extra.module", + "MARS_TASK_DETAIL": task_detail, + "MARS_CACHE_MEM_SIZE": "20M", + "MARS_PLASMA_DIRS": "/dev/shm", + "MARS_SPILL_DIRS": "/tmp", + } + args = app.parse_args(parser, ["-p", "10324"], env) + assert args.host == "worker1" + assert args.endpoint == "worker1:10324" + assert args.supervisors == "sv1,sv2" + assert "extra.module" in args.load_modules + assert app.config["storage"]["plasma"] == { + "store_memory": "20M", + "plasma_directory": "/dev/shm", + } + assert app.config["storage"]["disk"] == { + "root_dirs": "/tmp", + } + + +@pytest.fixture +def init_app(): + parser = argparse.ArgumentParser(description="TestService") + app = WorkerCommandRunner() + app.config_args(parser) + yield app, parser + + # clean + clean_mars_tmp_dir() + + +def test_parse_no_log_dir(init_app): + app, parser = init_app + + assert not app.config + 
assert len(app.config) == 0 + + with pytest.raises(KeyError): + try: + app._set_log_dir() + except ValueError: + pytest.fail() + + _ = app.parse_args(parser, ["--supervisors", "127.0.0.1"]) + assert app.config["cluster"] + assert not app.config["cluster"]["log_dir"] + app._set_log_dir() + assert app.logging_conf["from_cmd"] is True + assert not app.logging_conf["log_dir"] + + +def test_parse_log_dir(init_app): + app, parser = init_app + log_dir = tempfile.mkdtemp() + _ = app.parse_args(parser, ["--supervisors", "127.0.0.1"]) + app.config["cluster"]["log_dir"] = log_dir + assert os.path.exists(app.config["cluster"]["log_dir"]) + app._set_log_dir() + assert app.logging_conf["log_dir"] == log_dir + + +def test_config_logging(init_app): + app, parser = init_app + app.args = app.parse_args(parser, ["--supervisors", "127.0.0.1"]) + app.config_logging() + expected_path = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "file-logging.conf" + ) + assert app.logging_conf["file"] == expected_path + + +def test_parse_third_party_modules(): + config = { + "third_party_modules": { + "supervisor": ["supervisor.module"], + "worker": ["worker.module"], + } + } + env = {"MARS_LOAD_MODULES": "extra.module"} + + parser = argparse.ArgumentParser(description="TestService") + app = WorkerCommandRunner() + app.config_args(parser) + args = app.parse_args( + parser, + [ + "-c", + json.dumps(config), + "-p", + "10324", + "-s", + "sv1,sv2", + "--load-modules", + "load.module", + ], + env, + ) + assert args.load_modules == ("load.module", "worker.module", "extra.module") + + parser = argparse.ArgumentParser(description="TestService") + app = SupervisorCommandRunner() + app.config_args(parser) + args = app.parse_args( + parser, + [ + "-c", + json.dumps(config), + "-p", + "10324", + "-s", + "sv1,sv2", + "--load-modules", + "load.module", + ], + env, + ) + assert args.load_modules == ("load.module", "supervisor.module", "extra.module") diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_fault_injection.py b/python/xorbits/_mars/deploy/oscar/tests/test_fault_injection.py new file mode 100644 index 000000000..a6dab9cdf --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_fault_injection.py @@ -0,0 +1,334 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import traceback + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... 
import tensor as mt +from ....oscar.errors import ServerClosed +from ....remote import spawn +from ....services.tests.fault_injection_manager import ( + AbstractFaultInjectionManager, + ExtraConfigKey, + FaultInjectionError, + FaultInjectionUnhandledError, + FaultPosition, + FaultType, +) +from ....tensor.base.psrs import PSRSConcatPivot +from ..local import new_cluster +from ..session import get_default_async_session + +CONFIG_FILE = os.path.join(os.path.dirname(__file__), "fault_injection_config.yml") +RERUN_SUBTASK_CONFIG_FILE = os.path.join( + os.path.dirname(__file__), "fault_injection_config_with_rerun.yml" +) + + +@pytest.fixture +async def fault_cluster(request): + param = getattr(request, "param", {}) + start_method = os.environ.get("POOL_START_METHOD", None) + client = await new_cluster( + subprocess_start_method=start_method, + config=param.get("config", CONFIG_FILE), + n_worker=2, + n_cpu=2, + ) + async with client: + yield client + + +async def create_fault_injection_manager( + session_id, address, fault_count, fault_type, fault_op_types=None +): + class FaultInjectionManager(AbstractFaultInjectionManager): + def __init__(self): + self._fault_count = fault_count + + def set_fault_count(self, count): + self._fault_count = count + + def get_fault_count(self): + return self._fault_count + + def get_fault(self, pos: FaultPosition, ctx=None) -> FaultType: + # Check op types if fault_op_types provided. + if fault_op_types and type(ctx.get("operand")) not in fault_op_types: + return FaultType.NoFault + if self._fault_count.get(pos, 0) > 0: + self._fault_count[pos] -= 1 + return fault_type + return FaultType.NoFault + + await FaultInjectionManager.create(session_id, address) + return FaultInjectionManager.name + + +@pytest.mark.parametrize( + "fault_and_exception", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + True, + ], + [ + FaultType.UnhandledException, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises( + FaultInjectionUnhandledError, match="Fault Injection Unhandled" + ), + True, + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + False, # The ServerClosed raised from current process directly. + ], + [ + FaultType.Exception, + {FaultPosition.ON_RUN_SUBTASK: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + False, + ], + ], +) +@pytest.mark.asyncio +async def test_fault_inject_subtask_processor(fault_cluster, fault_and_exception): + fault_type, fault_count, first_run_raises, check_error_prefix = fault_and_exception + name = await create_fault_injection_manager( + session_id=fault_cluster.session.session_id, + address=fault_cluster.session.address, + fault_count=fault_count, + fault_type=fault_type, + ) + extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name} + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + with first_run_raises as ex: + b.execute(extra_config=extra_config) + + if check_error_prefix: + assert str(ex.value).count("address") == 1 + assert str(ex.value).count("pid") == 1 + + # execute again may raise an ConnectionRefusedError if the + # ProcessExit occurred. 
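+    # For that reason the tensor is intentionally not executed a second time here.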
+ + +@pytest.mark.parametrize( + "fault_cluster", [{"config": RERUN_SUBTASK_CONFIG_FILE}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + ], + [ + FaultType.Exception, + {FaultPosition.ON_RUN_SUBTASK: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask(fault_cluster, fault_config): + fault_type, fault_count, expect_raises = fault_config + name = await create_fault_injection_manager( + session_id=fault_cluster.session.session_id, + address=fault_cluster.session.address, + fault_count=fault_count, + fault_type=fault_type, + ) + extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name} + session = get_default_async_session() + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + info = await session.execute(b, extra_config=extra_config) + await info + assert info.result() is None + assert info.exception() is None + + r = await session.fetch(b) + np.testing.assert_array_equal(r, raw + 1) + + fault_injection_manager = await session.get_remote_object( + fault_cluster.session.session_id, name + ) + await fault_injection_manager.set_fault_count({FaultPosition.ON_EXECUTE_OPERAND: 1}) + + # the extra config overwrites the default config. + extra_config["subtask_max_retries"] = 0 + extra_config["subtask_max_reschedules"] = 0 + info = await session.execute(b, extra_config=extra_config) + with expect_raises: + await info + + +@pytest.mark.parametrize( + "fault_cluster", [{"config": RERUN_SUBTASK_CONFIG_FILE}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [FaultType.Exception, {FaultPosition.ON_EXECUTE_OPERAND: 1}, [PSRSConcatPivot]], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + [PSRSConcatPivot], + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask_describe(fault_cluster, fault_config): + fault_type, fault_count, fault_op_types = fault_config + name = await create_fault_injection_manager( + session_id=fault_cluster.session.session_id, + address=fault_cluster.session.address, + fault_count=fault_count, + fault_type=fault_type, + fault_op_types=fault_op_types, + ) + extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name} + session = get_default_async_session() + + s = np.random.RandomState(0) + raw = pd.DataFrame(s.rand(100, 4), columns=list("abcd")) + df = md.DataFrame(raw, chunk_size=30) + + r = df.describe() + info = await session.execute(r, extra_config=extra_config) + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + res = await session.fetch(r) + pd.testing.assert_frame_equal(res, raw.describe()) + + fault_injection_manager = await session.get_remote_object( + fault_cluster.session.session_id, name + ) + remain_fault_count = await fault_injection_manager.get_fault_count() + for key in fault_count: + assert remain_fault_count[key] == 0 + + +@pytest.mark.parametrize( + "fault_cluster", [{"config": RERUN_SUBTASK_CONFIG_FILE}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [ + FaultType.UnhandledException, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionUnhandledError), + ["_UnhandledException", "handle_fault"], + ], + [ + FaultType.Exception, + 
{FaultPosition.ON_EXECUTE_OPERAND: 100}, + pytest.raises(FaultInjectionError), + ["_ExceedMaxRerun", "handle_fault"], + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask_fail(fault_cluster, fault_config): + fault_type, fault_count, expect_raises, exception_match = fault_config + name = await create_fault_injection_manager( + session_id=fault_cluster.session.session_id, + address=fault_cluster.session.address, + fault_count=fault_count, + fault_type=fault_type, + ) + exception_typename, stack_string = exception_match + extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name} + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + with expect_raises as e: + b.execute(extra_config=extra_config) + + tb_str = "".join(traceback.format_tb(e.tb)) + assert e.value.__wrapname__ == exception_typename, tb_str + assert e.traceback[-1].name == stack_string, tb_str + + +@pytest.mark.parametrize( + "fault_cluster", [{"config": RERUN_SUBTASK_CONFIG_FILE}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="RemoteFunction"), + ["_UnretryableException", "handle_fault"], + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + ["_UnretryableException", "*"], + ], + ], +) +@pytest.mark.asyncio +async def test_retryable(fault_cluster, fault_config): + fault_type, fault_count, expect_raises, exception_match = fault_config + name = await create_fault_injection_manager( + session_id=fault_cluster.session.session_id, + address=fault_cluster.session.address, + fault_count=fault_count, + fault_type=fault_type, + ) + exception_typename, stack_string = exception_match + extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name} + + def f(x): + return x + 1 + + r = spawn(f, args=(1,), retry_when_fail=False) + with expect_raises as e: + r.execute(extra_config=extra_config) + + tb_str = "".join(traceback.format_tb(e.tb)) + assert e.value.__wrapname__ == exception_typename, tb_str + assert stack_string == "*" or e.traceback[-1].name == stack_string, tb_str diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_local.py b/python/xorbits/_mars/deploy/oscar/tests/test_local.py new file mode 100644 index 000000000..593e652b3 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_local.py @@ -0,0 +1,1326 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import copy +import gc +import os +import subprocess +import sys +import tempfile +import textwrap +import threading +import time +import uuid +import weakref + +import numpy as np +import pandas as pd +import psutil +import pytest + +try: + import vineyard +except ImportError: + vineyard = None + +from .... import dataframe as md +from .... import remote as mr +from .... 
import tensor as mt +from ....config import option_context +from ....core.context import get_context +from ....lib.aio import new_isolation +from ....oscar.backends.router import Router +from ....services.storage import StorageAPI +from ....services.task.supervisor.task import TaskProcessor +from ....storage import StorageLevel +from ....tensor.arithmetic.add import TensorAdd +from ....tests.core import DICT_NOT_EMPTY, check_dict_structure_same, mock, require_cupy +from ....utils import lazy_import +from ..local import _load_config, new_cluster +from ..session import ( + AsyncSession, + ExecutionInfo, + Profiling, + Progress, + _execute_with_progress, + _IsolatedWebSession, + clear_default_session, + execute, + fetch, + fetch_infos, + get_default_async_session, + get_default_session, + new_session, + stop_server, +) +from ..tests.session import new_test_session +from .modules.utils import ( # noqa: F401; pylint: disable=unused-variable + cleanup_third_party_modules_output, + get_output_filenames, +) + +CONFIG_TEST_FILE = os.path.join(os.path.dirname(__file__), "local_test_config.yml") + +CONFIG_VINEYARD_TEST_FILE = os.path.join( + os.path.dirname(__file__), "local_test_with_vineyard_config.yml" +) + + +CONFIG_THIRD_PARTY_MODULES_TEST_FILE = os.path.join( + os.path.dirname(__file__), "local_test_with_third_parity_modules_config.yml" +) + +EXPECT_PROFILING_STRUCTURE = { + "supervisor": { + "general": { + "optimize": 0.0005879402160644531, + "incref_fetch_tileables": 0.0010840892791748047, + "stage_*": { + "tile(*)": 0.008243083953857422, + "gen_subtask_graph(*)": 0.012202978134155273, + "run": 0.27870702743530273, + "total": 0.30318617820739746, + }, + "total": 0.30951380729675293, + }, + "serialization": {}, + "most_calls": DICT_NOT_EMPTY, + "slow_calls": DICT_NOT_EMPTY, + "band_subtasks": DICT_NOT_EMPTY, + "slow_subtasks": DICT_NOT_EMPTY, + } +} +EXPECT_PROFILING_STRUCTURE_NO_SLOW = copy.deepcopy(EXPECT_PROFILING_STRUCTURE) +EXPECT_PROFILING_STRUCTURE_NO_SLOW["supervisor"]["slow_calls"] = {} +EXPECT_PROFILING_STRUCTURE_NO_SLOW["supervisor"]["slow_subtasks"] = {} + +params = ["default"] +if vineyard is not None: + params.append("vineyard") + + +@pytest.mark.parametrize(indirect=True) +@pytest.fixture(params=params) +async def create_cluster(request): + if request.param == "default": + config = CONFIG_TEST_FILE + elif request.param == "vineyard": + config = CONFIG_VINEYARD_TEST_FILE + else: + config = None + start_method = os.environ.get("POOL_START_METHOD", None) + client = await new_cluster( + subprocess_start_method=start_method, + config=config, + n_worker=2, + n_cpu=4, + use_uvloop=False, + ) + async with client: + if request.param == "default": + assert client.session.client is not None + yield client, request.param + + +def _assert_storage_cleaned(session_id: str, addr: str, level: StorageLevel): + async def _assert(session_id: str, addr: str, level: StorageLevel): + storage_api = await StorageAPI.create(session_id, addr) + assert len(await storage_api.list(level)) == 0 + info = await storage_api.get_storage_level_info(level) + assert info.used_size == 0 + + isolation = new_isolation() + asyncio.run_coroutine_threadsafe( + _assert(session_id, addr, level), isolation.loop + ).result() + + +@pytest.mark.parametrize("backend", ["mars"]) +@pytest.mark.parametrize("_new_session", [new_session, new_test_session]) +def test_new_session_backend(_new_session, backend): + from ....services.task.execution.api import _name_to_config_cls + + config_cls = _name_to_config_cls[backend] + 
original_config_init = config_cls.__init__ + original_deploy_band_resources = config_cls.get_deploy_band_resources + with mock.patch.object( + config_cls, "__init__", autospec=True + ) as config_init, mock.patch.object( + config_cls, "get_deploy_band_resources", autospec=True + ) as deploy_band_resources: + return_deploy_band_resources = [] + + def _wrap_original_deploy_band_resources(*args, **kwargs): + nonlocal return_deploy_band_resources + return_deploy_band_resources = original_deploy_band_resources( + *args, **kwargs + ) + return return_deploy_band_resources + + config_init.side_effect = original_config_init + deploy_band_resources.side_effect = _wrap_original_deploy_band_resources + sess = _new_session( + backend=backend, n_cpu=2, web=False, use_uvloop=False, default=True + ) + try: + assert config_init.call_count > 0 + assert deploy_band_resources.call_count > 0 + worker_pools = sess.default.client._cluster._worker_pools + assert len(worker_pools) == len(return_deploy_band_resources) + a = mt.ones((10, 10)) + b = a + 1 + res = b.to_numpy() + np.testing.assert_array_equal(res, np.ones((10, 10)) + 1) + finally: + sess.stop_server() + + assert get_default_async_session() is None + + +@pytest.mark.asyncio +async def test_vineyard_operators(create_cluster): + param = create_cluster[1] + if param != "vineyard": + pytest.skip("Vineyard is not enabled") + + session = get_default_async_session() + + # tensor + raw = np.random.RandomState(0).rand(55, 55) + a = mt.tensor(raw, chunk_size=15) + info = await session.execute(a) # n.b.: pre-execute + await info + + b = mt.to_vineyard(a) + info = await session.execute(b) + await info + object_id = (await session.fetch(b))[0] + + c = mt.from_vineyard(object_id) + info = await session.execute(c) + await info + tensor = await session.fetch(c) + np.testing.assert_allclose(tensor, raw) + + # dataframe + raw = pd.DataFrame({"a": np.arange(0, 55), "b": np.arange(55, 110)}) + a = md.DataFrame(raw, chunk_size=15) + b = a.to_vineyard() # n.b.: no pre-execute + info = await session.execute(b) + await info + object_id = (await session.fetch(b))[0][0] + + c = md.from_vineyard(object_id) + info = await session.execute(c) + await info + df = await session.fetch(c) + pd.testing.assert_frame_equal(df, raw) + + +@pytest.mark.parametrize( + "config", + [ + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 0, + "slow_subtasks_duration_threshold": 0, + } + }, + EXPECT_PROFILING_STRUCTURE, + ], + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 1000, + "slow_subtasks_duration_threshold": 1000, + } + }, + EXPECT_PROFILING_STRUCTURE_NO_SLOW, + ], + [{}, {}], + ], +) +@pytest.mark.asyncio +async def test_execute(create_cluster, config): + session = get_default_async_session() + assert session.address is not None + assert session.session_id is not None + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + extra_config, expect_profiling_structure = config + + info = await session.execute(b, extra_config=extra_config) + await info + if extra_config: + check_dict_structure_same(info.profiling_result(), expect_profiling_structure) + else: + assert not info.profiling_result() + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + np.testing.assert_equal(raw + 1, await session.fetch(b)) + + with pytest.raises(ValueError): + await session.fetch(b + 1) + + with pytest.raises(ValueError): + await session.fetch(b[b < 0.6]) + + del a, b + + if ( + not 
isinstance(session._isolated_session, _IsolatedWebSession) + and session.client + ): + worker_pools = session.client._cluster._worker_pools + await session.destroy() + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, + worker_pool.external_address, + StorageLevel.MEMORY, + ) + + +@pytest.mark.asyncio +async def test_iterative_tiling(create_cluster): + session = get_default_async_session() + + raw = np.random.RandomState(0).rand(30, 5) + raw_df = pd.DataFrame(raw, index=np.arange(1, 31)) + + df = md.DataFrame(raw_df, chunk_size=10) + df = df[df[0] < 0.7] + df2 = df.shift(2) + + info = await session.execute(df2) + await info + assert info.result() is None + result = await session.fetch(df2) + + expected = raw_df[raw_df[0] < 0.7].shift(2) + pd.testing.assert_frame_equal(result, expected) + + # test meta + assert df2.index_value.min_val >= 1 + assert df2.index_value.max_val <= 30 + + if ( + not isinstance(session._isolated_session, _IsolatedWebSession) + and session.client + ): + worker_pools = session.client._cluster._worker_pools + await session.destroy() + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, + worker_pool.external_address, + StorageLevel.MEMORY, + ) + + +@pytest.mark.asyncio +async def test_execute_describe(create_cluster): + s = np.random.RandomState(0) + raw = pd.DataFrame(s.rand(100, 4), columns=list("abcd")) + df = md.DataFrame(raw, chunk_size=30) + + session = get_default_async_session() + r = df.describe() + info = await session.execute(r) + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + res = await session.fetch(r) + pd.testing.assert_frame_equal(res, raw.describe()) + + if ( + not isinstance(session._isolated_session, _IsolatedWebSession) + and session.client + ): + worker_pools = session.client._cluster._worker_pools + await session.destroy() + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, + worker_pool.external_address, + StorageLevel.MEMORY, + ) + + +@pytest.mark.asyncio +async def test_execute_apply_closure(create_cluster): + # DataFrame + cols = [chr(ord("A") + i) for i in range(10)] + raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = md.DataFrame(raw, chunk_size=5) + + x1 = pd.Series([i for i in range(10**4)]) + y1 = pd.Series([i for i in range(10**4)]) + + def dataframe_closure(z1): + return pd.concat([x1, y1], ignore_index=True) + + session = get_default_async_session() + df_r = df.apply(dataframe_closure, axis=1) + df_info = await session.execute(df_r) + await df_info + assert df_info.result() is None + assert df_info.exception() is None + assert df_info.progress() == 1 + + df_result = await session.fetch(df_r) + df_expected = raw.apply(dataframe_closure, axis=1) + pd.testing.assert_frame_equal(df_result, df_expected) + + # Series + idxes = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idxes) + + series = md.Series(s_raw, chunk_size=5) + + x2, y2 = 1, 2 + + def series_closure(z2): + return [z2 + x2, z2 + y2] + + series_r = series.apply(series_closure, convert_dtype=False) + series_info = await session.execute(series_r) + await series_info + assert series_info.result() is None + assert series_info.exception() is None + assert series_info.progress() == 1 + + series_result = await 
session.fetch(series_r) + series_expected = s_raw.apply(series_closure, convert_dtype=False) + pd.testing.assert_series_equal(series_result, series_expected) + + if ( + not isinstance(session._isolated_session, _IsolatedWebSession) + and session.client + ): + worker_pools = session.client._cluster._worker_pools + await session.destroy() + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, + worker_pool.external_address, + StorageLevel.MEMORY, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("multiplier", [1, 3, 4]) +async def test_execute_callable_closure(create_cluster, multiplier): + # DataFrame + cols = [chr(ord("A") + i) for i in range(10)] + raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = md.DataFrame(raw, chunk_size=5) + + class callable_df: + __slots__ = "x", "__dict__" + + def __init__(self, multiplier: int = 1): + self.x = pd.Series([i for i in range(10**multiplier)]) + self.y = pd.Series([i for i in range(10**multiplier)]) + + def __call__(self, pdf): + return pd.concat([self.x, self.y], ignore_index=True) + + session = get_default_async_session() + cdf = callable_df(multiplier=multiplier) + df_r = df.apply(cdf, axis=1) + df_info = await session.execute(df_r) + await df_info + assert df_info.result() is None + assert df_info.exception() is None + assert df_info.progress() == 1 + + df_result = await session.fetch(df_r) + df_expected = raw.apply(cdf, axis=1) + pd.testing.assert_frame_equal(df_result, df_expected) + + if ( + not isinstance(session._isolated_session, _IsolatedWebSession) + and session.client + ): + worker_pools = session.client._cluster._worker_pools + await session.destroy() + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, + worker_pool.external_address, + StorageLevel.MEMORY, + ) + + +@pytest.mark.asyncio +async def test_sync_execute_in_async(create_cluster): + a = mt.ones((10, 10)) + b = a + 1 + res = b.to_numpy() + np.testing.assert_array_equal(res, np.ones((10, 10)) + 1) + + +@pytest.mark.asyncio +async def test_fetch_infos(create_cluster): + raw = np.random.RandomState(0).rand(30, 5) + raw_df = pd.DataFrame(raw, index=np.arange(1, 31)) + + df = md.DataFrame(raw_df, chunk_size=10) + df.execute() + fetched_infos = df.fetch_infos() + + assert "object_id" in fetched_infos + assert "level" in fetched_infos + assert "memory_size" in fetched_infos + assert "store_size" in fetched_infos + assert "bands" in fetched_infos + + fetched_infos = df.fetch_infos(fields=["object_id", "bands"]) + assert "object_id" in fetched_infos + assert "bands" in fetched_infos + assert len(fetched_infos) == 2 + + fetch_infos((df, df), fields=None) + results_infos = mr.ExecutableTuple([df, df]).execute()._fetch_infos() + assert len(results_infos) == 2 + assert "object_id" in results_infos[0] + assert "level" in results_infos[0] + assert "memory_size" in results_infos[0] + assert "store_size" in results_infos[0] + assert "bands" in results_infos[0] + + +async def _run_web_session_test(web_address): + session_id = str(uuid.uuid4()) + session = await AsyncSession.init(web_address, session_id) + session.as_default() + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + info = await session.execute(b) + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + np.testing.assert_equal(raw + 1, await 
session.fetch(b)) + del a, b + + # Test spawn a local function by the web session. + def _my_func(): + print("output from function") + + r = mr.spawn(_my_func) + info = await session.execute(r) + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + assert "output from function" in str(r.fetch_log(session=session)) + assert "output from function" in str( + r.fetch_log(session=session, offsets="0k", sizes=[1000]) + ) + assert "output from function" in str( + r.fetch_log(session=session, offsets={r.op.key: "0k"}, sizes=[1000]) + ) + + df = md.DataFrame([1, 2, 3]) + # Test apply a lambda by the web session. + r = df.apply(lambda x: x) + info = await session.execute(r) + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + pd.testing.assert_frame_equal(await session.fetch(r), pd.DataFrame([1, 2, 3])) + + AsyncSession.reset_default() + await session.destroy() + + +@pytest.mark.parametrize( + "config", + [ + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 0, + "slow_subtasks_duration_threshold": 0, + } + }, + EXPECT_PROFILING_STRUCTURE, + ], + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 1000, + "slow_subtasks_duration_threshold": 1000, + } + }, + EXPECT_PROFILING_STRUCTURE_NO_SLOW, + ], + [{}, {}], + ], +) +@pytest.mark.asyncio +async def test_web_session(create_cluster, config): + client = create_cluster[0] + session_id = str(uuid.uuid4()) + web_address = client.web_address + session = await AsyncSession.init( + web_address, session_id, request_rewriter=lambda x: x + ) + assert await session.get_web_endpoint() == web_address + session.as_default() + assert isinstance(session._isolated_session, _IsolatedWebSession) + await test_execute(client, config) + await test_iterative_tiling(client) + AsyncSession.reset_default() + await session.destroy() + await _run_web_session_test(web_address) + + worker_pools = client._cluster._worker_pools + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, worker_pool.external_address, StorageLevel.MEMORY + ) + + +@pytest.mark.parametrize("config", [{"backend": "mars"}]) +def test_sync_execute(config): + session = new_session( + backend=config["backend"], n_cpu=2, web=False, use_uvloop=False + ) + + # web not started + assert session._session.client.web_address is None + assert session.get_web_endpoint() is None + + with session: + raw = np.random.RandomState(0).rand(10, 5) + a = mt.tensor(raw, chunk_size=5).sum(axis=1) + b = a.execute(show_progress=False) + assert b is a + result = a.fetch() + np.testing.assert_array_equal(result, raw.sum(axis=1)) + + c = b + 1 + c.execute(show_progress=False) + result = c.fetch() + np.testing.assert_array_equal(result, raw.sum(axis=1) + 1) + + c = mt.tensor(raw, chunk_size=5).sum() + d = session.execute(c) + assert d is c + assert abs(session.fetch(d) - raw.sum()) < 0.001 + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.csv") + pdf = pd.DataFrame( + np.random.RandomState(0).rand(100, 10), + columns=[f"col{i}" for i in range(10)], + ) + pdf.to_csv(file_path, index=False) + + df = md.read_csv( + file_path, + chunk_bytes=os.stat(file_path).st_size / 5, + incremental_index=True, + ) + result = df.sum(axis=1).execute().fetch() + expected = pd.read_csv(file_path).sum(axis=1) + pd.testing.assert_series_equal(result, expected) + + df = md.read_csv( + file_path, + 
chunk_bytes=os.stat(file_path).st_size / 5, + incremental_index=True, + ) + result = df.head(10).execute().fetch() + expected = pd.read_csv(file_path).head(10) + pd.testing.assert_frame_equal(result, expected) + + for worker_pool in session._session.client._cluster._worker_pools: + _assert_storage_cleaned( + session.session_id, worker_pool.external_address, StorageLevel.MEMORY + ) + + session.stop_server() + assert get_default_async_session() is None + + +def test_no_default_session(): + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + with pytest.warns(Warning): + execute(b, show_progress=False) + + np.testing.assert_array_equal(fetch(b), raw + 1) + fetch_infos(b, fields=None) + assert get_default_async_session() is not None + stop_server() + assert get_default_async_session() is None + + +@pytest.mark.asyncio +async def test_session_set_progress(create_cluster): + session = get_default_async_session() + assert session.address is not None + assert session.session_id is not None + + def f1(interval: float, count: int): + for idx in range(count): + time.sleep(interval) + get_context().set_progress((1 + idx) * 1.0 / count) + + r = mr.spawn(f1, args=(0.5, 10)) + + info = await session.execute(r) + + for _ in range(20): + if 0 < info.progress() < 1: + break + await asyncio.sleep(0.1) + else: + raise Exception(f"progress test failed, actual value {info.progress()}.") + + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + + +@pytest.mark.asyncio +async def test_session_get_progress(create_cluster): + session = get_default_async_session() + assert session.address is not None + assert session.session_id is not None + + raw = np.random.rand(100, 4) + t = mt.tensor(raw, chunk_size=50) + + def f1(c): + time.sleep(0.5) + return c + + t1 = t.sum() + t2 = t1.map_chunk(f1) + r = t2.map_chunk(f1) + info = await session.execute(r) + + for _ in range(100): + if 0 < info.progress() < 1: + break + await asyncio.sleep(0.1) + else: + raise Exception(f"progress test failed, actual value {info.progress()}.") + + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + + +@pytest.fixture +def setup_session(request): + param = getattr(request, "param", {}) + config = param.get("config", {}) + session = new_session( + backend=config.get("backend", "mars"), n_cpu=2, use_uvloop=False, config=config + ) + assert session.get_web_endpoint() is not None + + try: + with session, option_context({"show_progress": False}): + yield session + finally: + session.stop_server() + + +WeakTaskProcessorRefs = weakref.WeakSet() + + +class CheckRefTaskProcessor(TaskProcessor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + WeakTaskProcessorRefs.add(self) + + async def run(self): + # Trigger tileable gc before execute. 
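+        # Collecting here lets tileables dropped by the test be decref'd before
+        # the task graph is processed, keeping the ref-count checks deterministic.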
+ gc.collect() + return await super().run() + + @staticmethod + def check_ref_count(count): + for _ in range(10): + if len(WeakTaskProcessorRefs) == count: + break + time.sleep(1) + else: + raise Exception( + f"Check TaskProcessor weakref failed, expect {count} instances, " + f"but got {WeakTaskProcessorRefs}" + ) + + +@pytest.mark.parametrize( + "setup_session", + [ + { + "config": { + "task.default_config.reserved_finish_tasks": 2, + "task.task_processor_cls": CheckRefTaskProcessor, + } + } + ], + indirect=True, +) +def test_decref(setup_session): + session = setup_session + + a = mt.ones((10, 10)) + b = mt.ones((10, 10)) + c = b + 1 + d = mt.ones((5, 5)) + + a.execute() + b.execute() + c.execute() + d.execute() + + CheckRefTaskProcessor.check_ref_count(4) + + del a + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 3 + del b + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 3 + del c + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 1 + del d + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + CheckRefTaskProcessor.check_ref_count(2) + + rs = np.random.RandomState(0) + pdf = pd.DataFrame({"a": rs.randint(10, size=10), "b": rs.rand(10)}) + df = md.DataFrame(pdf, chunk_size=5) + df2 = df.groupby("a").agg("mean", method="shuffle") + result = df2.execute().fetch() + expected = pdf.groupby("a").agg("mean") + pd.testing.assert_frame_equal(result, expected) + + CheckRefTaskProcessor.check_ref_count(3) + + del df, df2 + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + CheckRefTaskProcessor.check_ref_count(2) + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.csv") + pdf = pd.DataFrame( + np.random.RandomState(0).rand(100, 10), + columns=[f"col{i}" for i in range(10)], + ) + pdf.to_csv(file_path, index=False) + + df = md.read_csv(file_path, chunk_bytes=os.stat(file_path).st_size / 5) + df2 = df.head(10) + + result = df2.execute().fetch() + expected = pdf.head(10) + pd.testing.assert_frame_equal(result, expected) + + del df, df2 + + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + for a in ((1, 1, 1, 2, 2, 3), [1, 1, 1, 2, 2, 3]): + splits = mt.split(a, (3, 5)) + assert len(splits) == 3 + splits0 = splits[0].execute().fetch() + np.testing.assert_array_equal(splits0, (1, 1, 1)) + splits1 = splits[1].execute().fetch() + np.testing.assert_array_equal(splits1, (2, 2)) + splits2 = splits[2].execute().fetch() + np.testing.assert_array_equal(splits2, (3,)) + + del splits, splits0, splits1, splits2 + + gc.collect() + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + worker_addr = session._session.client._cluster._worker_pools[0].external_address + _assert_storage_cleaned(session.session_id, worker_addr, StorageLevel.MEMORY) + + +def _assert_worker_pool_storage_cleaned(session): + worker_addr = session._session.client._cluster._worker_pools[0].external_address + _assert_storage_cleaned(session.session_id, worker_addr, StorageLevel.MEMORY) + + +def _cancel_when_execute(session, cancelled): + def run(): + time.sleep(200) + + rs = [mr.spawn(run) for _ in range(10)] + execute(*rs, cancelled=cancelled) + + assert all(not r._executed_sessions for r in rs) + + del rs + time.sleep(0.5) + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + +def _cancel_assert_when_execute(session, cancelled): + _assert_worker_pool_storage_cleaned(session) + _cancel_when_execute(session, cancelled) + + +class 
SlowTileAdd(TensorAdd): + @classmethod + def tile(cls, op): + time.sleep(2) + return (yield from TensorAdd.tile(op)) + + +def _cancel_when_tile(session, cancelled): + a = mt.tensor([1, 2, 3]) + for i in range(20): + a = SlowTileAdd(dtype=np.dtype(np.int64))(a, 1) + execute(a, cancelled=cancelled) + + assert not a._executed_sessions + + del a + time.sleep(0.5) + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + +@pytest.mark.parametrize("test_func", [_cancel_assert_when_execute, _cancel_when_tile]) +def test_cancel(create_cluster, test_func): + session = get_default_session() + + async def _new_cancel_event(): + return asyncio.Event() + + isolation = new_isolation() + cancelled = asyncio.run_coroutine_threadsafe( + _new_cancel_event(), isolation.loop + ).result() + + def cancel(): + time.sleep(0.5) + cancelled.set() + + t = threading.Thread(target=cancel) + t.daemon = True + t.start() + + start = time.time() + test_func(session, cancelled) + assert time.time() - start < 20 + + # submit another task + raw = np.random.rand(10, 10) + t = mt.tensor(raw, chunk_size=(10, 5)) + np.testing.assert_array_equal(t.execute().fetch(), raw) + + +def test_load_third_party_modules(cleanup_third_party_modules_output): # noqa: F811 + config = _load_config() + + config["third_party_modules"] = set() + with pytest.raises(TypeError, match="set"): + new_session(n_cpu=2, web=False, config=config) + + config["third_party_modules"] = {"supervisor": ["not_exists_for_supervisor"]} + with pytest.raises(ModuleNotFoundError, match="not_exists_for_supervisor"): + new_session(n_cpu=2, web=False, config=config) + + config["third_party_modules"] = {"worker": ["not_exists_for_worker"]} + with pytest.raises(ModuleNotFoundError, match="not_exists_for_worker"): + new_session(n_cpu=2, web=False, config=config) + + config["third_party_modules"] = ["mars.deploy.oscar.tests.modules.replace_op"] + session = new_session(n_cpu=2, web=False, config=config) + # web not started + assert session._session.client.web_address is None + + with session: + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + b.execute(show_progress=False) + result = b.fetch() + + np.testing.assert_equal(raw - 1, result) + + session.stop_server() + assert get_default_session() is None + + session = new_session( + n_cpu=2, web=False, config=CONFIG_THIRD_PARTY_MODULES_TEST_FILE + ) + # web not started + assert session._session.client.web_address is None + + with session: + # 1 main pool, 3 sub pools(2 worker + 1 io). 
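+        # one output file is produced per pool, so four files are expected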
+ assert len(get_output_filenames()) == 4 + + session.stop_server() + assert get_default_session() is None + + +@mock.patch("asyncio.base_events.logger") +def test_show_progress_raise_exception(m_log): + loop = asyncio.get_event_loop() + event = asyncio.Event() + + class ProgressBar: + def __init__(self, *args, **kwargs): + pass + + def __enter__(self): + pass + + def __exit__(self, *_): + pass + + def update(self, progress: float): + pass + + async def _exec(): + progress = Progress() + profiling = Profiling() + execution_info = ExecutionInfo( + asyncio.create_task(event.wait()), progress, profiling, loop, list() + ) + progress_bar = ProgressBar(True) + cancel_event = asyncio.Event() + loop.call_later(2, cancel_event.set) + await _execute_with_progress(execution_info, progress_bar, 0.01, cancel_event) + execution_info.get_future().set_exception(Exception("Expect Exception!!!")) + + loop.run_until_complete(_exec()) + assert len(m_log.mock_calls) < 3 + + +min_task_runtime = 2 + + +@pytest.fixture +async def speculative_cluster(): + config = _load_config() + config["scheduling"]["speculation"]["enabled"] = True + config["scheduling"]["speculation"]["dry"] = False + config["scheduling"]["speculation"]["interval"] = 0.5 + config["scheduling"]["speculation"]["threshold"] = 0.2 + config["scheduling"]["speculation"]["min_task_runtime"] = min_task_runtime + config["scheduling"]["speculation"]["multiplier"] = 2 + config["scheduling"]["speculation"]["max_concurrent_run"] = 10 + config["scheduling"]["subtask_cancel_timeout"] = 0.1 + config["scheduling"]["enable_kill_slot"] = True + config["storage"]["backends"] = ["plasma"] + config["storage"]["plasma"]["store_memory"] = 10 * 1024 * 1024 + client = await new_cluster( + config=config, + n_worker=5, + n_cpu=10, + use_uvloop=False, + ) + async with client: + yield client + + +@pytest.mark.timeout(timeout=500) +@pytest.mark.asyncio +async def test_task_speculation_execution(speculative_cluster): + series_size = 10 + + def time_consuming(start, x): + print(f"subtask index {x}") + if ( + x >= series_size - 1 + ): # leave some workers not excluded from speculative submit. 
+ if time.time() - start < min_task_runtime: + print(f"subtask with index {x} starts to hang.") + time.sleep(1000000) + return x * x + + from functools import partial + + assert ( + md.Series(list(range(series_size)), chunk_size=1) + .apply(partial(time_consuming, time.time())) + .sum() + .execute() + .fetch() + == pd.Series(list(range(series_size))).apply(lambda x: x * x).sum() + ) + + +def test_naive_code_file(): + code_file = """ + import mars + import mars.tensor as mt + import os + + mars.new_session() + try: + result_path = os.environ["RESULTPATH"] + with open(result_path, "w") as outf: + outf.write(str(mt.ones((10, 10)).sum().execute())) + finally: + mars.stop_server() + """ + + with tempfile.TemporaryDirectory() as temp_dir: + try: + script_path = os.path.join(temp_dir, "test_file.py") + result_path = os.path.join(temp_dir, "result.txt") + + with open(script_path, "w") as file_obj: + file_obj.write(textwrap.dedent(code_file)) + + env = os.environ.copy() + env["PYTHONPATH"] = os.path.pathsep.join(sys.path) + env["RESULTPATH"] = result_path + proc = subprocess.Popen([sys.executable, script_path], env=env) + pid = proc.pid + proc.wait(120) + + with open(result_path, "r") as inp_file: + assert 100 == int(float(inp_file.read())) + except subprocess.TimeoutExpired: + try: + procs = [psutil.Process(pid)] + procs.extend(procs[0].children(True)) + for proc in reversed(procs): + try: + proc.kill() + except psutil.NoSuchProcess: + pass + except psutil.NoSuchProcess: + pass + raise + + +ucp = lazy_import("ucp") +_OSCAR_CONF_TEMPLATE = """ +"@inherits": '@default' +oscar: + numa: + external_addr_scheme: {scheme} + enable_internal_addr: {enable_inaddr} +""" + + +schemes = [None] +if ucp is not None: + schemes.append("ucx") + + +@pytest.mark.parametrize("scheme", schemes) +@pytest.mark.parametrize("enable_inaddr", [False, True]) +@pytest.mark.parametrize("manner", ["numa", "all", "config_file"]) +def test_oscar_configs(scheme, enable_inaddr, manner): + def test(sess): + def verify(): + router = Router.get_instance() + prefix = "" if not scheme else f"{scheme}://" + assert router._mapping + assert all(addr.startswith(prefix) for addr in router._mapping) + if enable_inaddr: + assert all(inaddr is not None for inaddr in router._mapping.values()) + else: + assert all(inaddr is None for inaddr in router._mapping.values()) + + with sess: + sess.execute(*[mr.spawn(verify) for _ in range(4)]) + + sess.stop_server() + assert get_default_async_session() is None + + if manner == "numa": + session = new_session( + n_cpu=2, + web=False, + cuda_devices=None, + numa_external_addr_scheme=scheme, + numa_enable_internal_addr=enable_inaddr, + oscar_extra_conf={"ucx": {"tcp": True}}, + ) + test(session) + elif manner == "all": + session = new_session( + n_cpu=2, + web=False, + cuda_devices=None, + external_addr_scheme=scheme, + enable_internal_addr=enable_inaddr, + ) + test(session) + else: + scheme_str = "" if not scheme else scheme + enable_inaddr_str = "yes" if enable_inaddr else "no" + config_content = _OSCAR_CONF_TEMPLATE.format( + scheme=scheme_str, enable_inaddr=enable_inaddr_str + ) + with tempfile.NamedTemporaryFile(mode="w+", suffix=".yml") as f: + f.write(config_content) + f.flush() + session = new_session(config=f.name, n_cpu=2, web=False, cuda_devices=None) + + test(session) + + +@require_cupy +@pytest.mark.parametrize("scheme", schemes) +@pytest.mark.parametrize("enable_inaddr", [False, True]) +@pytest.mark.parametrize("manner", ["gpu", "all"]) +def test_gpu_oscar_configs(scheme, enable_inaddr, 
manner): + def test(sess): + def verify(): + router = Router.get_instance() + prefix = "" if not scheme else f"{scheme}://" + # only verify GPU process + assert {addr for addr in router._mapping if addr == router.external_address} + assert all( + addr.startswith(prefix) + for addr in router._mapping + if addr == router.external_address + ) + if enable_inaddr: + assert all( + inaddr is not None + for addr, inaddr in router._mapping.items() + if addr == router.external_address + ) + else: + assert all( + inaddr is None + for addr, inaddr in router._mapping.items() + if addr == router.external_address + ) + + with sess: + sess.execute(*[mr.spawn(verify, gpu=True) for _ in range(2)]) + + sess.stop_server() + assert get_default_async_session() is None + + if manner == "gpu": + session = new_session( + n_cpu=2, + web=False, + cuda_devices=[0], + gpu_external_addr_scheme=scheme, + gpu_enable_internal_addr=enable_inaddr, + oscar_extra_conf={"ucx": {"create-cuda-contex": True}}, + ) + test(session) + else: + session = new_session( + n_cpu=2, + web=False, + cuda_devices=[0], + external_addr_scheme=scheme, + enable_internal_addr=enable_inaddr, + ) + test(session) + + +def test_default_oscar_config(): + session = new_session(n_cpu=2, web=False, cuda_devices=None) + + def verify(): + router = Router.get_instance() + assert router._mapping + # enabled inner address by default + assert all(inaddr is not None for inaddr in router._mapping.values()) + + with session: + session.execute(*[mr.spawn(verify) for _ in range(4)]) + + session.stop_server() + assert get_default_async_session() is None + + +@pytest.mark.parametrize("config", [{"backend": "mars"}]) +def test_fetch_concat(config): + session = new_session( + backend=config["backend"], n_cpu=2, web=False, use_uvloop=False + ) + assert session is not None + + with session: + data = {"A": [i for i in range(10)]} + df0 = md.DataFrame(data) + df1 = df0[["A"]] + df2 = df0[["A"]] + df1 = df1.execute() + df2 = df2.execute() + df3 = md.concat([df1, df2], axis=1) + ret = df3.execute() + df4 = ret.fetch() + + pdf0 = pd.DataFrame(data) + pdf1 = pdf0[["A"]] + pdf2 = pdf0[["A"]] + pdf3 = pd.concat([pdf1, pdf2], axis=1) + + assert pdf3.equals(df4) + + for worker_pool in session._session.client._cluster._worker_pools: + _assert_storage_cleaned( + session.session_id, worker_pool.external_address, StorageLevel.MEMORY + ) + + session.stop_server() + assert get_default_async_session() is None + + +def test_clear_default_session(setup): + assert get_default_session() is not None + clear_default_session() + assert get_default_session() is None diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_pool.py b/python/xorbits/_mars/deploy/oscar/tests/test_pool.py new file mode 100644 index 000000000..57e843914 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_pool.py @@ -0,0 +1,115 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +import tempfile + +import pytest + +from ....constants import MARS_LOG_PATH_KEY, MARS_TMP_DIR_PREFIX +from ....utils import clean_mars_tmp_dir +from ..pool import ( + _parse_file_logging_config, + _config_logging, + _get_root_logger_level_and_format, +) + + +@pytest.fixture +def init(): + root_level, _ = _get_root_logger_level_and_format() + file_logging_config = os.path.join( + os.path.dirname(__file__), "..", "file-logging.conf" + ) + logger_sections = [ + "logger_main", + "logger_deploy", + "logger_oscar", + "logger_services", + "logger_dataframe", + "logger_learn", + "logger_tensor", + "handler_file_handler", + ] + yield file_logging_config, logger_sections, root_level + + # clean + clean_mars_tmp_dir() + + +def test_parse_file_logging_config(init): + fp, sections, root_level = init + log_path = "mock_path" + config = _parse_file_logging_config(fp, log_path, "FATAL") + assert config["handler_stream_handler"]["level"] == root_level + assert config["handler_stream_handler"].get("formatter") is not None + assert config["handler_stream_handler"]["formatter"] == "console" + for sec in sections: + if sec != "handler_file_handler": + assert config[sec]["level"] == "FATAL" + else: + assert config[sec]["level"] == root_level + + formatter = "foo" + config = _parse_file_logging_config(fp, log_path, "FATAL", formatter=formatter) + assert config["formatter_formatter"]["format"] == formatter + + config = _parse_file_logging_config(fp, log_path, level="", formatter=formatter) + assert config["logger_dataframe"]["level"] == "DEBUG" + + config = _parse_file_logging_config( + fp, log_path, level="", formatter=formatter, from_cmd=True + ) + assert config["logger_tensor"]["level"] == "DEBUG" + + assert config["handler_stream_handler"]["level"] == "DEBUG" + assert config["formatter_formatter"]["format"] == formatter + + +def test_config_logging(init, caplog): + _, _, root_level = init + kwargs = {"logging_conf": {}} + with caplog.at_level(logging.DEBUG): + _config_logging(**kwargs) + log_path = os.environ.get(MARS_LOG_PATH_KEY) + assert log_path is not None + assert os.path.basename(os.path.dirname(log_path)).startswith(MARS_TMP_DIR_PREFIX) + + clean_mars_tmp_dir() + + with tempfile.TemporaryDirectory() as folder: + kwargs = {"logging_conf": {"log_dir": folder, "from_cmd": True}} + _config_logging(**kwargs) + log_path = os.environ.get(MARS_LOG_PATH_KEY) + assert log_path is not None + assert os.path.dirname(os.path.dirname(log_path)) == folder + + cnt = 0 + file_handler = None + for handler in logging.getLogger().handlers: + if isinstance(handler, logging.FileHandler): + cnt += 1 + file_handler = handler + assert cnt == 1 + assert file_handler is not None + assert file_handler.level == logging.getLevelName("DEBUG") + assert file_handler.baseFilename == os.environ.get(MARS_LOG_PATH_KEY) + + +def test_pool_with_no_web_config(init): + kwargs = {"web": False} + _config_logging(**kwargs) + log_path = os.environ.get(MARS_LOG_PATH_KEY) + assert log_path is None diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray.py new file mode 100644 index 000000000..67a629c15 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray.py @@ -0,0 +1,333 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import copy +import operator +import os +from functools import reduce + +import numpy as np +import pytest + +from .... import dataframe as md +from .... import tensor as mt +from ....oscar.errors import ReconstructWorkerError +from ....tests.core import DICT_NOT_EMPTY, mock, require_ray +from ....utils import lazy_import +from ..ray import ClusterStateActor, _load_config, new_cluster +from ..session import get_default_session, new_session +from ..tests import test_local +from .modules.utils import ( # noqa: F401 # pylint: disable=unused-variable + cleanup_third_party_modules_output, + get_output_filenames, +) + +ray = lazy_import("ray") + +CONFIG_FILE = os.path.join(os.path.dirname(__file__), "local_test_with_ray_config.yml") + +EXPECT_PROFILING_STRUCTURE = { + "supervisor": { + "general": { + "optimize": 0.0005879402160644531, + "incref_fetch_tileables": 0.0010840892791748047, + "stage_*": { + "tile(*)": 0.008243083953857422, + "gen_subtask_graph(*)": 0.012202978134155273, + "run": 0.27870702743530273, + "total": 0.30318617820739746, + }, + "total": 0.30951380729675293, + }, + "serialization": { + "serialize": 0.014928340911865234, + "deserialize": 0.0011813640594482422, + "total": 0.016109704971313477, + }, + "most_calls": DICT_NOT_EMPTY, + "slow_calls": DICT_NOT_EMPTY, + "band_subtasks": DICT_NOT_EMPTY, + "slow_subtasks": DICT_NOT_EMPTY, + } +} +EXPECT_PROFILING_STRUCTURE_NO_SLOW = copy.deepcopy(EXPECT_PROFILING_STRUCTURE) +EXPECT_PROFILING_STRUCTURE_NO_SLOW["supervisor"]["slow_calls"] = {} +EXPECT_PROFILING_STRUCTURE_NO_SLOW["supervisor"]["slow_subtasks"] = {} + + +@pytest.fixture +async def create_cluster(request): + param = getattr(request, "param", {}) + ray_config = _load_config(CONFIG_FILE) + ray_config.update(param.get("config", {})) + client = await new_cluster( + supervisor_mem=1 * 1024**3, + worker_num=2, + worker_cpu=2, + worker_mem=1 * 1024**3, + config=ray_config, + ) + async with client: + yield client, param + + +@require_ray +@pytest.mark.parametrize( + "config", + [ + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 0, + "slow_subtasks_duration_threshold": 0, + } + }, + EXPECT_PROFILING_STRUCTURE, + ], + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 1000, + "slow_subtasks_duration_threshold": 1000, + } + }, + EXPECT_PROFILING_STRUCTURE_NO_SLOW, + ], + [{}, {}], + ], +) +@pytest.mark.asyncio +async def test_execute(ray_start_regular_shared, create_cluster, config): + await test_local.test_execute(create_cluster, config) + + +@require_ray +@pytest.mark.asyncio +async def test_iterative_tiling(ray_start_regular_shared, create_cluster): + await test_local.test_iterative_tiling(create_cluster) + + +@require_ray +@pytest.mark.asyncio +async def test_execute_describe(ray_start_regular_shared, create_cluster): + await test_local.test_execute_describe(create_cluster) + + +@require_ray +@pytest.mark.asyncio +async def test_execute_apply_closure(ray_start_regular_shared, create_cluster): + await test_local.test_execute_apply_closure(create_cluster) + + +@require_ray +@pytest.mark.parametrize("multiplier", [1, 3, 4]) 
+@pytest.mark.asyncio +async def test_execute_callable_closure( + ray_start_regular_shared, create_cluster, multiplier +): + await test_local.test_execute_callable_closure(create_cluster, multiplier) + + +@require_ray +@pytest.mark.parametrize( + "create_cluster", + [ + { + "config": { + "task.task_preprocessor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncTaskPreprocessor", + "subtask.subtask_processor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncSubtaskProcessor", + } + } + ], + indirect=True, +) +@pytest.mark.asyncio +async def test_ray_oscar_clean_up_and_restore_func( + ray_start_regular_shared, create_cluster +): + await test_local.test_execute_apply_closure(create_cluster) + + +@require_ray +@pytest.mark.asyncio +async def test_fetch_infos(ray_start_regular_shared, create_cluster): + await test_local.test_fetch_infos(create_cluster) + df = md.DataFrame(mt.random.RandomState(0).rand(5000, 1, chunk_size=1000)) + df.execute() + fetched_infos = df.fetch_infos(fields=["object_refs"]) + object_refs = reduce(operator.concat, fetched_infos["object_refs"]) + assert len(fetched_infos) == 1 + assert len(object_refs) == 5 + + +@require_ray +@pytest.mark.asyncio +def test_sync_execute(ray_start_regular_shared, create_cluster): + client = create_cluster[0] + assert client.session + session = new_session(address=client.address) + with session: + raw = np.random.RandomState(0).rand(10, 5) + a = mt.tensor(raw, chunk_size=5).sum(axis=1) + b = a.execute(show_progress=False) + assert b is a + result = a.fetch() + np.testing.assert_array_equal(result, raw.sum(axis=1)) + + c = mt.tensor(raw, chunk_size=5).sum() + d = session.execute(c) + assert d is c + assert abs(session.fetch(d) - raw.sum()) < 0.001 + + assert get_default_session() is None + + +def _run_web_session(web_address): + import asyncio + + asyncio.new_event_loop().run_until_complete( + test_local._run_web_session_test(web_address) + ) + return True + + +def _sync_web_session_test(web_address): + new_session(web_address) + raw = np.random.RandomState(0).rand(10, 5) + a = mt.tensor(raw, chunk_size=5).sum(axis=1) + b = a.execute(show_progress=False) + assert b is a + return True + + +@require_ray +@pytest.mark.parametrize( + "config", + [ + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 0, + "slow_subtasks_duration_threshold": 0, + } + }, + EXPECT_PROFILING_STRUCTURE, + ], + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 1000, + "slow_subtasks_duration_threshold": 1000, + } + }, + EXPECT_PROFILING_STRUCTURE_NO_SLOW, + ], + [{}, {}], + ], +) +@pytest.mark.asyncio +async def test_web_session(ray_start_regular_shared, create_cluster, config): + client = create_cluster[0] + await test_local.test_web_session(create_cluster, config) + web_address = client.web_address + assert await ray.remote(_run_web_session).remote(web_address) + assert await ray.remote(_sync_web_session_test).remote(web_address) + + +@require_ray +def test_load_config(): + default_config = _load_config() + assert default_config["scheduling"]["autoscale"]["enabled"] is False + default_config = _load_config({"scheduling": {"autoscale": {"enabled": True}}}) + assert default_config["scheduling"]["autoscale"]["enabled"] is True + default_config = _load_config( + { + "scheduling.autoscale.enabled": True, + "scheduling.autoscale.scheduler_backlog_timeout": 1, + } + ) + assert default_config["scheduling"]["autoscale"]["enabled"] is True + assert 
default_config["scheduling"]["autoscale"]["scheduler_backlog_timeout"] == 1 + with pytest.raises(ValueError): + _load_config({"scheduling.autoscale.enabled": True, "scheduling.autoscale": {}}) + assert _load_config(CONFIG_FILE)["session"]["custom_log_dir"] == "auto" + + +@require_ray +@pytest.mark.asyncio +@mock.patch("mars.deploy.oscar.ray.stop_worker") +async def test_reconstruct_worker_during_releasing_worker(fake_stop_worker): + stop_worker = asyncio.Event() + lock = asyncio.Event() + + async def _stop_worker(*args): + stop_worker.set() + await lock.wait() + + fake_stop_worker.side_effect = _stop_worker + cluster_state = ClusterStateActor() + release_task = asyncio.create_task(cluster_state.release_worker("abc")) + await stop_worker.wait() + with pytest.raises(ReconstructWorkerError, match="releasing"): + await cluster_state.reconstruct_worker("abc") + release_task.cancel() + + +@require_ray +@pytest.mark.asyncio +@mock.patch("mars.deploy.oscar.ray.stop_worker") +@mock.patch("ray.get_actor") +async def test_release_worker_during_reconstructing_worker( + fake_get_actor, fake_stop_worker +): + get_actor = asyncio.Event() + lock = asyncio.Event() + + class FakeActorMethod: + async def remote(self): + get_actor.set() + await lock.wait() + + class FakeActor: + state = FakeActorMethod() + + def _get_actor(*args, **kwargs): + return FakeActor + + async def _stop_worker(*args): + await lock.wait() + + fake_get_actor.side_effect = _get_actor + fake_stop_worker.side_effect = _stop_worker + cluster_state = ClusterStateActor() + reconstruct_task = asyncio.create_task(cluster_state.reconstruct_worker("abc")) + await get_actor.wait() + release_task = asyncio.create_task(cluster_state.release_worker("abc")) + with pytest.raises(asyncio.CancelledError): + await reconstruct_task + release_task.cancel() + + +@require_ray +@pytest.mark.asyncio +def test_init_metrics_on_ray(ray_start_regular_shared, create_cluster): + client = create_cluster[0] + assert client.session + from ....metrics import api + + assert client._cluster._config.get("metrics", {}).get("backend") == "ray" + assert api._metric_backend == "ray" + + client.session.stop_server() diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_client.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_client.py new file mode 100644 index 000000000..f39bbc28d --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_client.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
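+
+# Tests that Mars sessions work through the Ray client: a Ray client server
+# is started in a subprocess and new_ray_session_test runs over that
+# connection for both the "mars" and "ray" backends.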
+ +import subprocess +import sys +import tempfile +import threading + +import pytest + +from ....tests.core import require_ray +from ....utils import lazy_import +from .test_ray_cluster_standalone import new_ray_session_test + +ray = lazy_import("ray") + + +@require_ray +@pytest.mark.parametrize( + "backend", + [ + "mars", + "ray", + ], +) +def test_ray_client(backend): + server_code = """import time +import ray.util.client.server.server as ray_client_server + +server = ray_client_server.init_and_serve("{address}", num_cpus=20) +print("OK", flush=True) +while True: + time.sleep(1) +""" + + address = "127.0.0.1:50051" + + with tempfile.NamedTemporaryFile(mode="w", suffix=".py") as f: + f.write(server_code.format(address=address)) + f.flush() + + proc = subprocess.Popen([sys.executable, "-u", f.name], stdout=subprocess.PIPE) + + try: + + def _check_ready(expect_exit=False): + while True: + line = proc.stdout.readline() + if proc.returncode is not None: + if expect_exit: + break + raise Exception( + f"Failed to start ray server at {address}, " + f"the return code is {proc.returncode}." + ) + if b"OK" in line: + break + + # Avoid ray.init timeout. + _check_ready() + + # Avoid blocking the subprocess when the stdout pipe is full. + t = threading.Thread(target=_check_ready, args=(True,), daemon=True) + t.start() + try: + import ray + + ray.client(address).connect() # Ray 1.4 + except Exception: + try: + from ray.util.client import ray + + ray.connect(address) # Ray 1.2 + except Exception: + import ray + + ray.init(f"ray://{address}") # Ray latest + ray._inside_client_test = True + try: + new_ray_session_test(backend=backend) + finally: + ray._inside_client_test = False + ray.shutdown() + finally: + proc.kill() + proc.wait() diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_cluster_standalone.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_cluster_standalone.py new file mode 100644 index 000000000..cf9d6639c --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_cluster_standalone.py @@ -0,0 +1,160 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import mars +import pytest + +from .... import dataframe as md +from .... 
import tensor as mt +from ....tests.core import mock, require_ray +from ....utils import lazy_import +from ..ray import _load_config, new_cluster, new_cluster_in_ray, new_ray_session + +ray = lazy_import("ray") + + +@require_ray +def test_new_cluster_in_ray(stop_ray): + cluster = new_cluster_in_ray(worker_num=2) + mt.random.RandomState(0).rand(100, 5).sum().execute() + cluster.session.execute(mt.random.RandomState(0).rand(100, 5).sum()) + mars.execute(mt.random.RandomState(0).rand(100, 5).sum()) + session = new_ray_session(address=cluster.address, session_id="abcd", default=True) + session.execute(mt.random.RandomState(0).rand(100, 5).sum()) + mars.execute(mt.random.RandomState(0).rand(100, 5).sum()) + cluster.stop() + + +@require_ray +@pytest.mark.parametrize( + "backend", + [ + "mars", + "ray", + ], +) +def test_new_ray_session(stop_ray, backend): + new_ray_session_test(backend) + + +def new_ray_session_test(backend): + session = new_ray_session( + session_id="abc", worker_num=2, worker_mem=512 * 1024**2, backend=backend + ) + mt.random.RandomState(0).rand(100, 5).sum().execute() + session.execute(mt.random.RandomState(0).rand(100, 5).sum()) + mars.execute(mt.random.RandomState(0).rand(100, 5).sum()) + session = new_ray_session( + session_id="abcd", + worker_num=2, + default=True, + worker_mem=512 * 1024**2, + backend=backend, + ) + session.execute(mt.random.RandomState(0).rand(100, 5).sum()) + mars.execute(mt.random.RandomState(0).rand(100, 5).sum()) + df = md.DataFrame(mt.random.rand(100, 4), columns=list("abcd")) + # Convert mars dataframe to ray dataset + ds = md.to_ray_dataset(df) + print(ds.schema(), ds.count()) + ds.filter(lambda row: row["a"] > 0.5).show(5) + # Convert ray dataset to mars dataframe + df2 = md.read_ray_dataset(ds) + print(df2.head(5).execute()) + # Test ray cluster exists after session got gc. 
+ del session + import gc + + gc.collect() + mars.execute(mt.random.RandomState(0).rand(100, 5).sum()) + + +@require_ray +@pytest.mark.parametrize( + "test_option", + [ + [True, 0, ["ray://test_cluster/1/0", "ray://test_cluster/2/0"]], + [False, 0, ["ray://test_cluster/0/1", "ray://test_cluster/1/0"]], + [True, 2, ["ray://test_cluster/1/0", "ray://test_cluster/2/0"]], + [False, 5, ["ray://test_cluster/0/6", "ray://test_cluster/1/0"]], + ], +) +@pytest.mark.asyncio +async def test_optional_supervisor_node(ray_start_regular, test_option): + import logging + + logging.basicConfig(level=logging.INFO) + supervisor_standalone, supervisor_sub_pool_num, worker_addresses = test_option + config = _load_config() + config["cluster"]["ray"]["supervisor"]["standalone"] = supervisor_standalone + config["cluster"]["ray"]["supervisor"]["sub_pool_num"] = supervisor_sub_pool_num + client = await new_cluster( + "test_cluster", + supervisor_mem=1 * 1024**3, + worker_num=2, + worker_cpu=2, + worker_mem=1 * 1024**3, + config=config, + ) + async with client: + assert client.address == "ray://test_cluster/0/0" + assert client._cluster._worker_addresses == worker_addresses + + +@require_ray +@pytest.mark.asyncio +async def test_new_ray_session_config(stop_ray): + original_placement_group = ray.util.placement_group + with mock.patch.object( + ray.util, "placement_group", autospec=True + ) as mock_placement_group: + + def _wrap_original_placement_group(*args, **kwargs): + assert {"CPU": 3} in kwargs["bundles"] + return original_placement_group(*args, **kwargs) + + mock_placement_group.side_effect = _wrap_original_placement_group + mars.new_ray_session( + supervisor_cpu=3, + worker_cpu=5, + backend="ray", + default=True, + config={ + "third_party_modules": [ + "mars.deploy.oscar.tests.modules.check_ray_remote_function_options" + ] + }, + ) + mt.random.RandomState(0).rand(100, 5).sum().execute() + + # It seems crashes CI. + # mars.stop_server() + # + # actors = ray.state.actors() + # assert len(actors) == 1 + # assert list(actors.values())[0]["State"] == "DEAD" + + mars.new_ray_session( + supervisor_cpu=3, + worker_cpu=4, + backend="ray", + default=True, + config={ + "third_party_modules": [ + "mars.deploy.oscar.tests.modules.check_ray_remote_function_options" + ] + }, + ) + with pytest.raises(AssertionError): + mt.random.RandomState(0).rand(100, 5).sum().execute() diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag.py new file mode 100644 index 000000000..acc2b36a3 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag.py @@ -0,0 +1,227 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import time + +import pytest + +from .... import get_context +from .... 
import tensor as mt +from ....tests import test_session +from ....tests.core import DICT_NOT_EMPTY, require_ray +from ....utils import lazy_import +from ..local import new_cluster +from ..session import get_default_async_session, new_session +from ..tests import test_local +from ..tests.session import new_test_session +from ..tests.test_local import _cancel_when_execute, _cancel_when_tile +from .modules.utils import ( # noqa: F401; pylint: disable=unused-variable + cleanup_third_party_modules_output, + get_output_filenames, +) + +ray = lazy_import("ray") + +EXPECT_PROFILING_STRUCTURE = { + "supervisor": { + "general": { + "optimize": 0.0005879402160644531, + "stage_*": { + "tile(*)": 0.008243083953857422, + "gen_subtask_graph(*)": 0.012202978134155273, + "run": 0.27870702743530273, + "total": 0.30318617820739746, + }, + "total": 0.30951380729675293, + }, + "serialization": {}, + "most_calls": DICT_NOT_EMPTY, + "slow_calls": DICT_NOT_EMPTY, + "band_subtasks": {}, + "slow_subtasks": {}, + } +} +EXPECT_PROFILING_STRUCTURE_NO_SLOW = copy.deepcopy(EXPECT_PROFILING_STRUCTURE) +EXPECT_PROFILING_STRUCTURE_NO_SLOW["supervisor"]["slow_calls"] = {} + + +@pytest.mark.parametrize(indirect=True) +@pytest.fixture +async def create_cluster(request): + param = getattr(request, "param", {}) + start_method = os.environ.get("POOL_START_METHOD", None) + client = await new_cluster( + subprocess_start_method=start_method, + backend="ray", + n_worker=2, + n_cpu=2, + use_uvloop=False, + config=param.get("config", None), + ) + async with client: + assert client.session.client is not None + yield client, {} + + +@require_ray +@pytest.mark.parametrize("backend", ["ray"]) +@pytest.mark.parametrize("_new_session", [new_session, new_test_session]) +def test_new_session_backend(ray_start_regular_shared2, _new_session, backend): + test_local.test_new_session_backend(_new_session, backend) + + +@require_ray +@pytest.mark.parametrize( + "config", + [ + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 0, + "slow_subtasks_duration_threshold": 0, + } + }, + EXPECT_PROFILING_STRUCTURE, + ], + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 1000, + "slow_subtasks_duration_threshold": 1000, + } + }, + EXPECT_PROFILING_STRUCTURE_NO_SLOW, + ], + [{}, {}], + ], +) +@pytest.mark.asyncio +async def test_execute(ray_start_regular_shared2, create_cluster, config): + await test_local.test_execute(create_cluster, config) + + +@require_ray +@pytest.mark.asyncio +async def test_iterative_tiling(ray_start_regular_shared2, create_cluster): + await test_local.test_iterative_tiling(create_cluster) + + +@require_ray +@pytest.mark.parametrize("config", [{"backend": "ray"}]) +def test_sync_execute(ray_start_regular_shared2, config): + test_local.test_sync_execute(config) + + +@require_ray +@pytest.mark.parametrize( + "create_cluster", + [{"config": {"task.execution_config.ray.monitor_interval_seconds": 0}}], + indirect=True, +) +@pytest.mark.asyncio +async def test_session_get_progress(ray_start_regular_shared2, create_cluster): + await test_local.test_session_get_progress(create_cluster) + + +@require_ray +@pytest.mark.parametrize("test_func", [_cancel_when_execute, _cancel_when_tile]) +def test_cancel(ray_start_regular_shared2, create_cluster, test_func): + test_local.test_cancel(create_cluster, test_func) + + +@require_ray +@pytest.mark.parametrize("config", [{"backend": "ray"}]) +def test_executor_context_gc(ray_start_regular_shared2, config): + session = new_session( + backend=config["backend"], + n_cpu=2, + 
web=False, + use_uvloop=False, + config={"task.execution_config.ray.monitor_interval_seconds": 0}, + ) + + assert session._session.client.web_address is None + assert session.get_web_endpoint() is None + + def f1(c): + time.sleep(0.5) + return c + + with session: + t1 = mt.random.randint(10, size=(100, 10), chunk_size=100) + t2 = mt.random.randint(10, size=(100, 10), chunk_size=50) + t3 = t2 + t1 + t4 = t3.sum(0) + t5 = t4.map_chunk(f1) + r = t5.execute() + result = r.fetch() + assert result is not None + assert len(result) == 10 + context = get_context() + assert len(context._task_context) < 5 + + session.stop_server() + assert get_default_async_session() is None + + +@require_ray +@pytest.mark.asyncio +async def test_execute_describe(ray_start_regular_shared2, create_cluster): + # `describe` contains multiple shuffle. + await test_local.test_execute_describe(create_cluster) + + +@require_ray +@pytest.mark.parametrize("method", ["shuffle", "broadcast", None]) +@pytest.mark.parametrize("auto_merge", ["after", "before"]) +def test_merge_groupby(ray_start_regular_shared2, setup, method, auto_merge): + # add ray_dag decorator to the test_merge_groupby makes the raylet crash. + test_session.test_merge_groupby(setup, method, auto_merge) + + +@require_ray +@pytest.mark.asyncio +async def test_execute_apply_closure(ray_start_regular_shared2, create_cluster): + await test_local.test_execute_apply_closure(create_cluster) + + +@require_ray +@pytest.mark.parametrize("multiplier", [1, 3, 4]) +@pytest.mark.asyncio +async def test_execute_callable_closure( + ray_start_regular_shared2, create_cluster, multiplier +): + await test_local.test_execute_callable_closure(create_cluster, multiplier) + + +@require_ray +@pytest.mark.parametrize( + "create_cluster", + [ + { + "config": { + "task.task_preprocessor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncTaskPreprocessor", + "subtask.subtask_processor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncSubtaskProcessor", + } + } + ], + indirect=True, +) +@pytest.mark.asyncio +async def test_ray_dag_clean_up_and_restore_func( + ray_start_regular_shared2, create_cluster +): + await test_local.test_execute_apply_closure(create_cluster) diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_failover.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_failover.py new file mode 100644 index 000000000..988bc8690 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_failover.py @@ -0,0 +1,114 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import operator +from functools import reduce + +import mars +import pandas as pd +import pytest + +from .... import dataframe as md +from .... 
import tensor as mt +from ....tests.core import require_ray +from ....utils import lazy_import + +ray = lazy_import("ray") +try: + from ray.exceptions import ObjectReconstructionFailedMaxAttemptsExceededError +except ImportError: # pragma: no cover + ObjectReconstructionFailedMaxAttemptsExceededError = None + + +@require_ray +@pytest.mark.parametrize( + "ray_large_cluster", + [{"num_nodes": 0}], + indirect=True, +) +@pytest.mark.parametrize("reconstruction_enabled", [True, False]) +@pytest.mark.skipif( + ObjectReconstructionFailedMaxAttemptsExceededError is None, + reason="Not support ObjectReconstructionFailedMaxAttemptsExceededError", +) +def test_basic_object_reconstruction( + ray_large_cluster, reconstruction_enabled, stop_mars +): + config = { + "num_heartbeats_timeout": 10, + "raylet_heartbeat_period_milliseconds": 200, + "object_timeout_milliseconds": 200, + } + # Workaround to reset the config to the default value. + if not reconstruction_enabled: + config["lineage_pinning_enabled"] = False + subtask_max_retries = 0 + else: + subtask_max_retries = 1 + + cluster = ray_large_cluster + # Head node with no resources. + cluster.add_node( + num_cpus=0, + _system_config=config, + enable_object_reconstruction=reconstruction_enabled, + ) + ray.init(address=cluster.address) + # Node to place the initial object. + node_to_kill = cluster.add_node(num_cpus=1, object_store_memory=10**8) + mars.new_session( + backend="ray", + config={"scheduling.subtask_max_retries": subtask_max_retries}, + default=True, + ) + cluster.wait_for_nodes() + + df = md.DataFrame(mt.random.RandomState(0).rand(2_000_000, 1, chunk_size=1_000_000)) + df.execute() + # this will submit new ray tasks + df2 = df.map_chunk(lambda pdf: pdf * 2).execute() + executed_infos = df2.fetch_infos(fields=["object_refs"]) + object_refs = reduce(operator.concat, executed_infos["object_refs"]) + head5 = df2.head(5).to_pandas() + + cluster.remove_node(node_to_kill, allow_graceful=False) + node_to_kill = cluster.add_node(num_cpus=1, object_store_memory=10**8) + + # use a dependent_task to avoid fetch lost objects to local + @ray.remote + def dependent_task(x): + return x + + if reconstruction_enabled: + ray.get([dependent_task.remote(ref) for ref in object_refs]) + new_head5 = df2.head(5).to_pandas() + pd.testing.assert_frame_equal(head5, new_head5) + else: + with pytest.raises(ray.exceptions.RayTaskError): + df2.head(5).to_pandas() + with pytest.raises(ray.exceptions.ObjectLostError): + ray.get(object_refs) + + # Losing the object a second time will cause reconstruction to fail because + # we have reached the max task retries. + cluster.remove_node(node_to_kill, allow_graceful=False) + cluster.add_node(num_cpus=1, object_store_memory=10**8) + + if reconstruction_enabled: + with pytest.raises(ObjectReconstructionFailedMaxAttemptsExceededError): + ray.get(object_refs) + else: + with pytest.raises(ray.exceptions.ObjectLostError): + ray.get(object_refs) diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_oscar.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_oscar.py new file mode 100644 index 000000000..c085e840c --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_oscar.py @@ -0,0 +1,89 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +from ....tests.core import require_ray +from ....utils import lazy_import +from ..ray import _load_config, new_cluster +from ..tests import test_local + +ray = lazy_import("ray") +CONFIG_FILE = os.path.join(os.path.dirname(__file__), "local_test_with_ray_config.yml") + + +@pytest.fixture +async def create_cluster(request): + param = getattr(request, "param", {}) + ray_config = _load_config(CONFIG_FILE) + ray_config.update(param.get("config", {})) + client = await new_cluster( + supervisor_mem=1 * 1024**3, + worker_num=2, + worker_cpu=2, + worker_mem=1 * 1024**3, + backend="ray", + config=ray_config, + ) + async with client: + yield client, param + + +@require_ray +@pytest.mark.asyncio +async def test_iterative_tiling(ray_start_regular_shared2, create_cluster): + await test_local.test_iterative_tiling(create_cluster) + + +@pytest.mark.asyncio +@require_ray +async def test_execute_describe(ray_start_regular_shared2, create_cluster): + await test_local.test_execute_describe(create_cluster) + + +@require_ray +@pytest.mark.asyncio +async def test_execute_apply_closure(ray_start_regular_shared2, create_cluster): + await test_local.test_execute_apply_closure(create_cluster) + + +@require_ray +@pytest.mark.parametrize("multiplier", [1, 3, 4]) +@pytest.mark.asyncio +async def test_execute_callable_closure( + ray_start_regular_shared2, create_cluster, multiplier +): + await test_local.test_execute_callable_closure(create_cluster, multiplier) + + +@require_ray +@pytest.mark.parametrize( + "create_cluster", + [ + { + "config": { + "task.task_preprocessor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncTaskPreprocessor", + "subtask.subtask_processor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncSubtaskProcessor", + } + } + ], + indirect=True, +) +@pytest.mark.asyncio +async def test_ray_dag_oscar_clean_up_and_restore_func( + ray_start_regular_shared2, create_cluster +): + await test_local.test_execute_apply_closure(create_cluster) diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_fault_injection.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_fault_injection.py new file mode 100644 index 000000000..16486c9de --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_fault_injection.py @@ -0,0 +1,207 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
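+
+# Ray variants of the fault-injection tests: injected exceptions, unhandled
+# errors and process exits during operand/subtask execution, plus subtask
+# rerun and retry behavior, delegated to ..tests.test_fault_injection.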
+ +import os + +import pytest + +from ....oscar.errors import ServerClosed +from ....services.tests.fault_injection_manager import ( + FaultInjectionError, + FaultInjectionUnhandledError, + FaultPosition, + FaultType, +) +from ....tensor.base.psrs import PSRSConcatPivot +from ....tests.core import require_ray +from ....utils import lazy_import +from ..ray import _load_config, new_cluster +from ..tests import test_fault_injection + +ray = lazy_import("ray") + +RAY_CONFIG_FILE = os.path.join( + os.path.dirname(__file__), "local_test_with_ray_config.yml" +) +FAULT_INJECTION_CONFIG = { + "third_party_modules": ["mars.services.tests.fault_injection_patch"], +} +SUBTASK_RERUN_CONFIG = { + "scheduling": { + "subtask_max_retries": 2, + "subtask_max_reschedules": 2, + } +} + + +@pytest.fixture +async def fault_cluster(request): + param = getattr(request, "param", {}) + ray_config = _load_config(RAY_CONFIG_FILE) + ray_config.update(FAULT_INJECTION_CONFIG) + ray_config.update(param.get("config", {})) + client = await new_cluster( + worker_num=2, + worker_cpu=2, + worker_mem=1 * 1024**3, + config=ray_config, + ) + async with client: + yield client + + +@require_ray +@pytest.mark.parametrize( + "fault_and_exception", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + True, + ], + [ + FaultType.UnhandledException, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises( + FaultInjectionUnhandledError, match="Fault Injection Unhandled" + ), + True, + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + False, # The ServerClosed raised from current process directly. + ], + [ + FaultType.Exception, + {FaultPosition.ON_RUN_SUBTASK: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + True, + ], + ], +) +@pytest.mark.asyncio +async def test_fault_inject_subtask_processor( + ray_start_regular_shared, fault_cluster, fault_and_exception +): + await test_fault_injection.test_fault_inject_subtask_processor( + fault_cluster, fault_and_exception + ) + + +@require_ray +@pytest.mark.parametrize( + "fault_cluster", [{"config": SUBTASK_RERUN_CONFIG}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + ], + [ + FaultType.Exception, + {FaultPosition.ON_RUN_SUBTASK: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask(ray_start_regular_shared, fault_cluster, fault_config): + await test_fault_injection.test_rerun_subtask(fault_cluster, fault_config) + + +@require_ray +@pytest.mark.parametrize( + "fault_cluster", [{"config": SUBTASK_RERUN_CONFIG}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [FaultType.Exception, {FaultPosition.ON_EXECUTE_OPERAND: 1}, [PSRSConcatPivot]], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + [PSRSConcatPivot], + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask_describe( + ray_start_regular_shared, fault_cluster, fault_config +): + await test_fault_injection.test_rerun_subtask_describe(fault_cluster, fault_config) + + +@require_ray +@pytest.mark.parametrize( + "fault_cluster", [{"config": SUBTASK_RERUN_CONFIG}], indirect=True +) +@pytest.mark.parametrize( + 
"fault_config", + [ + [ + FaultType.UnhandledException, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionUnhandledError), + ["_UnhandledException", "handle_fault"], + ], + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 100}, + pytest.raises(FaultInjectionError), + ["_ExceedMaxRerun", "handle_fault"], + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask_fail( + ray_start_regular_shared, fault_cluster, fault_config +): + await test_fault_injection.test_rerun_subtask_fail(fault_cluster, fault_config) + + +@require_ray +@pytest.mark.parametrize( + "fault_cluster", [{"config": SUBTASK_RERUN_CONFIG}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="RemoteFunction"), + ["_UnretryableException", "handle_fault"], + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + ["_UnretryableException", "*"], + ], + ], +) +@pytest.mark.asyncio +async def test_retryable(ray_start_regular_shared, fault_cluster, fault_config): + await test_fault_injection.test_retryable(fault_cluster, fault_config) diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_load_modules.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_load_modules.py new file mode 100644 index 000000000..e29e2550a --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_load_modules.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pytest + +from .... 
import tensor as mt +from ....tests.core import require_ray +from ....utils import lazy_import +from ..ray import _load_config, new_cluster +from ..session import get_default_session, new_session +from .modules.utils import ( # noqa: F401 # pylint: disable=unused-variable + cleanup_third_party_modules_output, + get_output_filenames, +) + +ray = lazy_import("ray") + +CONFIG_FILE = os.path.join(os.path.dirname(__file__), "local_test_with_ray_config.yml") +CONFIG_THIRD_PARTY_MODULES_TEST_FILE = os.path.join( + os.path.dirname(__file__), "ray_test_with_third_parity_modules_config.yml" +) + + +@pytest.fixture +async def create_cluster(request): + param = getattr(request, "param", {}) + ray_config = _load_config(CONFIG_FILE) + ray_config.update(param.get("config", {})) + client = await new_cluster( + supervisor_mem=1 * 1024**3, + worker_num=2, + worker_cpu=2, + worker_mem=1 * 1024**3, + config=ray_config, + ) + async with client: + yield client, param + + +@require_ray +@pytest.mark.parametrize( + "config_exception", + [ + [set(), pytest.raises(TypeError, match="set")], + [ + {"supervisor": ["not_exists_for_supervisor"]}, + pytest.raises(ModuleNotFoundError, match="not_exists_for_supervisor"), + ], + [ + {"worker": ["not_exists_for_worker"]}, + pytest.raises(ModuleNotFoundError, match="not_exists_for_worker"), + ], + ], +) +@pytest.mark.asyncio +async def test_load_third_party_modules(ray_start_regular, config_exception): + third_party_modules_config, expected_exception = config_exception + config = _load_config() + + config["third_party_modules"] = third_party_modules_config + with expected_exception: + await new_cluster( + worker_num=1, + worker_cpu=1, + worker_mem=1 * 1024**3, + config=config, + ) + + +@require_ray +@pytest.mark.parametrize( + "create_cluster", + [ + { + "config": { + "third_party_modules": { + "worker": ["mars.deploy.oscar.tests.modules.replace_op"] + }, + }, + } + ], + indirect=True, +) +@pytest.mark.asyncio +def test_load_third_party_modules2(ray_start_regular, create_cluster): + client = create_cluster[0] + assert client.session + session = new_session(address=client.address) + with session: + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + b.execute(show_progress=False) + result = b.fetch() + + np.testing.assert_equal(raw - 1, result) + + assert get_default_session() is None + + +@require_ray +@pytest.mark.asyncio +async def test_load_third_party_modules_from_config( + ray_start_regular, cleanup_third_party_modules_output # noqa: F811 +): + client = await new_cluster( + supervisor_mem=1 * 1024**3, + worker_num=1, + worker_cpu=1, + worker_mem=1 * 1024**3, + config=CONFIG_THIRD_PARTY_MODULES_TEST_FILE, + ) + async with client: + # 1 supervisor, 1 worker main pools, 1 worker sub pools. + assert len(get_output_filenames()) == 3 diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_scheduling.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_scheduling.py new file mode 100644 index 000000000..f125c95bc --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_scheduling.py @@ -0,0 +1,323 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +import logging +import os +import time + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... import oscar as mo +from .... import tensor as mt +from ....oscar.backends.ray.utils import ( + kill_and_wait, + process_address_to_placement, + process_placement_to_address, +) +from ....services.cluster import ClusterAPI +from ....services.scheduling.supervisor.autoscale import AutoscalerActor +from ....tests.core import require_ray +from ....utils import lazy_import +from ..ray import _load_config, new_cluster +from ..tests import test_local + +ray = lazy_import("ray") + +logger = logging.getLogger(__name__) + + +@pytest.fixture +async def speculative_cluster(): + client = await new_cluster( + "test_cluster", + worker_num=5, + worker_cpu=2, + worker_mem=512 * 1024**2, + supervisor_mem=100 * 1024**2, + config={ + "scheduling": { + "speculation": { + "enabled": True, + "dry": False, + "interval": 0.5, + "threshold": 0.2, + "min_task_runtime": 2, + "multiplier": 1.5, + }, + # used to kill hanged subtask to release slot. + "subtask_cancel_timeout": 0.1, + }, + }, + ) + async with client: + yield client + + +@pytest.mark.parametrize("ray_large_cluster", [{"num_nodes": 2}], indirect=True) +@pytest.mark.timeout(timeout=500) +@require_ray +@pytest.mark.asyncio +async def test_task_speculation_execution(ray_large_cluster, speculative_cluster): + await test_local.test_task_speculation_execution(speculative_cluster) + + +@pytest.mark.parametrize( + "ray_large_cluster", [{"num_nodes": 1, "num_cpus": 3}], indirect=True +) +@require_ray +@pytest.mark.asyncio +async def test_request_worker(ray_large_cluster): + worker_cpu, worker_mem = 1, 100 * 1024**2 + client = await new_cluster( + worker_num=0, worker_cpu=worker_cpu, worker_mem=worker_mem + ) + async with client: + cluster_state_ref = client._cluster._cluster_backend.get_cluster_state_ref() + # Note that supervisor took one node + workers = await asyncio.gather( + *[cluster_state_ref.request_worker(timeout=5) for _ in range(2)] + ) + assert all(worker is not None for worker in workers) + assert not await cluster_state_ref.request_worker(timeout=5) + release_workers = [ + cluster_state_ref.release_worker(worker) for worker in workers + ] + # Duplicate release workers requests should be handled. 
+        release_workers.extend(
+            [cluster_state_ref.release_worker(worker) for worker in workers]
+        )
+        await asyncio.gather(*release_workers)
+        assert await cluster_state_ref.request_worker(timeout=5)
+        cluster_state_ref.reconstruct_worker()
+
+
+@pytest.mark.parametrize(
+    "ray_large_cluster", [{"num_nodes": 1, "num_cpus": 3}], indirect=True
+)
+@require_ray
+@pytest.mark.asyncio
+async def test_reconstruct_worker(ray_large_cluster):
+    worker_cpu, worker_mem = 1, 100 * 1024**2
+    client = await new_cluster(
+        worker_num=0, worker_cpu=worker_cpu, worker_mem=worker_mem
+    )
+    async with client:
+        cluster_api = await ClusterAPI.create(client._cluster.supervisor_address)
+        worker = await cluster_api.request_worker(timeout=5)
+        pg_name, bundle_index, process_index = process_address_to_placement(worker)
+        worker_sub_pool = process_placement_to_address(
+            pg_name, bundle_index, process_index + 1
+        )
+
+        worker_actor = ray.get_actor(worker)
+        worker_pid = await worker_actor.getpid.remote()
+        # the worker pool actor should be destroyed even if we get the actor.
+        worker_sub_pool_actor = ray.get_actor(worker_sub_pool)
+        worker_sub_pool_pid = await worker_sub_pool_actor.getpid.remote()
+
+        # kill the worker main pool
+        await kill_and_wait(ray.get_actor(worker))
+
+        # duplicated reconstruct worker requests should be handled.
+        await asyncio.gather(
+            cluster_api.reconstruct_worker(worker),
+            cluster_api.reconstruct_worker(worker),
+        )
+        worker_actor = ray.get_actor(worker)
+        new_worker_pid = await worker_actor.getpid.remote()
+        worker_sub_pool_actor = ray.get_actor(worker_sub_pool)
+        new_worker_sub_pool_pid = await worker_sub_pool_actor.getpid.remote()
+        assert new_worker_pid != worker_pid
+        assert new_worker_sub_pool_pid != worker_sub_pool_pid
+
+        # the computation should be OK after the worker is reconstructed.
+ raw = np.random.RandomState(0).rand(10, 5) + a = mt.tensor(raw, chunk_size=5).sum(axis=1) + b = a.execute(show_progress=False) + assert b is a + result = a.fetch() + np.testing.assert_array_equal(result, raw.sum(axis=1)) + + +@pytest.mark.parametrize( + "ray_large_cluster", [{"num_nodes": 2, "num_cpus": 4}], indirect=True +) +@pytest.mark.parametrize("init_workers", [0, 1]) +@require_ray +@pytest.mark.asyncio +async def test_auto_scale_out(ray_large_cluster, init_workers: int): + client = await new_cluster( + worker_num=init_workers, + worker_cpu=2, + worker_mem=200 * 1024**2, + supervisor_mem=1 * 1024**3, + config={ + "scheduling.autoscale.enabled": True, + "scheduling.autoscale.scheduler_backlog_timeout": 1, + "scheduling.autoscale.worker_idle_timeout": 10000000, + "scheduling.autoscale.max_workers": 10, + }, + ) + async with client: + + def time_consuming(x): + time.sleep(1) + return x * x + + series_size = 100 + assert ( + md.Series(list(range(series_size)), chunk_size=1) + .apply(time_consuming) + .sum() + .execute() + .fetch() + == pd.Series(list(range(series_size))).apply(lambda x: x * x).sum() + ) + autoscaler_ref = mo.create_actor_ref( + uid=AutoscalerActor.default_uid(), + address=client._cluster.supervisor_address, + ) + assert await autoscaler_ref.get_dynamic_worker_nums() > 0 + + +@pytest.mark.timeout(timeout=600) +@pytest.mark.parametrize( + "ray_large_cluster", [{"num_nodes": 2, "num_cpus": 4}], indirect=True +) +@require_ray +@pytest.mark.asyncio +async def test_auto_scale_in(ray_large_cluster): + config = _load_config() + config["scheduling"]["autoscale"]["enabled"] = True + config["scheduling"]["autoscale"]["worker_idle_timeout"] = 1 + config["scheduling"]["autoscale"]["max_workers"] = 4 + config["scheduling"]["autoscale"]["min_workers"] = 2 + client = await new_cluster( + worker_num=0, + worker_cpu=2, + worker_mem=200 * 1024**2, + supervisor_mem=1 * 1024**3, + config=config, + ) + async with client: + autoscaler_ref = mo.create_actor_ref( + uid=AutoscalerActor.default_uid(), + address=client._cluster.supervisor_address, + ) + new_worker_nums = 3 + await asyncio.gather( + *[autoscaler_ref.request_worker() for _ in range(new_worker_nums)] + ) + series_size = 100 + assert ( + md.Series(list(range(series_size)), chunk_size=20).sum().execute().fetch() + == pd.Series(list(range(series_size))).sum() + ) + while await autoscaler_ref.get_dynamic_worker_nums() > 2: + dynamic_workers = await autoscaler_ref.get_dynamic_workers() + logger.info(f"Waiting %s workers to be released.", dynamic_workers) + await asyncio.sleep(1) + await asyncio.sleep(1) + assert await autoscaler_ref.get_dynamic_worker_nums() == 2 + + +@pytest.mark.timeout(timeout=500) +@pytest.mark.parametrize("ray_large_cluster", [{"num_nodes": 4}], indirect=True) +@require_ray +@pytest.mark.asyncio +async def test_ownership_when_scale_in(ray_large_cluster): + client = await new_cluster( + worker_num=0, + worker_cpu=2, + worker_mem=1 * 1024**3, + supervisor_mem=200 * 1024**2, + config={ + "scheduling.autoscale.enabled": True, + "scheduling.autoscale.scheduler_check_interval": 0.1, + "scheduling.autoscale.scheduler_backlog_timeout": 0.5, + "scheduling.autoscale.worker_idle_timeout": 1, + "scheduling.autoscale.min_workers": 1, + "scheduling.autoscale.max_workers": 4, + }, + ) + async with client: + autoscaler_ref = mo.create_actor_ref( + uid=AutoscalerActor.default_uid(), + address=client._cluster.supervisor_address, + ) + num_chunks, chunk_size = 10, 4 + df = md.DataFrame( + mt.random.rand(num_chunks * chunk_size, 4, 
chunk_size=chunk_size), + columns=list("abcd"), + ) + latch_actor = ray.remote(CountDownLatch).remote(1) + pid = os.getpid() + + def f(pdf, latch): + if os.getpid() != pid: + # type inference will call this function too + ray.get(latch.wait.remote()) + return pdf + + df = df.map_chunk( + f, + args=(latch_actor,), + ) + info = df.execute(wait=False) + while await autoscaler_ref.get_dynamic_worker_nums() <= 1: + logger.info("Waiting workers to be created.") + await asyncio.sleep(1) + await latch_actor.count_down.remote() + await info + assert info.exception() is None + assert info.progress() == 1 + logger.info("df execute succeed.") + + while await autoscaler_ref.get_dynamic_worker_nums() > 1: + dynamic_workers = await autoscaler_ref.get_dynamic_workers() + logger.info("Waiting workers %s to be released.", dynamic_workers) + await asyncio.sleep(1) + # Test data on node of released worker can still be fetched + pd_df = df.fetch() + groupby_sum_df = ( + df.rechunk(chunk_size * 2).groupby("a").apply(lambda pdf: pdf.sum()) + ) + logger.info(groupby_sum_df.execute()) + while await autoscaler_ref.get_dynamic_worker_nums() > 1: + dynamic_workers = await autoscaler_ref.get_dynamic_workers() + logger.info(f"Waiting workers %s to be released.", dynamic_workers) + await asyncio.sleep(1) + assert df.to_pandas().to_dict() == pd_df.to_dict() + assert ( + groupby_sum_df.to_pandas().to_dict() + == pd_df.groupby("a").apply(lambda pdf: pdf.sum()).to_dict() + ) + + +class CountDownLatch: + def __init__(self, cnt): + self.cnt = cnt + + def count_down(self): + self.cnt -= 1 + + def get_count(self): + return self.cnt + + async def wait(self): + while self.cnt != 0: + await asyncio.sleep(0.01) diff --git a/python/xorbits/_mars/deploy/oscar/worker.py b/python/xorbits/_mars/deploy/oscar/worker.py new file mode 100644 index 000000000..7eeb05833 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/worker.py @@ -0,0 +1,126 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
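+
+# Command-line entry point for a Mars worker: WorkerCommandRunner parses
+# CPU/memory/CUDA and storage options, builds the band-to-resource mapping,
+# creates the worker actor pool, and starts/stops the worker services.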
+ +import os + +from ...resource import Resource, cpu_count, cuda_count, mem_total +from ...services import NodeRole +from ...utils import get_next_port +from .cmdline import OscarCommandRunner +from .local import start_worker, stop_worker +from .pool import create_worker_actor_pool + + +class WorkerCommandRunner(OscarCommandRunner): + command_description = "Mars Worker" + node_role = NodeRole.WORKER + + def __init__(self): + super().__init__() + self.band_to_resource = dict() + self.cuda_devices = [] + self.n_io_process = 1 + + def config_args(self, parser): + super().config_args(parser) + parser.add_argument("--n-cpu", help="num of CPU to use", default="auto") + parser.add_argument( + "--mem-bytes", help="bytes of memory to use", default="auto" + ) + parser.add_argument("--n-io-process", help="num of IO processes", default="1") + parser.add_argument( + "--cuda-devices", + help="CUDA device to use, if not specified, will use " + "all available devices", + default="auto", + ) + + def parse_args(self, parser, argv, environ=None): + environ = environ or os.environ + args = super().parse_args(parser, argv, environ=environ) + + if ( + self.config.get("cluster", {}).get("backend", "fixed") == "fixed" + and not args.supervisors + ): # pragma: no cover + raise ValueError("--supervisors is needed to start Mars Worker") + + if args.endpoint is None: + args.endpoint = f"{args.host}:{get_next_port()}" + self.n_io_process = int(args.n_io_process) + + n_cpu = cpu_count() if args.n_cpu == "auto" else int(args.n_cpu) + mem_bytes = mem_total() if args.mem_bytes == "auto" else int(args.mem_bytes) + + if "CUDA_VISIBLE_DEVICES" in os.environ: # pragma: no cover + args.cuda_devices = os.environ["CUDA_VISIBLE_DEVICES"].strip() + + if args.cuda_devices == "auto": + self.cuda_devices = list(range(cuda_count())) + elif args.cuda_devices.strip() == "": # pragma: no cover + # allow using CPU only + self.cuda_devices = [] + else: # pragma: no cover + self.cuda_devices = [int(i) for i in args.cuda_devices.split(",")] + + self.band_to_resource = band_to_resource = dict() + band_to_resource["numa-0"] = Resource(num_cpus=n_cpu, mem_bytes=mem_bytes) + for i in self.cuda_devices: # pragma: no cover + band_to_resource[f"gpu-{i}"] = Resource(num_gpus=1) + + storage_config = self.config["storage"] = self.config.get("storage", {}) + backends = storage_config["backends"] = storage_config.get("backends", []) + plasma_config = storage_config["plasma"] = storage_config.get("plasma", {}) + disk_config = storage_config["disk"] = storage_config.get("disk", {}) + if "MARS_CACHE_MEM_SIZE" in environ: + plasma_config["store_memory"] = environ["MARS_CACHE_MEM_SIZE"] + if "MARS_PLASMA_DIRS" in environ: + plasma_config["plasma_directory"] = environ["MARS_PLASMA_DIRS"] + if "MARS_SPILL_DIRS" in environ: + backends.append("disk") + disk_config["root_dirs"] = environ["MARS_SPILL_DIRS"] + + return args + + async def create_actor_pool(self): + return await create_worker_actor_pool( + self.args.endpoint, + self.band_to_resource, + ports=self.ports, + n_io_process=self.n_io_process, + modules=list(self.args.load_modules), + logging_conf=self.logging_conf, + cuda_devices=self.cuda_devices, + subprocess_start_method="forkserver" if os.name != "nt" else "spawn", + metrics=self.config.get("metrics", {}), + oscar_config=self.config.get("oscar"), + ) + + async def start_services(self): + return await start_worker( + self.pool.external_address, + self.args.supervisors, + self.band_to_resource, + list(self.args.load_modules), + self.config, + ) + + 
async def stop_services(self): + return await stop_worker(self.pool.external_address, self.config) + + +main = WorkerCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/deploy/tests/__init__.py b/python/xorbits/_mars/deploy/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/deploy/tests/base_test_cfg.yml b/python/xorbits/_mars/deploy/tests/base_test_cfg.yml new file mode 100644 index 000000000..e7a53db89 --- /dev/null +++ b/python/xorbits/_mars/deploy/tests/base_test_cfg.yml @@ -0,0 +1,13 @@ +"@inherits": '@default' +test_list: + - item1 + - item2 +test_list2: + - item1 + - item2 +test_dict: + key1: val1 + key2: + key2_key1: + val2 +"@overriding_fields": ["test_list2"] diff --git a/python/xorbits/_mars/deploy/tests/inherit_test_cfg1.yml b/python/xorbits/_mars/deploy/tests/inherit_test_cfg1.yml new file mode 100644 index 000000000..4ee50c47a --- /dev/null +++ b/python/xorbits/_mars/deploy/tests/inherit_test_cfg1.yml @@ -0,0 +1,5 @@ +"@inherits": '@mars/deploy/tests/base_test_cfg.yml' +test_list: + - item3 +test_list2: # overriding + - item3 diff --git a/python/xorbits/_mars/deploy/tests/inherit_test_cfg2.yml b/python/xorbits/_mars/deploy/tests/inherit_test_cfg2.yml new file mode 100644 index 000000000..ffdaf7181 --- /dev/null +++ b/python/xorbits/_mars/deploy/tests/inherit_test_cfg2.yml @@ -0,0 +1,6 @@ +"@inherits": inherit_test_cfg1.yml +test_dict: + key2: + key2_key1: + val2_modified + key3: val3 diff --git a/python/xorbits/_mars/deploy/tests/test_utils.py b/python/xorbits/_mars/deploy/tests/test_utils.py new file mode 100644 index 000000000..b6d4af9e5 --- /dev/null +++ b/python/xorbits/_mars/deploy/tests/test_utils.py @@ -0,0 +1,105 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
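+
+# Tests for deploy utilities: service config loading with @inherits and
+# @overriding_fields, per-role third-party module resolution (including the
+# MARS_LOAD_MODULES environment variable), and next_in_thread.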
+ +import os + +import pytest + +from ...services import NodeRole +from ..utils import ( + get_third_party_modules_from_config, + load_service_config_file, + next_in_thread, +) + +_cwd = os.path.abspath(os.getcwd()) + + +@pytest.mark.parametrize("cwd", [_cwd, os.path.dirname(_cwd)]) +def test_load_service_config(cwd): + old_cwd = os.getcwd() + try: + os.chdir(cwd) + cfg = load_service_config_file( + os.path.join(os.path.dirname(__file__), "inherit_test_cfg2.yml") + ) + + assert "services" in cfg + assert cfg["test_list"] == ["item1", "item2", "item3"] + assert cfg["test_list2"] == ["item3"] + assert set(cfg["test_dict"].keys()) == {"key1", "key2", "key3"} + assert set(cfg["test_dict"]["key2"].values()) == {"val2_modified"} + assert all(not k.startswith("@") for k in cfg.keys()) + finally: + os.chdir(old_cwd) + + +def test_get_third_party_modules_from_config(): + r = get_third_party_modules_from_config({}, NodeRole.SUPERVISOR) + assert r == [] + + r = get_third_party_modules_from_config({}, NodeRole.WORKER) + assert r == [] + + config = {"third_party_modules": {"supervisor": ["a.module"]}} + r = get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + assert r == ["a.module"] + r = get_third_party_modules_from_config(config, NodeRole.WORKER) + assert r == [] + + config = {"third_party_modules": {"worker": ["b.module"]}} + r = get_third_party_modules_from_config(config, NodeRole.WORKER) + assert r == ["b.module"] + r = get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + assert r == [] + + config = {"third_party_modules": ["ab.module"]} + r = get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + assert r == ["ab.module"] + r = get_third_party_modules_from_config(config, NodeRole.WORKER) + assert r == ["ab.module"] + + os.environ["MARS_LOAD_MODULES"] = "c.module,d.module" + try: + r = get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + assert r == ["ab.module", "c.module", "d.module"] + r = get_third_party_modules_from_config(config, NodeRole.WORKER) + assert r == ["ab.module", "c.module", "d.module"] + r = get_third_party_modules_from_config({}, NodeRole.SUPERVISOR) + assert r == ["c.module", "d.module"] + r = get_third_party_modules_from_config({}, NodeRole.WORKER) + assert r == ["c.module", "d.module"] + finally: + os.environ.pop("MARS_LOAD_MODULES", None) + + config = {"third_party_modules": "ab.module"} + with pytest.raises(TypeError, match="str"): + get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + config = {"third_party_modules": {"supervisor": "a.module"}} + with pytest.raises(TypeError, match="str"): + get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + + +@pytest.mark.asyncio +async def test_next_in_thread(): + def gen_fun(): + yield 1 + yield 2 + + gen = gen_fun() + + assert await next_in_thread(gen) == 1 + assert await next_in_thread(gen) == 2 + with pytest.raises(StopAsyncIteration): + await next_in_thread(gen) diff --git a/python/xorbits/_mars/deploy/utils.py b/python/xorbits/_mars/deploy/utils.py new file mode 100644 index 000000000..b378a4589 --- /dev/null +++ b/python/xorbits/_mars/deploy/utils.py @@ -0,0 +1,223 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
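# --- Illustrative sketch, not part of the patch ---------------------------------
# Why next_in_thread (tested above, defined in deploy/utils.py below) exists: it
# pulls items from a blocking, synchronous generator without stalling the event
# loop. Standalone copy for demonstration; needs Python 3.9+ for
# asyncio.to_thread. The slow generator is a made-up stand-in for blocking I/O.
import asyncio
import time


async def next_in_thread(gen):
    res = await asyncio.to_thread(next, gen, StopIteration)
    if res is StopIteration:
        raise StopAsyncIteration
    return res


def slow_numbers():
    for i in range(3):
        time.sleep(0.1)  # pretend this blocks on I/O
        yield i


async def main():
    gen = slow_numbers()
    while True:
        try:
            print(await next_in_thread(gen))  # event loop stays responsive
        except StopAsyncIteration:
            break


asyncio.run(main())
# ---------------------------------------------------------------------------------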
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import time +import warnings +from typing import Callable, Dict, List, TextIO, Union + +import yaml + +from ..services import NodeRole +from ..utils import flatten_dict_to_nested_dict, merge_dict + +DEFAULT_CONFIG_FILE = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "oscar/config.yml" +) + + +def wait_services_ready( + selectors: List, min_counts: List[int], count_fun: Callable, timeout=None +): + readies = [0] * len(selectors) + start_time = time.time() + while True: + all_satisfy = True + for idx, selector in enumerate(selectors): + if readies[idx] < min_counts[idx]: + all_satisfy = False + readies[idx] = count_fun(selector) + break + if all_satisfy: + break + if timeout and timeout + start_time < time.time(): + raise TimeoutError("Wait cluster start timeout") + time.sleep(1) + + +def load_service_config_file(path: Union[str, TextIO]) -> Dict: + mars_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + + cfg_stack = [] # type: List[Dict] + cfg_file_set = set() + if isinstance(path, str): + path = os.path.abspath(path) + + while path is not None: + if path in cfg_file_set: # pragma: no cover + raise ValueError("Recursive config inherit detected") + + if not hasattr(path, "read"): + with open(path) as file: + cfg = yaml.safe_load(file) + else: + cfg = yaml.safe_load(path) + cfg_stack.append(cfg) + cfg_file_set.add(path) + + inherit_path = cfg.pop("@inherits", None) + if not inherit_path: + path = None + elif os.path.isfile(inherit_path): + path = inherit_path + elif inherit_path == "@default": + path = DEFAULT_CONFIG_FILE + elif inherit_path.startswith("@mars"): + path = inherit_path.replace("@mars", mars_path) + else: + path = os.path.join(os.path.dirname(path), inherit_path) + + def _override_cfg(src: Union[Dict, List], override: Union[Dict, List]): + if isinstance(override, dict): + overriding_fields = set(src.get("@overriding_fields") or set()) + for key, val in override.items(): + if ( + key not in src + or not isinstance(val, (list, dict)) + or key in overriding_fields + ): + src[key] = val + else: + _override_cfg(src[key], override[key]) + else: + src.extend(override) + + def _clear_meta_cfg(src: Dict): + meta_keys = [] + for k, v in src.items(): + if k.startswith("@"): + meta_keys.append(k) + elif isinstance(v, dict): + _clear_meta_cfg(v) + + for k in meta_keys: + src.pop(k) + + cfg = cfg_stack[-1] + for new_cfg in cfg_stack[-2::-1]: + _override_cfg(cfg, new_cfg) + + _clear_meta_cfg(cfg) + return cfg + + +def _merge_config(full_config: Dict, config: Dict) -> Dict: + """ + Merge the config to full_config, the config support flatten key, e.g. + + config={ + 'scheduling.autoscale.enabled': True, + 'scheduling.autoscale.scheduler_check_interval': 1, + 'scheduling.autoscale.scheduler_backlog_timeout': 1, + 'scheduling.autoscale.worker_idle_timeout': 10, + 'scheduling.autoscale.min_workers': 1, + 'scheduling.autoscale.max_workers': 4 + } + """ + if not config: + return full_config + if not isinstance(config, Dict): # pragma: no cover + raise ValueError( + f"The config should be a dict, but the type is {type(config)}." 
+ ) + flatten_keys = set(k for k in config.keys() if isinstance(k, str) and "." in k) + nested_flatten_config = flatten_dict_to_nested_dict( + {k: config[k] for k in flatten_keys} + ) + nested_config = {k: config[k] for k in config.keys() if k not in flatten_keys} + config = merge_dict(nested_config, nested_flatten_config, overwrite=False) + merge_dict(full_config, config) + return full_config + + +def load_config(config: Union[str, Dict], default_config_file: str): + """ + Load config based on the default_config. + """ + # use default config + if isinstance(config, str): + filename = config + config = load_service_config_file(filename) + else: + full_config = load_service_config_file(default_config_file) + config = _merge_config(full_config, config) + if config["scheduling"]["speculation"]["enabled"] is True: + # if `initial_same_color_num` > 1, coloring based fusion will make subtask too heterogeneous such that + # the speculative scheduler can't get enough homogeneous subtasks to calculate statistics + warnings.warn( + "speculative execution is enabled, set initial_same_color_num to 1 to " + "ensure enough homogeneous subtasks to calculate statistics." + ) + config["task"]["default_config"]["initial_same_color_num"] = 1 + ray_execution_config = config["task"]["execution_config"].setdefault("ray", {}) + subtask_max_retries = config["scheduling"].get("subtask_max_retries") + if subtask_max_retries is not None: + ray_execution_config.setdefault("subtask_max_retries", subtask_max_retries) + return config + + +async def wait_all_supervisors_ready(endpoint): + """ + Wait till all containers are ready + """ + from ..services.cluster import ClusterAPI + + cluster_api = None + + while True: + try: + cluster_api = await ClusterAPI.create(endpoint) + break + except: # noqa: E722 # pylint: disable=bare-except # pragma: no cover + await asyncio.sleep(0.1) + continue + + assert cluster_api is not None + await cluster_api.wait_all_supervisors_ready() + + +def get_third_party_modules_from_config(config: Dict, role: NodeRole, environ=None): + environ = environ or os.environ + third_party_modules = config.get("third_party_modules", []) + if isinstance(third_party_modules, list): + modules = third_party_modules + elif isinstance(third_party_modules, dict): + key = { + NodeRole.SUPERVISOR: "supervisor", + NodeRole.WORKER: "worker", + } + modules = third_party_modules.get(key[role], []) + if not isinstance(modules, list): + raise TypeError( + f"The value type of third_party_modules.{key[role]} " + f"should be a list, but got a {type(modules)} instead." + ) + else: + raise TypeError( + f"The value type of third_party_modules should be a list " + f"or dict, but got a {type(third_party_modules)} instead." + ) + + all_modules = [] + for mods in tuple(modules or ()) + (environ.get("MARS_LOAD_MODULES"),): + all_modules.extend(mods.split(",") if mods else []) + return all_modules + + +async def next_in_thread(gen): + res = await asyncio.to_thread(next, gen, StopIteration) + if res is StopIteration: + raise StopAsyncIteration + return res diff --git a/python/xorbits/_mars/deploy/yarn/__init__.py b/python/xorbits/_mars/deploy/yarn/__init__.py new file mode 100644 index 000000000..82f91f84c --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
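# --- Illustrative sketch, not part of the patch ---------------------------------
# What _merge_config above does with flattened keys. flatten_dict_to_nested_dict
# and merge_dict live in ..utils in the real code; tiny stand-ins are inlined
# here so the snippet runs on its own, and the behaviour shown is a simplification.
def flatten_to_nested(flat):
    nested = {}
    for key, value in flat.items():
        node = nested
        *parents, leaf = key.split(".")
        for part in parents:
            node = node.setdefault(part, {})
        node[leaf] = value
    return nested


def merge(dst, src):
    for key, value in src.items():
        if isinstance(value, dict) and isinstance(dst.get(key), dict):
            merge(dst[key], value)
        else:
            dst[key] = value
    return dst


full_config = {"scheduling": {"autoscale": {"enabled": False, "min_workers": 1}}}
user_config = {
    "scheduling.autoscale.enabled": True,
    "scheduling.autoscale.max_workers": 4,
}
merge(full_config, flatten_to_nested(user_config))
print(full_config)
# {'scheduling': {'autoscale': {'enabled': True, 'min_workers': 1, 'max_workers': 4}}}
# ---------------------------------------------------------------------------------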
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .client import YarnClusterClient, new_cluster diff --git a/python/xorbits/_mars/deploy/yarn/client.py b/python/xorbits/_mars/deploy/yarn/client.py new file mode 100644 index 000000000..109dc67e2 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/client.py @@ -0,0 +1,225 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import random +import time +import uuid + +from ...session import new_session +from ...utils import calc_size_by_str, to_str +from ..utils import wait_services_ready +from .config import MarsApplicationConfig, MarsSupervisorConfig, MarsWorkerConfig + +logger = logging.getLogger(__name__) + + +class YarnClusterClient: + def __init__(self, skein_client, application_id, endpoint, is_client_managed=False): + self._skein_client = skein_client + self._is_client_managed = is_client_managed + self._application_id = application_id + self._endpoint = endpoint + self._session = new_session(endpoint) + + @property + def session(self): + return self._session + + @property + def endpoint(self): + return self._endpoint + + @property + def application_id(self): + return self._application_id + + def stop(self, status="SUCCEEDED"): + import skein + + try: + skein_client = skein.Client() + app_client = skein_client.connect(self._application_id) + app_client.shutdown(status=status) + if self._is_client_managed: + self._skein_client.close() + except skein.ApplicationNotRunningError: + pass + + +def _get_ready_container_count(app_client, svc): + container_ids = set( + c.yarn_container_id for c in app_client.get_containers([svc], ["RUNNING"]) + ) + prefixes = app_client.kv.get_prefix(svc) + registered_ids = set(to_str(v).rsplit("@", 1)[-1] for v in prefixes.values()) + return len(container_ids.intersection(registered_ids)) + + +def new_cluster( + environment=None, + supervisor_num=1, + supervisor_cpu=None, + supervisor_mem=None, + worker_num=1, + worker_cpu=None, + worker_mem=None, + worker_spill_paths=None, + worker_cache_mem=None, + min_worker_num=None, + timeout=None, + log_config=None, + skein_client=None, + app_name=None, + app_queue=None, + **kwargs, +): + import skein + + from .supervisor import YarnSupervisorCommandRunner + + def _override_envs(src, updates): + ret = src.copy() + ret.update(updates) + return ret + + if worker_cpu is None or worker_mem is None: # pragma: no cover + raise TypeError("`worker_cpu` and `worker_mem` must be specified") + + app_name = app_name or f"mars-app-{uuid.uuid4()}" + supervisor_mem = calc_size_by_str(supervisor_mem, None) + worker_mem = calc_size_by_str(worker_mem, None) + + 
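# --- Illustrative sketch, not part of the patch ---------------------------------
# The key-value convention that _get_ready_container_count above relies on: each
# container registers itself under its service prefix as
# "<endpoint>@<yarn_container_id>" (see YarnServiceMixin.register_endpoint later
# in this patch), and readiness is the overlap between RUNNING containers and
# registered ids. Container ids and endpoints below are made up.
def count_ready(running_container_ids, registered_values):
    registered_ids = {value.rsplit("@", 1)[-1] for value in registered_values}
    return len(set(running_container_ids) & registered_ids)


running = ["container_0001_01_000002", "container_0001_01_000003"]
registered = [
    "10.0.0.5:24567@container_0001_01_000002",  # running and registered -> ready
    "10.0.0.6:24567@container_0001_01_000009",  # registered but no longer running
]
print(count_ready(running, registered))  # -> 1
# ---------------------------------------------------------------------------------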
log_when_fail = kwargs.pop("log_when_fail", False) + + supervisor_extra_modules = kwargs.pop("supervisor_extra_modules", None) + worker_extra_modules = kwargs.pop("worker_extra_modules", None) + + cmd_tmpl = kwargs.pop("cmd_tmpl", None) + + extra_envs = kwargs.pop("extra_env", dict()) + supervisor_extra_env = _override_envs( + extra_envs, kwargs.pop("supervisor_extra_env", dict()) + ) + worker_extra_env = _override_envs( + extra_envs, kwargs.pop("worker_extra_env", dict()) + ) + + extra_args = kwargs.pop("extra_args", "") + supervisor_extra_args = ( + extra_args + " " + kwargs.pop("supervisor_extra_args", "") + ).strip() + worker_extra_args = (extra_args + " " + kwargs.pop("worker_extra_args", "")).strip() + + supervisor_log_config = kwargs.pop("supervisor_log_config", log_config) + worker_log_config = kwargs.pop("worker_log_config", log_config) + + supervisor_config = MarsSupervisorConfig( + instances=supervisor_num, + environment=environment, + cpu=supervisor_cpu, + memory=supervisor_mem, + modules=supervisor_extra_modules, + env=supervisor_extra_env, + log_config=supervisor_log_config, + extra_args=supervisor_extra_args, + cmd_tmpl=cmd_tmpl, + ) + worker_config = MarsWorkerConfig( + instances=worker_num, + environment=environment, + cpu=worker_cpu, + memory=worker_mem, + spill_dirs=worker_spill_paths, + worker_cache_mem=worker_cache_mem, + modules=worker_extra_modules, + env=worker_extra_env, + log_config=worker_log_config, + extra_args=worker_extra_args, + cmd_tmpl=cmd_tmpl, + ) + app_config = MarsApplicationConfig( + app_name, + app_queue, + supervisor_config=supervisor_config, + worker_config=worker_config, + ) + + skein_client = skein_client or skein.Client() + app_id = None + try: + is_client_managed = skein_client is not None + app_id = skein_client.submit(app_config.build()) + + check_start_time = time.time() + while True: + try: + app_client = skein_client.connect(app_id) + break + except skein.ApplicationNotRunningError: # pragma: no cover + time.sleep(0.5) + if timeout and time.time() - check_start_time > timeout: + raise + + logger.debug( + "Application client for %s at %s retrieved", app_id, app_client.address + ) + + # wait until supervisors and expected num of workers are ready + min_worker_num = int(min_worker_num or worker_num) + limits = [supervisor_num, min_worker_num] + services = [MarsSupervisorConfig.service_name, MarsWorkerConfig.service_name] + + wait_services_ready( + services, + limits, + lambda svc: _get_ready_container_count(app_client, svc), + timeout=None if not timeout else timeout - (time.time() - check_start_time), + ) + web_endpoint_kv = app_client.kv.get_prefix( + YarnSupervisorCommandRunner.web_service_name + ) + web_endpoint = random.choice( + [to_str(v).split("@", 1)[0] for v in web_endpoint_kv.values()] + ) + return YarnClusterClient( + skein_client, + app_client.id, + web_endpoint, + is_client_managed=is_client_managed, + ) + except: # noqa: E722 + skein_client = skein.Client() + try: + if log_when_fail: + if app_id is not None: + try: + app_client = skein_client.connect(app_id) + app_client.shutdown(status="FAILED") + except skein.ApplicationNotRunningError: + pass + + try: + logs = skein_client.application_logs(app_id) + logger.error("Error when creating cluster:\n%s", logs.dumps()) + except ValueError: + logger.error( + "Error when creating cluster and failed to get logs" + ) + else: + logger.error("Error when creating cluster and no logs from cluster") + finally: + if app_id is not None: + skein_client.kill_application(app_id) + raise diff 
--git a/python/xorbits/_mars/deploy/yarn/config.py b/python/xorbits/_mars/deploy/yarn/config.py new file mode 100644 index 000000000..b5aaa6df0 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/config.py @@ -0,0 +1,308 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import textwrap +from urllib.parse import urlparse + +from ...utils import parse_readable_size + + +def _remove_nones(cfg): + return dict((k, v) for k, v in cfg.items() if v is not None) + + +def _get_local_app_module(mod_name): + return __name__.rsplit(".", 1)[0] + "." + mod_name.rsplit(".", 1)[-1] + + +class SecurityConfig: + def __init__(self, cert_file=None, key_file=None): + self._cert_file = cert_file + self._key_file = key_file + + def build(self): + return dict(cert_file=self._cert_file, key_file=self._key_file) + + +class AppFileConfig: + def __init__( + self, source, file_type=None, visibility=None, size=None, timestamp=None + ): + self._source = source + self._file_type = file_type + self._visibility = visibility + self._size = size + self._timestamp = timestamp + + def build(self): + if all( + v is None + for v in (self._file_type, self._visibility, self._size, self._timestamp) + ): + return self._source + else: + return _remove_nones( + dict( + source=self._source, + type=self._file_type, + visibility=self._visibility, + size=self._size, + timestamp=self._timestamp, + ) + ) + + +class AppContainerConfig: + def __init__(self, cpu=None, memory=None, env=None, files=None, script=None): + self._cpu = cpu + + if memory is not None: + real_mem, is_percent = parse_readable_size(memory) + assert not is_percent + self._memory = real_mem + else: + self._memory = None + + self._env = env + self._script = script + self._files = files + + self.add_default_envs() + + def build_script(self): + return self._script + + def add_default_envs(self): + pass + + def add_env(self, k, v): + if self._env is None: + self._env = dict() + self._env[k] = v + + def build(self): + return _remove_nones( + dict( + resources=dict( + vcores=self._cpu, + memory=f"{self._memory // 1024 ** 2} MiB" if self._memory else None, + ), + env=self._env, + script=self.build_script(), + files=dict((k, v.build()) for k, v in self._files.items()) + if self._files + else None, + ) + ) + + +class AppMasterConfig(AppContainerConfig): + def __init__(self, security=None, **kwargs): + super().__init__(**kwargs) + self._security = security + + def build(self): + d = super().build() + if self._security is not None: + d["security"] = self._security.build() + return d + + +class AppServiceConfig(AppContainerConfig): + def __init__( + self, instances=1, depends=None, allow_failures=False, max_restarts=0, **kwargs + ): + super().__init__(**kwargs) + if isinstance(depends, str): + depends = [depends] + + self._allow_failures = allow_failures + self._depends = depends or [] + self._max_restarts = max_restarts + self._instances = instances + + def build(self): + d = super().build() + d.update( + dict( + 
instances=self._instances, + depends=self._depends, + allow_failures=self._allow_failures, + max_restarts=self._max_restarts, + ) + ) + return d + + +class MarsServiceConfig(AppServiceConfig): + service_name = None + + def __init__( + self, + environment, + modules=None, + cmd_tmpl=None, + cpu=None, + memory=None, + log_config=None, + extra_args=None, + **kwargs, + ): + files = kwargs.pop("files", dict()) + kwargs["files"] = files + + parsed = urlparse(environment) + self._env_scheme = parsed.scheme + + if parsed.scheme: + import mars + + self._source_path = os.path.dirname( + os.path.dirname(os.path.abspath(mars.__file__)) + ) + + self._env_path = environment[len(parsed.scheme) + 3 :] + self._path_environ = os.environ["PATH"] + else: + self._source_path = None + self._env_path = environment + self._path_environ = None + + self._cmd_tmpl = cmd_tmpl or '"{executable}"' + if not self._env_scheme: + files["mars_env"] = AppFileConfig(environment) + + self._log_config = log_config + if log_config: + files["logging.conf"] = AppFileConfig(log_config) + + self._modules = modules.split(",") if isinstance(modules, str) else modules + + self._extra_args = extra_args or "" + + cpu = cpu or 1 + memory = memory or "1 GiB" + super().__init__(cpu=cpu, memory=memory, **kwargs) + + def add_default_envs(self): + if self._cpu: + self.add_env("MKL_NUM_THREADS", str(self._cpu)) + self.add_env("MARS_CPU_TOTAL", str(self._cpu)) + self.add_env("MARS_USE_PROCESS_STAT", "1") + + if self._memory: + self.add_env("MARS_MEMORY_TOTAL", str(int(self._memory))) + + if self._modules: + self.add_env("MARS_LOAD_MODULES", ",".join(self._modules)) + + if self._path_environ: + self.add_env("MARS_YARN_PATH", self._path_environ) + + if self._source_path: + self.add_env("MARS_SOURCE_PATH", self._source_path) + + def build_script(self): + bash_lines = [ + textwrap.dedent( + """ + #!/bin/bash + if [[ "$YARN_CONTAINER_RUNTIME_TYPE" == "docker" ]]; then + export MARS_USE_CGROUP_STAT=1 + else + export MARS_USE_PROCESS_STAT=1 + fi + if [[ -n $MARS_SOURCE_PATH ]]; then export PYTHONPATH=$PYTHONPATH:$MARS_SOURCE_PATH; fi + if [[ -n $MARS_YARN_PATH ]]; then export PATH=$MARS_YARN_PATH:$PATH; fi + """ + ).strip() + ] + + if not self._env_scheme: + bash_lines.append("source mars_env/bin/activate") + python_executable = "mars_env/bin/python" + elif self._env_scheme == "conda": + bash_lines.append(f'conda activate "{self._env_path}"') + python_executable = "python" + elif self._env_scheme == "venv": + bash_lines.append(f'source "{self._env_path}/bin/activate"') + python_executable = self._env_path + "/bin/python" + else: # pragma: no cover + python_executable = self._env_path + + cmd = self._cmd_tmpl.format(executable=python_executable) + bash_lines.append( + f"{cmd} -m {_get_local_app_module(self.service_name)} {self._extra_args} > /tmp/{self.service_name}.stdout.log 2> /tmp/{self.service_name}.stderr.log" + ) + return "\n".join(bash_lines) + "\n" + + +class MarsSupervisorConfig(MarsServiceConfig): + service_name = "mars.supervisor" + web_service_name = "mars.web" + + +class MarsWorkerConfig(MarsServiceConfig): + service_name = "mars.worker" + + def __init__(self, environment, worker_cache_mem=None, spill_dirs=None, **kwargs): + kwargs["depends"] = MarsSupervisorConfig.service_name + super().__init__(environment, **kwargs) + + if worker_cache_mem: + self.add_env("MARS_CACHE_MEM_SIZE", worker_cache_mem) + + if spill_dirs: + self.add_env( + "MARS_SPILL_DIRS", + spill_dirs if isinstance(spill_dirs, str) else ":".join(spill_dirs), + ) + + +class 
MarsApplicationConfig: + def __init__( + self, + name=None, + queue=None, + file_systems=None, + master=None, + supervisor_config=None, + worker_config=None, + ): + self._name = name + self._queue = queue or "default" + self._file_systems = file_systems or [] + self._master = master or AppMasterConfig(cpu=1, memory="512 MiB") + self._supervisor_config = supervisor_config + self._worker_config = worker_config + + def build(self): + services = _remove_nones( + { + MarsSupervisorConfig.service_name: self._supervisor_config.build() + if self._supervisor_config + else None, + MarsWorkerConfig.service_name: self._worker_config.build() + if self._worker_config + else None, + } + ) + return dict( + name=self._name, + queue=self._queue, + file_systems=self._file_systems, + master=self._master.build() if self._master else None, + services=services, + ) diff --git a/python/xorbits/_mars/deploy/yarn/config.yml b/python/xorbits/_mars/deploy/yarn/config.yml new file mode 100644 index 000000000..b3d0be729 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/config.yml @@ -0,0 +1,3 @@ +"@inherits": ../oscar/config.yml +cluster: + backend: yarn diff --git a/python/xorbits/_mars/deploy/yarn/core.py b/python/xorbits/_mars/deploy/yarn/core.py new file mode 100644 index 000000000..4277ec86e --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/core.py @@ -0,0 +1,200 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import os +import signal +import uuid +from collections import defaultdict +from typing import AsyncGenerator, Dict, List, Optional, TypeVar + +from ... 
import oscar as mo +from ...services import NodeRole +from ...services.cluster.backends import ( + AbstractClusterBackend, + register_cluster_backend, +) +from ...utils import to_binary, to_str +from ..utils import wait_all_supervisors_ready +from .config import MarsSupervisorConfig, MarsWorkerConfig + +try: + from skein import ApplicationClient + from skein import Client as SkeinClient + from skein import ConnectionError as SkeinConnectionError + from skein import SkeinError + from skein import properties as skein_props +except ImportError: # pragma: no cover + ApplicationClient, SkeinClient, skein_props = None, None, None + SkeinConnectionError, SkeinError = None, None + +RetType = TypeVar("RetType") +logger = logging.getLogger(__name__) + +_role_to_config = { + NodeRole.SUPERVISOR: MarsSupervisorConfig, + NodeRole.WORKER: MarsWorkerConfig, +} + + +class YarnNodeWatchActor(mo.Actor): + def __init__(self): + assert ApplicationClient is not None + self._app_client = ApplicationClient.from_current() + + self._nodes = defaultdict(set) + self._supervisor_watch_task = None + self._role_to_events = defaultdict(list) + + async def __post_create__(self): + self._supervisor_watch_task = asyncio.create_task( + self._watch_nodes(NodeRole.SUPERVISOR) + ) + + async def __pre_destroy__(self): + if self._supervisor_watch_task is not None: # pragma: no branch + self._watch_task.cancel() + + async def get_container_mappings(self, role: NodeRole) -> Dict[str, str]: + key_prefix = _role_to_config[role].service_name + + container_specs = await asyncio.to_thread( + self._app_client.get_containers, [key_prefix] + ) + cid_to_endpoint = {c.yarn_container_id: None for c in container_specs} + + prefixes = await asyncio.to_thread(self._app_client.kv.get_prefix, key_prefix) + for val in prefixes.values(): + ep, cid = to_str(val).split("@", 1) + cid_to_endpoint[cid] = ep + return cid_to_endpoint + + async def _watch_nodes(self, role: NodeRole): + while True: + try: + mappings = await self.get_container_mappings(role) + eps = set(v for v in mappings.values() if v is not None) + + if eps != self._nodes[role]: + logger.info("New endpoints retrieved: %r", eps) + events = self._role_to_events.pop(role, []) + for ev in events: + ev.set() + self._nodes[role] = eps + await asyncio.sleep(1) + except SkeinConnectionError: # pragma: no cover + logger.warning("Skein application down, process will terminate") + os.kill(os.getpid(), signal.SIGTERM) + except (SkeinError, asyncio.CancelledError): # pragma: no cover + logger.exception("Error when watching nodes") + break + + async def get_nodes(self, role: NodeRole) -> List[str]: + if not self._nodes[role]: + mappings = await self.get_container_mappings(role) + eps = set(v for v in mappings.values() if v is not None) + self._nodes[role] = eps + return list(self._nodes[role]) + + async def wait_nodes(self, role: NodeRole): + event = asyncio.Event() + self._role_to_events[role].append(event) + + async def waiter(): + await event.wait() + return list(self._supervisors) + + return waiter() + + +@register_cluster_backend +class YarnClusterBackend(AbstractClusterBackend): + name = "yarn" + + def __init__(self, pool_address: str, watch_ref: mo.ActorRef = None): + self._pool_address = pool_address + self._watch_ref = watch_ref + + @classmethod + async def create( + cls, node_role: NodeRole, lookup_address: Optional[str], pool_address: str + ) -> "AbstractClusterBackend": + try: + ref = await mo.create_actor( + YarnNodeWatchActor, + uid=YarnNodeWatchActor.default_uid(), + 
address=pool_address, + ) + except mo.ActorAlreadyExist: # pragma: no cover + ref = await mo.actor_ref( + YarnNodeWatchActor.default_uid(), address=pool_address + ) + return YarnClusterBackend(pool_address, ref) + + async def get_supervisors(self, filter_ready: bool = True) -> List[str]: + if filter_ready: + return await self._watch_ref.get_nodes(NodeRole.SUPERVISOR) + else: + mapping = await self._watch_ref.get_container_mappings(NodeRole.SUPERVISOR) + return [v if v is not None else k for k, v in mapping.items()] + + async def watch_supervisors(self) -> AsyncGenerator[List[str], None]: + while True: + yield await self._watch_ref.wait_nodes(NodeRole.SUPERVISOR) + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> str: + raise NotImplementedError + + async def release_worker(self, address: str): + raise NotImplementedError + + async def reconstruct_worker(self, address: str): + raise NotImplementedError + + +class YarnServiceMixin(object): + service_name = None + + @property + def app_client(self): + if not hasattr(self, "_app_client"): + self._app_client = ApplicationClient.from_current() + return self._app_client + + def get_container_ip(self): + svc_containers = self.app_client.get_containers([self.service_name]) + container = next( + c + for c in svc_containers + if c.yarn_container_id == skein_props["yarn_container_id"] + ) + return container.yarn_node_http_address.split(":")[0] + + def register_endpoint(self, prefix: str = None, endpoint: str = None): + prefix = prefix or self.service_name + endpoint = endpoint or self.args.endpoint + + container_key = prefix + "-" + str(uuid.uuid1()) + self.app_client.kv[container_key] = to_binary( + f'{endpoint}@{skein_props["yarn_container_id"]}' + ) + + async def wait_all_supervisors_ready(self): + """ + Wait till all containers are ready, both in yarn and in Cluster Service + """ + await wait_all_supervisors_ready(self.args.endpoint) diff --git a/python/xorbits/_mars/deploy/yarn/supervisor.py b/python/xorbits/_mars/deploy/yarn/supervisor.py new file mode 100644 index 000000000..2e21a0f58 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/supervisor.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
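# --- Illustrative sketch, not part of the patch ---------------------------------
# The fallback in YarnClusterBackend.get_supervisors above: with
# filter_ready=False the backend reports the registered endpoint when one exists
# and the raw YARN container id otherwise, so callers can also see containers
# that have not finished starting. Ids and endpoints are placeholders.
def supervisors_from_mapping(cid_to_endpoint, filter_ready):
    if filter_ready:
        return [ep for ep in cid_to_endpoint.values() if ep is not None]
    return [ep if ep is not None else cid for cid, ep in cid_to_endpoint.items()]


mapping = {
    "container_0001_01_000002": "10.0.0.5:7103",  # registered in the skein kv store
    "container_0001_01_000003": None,             # launched, not yet registered
}
print(supervisors_from_mapping(mapping, filter_ready=True))
# ['10.0.0.5:7103']
print(supervisors_from_mapping(mapping, filter_ready=False))
# ['10.0.0.5:7103', 'container_0001_01_000003']
# ---------------------------------------------------------------------------------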
+ +import os + +from ..oscar.supervisor import SupervisorCommandRunner +from .config import MarsSupervisorConfig +from .core import YarnServiceMixin + + +class YarnSupervisorCommandRunner(YarnServiceMixin, SupervisorCommandRunner): + service_name = MarsSupervisorConfig.service_name + web_service_name = MarsSupervisorConfig.web_service_name + + def __call__(self, *args, **kwargs): + os.environ["MARS_CONTAINER_IP"] = self.get_container_ip() + return super().__call__(*args, **kwargs) + + async def start_services(self): + self.register_endpoint() + + await super().start_services() + + from ...services.web import OscarWebAPI + + web_api = await OscarWebAPI.create(self.args.endpoint) + web_endpoint = await web_api.get_web_address() + self.register_endpoint(self.web_service_name, web_endpoint) + + +main = YarnSupervisorCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/deploy/yarn/tests/__init__.py b/python/xorbits/_mars/deploy/yarn/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/deploy/yarn/tests/test_config.py b/python/xorbits/_mars/deploy/yarn/tests/test_config.py new file mode 100644 index 000000000..a78b2013c --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/tests/test_config.py @@ -0,0 +1,133 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from .... 
import __file__ as mars_file +from ..config import ( + AppFileConfig, + AppMasterConfig, + MarsApplicationConfig, + MarsSupervisorConfig, + MarsWorkerConfig, + SecurityConfig, +) + + +def test_simple_object(): + config = SecurityConfig("/path/to/cert.pem", "/path/to/key.pem").build() + assert config["cert_file"] == "/path/to/cert.pem" + assert config["key_file"] == "/path/to/key.pem" + + config = AppFileConfig(source="/path/to/file").build() + assert config == "/path/to/file" + config = AppFileConfig(source="/path/to/file", file_type="archive").build() + assert config["source"] == "/path/to/file" + assert config["type"] == "archive" + + config = AppMasterConfig( + security=SecurityConfig("/path/to/cert.pem", "/path/to/key.pem"), + cpu=1, + memory="512 MiB", + ).build() + assert config["security"]["cert_file"] == "/path/to/cert.pem" + assert config["security"]["key_file"] == "/path/to/key.pem" + assert config["resources"]["vcores"] == 1 + + +def test_supervisor_config(): + config = MarsSupervisorConfig( + "/path/to/packed.tar.gz", + "mars.test_mod", + cpu=2, + memory="10 GiB", + env={"TEST_ENV": "test_val"}, + extra_args="-Dsupervisor.default_cpu_usage=0", + ).build() + assert config["files"]["mars_env"] == "/path/to/packed.tar.gz" + assert "mars.deploy.yarn.supervisor" in config["script"] + + config_envs = config["env"] + assert config_envs["TEST_ENV"] == "test_val" + assert config_envs["MKL_NUM_THREADS"] == "2" + assert config_envs["MARS_CPU_TOTAL"] == "2" + assert int(config_envs["MARS_MEMORY_TOTAL"]) == 10 * 1024**3 + assert config_envs["MARS_LOAD_MODULES"] == "mars.test_mod" + + config = MarsSupervisorConfig( + "conda://path/to_env", + "mars.test_mod", + cpu=2, + memory="10 GiB", + log_config="logging.conf", + env={"TEST_ENV": "test_val"}, + extra_args="-Dsupervisor.default_cpu_usage=0", + ).build() + config_envs = config["env"] + assert config_envs["MARS_SOURCE_PATH"] == os.path.dirname( + os.path.dirname(mars_file) + ) + + config = MarsSupervisorConfig( + "venv://path/to_env", + "mars.test_mod", + cpu=2, + log_config="logging.conf", + env={"TEST_ENV": "test_val"}, + extra_args="-Dsupervisor.default_cpu_usage=0", + ).build() + config_envs = config["env"] + assert config_envs["MARS_SOURCE_PATH"] == os.path.dirname( + os.path.dirname(mars_file) + ) + + +def test_worker_config(): + config = MarsWorkerConfig("/path/to/packed.tar.gz").build() + assert "mars.deploy.yarn.worker" in config["script"] + assert config["depends"] == [MarsSupervisorConfig.service_name] + + config = MarsWorkerConfig( + "/path/to/packed.tar.gz", + worker_cache_mem="10g", + spill_dirs=["/spill/dir1", "/spill/dir2"], + ).build() + config_envs = config["env"] + assert config_envs["MARS_CACHE_MEM_SIZE"] == "10g" + assert config_envs["MARS_SPILL_DIRS"].split(":") == ["/spill/dir1", "/spill/dir2"] + + +def test_app_config(): + supervisor_config = MarsSupervisorConfig( + "/path/to/packed.tar.gz", + "mars.test_mod", + cpu=2, + memory="10 GiB", + env={"TEST_ENV": "test_val"}, + extra_args="-Dsupervisor.default_cpu_usage=0", + ) + worker_config = MarsWorkerConfig( + "/path/to/packed.tar.gz", + worker_cache_mem="10g", + spill_dirs=["/spill/dir1", "/spill/dir2"], + ) + + config = MarsApplicationConfig( + name="config-name", + queue="default", + supervisor_config=supervisor_config, + worker_config=worker_config, + ).build() + assert config["name"] == "config-name" + assert config["queue"] == "default" diff --git a/python/xorbits/_mars/deploy/yarn/tests/test_yarn.py b/python/xorbits/_mars/deploy/yarn/tests/test_yarn.py new file 
mode 100644 index 000000000..dd2b1738e --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/tests/test_yarn.py @@ -0,0 +1,183 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import logging +import os +import shutil +import sqlite3 +import subprocess +import tempfile +import time +from distutils.spawn import find_executable + +import numpy as np +import pytest + +from .... import tensor as mt +from ....tests.core import flaky, require_hadoop +from ...yarn import new_cluster + +logger = logging.getLogger(__name__) +MARS_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(mt.__file__))) + + +def _collect_coverage(): + time.sleep(5) + dist_coverage_path = os.path.join(MARS_ROOT, ".dist-coverage") + if os.path.exists(dist_coverage_path): + # change ownership of coverage files + if find_executable("sudo"): + proc = subprocess.Popen( + [ + "sudo", + "-n", + "chown", + "-R", + f"{os.geteuid()}:{os.getegid()}", + dist_coverage_path, + ], + shell=False, + ) + proc.wait() + + # rewrite paths in coverage result files + for fn in glob.glob(os.path.join(dist_coverage_path, ".coverage.*")): + cov_db = sqlite3.connect(fn) + c = cov_db.cursor() + c.execute( + f"UPDATE file SET path=REPLACE(path, '{MARS_ROOT + os.path.sep}', '')" + ) + cov_db.commit() + cov_db.close() + + if "COVERAGE_FILE" in os.environ: + new_cov_file = os.environ["COVERAGE_FILE"] + os.path.basename( + fn + ).replace(".coverage", "") + else: + new_cov_file = fn.replace(".dist-coverage" + os.sep, "") + shutil.copyfile(fn, new_cov_file) + shutil.rmtree(dist_coverage_path) + + +def _run_yarn_test_with_env(env_path, timeout): + cluster = None + + coverage_result = os.path.join(MARS_ROOT, ".dist-coverage", ".coverage") + cov_dir = os.path.join(MARS_ROOT, ".dist-coverage") + os.makedirs(cov_dir, exist_ok=True) + os.chmod(cov_dir, 0o777) + try: + log_config_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "yarn-logging.conf" + ) + + cmd_tmpl = ( + '"{executable}" -m coverage run --source=%s/mars --rcfile=%s/setup.cfg' + % (MARS_ROOT, MARS_ROOT) + ) + extra_env = { + "COVERAGE_FILE": coverage_result, + "COVERAGE_PROCESS_START": f"{MARS_ROOT}/setup.cfg", + } + cluster = new_cluster( + env_path, + timeout=timeout, + worker_cpu=1, + worker_mem="1G", + extra_env=extra_env, + log_config=log_config_file, + extra_args=f"--config-file {MARS_ROOT}/mars/deploy/yarn/tests/test_yarn_config.yml", + log_when_fail=True, + cmd_tmpl=cmd_tmpl, + ) + assert cluster.endpoint is not None + + check_time = time.time() + while cluster.session.get_total_n_cpu() == 0: + time.sleep(1) + if time.time() - check_time > 5: + raise SystemError("Worker not ready") + + a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1 + b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1 + r = (a * b * 2 + 1).sum().execute().fetch() + + expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1 + np.testing.assert_array_equal(r, expected.sum()) + finally: + if cluster is not None: + cluster.stop() + _collect_coverage() + + 
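# --- Illustrative sketch, not part of the patch ---------------------------------
# The user-facing flow that _run_yarn_test_with_env above drives, assuming the
# vendored package is importable as xorbits._mars. The environment path,
# resource figures and timeout are placeholders.
from xorbits._mars import tensor as mt
from xorbits._mars.deploy.yarn import new_cluster

cluster = new_cluster(
    "conda:///opt/conda/envs/mars-env",  # or a packed tarball / venv:// path
    worker_num=2,
    worker_cpu=4,       # worker_cpu and worker_mem are required
    worker_mem="8G",
    timeout=600,
)
try:
    # the client creates a session against the cluster's web endpoint,
    # so tensor graphs execute on the YARN containers
    result = (mt.ones((100, 100), chunk_size=30) * 2 + 1).sum().execute().fetch()
    print(result)
finally:
    cluster.stop()
# ---------------------------------------------------------------------------------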
+@require_hadoop +@flaky(max_runs=3) +def test_run_with_conda_env(): + _run_yarn_test_with_env("conda://" + os.environ["CONDA_PREFIX"], 600) + + +@require_hadoop +@flaky(max_runs=3) +def test_run_with_packed_env(): + import conda_pack + + temp_dir = os.environ.get("MARS_YARN_TEST_DIR") + clean_after_test = False + if temp_dir is None: + clean_after_test = True + temp_dir = tempfile.mkdtemp(prefix="test-mars-yarn-") + else: + os.makedirs(temp_dir, exist_ok=True) + + packed_env_file = os.path.join(temp_dir, "mars-test-env.tar.gz") + if not os.path.exists(packed_env_file): + try: + conda_pack.pack(output=packed_env_file, ignore_editable_packages=True) + except conda_pack.CondaPackException: + logger.exception("Failed to pack environment, this test will be skipped") + return + + try: + _run_yarn_test_with_env(packed_env_file, 1200) + finally: + if clean_after_test: + shutil.rmtree(temp_dir) + + +@require_hadoop +@flaky(max_runs=3) +def test_create_timeout(): + cluster = None + try: + env_path = "conda://" + os.environ["CONDA_PREFIX"] + log_config_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "yarn-logging.conf" + ) + + with pytest.raises(TimeoutError): + cluster = new_cluster( + env_path, + log_config=log_config_file, + worker_cpu=1, + worker_mem="1G", + worker_cache_mem="64m", + log_when_fail=True, + timeout=1, + ) + finally: + if cluster is not None: + cluster.stop() + _collect_coverage() diff --git a/python/xorbits/_mars/deploy/yarn/tests/test_yarn_config.yml b/python/xorbits/_mars/deploy/yarn/tests/test_yarn_config.yml new file mode 100644 index 000000000..489d11d29 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/tests/test_yarn_config.yml @@ -0,0 +1,4 @@ +"@inherits": '@mars/deploy/yarn/config.yml' +storage: + plasma: + store_memory: 20M diff --git a/python/xorbits/_mars/deploy/yarn/tests/yarn-logging.conf b/python/xorbits/_mars/deploy/yarn/tests/yarn-logging.conf new file mode 100644 index 000000000..320ca6cb3 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/tests/yarn-logging.conf @@ -0,0 +1,50 @@ +[loggers] +keys=root,main,deploy,services,oscar,tornado + +[handlers] +keys=stream_handler + +[formatters] +keys=formatter + +[logger_root] +level=WARN +handlers=stream_handler + +[logger_main] +level=DEBUG +handlers=stream_handler +qualname=__main__ +propagate=0 + +[logger_deploy] +level=DEBUG +handlers=stream_handler +qualname=mars.deploy +propagate=0 + +[logger_oscar] +level=DEBUG +handlers=stream_handler +qualname=mars.oscar +propagate=0 + +[logger_services] +level=DEBUG +handlers=stream_handler +qualname=mars.services +propagate=0 + +[logger_tornado] +level=WARN +handlers=stream_handler +qualname=tornado +propagate=0 + +[handler_stream_handler] +class=StreamHandler +formatter=formatter +args=(sys.stderr,) + +[formatter_formatter] +format=%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s diff --git a/python/xorbits/_mars/deploy/yarn/worker.py b/python/xorbits/_mars/deploy/yarn/worker.py new file mode 100644 index 000000000..4d29b69e7 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/worker.py @@ -0,0 +1,52 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from ..oscar.worker import WorkerCommandRunner +from .config import MarsWorkerConfig +from .core import YarnServiceMixin + + +class YarnWorkerCommandRunner(YarnServiceMixin, WorkerCommandRunner): + service_name = MarsWorkerConfig.service_name + + def __call__(self, *args, **kwargs): + os.environ["MARS_CONTAINER_IP"] = self.get_container_ip() + return super().__call__(*args, **kwargs) + + async def start_services(self): + from ...services.cluster import ClusterAPI + from ..oscar.worker import start_worker + + self.register_endpoint() + + await start_worker( + self.pool.external_address, + self.args.supervisors, + self.band_to_resource, + list(self.args.load_modules), + self.config, + mark_ready=False, + ) + await self.wait_all_supervisors_ready() + + cluster_api = await ClusterAPI.create(self.args.endpoint) + await cluster_api.mark_node_ready() + + +main = YarnWorkerCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/learn/__init__.py b/python/xorbits/_mars/learn/__init__.py new file mode 100644 index 000000000..e6012d6f5 --- /dev/null +++ b/python/xorbits/_mars/learn/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import cluster, ensemble, neighbors, preprocessing, proxima, utils + +# register operands +# import torch first, or some issue emerges, +# see https://github.com/pytorch/pytorch/issues/2575 +from .contrib import lightgbm, pytorch, statsmodels, tensorflow, xgboost +from .metrics import pairwise + +for _mod in [xgboost, tensorflow, pytorch, lightgbm, proxima, neighbors, statsmodels]: + _mod.register_op() + +del _mod, pairwise, preprocessing, utils diff --git a/python/xorbits/_mars/learn/base.py b/python/xorbits/_mars/learn/base.py new file mode 100644 index 000000000..5eb03aeef --- /dev/null +++ b/python/xorbits/_mars/learn/base.py @@ -0,0 +1,179 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
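# --- Illustrative sketch, not part of the patch ---------------------------------
# How the ClassifierMixin defined just below is meant to be combined with an
# estimator: score() simply compares predict(X) against y. A toy, NumPy-only
# mixin and a made-up majority-class "estimator" are used here so the snippet
# runs without the rest of the learn package (the real mixin works on tensors
# and supports sample_weight).
import numpy as np


class ToyClassifierMixin:
    def score(self, X, y):
        return float(np.mean(self.predict(X) == np.asarray(y)))


class MajorityClassifier(ToyClassifierMixin):
    def fit(self, X, y):
        values, counts = np.unique(y, return_counts=True)
        self._majority = values[np.argmax(counts)]
        return self

    def predict(self, X):
        return np.full(len(X), self._majority)


X = np.zeros((4, 2))
y = np.array([0, 1, 1, 1])
print(MajorityClassifier().fit(X, y).score(X, y))  # -> 0.75
# ---------------------------------------------------------------------------------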
+ +from sklearn.base import BaseEstimator as SklearnBaseEstimator + +from .utils.validation import check_array, check_X_y + + +class ClassifierMixin: + """Mixin class for all classifiers in scikit-learn.""" + + _estimator_type = "classifier" + + def score(self, X, y, sample_weight=None, session=None, run_kwargs=None): + """ + Return the mean accuracy on the given test data and labels. + + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : Tensor + Mean accuracy of self.predict(X) wrt. y. + """ + from .metrics import accuracy_score + + result = accuracy_score( + y, + self.predict(X), + sample_weight=sample_weight, + session=session, + run_kwargs=run_kwargs, + ) + return result + + +class RegressorMixin: + """Mixin class for all regression estimators in scikit-learn.""" + + _estimator_type = "regressor" + + def score(self, X, y, sample_weight=None): + """Return the coefficient of determination :math:`R^2` of the + prediction. + + The coefficient :math:`R^2` is defined as :math:`(1 - \\frac{u}{v})`, + where :math:`u` is the residual sum of squares ``((y_true - y_pred) + ** 2).sum()`` and :math:`v` is the total sum of squares ``((y_true - + y_true.mean()) ** 2).sum()``. The best possible score is 1.0 and it + can be negative (because the model can be arbitrarily worse). A + constant model that always predicts the expected value of `y`, + disregarding the input features, would get a :math:`R^2` score of + 0.0. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. For some estimators this may be a precomputed + kernel matrix or a list of generic objects instead with shape + ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted`` + is the number of samples used in the fitting for the estimator. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True values for `X`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : Tensor + :math:`R^2` of ``self.predict(X)`` wrt. `y`. + + Notes + ----- + The :math:`R^2` score used when calling ``score`` on a regressor uses + ``multioutput='uniform_average'`` from version 0.23 to keep consistent + with default value of :func:`~sklearn.metrics.r2_score`. + This influences the ``score`` method of all the multioutput + regressors (except for + :class:`~sklearn.multioutput.MultiOutputRegressor`). + """ + + from .metrics import r2_score + + y_pred = self.predict(X) + return r2_score(y, y_pred, sample_weight=sample_weight) + + def _more_tags(self): # noqa: R0201 # pylint: disable=no-self-use + return {"requires_y": True} + + +class BaseEstimator(SklearnBaseEstimator): + def _validate_data( + self, X, y=None, reset=True, validate_separately=False, **check_params + ): + """Validate input data and set or check the `n_features_in_` attribute. + + Parameters + ---------- + X : {array-like, sparse matrix, dataframe} of shape \ + (n_samples, n_features) + The input samples. + y : array-like of shape (n_samples,), default=None + The targets. If None, `check_array` is called on `X` and + `check_X_y` is called otherwise. 
+ reset : bool, default=True + Whether to reset the `n_features_in_` attribute. + If False, the input will be checked for consistency with data + provided when reset was last True. + validate_separately : False or tuple of dicts, default=False + Only used if y is not None. + If False, call validate_X_y(). Else, it must be a tuple of kwargs + to be used for calling check_array() on X and y respectively. + **check_params : kwargs + Parameters passed to :func:`sklearn.utils.check_array` or + :func:`sklearn.utils.check_X_y`. Ignored if validate_separately + is not False. + + Returns + ------- + out : tensor or tuple of these + The validated input. A tuple is returned if `y` is not None. + """ + + if y is None: + if hasattr(self, "_get_tags") and self._get_tags().get( + "requires_y", False + ): # pragma: no cover + raise ValueError( + f"This {type(self).__name__} estimator requires y to be passed, " + "but the target y is None." + ) + X = check_array(X, **check_params) + out = X + elif isinstance(y, str) and y == "no_validation": + X = check_array(X, **check_params) + out = X + else: # pragma: no cover + if validate_separately: + # We need this because some estimators validate X and y + # separately, and in general, separately calling check_array() + # on X and y isn't equivalent to just calling check_X_y() + # :( + check_X_params, check_y_params = validate_separately + X = check_array(X, **check_X_params) + y = check_array(y, **check_y_params) + else: + X, y = check_X_y(X, y, **check_params) + out = X, y + + if check_params.get("ensure_2d", True) and hasattr(self, "_check_n_features"): + self._check_n_features(X, reset=reset) + + return out diff --git a/python/xorbits/_mars/learn/cluster/__init__.py b/python/xorbits/_mars/learn/cluster/__init__.py new file mode 100644 index 000000000..6baa53cd6 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/__init__.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from ._kmeans import KMeans, k_means + + def _install(): + from ._k_means_common import KMeansInertia, KMeansRelocateEmptyClusters + from ._k_means_elkan_iter import ( + KMeansElkanInitBounds, + KMeansElkanPostprocess, + KMeansElkanUpdate, + ) + from ._k_means_init import KMeansPlusPlusInit + from ._k_means_lloyd_iter import KMeansLloydPostprocess, KMeansLloydUpdate + + del ( + KMeansInertia, + KMeansRelocateEmptyClusters, + KMeansElkanInitBounds, + KMeansElkanUpdate, + KMeansElkanPostprocess, + KMeansPlusPlusInit, + KMeansLloydUpdate, + KMeansLloydPostprocess, + ) + + _install() + del _install +except ImportError: + KMeans = None + k_means = None diff --git a/python/xorbits/_mars/learn/cluster/_k_means_common.py b/python/xorbits/_mars/learn/cluster/_k_means_common.py new file mode 100644 index 000000000..3799728a9 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_common.py @@ -0,0 +1,402 @@ +# Copyright 2022-2023 XProbe Inc. 
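# --- Illustrative sketch, not part of the patch ---------------------------------
# The quantity the KMeansInertia operand below computes per chunk - the weighted
# sum of squared distances from each sample to its assigned center - before the
# per-chunk results are summed. NumPy-only restatement of what the Cython
# _inertia_dense helper does; the numbers are made up.
import numpy as np


def inertia(X, sample_weight, centers, labels):
    diffs = X - centers[labels]  # offset of every sample from its own center
    return float(np.sum(sample_weight * np.einsum("ij,ij->i", diffs, diffs)))


X = np.array([[0.0, 0.0], [1.0, 0.0], [10.0, 0.0]])
w = np.ones(3)
centers = np.array([[0.5, 0.0], [10.0, 0.0]])
labels = np.array([0, 0, 1])
print(inertia(X, w, centers, labels))  # -> 0.5
# ---------------------------------------------------------------------------------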
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ... import tensor as mt +from ...core import OutputType, recursive_tile +from ...serialization.serializables import KeyField +from ...tensor.array_utils import as_same_device, device, sparse +from ...tensor.core import TensorOrder +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin +from ._k_means_fast import _inertia_dense, _inertia_sparse, merge_update_chunks + + +class KMeansInertia(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_INERTIA + + _x = KeyField("x") + _sample_weight = KeyField("sample_weight") + _centers = KeyField("centers") + _labels = KeyField("labels") + + def __init__( + self, + x=None, + sample_weight=None, + centers=None, + labels=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _sample_weight=sample_weight, + _centers=centers, + _labels=labels, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] + + @property + def x(self): + return self._x + + @property + def sample_weight(self): + return self._sample_weight + + @property + def centers(self): + return self._centers + + @property + def labels(self): + return self._labels + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + for field in ("_x", "_sample_weight", "_centers", "_labels"): + if getattr(self, field, None) is not None: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + params = {"shape": (), "dtype": np.dtype(float), "order": TensorOrder.C_ORDER} + return self.new_tileable( + [self._x, self._sample_weight, self._centers, self._labels], kws=[params] + ) + + @classmethod + def tile(cls, op: "KMeansInertia"): + if has_unknown_shape(*op.inputs): + yield + x = op.x + x = yield from recursive_tile(x.rechunk({1: x.shape[1]})) + sample_weight = yield from recursive_tile( + op.sample_weight.rechunk({0: x.nsplits[0]}) + ) + labels = yield from recursive_tile(op.labels.rechunk({0: x.nsplits[0]})) + centers = op.centers + centers = yield from recursive_tile(centers.rechunk(centers.shape)) + + out_chunks = [] + for x_chunk, sample_weight_chunk, labels_chunk in zip( + x.chunks, sample_weight.chunks, labels.chunks + ): + chunk_op = op.copy().reset_key() + chunk_params = { + "shape": (1,), + "dtype": np.dtype(float), + "order": TensorOrder.C_ORDER, + "index": x_chunk.index, + } + out_chunk = chunk_op.new_chunk( + [x_chunk, sample_weight_chunk, centers.chunks[0], labels_chunk], + kws=[chunk_params], + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = op.outputs[0].params + params["shape"] = (x.chunk_shape[0],) + params["chunks"] = out_chunks + params["nsplits"] = ((1,) * x.chunk_shape[0],) + out = new_op.new_tileable(op.inputs, kws=[params]).sum() + out = yield from recursive_tile(out) + return [out] + + @classmethod + def execute(cls, ctx, op): + (x, sample_weight, centers, labels), device_id, xp = 
as_same_device( + [ctx[inp.key] for inp in op.inputs], + device=op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + if xp is np: + method = _inertia_dense + elif xp is sparse: + method = _inertia_sparse + else: # pragma: no cover + raise NotImplementedError("Cannot run inertial on GPU") + + result = method(x, sample_weight, centers, labels) + ctx[op.outputs[0].key] = np.array([result]) + + +def _inertia(X, sample_weight, centers, labels): + op = KMeansInertia(x=X, sample_weight=sample_weight, centers=centers, labels=labels) + return op() + + +def _execute_merge_update(ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], + op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + length = len(inputs) // 2 + assert len(inputs) % 2 == 0 + centers_new_chunks = inputs[:length] + weight_in_cluster_chunks = inputs[length:] + + with device(device_id): + weight_in_clusters = np.zeros( + op.n_clusters, dtype=weight_in_cluster_chunks[0].dtype + ) + centers_new = np.zeros_like(centers_new_chunks[0]) + n_clusters = op.n_clusters + n_features = centers_new_chunks[0].shape[1] + + for weight_in_clusters_chunk, centers_new_chunk in zip( + weight_in_cluster_chunks, centers_new_chunks + ): + merge_update_chunks( + n_clusters, + n_features, + weight_in_clusters, + weight_in_clusters_chunk, + centers_new, + centers_new_chunk, + ) + + # centers new + ctx[op.outputs[0].key] = centers_new + # weight_in_clusters + ctx[op.outputs[1].key] = weight_in_clusters + + +class KMeansRelocateEmptyClusters(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_RELOCASTE_EMPTY_CLUSTERS + + _empty_clusters = KeyField("empty_clusters") + _far_x = KeyField("far_x") + _far_labels = KeyField("far_labels") + _far_sample_weights = KeyField("far_sample_weight") + _centers_new = KeyField("centers_new") + _weight_in_clusters = KeyField("weight_in_clusters") + + def __init__( + self, + empty_clusters=None, + far_x=None, + far_labels=None, + far_sample_weights=None, + centers_new=None, + weight_in_clusters=None, + output_types=None, + **kw + ): + super().__init__( + _empty_clusters=empty_clusters, + _far_x=far_x, + _far_labels=far_labels, + _far_sample_weights=far_sample_weights, + _centers_new=centers_new, + _weight_in_clusters=weight_in_clusters, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def empty_clusters(self): + return self._empty_clusters + + @property + def far_x(self): + return self._far_x + + @property + def far_labels(self): + return self._far_labels + + @property + def far_sample_weights(self): + return self._far_sample_weights + + @property + def centers_new(self): + return self._centers_new + + @property + def weight_in_clusters(self): + return self._weight_in_clusters + + @property + def output_limit(self): + return 2 + + @property + def _input_fields(self): + return ( + "_empty_clusters", + "_far_x", + "_far_labels", + "_far_sample_weights", + "_centers_new", + "_weight_in_clusters", + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + for field in self._input_fields: + ob = getattr(self, field) + if ob is not None: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + kws = [ + # centers_new + self._centers_new.params, + # weight_in_clusters + self._weight_in_clusters.params, + ] + return self.new_tileables( + [getattr(self, field) for field in 
self._input_fields], kws=kws + ) + + @classmethod + def tile(cls, op: "KMeansRelocateEmptyClusters"): + empty_clusters = yield from recursive_tile( + op.empty_clusters.rechunk(op.empty_clusters.shape) + ) + far_x = yield from recursive_tile(op.far_x.rechunk(op.far_x.shape)) + far_labels = yield from recursive_tile( + op.far_labels.rechunk(op.far_labels.shape) + ) + far_sample_weight = yield from recursive_tile( + op.far_sample_weights.rechunk(op.far_sample_weights.shape) + ) + centers_new = yield from recursive_tile( + op.centers_new.rechunk(op.centers_new.shape) + ) + weight_in_clusters = yield from recursive_tile( + op.weight_in_clusters.rechunk(op.weight_in_clusters.shape) + ) + + chunk_op = op.copy().reset_key() + out_centers_new_chunk, out_weight_in_clusters_chunk = chunk_op.new_chunks( + [ + empty_clusters.chunks[0], + far_x.chunks[0], + far_labels.chunks[0], + far_sample_weight.chunks[0], + centers_new.chunks[0], + weight_in_clusters.chunks[0], + ], + kws=[centers_new.chunks[0].params, weight_in_clusters.chunks[0].params], + ) + + out_centers_new_params = centers_new.params + out_centers_new_params["nsplits"] = centers_new.nsplits + out_centers_new_params["chunks"] = [out_centers_new_chunk] + out_weight_in_clusters_params = weight_in_clusters.params + out_weight_in_clusters_params["nsplits"] = weight_in_clusters.nsplits + out_weight_in_clusters_params["chunks"] = [out_weight_in_clusters_chunk] + new_op = op.copy() + return new_op.new_tileables( + op.inputs, kws=[out_centers_new_params, out_weight_in_clusters_params] + ) + + @classmethod + def execute(cls, ctx, op): + ( + ( + empty_clusters, + far_x, + far_labels, + far_sample_weight, + center_new, + weight_in_clusters, + ), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] for inp in op.inputs], op.device, ret_extra=True + ) + + out_centers_new = center_new.copy() + out_weight_in_clusters = weight_in_clusters.copy() + del center_new, weight_in_clusters + + n_empty = empty_clusters.shape[0] + n_features = far_x.shape[1] + + for idx in range(n_empty): + new_cluster_id = empty_clusters[idx] + weight = far_sample_weight[idx] + old_cluster_id = far_labels[idx] + + for k in range(n_features): + out_centers_new[old_cluster_id, k] -= far_x[idx, k] * weight + out_centers_new[new_cluster_id, k] = far_x[idx, k] * weight + + out_weight_in_clusters[new_cluster_id] = weight + out_weight_in_clusters[old_cluster_id] -= weight + + ctx[op.outputs[0].key] = out_centers_new + ctx[op.outputs[1].key] = out_weight_in_clusters + + +def _relocate_empty_clusters( + X, + sample_weight, + centers_old, + centers_new, + weight_in_clusters, + labels, + to_run=None, + session=None, + run_kwargs=None, +): + to_run = to_run or list() + empty_clusters = mt.where(mt.equal(weight_in_clusters, 0))[0].astype(mt.int32) + to_run.append(empty_clusters) + + mt.ExecutableTuple(to_run).execute(session=session, **(run_kwargs or dict())) + + n_empty = empty_clusters.shape[0] + + if n_empty == 0: + return centers_new, weight_in_clusters + + distances = ((mt.asarray(X) - mt.asarray(centers_old)[labels]) ** 2).sum(axis=1) + far_from_centers = mt.argpartition(distances, -n_empty)[: -n_empty - 1 : -1].astype( + np.int32 + ) + + far_x = X[far_from_centers] + far_labels = labels[far_from_centers] + far_sample_weight = sample_weight[far_from_centers] + + op = KMeansRelocateEmptyClusters( + empty_clusters=empty_clusters, + far_x=far_x, + far_labels=far_labels, + far_sample_weights=far_sample_weight, + centers_new=centers_new, + weight_in_clusters=weight_in_clusters, + ) + return 
op() diff --git a/python/xorbits/_mars/learn/cluster/_k_means_elkan.pyx b/python/xorbits/_mars/learn/cluster/_k_means_elkan.pyx new file mode 100644 index 000000000..db3a41708 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_elkan.pyx @@ -0,0 +1,375 @@ +# cython: profile=False, boundscheck=False, wraparound=False, cdivision=True +# +# Author: Andreas Mueller +# +# Licence: BSD 3 clause + +# TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). This is fixed in cython > 0.3. + +import numpy as np +cimport numpy as np +cimport cython +from cython cimport floating +from sklearn.utils.extmath import row_norms + +from ._k_means_fast cimport _euclidean_dense_dense +from ._k_means_fast cimport _euclidean_sparse_dense + + +np.import_array() + + +def init_bounds_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[:, ::1] centers, # IN + floating[:, ::1] center_half_distances, # IN + int[::1] labels, # OUT + floating[::1] upper_bounds, # OUT + floating[:, ::1] lower_bounds): # OUT + """Initialize upper and lower bounds for each sample for dense input data. + + Given X, centers and the pairwise distances divided by 2.0 between the + centers this calculates the upper bounds and lower bounds for each sample. + The upper bound for each sample is set to the distance between the sample + and the closest center. + + The lower bound for each sample is a one-dimensional array of n_clusters. + For each sample i assume that the previously assigned cluster is c1 and the + previous closest distance is dist, for a new cluster c2, the + lower_bound[i][c2] is set to distance between the sample and this new + cluster, if and only if dist > center_half_distances[c1][c2]. This prevents + computation of unnecessary distances for each sample to the clusters that + it is unlikely to be assigned to. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The input data. + + centers : ndarray of shape (n_clusters, n_features), dtype=floating + The cluster centers. + + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + The half of the distance between any 2 clusters centers. + + labels : ndarray of shape(n_samples), dtype=int + The label for each sample. This array is modified in place. + + upper_bounds : ndarray of shape(n_samples,), dtype=floating + The upper bound on the distance between each sample and its closest + cluster center. This array is modified in place. + + lower_bounds : ndarray, of shape(n_samples, n_clusters), dtype=floating + The lower bound on the distance between each sample and each cluster + center. This array is modified in place. 
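
    For illustration: with two centers c0 and c1, if sample i lies at
    distance 3.0 from c0 and center_half_distances[0, 1] is 4.0 (the centers
    are 8.0 apart), then 3.0 is not greater than 4.0, so the triangle
    inequality already guarantees c1 cannot be closer than c0. The distance
    to c1 is therefore never computed: labels[i] stays 0, upper_bounds[i]
    becomes 3.0, lower_bounds[i, 0] becomes 3.0 and lower_bounds[i, 1] keeps
    its initial value of 0.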
+ """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers.shape[0] + int n_features = X.shape[1] + + floating min_dist, dist + int best_cluster, i, j + + for i in range(n_samples): + best_cluster = 0 + min_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[0, 0], + n_features, False) + lower_bounds[i, 0] = min_dist + for j in range(1, n_clusters): + if min_dist > center_half_distances[best_cluster, j]: + dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], + n_features, False) + lower_bounds[i, j] = dist + if dist < min_dist: + min_dist = dist + best_cluster = j + labels[i] = best_cluster + upper_bounds[i] = min_dist + + +def init_bounds_sparse( + X, # IN + floating[:, ::1] centers, # IN + floating[:, ::1] center_half_distances, # IN + int[::1] labels, # OUT + floating[::1] upper_bounds, # OUT + floating[:, ::1] lower_bounds): # OUT + """Initialize upper and lower bounds for each sample for sparse input data. + + Given X, centers and the pairwise distances divided by 2.0 between the + centers this calculates the upper bounds and lower bounds for each sample. + The upper bound for each sample is set to the distance between the sample + and the closest center. + + The lower bound for each sample is a one-dimensional array of n_clusters. + For each sample i assume that the previously assigned cluster is c1 and the + previous closest distance is dist, for a new cluster c2, the + lower_bound[i][c2] is set to distance between the sample and this new + cluster, if and only if dist > center_half_distances[c1][c2]. This prevents + computation of unnecessary distances for each sample to the clusters that + it is unlikely to be assigned to. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The input data. Must be in CSR format. + + centers : ndarray of shape (n_clusters, n_features), dtype=floating + The cluster centers. + + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + The half of the distance between any 2 clusters centers. + + labels : ndarray of shape(n_samples), dtype=int + The label for each sample. This array is modified in place. + + upper_bounds : ndarray of shape(n_samples,), dtype=floating + The upper bound on the distance between each sample and its closest + cluster center. This array is modified in place. + + lower_bounds : ndarray of shape(n_samples, n_clusters), dtype=floating + The lower bound on the distance between each sample and each cluster + center. This array is modified in place. 
+ """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers.shape[0] + int n_features = X.shape[1] + + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + floating min_dist, dist + int best_cluster, i, j + + floating[::1] centers_squared_norms = row_norms(centers, squared=True) + + for i in range(n_samples): + best_cluster = 0 + min_dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[0], centers_squared_norms[0], False) + + lower_bounds[i, 0] = min_dist + for j in range(1, n_clusters): + if min_dist > center_half_distances[best_cluster, j]: + dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[j], centers_squared_norms[j], False) + lower_bounds[i, j] = dist + if dist < min_dist: + min_dist = dist + best_cluster = j + labels[i] = best_cluster + upper_bounds[i] = min_dist + + +def update_chunk_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + bint update_centers=True): + return _update_chunk_dense(&X[0, 0], sample_weight, centers_old, + center_half_distances, + distance_next_center, labels, + upper_bounds, lower_bounds, + ¢ers_new[0, 0], &weight_in_clusters[0], + update_centers) + + +cdef void _update_chunk_dense( + floating *X, # IN + # expecting C aligned 2D array. XXX: Can be + # replaced by const memoryview when cython min + # version is >= 0.3 + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) nogil: + """K-means combined EM step for one dense data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating upper_bound, distance + int i, j, k, label + + for i in range(n_samples): + upper_bound = upper_bounds[i] + bounds_tight = 0 + label = labels[i] + + # Next center is not far away from the currently assigned center. + # Sample might need to be assigned to another center. + if not distance_next_center[label] >= upper_bound: + + for j in range(n_clusters): + + # If this holds, then center_index is a good candidate for the + # sample to be relabelled, and we need to confirm this by + # recomputing the upper and lower bounds. + if (j != label + and (upper_bound > lower_bounds[i, j]) + and (upper_bound > center_half_distances[label, j])): + + # Recompute upper bound by calculating the actual distance + # between the sample and its current assigned center. + if not bounds_tight: + upper_bound = _euclidean_dense_dense( + X + i * n_features, ¢ers_old[label, 0], n_features, False) + lower_bounds[i, label] = upper_bound + bounds_tight = 1 + + # If the condition still holds, then compute the actual + # distance between the sample and center. 
If this is less + # than the previous distance, reassign label. + if (upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j])): + + distance = _euclidean_dense_dense( + X + i * n_features, ¢ers_old[j, 0], n_features, False) + lower_bounds[i, j] = distance + if distance < upper_bound: + label = j + upper_bound = distance + + labels[i] = label + upper_bounds[i] = upper_bound + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(n_features): + centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i] + + +def update_chunk_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + bint update_centers=True): + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + return _update_chunk_sparse( + X_data, X_indices, X_indptr, sample_weight, centers_old, + centers_squared_norms, center_half_distances, + distance_next_center, labels, upper_bounds, lower_bounds, + ¢ers_new[0, 0], &weight_in_clusters[0], update_centers + ) + + +cdef void _update_chunk_sparse( + floating[::1] X_data, # IN + int[::1] X_indices, # IN + int[::1] X_indptr, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) nogil: + """K-means combined EM step for one sparse data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating upper_bound, distance + int i, j, k, label + int s = X_indptr[0] + + for i in range(n_samples): + upper_bound = upper_bounds[i] + bounds_tight = 0 + label = labels[i] + + # Next center is not far away from the currently assigned center. + # Sample might need to be assigned to another center. + if not distance_next_center[label] >= upper_bound: + + for j in range(n_clusters): + + # If this holds, then center_index is a good candidate for the + # sample to be relabelled, and we need to confirm this by + # recomputing the upper and lower bounds. + if (j != label + and (upper_bound > lower_bounds[i, j]) + and (upper_bound > center_half_distances[label, j])): + + # Recompute upper bound by calculating the actual distance + # between the sample and its current assigned center. + if not bounds_tight: + upper_bound = _euclidean_sparse_dense( + X_data[X_indptr[i] - s: X_indptr[i + 1] - s], + X_indices[X_indptr[i] - s: X_indptr[i + 1] - s], + centers_old[label], centers_squared_norms[label], False) + lower_bounds[i, label] = upper_bound + bounds_tight = 1 + + # If the condition still holds, then compute the actual + # distance between the sample and center. If this is less + # than the previous distance, reassign label. 
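                    # (The check is repeated deliberately: recomputing the
                    # distance to the currently assigned center above may
                    # have tightened upper_bound enough that center j can now
                    # be ruled out without paying for another full distance
                    # computation.)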
+ if (upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j])): + distance = _euclidean_sparse_dense( + X_data[X_indptr[i] - s: X_indptr[i + 1] - s], + X_indices[X_indptr[i] - s: X_indptr[i + 1] - s], + centers_old[j], centers_squared_norms[j], False) + lower_bounds[i, j] = distance + if distance < upper_bound: + label = j + upper_bound = distance + + labels[i] = label + upper_bounds[i] = upper_bound + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i] diff --git a/python/xorbits/_mars/learn/cluster/_k_means_elkan_iter.py b/python/xorbits/_mars/learn/cluster/_k_means_elkan_iter.py new file mode 100644 index 000000000..54f14d33c --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_elkan_iter.py @@ -0,0 +1,866 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField, Int32Field, KeyField +from ...tensor.array_utils import as_same_device, cp, device, sparse +from ...tensor.core import TensorOrder +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin +from ._k_means_common import _execute_merge_update, _relocate_empty_clusters +from ._k_means_elkan import ( + init_bounds_dense, + init_bounds_sparse, + update_chunk_dense, + update_chunk_sparse, +) +from ._k_means_fast import update_center, update_upper_lower_bounds + + +class KMeansElkanInitBounds(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_ELKAN_INIT_BOUNDS + + _x = KeyField("x") + _centers = KeyField("centers") + _center_half_distances = KeyField("center_half_distances") + _n_clusters = Int32Field("n_clusters") + + def __init__( + self, + x=None, + centers=None, + center_half_distances=None, + n_clusters=None, + sparse=None, + gpu=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _centers=centers, + _center_half_distances=center_half_distances, + _n_clusters=n_clusters, + sparse=sparse, + gpu=gpu, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def x(self): + return self._x + + @property + def centers(self): + return self._centers + + @property + def center_half_distances(self): + return self._center_half_distances + + @property + def n_clusters(self): + return self._n_clusters + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._x = self._inputs[0] + self._centers = self._inputs[1] + self._center_half_distances = self._inputs[2] + + @property + def output_limit(self): + return 3 + + def __call__(self): + params = [] + # labels + params.append( + { + "shape": (self._x.shape[0],), + "dtype": np.dtype(np.int32), + "order": 
TensorOrder.C_ORDER, + } + ) + # upper bounds + params.append( + { + "shape": (self._x.shape[0],), + "dtype": self._x.dtype, + "order": TensorOrder.C_ORDER, + } + ) + # lower bounds + params.append( + { + "shape": (self._x.shape[0], self._n_clusters), + "dtype": self._x.dtype, + "order": TensorOrder.C_ORDER, + } + ) + return self.new_tileables( + [self._x, self._centers, self._center_half_distances], kws=params + ) + + @classmethod + def tile(cls, op: "KMeansElkanInitBounds"): + # unify chunks on axis 0 + if has_unknown_shape(op.centers, op.center_half_distances): + yield + x = op.x + centers = yield from recursive_tile(op.centers.rechunk(op.centers.shape)) + center_half_distances = yield from recursive_tile( + op.center_half_distances.rechunk(op.center_half_distances.shape) + ) + + out_chunks = [list() for _ in range(op.output_limit)] + for c in x.chunks: + chunk_op = op.copy().reset_key() + chunk_params = [] + # labels chunk + chunk_params.append( + { + "shape": (c.shape[0],), + "index": (c.index[0],), + "dtype": np.dtype(np.int32), + "order": TensorOrder.C_ORDER, + } + ) + # upper bounds + chunk_params.append( + { + "shape": (c.shape[0],), + "index": (c.index[0],), + "dtype": c.dtype, + "order": TensorOrder.C_ORDER, + } + ) + # lower bounds + chunk_params.append( + { + "shape": (c.shape[0], op.n_clusters), + "index": (c.index[0], 0), + "dtype": c.dtype, + "order": TensorOrder.C_ORDER, + } + ) + chunks = chunk_op.new_chunks( + [c, centers.chunks[0], center_half_distances.chunks[0]], + kws=chunk_params, + ) + for i, out_chunk in enumerate(chunks): + out_chunks[i].append(out_chunk) + + out_nsplits = [ + (x.nsplits[0],), + (x.nsplits[0],), + (x.nsplits[0], (op.n_clusters,)), + ] + out_params = [out.params for out in op.outputs] + for i, chunks in enumerate(out_chunks): + out_params[i]["chunks"] = chunks + for i, nsplits in enumerate(out_nsplits): + out_params[i]["nsplits"] = nsplits + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=out_params) + + @classmethod + def execute(cls, ctx, op): + (x, centers, center_half_distances), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], + device=op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + if xp is cp: # pragma: no cover + raise NotImplementedError("cannot support init_bounds for kmeans elkan") + + n_samples = x.shape[0] + n_clusters = op.n_clusters + + labels = np.full(n_samples, -1, dtype=np.int32) + upper_bounds = np.zeros(n_samples, dtype=x.dtype) + lower_bounds = np.zeros((n_samples, n_clusters), dtype=x.dtype) + + if xp is np: + init_bounds = init_bounds_dense + else: + assert xp is sparse + init_bounds = init_bounds_sparse + + init_bounds( + x, centers, center_half_distances, labels, upper_bounds, lower_bounds + ) + + ctx[op.outputs[0].key] = labels + ctx[op.outputs[1].key] = upper_bounds + ctx[op.outputs[2].key] = lower_bounds + + +def init_bounds(X, centers, center_half_distances, n_clusters): + op = KMeansElkanInitBounds( + x=X, + centers=centers, + center_half_distances=center_half_distances, + n_clusters=n_clusters, + sparse=False, + gpu=X.op.gpu, + ) + return op() + + +class KMeansElkanUpdate(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_ELKAN_UPDATE + + _x = KeyField("x") + _sample_weight = KeyField("sample_weight") + _centers_old = KeyField("centers_old") + _center_half_distances = KeyField("center_half_distances") + _distance_next_center = KeyField("distance_next_center") + _labels = KeyField("labels") + _upper_bounds = 
KeyField("upper_bounds") + _lower_bounds = KeyField("lower_bounds") + _update_centers = BoolField("update_centers") + _n_clusters = Int32Field("n_clusters") + + def __init__( + self, + x=None, + sample_weight=None, + centers_old=None, + center_half_distances=None, + distance_next_center=None, + labels=None, + upper_bounds=None, + lower_bounds=None, + update_centers=None, + n_clusters=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _sample_weight=sample_weight, + _centers_old=centers_old, + _center_half_distances=center_half_distances, + _distance_next_center=distance_next_center, + _labels=labels, + _upper_bounds=upper_bounds, + _lower_bounds=lower_bounds, + _update_centers=update_centers, + _n_clusters=n_clusters, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def x(self): + return self._x + + @property + def sample_weight(self): + return self._sample_weight + + @property + def centers_old(self): + return self._centers_old + + @property + def center_half_distances(self): + return self._center_half_distances + + @property + def distance_next_center(self): + return self._distance_next_center + + @property + def labels(self): + return self._labels + + @property + def upper_bounds(self): + return self._upper_bounds + + @property + def lower_bounds(self): + return self._lower_bounds + + @property + def update_centers(self): + return self._update_centers + + @property + def n_clusters(self): + return self._n_clusters + + @property + def output_limit(self): + return 5 if self.stage != OperandStage.reduce else 2 + + @property + def _input_fields(self): + return ( + "_x", + "_sample_weight", + "_centers_old", + "_center_half_distances", + "_distance_next_center", + "_labels", + "_upper_bounds", + "_lower_bounds", + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.stage != OperandStage.reduce: + input_fields = self._input_fields + assert len(input_fields) == len(self._inputs) + inputs_iter = iter(inputs) + for field in input_fields: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + kws = list( + ( + # labels + self._labels.params, + # upper_bounds + self._upper_bounds.params, + # lower_bounds + self._lower_bounds.params, + ) + ) + # centers_new + kws.append( + { + "shape": (self._n_clusters, self._x.shape[1]), + "dtype": self._centers_old.dtype, + "order": TensorOrder.C_ORDER, + } + ) + # weight_in_clusters + kws.append( + { + "shape": (self._n_clusters,), + "dtype": self._centers_old.dtype, + "order": TensorOrder.C_ORDER, + } + ) + return self.new_tileables( + [getattr(self, f) for f in self._input_fields], kws=kws + ) + + @classmethod + def tile(cls, op: "KMeansElkanUpdate"): + if has_unknown_shape(*op.inputs): + yield + x = op.x + if x.chunk_shape[1] != 1: # pragma: no cover + x = yield from recursive_tile(x.rechunk({1: x.shape[1]})) + sample_weight = yield from recursive_tile( + op.sample_weight.rechunk({0: x.nsplits[0]}) + ) + labels = yield from recursive_tile(op.labels.rechunk({0: x.nsplits[0]})) + upper_bounds = yield from recursive_tile( + op.upper_bounds.rechunk({0: x.nsplits[0]}) + ) + lower_bounds = yield from recursive_tile( + op.lower_bounds.rechunk({0: x.nsplits[0], 1: op.lower_bounds.shape[1]}) + ) + centers_old = yield from recursive_tile( + op.centers_old.rechunk(op.centers_old.shape) + ) + center_half_distances = yield from recursive_tile( + op.center_half_distances.rechunk(op.center_half_distances.shape) + ) 
+ distance_next_center = yield from recursive_tile( + op.distance_next_center.rechunk(op.distance_next_center.shape) + ) + + out_chunks = [list() for _ in range(op.output_limit)] + for i in range(x.chunk_shape[0]): + x_chunk = x.cix[i, 0] + sample_weight_chunk = sample_weight.cix[i,] + labels_chunk = labels.cix[i,] + upper_bounds_chunk = upper_bounds.cix[i,] + lower_bounds_chunk = lower_bounds.cix[i, 0] + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + chunk_kws = list( + ( + # labels + labels_chunk.params, + # upper_bounds + upper_bounds_chunk.params, + # lower_boudns + lower_bounds_chunk.params, + ) + ) + # centers_new + chunk_kws.append( + { + "index": (0, 0), + "shape": (op.n_clusters, x_chunk.shape[1]), + "dtype": centers_old.dtype, + "order": TensorOrder.C_ORDER, + } + ) + # weight_in_clusters + chunk_kws.append( + { + "index": (0,), + "shape": (op.n_clusters,), + "dtype": centers_old.dtype, + "order": TensorOrder.C_ORDER, + } + ) + chunks = chunk_op.new_chunks( + [ + x_chunk, + sample_weight_chunk, + centers_old.chunks[0], + center_half_distances.chunks[0], + distance_next_center.chunks[0], + labels_chunk, + upper_bounds_chunk, + lower_bounds_chunk, + ], + kws=chunk_kws, + ) + assert len(chunks) == len(out_chunks) + for oc, c in zip(out_chunks, chunks): + oc.append(c) + + label_chunks, upper_bounds_chunks, lower_bounds_chunks = out_chunks[:3] + centers_new_chunks, weight_in_cluster_chunks = out_chunks[3:] + + if op.update_centers: + # merge centers_new and weight_in_clusters + merge_op = KMeansElkanUpdate( + stage=OperandStage.reduce, n_clusters=op.n_clusters + ) + merge_chunk_kw = [ + centers_new_chunks[0].params, + weight_in_cluster_chunks[0].params, + ] + centers_new_chunk, weight_in_cluster_chunk = merge_op.new_chunks( + centers_new_chunks + weight_in_cluster_chunks, kws=merge_chunk_kw + ) + else: + # the data is meaningless, just pick one + centers_new_chunk = centers_new_chunks[0] + weight_in_cluster_chunk = weight_in_cluster_chunks[0] + + out_params = [out.params for out in op.outputs] + # labels + out_params[0]["nsplits"] = labels.nsplits + out_params[0]["chunks"] = label_chunks + # upper_bounds + out_params[1]["nsplits"] = upper_bounds.nsplits + out_params[1]["chunks"] = upper_bounds_chunks + # lower_bounds + out_params[2]["nsplits"] = lower_bounds.nsplits + out_params[2]["chunks"] = lower_bounds_chunks + # centers_new + out_params[3]["nsplits"] = tuple((s,) for s in op.outputs[3].shape) + out_params[3]["chunks"] = [centers_new_chunk] + # weight_in_clusters + out_params[4]["nsplits"] = tuple((s,) for s in op.outputs[4].shape) + out_params[4]["chunks"] = [weight_in_cluster_chunk] + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=out_params) + + @classmethod + def _execute_reduce(cls, ctx, op): + return _execute_merge_update(ctx, op) + + @classmethod + def execute(cls, ctx, op: "KMeansElkanUpdate"): + if op.stage == OperandStage.reduce: + return cls._execute_reduce(ctx, op) + else: + ( + ( + x, + sample_weight, + centers_old, + center_half_distances, + distance_next_center, + labels, + upper_bounds, + lower_bounds, + ), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] for inp in op.inputs], + device=op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + if not op.update_centers: + centers_new = centers_old.copy() + else: + centers_new = np.zeros_like(centers_old) + weight_in_clusters = np.zeros(op.n_clusters, dtype=x.dtype) + + if xp is np: + method = update_chunk_dense + elif xp is sparse: + 
method = update_chunk_sparse + else: # pragma: no cover + raise NotImplementedError("Does not support run on GPU") + + out_labels, out_upper_bounds, out_lower_bounds = ( + labels.copy(), + upper_bounds.copy(), + lower_bounds.copy(), + ) + method( + x, + sample_weight, + centers_old, + center_half_distances, + distance_next_center, + out_labels, + out_upper_bounds, + out_lower_bounds, + centers_new, + weight_in_clusters, + op.update_centers, + ) + + # labels + ctx[op.outputs[0].key] = out_labels + # upper_bounds + ctx[op.outputs[1].key] = out_upper_bounds + # lower_bounds + ctx[op.outputs[2].key] = out_lower_bounds + # centers_new + ctx[op.outputs[3].key] = centers_new + # weight_in_cluster + ctx[op.outputs[4].key] = weight_in_clusters + + +class KMeansElkanPostprocess(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_ELKAN_POSTPROCESS + + _centers_old = KeyField("centers_old") + _centers_new = KeyField("centers_new") + _center_shift = KeyField("center_shift") + _lower_bounds = KeyField("lower_bounds") + _upper_bounds = KeyField("upper_bounds") + _labels = KeyField("labels") + _weight_in_clusters = KeyField("weight_in_clusters") + + def __init__( + self, + centers_old=None, + centers_new=None, + center_shift=None, + lower_bounds=None, + upper_bounds=None, + labels=None, + weight_in_clusters=None, + output_types=None, + **kw + ): + super().__init__( + _centers_old=centers_old, + _centers_new=centers_new, + _center_shift=center_shift, + _lower_bounds=lower_bounds, + _upper_bounds=upper_bounds, + _labels=labels, + _weight_in_clusters=weight_in_clusters, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def centers_old(self): + return self._centers_old + + @property + def centers_new(self): + return self._centers_new + + @property + def center_shift(self): + return self._center_shift + + @property + def lower_bounds(self): + return self._lower_bounds + + @property + def upper_bounds(self): + return self._upper_bounds + + @property + def labels(self): + return self._labels + + @property + def weight_in_clusters(self): + return self._weight_in_clusters + + @property + def output_limit(self): + if self.stage is None: + # for tileable + return 4 + elif self.stage == OperandStage.combine: + return 2 + else: + assert self.stage == OperandStage.reduce + return 2 + + @property + def _input_fields(self): + return ( + "_centers_old", + "_centers_new", + "_center_shift", + "_lower_bounds", + "_upper_bounds", + "_labels", + "_weight_in_clusters", + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + for field in self._input_fields: + ob = getattr(self, field) + if ob is not None: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + kws = [ + # centers_new + self._centers_new.params, + # center_shift + self._center_shift.params, + # upper_bounds + self._upper_bounds.params, + # lower_bounds + self._lower_bounds.params, + ] + return self.new_tileables( + [getattr(self, f) for f in self._input_fields], kws=kws + ) + + @classmethod + def tile(cls, op: "KMeansElkanPostprocess"): + assert len(op.centers_old.chunks) == 1 + assert len(op.centers_new.chunks) == 1 + assert len(op.center_shift.chunks) == 1 + assert len(op._weight_in_clusters.chunks) == 1 + assert op.lower_bounds.chunk_shape[1] == 1 + + centers_old_chunk = op.centers_old.chunks[0] + centers_new_chunk = op.centers_new.chunks[0] + center_shift_chunk = 
op.center_shift.chunks[0] + weight_in_clusters_chunk = op.weight_in_clusters.chunks[0] + + # calculate center shift first + centers_new_chunk, center_shift_chunk = KMeansElkanPostprocess( + centers_old=centers_old_chunk, + centers_new=centers_new_chunk, + center_shift=center_shift_chunk, + weight_in_clusters=weight_in_clusters_chunk, + stage=OperandStage.combine, + ).new_chunks( + [ + centers_old_chunk, + centers_new_chunk, + center_shift_chunk, + weight_in_clusters_chunk, + ], + kws=[centers_new_chunk.params, center_shift_chunk.params], + ) + + upper_bounds_chunks, lower_bounds_chunks = [], [] + for upper_bound_chunk, lower_bound_chunk, labels_chunk in zip( + op.upper_bounds.chunks, op.lower_bounds.chunks, op.labels.chunks + ): + chunk_kws = [upper_bound_chunk.params, lower_bound_chunk.params] + upper_bound_chk, lower_bound_chk = KMeansElkanPostprocess( + center_shift=center_shift_chunk, + lower_bounds=lower_bound_chunk, + upper_bounds=upper_bound_chunk, + labels=labels_chunk, + stage=OperandStage.reduce, + ).new_chunks( + [ + center_shift_chunk, + lower_bound_chunk, + upper_bound_chunk, + labels_chunk, + ], + kws=chunk_kws, + ) + upper_bounds_chunks.append(upper_bound_chk) + lower_bounds_chunks.append(lower_bound_chk) + + centers_new_kw = op.centers_new.params + centers_new_kw["chunks"] = [centers_new_chunk] + centers_new_kw["nsplits"] = op.centers_new.nsplits + center_shift_kw = op.center_shift.params + center_shift_kw["chunks"] = [center_shift_chunk] + center_shift_kw["nsplits"] = op.center_shift.nsplits + upper_bounds_kw = op.upper_bounds.params + upper_bounds_kw["chunks"] = upper_bounds_chunks + upper_bounds_kw["nsplits"] = op.upper_bounds.nsplits + lower_bounds_kw = op.lower_bounds.params + lower_bounds_kw["chunks"] = lower_bounds_chunks + lower_bounds_kw["nsplits"] = op.lower_bounds.nsplits + new_op = op.copy() + return new_op.new_tileables( + op.inputs, + kws=[centers_new_kw, center_shift_kw, upper_bounds_kw, lower_bounds_kw], + ) + + @classmethod + def _execute_combine(cls, ctx, op): + ( + (centers_old, centers_new, center_shift, weight_in_clusters), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] for inp in op.inputs], + op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + out_centers_new = centers_new.copy() + out_center_shift = center_shift.copy() + update_center( + centers_old, out_centers_new, out_center_shift, weight_in_clusters + ) + + ctx[op.outputs[0].key] = out_centers_new + ctx[op.outputs[1].key] = out_center_shift + + @classmethod + def _execute_reduce(cls, ctx, op): + ( + (center_shift, lower_bounds, upper_bounds, labels), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] for inp in op.inputs], + op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + out_upper_bounds = upper_bounds.copy() + out_lower_bounds = lower_bounds.copy() + update_upper_lower_bounds( + out_upper_bounds, out_lower_bounds, labels, center_shift + ) + ctx[op.outputs[0].key] = out_upper_bounds + ctx[op.outputs[1].key] = out_lower_bounds + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.combine: + return cls._execute_combine(ctx, op) + else: + assert op.stage == OperandStage.reduce + return cls._execute_reduce(ctx, op) + + +def elkan_iter( + X, + sample_weight, + centers_old, + center_half_distances, + distance_next_center, + upper_bounds, + lower_bounds, + labels, + center_shift, + update_centers=True, + session=None, + run_kwargs=None, +): + update_op = KMeansElkanUpdate( + x=X, 
+ sample_weight=sample_weight, + centers_old=centers_old, + center_half_distances=center_half_distances, + distance_next_center=distance_next_center, + labels=labels, + upper_bounds=upper_bounds, + lower_bounds=lower_bounds, + update_centers=update_centers, + n_clusters=centers_old.shape[0], + ) + to_run = [] + ret = update_op() + to_run.extend(ret) + labels, upper_bounds, lower_bounds, centers_new, weight_in_clusters = ret + + if update_centers: + centers_new, weight_in_clusters = _relocate_empty_clusters( + X, + sample_weight, + centers_old, + centers_new, + weight_in_clusters, + labels, + to_run=to_run, + session=session, + run_kwargs=run_kwargs, + ) + postprocess = KMeansElkanPostprocess( + centers_old=centers_old, + centers_new=centers_new, + center_shift=center_shift, + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + labels=labels, + weight_in_clusters=weight_in_clusters, + ) + centers_new, center_shift, upper_bounds, lower_bounds = postprocess() + + return ( + centers_new, + weight_in_clusters, + upper_bounds, + lower_bounds, + labels, + center_shift, + ) diff --git a/python/xorbits/_mars/learn/cluster/_k_means_fast.pxd b/python/xorbits/_mars/learn/cluster/_k_means_fast.pxd new file mode 100644 index 000000000..29d704ef7 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_fast.pxd @@ -0,0 +1,12 @@ +from cython cimport floating +cimport numpy as np + + +cdef floating _euclidean_dense_dense(floating*, floating*, int, bint) nogil + +cdef floating _euclidean_sparse_dense(floating[::1], int[::1], floating[::1], + floating, bint) nogil + +cdef void _average_centers(floating[:, ::1], floating[::1]) + +cdef void _center_shift(floating[:, ::1], floating[:, ::1], floating[::1]) diff --git a/python/xorbits/_mars/learn/cluster/_k_means_fast.pyx b/python/xorbits/_mars/learn/cluster/_k_means_fast.pyx new file mode 100644 index 000000000..ebad20ccf --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_fast.pyx @@ -0,0 +1,213 @@ +# cython: profile=False, boundscheck=False, wraparound=False, cdivision=True +# Profiling is enabled by default as the overhead does not seem to be +# measurable on this specific use case. + +# Author: Peter Prettenhofer +# Olivier Grisel +# Lars Buitinck +# +# License: BSD 3 clause + +# TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). This is fixed in cython > 0.3. + +import numpy as np +cimport numpy as np +cimport cython +from cython cimport floating +from libc.math cimport sqrt + +try: + from sklearn.utils.extmath import row_norms +except ImportError: # pragma: no cover + row_norms = None + + +np.import_array() + + +ctypedef np.float64_t DOUBLE +ctypedef np.int32_t INT + + +cdef floating _euclidean_dense_dense( + floating* a, # IN + floating* b, # IN + int n_features, + bint squared) nogil: + """Euclidean distance between a dense and b dense""" + cdef: + int i + int n = n_features // 4 + int rem = n_features % 4 + floating result = 0 + + # We manually unroll the loop for better cache optimization. 
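    # Each unrolled iteration below consumes 4 features at once; the `rem`
    # loop afterwards picks up the leftover features (e.g. n_features == 10
    # gives n == 2 unrolled iterations plus rem == 2 trailing features).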
+ for i in range(n): + result += ((a[0] - b[0]) * (a[0] - b[0]) + +(a[1] - b[1]) * (a[1] - b[1]) + +(a[2] - b[2]) * (a[2] - b[2]) + +(a[3] - b[3]) * (a[3] - b[3])) + a += 4; b += 4 + + for i in range(rem): + result += (a[i] - b[i]) * (a[i] - b[i]) + + return result if squared else sqrt(result) + + +cdef floating _euclidean_sparse_dense( + floating[::1] a_data, # IN + int[::1] a_indices, # IN + floating[::1] b, # IN + floating b_squared_norm, + bint squared) nogil: + """Euclidean distance between a sparse and b dense""" + cdef: + int nnz = a_indices.shape[0] + int i + floating tmp, bi + floating result = 0.0 + + for i in range(nnz): + bi = b[a_indices[i]] + tmp = a_data[i] - bi + result += tmp * tmp - bi * bi + + result += b_squared_norm + + if result < 0: result = 0.0 + + return result if squared else sqrt(result) + + +cpdef floating _inertia_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers, # IN + int[::1] labels): # IN + """Compute inertia for dense input data + + Sum of squared distance between each sample and its assigned center. + """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j + + floating sq_dist = 0.0 + floating inertia = 0.0 + + for i in range(n_samples): + j = labels[i] + sq_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], + n_features, True) + inertia += sq_dist * sample_weight[i] + + return inertia + + +cpdef floating _inertia_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers, # IN + int[::1] labels): # IN + """Compute inertia for sparse input data + + Sum of squared distance between each sample and its assigned center. + """ + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j + + floating sq_dist = 0.0 + floating inertia = 0.0 + + floating[::1] centers_squared_norms = row_norms(centers, squared=True) + + for i in range(n_samples): + j = labels[i] + sq_dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[j], centers_squared_norms[j], True) + inertia += sq_dist * sample_weight[i] + + return inertia + + +cdef void _average_centers( + floating[:, ::1] centers, # INOUT + floating[::1] weight_in_clusters): # IN + """Average new centers wrt weights.""" + cdef: + int n_clusters = centers.shape[0] + int n_features = centers.shape[1] + int j, k + floating alpha + + for j in range(n_clusters): + if weight_in_clusters[j] > 0: + alpha = 1.0 / weight_in_clusters[j] + for k in range(n_features): + centers[j, k] *= alpha + + +cdef void _center_shift( + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # IN + floating[::1] center_shift): # OUT + """Compute shift between old and new centers.""" + cdef: + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + int j + + for j in range(n_clusters): + center_shift[j] = _euclidean_dense_dense( + ¢ers_new[j, 0], ¢ers_old[j, 0], n_features, False) + + +def update_center( + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # INOUT + floating[::1] center_shift, # OUT + floating[::1] weight_in_clusters): # IN + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + +def merge_update_chunks(int n_clusters, + int n_features, + floating[::1] weight_in_clusters, + floating[::1] weight_in_clusters_chunk, + floating[:, 
::1] centers_new, + floating[:, ::1] centers_new_chunk): + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j, k] + + +def update_upper_lower_bounds( + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + int[::1] labels, # IN + floating[::1] center_shift): # IN + cdef: + int n_samples = upper_bounds.shape[0] + int n_clusters = lower_bounds.shape[1] + + for i in range(n_samples): + upper_bounds[i] += center_shift[labels[i]] + + for j in range(n_clusters): + lower_bounds[i, j] -= center_shift[j] + if lower_bounds[i, j] < 0: + lower_bounds[i, j] = 0 diff --git a/python/xorbits/_mars/learn/cluster/_k_means_init.py b/python/xorbits/_mars/learn/cluster/_k_means_init.py new file mode 100644 index 000000000..81808a411 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_init.py @@ -0,0 +1,503 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ... import tensor as mt +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import Int32Field, KeyField +from ...tensor.array_utils import as_same_device, device +from ...tensor.core import TensorOrder +from ...tensor.random import RandomStateField +from ...utils import has_unknown_shape +from ..metrics import euclidean_distances +from ..operands import LearnOperand, LearnOperandMixin + + +def _kmeans_plus_plus_init( + X, x_squared_norms, random_state, n_clusters: int, n_local_trials: int = None +): + n_samples, n_features = X.shape + + centers = mt.empty((n_clusters, n_features), dtype=X.dtype) + + assert x_squared_norms is not None, "x_squared_norms None in _k_init" + + # Set the number of local seeding trials if none is given + if n_local_trials is None: + # This is what Arthur/Vassilvitskii tried, but did not report + # specific results for other than mentioning in the conclusion + # that it helped. 
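        # (np.log is the natural logarithm, so e.g. n_clusters == 8 gives
        # 2 + int(2.079...) == 4 local trials, and n_clusters == 100 gives
        # 2 + int(4.605...) == 6.)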
+ n_local_trials = 2 + int(np.log(n_clusters)) + + # Pick first center randomly + center_id = random_state.randint(n_samples) + if X.issparse(): # pragma: no cover + centers[0] = X[center_id].todense() + else: + centers[0] = X[center_id] + + # Initialize list of closest distances and calculate current potential + closest_dist_sq = euclidean_distances( + centers[0, mt.newaxis], X, Y_norm_squared=x_squared_norms, squared=True + ) + current_pot = closest_dist_sq.sum() + + # Pick the remaining n_clusters-1 points + for c in range(1, n_clusters): + # Choose center candidates by sampling with probability proportional + # to the squared distance to the closest existing center + rand_vals = random_state.random_sample(n_local_trials) * current_pot + candidate_ids = mt.searchsorted(closest_dist_sq.cumsum(), rand_vals) + # XXX: numerical imprecision can result in a candidate_id out of range + candidate_ids = mt.clip(candidate_ids, None, closest_dist_sq.size - 1) + + # Compute distances to center candidates + distance_to_candidates = euclidean_distances( + X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True + ) + + # update closest distances squared and potential for each candidate + distance_to_candidates = mt.minimum(closest_dist_sq, distance_to_candidates) + + candidates_pot = distance_to_candidates.sum(axis=1) + + # Decide which candidate is the best + best_candidate = mt.argmin(candidates_pot) + current_pot = candidates_pot[best_candidate] + closest_dist_sq = distance_to_candidates[best_candidate] + best_candidate = candidate_ids[best_candidate] + + # Permanently add best center candidate found in local tries + if X.issparse(): # pragma: no cover + c_center = X[best_candidate].todense() + else: + c_center = X[best_candidate] + + centers[c] = c_center + + return centers + + +class KMeansPlusPlusInit(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_PLUS_PLUS_INIT + + _x = KeyField("x") + _n_clusters = Int32Field("n_clusters") + _x_squared_norms = KeyField("x_squared_norms") + _state = RandomStateField("state") + _n_local_trials = Int32Field("n_local_trials") + + def __init__( + self, + x=None, + n_clusters=None, + x_squared_norms=None, + state=None, + n_local_trials=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _n_clusters=n_clusters, + _x_squared_norms=x_squared_norms, + _state=state, + _n_local_trials=n_local_trials, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] + + @property + def x(self): + return self._x + + @property + def n_clusters(self): + return self._n_clusters + + @property + def x_squared_norms(self): + return self._x_squared_norms + + @property + def state(self): + return self._state + + @property + def n_local_trials(self): + return self._n_local_trials + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._x = self._inputs[0] + self._x_squared_norms = self._inputs[-1] + + def __call__(self): + inputs = [self._x, self._x_squared_norms] + kw = { + "shape": (self._n_clusters, self._x.shape[1]), + "dtype": self._x.dtype, + "order": TensorOrder.C_ORDER, + } + return self.new_tileable(inputs, kws=[kw]) + + @classmethod + def _tile_one_chunk(cls, op: "KMeansPlusPlusInit"): + out = op.outputs[0] + + chunk_op = op.copy().reset_key() + chunk_kw = out.params.copy() + chunk_kw["index"] = (0, 0) + chunk_inputs = [op.x.chunks[0], op.x_squared_norms.chunks[0]] + chunk = chunk_op.new_chunk(chunk_inputs, kws=[chunk_kw]) + + kw = out.params + kw["chunks"] = 
[chunk] + kw["nsplits"] = tuple((s,) for s in out.shape) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[kw]) + + @classmethod + def tile(cls, op: "KMeansPlusPlusInit"): + if len(op.x.chunks) == 1: + assert len(op.x_squared_norms.chunks) == 1 + return cls._tile_one_chunk(op) + else: + return (yield from cls._tile_k_init(op)) + + @classmethod + def _tile_k_init(cls, op: "KMeansPlusPlusInit"): + X = op.x + n_clusters = op.n_clusters + x_squared_norms = op.x_squared_norms + random_state = op.state + n_local_trials = op.n_local_trials + + centers = _kmeans_plus_plus_init( + X, x_squared_norms, random_state, n_clusters, n_local_trials + ) + return (yield from recursive_tile(centers)) + + @classmethod + def execute(cls, ctx, op: "KMeansPlusPlusInit"): + try: + from sklearn.cluster._kmeans import _kmeans_plusplus + except ImportError: # pragma: no cover + try: + from sklearn.cluster._kmeans import _k_init + except ImportError: + from sklearn.cluster.k_means_ import _k_init + + def _kmeans_plusplus(*args, **kwargs): + return _k_init(*args, **kwargs), None + + (x, x_squared_norms), device_id, _ = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = _kmeans_plusplus( + x, + op.n_clusters, + x_squared_norms=x_squared_norms, + random_state=op.state, + n_local_trials=op.n_local_trials, + )[0] + + +############################################################################### +# Initialization heuristic + + +def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): + """Init n_clusters seeds according to k-means++ + + Parameters + ---------- + X : array or sparse matrix, shape (n_samples, n_features) + The data to pick seeds for. To avoid memory copy, the input data + should be double precision (dtype=np.float64). + + n_clusters : integer + The number of seeds to choose + + x_squared_norms : array, shape (n_samples,) + Squared Euclidean norm of each data point. + + random_state : int, RandomState instance + The generator used to initialize the centers. Use an int to make the + randomness deterministic. + See :term:`Glossary `. + + n_local_trials : integer, optional + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)); this is the default. + + Notes + ----- + Selects initial cluster centers for k-mean clustering in a smart way + to speed up convergence. see: Arthur, D. and Vassilvitskii, S. + "k-means++: the advantages of careful seeding". ACM-SIAM symposium + on Discrete algorithms. 2007 + + Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip, + which is the implementation used in the aforementioned paper. 
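
    A minimal usage sketch (illustrative only; assumes a local build of the
    package and lets execution fall back to a default session)::

        import numpy as np

        from xorbits._mars import tensor as mt
        from xorbits._mars.learn.cluster._k_means_init import _k_init

        X = mt.tensor(np.random.RandomState(0).rand(100, 2), chunk_size=50)
        x_squared_norms = (X ** 2).sum(axis=1)
        centers = _k_init(X, n_clusters=3, x_squared_norms=x_squared_norms,
                          random_state=np.random.RandomState(0))
        print(centers.execute())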
+ """ + op = KMeansPlusPlusInit( + x=X, + n_clusters=n_clusters, + x_squared_norms=x_squared_norms, + state=random_state, + n_local_trials=n_local_trials, + ) + return op() + + +class KMeansScalablePlusPlusInit(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_SCALABLE_PLUS_PLUS_INIT + + _x = KeyField("x") + _n_clusters = Int32Field("n_clusters") + _x_squared_norms = KeyField("x_squared_norms") + _state = RandomStateField("state") + _init_iter = Int32Field("init_iter") + _oversampling_factor = Int32Field("oversampling_factor") + + def __init__( + self, + x=None, + n_clusters=None, + x_squared_norms=None, + state=None, + init_iter=None, + oversampling_factor=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _n_clusters=n_clusters, + _x_squared_norms=x_squared_norms, + _state=state, + _init_iter=init_iter, + _oversampling_factor=oversampling_factor, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] + + @property + def x(self): + return self._x + + @property + def n_clusters(self): + return self._n_clusters + + @property + def x_squared_norms(self): + return self._x_squared_norms + + @property + def state(self): + return self._state + + @property + def init_iter(self): + return self._init_iter + + @property + def oversampling_factor(self): + return self._oversampling_factor + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._x is not None: + self._x = self._inputs[0] + if self._x_squared_norms is not None: + self._x_squared_norms = self._inputs[-1] + + def __call__(self): + inputs = [self._x, self._x_squared_norms] + kw = { + "shape": (self._n_clusters, self._x.shape[1]), + "dtype": self._x.dtype, + "order": TensorOrder.C_ORDER, + } + return self.new_tileable(inputs, kws=[kw]) + + @classmethod + def tile(cls, op: "KMeansScalablePlusPlusInit"): + if has_unknown_shape(*op.inputs): + yield + + x = mt.tensor(op.x) + x_squared_norms = mt.atleast_2d(op.x_squared_norms) + out = op.outputs[0] + + random_state = op.state + rs = mt.random.RandomState.from_numpy(random_state) + + n_samples, n_features = x.shape + n_clusters = op.n_clusters + + # step 1, sample a centroid + centers = x[random_state.randint(n_samples, size=1)] + + for _ in range(op.init_iter): + distances = euclidean_distances( + x, centers, X_norm_squared=x_squared_norms, squared=True + ) + + # calculate the cost of data with respect to current centers + cost = mt.sum(mt.min(distances, axis=1)) + + # calculate the distribution to sample new centers + distribution = mt.full(len(distances), 1 / len(distances)) + mt.true_divide( + mt.min(distances, axis=1), cost, where=cost != 0, out=distribution + ) + + # pick new centers + new_centers_size = op.oversampling_factor * n_clusters + new_centers = x[rs.choice(n_samples, new_centers_size, p=distribution)] + + centers = mt.concatenate([centers, new_centers]) + + # rechunk centers into one chunk + centers = (yield from recursive_tile(centers)).rechunk(centers.shape) + + distances = yield from recursive_tile( + euclidean_distances( + x, centers, X_norm_squared=x_squared_norms, squared=True + ) + ) + + map_index_to_chunks = {} + # calculate weight for each chunk + for c in distances.chunks: + map_chunk_op = KMeansScalablePlusPlusInit(stage=OperandStage.map) + map_chunk_kw = { + "shape": (len(centers),), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "index": c.index, + } + map_chunk = map_chunk_op.new_chunk([c], kws=[map_chunk_kw]) + 
map_index_to_chunks[c.index] = map_chunk + + combine_chunks = [] + for i in range(distances.chunk_shape[0]): + map_chunks = [ + map_index_to_chunks[i, j] for j in range(distances.chunk_shape[1]) + ] + combine_chunk_op = KMeansScalablePlusPlusInit(stage=OperandStage.combine) + combine_chunk_kw = { + "shape": (len(centers),), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "index": (i,), + } + combine_chunk = combine_chunk_op.new_chunk( + map_chunks, kws=[combine_chunk_kw] + ) + combine_chunks.append(combine_chunk) + + reduce_chunk_op = KMeansScalablePlusPlusInit( + n_clusters=op.n_clusters, state=random_state, stage=OperandStage.reduce + ) + reduce_chunk_kw = out.params + reduce_chunk_kw["index"] = (0, 0) + reduce_chunk = reduce_chunk_op.new_chunk( + [centers.chunks[0]] + combine_chunks, kws=[reduce_chunk_kw] + ) + + new_op = op.copy() + kw = out.params + kw["chunks"] = [reduce_chunk] + kw["nsplits"] = tuple((s,) for s in out.shape) + return new_op.new_tileables(op.inputs, kws=[kw]) + + @classmethod + def _execute_map(cls, ctx, op: "KMeansScalablePlusPlusInit"): + distances = ctx[op.inputs[0].key] + min_distance_ids = np.argmin(distances, axis=1) + min_distances = distances[range(len(distances)), min_distance_ids] + ctx[op.outputs[0].key] = (min_distances, min_distance_ids) + + @classmethod + def _execute_combine(cls, ctx, op: "KMeansScalablePlusPlusInit"): + out = op.outputs[0] + all_distances, all_min_distance_ids = tuple( + zip(*(ctx[inp.key] for inp in op.inputs)) + ) + distances = np.stack(all_distances).T + min_distance_ids = np.stack(all_min_distance_ids).T + + combined_min_distance_id = np.argmin(distances, axis=1) + min_distance_ids = min_distance_ids[ + range(len(distances)), combined_min_distance_id + ] + count = np.bincount(min_distance_ids) + result = np.zeros(out.shape[0], dtype=np.int64) + result[: len(count)] = count + ctx[out.key] = result + + @classmethod + def _execute_reduce(cls, ctx, op: "KMeansScalablePlusPlusInit"): + from sklearn.cluster import KMeans + + inputs = [ctx[inp.key] for inp in op.inputs] + + count = np.zeros(inputs[1].shape[0], dtype=np.int64) + for inp in inputs[1:]: + count += inp + weight = count / count.sum() + + centers = inputs[0] + + kmeans = KMeans(n_clusters=op.n_clusters, n_init=1, random_state=op.state) + kmeans.fit(centers, sample_weight=weight) + ctx[op.outputs[0].key] = kmeans.cluster_centers_ + + @classmethod + def execute(cls, ctx, op: "KMeansScalablePlusPlusInit"): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + return cls._execute_combine(ctx, op) + else: + return cls._execute_reduce(ctx, op) + + +def _scalable_k_init( + X, n_clusters, x_squared_norms, random_state, oversampling_factor=2, init_iter=5 +): + op = KMeansScalablePlusPlusInit( + x=X, + n_clusters=n_clusters, + x_squared_norms=x_squared_norms, + state=random_state, + init_iter=init_iter, + oversampling_factor=oversampling_factor, + ) + return op() diff --git a/python/xorbits/_mars/learn/cluster/_k_means_lloyd.pyx b/python/xorbits/_mars/learn/cluster/_k_means_lloyd.pyx new file mode 100644 index 000000000..d0190ecf5 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_lloyd.pyx @@ -0,0 +1,184 @@ +# cython: profile=False, boundscheck=False, wraparound=False, cdivision=True +# +# Licence: BSD 3 clause + +# TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). 
This is fixed in cython > 0.3. + +import numpy as np +cimport numpy as np +from cython cimport floating +from libc.stdlib cimport malloc, free +from libc.float cimport DBL_MAX, FLT_MAX + +from ..utils._cython_blas cimport _gemm +from ..utils._cython_blas cimport RowMajor, Trans, NoTrans + + +np.import_array() + + +def update_chunk_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + bint update_centers=True): + cdef: + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + floating *pairwise_distances + + pairwise_distances = malloc(n_samples * n_clusters * sizeof(floating)) + result = _update_chunk_dense(&X[0, 0], sample_weight, x_squared_norms, + centers_old, centers_squared_norms, + labels, ¢ers_new[0, 0], + &weight_in_clusters[0], pairwise_distances, + update_centers) + free(pairwise_distances) + return result + + +cdef void _update_chunk_dense( + floating *X, # IN + # expecting C aligned 2D array. XXX: Can be + # replaced by const memoryview when cython min + # version is >= 0.3 + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + floating *pairwise_distances, # OUT + bint update_centers) nogil: + """K-means combined EM step for one dense data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating sq_dist, min_sq_dist + int i, j, k, label + + # Instead of computing the full pairwise squared distances matrix, + # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to store + # the - 2 X.C^T + ||C||² term since the argmin for a given sample only + # depends on the centers. 
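A quick NumPy sanity check of the argmin-preserving decomposition used in the comment above (a sketch with hypothetical `X` and `C`; the actual kernel computes the `-2 X.C^T` term with a BLAS GEMM):

```python
# ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2; the ||x||^2 term is constant per
# sample, so dropping it cannot change the argmin over centers.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(6, 4)   # hypothetical samples
C = rng.rand(3, 4)   # hypothetical centers

full = ((X[:, None, :] - C[None, :, :]) ** 2).sum(axis=2)    # ||x - c||^2
partial = -2.0 * X @ C.T + (C ** 2).sum(axis=1)              # -2 x.c + ||c||^2

assert (full.argmin(axis=1) == partial.argmin(axis=1)).all()
```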
+ # pairwise_distances = ||C||² + for i in range(n_samples): + for j in range(n_clusters): + pairwise_distances[i * n_clusters + j] = centers_squared_norms[j] + + # pairwise_distances += -2 * X.dot(C.T) + _gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features, + -2.0, X, n_features, ¢ers_old[0, 0], n_features, + 1.0, pairwise_distances, n_clusters) + + for i in range(n_samples): + min_sq_dist = pairwise_distances[i * n_clusters] + label = 0 + for j in range(1, n_clusters): + sq_dist = pairwise_distances[i * n_clusters + j] + if sq_dist < min_sq_dist: + min_sq_dist = sq_dist + label = j + labels[i] = label + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(n_features): + centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i] + + +def update_chunk_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + bint update_centers=True): + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + X_data = X.data + X_indices = X.indices + X_indptr = X.indptr + + return _update_chunk_sparse( + X_data, X_indices, X_indptr, sample_weight, + x_squared_norms, centers_old, centers_squared_norms, + labels, ¢ers_new[0, 0], &weight_in_clusters[0], + update_centers + ) + + +cdef void _update_chunk_sparse( + floating[::1] X_data, # IN + int[::1] X_indices, # IN + int[::1] X_indptr, # IN + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) nogil: + """K-means combined EM step for one sparse data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating sq_dist, min_sq_dist + int i, j, k, label + floating max_floating = FLT_MAX if floating is float else DBL_MAX + int s = X_indptr[0] + + # XXX Precompute the pairwise distances matrix is not worth for sparse + # currently. Should be tested when BLAS (sparse x dense) matrix + # multiplication is available. + for i in range(n_samples): + min_sq_dist = max_floating + label = 0 + + for j in range(n_clusters): + sq_dist = 0.0 + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + sq_dist += centers_old[j, X_indices[k]] * X_data[k] + + # Instead of computing the full squared distance with each cluster, + # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to compute + # the - 2 X.C^T + ||C||² term since the argmin for a given sample + # only depends on the centers C. 
+ sq_dist = centers_squared_norms[j] -2 * sq_dist + if sq_dist < min_sq_dist: + min_sq_dist = sq_dist + label = j + + labels[i] = label + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i] diff --git a/python/xorbits/_mars/learn/cluster/_k_means_lloyd_iter.py b/python/xorbits/_mars/learn/cluster/_k_means_lloyd_iter.py new file mode 100644 index 000000000..ca9121fee --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_lloyd_iter.py @@ -0,0 +1,449 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from sklearn.utils.extmath import row_norms as sklearn_row_norms + +from ... import opcodes +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField, Int32Field, KeyField +from ...tensor.array_utils import as_same_device, device, sparse +from ...tensor.core import TensorOrder +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin +from ._k_means_common import _execute_merge_update, _relocate_empty_clusters +from ._k_means_fast import update_center +from ._k_means_lloyd import update_chunk_dense, update_chunk_sparse + + +class KMeansLloydUpdate(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_LLOYD_UPDATE + + _x = KeyField("x") + _sample_weight = KeyField("sample_weight") + _x_squared_norms = KeyField("x_squared_norms") + _centers_old = KeyField("centers_old") + _labels = KeyField("labels") + _update_centers = BoolField("update_centers") + _n_clusters = Int32Field("n_clusters") + + def __init__( + self, + x=None, + sample_weight=None, + x_squared_norms=None, + centers_old=None, + labels=None, + update_centers=None, + n_clusters=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _sample_weight=sample_weight, + _x_squared_norms=x_squared_norms, + _centers_old=centers_old, + _labels=labels, + _update_centers=update_centers, + _n_clusters=n_clusters, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def x(self): + return self._x + + @property + def sample_weight(self): + return self._sample_weight + + @property + def x_squared_norms(self): + return self._x_squared_norms + + @property + def centers_old(self): + return self._centers_old + + @property + def labels(self): + return self._labels + + @property + def update_centers(self): + return self._update_centers + + @property + def n_clusters(self): + return self._n_clusters + + @property + def output_limit(self): + return 3 if self.stage != OperandStage.reduce else 2 + + @property + def _input_fields(self): + return "_x", "_sample_weight", "_x_squared_norms", "_centers_old", "_labels" + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = 
iter(self._inputs) + for field in self._input_fields: + if getattr(self, field, None) is not None: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + kws = [ + # labels + self._labels.params, + # centers_new + { + "shape": (self._n_clusters, self._x.shape[1]), + "dtype": self._centers_old.dtype, + "order": TensorOrder.C_ORDER, + }, + # weight_in_clusters + { + "shape": (self._n_clusters,), + "dtype": self._centers_old.dtype, + "order": TensorOrder.C_ORDER, + }, + ] + return self.new_tileables( + [getattr(self, field) for field in self._input_fields], kws=kws + ) + + @classmethod + def tile(cls, op: "KMeansLloydUpdate"): + if has_unknown_shape(*op.inputs): + yield + x = op.x + if x.chunk_shape[1] != 1: # pragma: no cover + x = yield from recursive_tile(x.rechunk({1: x.shape[1]})) + sample_weight = yield from recursive_tile( + op.sample_weight.rechunk({0: x.nsplits[0]}) + ) + x_squared_norms = yield from recursive_tile( + op.x_squared_norms.rechunk({0: x.nsplits[0]}) + ) + labels = yield from recursive_tile(op.labels.rechunk({0: x.nsplits[0]})) + assert len(op.centers_old.chunks) == 1 + + labels_chunks, centers_new_chunks, weight_in_clusters_chunks = [], [], [] + for i in range(x.chunk_shape[0]): + x_chunk = x.cix[i, 0] + sample_weight_chunk = sample_weight.cix[i,] + x_squared_norms_chunk = x_squared_norms.cix[i,] + labels_chunk = labels.cix[i,] + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + chunk_kws = [ + labels_chunk.params, + { + "index": (0, 0), + "shape": (op.n_clusters, x_chunk.shape[1]), + "dtype": op.centers_old.dtype, + "order": TensorOrder.C_ORDER, + }, + { + "index": (0,), + "shape": (op.n_clusters,), + "dtype": op.centers_old.dtype, + "order": TensorOrder.C_ORDER, + }, + ] + ( + labels_chunk, + centers_new_chunk, + weight_in_clusters_chunk, + ) = chunk_op.new_chunks( + [ + x_chunk, + sample_weight_chunk, + x_squared_norms_chunk, + op.centers_old.chunks[0], + labels_chunk, + ], + kws=chunk_kws, + ) + labels_chunks.append(labels_chunk) + centers_new_chunks.append(centers_new_chunk) + weight_in_clusters_chunks.append(weight_in_clusters_chunk) + + if op.update_centers: + # merge centers_new and weight_in_clusters + merge_op = KMeansLloydUpdate( + stage=OperandStage.reduce, n_clusters=op.n_clusters + ) + merge_chunk_kw = [ + centers_new_chunks[0].params, + weight_in_clusters_chunks[0].params, + ] + centers_new_chunk, weight_in_cluster_chunk = merge_op.new_chunks( + centers_new_chunks + weight_in_clusters_chunks, kws=merge_chunk_kw + ) + else: + # the data is meaningless, just pick one + centers_new_chunk = centers_new_chunks[0] + weight_in_cluster_chunk = weight_in_clusters_chunks[0] + + out_params = [out.params for out in op.outputs] + # labels + out_params[0]["nsplits"] = labels.nsplits + out_params[0]["chunks"] = labels_chunks + # centers_new + out_params[1]["nsplits"] = tuple((s,) for s in op.outputs[1].shape) + out_params[1]["chunks"] = [centers_new_chunk] + # weight_in_clusters + out_params[2]["nsplits"] = tuple((s,) for s in op.outputs[2].shape) + out_params[2]["chunks"] = [weight_in_cluster_chunk] + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=out_params) + + @classmethod + def _execute_reduce(cls, ctx, op): + return _execute_merge_update(ctx, op) + + @classmethod + def execute(cls, ctx, op: "KMeansLloydUpdate"): + if op.stage == OperandStage.reduce: + return cls._execute_reduce(ctx, op) + else: + ( + (x, sample_weight, x_squared_norms, centers_old, labels), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] 
for inp in op.inputs], + device=op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + if not op.update_centers: + centers_new = centers_old.copy() + else: + centers_new = np.zeros_like(centers_old) + weight_in_clusters = np.zeros(op.n_clusters, dtype=x.dtype) + centers_squared_norms = sklearn_row_norms(centers_old, squared=True) + + if xp is np: + method = update_chunk_dense + elif xp is sparse: + method = update_chunk_sparse + else: # pragma: no cover + raise NotImplementedError("Does not support run on GPU") + out_labels = labels.copy() + method( + x, + sample_weight, + x_squared_norms, + centers_old, + centers_squared_norms, + out_labels, + centers_new, + weight_in_clusters, + op.update_centers, + ) + + # labels + ctx[op.outputs[0].key] = out_labels + # centers_new + ctx[op.outputs[1].key] = centers_new + # weight_in_cluster + ctx[op.outputs[2].key] = weight_in_clusters + + +class KMeansLloydPostprocess(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_LLOYD_POSTPROCESS + + _centers_old = KeyField("centers_old") + _centers_new = KeyField("centers_new") + _center_shift = KeyField("center_shift") + _weight_in_clusters = KeyField("weight_in_clusters") + + def __init__( + self, + centers_old=None, + centers_new=None, + center_shift=None, + weight_in_clusters=None, + output_types=None, + **kw + ): + super().__init__( + _centers_old=centers_old, + _centers_new=centers_new, + _center_shift=center_shift, + _weight_in_clusters=weight_in_clusters, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def centers_old(self): + return self._centers_old + + @property + def centers_new(self): + return self._centers_new + + @property + def center_shift(self): + return self._center_shift + + @property + def weight_in_clusters(self): + return self._weight_in_clusters + + @property + def output_limit(self): + return 2 + + @property + def _input_fields(self): + return "_centers_old", "_centers_new", "_center_shift", "_weight_in_clusters" + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + for field in self._input_fields: + ob = getattr(self, field) + if ob is not None: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + kws = [ + # centers_new + self._centers_new.params, + # center_shift + self._center_shift.params, + ] + return self.new_tileables( + [getattr(self, f) for f in self._input_fields], kws=kws + ) + + @classmethod + def tile(cls, op: "KMeansLloydPostprocess"): + assert len(op.centers_old.chunks) == 1 + assert len(op.centers_new.chunks) == 1 + assert len(op.center_shift.chunks) == 1 + assert len(op.weight_in_clusters.chunks) == 1 + + centers_old_chunk = op.centers_old.chunks[0] + centers_new_chunk = op.centers_new.chunks[0] + center_shift_chunk = op.center_shift.chunks[0] + weight_in_clusters_chunk = op.weight_in_clusters.chunks[0] + centers_new_chunk, center_shift_chunk = KMeansLloydPostprocess( + centers_old=centers_old_chunk, + centers_new=centers_new_chunk, + center_shift=center_shift_chunk, + weight_in_clusters=weight_in_clusters_chunk, + ).new_chunks( + [ + centers_old_chunk, + centers_new_chunk, + center_shift_chunk, + weight_in_clusters_chunk, + ], + kws=[centers_new_chunk.params, center_shift_chunk.params], + ) + + centers_new_kw = op.centers_new.params + centers_new_kw["chunks"] = [centers_new_chunk] + centers_new_kw["nsplits"] = op.centers_new.nsplits + center_shift_kw = 
op.center_shift.params + center_shift_kw["chunks"] = [center_shift_chunk] + center_shift_kw["nsplits"] = op.center_shift.nsplits + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[centers_new_kw, center_shift_kw]) + + @classmethod + def execute(cls, ctx, op: "KMeansLloydPostprocess"): + ( + (centers_old, centers_new, center_shift, weight_in_clusters), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] for inp in op.inputs], + op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + out_center_shift = center_shift.copy() + out_centers_new = centers_new.copy() + update_center( + centers_old, out_centers_new, out_center_shift, weight_in_clusters + ) + + ctx[op.outputs[0].key] = out_centers_new + ctx[op.outputs[1].key] = out_center_shift + + +def lloyd_iter( + X, + sample_weight, + x_squared_norms, + centers_old, + labels, + center_shift, + update_centers=True, + session=None, + run_kwargs=None, +): + update_op = KMeansLloydUpdate( + x=X, + sample_weight=sample_weight, + x_squared_norms=x_squared_norms, + centers_old=centers_old, + labels=labels, + update_centers=update_centers, + n_clusters=centers_old.shape[0], + ) + to_run = [] + ret = update_op() + to_run.extend(ret) + labels, centers_new, weight_in_clusters = ret + + if update_centers: + centers_new, weight_in_clusters = _relocate_empty_clusters( + X, + sample_weight, + centers_old, + centers_new, + weight_in_clusters, + labels, + to_run=to_run, + session=session, + run_kwargs=run_kwargs, + ) + postprocess = KMeansLloydPostprocess( + centers_old=centers_old, + centers_new=centers_new, + center_shift=center_shift, + weight_in_clusters=weight_in_clusters, + ) + centers_new, center_shift = postprocess() + + return centers_new, weight_in_clusters, labels, center_shift diff --git a/python/xorbits/_mars/learn/cluster/_kmeans.py b/python/xorbits/_mars/learn/cluster/_kmeans.py new file mode 100644 index 000000000..112b3b245 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_kmeans.py @@ -0,0 +1,1122 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import numpy as np +from sklearn.base import ClusterMixin, TransformerMixin +from sklearn.exceptions import ConvergenceWarning + +from ... 
import tensor as mt +from ...tensor.utils import check_random_state +from ..base import BaseEstimator +from ..metrics.pairwise import euclidean_distances +from ..utils.extmath import row_norms +from ..utils.validation import ( + _check_sample_weight, + _num_samples, + check_array, + check_is_fitted, +) +from ._k_means_common import _inertia +from ._k_means_elkan_iter import elkan_iter, init_bounds +from ._k_means_init import _k_init, _scalable_k_init +from ._k_means_lloyd_iter import lloyd_iter + +############################################################################### +# K-means batch estimation by EM (expectation maximization) + + +def _validate_center_shape(X, n_centers, centers): + """Check if centers is compatible with X and n_centers""" + if len(centers) != n_centers: + raise ValueError( + "The shape of the initial centers (%s) " + "does not match the number of clusters %i" % (centers.shape, n_centers) + ) + if centers.shape[1] != X.shape[1]: + raise ValueError( + "The number of features of the initial centers %s " + "does not match the number of features of the data %s." + % (centers.shape[1], X.shape[1]) + ) + + +def _tolerance(X, tol): + """Return a tolerance which is independent of the dataset""" + variances = mt.var(X, axis=0) + return mt.mean(variances) * tol + + +def _check_normalize_sample_weight(sample_weight, X): + """Set sample_weight if None, and check for correct dtype""" + + sample_weight_was_none = sample_weight is None + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + if not sample_weight_was_none: + # normalize the weights to sum up to n_samples + # an array of 1 (i.e. samples_weight is None) is already normalized + n_samples = len(sample_weight) + scale = n_samples / sample_weight.sum() + sample_weight *= scale + return sample_weight + + +def k_means( + X, + n_clusters, + sample_weight=None, + init="k-means||", + n_init=10, + max_iter=300, + verbose=False, + tol=1e-4, + random_state=None, + copy_x=True, + algorithm="auto", + oversampling_factor=2, + init_iter=5, + return_n_iter=False, +): + """K-means clustering algorithm. + + Parameters + ---------- + X : Tensor, shape (n_samples, n_features) + The observations to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + + n_clusters : int + The number of clusters to form as well as the number of + centroids to generate. + + sample_weight : array-like, shape (n_samples,), optional + The weights for each observation in X. If None, all observations + are assigned equal weight (default: None) + + init : {'k-means++', 'k-means||', 'random', or tensor, or a callable}, optional + Method for initialization, default to 'k-means||': + + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + + 'k-means||': scalable k-means++. + + 'random': choose k observations (rows) at random from data for + the initial centroids. + + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + If a callable is passed, it should take arguments X, k and + and a random state and return an initialization. + + n_init : int, optional, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. 
+ + max_iter : int, optional, default 300 + Maximum number of iterations of the k-means algorithm to run. + + verbose : boolean, optional + Verbosity mode. + + tol : float, optional + The relative increment in the results before declaring convergence. + + random_state : int, RandomState instance or None (default) + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. + + copy_x : bool, optional + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True (default), then the original data is + not modified, ensuring X is C-contiguous. If False, the original data + is modified, and put back before the function returns, but small + numerical differences may be introduced by subtracting and then adding + the data mean, in this case it will also not ensure that data is + C-contiguous which may cause a significant slowdown. + + algorithm : "auto", "full" or "elkan", default="auto" + K-means algorithm to use. The classical EM-style algorithm is "full". + The "elkan" variation is more efficient by using the triangle + inequality, but currently doesn't support sparse data. "auto" chooses + "elkan" for dense data and "full" for sparse data. + + oversampling_factor: int, default=2 + Only work for kmeans||, used in each iteration in kmeans||. + + init_iter: int, default=5 + Only work for kmeans||, indicates how may iterations required. + + return_n_iter : bool, optional + Whether or not to return the number of iterations. + + Returns + ------- + centroid : float ndarray with shape (k, n_features) + Centroids found at the last iteration of k-means. + + label : integer ndarray with shape (n_samples,) + label[i] is the code or index of the centroid the + i'th observation is closest to. + + inertia : float + The final value of the inertia criterion (sum of squared distances to + the closest centroid for all observations in the training set). + + best_n_iter : int + Number of iterations corresponding to the best results. + Returned only if `return_n_iter` is set to True. 
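A minimal usage sketch of this function, assuming a default session is available and that `k_means` is exported from the cluster package (as the tests below import it); the printed values are not guaranteed here:

```python
import mars.tensor as mt
from mars.learn.cluster import k_means

X = mt.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]])
centroids, labels, inertia, n_iter = k_means(
    X, n_clusters=2, n_init=1, random_state=0, init="k-means++", return_n_iter=True
)
print(centroids.fetch(), labels.fetch(), inertia, n_iter)
```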
+ """ + + est = KMeans( + n_clusters=n_clusters, + init=init, + n_init=n_init, + max_iter=max_iter, + verbose=verbose, + tol=tol, + random_state=random_state, + copy_x=copy_x, + algorithm=algorithm, + oversampling_factor=oversampling_factor, + init_iter=init_iter, + ).fit(X, sample_weight=sample_weight) + if return_n_iter: + return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_ + else: + return est.cluster_centers_, est.labels_, est.inertia_ + + +def _kmeans_single_elkan( + X, + sample_weight, + centers_init, + n_clusters, + max_iter=300, + verbose=False, + x_squared_norms=None, + tol=1e-4, + X_mean=None, + session=None, + run_kwargs=None, +): + sample_weight = _check_normalize_sample_weight(sample_weight, X) + + centers = centers_init + # execute X, centers and tol first + tol = mt.asarray(tol) + to_run = [X, sample_weight, centers, x_squared_norms, tol] + if X_mean is not None: + to_run.append(X_mean) + mt.ExecutableTuple(to_run).execute(session=session, **(run_kwargs or dict())) + tol = tol.fetch(session=session) + + if verbose: + print("Initialization complete") + + center_half_distances = euclidean_distances(centers) / 2 + distance_next_center = mt.partition( + mt.asarray(center_half_distances), kth=1, axis=0 + )[1] + center_shift = mt.zeros(n_clusters, dtype=X.dtype) + + labels, upper_bounds, lower_bounds = init_bounds( + X, centers, center_half_distances, n_clusters + ) + + for i in range(max_iter): + to_runs = [] + + ( + centers_new, + weight_in_clusters, + upper_bounds, + lower_bounds, + labels, + center_shift, + ) = elkan_iter( + X, + sample_weight, + centers, + center_half_distances, + distance_next_center, + upper_bounds, + lower_bounds, + labels, + center_shift, + session=session, + run_kwargs=run_kwargs, + ) + to_runs.extend( + [ + centers_new, + weight_in_clusters, + upper_bounds, + lower_bounds, + labels, + center_shift, + ] + ) + + # compute new pairwise distances between centers and closest other + # center of each center for next iterations + center_half_distances = euclidean_distances(centers_new) / 2 + distance_next_center = mt.partition( + mt.asarray(center_half_distances), kth=1, axis=0 + )[1] + to_runs.extend([center_half_distances, distance_next_center]) + + if verbose: + inertia = _inertia(X, sample_weight, centers, labels) + to_runs.append(inertia) + + center_shift_tot = (center_shift**2).sum() + to_runs.append(center_shift_tot) + + mt.ExecutableTuple(to_runs).execute(session=session, **(run_kwargs or dict())) + + if verbose: + inertia_data = inertia.fetch(session=session) + print(f"Iteration {i}, inertia {inertia_data}") + + center_shift_tot = center_shift_tot.fetch(session=session) + if center_shift_tot <= tol: + if verbose: # pragma: no cover + print( + f"Converged at iteration {i}: center shift {center_shift_tot} " + f"within tolerance {tol}" + ) + break + + centers, centers_new = centers_new, centers + + if center_shift_tot > 0: + # rerun E-step so that predicted labels match cluster centers + ( + centers_new, + weight_in_clusters, + upper_bounds, + lower_bounds, + labels, + center_shift, + ) = elkan_iter( + X, + sample_weight, + centers, + center_half_distances, + distance_next_center, + upper_bounds, + lower_bounds, + labels, + center_shift, + update_centers=False, + session=session, + run_kwargs=run_kwargs, + ) + + inertia = _inertia(X, sample_weight, centers, labels) + + mt.ExecutableTuple([labels, inertia, centers]).execute( + session=session, **(run_kwargs or dict()) + ) + return labels, inertia, centers, i + 1 + + +def 
_kmeans_single_lloyd( + X, + sample_weight, + centers_init, + n_clusters, + max_iter=300, + verbose=False, + x_squared_norms=None, + tol=1e-4, + X_mean=None, + session=None, + run_kwargs=None, +): + sample_weight = _check_normalize_sample_weight(sample_weight, X) + + centers = centers_init + # execute X, centers and tol first + tol = mt.asarray(tol) + to_run = [X, centers, x_squared_norms, tol] + if X_mean is not None: + to_run.append(X_mean) + mt.ExecutableTuple(to_run).execute(session=session, **(run_kwargs or dict())) + tol = tol.fetch(session=session) + + if verbose: # pragma: no cover + print("Initialization complete") + + labels = mt.full(X.shape[0], -1, dtype=mt.int32) + center_shift = mt.zeros(n_clusters, dtype=X.dtype) + + for i in range(max_iter): + to_runs = [] + + centers_new, weight_in_clusters, labels, center_shift = lloyd_iter( + X, + sample_weight, + x_squared_norms, + centers, + labels, + center_shift, + update_centers=True, + session=session, + run_kwargs=run_kwargs, + ) + to_runs.extend([centers_new, weight_in_clusters, labels, center_shift]) + + if verbose: + inertia = _inertia(X, sample_weight, centers, labels) + to_runs.append(inertia) + + center_shift_tot = (center_shift**2).sum() + to_runs.append(center_shift_tot) + + mt.ExecutableTuple(to_runs).execute(session=session, **(run_kwargs or dict())) + + if verbose: # pragma: no cover + inertia_data = inertia.fetch(session=session) + print(f"Iteration {i}, inertia {inertia_data}") + + center_shift_tot = center_shift_tot.fetch(session=session) + if center_shift_tot <= tol: + if verbose: # pragma: no cover + print( + f"Converged at iteration {i}: center shift {center_shift_tot} " + f"within tolerance {tol}" + ) + break + + centers, centers_new = centers_new, centers + + if center_shift_tot > 0: + # rerun E-step so that predicted labels match cluster centers + centers_new, weight_in_clusters, labels, center_shift = lloyd_iter( + X, + sample_weight, + x_squared_norms, + centers, + labels, + center_shift, + update_centers=False, + session=session, + run_kwargs=run_kwargs, + ) + + inertia = _inertia(X, sample_weight, centers, labels) + + mt.ExecutableTuple([labels, inertia, centers]).execute( + session=session, **(run_kwargs or dict()) + ) + return labels, inertia, centers, i + 1 + + +def _labels_inertia( + X, sample_weight, x_squared_norms, centers, session=None, run_kwargs=None +): + """E step of the K-means EM algorithm. + + Compute the labels and the inertia of the given samples and centers. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples to assign to the labels. If sparse matrix, must be in + CSR format. + + sample_weight : array-like of shape (n_samples,) + The weights for each observation in X. + + x_squared_norms : Tensor of shape (n_samples,) + Precomputed squared euclidean norm of each data point, to speed up + computations. + + centers : Tensor, shape (n_clusters, n_features) + The cluster centers. + + Returns + ------- + labels : ndarray of shape (n_samples,) + The resulting assignment + + inertia : float + Sum of squared distances of samples to their closest cluster center. 
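Conceptually, this E-step is an argmin over per-center squared distances plus a weighted sum of the minima; a plain-NumPy sketch of the same computation (illustrative names and small hypothetical arrays; the real helper reuses `lloyd_iter` with `update_centers=False` on chunked tensors):

```python
import numpy as np

def labels_inertia_sketch(X, sample_weight, centers):
    # Squared distance from every sample to every center.
    d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    labels = d2.argmin(axis=1)
    inertia = (sample_weight * d2[np.arange(len(X)), labels]).sum()
    return labels, inertia

X = np.random.RandomState(0).rand(8, 3)
labels, inertia = labels_inertia_sketch(X, np.ones(8), X[:2])
```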
+ """ + n_samples = X.shape[0] + n_clusters = centers.shape[0] + + sample_weight = _check_normalize_sample_weight(sample_weight, X) + labels = mt.full(n_samples, -1, dtype=np.int32) + weight_in_clusters = mt.zeros(n_clusters, dtype=centers.dtype) + center_shift = mt.zeros_like(weight_in_clusters) + + centers, weight_in_clusters, labels, center_shift = lloyd_iter( + X, + sample_weight, + x_squared_norms, + centers, + labels, + center_shift, + update_centers=False, + session=session, + run_kwargs=run_kwargs, + ) + + inertia = _inertia(X, sample_weight, centers, labels) + + return labels, inertia + + +def _init_centroids( + X, + n_clusters=8, + init="k-means++", + random_state=None, + x_squared_norms=None, + init_size=None, + oversampling_factor=2, + init_iter=5, +): + """Compute the initial centroids + + Parameters + ---------- + + X : Tensor of shape (n_samples, n_features) + The input samples. + + n_clusters : int, default=8 + number of centroids. + + init : {'k-means++', 'k-means||', 'random', tensor, callable}, default="k-means++" + Method for initialization. + + random_state : int, RandomState instance, default=None + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. + + x_squared_norms : tensor of shape (n_samples,), default=None + Squared euclidean norm of each data point. Pass it if you have it at + hands already to avoid it being recomputed here. Default: None + + init_size : int, default=None + Number of samples to randomly sample for speeding up the + initialization (sometimes at the expense of accuracy): the + only algorithm is initialized by running a batch KMeans on a + random subset of the data. This needs to be larger than k. + + Returns + ------- + centers : tensor of shape(k, n_features) + """ + random_state = check_random_state(random_state).to_numpy() + n_samples = X.shape[0] + + if x_squared_norms is None: + x_squared_norms = row_norms(X, squared=True) + + if init_size is not None and init_size < n_samples: # pragma: no cover + if init_size < n_clusters: + warnings.warn( + f"init_size={init_size} should be larger than k={n_clusters}. 
" + "Setting it to 3*k", + RuntimeWarning, + stacklevel=2, + ) + init_size = 3 * n_clusters + init_indices = random_state.randint(0, n_samples, init_size) + X = X[init_indices] + x_squared_norms = x_squared_norms[init_indices] + n_samples = X.shape[0] + elif n_samples < n_clusters: + raise ValueError( + f"n_samples={n_samples} should be larger than n_clusters={n_clusters}" + ) + + if isinstance(init, str) and init == "k-means++": + centers = _k_init( + X, n_clusters, random_state=random_state, x_squared_norms=x_squared_norms + ) + elif isinstance(init, str) and init == "k-means||": + centers = _scalable_k_init( + X, + n_clusters, + random_state=random_state, + x_squared_norms=x_squared_norms, + oversampling_factor=oversampling_factor, + init_iter=init_iter, + ) + elif isinstance(init, str) and init == "random": + seeds = random_state.choice(n_samples, size=n_clusters, replace=False) + centers = X[seeds].rechunk((n_clusters, X.shape[1])) + elif hasattr(init, "__array__"): + # ensure that the centers have the same dtype as X + # this is a requirement of fused types of cython + centers = mt.array(init, dtype=X.dtype) + elif callable(init): + centers = init(X, n_clusters, random_state=random_state) + centers = mt.asarray(centers, dtype=X.dtype) + else: # pragma: no cover + raise ValueError( + "the init parameter for the k-means should " + "be 'k-means++' or 'random' or a tensor, " + f"'{init}' (type '{type(init)}') was passed." + ) + + if centers.issparse(): + centers = centers.todense() + + _validate_center_shape(X, n_clusters, centers) + return centers + + +class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): + """K-Means clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + n_clusters : int, default=8 + The number of clusters to form as well as the number of + centroids to generate. + + init : {'k-means++', 'k-means||', 'random'} or tensor of shape \ + (n_clusters, n_features), default='k-means||' + Method for initialization, defaults to 'k-means||': + + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + + 'k-means||': scalable k-means++. + + 'random': choose k observations (rows) at random from data for + the initial centroids. + + If a tensor is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + n_init : int, default=1 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm for a + single run. + + tol : float, default=1e-4 + Relative tolerance with regards to inertia to declare convergence. + + verbose : int, default=0 + Verbosity mode. + + random_state : int, RandomState instance, default=None + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. + + copy_x : bool, default=True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True (default), then the original data is + not modified, ensuring X is C-contiguous. 
If False, the original data + is modified, and put back before the function returns, but small + numerical differences may be introduced by subtracting and then adding + the data mean, in this case it will also not ensure that data is + C-contiguous which may cause a significant slowdown. + + algorithm : {"auto", "full", "elkan"}, default="auto" + K-means algorithm to use. The classical EM-style algorithm is "full". + The "elkan" variation is more efficient by using the triangle + inequality, but currently doesn't support sparse data. "auto" chooses + "elkan" for dense data and "full" for sparse data. + + oversampling_factor: int, default=2 + Only work for kmeans||, used in each iteration in kmeans||. + + init_iter: int, default=5 + Only work for kmeans||, indicates how may iterations required. + + Attributes + ---------- + cluster_centers_ : tensor of shape (n_clusters, n_features) + Coordinates of cluster centers. If the algorithm stops before fully + converging (see ``tol`` and ``max_iter``), these will not be + consistent with ``labels_``. + + labels_ : tensor of shape (n_samples,) + Labels of each point + + inertia_ : float + Sum of squared distances of samples to their closest cluster center. + + n_iter_ : int + Number of iterations run. + + See Also + -------- + + MiniBatchKMeans + Alternative online implementation that does incremental updates + of the centers positions using mini-batches. + For large scale learning (say n_samples > 10k) MiniBatchKMeans is + probably much faster than the default batch implementation. + + Notes + ----- + The k-means problem is solved using either Lloyd's or Elkan's algorithm. + + The average complexity is given by O(k n T), were n is the number of + samples and T is the number of iteration. + + The worst case complexity is given by O(n^(k+2/p)) with + n = n_samples, p = n_features. (D. Arthur and S. Vassilvitskii, + 'How slow is the k-means method?' SoCG2006) + + In practice, the k-means algorithm is very fast (one of the fastest + clustering algorithms available), but it falls in local minima. That's why + it can be useful to restart it several times. + + If the algorithm stops before fully converging (because of ``tol`` or + ``max_iter``), ``labels_`` and ``cluster_centers_`` will not be consistent, + i.e. the ``cluster_centers_`` will not be the means of the points in each + cluster. Also, the estimator will reassign ``labels_`` after the last + iteration to make ``labels_`` consistent with ``predict`` on the training + set. + + Examples + -------- + + >>> from mars.learn.cluster import KMeans + >>> import mars.tensor as mt + >>> X = mt.array([[1, 2], [1, 4], [1, 0], + ... 
[10, 2], [10, 4], [10, 0]]) + >>> kmeans = KMeans(n_clusters=2, random_state=0, init='k-means++').fit(X) + >>> kmeans.labels_ + array([1, 1, 1, 0, 0, 0], dtype=int32) + >>> kmeans.predict([[0, 0], [12, 3]]) + array([1, 0], dtype=int32) + >>> kmeans.cluster_centers_ + array([[10., 2.], + [ 1., 2.]]) + """ + + def __init__( + self, + n_clusters=8, + init="k-means||", + n_init=1, + max_iter=300, + tol=1e-4, + verbose=0, + random_state=None, + copy_x=True, + algorithm="auto", + oversampling_factor=2, + init_iter=5, + ): + self.n_clusters = n_clusters + self.init = init + self.max_iter = max_iter + self.tol = tol + self.n_init = n_init + self.verbose = verbose + self.random_state = random_state + self.copy_x = copy_x + self.algorithm = algorithm + self.oversampling_factor = oversampling_factor + self.init_iter = init_iter + + def _check_params(self, X): + # n_init + if self.n_init <= 0: + raise ValueError(f"n_init should be > 0, got {self.n_init} instead.") + self._n_init = self.n_init + + # max_iter + if self.max_iter <= 0: + raise ValueError(f"max_iter should be > 0, got {self.max_iter} instead.") + + # n_clusters + if X.shape[0] < self.n_clusters: + raise ValueError( + f"n_samples={X.shape[0]} should be >= n_clusters={self.n_clusters}." + ) + + # tol + self._tol = _tolerance(X, self.tol) + + # algorithm + if self.algorithm not in ("auto", "full", "elkan"): + raise ValueError( + f"Algorithm must be 'auto', 'full' or 'elkan', " + f"got {self.algorithm} instead." + ) + + self._algorithm = self.algorithm + if self._algorithm == "auto": + # note(xuye.qin): + # Different from scikit-learn, + # for now, full seems more efficient when data is large, + # elkan needs to be tuned more + # old: algorithm = "full" if self.n_clusters == 1 else "elkan" + self._algorithm = "full" + if self._algorithm == "elkan" and self.n_clusters == 1: + warnings.warn( + "algorithm='elkan' doesn't make sense for a single " + "cluster. Using 'full' instead.", + RuntimeWarning, + ) + self._algorithm = "full" + + # init + if not ( + hasattr(self.init, "__array__") + or callable(self.init) + or ( + isinstance(self.init, str) + and self.init in ["k-means++", "k-means||", "random"] + ) + ): + raise ValueError( + f"init should be either 'k-means++', 'k-mean||', 'random', " + f"a tensor, a ndarray or a " + f"callable, got '{self.init}' instead." + ) + + if hasattr(self.init, "__array__") and self._n_init != 1: + warnings.warn( + f"Explicit initial center position passed: performing only" + f" one init in {self.__class__.__name__} instead of " + f"n_init={self._n_init}.", + RuntimeWarning, + stacklevel=2, + ) + self._n_init = 1 + + def _check_test_data(self, X): + X = check_array( + X, + accept_sparse=True, + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + ) + n_samples, n_features = X.shape + expected_n_features = self.cluster_centers_.shape[1] + if not n_features == expected_n_features: # pragma: no cover + raise ValueError( + f"Incorrect number of features. Got {n_features} features, " + f"expected {expected_n_features}" + ) + + return X + + def fit(self, X, y=None, sample_weight=None, session=None, run_kwargs=None): + """Compute k-means clustering. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory + copy if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. 
+ + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + self + Fitted estimator. + """ + expect_chunk_size_on_columns = mt.tensor(X).shape[1] + if not np.isnan(expect_chunk_size_on_columns): + X = mt.tensor(X, chunk_size={1: expect_chunk_size_on_columns}) + + X = self._validate_data( + X, + accept_sparse=True, + dtype=[np.float64, np.float32], + order="C", + copy=self.copy_x, + accept_large_sparse=False, + ) + # verify that the number of samples given is larger than k + if np.isnan(_num_samples(X)): # pragma: no cover + X.execute(session=session, **(run_kwargs or dict())) + + self._check_params(X) + random_state = check_random_state(self.random_state).to_numpy() + + tol = _tolerance(X, self.tol) + + # Validate init array + init = self.init + if hasattr(init, "__array__"): + init = check_array(init, dtype=X.dtype.type, copy=True, order="C") + _validate_center_shape(X, self.n_clusters, init) + + # subtract of mean of x for more accurate distance computations + X_mean = None + if not X.issparse(): + X_mean = X.mean(axis=0) + # The copy was already done above + X -= X_mean + + if hasattr(init, "__array__"): + init -= X_mean + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + best_labels, best_inertia, best_centers = None, None, None + + if self._algorithm == "full": + kmeans_single = _kmeans_single_lloyd + else: + kmeans_single = _kmeans_single_elkan + + for i in range(self._n_init): # pylint: disable=unused-variable + # Initialize centers + centers_init = _init_centroids( + X, + self.n_clusters, + init, + random_state=random_state, + x_squared_norms=x_squared_norms, + oversampling_factor=self.oversampling_factor, + init_iter=self.init_iter, + ) + + # run a k-means once + labels, inertia, centers, n_iter_ = kmeans_single( + X, + sample_weight, + centers_init, + self.n_clusters, + max_iter=self.max_iter, + verbose=self.verbose, + tol=tol, + x_squared_norms=x_squared_norms, + X_mean=X_mean, + session=session, + run_kwargs=run_kwargs, + ) + inertia = inertia.fetch(session=session) + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_labels = labels + best_centers = centers + best_inertia = inertia + best_n_iter = n_iter_ + + if not X.issparse(): + if not self.copy_x: # pragma: no cover + X += X_mean + best_centers += X_mean + best_centers.execute(session=session, **(run_kwargs or dict())) + + distinct_clusters = len(set(best_labels.fetch(session=session))) + if distinct_clusters < self.n_clusters: # pragma: no cover + warnings.warn( + f"Number of distinct clusters ({distinct_clusters}) found smaller than " + f"n_clusters ({self.n_clusters}). Possibly due to duplicate points in X.", + ConvergenceWarning, + stacklevel=2, + ) + + self.cluster_centers_ = best_centers + self.labels_ = best_labels + self.inertia_ = best_inertia + self.n_iter_ = best_n_iter + return self + + def fit_predict(self, X, y=None, sample_weight=None, session=None, run_kwargs=None): + """Compute cluster centers and predict cluster index for each sample. + + Convenience method; equivalent to calling fit(X) followed by + predict(X). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + y : Ignored + Not used, present here for API consistency by convention. 
+ + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + return self.fit( + X, sample_weight=sample_weight, session=session, run_kwargs=run_kwargs + ).labels_ + + def fit_transform( + self, X, y=None, sample_weight=None, session=None, run_kwargs=None + ): + """Compute clustering and transform X to cluster-distance space. + + Equivalent to fit(X).transform(X), but more efficiently implemented. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + X_new : array of shape (n_samples, n_clusters) + X transformed in the new space. + """ + # Currently, this just skips a copy of the data if it is not in + # np.array or CSR format already. + # XXX This skips _check_test_data, which may change the dtype; + # we should refactor the input validation. + self.fit(X, sample_weight=sample_weight, session=session, run_kwargs=run_kwargs) + return self._transform(X, session=session, run_kwargs=run_kwargs) + + def transform(self, X, session=None, run_kwargs=None): + """Transform X to a cluster-distance space. + + In the new space, each dimension is the distance to the cluster + centers. Note that even if X is sparse, the array returned by + `transform` will typically be dense. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + Returns + ------- + X_new : tensor of shape (n_samples, n_clusters) + X transformed in the new space. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + return self._transform(X, session=session, run_kwargs=run_kwargs) + + def _transform(self, X, session=None, run_kwargs=None): + """guts of transform method; no input validation""" + return euclidean_distances(X, self.cluster_centers_).execute( + session=session, **(run_kwargs or dict()) + ) + + def predict(self, X, sample_weight=None, session=None, run_kwargs=None): + """Predict the closest cluster each sample in X belongs to. + + In the vector quantization literature, `cluster_centers_` is called + the code book and each value returned by `predict` is the index of + the closest code in the code book. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to predict. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + labels : tensor of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + x_squared_norms = row_norms(X, squared=True) + + result = _labels_inertia( + X, + sample_weight, + x_squared_norms, + self.cluster_centers_, + session=session, + run_kwargs=run_kwargs, + )[0] + result.execute(session=session, *(run_kwargs or dict())) + return result + + def score(self, X, y=None, sample_weight=None, session=None, run_kwargs=None): + """Opposite of the value of X on the K-means objective. 
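Put differently, `score` returns the negative inertia of `X` under the fitted centers (larger, i.e. closer to zero, is better), as the body below computes via `-_labels_inertia(...)[1]`. A small sketch reusing the estimator from the class docstring example, assuming a default session:

```python
import mars.tensor as mt
from mars.learn.cluster import KMeans

X = mt.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])
km = KMeans(n_clusters=2, n_init=1, random_state=0, init="k-means++").fit(X)
# On the training data the score matches -inertia_ up to floating-point noise.
print(km.score(X).fetch(), -km.inertia_)
```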
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + score : float + Opposite of the value of X on the K-means objective. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + x_squared_norms = row_norms(X, squared=True) + + result = -_labels_inertia( + X, + sample_weight, + x_squared_norms, + self.cluster_centers_, + session=session, + run_kwargs=run_kwargs, + )[1] + result.execute(session=session, **(run_kwargs or dict())) + return result diff --git a/python/xorbits/_mars/learn/cluster/tests/__init__.py b/python/xorbits/_mars/learn/cluster/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/cluster/tests/test_k_means.py b/python/xorbits/_mars/learn/cluster/tests/test_k_means.py new file mode 100644 index 000000000..d75b9bb14 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/tests/test_k_means.py @@ -0,0 +1,513 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +from io import StringIO + +import numpy as np +import pytest +import scipy.sparse as sp + +try: + from sklearn.datasets import make_blobs + from sklearn.metrics.cluster import v_measure_score + from sklearn.utils._testing import assert_raise_message +except ImportError: + pass + +from .... import tensor as mt +from ....config import options +from ....core import ChunkGraphBuilder, TileableGraph, TileableGraphBuilder +from .. 
import KMeans, k_means +from .._kmeans import _init_centroids + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +@pytest.mark.parametrize("representation", ["dense", "sparse"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_k_means_results(setup, representation, dtype, algo): + array_constr = {"dense": np.array, "sparse": sp.csr_matrix}[representation] + + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) + sample_weight = [3, 1, 1, 3] # will be rescaled to [1.5, 0.5, 0.5, 1.5] + init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.1875 + expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) + expected_n_iter = 2 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X, sample_weight=sample_weight) + + np.testing.assert_array_equal(kmeans.labels_, expected_labels) + np.testing.assert_almost_equal(kmeans.inertia_, expected_inertia) + np.testing.assert_array_almost_equal(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +@pytest.mark.parametrize("representation", ["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_relocated_clusters(setup, representation, algo): + # check that empty clusters are relocated as expected + + # second center too far from others points will be empty at first iter + init_centers = np.array([[0.5, 0.5], [3, 3]]) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.25 + expected_centers = [[0.25, 0], [0.75, 1]] + expected_n_iter = 3 + + array_constr = {"dense": np.array, "sparse": sp.csr_matrix}[representation] + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X) + + np.testing.assert_array_equal(kmeans.labels_, expected_labels) + np.testing.assert_almost_equal(kmeans.inertia_, expected_inertia) + np.testing.assert_array_almost_equal(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +@pytest.mark.parametrize("distribution", ["normal", "blobs"]) +@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) +def test_elkan_results(setup, distribution, tol): + # check that results are identical between lloyd and elkan algorithms + + rnd = np.random.RandomState(0) + if distribution == "normal": + X = rnd.normal(size=(5000, 10)) + else: + X, _ = make_blobs(random_state=rnd) + + km_full = KMeans( + algorithm="full", + n_clusters=5, + random_state=0, + n_init=1, + tol=tol, + init="k-means++", + ) + km_elkan = KMeans( + algorithm="elkan", + n_clusters=5, + random_state=0, + n_init=1, + tol=tol, + init="k-means++", + ) + + km_full.fit(X) + km_elkan.fit(X) + np.testing.assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) + np.testing.assert_array_equal(km_elkan.labels_, km_full.labels_) + + assert km_elkan.n_iter_ == km_full.n_iter_ + assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_convergence(setup): + for algorithm in ["full", "elkan"]: + # Check that KMeans stops when convergence is reached when tol=0. 
(#16075) + rnd = np.random.RandomState(0) + X = rnd.normal(size=(5000, 10)) + + km = KMeans( + algorithm=algorithm, + n_clusters=5, + random_state=0, + n_init=1, + tol=0, + max_iter=300, + init="k-means++", + ).fit(X) + + assert km.n_iter_ < 300 + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_elkan_results_sparse(setup): + for distribution in ["normal", "blobs"]: + # check that results are identical between lloyd and elkan algorithms + # with sparse input + rnd = np.random.RandomState(0) + if distribution == "normal": + X = sp.random(100, 100, density=0.1, format="csr", random_state=rnd) + X.data = rnd.randn(len(X.data)) + else: + X, _ = make_blobs(n_samples=100, n_features=100, random_state=rnd) + X = sp.csr_matrix(X) + + km_full = KMeans( + algorithm="full", n_clusters=5, random_state=0, n_init=1, init="k-means++" + ) + km_elkan = KMeans( + algorithm="elkan", n_clusters=5, random_state=0, n_init=1, init="k-means++" + ) + + km_full.fit(X) + km_elkan.fit(X) + np.testing.assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) + np.testing.assert_allclose(km_elkan.labels_, km_full.labels_) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_new_centers(setup): + # Explore the part of the code where a new center is reassigned + X = np.array( + [ + [0, 0, 1, 1], + [0, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 1, 0, 0], + ] + ) + labels = [0, 1, 2, 1, 1, 2] + bad_centers = np.array([[+0, 1, 0, 0], [0.2, 0, 0.2, 0.2], [+0, 0, 0, 0]]) + + km = KMeans( + n_clusters=3, + init=bad_centers, + n_init=1, + max_iter=10, + random_state=1, + algorithm="elkan", + ) + for this_X in (X, sp.coo_matrix(X)): + km.fit(this_X) + this_labels = km.labels_.fetch() + # Reorder the labels so that the first instance is in cluster 0, + # the second in cluster 1, ... 
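+        # np.unique(..., return_index=True)[1] gives the position of the first
+        # occurrence of each label value; indexing it with the labels replaces every
+        # label by the position where its cluster first appears, so the comparison
+        # below does not depend on the arbitrary cluster numbering chosen by k-means.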
+ this_labels = np.unique(this_labels, return_index=True)[1][this_labels] + np.testing.assert_array_equal(this_labels, labels) + + +def _check_fitted_model(km, n_clusters, n_features, true_labels): + # check that the number of clusters centers and distinct labels match + # the expectation + centers = km.cluster_centers_ + assert centers.shape == (n_clusters, n_features) + + labels = km.labels_.fetch() + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert v_measure_score(true_labels, labels) == 1.0 + assert km.inertia_ > 0.0 + + # check error on dataset being too small + assert_raise_message( + ValueError, + "n_samples=1 should be >= n_clusters=%d" % km.n_clusters, + km.fit, + [[0.0, 1.0]], + ) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_init(setup): + # non centered, sparse centers to check the + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + n_samples = 100 + n_clusters, n_features = centers.shape + X, true_labels = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + ) + X_csr = sp.csr_matrix(X) + for data in [X, X_csr]: + for init in ["random", "k-means++", "k-means||", centers.copy()]: + data = mt.tensor(data, chunk_size=50) + km = KMeans( + init=init, + n_clusters=n_clusters, + random_state=42, + n_init=1, + algorithm="elkan", + ) + km.fit(data) + _check_fitted_model(km, n_clusters, n_features, true_labels) + + X = mt.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) + kmeans = KMeans(n_clusters=2, random_state=0, n_init=1, init="k-means||").fit(X) + assert sorted(kmeans.cluster_centers_.fetch().tolist()) == sorted( + [[10.0, 2.0], [1.0, 2.0]] + ) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_n_init(setup): + rnd = np.random.RandomState(0) + X = rnd.normal(size=(40, 2)) + + # two regression tests on bad n_init argument + # previous bug: n_init <= 0 threw non-informative TypeError (#3858) + with pytest.raises(ValueError, match="n_init"): + KMeans(n_init=0, init="k-means++").fit(X) + with pytest.raises(ValueError, match="n_init"): + KMeans(n_init=-1, init="k-means++").fit(X) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_explicit_init_shape(setup): + # test for sensible errors when giving explicit init + # with wrong number of features or clusters + rnd = np.random.RandomState(0) + X = rnd.normal(size=(40, 3)) + + # mismatch of number of features + km = KMeans(n_init=1, init=X[:, :2], n_clusters=len(X), algorithm="elkan") + msg = "does not match the number of features of the data" + with pytest.raises(ValueError, match=msg): + km.fit(X) + # for callable init + km = KMeans( + n_init=1, + init=lambda X_, k, random_state: X_[:, :2], + n_clusters=len(X), + algorithm="elkan", + ) + with pytest.raises(ValueError, match=msg): + km.fit(X) + # mismatch of number of clusters + msg = "does not match the number of clusters" + km = KMeans(n_init=1, init=X[:2, :], n_clusters=3, algorithm="elkan") + with pytest.raises(ValueError, match=msg): + km.fit(X) + # for callable init + km = KMeans( + n_init=1, + init=lambda X_, k, random_state: X_[:2, :], + n_clusters=3, + algorithm="elkan", + ) + with pytest.raises(ValueError, match=msg): + km.fit(X) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def 
test_k_means_fortran_aligned_data(setup): + # Check the KMeans will work well, even if X is a fortran-aligned data. + X = np.asfortranarray([[0, 0], [0, 1], [0, 1]]) + centers = np.array([[0, 0], [0, 1]]) + labels = np.array([0, 1, 1]) + km = KMeans( + n_init=1, init=centers, random_state=42, n_clusters=2, algorithm="elkan" + ) + km.fit(X) + np.testing.assert_array_almost_equal(km.cluster_centers_, centers) + np.testing.assert_array_equal(km.labels_, labels) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +@pytest.mark.parametrize("algo", ["full", "elkan"]) +@pytest.mark.parametrize( + "seed, max_iter, tol", + [ + (0, 2, 1e-7), # strict non-convergence + (1, 2, 1e-1), # loose non-convergence + (3, 300, 1e-7), # strict convergence + (4, 300, 1e-1), # loose convergence + ], +) +def test_k_means_fit_predict(setup, algo, seed, max_iter, tol): + # check that fit.predict gives same result as fit_predict + rng = np.random.RandomState(seed) + + X = make_blobs(n_samples=1000, n_features=10, centers=10, random_state=rng)[0] + + kmeans = KMeans( + algorithm=algo, + n_clusters=10, + random_state=seed, + tol=tol, + max_iter=max_iter, + init="k-means++", + ) + + labels_1 = kmeans.fit(X).predict(X) + labels_2 = kmeans.fit_predict(X) + + # Due to randomness in the order in which chunks of data are processed when + # using more than one thread, the absolute values of the labels can be + # different between the 2 strategies but they should correspond to the same + # clustering. + assert pytest.approx(v_measure_score(labels_1, labels_2)) == 1 + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_transform(setup): + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + n_samples = 100 + n_clusters, n_features = centers.shape + X = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + )[0] + + km = KMeans(n_clusters=n_clusters, init="k-means++", algorithm="elkan") + km.fit(X) + X_new = km.transform(km.cluster_centers_).fetch() + + for c in range(n_clusters): + assert X_new[c, c] == 0 + for c2 in range(n_clusters): + if c != c2: + assert X_new[c, c2] > 0 + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_fit_transform(setup): + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + n_samples = 100 + X = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + )[0] + X1 = ( + KMeans(n_clusters=3, random_state=51, init="k-means++", algorithm="elkan") + .fit(X) + .transform(X) + ) + X2 = KMeans( + n_clusters=3, random_state=51, init="k-means++", algorithm="elkan" + ).fit_transform(X) + np.testing.assert_array_almost_equal(X1, X2) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_score(setup): + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + n_samples = 100 + n_clusters, n_features = centers.shape + X = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + )[0] + + for algo in ["full", "elkan"]: + # Check that fitting k-means with multiple inits gives better score + km1 = KMeans( + n_clusters=n_clusters, + max_iter=1, + random_state=42, + n_init=1, + algorithm=algo, + init="k-means++", + ) + s1 = km1.fit(X).score(X).fetch() + km2 = KMeans( + n_clusters=n_clusters, + max_iter=10, + 
random_state=42, + n_init=1, + algorithm=algo, + init="k-means++", + ) + s2 = km2.fit(X).score(X).fetch() + assert s2 > s1 + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_function(setup): + # test calling the k_means function directly + + # non centered, sparse centers to check the + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + n_samples = 100 + n_clusters, n_features = centers.shape + X, true_labels = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + ) + + # catch output + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + cluster_centers, labels, inertia = k_means( + X, n_clusters=n_clusters, sample_weight=None, verbose=True, init="k-means++" + ) + finally: + sys.stdout = old_stdout + centers = cluster_centers + assert centers.shape == (n_clusters, n_features) + + labels = labels.fetch() + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert v_measure_score(true_labels, labels) == 1.0 + assert inertia > 0.0 + + # check warning when centers are passed + with pytest.warns(RuntimeWarning): + k_means( + X, + n_clusters=n_clusters, + sample_weight=None, + init=centers, + ) + + # to many clusters desired + with pytest.raises(ValueError): + k_means(X, n_clusters=X.shape[0] + 1, sample_weight=None, init="k-means++") + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_init_large_n_clusters(): + chunk_bytes_limit = options.chunk_store_limit * 2 + n_cluster = 2000 + x = mt.random.rand(1000_000, 64, chunk_size=250_000) + + centers = _init_centroids(x, n_cluster, init="k-means||") + t_graph = next(TileableGraphBuilder(TileableGraph([centers])).build()) + graph = next(ChunkGraphBuilder(t_graph).build()) + for c in graph: + nbytes = c.nbytes + if not np.isnan(nbytes): + assert nbytes <= chunk_bytes_limit diff --git a/python/xorbits/_mars/learn/contrib/__init__.py b/python/xorbits/_mars/learn/contrib/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/joblib/__init__.py b/python/xorbits/_mars/learn/contrib/joblib/__init__.py new file mode 100644 index 000000000..c5df56dec --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/joblib/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .backend import register_mars_backend diff --git a/python/xorbits/_mars/learn/contrib/joblib/backend.py b/python/xorbits/_mars/learn/contrib/joblib/backend.py new file mode 100644 index 000000000..ebdbaccb3 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/joblib/backend.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import concurrent.futures + +from .... import remote +from ....deploy.oscar.session import get_default_session, new_session + +try: + from joblib.parallel import ( + AutoBatchingMixin, + ParallelBackendBase, + register_parallel_backend, + ) +except ImportError: + ParallelBackendBase = object + AutoBatchingMixin = object + register_parallel_backend = None + + +class MarsDistributedBackend(AutoBatchingMixin, ParallelBackendBase): + MIN_IDEAL_BATCH_DURATION = 0.2 + MAX_IDEAL_BATCH_DURATION = 1.0 + supports_timeout = True + + def __init__(self, service=None, session=None, backend=None, n_parallel=None): + super().__init__() + + if session is None: + if service is not None: + self.session = new_session(service, backend=backend, default=False) + else: + self.session = get_default_session() + else: + self.session = session + + self.n_parallel = n_parallel or 1 + self.executor = None + + def get_nested_backend(self): + return MarsDistributedBackend(session=self.session), -1 + + def configure(self, n_jobs=1, parallel=None, **backend_args): + self.parallel = parallel + n_parallel = self.effective_n_jobs(n_jobs) + self.executor = concurrent.futures.ThreadPoolExecutor(n_parallel) + return n_parallel + + def effective_n_jobs(self, n_jobs): + eff_n_jobs = super(MarsDistributedBackend, self).effective_n_jobs(n_jobs) + if n_jobs == -1: + eff_n_jobs = self.n_parallel + return eff_n_jobs + + def apply_async(self, func, callback=None): + # todo allow execute f() in remote end to reduce data copy latency + def f(): + spawned = [] + for func_obj, args, kwargs in func.items: + spawned.append(remote.spawn(func_obj, args=args, kwargs=kwargs)) + + ret = ( + remote.ExecutableTuple(spawned) + .execute(session=self.session) + .fetch(self.session) + ) + callback(ret) + return ret + + future = self.executor.submit(f) + future.get = future.result + return future + + +def register_mars_backend(): + register_parallel_backend("mars", MarsDistributedBackend) diff --git a/python/xorbits/_mars/learn/contrib/joblib/tests/__init__.py b/python/xorbits/_mars/learn/contrib/joblib/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/joblib/tests/__init__.py @@ -0,0 +1,13 @@ +# 
Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/joblib/tests/test_backend.py b/python/xorbits/_mars/learn/contrib/joblib/tests/test_backend.py new file mode 100644 index 000000000..d3c763919 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/joblib/tests/test_backend.py @@ -0,0 +1,38 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import joblib +import numpy as np +from sklearn.datasets import load_digits +from sklearn.model_selection import RandomizedSearchCV +from sklearn.svm import SVC + +from .. import register_mars_backend + +register_mars_backend() + + +def test_sk_learn_svc_train(setup): + digits = load_digits() + param_space = { + "C": np.logspace(-6, 6, 30), + "gamma": np.logspace(-8, 8, 30), + "tol": np.logspace(-4, -1, 30), + "class_weight": [None, "balanced"], + } + model = SVC(kernel="rbf") + search = RandomizedSearchCV(model, param_space, cv=5, n_iter=5, verbose=10) + + with joblib.parallel_backend("mars", n_parallel=16): + search.fit(digits.data, digits.target) diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/__init__.py b/python/xorbits/_mars/learn/contrib/lightgbm/__init__.py new file mode 100644 index 000000000..55a078a6f --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
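+# LGBMClassifier, LGBMRegressor and LGBMRanker are exposed via config_mod_getattr
+# below, so they are only resolved on first attribute access; when lightgbm is not
+# installed, the concrete classes degrade to the import-error stubs defined in the
+# individual modules.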
+ +from ._predict import predict, predict_proba + + +def register_op(): + from ._align import align_data_set + from ._train import train + + del train, align_data_set + + +from ..utils import config_mod_getattr as _config_mod_getattr + +_config_mod_getattr( + { + "LGBMClassifier": ".classifier.LGBMClassifier", + "LGBMRegressor": ".regressor.LGBMRegressor", + "LGBMRanker": ".ranker.LGBMRanker", + }, + globals(), +) diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/_align.py b/python/xorbits/_mars/learn/contrib/lightgbm/_align.py new file mode 100644 index 000000000..1784a3bbf --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/_align.py @@ -0,0 +1,130 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import opcodes +from ....core import ExecutableTuple, get_output_types, recursive_tile +from ....serialization.serializables import AnyField +from ....utils import has_unknown_shape +from ...operands import LearnOperand, LearnOperandMixin + + +class LGBMAlign(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.LGBM_ALIGN + + _data = AnyField("data") + _label = AnyField("label") + _sample_weight = AnyField("sample_weight") + _init_score = AnyField("init_score") + + def __init__( + self, + data=None, + label=None, + sample_weight=None, + init_score=None, + output_types=None, + **kw + ): + super().__init__( + _data=data, + _label=label, + _sample_weight=sample_weight, + _init_score=init_score, + _output_types=output_types, + **kw + ) + + @property + def data(self): + return self._data + + @property + def label(self): + return self._label + + @property + def sample_weight(self): + return self._sample_weight + + @property + def init_score(self): + return self._init_score + + @property + def output_limit(self): + return 2 if self._sample_weight is None else 3 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + it = iter(inputs) + self._data = next(it) + for attr in ("_label", "_sample_weight", "_init_score"): + if getattr(self, attr) is not None: + setattr(self, attr, next(it)) + + def __call__(self): + kws, inputs = [], [] + for arg in [self.data, self.label, self.sample_weight, self.init_score]: + if hasattr(arg, "params"): + kws.append(arg.params) + inputs.append(arg) + tileables = self.new_tileables(inputs, kws=kws) + return ExecutableTuple(tileables) + + @classmethod + def tile(cls, op: "LGBMAlign"): + inputs = [ + d + for d in [op.data, op.label, op.sample_weight, op.init_score] + if d is not None + ] + data = op.data + + # check inputs to make sure no unknown chunk shape exists + if has_unknown_shape(*inputs): + yield + + if len(data.nsplits[1]) != 1: + data = yield from recursive_tile(data.rechunk({1: data.shape[1]})) + outputs = [data] + for inp in inputs[1:]: + if inp is not None: + outputs.append( + (yield from recursive_tile(inp.rechunk((data.nsplits[0],)))) + ) + + kws = [] + for o in outputs: + kw = o.params.copy() + kw.update(dict(chunks=o.chunks, nsplits=o.nsplits)) + kws.append(kw) + + new_op = 
op.copy().reset_key() + tileables = new_op.new_tileables(inputs, kws=kws) + + return tileables + + +def align_data_set(dataset): + out_types = get_output_types( + dataset.data, dataset.label, dataset.sample_weight, dataset.init_score + ) + op = LGBMAlign( + data=dataset.data, + label=dataset.label, + sample_weight=dataset.sample_weight, + init_score=dataset.init_score, + output_types=out_types, + ) + return op() diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/_predict.py b/python/xorbits/_mars/learn/contrib/lightgbm/_predict.py new file mode 100644 index 000000000..dd18b5766 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/_predict.py @@ -0,0 +1,240 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle + +import numpy as np +import pandas as pd + +from .... import opcodes +from ....core import recursive_tile +from ....dataframe.utils import parse_index +from ....serialization.serializables import BoolField, BytesField, DictField, KeyField +from ....tensor.core import TENSOR_TYPE, TensorOrder +from ...operands import LearnOperand, LearnOperandMixin, OutputType + + +class LGBMPredict(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.LGBM_PREDICT + + _data = KeyField("data") + _model = BytesField("model", on_serialize=pickle.dumps, on_deserialize=pickle.loads) + _proba = BoolField("proba") + _kwds = DictField("kwds") + + def __init__( + self, data=None, model=None, proba=None, kwds=None, output_types=None, **kw + ): + super().__init__( + _data=data, + _model=model, + _proba=proba, + _kwds=kwds, + _output_types=output_types, + **kw, + ) + + @property + def data(self): + return self._data + + @property + def model(self): + return self._model + + @property + def proba(self) -> bool: + return self._proba + + @property + def kwds(self) -> dict: + return self._kwds + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + it = iter(inputs) + self._data = next(it) + + def __call__(self): + num_class = int(getattr(self.model, "n_classes_", 2)) + if self.proba: + shape = (self.data.shape[0], num_class) + else: + shape = (self.data.shape[0],) + + if self._proba: + dtype = np.dtype(np.float_) + elif hasattr(self.model, "classes_"): + dtype = np.array(self.model.classes_).dtype + else: + dtype = getattr(self.model, "out_dtype_", np.dtype("float")) + + if self.output_types[0] == OutputType.tensor: + # tensor + return self.new_tileable( + [self.data], shape=shape, dtype=dtype, order=TensorOrder.C_ORDER + ) + elif self.output_types[0] == OutputType.dataframe: + # dataframe + dtypes = pd.Series([dtype] * num_class) + columns_value = parse_index(pd.Index(self.model.classes_), store_data=True) + return self.new_tileable( + [self.data], + shape=shape, + dtypes=dtypes, + columns_value=columns_value, + index_value=self.data.index_value, + ) + else: + return self.new_tileable( + [self.data], + shape=shape, + index_value=self.data.index_value, + name="predictions", + dtype=dtype, + ) + + @classmethod + def tile(cls, op: 
"LGBMPredict"): + out = op.outputs[0] + out_chunks = [] + data = op.data + if data.chunk_shape[1] > 1: + data = yield from recursive_tile(data.rechunk({1: op.data.shape[1]})) + + for in_chunk in data.chunks: + chunk_op = op.copy().reset_key() + chunk_index = (in_chunk.index[0],) + + if len(out.shape) > 1: + chunk_shape = (in_chunk.shape[0], out.shape[1]) + chunk_index += (0,) + else: + chunk_shape = (in_chunk.shape[0],) + + if op.output_types[0] == OutputType.tensor: + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtype=out.dtype, + order=out.order, + index=chunk_index, + ) + elif op.output_types[0] == OutputType.dataframe: + # dataframe chunk + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtypes=out.dtypes, + columns_value=out.columns_value, + index_value=in_chunk.index_value, + index=chunk_index, + ) + else: + # series chunk + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtype=out.dtype, + index_value=in_chunk.index_value, + name=out.name, + index=chunk_index, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + nsplits = (data.nsplits[0],) + if out.ndim > 1: + nsplits += ((out.shape[1],),) + params["nsplits"] = nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "LGBMPredict"): + in_data = ctx[op.data.key] + in_data = in_data.spmatrix if hasattr(in_data, "spmatrix") else in_data + out = op.outputs[0] + + if op.data.shape[0] == 0: + result = np.array([]) + elif op.proba: + result = op.model.predict_proba(in_data, **op.kwds) + else: + result = op.model.predict(in_data, **op.kwds) + + if op.output_types[0] == OutputType.dataframe: + result = pd.DataFrame( + result, index=in_data.index, columns=out.columns_value.to_pandas() + ) + elif op.output_types[0] == OutputType.series: + result = pd.Series(result, index=in_data.index, name="predictions") + + ctx[out.key] = result + + +def predict_base(model, data, session=None, run_kwargs=None, run=True, **kwargs): + from lightgbm import LGBMModel + + if not isinstance(model, LGBMModel): + raise TypeError( + f"model has to be a lightgbm.LGBMModel, got {type(model)} instead" + ) + model = model.to_local() if hasattr(model, "to_local") else model + + proba = kwargs.pop("proba", hasattr(model, "classes_")) + + if isinstance(data, TENSOR_TYPE): + output_types = [OutputType.tensor] + elif proba: + output_types = [OutputType.dataframe] + else: + output_types = [OutputType.series] + + op = LGBMPredict( + data=data, + model=model, + gpu=data.op.gpu, + output_types=output_types, + proba=proba, + kwds=kwargs, + ) + result = op() + if run: + result.execute(session=session, **(run_kwargs or dict())) + return result + + +def predict(model, data, session=None, run_kwargs=None, run=True, **kw): + if hasattr(model, "classes_"): + return predict_base( + model, + data, + session=session, + run_kwargs=run_kwargs, + proba=False, + run=run, + **kw, + ) + else: + return predict_base( + model, data, session=session, run_kwargs=run_kwargs, run=run, **kw + ) + + +def predict_proba(model, data, session=None, run_kwargs=None, run=True, **kw): + return predict_base( + model, data, session=session, run_kwargs=run_kwargs, run=run, proba=True, **kw + ) diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/_train.py b/python/xorbits/_mars/learn/contrib/lightgbm/_train.py new file mode 100644 index 000000000..b0b7420ea --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/_train.py @@ 
-0,0 +1,458 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +import operator +import pickle +from collections import defaultdict +from functools import reduce + +import numpy as np + +from .... import opcodes +from ....core import ExecutableTuple, OutputType, recursive_tile +from ....core.context import get_context +from ....core.operand import MergeDictOperand +from ....serialization.serializables import ( + DictField, + FieldTypes, + Int32Field, + KeyField, + ListField, + StringField, +) +from ...utils import collect_ports, concat_chunks +from ._align import align_data_set +from .core import LGBMModelType, get_model_cls_from_type + +logger = logging.getLogger(__name__) + + +class LGBMTrain(MergeDictOperand): + _op_type_ = opcodes.LGBM_TRAIN + + _model_type = Int32Field( + "model_type", on_serialize=lambda x: x.value, on_deserialize=LGBMModelType + ) + _params = DictField("params", key_type=FieldTypes.string) + _data = KeyField("data") + _label = KeyField("label") + _sample_weight = KeyField("sample_weight") + _init_score = KeyField("init_score") + _kwds = DictField("kwds", key_type=FieldTypes.string) + + _eval_datas = ListField("eval_datas", FieldTypes.key) + _eval_labels = ListField("eval_labels", FieldTypes.key) + _eval_sample_weights = ListField("eval_sample_weights", FieldTypes.key) + _eval_init_scores = ListField("eval_init_scores", FieldTypes.key) + + _workers = ListField("workers", FieldTypes.string) + _worker_id = Int32Field("worker_id") + _worker_ports = KeyField("worker_ports") + + _tree_learner = StringField("tree_learner") + _timeout = Int32Field("timeout") + + def __init__( + self, + model_type=None, + data=None, + label=None, + sample_weight=None, + init_score=None, + eval_datas=None, + eval_labels=None, + eval_sample_weights=None, + eval_init_scores=None, + params=None, + kwds=None, + workers=None, + worker_id=None, + worker_ports=None, + tree_learner=None, + timeout=None, + **kw, + ): + super().__init__( + _model_type=model_type, + _params=params, + _data=data, + _label=label, + _sample_weight=sample_weight, + _init_score=init_score, + _eval_datas=eval_datas, + _eval_labels=eval_labels, + _eval_sample_weights=eval_sample_weights, + _eval_init_scores=eval_init_scores, + _kwds=kwds, + _workers=workers, + _worker_id=worker_id, + _worker_ports=worker_ports, + _tree_learner=tree_learner, + _timeout=timeout, + **kw, + ) + if self.output_types is None: + self.output_types = [OutputType.object] + + @property + def model_type(self) -> LGBMModelType: + return self._model_type + + @property + def data(self): + return self._data + + @property + def label(self): + return self._label + + @property + def sample_weight(self): + return self._sample_weight + + @property + def init_score(self): + return self._init_score + + @property + def eval_datas(self) -> list: + return self._eval_datas or [] + + @property + def eval_labels(self) -> list: + return self._eval_labels or [] + + @property + def eval_sample_weights(self) 
-> list: + return self._eval_sample_weights or [] + + @property + def eval_init_scores(self) -> list: + return self._eval_init_scores or [] + + @property + def params(self) -> dict: + return self._params or dict() + + @property + def kwds(self) -> dict: + return self._kwds or dict() + + @property + def workers(self) -> list: + return self._workers + + @property + def worker_id(self) -> int: + return self._worker_id + + @property + def worker_ports(self): + return self._worker_ports + + @property + def timeout(self) -> int: + return self._timeout + + @property + def tree_learner(self) -> str: + return self._tree_learner + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + it = iter(inputs) + for attr in ["_data", "_label", "_sample_weight", "_init_score"]: + if getattr(self, attr) is not None: + setattr(self, attr, next(it)) + for attr in [ + "_eval_datas", + "_eval_labels", + "_eval_sample_weights", + "_eval_init_scores", + ]: + new_list = [] + for c in getattr(self, attr, None) or []: + if c is not None: + new_list.append(next(it)) + setattr(self, attr, new_list or None) + + if self._worker_ports is not None: + self._worker_ports = next(it) + + def __call__(self): + inputs = [] + for attr in ["_data", "_label", "_sample_weight", "_init_score"]: + if getattr(self, attr) is not None: + inputs.append(getattr(self, attr)) + for attr in [ + "_eval_datas", + "_eval_labels", + "_eval_sample_weights", + "_eval_init_scores", + ]: + for c in getattr(self, attr, None) or []: + if c is not None: + inputs.append(c) + return self.new_tileable(inputs) + + @staticmethod + def _get_data_chunks_workers(ctx, data): + # data_chunk.inputs is concat, and concat's input is the co-allocated chunks + metas = ctx.get_chunks_meta([c.key for c in data.chunks], fields=["bands"]) + return [m["bands"][0][0] for m in metas] + + @staticmethod + def _concat_chunks_by_worker(chunks, chunk_workers): + worker_to_chunks = defaultdict(list) + for chunk, worker in zip(chunks, chunk_workers): + worker_to_chunks[worker].append(chunk) + worker_to_concat = dict() + for worker, chunks in worker_to_chunks.items(): + worker_to_concat[worker] = concat_chunks(chunks) + return worker_to_concat + + @classmethod + def tile(cls, op: "LGBMTrain"): + ctx = get_context() + data = op.data + worker_to_args = defaultdict(dict) + + workers = cls._get_data_chunks_workers(ctx, data) + + for arg in ["_data", "_label", "_sample_weight", "_init_score"]: + if getattr(op, arg) is not None: + for worker, chunk in cls._concat_chunks_by_worker( + getattr(op, arg).chunks, workers + ).items(): + worker_to_args[worker][arg] = chunk + + if op.eval_datas: + eval_workers_list = [ + cls._get_data_chunks_workers(ctx, d) for d in op.eval_datas + ] + extra_workers = reduce( + operator.or_, (set(w) for w in eval_workers_list) + ) - set(workers) + worker_remap = dict(zip(extra_workers, itertools.cycle(workers))) + if worker_remap: + eval_workers_list = [ + [worker_remap.get(w, w) for w in wl] for wl in eval_workers_list + ] + + for arg in [ + "_eval_datas", + "_eval_labels", + "_eval_sample_weights", + "_eval_init_scores", + ]: + if getattr(op, arg): + for tileable, eval_workers in zip( + getattr(op, arg), eval_workers_list + ): + for worker, chunk in cls._concat_chunks_by_worker( + tileable.chunks, eval_workers + ).items(): + if arg not in worker_to_args[worker]: + worker_to_args[worker][arg] = [] + worker_to_args[worker][arg].append(chunk) + + out_chunks = [] + workers = list(set(workers)) + for worker_id, worker in enumerate(workers): + chunk_op = 
op.copy().reset_key() + chunk_op.expect_worker = worker + + input_chunks = [] + concat_args = worker_to_args.get(worker, {}) + for arg in [ + "_data", + "_label", + "_sample_weight", + "_init_score", + "_eval_datas", + "_eval_labels", + "_eval_sample_weights", + "_eval_init_scores", + ]: + arg_val = getattr(op, arg) + if arg_val: + arg_chunk = concat_args.get(arg) + setattr(chunk_op, arg, arg_chunk) + if isinstance(arg_chunk, list): + input_chunks.extend(arg_chunk) + else: + input_chunks.append(arg_chunk) + + worker_ports_chunk = ( + yield from recursive_tile(collect_ports(workers, op.data)) + ).chunks[0] + input_chunks.append(worker_ports_chunk) + + chunk_op._workers = workers + chunk_op._worker_ports = worker_ports_chunk + chunk_op._worker_id = worker_id + + data_chunk = concat_args["_data"] + out_chunk = chunk_op.new_chunk( + input_chunks, shape=(np.nan,), index=data_chunk.index[:1] + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, chunks=out_chunks, nsplits=((np.nan for _ in out_chunks),) + ) + + @classmethod + def execute(cls, ctx, op: "LGBMTrain"): + if op.merge: + return super().execute(ctx, op) + + from lightgbm.basic import _LIB, _safe_call + + data_val = ctx[op.data.key] + data_val = data_val.spmatrix if hasattr(data_val, "spmatrix") else data_val + + label_val = ctx[op.label.key] + sample_weight_val = ( + ctx[op.sample_weight.key] if op.sample_weight is not None else None + ) + init_score_val = ctx[op.init_score.key] if op.init_score is not None else None + + if op.eval_datas is None: + eval_set, eval_sample_weight, eval_init_score = None, None, None + else: + eval_set, eval_sample_weight, eval_init_score = [], [], [] + for data, label in zip(op.eval_datas, op.eval_labels): + data_eval = ctx[data.key] + data_eval = ( + data_eval.spmatrix if hasattr(data_eval, "spmatrix") else data_eval + ) + eval_set.append((data_eval, ctx[label.key])) + for weight in op.eval_sample_weights: + eval_sample_weight.append( + ctx[weight.key] if weight is not None else None + ) + for score in op.eval_init_scores: + eval_init_score.append(ctx[score.key] if score is not None else None) + + eval_set = eval_set or None + eval_sample_weight = eval_sample_weight or None + eval_init_score = eval_init_score or None + + params = op.params.copy() + # if model is trained, remove unsupported parameters + params.pop("out_dtype_", None) + worker_ports = ctx[op.worker_ports.key] + worker_ips = [worker.split(":", 1)[0] for worker in op.workers] + worker_endpoints = [ + f"{worker}:{port}" for worker, port in zip(worker_ips, worker_ports) + ] + + params["machines"] = ",".join(worker_endpoints) + params["time_out"] = op.timeout + params["num_machines"] = len(worker_endpoints) + params["local_listen_port"] = worker_ports[op.worker_id] + + if (op.tree_learner or "").lower() not in {"data", "feature", "voting"}: + logger.warning( + "Parameter tree_learner not set or set to incorrect value " + f'{op.tree_learner}, using "data" as default' + ) + params["tree_learner"] = "data" + else: + params["tree_learner"] = op.tree_learner + + try: + model_cls = get_model_cls_from_type(op.model_type) + model = model_cls(**params) + model.fit( + data_val, + label_val, + sample_weight=sample_weight_val, + init_score=init_score_val, + eval_set=eval_set, + eval_sample_weight=eval_sample_weight, + eval_init_score=eval_init_score, + **op.kwds, + ) + + if ( + op.model_type == LGBMModelType.RANKER + or op.model_type == LGBMModelType.REGRESSOR + ): + 
model.set_params(out_dtype_=np.dtype("float")) + elif hasattr(label_val, "dtype"): + model.set_params(out_dtype_=label_val.dtype) + else: + model.set_params(out_dtype_=label_val.dtypes[0]) + + ctx[op.outputs[0].key] = pickle.dumps(model) + finally: + _safe_call(_LIB.LGBM_NetworkFree()) + + +def train(params, train_set, eval_sets=None, **kwargs): + eval_sets = eval_sets or [] + model_type = kwargs.pop("model_type", LGBMModelType.CLASSIFIER) + + evals_result = kwargs.pop("evals_result", dict()) + session = kwargs.pop("session", None) + run_kwargs = kwargs.pop("run_kwargs", None) + if run_kwargs is None: + run_kwargs = dict() + timeout = kwargs.pop("timeout", 120) + base_port = kwargs.pop("base_port", None) + + aligns = align_data_set(train_set) + for eval_set in eval_sets: + aligns += align_data_set(eval_set) + + aligned_iter = iter(ExecutableTuple(aligns).execute(session)) + datas, labels, sample_weights, init_scores = [], [], [], [] + for dataset in [train_set] + eval_sets: + train_kw = dict() + for arg in ["data", "label", "sample_weight", "init_score"]: + if getattr(dataset, arg) is not None: + train_kw[arg] = next(aligned_iter) + else: + train_kw[arg] = None + + datas.append(train_kw["data"]) + labels.append(train_kw["label"]) + sample_weights.append(train_kw["sample_weight"]) + init_scores.append(train_kw["init_score"]) + + op = LGBMTrain( + params=params, + data=datas[0], + label=labels[0], + sample_weight=sample_weights[0], + init_score=init_scores[0], + eval_datas=datas[1:], + eval_labels=labels[1:], + eval_weights=sample_weights[1:], + eval_init_score=init_scores[1:], + model_type=model_type, + timeout=timeout, + lgbm_port=base_port, + kwds=kwargs, + ) + ret = op().execute(session=session, **run_kwargs).fetch(session=session) + + bst = pickle.loads(ret) + evals_result.update(bst.evals_result_ or {}) + return bst diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/classifier.py b/python/xorbits/_mars/learn/contrib/lightgbm/classifier.py new file mode 100644 index 000000000..06c7610e4 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/classifier.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
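+# Distributed counterpart of lightgbm.LGBMClassifier: fit() delegates to the
+# LGBMTrain operand with model_type CLASSIFIER and copies the fitted attributes
+# back onto this wrapper, predict()/predict_proba() go through predict_base, and
+# to_local() rebuilds a plain lightgbm estimator from the same parameters and
+# fitted attributes.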
+ +from ...utils import check_consistent_length +from ..utils import make_import_error_func +from ._predict import predict_base +from ._train import train +from .core import LGBMModelType, LGBMScikitLearnBase + +try: + import lightgbm +except ImportError: + lightgbm = None + + +LGBMClassifier = make_import_error_func("lightgbm") +if lightgbm: + + class LGBMClassifier(LGBMScikitLearnBase, lightgbm.LGBMClassifier): + def fit( + self, + X, + y, + sample_weight=None, + init_score=None, + eval_set=None, + eval_sample_weight=None, + eval_init_score=None, + session=None, + run_kwargs=None, + **kwargs + ): + check_consistent_length(X, y, session=session, run_kwargs=run_kwargs) + params = self.get_params(True) + model = train( + params, + self._wrap_train_tuple(X, y, sample_weight, init_score), + eval_sets=self._wrap_eval_tuples( + eval_set, eval_sample_weight, eval_init_score + ), + model_type=LGBMModelType.CLASSIFIER, + session=session, + run_kwargs=run_kwargs, + **kwargs + ) + + self.set_params(**model.get_params()) + self._copy_extra_params(model, self) + return self + + def predict(self, X, **kwargs): + return predict_base(self, X, proba=False, **kwargs) + + def predict_proba(self, X, **kwargs): + return predict_base(self, X, proba=True, **kwargs) + + def to_local(self): + model = lightgbm.LGBMClassifier(**self.get_params()) + self._copy_extra_params(self, model) + return model diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/core.py b/python/xorbits/_mars/learn/contrib/lightgbm/core.py new file mode 100644 index 000000000..4adc198cc --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/core.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
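+# Shared scaffolding for the lightgbm wrappers: LGBMScikitLearnBase converts
+# numpy/pandas inputs into Mars tensors/dataframes, packs (data, label,
+# sample_weight, init_score) into TrainTuple objects consumed by train(), and can
+# also be constructed from an already fitted lightgbm model by copying its
+# parameters and extra fitted attributes.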
+ +import enum +import itertools +from collections import namedtuple + +import numpy as np +import pandas as pd + +from ....dataframe import DataFrame as MarsDataFrame +from ....dataframe import Series as MarsSeries +from ....tensor import tensor as mars_tensor + + +class LGBMModelType(enum.Enum): + CLASSIFIER = 0 + REGRESSOR = 1 + RANKER = 2 + + +_model_type_to_model = dict() + + +def get_model_cls_from_type(model_type: LGBMModelType): + import lightgbm + + if not _model_type_to_model: + _model_type_to_model.update( + { + LGBMModelType.CLASSIFIER: lightgbm.LGBMClassifier, + LGBMModelType.REGRESSOR: lightgbm.LGBMRegressor, + LGBMModelType.RANKER: lightgbm.LGBMRanker, + } + ) + return _model_type_to_model[model_type] + + +TrainTuple = namedtuple("TrainTuple", "data label sample_weight init_score") + + +class LGBMScikitLearnBase: + def __init__(self, *args, **kwargs): + if args and isinstance(args[0], self._get_lgbm_class()): + model = args[0] + super().__init__(**model.get_params()) + self._copy_extra_params(model, self) + else: + super().__init__(*args, **kwargs) + + @classmethod + def _get_lgbm_class(cls): + try: + return getattr(cls, "_lgbm_class") + except AttributeError: + lgbm_class = next( + base for base in cls.__bases__ if base.__module__.startswith("lightgbm") + ) + cls._lgbm_class = lgbm_class + return lgbm_class + + @classmethod + def _get_param_names(cls): + return cls._get_lgbm_class()._get_param_names() + + @staticmethod + def _copy_extra_params(source, dest): + params = source.get_params() + attributes = source.__dict__ + extra_param_names = set(attributes.keys()).difference(params.keys()) + for name in extra_param_names: + setattr(dest, name, attributes[name]) + + @staticmethod + def _convert_tileable(obj): + if isinstance(obj, np.ndarray): + return mars_tensor(obj) + elif isinstance(obj, pd.DataFrame): + return MarsDataFrame(obj) + elif isinstance(obj, pd.Series): + return MarsSeries(obj) + return obj + + @classmethod + def _wrap_train_tuple(cls, data, label, sample_weight=None, init_score=None): + data = cls._convert_tileable(data) + label = cls._convert_tileable(label) + sample_weight = cls._convert_tileable(sample_weight) + init_score = cls._convert_tileable(init_score) + return TrainTuple(data, label, sample_weight, init_score) + + @staticmethod + def _wrap_eval_tuples(eval_set=None, eval_sample_weight=None, eval_init_score=None): + if not eval_set: + return None + + tps = [] + for (data, label), weight, score in zip( + eval_set, + eval_sample_weight or itertools.repeat(None), + eval_init_score or itertools.repeat(None), + ): + tps.append(TrainTuple(data, label, weight, score)) + return tps + + def fit(self, X, y, sample_weight=None, **kwargs): + raise NotImplementedError + + def predict(self, X, **kwargs): + raise NotImplementedError + + def predict_proba(self, X, **kwargs): + raise NotImplementedError + + def load_model(self, model): + self.set_params(**self.get_params()) + self._copy_extra_params(model, self) + return self diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/ranker.py b/python/xorbits/_mars/learn/contrib/lightgbm/ranker.py new file mode 100644 index 000000000..3963d5123 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/ranker.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils import check_consistent_length +from ..utils import make_import_error_func +from ._predict import predict_base +from ._train import train +from .core import LGBMModelType, LGBMScikitLearnBase + +try: + import lightgbm +except ImportError: + lightgbm = None + + +LGBMRanker = make_import_error_func("lightgbm") +if lightgbm: + + class LGBMRanker(LGBMScikitLearnBase, lightgbm.LGBMRanker): + def fit( + self, + X, + y, + sample_weight=None, + init_score=None, + group=None, + eval_set=None, + eval_sample_weight=None, + eval_init_score=None, + session=None, + run_kwargs=None, + **kwargs + ): + check_consistent_length(X, y, session=session, run_kwargs=run_kwargs) + params = self.get_params(True) + model = train( + params, + self._wrap_train_tuple(X, y, sample_weight, init_score), + eval_sets=self._wrap_eval_tuples( + eval_set, eval_sample_weight, eval_init_score + ), + group=group, + model_type=LGBMModelType.RANKER, + session=session, + run_kwargs=run_kwargs, + **kwargs + ) + + self.set_params(**model.get_params()) + self._copy_extra_params(model, self) + return self + + def predict(self, X, **kw): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", None) + return predict_base(self, X, session=session, run_kwargs=run_kwargs, **kw) + + def to_local(self): + model = lightgbm.LGBMRanker(**self.get_params()) + self._copy_extra_params(self, model) + return model diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/regressor.py b/python/xorbits/_mars/learn/contrib/lightgbm/regressor.py new file mode 100644 index 000000000..31ed81e4a --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/regressor.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
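+# Distributed counterpart of lightgbm.LGBMRegressor: fit() delegates to train()
+# with model_type REGRESSOR, and predict() converts X into a Mars tileable before
+# calling predict_base.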
+ +from ...utils import check_consistent_length +from ..utils import make_import_error_func +from ._predict import predict_base +from ._train import train +from .core import LGBMModelType, LGBMScikitLearnBase + +try: + import lightgbm +except ImportError: + lightgbm = None + + +LGBMRegressor = make_import_error_func("lightgbm") +if lightgbm: + + class LGBMRegressor(LGBMScikitLearnBase, lightgbm.LGBMRegressor): + def fit( + self, + X, + y, + sample_weight=None, + init_score=None, + eval_set=None, + eval_sample_weight=None, + eval_init_score=None, + session=None, + run_kwargs=None, + **kwargs + ): + check_consistent_length(X, y, session=session, run_kwargs=run_kwargs) + params = self.get_params(True) + model = train( + params, + self._wrap_train_tuple(X, y, sample_weight, init_score), + eval_sets=self._wrap_eval_tuples( + eval_set, eval_sample_weight, eval_init_score + ), + model_type=LGBMModelType.REGRESSOR, + session=session, + run_kwargs=run_kwargs, + **kwargs + ) + + self.set_params(**model.get_params()) + self._copy_extra_params(model, self) + return self + + def predict(self, X, **kw): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", None) + X = self._convert_tileable(X) + return predict_base(self, X, session=session, run_kwargs=run_kwargs, **kw) + + def to_local(self): + model = lightgbm.LGBMRegressor(**self.get_params()) + self._copy_extra_params(self, model) + return model diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/tests/__init__.py b/python/xorbits/_mars/learn/contrib/lightgbm/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_classifier.py b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_classifier.py new file mode 100644 index 000000000..ce5db3332 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_classifier.py @@ -0,0 +1,173 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from ..... import tensor as mt + +try: + import lightgbm + + from .. 
import LGBMClassifier +except ImportError: + lightgbm = LGBMClassifier = None + + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X_raw = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y_raw = rs.rand(n_rows, chunk_size=chunk_size) +filter = rs.rand(n_rows, chunk_size=chunk_size) < 0.8 +X = X_raw[filter] +y = y_raw[filter] + +X_df = md.DataFrame(X) +x_sparse = np.random.rand(n_rows, n_columns) +x_sparse[np.arange(n_rows), np.random.randint(n_columns, size=n_rows)] = np.nan +X_sparse = mt.tensor(x_sparse, chunk_size=chunk_size).tosparse(missing=np.nan)[filter] + + +@pytest.mark.skipif(lightgbm is None, reason="LightGBM not installed") +def test_local_classifier(setup): + y_data = (y * 10).astype(mt.int32) + classifier = LGBMClassifier(n_estimators=2) + classifier.fit(X, y_data, eval_set=[(X, y_data)], verbose=True) + prediction = classifier.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + assert isinstance(prediction, mt.Tensor) + + # test sparse tensor + X_sparse_data = X_sparse + classifier = LGBMClassifier(n_estimators=2) + classifier.fit( + X_sparse_data, y_data, eval_set=[(X_sparse_data, y_data)], verbose=True + ) + prediction = classifier.predict(X_sparse_data) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + assert isinstance(prediction, mt.Tensor) + + prob = classifier.predict_proba(X) + assert prob.shape == X.shape + + prediction_empty = classifier.predict(mt.array([]).reshape((0, X.shape[1]))) + assert prediction_empty.shape == (0,) + + # test dataframe + X_df_data = X_df + classifier = LGBMClassifier(n_estimators=2) + classifier.fit(X_df_data, y_data, verbose=True) + prediction = classifier.predict(X_df_data) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + prob = classifier.predict_proba(X_df) + + assert prob.ndim == 2 + assert prob.shape == (len(X), 10) + + # test weight + weights = [mt.random.rand(X.shape[0]), md.Series(mt.random.rand(X.shape[0]))] + y_df = md.DataFrame(y_data) + for weight in weights: + classifier = LGBMClassifier(n_estimators=2) + classifier.fit(X, y_df, sample_weight=weight, verbose=True) + prediction = classifier.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + # should raise error if weight.ndim > 1 + with pytest.raises(ValueError): + LGBMClassifier(n_estimators=2).fit( + X, y_df, sample_weight=mt.random.rand(1, 1), verbose=True + ) + + # test binary classifier + new_y = (y_data > 0.5).astype(mt.int32) + classifier = LGBMClassifier(n_estimators=2) + classifier.fit(X, new_y, verbose=True) + + prediction = classifier.predict(X) + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + prediction = classifier.predict_proba(X) + assert prediction.ndim == 2 + assert prediction.shape[0] == len(X) + + # test with existing model + X_np = X.execute().fetch() + new_y_np = new_y.execute().fetch() + raw_classifier = lightgbm.LGBMClassifier(n_estimators=2) + raw_classifier.fit(X_np, new_y_np, verbose=True) + + classifier = LGBMClassifier(raw_classifier) + label_result = classifier.predict(X_df) + assert label_result.ndim == 1 + assert label_result.shape[0] == len(X) + + proba_result = classifier.predict_proba(X_df) + assert proba_result.ndim == 2 + assert proba_result.shape[0] == len(X) + + +@pytest.mark.skipif(lightgbm is None, reason="LightGBM not installed") +def test_local_classifier_from_to_parquet(setup): + n_rows = 1000 + n_columns = 10 + rs = np.random.RandomState(0) + X = 
rs.rand(n_rows, n_columns) + y = (rs.rand(n_rows) > 0.5).astype(np.int32) + df = pd.DataFrame(X, columns=[f"c{i}" for i in range(n_columns)]) + + # test with existing model + classifier = lightgbm.LGBMClassifier(n_estimators=2) + classifier.fit(X, y, verbose=True) + + with tempfile.TemporaryDirectory() as d: + result_dir = os.path.join(d, "result") + os.mkdir(result_dir) + data_dir = os.path.join(d, "data") + os.mkdir(data_dir) + + df.iloc[:500].to_parquet(os.path.join(d, "data", "data1.parquet")) + df.iloc[500:].to_parquet(os.path.join(d, "data", "data2.parquet")) + + df = md.read_parquet(data_dir) + model = LGBMClassifier() + model.load_model(classifier) + result = model.predict(df, run=False) + r = md.DataFrame(result).to_parquet(result_dir) + + r.execute() + + ret = md.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy() + expected = classifier.predict(X) + expected = np.stack([1 - expected, expected]).argmax(axis=0) + np.testing.assert_array_equal(ret, expected) diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_ranker.py b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_ranker.py new file mode 100644 index 000000000..f060be69b --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_ranker.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ..... import tensor as mt + +try: + import lightgbm + + from .. 
import LGBMRanker +except ImportError: + lightgbm = LGBMRanker = None + + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X_raw = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y_raw = rs.rand(n_rows, chunk_size=chunk_size) + + +@pytest.mark.skipif(lightgbm is None, reason="LightGBM not installed") +def test_local_ranker(setup): + y = (y_raw * 10).astype(mt.int32) + ranker = LGBMRanker(n_estimators=2) + ranker.fit(X_raw, y, group=[X_raw.shape[0]], verbose=True) + prediction = ranker.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + assert isinstance(prediction, mt.Tensor) + result = prediction.fetch() + assert prediction.dtype == result.dtype + + # test weight + weight = mt.random.rand(X_raw.shape[0]) + ranker = LGBMRanker(verbosity=1, n_estimators=2) + ranker.fit(X_raw, y, group=[X_raw.shape[0]], sample_weight=weight) + prediction = ranker.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + result = prediction.fetch() + assert prediction.dtype == result.dtype + + # test local model + X_np = X_raw.execute().fetch() + y_np = y.execute().fetch() + raw_ranker = lightgbm.LGBMRanker(verbosity=1, n_estimators=2) + raw_ranker.fit(X_np, y_np, group=[X_raw.shape[0]]) + prediction = LGBMRanker(raw_ranker).predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_regressor.py b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_regressor.py new file mode 100644 index 000000000..22f00e5cb --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_regressor.py @@ -0,0 +1,91 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +from ..... import tensor as mt + +try: + import lightgbm + + from .. 
import LGBMRegressor +except ImportError: + lightgbm = LGBMRegressor = None + + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y = rs.randint(0, 10, n_rows, chunk_size=chunk_size) + + +@pytest.mark.skipif(lightgbm is None, reason="LightGBM not installed") +def test_local_regressor(setup): + regressor = LGBMRegressor(n_estimators=2) + regressor.fit(X, y, verbose=True) + prediction = regressor.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + assert isinstance(prediction, mt.Tensor) + result = prediction.fetch() + assert prediction.dtype == result.dtype + + # test weight + weight = mt.random.rand(X.shape[0]) + regressor = LGBMRegressor(verbosity=1, n_estimators=2) + regressor.fit(X, y, sample_weight=weight) + prediction = regressor.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + result = prediction.fetch() + assert prediction.dtype == result.dtype + + # test numpy tensor + try: + from sklearn.datasets import make_classification + + X_array, y_array = make_classification() + regressor = LGBMRegressor(n_estimators=2) + regressor.fit(X_array, y_array, verbose=True) + prediction = regressor.predict(X_array) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_array) + + X_df = pd.DataFrame(X_array) + y_df = pd.Series(y_array) + regressor = LGBMRegressor(n_estimators=2) + regressor.fit(X_df, y_df, verbose=True) + prediction = regressor.predict(X_df) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_df) + except ImportError: + pass + + # test existing model + X_np = X.execute().fetch() + y_np = y.execute().fetch() + raw_regressor = lightgbm.LGBMRegressor(verbosity=1, n_estimators=2) + raw_regressor.fit(X_np, y_np) + prediction = LGBMRegressor(raw_regressor).predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) diff --git a/python/xorbits/_mars/learn/contrib/pytorch/__init__.py b/python/xorbits/_mars/learn/contrib/pytorch/__init__.py new file mode 100644 index 000000000..77f4deef9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .dataset import MarsDataset # noqa: F401 # pylint: disable=unused-import +from .run_script import run_pytorch_script +from .sampler import ( # noqa: F401 # pylint: disable=unused-import + DistributedSampler, + RandomSampler, + SequentialSampler, + SubsetRandomSampler, +) + + +def register_op(): + from .run_script import RunPyTorch + + del RunPyTorch diff --git a/python/xorbits/_mars/learn/contrib/pytorch/dataset.py b/python/xorbits/_mars/learn/contrib/pytorch/dataset.py new file mode 100644 index 000000000..283b600b1 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/dataset.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from typing import List + +import numpy as np +import pandas as pd + +try: + import torch + from torch.utils.data import Dataset +except ImportError: # pragma: no cover + torch = None + Dataset = object + +from .... import execute +from ....core.context import get_context +from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE +from ....tensor.core import TENSOR_TYPE +from ....utils import require_not_none + +ACCEPT_TYPE = ( + TENSOR_TYPE, + DATAFRAME_TYPE, + SERIES_TYPE, + np.ndarray, + pd.DataFrame, + pd.Series, + List, +) + + +@require_not_none(torch) +class MarsDataset(Dataset): + r"""MarsDataset that inherit from torch.utils.data.Dataset. + It converts from Mars basic datatype such as Tensor, + DataFrame, Series. Additionally, it's constructor can receive + np.ndarray, pd.DataFrame, pd.Series type. + """ + + def __init__(self, *tileables, fetch_kwargs=None): + self._context = get_context() + self._tileables = tileables + self._fetch_kwargs = fetch_kwargs or dict() + self._executed = False + self._check_type() + + def _check_type(self): + for t in self._tileables: + if not isinstance(t, ACCEPT_TYPE): + raise TypeError(f"Unexpected dataset type: {type(t)}") + + def _execute(self): + execute_data = [t for t in self._tileables if isinstance(t, ACCEPT_TYPE[:3])] + if len(execute_data): + execute(execute_data) + + def __len__(self): + return self._tileables[0].shape[0] + + def __getitem__(self, index): + if not self._executed: + self._execute() + self._executed = True + return tuple(self.get_data(t, index) for t in self._tileables) + + def get_data(self, t, index): + fetch_kwargs = dict() + if self._fetch_kwargs: + fetch_kwargs = copy.deepcopy(self._fetch_kwargs) + + if isinstance(t, TENSOR_TYPE): + return t[index].fetch(**fetch_kwargs) + elif isinstance(t, np.ndarray): + return t[index] + elif isinstance(t, DATAFRAME_TYPE): + return t.iloc[index].fetch(**fetch_kwargs).values + elif isinstance(t, SERIES_TYPE): + return t.iloc[index].fetch(**fetch_kwargs) + elif isinstance(t, pd.DataFrame): + return t.iloc[index].values + elif isinstance(t, pd.Series): + return t.iloc[index] + else: + return t[index] diff --git a/python/xorbits/_mars/learn/contrib/pytorch/run_script.py b/python/xorbits/_mars/learn/contrib/pytorch/run_script.py new file mode 100644 index 000000000..7e15a60a8 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/run_script.py @@ -0,0 +1,162 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Union + +import numpy as np + +from .... import opcodes as OperandDef +from ....core.context import get_context +from ....remote.run_script import RunScript, _extract_inputs +from ....serialization.serializables import Int32Field, StringField +from ....typing import SessionType, TileableType +from ....utils import to_binary +from ..utils import pick_workers + + +class RunPyTorch(RunScript): + _op_type_ = OperandDef.RUN_PYTORCH + + # used for chunk op + _master_port = Int32Field("master_port") + _master_addr = StringField("master_addr") + _rank = Int32Field("rank") + _init_method = StringField("init_method") + + def __init__( + self, master_port=None, master_addr=None, init_method=None, gpu=None, **kw + ): + super().__init__( + _master_port=master_port, + _master_addr=master_addr, + _init_method=init_method, + gpu=gpu, + **kw + ) + + @property + def master_port(self): + return self._master_port + + @property + def master_addr(self): + return self._master_addr + + @property + def init_method(self): + return self._init_method + + @classmethod + def tile(cls, op): + ctx = get_context() + + workers = pick_workers(ctx.get_worker_addresses(), op.world_size) + data, input_chunks = cls._get_chunk_data(op) + + out_chunks = [] + for i in range(op.world_size): + chunk_op = op.copy().reset_key() + chunk_op._data = data + chunk_op.expect_worker = workers[i] + if op.init_method is None: + chunk_op._master_port = op.master_port + chunk_op._master_addr = workers[0].split(":", 1)[0] + chunk_op._rank = i + chunk_op._init_method = op.init_method + out_chunks.append(chunk_op.new_chunk(input_chunks, index=(i,))) + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, + chunks=out_chunks, + nsplits=(tuple(np.nan for _ in range(len(out_chunks))),), + ) + + @classmethod + def _build_envs(cls, ctx, op): + envs = super()._build_envs(ctx, op) + if op.master_port is not None: + envs["MASTER_PORT"] = str(op.master_port) + if op.master_addr is not None: + envs["MASTER_ADDR"] = str(op.master_addr) + return envs + + @classmethod + def execute(cls, ctx, op): + assert ctx.local_address.split(":")[0] == op.expect_worker.split(":")[0] + + super().execute(ctx, op) + + +def run_pytorch_script( + script: Union[bytes, str, BinaryIO, TextIO], + n_workers: int, + data: Dict[str, TileableType] = None, + gpu: Optional[bool] = None, + command_argv: List[str] = None, + retry_when_fail: bool = False, + session: SessionType = None, + run_kwargs: Dict[str, Any] = None, + port: int = None, +): + """ + Run PyTorch script in Mars cluster. + + Parameters + ---------- + script: str or file-like object + Script to run + n_workers : int + Number of PyTorch workers + data : dict + Variable name to data. + gpu : bool + Run PyTorch script on GPU + command_argv : list + Extra command args for script + retry_when_fail : bool + If True, retry when function failed. + session + Mars session, if not provided, will use default one. + run_kwargs : dict + Extra kwargs for `session.run`. 
+ port : int + Port of PyTorch worker or ps, will automatically increase for the same worker + + Returns + ------- + status + return {'status': 'ok'} if succeeded, or error raised + """ + if int(n_workers) <= 0: + raise ValueError("n_workers should be at least 1") + if hasattr(script, "read"): + code = script.read() + else: + with open(os.path.abspath(script), "rb") as f: + code = f.read() + + inputs = _extract_inputs(data) + port = 29500 if port is None else port + op = RunPyTorch( + data=data, + code=to_binary(code), + world_size=int(n_workers), + retry_when_fail=retry_when_fail, + gpu=gpu, + master_port=port, + command_args=command_argv, + ) + return op(inputs).execute(session=session, **(run_kwargs or {})) diff --git a/python/xorbits/_mars/learn/contrib/pytorch/sampler.py b/python/xorbits/_mars/learn/contrib/pytorch/sampler.py new file mode 100644 index 000000000..f783d7609 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/sampler.py @@ -0,0 +1,287 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Iterator, Optional, Sequence, Sized + +try: + import torch + from torch.utils.data import Sampler +except ImportError: # pragma: no cover + torch = None + Sampler = object + +from ....utils import require_not_none + + +@require_not_none(torch) +class SequentialSampler(Sampler): + r""" + Samples elements sequentially, always in the same order. + + Args: + data_source (Dataset): dataset to sample from + """ + data_source: Sized + + def __init__(self, data_source): + self.data_source = data_source + + def __iter__(self) -> Iterator[int]: + return iter(range(len(self.data_source))) + + def __len__(self) -> int: + return len(self.data_source) + + +@require_not_none(torch) +class RandomSampler(Sampler): + r""" + Samples elements randomly. If without replacement, then sample from a shuffled dataset. + If with replacement, then user can specify :attr:`num_samples` to draw. + + Args: + data_source (Dataset): dataset to sample from + replacement (bool): samples are drawn on-demand with replacement if ``True``, default=``False`` + num_samples (int): number of samples to draw, default=`len(dataset)`. This argument + is supposed to be specified only when `replacement` is ``True``. + generator (Generator): Generator used in sampling. + """ + data_source: Sized + replacement: bool + + def __init__( + self, data_source, replacement=False, num_samples=None, generator=None + ): + self.data_source = data_source + self.replacement = replacement + self._num_samples = num_samples + self.generator = generator + + if not isinstance(self.replacement, bool): + raise ValueError( + "replacement should be a boolean value, but got " + f"replacement={self.replacement}" + ) + + if self._num_samples is not None and not replacement: + raise ValueError( + "With replacement=False, num_samples should not be specified, " + "since a random permute will be performed." 
+ ) + + if not isinstance(self.num_samples, int) or self.num_samples <= 0: + raise ValueError( + "num_samples should be a positive integer " + f"value, but got num_samples={self.num_samples}" + ) + + @property + def num_samples(self): + # dataset size might change at runtime + if self._num_samples is None: + return len(self.data_source) + return self._num_samples + + def __iter__(self): + n = len(self.data_source) + if self.generator is None: + generator = torch.Generator() + generator.manual_seed( + int(torch.empty((), dtype=torch.int64).random_().item()) + ) + else: + generator = self.generator + if self.replacement: + for _ in range(self.num_samples // 32): + yield from torch.randint( + high=n, size=(32,), dtype=torch.int64, generator=generator + ).tolist() + yield from torch.randint( + high=n, + size=(self.num_samples % 32,), + dtype=torch.int64, + generator=generator, + ).tolist() + else: + yield from torch.randperm(n, generator=generator).tolist() + + def __len__(self) -> int: + return self.num_samples + + +@require_not_none(torch) +class SubsetRandomSampler(Sampler): + """ + Samples elements randomly from a given list of indices, without replacement. + + Args: + indices (sequence): a sequence of indices + generator (Generator): Generator used in sampling. + """ + + indices: Sequence[int] + + def __init__(self, indices: Sequence[int], generator=None) -> None: + self.indices = indices + self.generator = generator + + def __iter__(self) -> Iterator[int]: + return ( + self.indices[i] + for i in torch.randperm(len(self.indices), generator=self.generator) + ) + + def __len__(self) -> int: + return len(self.indices) + + +@require_not_none(torch) +class DistributedSampler(Sampler): + r""" + Sampler that restricts data loading to a subset of the dataset. + + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such a case, each + process can pass a :class:`~torch.utils.data.DistributedSampler` instance as a + :class:`~torch.utils.data.DataLoader` sampler, and load a subset of the + original dataset that is exclusive to it. + + .. note:: + Dataset is assumed to be of constant size. + + Args: + dataset: Dataset used for sampling. + num_replicas (int, optional): Number of processes participating in + distributed training. By default, :attr:`world_size` is retrieved from the + current distributed group. + rank (int, optional): Rank of the current process within :attr:`num_replicas`. + By default, :attr:`rank` is retrieved from the current distributed + group. + shuffle (bool, optional): If ``True`` (default), sampler will shuffle the + indices. + seed (int, optional): random seed used to shuffle the sampler if + :attr:`shuffle=True`. This number should be identical across all + processes in the distributed group. Default: ``0``. + drop_last (bool, optional): if ``True``, then the sampler will drop the + tail of the data to make it evenly divisible across the number of + replicas. If ``False``, the sampler will add extra indices to make + the data evenly divisible across the replicas. Default: ``False``. + + .. warning:: + In distributed mode, calling the :meth:`set_epoch` method at + the beginning of each epoch **before** creating the :class:`DataLoader` iterator + is necessary to make shuffling work properly across multiple epochs. Otherwise, + the same ordering will be always used. + + Example:: + + >>> sampler = DistributedSampler(dataset) if is_distributed else None + >>> loader = DataLoader(dataset, shuffle=(sampler is None), + ... 
sampler=sampler) + >>> for epoch in range(start_epoch, n_epochs): + ... if is_distributed: + ... sampler.set_epoch(epoch) + ... train(loader) + """ + + def __init__( + self, + dataset, + num_replicas: Optional[int] = None, + rank: Optional[int] = None, + shuffle: bool = True, + seed: int = 0, + drop_last: bool = False, + ) -> None: + import torch.distributed as dist + + if num_replicas is None: # pragma: no cover + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: # pragma: no cover + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + if rank >= num_replicas or rank < 0: + raise ValueError( + "Invalid rank {}, rank should be in the interval" + " [0, {}]".format(rank, num_replicas - 1) + ) + + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.drop_last = drop_last + # If the dataset length is evenly divisible by # of replicas, then there + # is no need to drop any data, since the dataset will be split equally. + if self.drop_last and len(self.dataset) % self.num_replicas != 0: + # Split to nearest available length that is evenly divisible. + # This is to ensure each rank receives the same amount of data when + # using this Sampler. + self.num_samples = math.ceil( + (len(self.dataset) - self.num_replicas) / self.num_replicas + ) + else: + self.num_samples = math.ceil(len(self.dataset) / self.num_replicas) + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + self.seed = seed + + def generate_indices(self): + if self.shuffle: + # deterministically shuffle based on epoch and seed + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = list(range(len(self.dataset))) + + if not self.drop_last: + # add extra samples to make it evenly divisible + padding_size = self.total_size - len(indices) + if padding_size <= len(indices): + indices += indices[:padding_size] + else: + indices += (indices * math.ceil(padding_size / len(indices)))[ + :padding_size + ] + else: + # remove tail of data to make it evenly divisible. + indices = indices[: self.total_size] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return indices + + def __iter__(self): + return iter(self.generate_indices()) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch: int): + r"""Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas + use a different random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. + """ + self.epoch = epoch diff --git a/python/xorbits/_mars/learn/contrib/pytorch/tests/__init__.py b/python/xorbits/_mars/learn/contrib/pytorch/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_dataset.py b/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_dataset.py new file mode 100644 index 000000000..737472ab2 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_dataset.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + + +def get_model(): + import torch.nn as nn + + return nn.Sequential( + nn.Linear(32, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, 10), + nn.Softmax(), + ) + + +def main(feature_data, labels): + import torch.distributed as dist + import torch.nn as nn + import torch.optim as optim + import torch.utils.data + from mars.learn.contrib.pytorch import DistributedSampler, MarsDataset + + dist.init_process_group(backend="gloo") + torch.manual_seed(42) + + data = feature_data + labels = labels + + train_dataset = MarsDataset(data, labels) + assert len(train_dataset) == 1000 + + train_sampler = DistributedSampler(train_dataset, shuffle=True) + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, + batch_size=32, + shuffle=(train_sampler is None), + sampler=train_sampler, + ) + + model = nn.parallel.DistributedDataParallel(get_model()) + optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5) + criterion = nn.BCELoss() + + for i in range(2): + # 2 epochs + train_sampler.set_epoch(i) + running_loss = 0.0 + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + running_loss += loss.item() + print(f"running_loss is {loss.item()}") + + print("Done!") + + +if __name__ == "__main__": + assert len(sys.argv) == 2 + assert sys.argv[1] == "multiple" + feature_data = globals()["feature_data"] + labels = globals()["labels"] + main(feature_data, labels) diff --git a/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_sample.py b/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_sample.py new file mode 100644 index 000000000..b6f298744 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_sample.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + + +def get_model(): + import torch.nn as nn + + return nn.Sequential( + nn.Linear(32, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, 10), + nn.Softmax(), + ) + + +def main(): + import torch.distributed as dist + import torch.nn as nn + import torch.optim as optim + import torch.utils.data + + dist.init_process_group(backend="gloo") + torch.manual_seed(42) + + data = torch.rand((1000, 32), dtype=torch.float32) + labels = torch.randint(1, (1000, 10), dtype=torch.float32) + + train_dataset = torch.utils.data.TensorDataset(data, labels) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, shuffle=False, sampler=train_sampler + ) + + model = nn.parallel.DistributedDataParallel(get_model()) + optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5) + criterion = nn.BCELoss() + + for _ in range(2): + # 2 epochs + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +if __name__ == "__main__": + assert len(sys.argv) == 2 + assert sys.argv[1] == "multiple" + main() diff --git a/python/xorbits/_mars/learn/contrib/pytorch/tests/test_dataset.py b/python/xorbits/_mars/learn/contrib/pytorch/tests/test_dataset.py new file mode 100644 index 000000000..d70d5463c --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/tests/test_dataset.py @@ -0,0 +1,325 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +from ..... import dataframe as md +from ..... import tensor as mt +from .....utils import lazy_import +from .. 
import ( + DistributedSampler, + MarsDataset, + RandomSampler, + SequentialSampler, + SubsetRandomSampler, + run_pytorch_script, +) + +torch_installed = lazy_import("torch") is not None + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_mars_dataset(setup): + import numpy as np + import pandas as pd + from torch.utils.data import Dataset + + # Mars tensor + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + data_verify = data[1].execute().fetch() + labels_verify = labels[1].execute().fetch() + + train_dataset = MarsDataset(data, labels) + + assert isinstance(train_dataset, Dataset) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # np ndarray + data = np.random.rand(1000, 32) + labels = np.random.randint(0, 2, (1000, 10)) + + data_verify = data[1] + labels.dtype = "float32" + labels_verify = labels[1] + + train_dataset = MarsDataset(data, labels) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # Mars dataframe + data = md.DataFrame(data) + labels = md.DataFrame(labels) + + data_verify = data.iloc[1].execute().fetch().values + labels_verify = labels.iloc[1].execute().fetch().values + + train_dataset = MarsDataset( + data, labels, fetch_kwargs={"extra_config": {"check_series_name": False}} + ) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # Mars Series + label = labels[1] + + label_verify = label[1].execute().fetch() + + train_dataset = MarsDataset( + data, label, fetch_kwargs={"extra_config": {"check_series_name": False}} + ) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + assert train_dataset[1][1] == label_verify + assert len(train_dataset) == 1000 + + # pandas dataframe + data = pd.DataFrame(np.random.rand(1000, 32)) + labels = pd.DataFrame(np.random.randint(0, 2, (1000, 10)), dtype="float32") + + data_verify = data.iloc[1].values + labels_verify = labels.iloc[1].values + + train_dataset = MarsDataset(data, labels) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # pands series + label = labels[1] + label_verify = label[1] + + train_dataset = MarsDataset(data, label) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + assert train_dataset[1][1] == label_verify + assert len(train_dataset) == 1000 + + # test TypeError + label = tuple(range(1000)) + + with pytest.raises(TypeError) as e: + train_dataset = MarsDataset(data, label) + exec_msg = e.value.args[0] + assert exec_msg == "Unexpected dataset type: " + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_sequential_sampler(setup_cluster): + import torch + + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + train_dataset = MarsDataset(data, labels) + assert len(train_dataset) == 1000 + + train_sampler = SequentialSampler(train_dataset) + assert len(train_sampler) == 1000 + + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + + model = torch.nn.Sequential( + 
torch.nn.Linear(32, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 10), + torch.nn.Softmax(dim=1), + ) + + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5) + criterion = torch.nn.BCELoss() + for _ in range(2): + # 2 epochs + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_random_sampler(setup_cluster): + import torch + + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + train_dataset = MarsDataset(data, labels) + + # test __init__() + with pytest.raises(ValueError) as e: + train_sampler = RandomSampler(train_dataset, replacement=1) + exec_msg = e.value.args[0] + assert exec_msg == "replacement should be a boolean value, but got replacement=1" + + with pytest.raises(ValueError) as e: + train_sampler = RandomSampler(train_dataset, num_samples=900) + exec_msg = e.value.args[0] + assert ( + exec_msg + == "With replacement=False, num_samples should not " + + "be specified, since a random permute will be performed." + ) + + with pytest.raises(ValueError) as e: + train_sampler = RandomSampler(train_dataset, replacement=True, num_samples=-1) + exec_msg = e.value.args[0] + assert ( + exec_msg + == "num_samples should be a positive integer value, but got num_samples=-1" + ) + + train_sampler = RandomSampler(train_dataset) + + # test __len__ num_samples() + assert len(train_sampler) == 1000 + assert train_sampler.num_samples == 1000 + + # test __iter__ + g_cpu = torch.Generator() + g_cpu.manual_seed(2147483647) + + train_sampler = RandomSampler(train_dataset, generator=g_cpu) + assert len(train_sampler) == 1000 + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + train_sampler = RandomSampler(train_dataset, replacement=True, num_samples=900) + assert len(train_sampler) == 900 + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + # torch train + model = torch.nn.Sequential( + torch.nn.Linear(32, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 10), + torch.nn.Softmax(dim=1), + ) + + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5) + criterion = torch.nn.BCELoss() + for _ in range(2): + # 2 epochs + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_subset_random_sampler(setup_cluster): + import numpy as np + import torch + + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + data.execute() + labels.execute() + + train_dataset = MarsDataset(data, labels) + train_sampler = SubsetRandomSampler( + np.random.choice(range(len(train_dataset)), len(train_dataset)) + ) + + assert len(train_sampler) == 1000 + 
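+    # Note: SubsetRandomSampler only permutes the indices it is given; since
+    # np.random.choice above draws with replacement, individual rows may be
+    # visited more than once (or skipped) within a single pass over the data.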
train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_distributed_sampler(setup_cluster): + import torch + + data = mt.random.rand(1001, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1001, 10), dtype="f4") + + train_dataset = MarsDataset(data, labels) + + with pytest.raises(ValueError) as e: + train_sampler = DistributedSampler(train_dataset, num_replicas=2, rank=-1) + exec_msg = e.value.args[0] + assert exec_msg == "Invalid rank -1, rank should be in the interval [0, 1]" + + train_sampler = DistributedSampler( + train_dataset, num_replicas=2, rank=0, drop_last=True, shuffle=True + ) + assert len(train_sampler) == 500 + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + train_sampler = DistributedSampler( + train_dataset, num_replicas=2, rank=0, drop_last=False, shuffle=False + ) + train_sampler.set_epoch(10) + assert len(train_sampler) == 501 + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_mars_dataset_script(setup_cluster): + sess = setup_cluster + path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "pytorch_dataset.py" + ) + + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + assert ( + run_pytorch_script( + path, + n_workers=2, + data={"feature_data": data, "labels": labels}, + command_argv=["multiple"], + port=9945, + session=sess, + ).fetch()["status"] + == "ok" + ) diff --git a/python/xorbits/_mars/learn/contrib/pytorch/tests/test_run_script.py b/python/xorbits/_mars/learn/contrib/pytorch/tests/test_run_script.py new file mode 100644 index 000000000..811c13ab7 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/tests/test_run_script.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +from .....utils import lazy_import +from .. 
import run_pytorch_script + +torch_installed = lazy_import("torch") is not None + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_distributed_run_py_torch_script(setup_cluster): + sess = setup_cluster + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "pytorch_sample.py") + assert ( + run_pytorch_script( + path, n_workers=2, command_argv=["multiple"], port=9945, session=sess + ).fetch()["status"] + == "ok" + ) diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/__init__.py b/python/xorbits/_mars/learn/contrib/statsmodels/__init__.py new file mode 100644 index 000000000..13eeef52a --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import MarsDistributedModel, MarsResults + + +def register_op(): + from . import predict, train + + del train, predict diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/api.py b/python/xorbits/_mars/learn/contrib/statsmodels/api.py new file mode 100644 index 000000000..515e7b0c1 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/api.py @@ -0,0 +1,101 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
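+
+# MarsDistributedModel is a thin wrapper around statsmodels'
+# DistributedModel: endog/exog are split into partitions, each partition is
+# fitted as a separate Mars task (see train.py), and the partial estimates
+# are joined back into a single statsmodels results object, which MarsResults
+# exposes through the usual attribute access and predict() API.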
+ +import pickle # nosec # pylint: disable=import_pickle + +from .predict import StatsModelsPredict +from .train import StatsModelsTrain + +try: + from statsmodels.base.distributed_estimation import DistributedModel +except ImportError: + DistributedModel = None + + +class MarsDistributedModel: + def __init__( + self, + factor=None, + num_partitions=None, + model_class=None, + init_kwds=None, + estimation_method=None, + estimation_kwds=None, + join_method=None, + join_kwds=None, + results_class=None, + results_kwds=None, + ): + self._factor = factor + self._sm_model = DistributedModel( + num_partitions or 10, + model_class=model_class, + init_kwds=init_kwds, + estimation_method=estimation_method, + estimation_kwds=estimation_kwds, + join_method=join_method, + join_kwds=join_kwds, + results_class=results_class, + results_kwds=results_kwds, + ) + + def fit(self, endog, exog, session=None, **kwargs): + num_partitions = None if self._factor is not None else self._sm_model.partitions + run_kwargs = kwargs.pop("run_kwargs", dict()) + op = StatsModelsTrain( + endog=endog, + exog=exog, + num_partitions=num_partitions, + factor=self._factor, + model_class=self._sm_model.model_class, + init_kwds=self._sm_model.init_kwds, + fit_kwds=kwargs, + estimation_method=self._sm_model.estimation_method, + estimation_kwds=self._sm_model.estimation_kwds, + join_method=self._sm_model.join_method, + join_kwds=self._sm_model.join_kwds, + results_class=self._sm_model.results_class, + results_kwds=self._sm_model.results_kwds, + ) + result = ( + op(exog, endog) + .execute(session=session, **run_kwargs) + .fetch(session=session) + ) + return MarsResults(pickle.loads(result)) # nosec + + +class MarsResults: + def __init__(self, model): + self._model = model + + @property + def model(self): + return self._model + + def __getattr__(self, item): + if item == "_model": + raise AttributeError + return getattr(self._model, item) + + def __mars_tokenize__(self): + return pickle.dumps(self.model) + + def predict(self, exog, *args, **kwargs): + session = kwargs.pop("session", None) + run_kwargs = kwargs.pop("run_kwargs", dict()) + op = StatsModelsPredict( + model_results=self, predict_args=args, predict_kwargs=kwargs + ) + return op(exog).execute(session=session, **run_kwargs) diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/predict.py b/python/xorbits/_mars/learn/contrib/statsmodels/predict.py new file mode 100644 index 000000000..1dbd84a97 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/predict.py @@ -0,0 +1,110 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle # nosec # pylint: disable=import_pickle + +import numpy as np + +from .... 
import opcodes +from ....core import OutputType, recursive_tile +from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE +from ....serialization.serializables import BytesField, DictField, TupleField +from ...operands import LearnOperand, LearnOperandMixin + + +class StatsModelsPredict(LearnOperand, LearnOperandMixin): + _op_code_ = opcodes.STATSMODELS_PREDICT + + _model_results = BytesField( + "model_results", on_serialize=pickle.dumps, on_deserialize=pickle.loads + ) + _predict_args = TupleField("predict_args") + _predict_kwargs = DictField("predict_kwargs") + + def __init__( + self, model_results=None, predict_args=None, predict_kwargs=None, **kw + ): + super().__init__( + _model_results=model_results, + _predict_args=predict_args, + _predict_kwargs=predict_kwargs, + **kw + ) + + @property + def model_results(self): + return self._model_results + + @property + def predict_args(self) -> tuple: + return self._predict_args + + @property + def predict_kwargs(self) -> dict: + return self._predict_kwargs + + def __call__(self, exog): + if isinstance(exog, (DATAFRAME_TYPE, SERIES_TYPE)): + self._output_types = [OutputType.series] + kwargs = dict( + shape=exog.shape[:1], + index_value=exog.index_value, + dtype=np.dtype("float"), + ) + else: + self._output_types = [OutputType.tensor] + kwargs = dict( + shape=exog.shape[:1], + dtype=np.dtype("float"), + ) + return self.new_tileable([exog], **kwargs) + + @classmethod + def tile(cls, op: "StatsModelsPredict"): + exog = op.inputs[0] + out = op.outputs[0] + + if exog.ndim > 1 and exog.chunk_shape[1] > 1: + exog = yield from recursive_tile(exog.rechunk({1: exog.shape[1]})) + + chunks = [] + for in_chunk in exog.chunks: + if isinstance(exog, (DATAFRAME_TYPE, SERIES_TYPE)): + kwargs = dict( + index=in_chunk.index[:1], + shape=in_chunk.shape[:1], + index_value=in_chunk.index_value, + dtype=out.dtype, + ) + else: + kwargs = dict( + index=in_chunk.index[:1], + shape=in_chunk.shape[:1], + dtype=out.dtype, + ) + + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([in_chunk], **kwargs)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + [exog], chunks=chunks, nsplits=(exog.nsplits[0],), **out.params + ) + + @classmethod + def execute(cls, ctx, op: "StatsModelsPredict"): + in_data = ctx[op.inputs[0].key] + ctx[op.outputs[0].key] = op.model_results.model.predict( + in_data, *op.predict_args, **op.predict_kwargs + ) diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/tests/__init__.py b/python/xorbits/_mars/learn/contrib/statsmodels/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
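For reference, a minimal usage sketch of the distributed statsmodels API added above, mirroring the test that follows. It assumes a running Mars session with statsmodels installed and uses the standalone `mars.learn.contrib.statsmodels` import path (inside this repository the same package lives under `xorbits._mars.learn.contrib.statsmodels`); the data shapes and the `alpha` value are illustrative only.

import mars.tensor as mt
from mars.learn.contrib.statsmodels import MarsDistributedModel

rs = mt.random.RandomState(0)
X = rs.rand(1000, 10, chunk_size=200)                       # exogenous design matrix
y = (rs.rand(1000, chunk_size=200) * 10).astype(mt.int32)   # endogenous response

# `factor` scales the number of partitions with the cluster's CPU count;
# pass `num_partitions` instead to fix the partition count explicitly.
model = MarsDistributedModel(factor=1.2)
results = model.fit(y, X, alpha=0.2)   # endog first, then exog; extra kwargs become fit kwds
prediction = results.predict(X)        # predict() executes eagerly and returns a Mars object
print(prediction.fetch()[:5])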
diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/tests/test_statsmodels.py b/python/xorbits/_mars/learn/contrib/statsmodels/tests/test_statsmodels.py new file mode 100644 index 000000000..acbf12432 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/tests/test_statsmodels.py @@ -0,0 +1,48 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ..... import tensor as mt + +try: + import statsmodels + + from .. import MarsDistributedModel, MarsResults +except ImportError: # pragma: no cover + statsmodels = MarsDistributedModel = MarsResults = None + + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y = rs.rand(n_rows, chunk_size=chunk_size) +filter = rs.rand(n_rows, chunk_size=chunk_size) < 0.8 +X = X[filter] +y = y[filter] + + +@pytest.mark.skipif(statsmodels is None, reason="statsmodels not installed") +def test_distributed_stats_models(setup): + y_data = (y * 10).astype(mt.int32) + model = MarsDistributedModel(factor=1.2) + result = model.fit(y_data, X, alpha=0.2) + prediction = result.predict(X) + + X.execute() + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/train.py b/python/xorbits/_mars/learn/contrib/statsmodels/train.py new file mode 100644 index 000000000..9aa8fd53b --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/train.py @@ -0,0 +1,244 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle # nosec # pylint: disable=import_pickle + +import cloudpickle + +from .... 
import opcodes +from ....core import OutputType, recursive_tile +from ....core.context import get_context +from ....core.operand import MergeDictOperand, OperandStage +from ....serialization.serializables import ( + BytesField, + DictField, + Float32Field, + FunctionField, + Int32Field, + KeyField, +) +from ....utils import has_unknown_shape + + +class StatsModelsTrain(MergeDictOperand): + _op_type_ = opcodes.STATSMODELS_TRAIN + + _exog = KeyField("exog") # exogenous + _endog = KeyField("endog") # endogenous + + _num_partitions = Int32Field("num_partitions") + _partition_id = Int32Field("partition_id") + _factor = Float32Field("factor") + _model_class = BytesField( + "model_class", on_serialize=cloudpickle.dumps, on_deserialize=cloudpickle.loads + ) + _init_kwds = DictField("init_kwds") + _fit_kwds = DictField("fit_kwds") + _estimation_method = FunctionField("estimation_method") + _estimation_kwds = DictField("estimation_kwds") + _join_method = FunctionField("join_method") + _join_kwds = DictField("join_kwds") + _results_class = BytesField( + "results_class", + on_serialize=cloudpickle.dumps, + on_deserialize=cloudpickle.loads, + ) + _results_kwds = DictField("results_kwds") + + def __init__( + self, + exog=None, + endog=None, + num_partitions=None, + partition_id=None, + factor=None, + model_class=None, + init_kwds=None, + fit_kwds=None, + estimation_method=None, + estimation_kwds=None, + join_method=None, + join_kwds=None, + results_class=None, + results_kwds=None, + **kw + ): + super().__init__( + _exog=exog, + _endog=endog, + _num_partitions=num_partitions, + _partition_id=partition_id, + _factor=factor, + _model_class=model_class, + _init_kwds=init_kwds, + _fit_kwds=fit_kwds, + _estimation_method=estimation_method, + _estimation_kwds=estimation_kwds, + _join_method=join_method, + _join_kwds=join_kwds, + _results_class=results_class, + _results_kwds=results_kwds, + **kw + ) + + @property + def exog(self): + return self._exog + + @property + def endog(self): + return self._endog + + @property + def num_partitions(self): + return self._num_partitions + + @property + def partition_id(self): + return self._partition_id + + @property + def factor(self): + return self._factor + + @property + def model_class(self): + return self._model_class + + @property + def init_kwds(self) -> dict: + return self._init_kwds + + @property + def fit_kwds(self) -> dict: + return self._fit_kwds + + @property + def estimation_method(self): + return self._estimation_method + + @property + def estimation_kwds(self) -> dict: + return self._estimation_kwds + + @property + def join_method(self): + return self._join_method + + @property + def join_kwds(self) -> dict: + return self._join_kwds + + @property + def results_class(self): + return self._results_class + + @property + def results_kwds(self) -> dict: + return self._results_kwds + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(inputs) + self._exog = next(inputs_iter) + self._endog = next(inputs_iter) + + def __call__(self, exog, endog): + self._output_types = [OutputType.object] + return self.new_tileable([exog, endog]) + + @classmethod + def tile(cls, op: "StatsModelsTrain"): + if op.factor is not None: + ctx = get_context() + cluster_cpu_count = ctx.get_total_n_cpu() + assert cluster_cpu_count > 0 + num_partitions = int(cluster_cpu_count * op.factor) + else: + num_partitions = op.num_partitions + + if has_unknown_shape(op.exog, op.endog): + yield + + exog = op.exog + if exog.ndim > 1 and exog.chunk_shape[1] > 1: + exog = 
exog.rechunk({1: exog.shape[1]}) + exog = yield from recursive_tile(exog.rebalance(num_partitions=num_partitions)) + endog = yield from recursive_tile( + op.endog.rebalance(num_partitions=num_partitions) + ) + + assert len(exog.chunks) == len(endog.chunks) + + # generate map stage + map_chunks = [] + for part_id, (exog_chunk, endog_chunk) in enumerate( + zip(exog.chunks, endog.chunks) + ): + new_op = op.copy().reset_key() + new_op._factor = None + new_op._partition_id = part_id + new_op._num_partitions = num_partitions + new_op.stage = OperandStage.map + + map_chunks.append( + new_op.new_chunk([exog_chunk, endog_chunk], index=exog_chunk.index) + ) + + # generate combine (join) stage + new_op = op.copy().reset_key() + new_op._factor = None + new_op._num_partitions = num_partitions + new_op.stage = OperandStage.combine + + combine_chunk = new_op.new_chunk(map_chunks, index=(0,)) + + # generate tileable + new_op = op.copy().reset_key() + return new_op.new_tileables(op.inputs, chunks=[combine_chunk]) + + @classmethod + def _execute_map(cls, ctx, op: "StatsModelsTrain"): + endog = ctx[op.endog.key] + exog = ctx[op.exog.key] + + # code from statsmodels.base.distributed_estimation::_helper_fit_partition + model = op.model_class(endog, exog, **op.init_kwds) + results = op.estimation_method( + model, + op.partition_id, + op.num_partitions, + fit_kwds=op.fit_kwds, + **op.estimation_kwds + ) + ctx[op.outputs[0].key] = pickle.dumps(results) + + @classmethod + def _execute_combine(cls, ctx, op: "StatsModelsTrain"): + # code from statsmodels.base.distributed_estimation::DistributedModel.fit + results_list = [pickle.loads(ctx[inp.key]) for inp in op.inputs] # nosec + params = op.join_method(results_list, **op.join_kwds) + res_mod = op.model_class([0], [0], **op.init_kwds) + result = op.results_class(res_mod, params, **op.results_kwds) + + ctx[op.outputs[0].key] = pickle.dumps(result) + + @classmethod + def execute(cls, ctx, op: "StatsModelsTrain"): + if op.merge: # pragma: no cover + super().execute(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + else: + cls._execute_map(ctx, op) diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/__init__.py b/python/xorbits/_mars/learn/contrib/tensorflow/__init__.py new file mode 100644 index 000000000..2c1651bc7 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .dataset import ( # noqa: F401 # pylint: disable=unused-import + gen_tensorflow_dataset, +) +from .run_script import run_tensorflow_script + + +def register_op(): + from .run_script import RunTensorFlow + + del RunTensorFlow diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/dataset.py b/python/xorbits/_mars/learn/contrib/tensorflow/dataset.py new file mode 100644 index 000000000..e6ef3a92b --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/dataset.py @@ -0,0 +1,195 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import copy +from typing import List, Tuple + +import numpy as np +import pandas as pd + +from .... import execute +from ....core.context import get_context +from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE +from ....tensor.core import TENSOR_TYPE +from ....utils import lazy_import, require_not_none + +tf = lazy_import("tensorflow") + + +ACCEPT_TYPE = ( + TENSOR_TYPE, + DATAFRAME_TYPE, + SERIES_TYPE, + np.ndarray, + pd.DataFrame, + pd.Series, + List, +) + + +@require_not_none(tf) +class MarsDataset: + def __init__( + self, + tensors, + output_shapes: Tuple[int, ...] = None, + output_types: Tuple[np.dtype, ...] = None, + fetch_kwargs=None, + ): + self._context = get_context() + self._tensors = tensors + self._output_shapes = output_shapes + self._output_types = output_types + self._fetch_kwargs = fetch_kwargs or dict() + self._executed = False + self._check_and_convert() + + def _check_and_convert(self): + if not isinstance(self._tensors, Tuple): + self._tensors = (self._tensors,) + for t in self._tensors: + if not isinstance(t, ACCEPT_TYPE): + raise TypeError(f"Unexpected dataset type: {type(t)}") + + if not self._executed: + self._execute() + self._executed = True + + if not self._output_shapes: + get_shape = ( + lambda t: tuple(()) + if isinstance(t, (List, SERIES_TYPE, pd.Series)) + else t.shape[1:] + ) + self._output_shapes = ( + get_shape(self._tensors[0]) + if len(self._tensors) == 1 + else tuple(get_shape(t) for t in self._tensors) + ) + + if not self._output_types: + get_type = ( + lambda t: type(t[0]) + if isinstance(t, List) + else t[0].dtype + if isinstance(t, (DATAFRAME_TYPE, pd.DataFrame)) + else t.dtype + ) + self._output_types = ( + get_type(self._tensors[0]) + if len(self._tensors) == 1 + else tuple(tf.as_dtype(get_type(t)) for t in self._tensors) + ) + + def _execute(self): # pragma: no cover + execute_data = [t for t in self._tensors if isinstance(t, ACCEPT_TYPE[:3])] + + if len(execute_data) > 0: + execute(execute_data) + + def get_data(self, t, index): # pragma: no cover + # coverage not included as now there is no solution to cover tensorflow methods + # see https://github.com/tensorflow/tensorflow/issues/33759 for more details. + fetch_kwargs = dict() + if self._fetch_kwargs: + fetch_kwargs = copy.deepcopy(self._fetch_kwargs) + + if isinstance(t, TENSOR_TYPE): + return t[index].fetch(**fetch_kwargs) + elif isinstance(t, np.ndarray): + return t[index] + elif isinstance(t, DATAFRAME_TYPE): + return t.iloc[index].fetch(**fetch_kwargs).values + elif isinstance(t, SERIES_TYPE): + return t.iloc[index].fetch(**fetch_kwargs) + elif isinstance(t, pd.DataFrame): + return t.iloc[index].values + elif isinstance(t, pd.Series): + return t.iloc[index] + else: + return t[index] + + def to_tf(self) -> "tf.data.Dataset": + """Get TF Dataset. 
+ + Convert the Mars objects into a tensorflow.data.Dataset. + """ + + def make_generator(): # pragma: no cover + if not self._executed: + self._execute() + self._executed = True + + for i in range(len(self._tensors[0])): + if len(self._tensors) == 1: + yield self.get_data(self._tensors[0], i) + else: + yield tuple(self.get_data(t, i) for t in self._tensors) + + return tf.data.Dataset.from_generator( + make_generator, + output_types=self._output_types, + output_shapes=self._output_shapes, + ) + + +def gen_tensorflow_dataset( + tensors, + output_shapes: Tuple[int, ...] = None, + output_types: Tuple[np.dtype, ...] = None, + fetch_kwargs=None, +): + """ + Convert Mars data types to a tf.data.Dataset. Note this is based on TensorFlow 2.0. + + Examples + -------- + >>> # convert a tensor to tf.data.Dataset. + >>> data = mt.tensor([[1, 2], [3, 4]]) + >>> dataset = gen_tensorflow_dataset(data) + >>> list(dataset.as_numpy_iterator()) + [array([1, 2]), array([3, 4])] + >>> dataset.element_spec + TensorSpec(shape=(2,), dtype=tf.int64, name=None) + + >>> # convert a tuple of tensors to tf.data.Dataset. + >>> data1 = mt.tensor([1, 2]); data2 = mt.tensor([3, 4]); data3 = mt.tensor([5, 6]) + >>> dataset = gen_tensorflow_dataset((data1, data2, data3)) + >>> list(dataset.as_numpy_iterator()) + [(1, 3, 5), (2, 4, 6)] + + Parameters + ---------- + tensors: Mars data type or a tuple of Mars data types + the data to convert into a tf.data.Dataset + output_shapes: + A (nested) structure of `tf.TensorShape` objects corresponding to + each component of an element yielded from the Mars objects. + output_types: + A (nested) structure of `tf.DType` objects corresponding to each + component of an element yielded from the Mars objects. + fetch_kwargs: + extra parameters passed to the fetch() operation of the Mars objects. + + Returns + ------- + tf.data.Dataset + """ + mars_dataset = MarsDataset( + tensors, + output_shapes=output_shapes, + output_types=output_types, + fetch_kwargs=fetch_kwargs, + ) + + return mars_dataset.to_tf() diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/run_script.py b/python/xorbits/_mars/learn/contrib/tensorflow/run_script.py new file mode 100644 index 000000000..221ade0f5 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/run_script.py @@ -0,0 +1,225 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Union + +import numpy as np + +from ....
import opcodes as OperandDef +from ....core import recursive_tile +from ....core.context import get_context +from ....remote.run_script import RunScript, _extract_inputs +from ....serialization.serializables import ( + BytesField, + DictField, + Int32Field, + StringField, +) +from ....typing import SessionType, TileableType +from ....utils import to_binary +from ...utils import collect_ports +from ..utils import pick_workers + + +class RunTensorFlow(RunScript): + _op_type_ = OperandDef.RUN_TENSORFLOW + + _code = BytesField("code") + _n_workers = Int32Field("n_workers") + _n_ps = Int32Field("n_ps") + _tf_config = DictField("tf_config") + _port = Int32Field("port") + # used for chunk op + _tf_task_type = StringField("tf_task_type") + _tf_task_index = Int32Field("tf_task_index") + + def __init__( + self, + n_workers=None, + n_ps=None, + tf_config=None, + port=None, + tf_task_type=None, + tf_task_index=None, + gpu=None, + **kw, + ): + super().__init__( + _n_workers=n_workers, + _n_ps=n_ps, + _tf_config=tf_config, + _port=port, + _tf_task_type=tf_task_type, + _tf_task_index=tf_task_index, + gpu=gpu, + **kw, + ) + + @property + def n_workers(self): + return self._n_workers + + @property + def n_ps(self): + return self._n_ps or 0 + + @property + def n_roles(self): + return self._n_workers + self._n_ps + + @property + def tf_config(self): + return self._tf_config + + @property + def port(self): + return self._port + + @property + def tf_task_type(self): + return self._tf_task_type + + @property + def tf_task_index(self): + return self._tf_task_index + + @classmethod + def tile(cls, op): + ctx = get_context() + + cluster_conf = {"worker": []} + if op.n_ps > 0: + cluster_conf["ps"] = [] + n_workers = op.n_workers + + out_chunks = [] + worker_addresses = ctx.get_worker_addresses() + picked_workers = pick_workers(worker_addresses, op.n_roles) + data, input_chunks = cls._get_chunk_data(op) + + ports = yield from recursive_tile(collect_ports(worker_addresses)) + yield ports.chunks + ports = ctx.get_chunks_result([ports.chunks[0].key])[0] + + i = 0 + for worker, port in zip(picked_workers, ports): + worker_addr = worker.rsplit(":", 1)[0] + chunk_op = op.copy().reset_key() + chunk_op._data = data + addr = f"{worker_addr}:{port}" + # tell graph actor that the chunk should be executed on the exact worker + chunk_op.expect_worker = worker + tp = "worker" if i < n_workers else "ps" + chunk_op._tf_task_type = tp + idx = i if i < n_workers else i - n_workers + chunk_op._tf_task_index = idx + cluster_conf[tp].append(addr) + chunk_op._tf_config = { + "cluster": cluster_conf, + "task": {"type": tp, "index": idx}, + } + out_chunks.append(chunk_op.new_chunk(input_chunks, index=(i,))) + i += 1 + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, + chunks=out_chunks, + nsplits=(tuple(np.nan for _ in range(len(out_chunks))),), + ) + + @classmethod + def _build_envs(cls, ctx, op): + envs = super()._build_envs(ctx, op) + envs.update({"TF_CONFIG": json.dumps(op.tf_config)}) + return envs + + @classmethod + def execute(cls, ctx, op): + if op.merge: + return super().execute(ctx, op) + + assert ctx.local_address.split(":")[0] == op.expect_worker.split(":")[0] + + super().execute(ctx, op) + + if op.tf_task_type == "worker" and op.tf_task_index == 0: + ctx[op.outputs[0].key] = {"status": "ok"} + else: + ctx[op.outputs[0].key] = {} + + +def run_tensorflow_script( + script: Union[bytes, str, BinaryIO, TextIO], + n_workers: int, + n_ps: int = 0, + data: Dict[str, TileableType] = None, + gpu: Optional[bool] = None, + 
command_argv: List[str] = None, + retry_when_fail: bool = False, + session: SessionType = None, + run_kwargs: Dict[str, Any] = None, +): + """ + Run a TensorFlow script in a Mars cluster. + + Parameters + ---------- + script: str or file-like object + Script to run. + n_workers : int + Number of TensorFlow workers. + n_ps : int + Number of TensorFlow PS workers. + data : dict + Variable name to data. + gpu : bool + Run TensorFlow script on GPU. + command_argv : list + Extra command args for the script. + retry_when_fail : bool + If True, retry when the function fails. + session + Mars session; if not provided, the default one will be used. + run_kwargs : dict + Extra kwargs for `session.run`. + + Returns + ------- + status + Returns {'status': 'ok'} if the run succeeded, otherwise the error is raised. + """ + if int(n_workers) <= 0: + raise ValueError("n_workers should be at least 1") + if int(n_ps) < 0: + raise ValueError("n_ps should be at least 0") + if hasattr(script, "read"): + code = script.read() + else: + with open(os.path.abspath(script), "rb") as f: + code = f.read() + + inputs = _extract_inputs(data) + op = RunTensorFlow( + data=data, + code=to_binary(code), + n_workers=int(n_workers), + n_ps=int(n_ps), + retry_when_fail=retry_when_fail, + gpu=gpu, + command_args=command_argv, + ) + return op(inputs).execute(session=session, **(run_kwargs or {})) diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/tests/__init__.py b/python/xorbits/_mars/learn/contrib/tensorflow/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_dataset.py b/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_dataset.py new file mode 100644 index 000000000..9ef4aa135 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_dataset.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +from ..... import dataframe as md +from ..... import tensor as mt +from .....utils import lazy_import +from ..
import gen_tensorflow_dataset, run_tensorflow_script + +tf_installed = lazy_import("tensorflow") is not None + + +@pytest.mark.skipif(not tf_installed, reason="tensorflow not installed") +def test_mars_dataset(setup_cluster): + import numpy as np + import pandas as pd + + tf_dataset_ops = lazy_import("tensorflow.python.data.ops.dataset_ops") + + # Mars tensor + data = mt.random.rand(1000, 32, dtype="f4") + data_verify = data[:10].execute().fetch() + + dataset = gen_tensorflow_dataset(data) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, data_1batch in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + + # Mars tensors + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + data_verify = data[:10].execute().fetch() + labels_verify = labels[:10].execute().fetch() + + dataset = gen_tensorflow_dataset((data, labels)) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, labels_verify) + + # np ndarray + data = np.random.rand(1000, 32) + labels = np.random.randint(0, 2, (1000, 10)) + + data_verify = data[:10] + labels_verify = labels[:10] + + dataset = gen_tensorflow_dataset((data, labels)) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, labels_verify) + + # Mars dataframe + data = md.DataFrame(data) + labels = md.DataFrame(labels) + + data_verify = data.iloc[:10].execute().fetch().values + labels_verify = labels.iloc[:10].execute().fetch().values + + dataset = gen_tensorflow_dataset( + (data, labels), fetch_kwargs={"extra_config": {"check_series_name": False}} + ) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, labels_verify) + + # Mars series + label = labels[1] + + label_verify = label[:10].execute().fetch() + + dataset = gen_tensorflow_dataset( + (data, label), fetch_kwargs={"extra_config": {"check_series_name": False}} + ) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, label_verify) + + # pandas dataframe + data = pd.DataFrame(np.random.rand(1000, 32)) + labels = pd.DataFrame(np.random.randint(0, 2, (1000, 10)), dtype="float32") + + data_verify = data.iloc[:10].values + labels_verify = labels.iloc[:10].values + dataset = gen_tensorflow_dataset((data, labels)) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, labels_verify) + + # pandas series + label = labels[1] + + label_verify = label[:10] + + dataset = gen_tensorflow_dataset((data, label)) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + 
np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, label_verify) + + # list + label = label.tolist() + + label_verify = label[:10] + + dataset = gen_tensorflow_dataset((data, label)) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, label_verify) + + # test TypeError + label = tuple(range(1000)) + + with pytest.raises(TypeError) as e: + dataset = gen_tensorflow_dataset((data, label)) + exec_msg = e.value.args[0] + assert exec_msg == "Unexpected dataset type: " + + +@pytest.mark.skipif(not tf_installed, reason="tensorflow not installed") +def test_mars_dataset_script(setup_cluster): + sess = setup_cluster + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tf_dataset.py") + + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + assert ( + run_tensorflow_script( + path, + n_workers=2, + data={"feature_data": data, "labels": labels}, + command_argv=["multiple"], + session=sess, + ).fetch()["status"] + == "ok" + ) diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_run_script.py b/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_run_script.py new file mode 100644 index 000000000..e3d86aa1c --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_run_script.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +try: + import tensorflow +except ImportError: + tensorflow = None + +from .. import run_tensorflow_script + + +@pytest.mark.skipif(tensorflow is None, reason="tensorflow not installed") +def test_local_run_tensor_flow_script(setup_cluster): + path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "tf_distributed_sample.py" + ) + assert ( + run_tensorflow_script(path, n_workers=2, command_argv=["multiple"]).fetch()[ + "status" + ] + == "ok" + ) + + with pytest.raises(ValueError): + run_tensorflow_script(path, n_workers=0) + + with pytest.raises(ValueError): + run_tensorflow_script(path, 2, n_ps=-1) diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_dataset.py b/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_dataset.py new file mode 100644 index 000000000..67afc4d64 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_dataset.py @@ -0,0 +1,60 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import sys + +import tensorflow as tf +from mars.learn.contrib.tensorflow import gen_tensorflow_dataset +from tensorflow.keras import layers +from tensorflow.python.data.ops.dataset_ops import DatasetV2 + + +def get_model(): + model = tf.keras.Sequential() + model.add(layers.Dense(64, activation="relu")) + model.add(layers.Dense(64, activation="relu")) + model.add(layers.Dense(10, activation="softmax")) + model.compile( + optimizer=tf.keras.optimizers.Adam(0.01), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + return model + + +def train(feature_data, labels): + data = feature_data + labels = labels + + db_train = gen_tensorflow_dataset((data, labels)) + assert isinstance(db_train, DatasetV2) + db_train = db_train.batch(32) + + model = get_model() + model.fit(db_train, epochs=2) + + +if __name__ == "__main__": + assert json.loads(os.environ["TF_CONFIG"])["task"]["index"] in {0, 1} + assert len(sys.argv) == 2 + assert sys.argv[1] == "multiple" + + feature_data = globals()["feature_data"] + labels = globals()["labels"] + multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + + with multiworker_strategy.scope(): + train(feature_data, labels) diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_distributed_sample.py b/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_distributed_sample.py new file mode 100644 index 000000000..4041a5e7e --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_distributed_sample.py @@ -0,0 +1,48 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
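A hedged driver-side sketch of how the pieces above fit together, mirroring test_mars_dataset_script: the public import path mars.learn.contrib.tensorflow (this repository vendors it as xorbits._mars.learn.contrib.tensorflow), the script name my_tf_script.py, and an already-initialized cluster session are assumptions, not part of this patch.

import mars.tensor as mt
from mars.learn.contrib.tensorflow import run_tensorflow_script

# Mars objects passed via `data` are injected as module-level globals of the
# script on each TensorFlow worker, alongside a TF_CONFIG environment variable.
feature_data = mt.random.rand(1000, 32, dtype="f4")
labels = mt.random.randint(0, 2, (1000, 10), dtype="f4")

result = run_tensorflow_script(
    "my_tf_script.py",  # hypothetical script, e.g. a copy of tf_dataset.py above
    n_workers=2,
    data={"feature_data": feature_data, "labels": labels},
    command_argv=["multiple"],
)
print(result.fetch()["status"])  # "ok" once worker 0 finishes training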
+ +import json +import os +import sys + +import numpy as np +import tensorflow as tf +from tensorflow.keras import layers + + +def get_model(): + model = tf.keras.Sequential() + model.add(layers.Dense(64, activation="relu")) + model.add(layers.Dense(64, activation="relu")) + model.add(layers.Dense(10, activation="softmax")) + model.compile( + optimizer=tf.keras.optimizers.Adam(0.01), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + return model + + +assert json.loads(os.environ["TF_CONFIG"])["task"]["index"] in {0, 1} +assert len(sys.argv) == 2 +assert sys.argv[1] == "multiple" + +multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + +with multiworker_strategy.scope(): + data = np.random.random((1000, 32)) + labels = np.random.random((1000, 10)) + + model = get_model() + model.fit(data, labels, epochs=2, batch_size=32) diff --git a/python/xorbits/_mars/learn/contrib/tsfresh/__init__.py b/python/xorbits/_mars/learn/contrib/tsfresh/__init__.py new file mode 100644 index 000000000..31f64ca7e --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tsfresh/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import MarsDistributor diff --git a/python/xorbits/_mars/learn/contrib/tsfresh/core.py b/python/xorbits/_mars/learn/contrib/tsfresh/core.py new file mode 100644 index 000000000..917160b4a --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tsfresh/core.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from .... 
import remote as mr +from ....deploy.oscar.session import get_default_session +from ....utils import ceildiv + +try: + try: + # fix for tsfresh 0.17.0, from this version on, + # we need to inherit from IterableDistributorBaseClass + from tsfresh.utilities.distribution import ( + IterableDistributorBaseClass as DistributorBaseClass, + ) + except ImportError: # pragma: no cover + from tsfresh.utilities.distribution import DistributorBaseClass +except ImportError: # pragma: no cover + DistributorBaseClass = object + + +class MarsDistributor(DistributorBaseClass): + def __init__(self, session=None): + self._session = session or get_default_session() + + def calculate_best_chunk_size(self, data_length): + n_cores = self._session.get_total_n_cpu() + return ceildiv(data_length, n_cores) + + def distribute(self, func, partitioned_chunks, kwargs): + def _wrapped_func(*args, **kw): + # Series.value_counts() may not be able to handle + if not getattr(pd.Series.value_counts, "_wrapped", False): + old_value_counts = pd.Series.value_counts + + def _wrapped_value_counts(obj, *args, **kw): + try: + return old_value_counts(obj, *args, **kw) + except ValueError: # pragma: no cover + return old_value_counts(obj.copy(), *args, **kw) + + pd.Series.value_counts = _wrapped_value_counts + pd.Series.value_counts._wrapped = True + + return func(*args, **kw) + + tasks = [] + for partitioned_chunk in partitioned_chunks: + tasks.append( + mr.spawn(_wrapped_func, args=(partitioned_chunk,), kwargs=kwargs) + ) + executed = mr.ExecutableTuple(tasks).execute(session=self._session) + fetched = executed.fetch(session=self._session) + return [item for results in fetched for item in results] diff --git a/python/xorbits/_mars/learn/contrib/tsfresh/tests/__init__.py b/python/xorbits/_mars/learn/contrib/tsfresh/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tsfresh/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/tsfresh/tests/test_tsfresh.py b/python/xorbits/_mars/learn/contrib/tsfresh/tests/test_tsfresh.py new file mode 100644 index 000000000..8643dc0e7 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tsfresh/tests/test_tsfresh.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
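MarsDistributor.distribute above fans each partitioned chunk out with mr.spawn and gathers the per-task lists through ExecutableTuple before flattening them. A minimal sketch of that pattern follows; the public import path mars.remote (vendored here as xorbits._mars.remote), the availability of ExecutableTuple on that module, and an existing default session are assumptions.

import mars.remote as mr

def extract(chunk):
    # Stand-in for the per-chunk function tsfresh hands to the distributor.
    return [x * x for x in chunk]

partitioned_chunks = ([1, 2], [3, 4], [5, 6])
tasks = [mr.spawn(extract, args=(chunk,)) for chunk in partitioned_chunks]

executed = mr.ExecutableTuple(tasks).execute()
fetched = executed.fetch()

# Flatten the per-task result lists, exactly as distribute() does.
print([item for results in fetched for item in results])  # [1, 4, 9, 16, 25, 36]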
+ +import pytest + +try: + import tsfresh + from tsfresh import extract_features + from tsfresh.examples import robot_execution_failures + from tsfresh.feature_extraction import ComprehensiveFCParameters + from tsfresh.utilities.dataframe_functions import impute +except ImportError: + tsfresh = None + +from .....deploy.oscar.session import get_default_session, new_session +from .. import MarsDistributor + + +@pytest.mark.skipif(tsfresh is None, reason="tsfresh not installed") +def test_distributed_ts_fresh(setup): + robot_execution_failures.download_robot_execution_failures() + df, y = robot_execution_failures.load_robot_execution_failures() + default_session = get_default_session() + sync_session = new_session(default_session.address, default=False) + dist = MarsDistributor(session=sync_session) + + df = df.iloc[:200].copy() + + extraction_settings = ComprehensiveFCParameters() + extract_features( + df, + column_id="id", + column_sort="time", + default_fc_parameters=extraction_settings, + # we impute = remove all NaN features automatically + impute_function=impute, + distributor=dist, + ) diff --git a/python/xorbits/_mars/learn/contrib/utils.py b/python/xorbits/_mars/learn/contrib/utils.py new file mode 100644 index 000000000..9e408b150 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/utils.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import numpy as np + + +def make_import_error_func(package_name): + def _func(*_, **__): # pragma: no cover + raise ImportError( + f"Cannot import {package_name}, please reinstall that package." + ) + + return _func + + +def pick_workers(workers, size): + """ + Pick workers from a list. + + This method will try to pick workers as balanced as it can. + + 1. If size <= len(workers), randomly pick workers from the list. + 2. If size > len(workers), just select all workers in a random order, + then see the rest size, if it's still more than the workers size, + return all workers in a random order, if not, + randomly select workers from the list. 
+ + :param workers: workers list + :param size: number to pick from workers list + :return: ndarray of selected workers whose length is `size` + """ + result = np.empty(size, dtype=object) + rest = size + while rest > 0: + start = size - rest + to_pick_size = min(size - start, len(workers)) + result[start : start + to_pick_size] = np.random.permutation(workers)[ + :to_pick_size + ] + rest = rest - to_pick_size + return result + + +def config_mod_getattr(mod_dict, globals_): + def __getattr__(name): + import importlib + + if name in mod_dict: + mod_name, cls_name = mod_dict[name].rsplit(".", 1) + mod = importlib.import_module(mod_name, globals_["__name__"]) + cls = globals_[name] = getattr(mod, cls_name) + return cls + else: # pragma: no cover + raise AttributeError(name) + + if sys.version_info[:2] < (3, 7): + for _mod in mod_dict.keys(): + __getattr__(_mod) + + def __dir__(): + return sorted([n for n in globals_ if not n.startswith("_")] + list(mod_dict)) + + globals_.update( + { + "__getattr__": __getattr__, + "__dir__": __dir__, + "__all__": list(__dir__()), + "__warningregistry__": dict(), + } + ) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/__init__.py b/python/xorbits/_mars/learn/contrib/xgboost/__init__.py new file mode 100644 index 000000000..21678e4e8 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .dmatrix import MarsDMatrix +from .predict import predict +from .train import train + + +def register_op(): + from .start_tracker import StartTracker + + del StartTracker + + +from ..utils import config_mod_getattr as _config_mod_getattr + +_config_mod_getattr( + { + "XGBClassifier": ".classifier.XGBClassifier", + "XGBRegressor": ".regressor.XGBRegressor", + }, + globals(), +) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/classifier.py b/python/xorbits/_mars/learn/contrib/xgboost/classifier.py new file mode 100644 index 000000000..3fe7e9edc --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/classifier.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import make_import_error_func +from .core import XGBScikitLearnBase, xgboost + +XGBClassifier = make_import_error_func("xgboost") +if xgboost: + from xgboost.sklearn import XGBClassifierBase + + from .... 
import tensor as mt + from .core import wrap_evaluation_matrices + from .predict import predict + from .train import train + + class XGBClassifier(XGBScikitLearnBase, XGBClassifierBase): + """ + Implementation of the scikit-learn API for XGBoost classification. + """ + + def fit( + self, + X, + y, + sample_weight=None, + base_margin=None, + eval_set=None, + sample_weight_eval_set=None, + base_margin_eval_set=None, + **kw, + ): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", dict()) + if kw: + raise TypeError( + f"fit got an unexpected keyword argument '{next(iter(kw))}'" + ) + + dtrain, evals = wrap_evaluation_matrices( + None, + X, + y, + sample_weight, + base_margin, + eval_set, + sample_weight_eval_set, + base_margin_eval_set, + ) + params = self.get_xgb_params() + + self.classes_ = mt.unique(y, aggregate_size=1).to_numpy( + session=session, **run_kwargs + ) + self.n_classes_ = len(self.classes_) + + if self.n_classes_ > 2: + params["objective"] = "multi:softprob" + params["num_class"] = self.n_classes_ + else: + params["objective"] = "binary:logistic" + + self.evals_result_ = dict() + result = train( + params, + dtrain, + num_boost_round=self.get_num_boosting_rounds(), + evals=evals, + evals_result=self.evals_result_, + session=session, + run_kwargs=run_kwargs, + ) + self._Booster = result + return self + + def predict(self, data, **kw): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", dict()) + run = kw.pop("run", True) + prob = predict(self.get_booster(), data, run=False, **kw) + if prob.ndim > 1: + prediction = mt.argmax(prob, axis=1) + else: + prediction = (prob > 0.5).astype(mt.int64) + if run: + prediction.execute(session=session, **run_kwargs) + return prediction + + def predict_proba(self, data, ntree_limit=None, **kw): + if ntree_limit is not None: + raise NotImplementedError("ntree_limit is not currently supported") + return predict(self.get_booster(), data, **kw) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/core.py b/python/xorbits/_mars/learn/contrib/xgboost/core.py new file mode 100644 index 000000000..d1349a382 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/core.py @@ -0,0 +1,152 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Callable, List, Optional, Tuple + +try: + import xgboost +except ImportError: + xgboost = None + +from .dmatrix import MarsDMatrix + +XGBScikitLearnBase = None +if xgboost: + + class XGBScikitLearnBase(xgboost.XGBModel): + """ + Base class for implementing scikit-learn interface + """ + + def fit( + self, + X, + y, + sample_weights=None, + eval_set=None, + sample_weight_eval_set=None, + **kw, + ): + """ + Fit the regressor. + Parameters + ---------- + X : array_like + Feature matrix + y : array_like + Labels + sample_weight : array_like + instance weights + eval_set : list, optional + A list of (X, y) tuple pairs to use as validation sets, for which + metrics will be computed. 
+ Validation metrics will help us track the performance of the model. + sample_weight_eval_set : list, optional + A list of the form [L_1, L_2, ..., L_n], where each L_i is a list + of group weights on the i-th validation set. + """ + raise NotImplementedError + + def predict(self, data, **kw): + """ + Predict with `data`. + + Parameters + ---------- + data: data that can be used to perform prediction + Returns + ------- + prediction : mars.tensor.Tensor + """ + raise NotImplementedError + + def wrap_evaluation_matrices( + missing: float, + X: Any, + y: Any, + sample_weight: Optional[Any], + base_margin: Optional[Any], + eval_set: Optional[List[Tuple[Any, Any]]], + sample_weight_eval_set: Optional[List[Any]], + base_margin_eval_set: Optional[List[Any]], + label_transform: Callable = lambda x: x, + ) -> Tuple[Any, Optional[List[Tuple[Any, str]]]]: + """Convert array_like evaluation matrices into DMatrix. Perform validation on the way.""" + train_dmatrix = MarsDMatrix( + data=X, + label=label_transform(y), + weight=sample_weight, + base_margin=base_margin, + missing=missing, + ) + + n_validation = 0 if eval_set is None else len(eval_set) + + def validate_or_none(meta: Optional[List], name: str) -> List: + if meta is None: + return [None] * n_validation + if len(meta) != n_validation: + raise ValueError( + f"{name}'s length does not equal `eval_set`'s length, " + + f"expecting {n_validation}, got {len(meta)}" + ) + return meta + + if eval_set is not None: + sample_weight_eval_set = validate_or_none( + sample_weight_eval_set, "sample_weight_eval_set" + ) + base_margin_eval_set = validate_or_none( + base_margin_eval_set, "base_margin_eval_set" + ) + + evals = [] + for i, (valid_X, valid_y) in enumerate(eval_set): + # Skip the duplicated entry. + if all( + ( + valid_X is X, + valid_y is y, + sample_weight_eval_set[i] is sample_weight, + base_margin_eval_set[i] is base_margin, + ) + ): + evals.append(train_dmatrix) + else: + m = MarsDMatrix( + data=valid_X, + label=label_transform(valid_y), + weight=sample_weight_eval_set[i], + base_margin=base_margin_eval_set[i], + missing=missing, + ) + evals.append(m) + nevals = len(evals) + eval_names = [f"validation_{i}" for i in range(nevals)] + evals = list(zip(evals, eval_names)) + else: + if any( + meta is not None + for meta in [ + sample_weight_eval_set, + base_margin_eval_set, + ] + ): + raise ValueError( + "`eval_set` is not set but one of the other evaluation meta info is " + "not None." + ) + evals = [] + + return train_dmatrix, evals diff --git a/python/xorbits/_mars/learn/contrib/xgboost/dmatrix.py b/python/xorbits/_mars/learn/contrib/xgboost/dmatrix.py new file mode 100644 index 000000000..9809d2386 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/dmatrix.py @@ -0,0 +1,359 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from typing import List, Union + +import numpy as np + +from .... 
import opcodes as OperandDef +from ....core import get_output_types, recursive_tile +from ....core.context import Context, get_context +from ....dataframe.core import DATAFRAME_TYPE +from ....serialization.serializables import BoolField, Float64Field, KeyField, ListField +from ....tensor import tensor as astensor +from ....tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ....typing import ChunkType, TileableType +from ....utils import build_fetch, ensure_own_data, has_unknown_shape +from ...operands import LearnOperand, LearnOperandMixin +from ...utils import concat_chunks, convert_to_tensor_or_dataframe + + +class ToDMatrix(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.TO_DMATRIX + + data = KeyField("data") + label = KeyField("label") + missing = Float64Field("missing") + weight = KeyField("weight") + base_margin = KeyField("base_margin") + feature_names = ListField("feature_names") + feature_types = ListField("feature_types") + # if to collocate the data, label and weight + _collocate = BoolField("collocate", default=False) + + @property + def output_limit(self): + if self._collocate: + return ( + 1 + + (self.label is not None) + + (self.weight is not None) + + (self.base_margin is not None) + ) + return 1 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.data is not None: + self.data = self._inputs[0] + has_label = self.label is not None + if has_label: + self.label = self._inputs[1] + if self.weight is not None: + i = 1 if not has_label else 2 + self.weight = self._inputs[i] + if self.base_margin is not None: + self.base_margin = self._inputs[-1] + + @staticmethod + def _get_kw(obj): + if isinstance(obj, TENSOR_TYPE + TENSOR_CHUNK_TYPE): + return {"shape": obj.shape, "dtype": obj.dtype, "order": obj.order} + else: + return { + "shape": obj.shape, + "dtypes": obj.dtypes, + "index_value": obj.index_value, + "columns_value": obj.columns_value, + } + + def __call__(self): + inputs = [self.data] + kw = self._get_kw(self.data) + if self.label is not None: + inputs.append(self.label) + if self.weight is not None: + inputs.append(self.weight) + if self.base_margin is not None: + inputs.append(self.base_margin) + + return self.new_tileable(inputs, **kw) + + @classmethod + def _get_collocated( + cls, + op: "ToDMatrix", + data: TileableType, + label: TileableType, + weight: TileableType, + base_margin: TileableType, + ) -> List[TileableType]: + types = ["data", "label", "weight", "base_margin"] + nsplit = data.nsplits[0] + out_chunkss = [[] for _ in op.inputs] + for i in range(len(nsplit)): + data_chunk = data.cix[i, 0] + inps = [data_chunk] + kws = [] + chunk_op = op.copy().reset_key() + chunk_op._collocate = True + chunk_op.data = data_chunk + output_types = [get_output_types(data)[0]] + data_kw = cls._get_kw(data_chunk) + data_kw["index"] = data_chunk.index + kws.append(data_kw) + for type_name, inp in zip(types[1:], [label, weight, base_margin]): + if inp is None: + continue + inp_chunk = inp.cix[i,] + setattr(chunk_op, type_name, inp_chunk) + inps.append(inp_chunk) + kw = cls._get_kw(inp_chunk) + kw["index"] = inp_chunk.index + kw["type"] = type_name + kws.append(kw) + output_types.append(get_output_types(inp)[0]) + chunk_op.output_types = output_types + out_chunks = chunk_op.new_chunks(inps, kws=kws) + for i, out_chunk in enumerate(out_chunks): + out_chunkss[i].append(out_chunk) + + new_op = op.copy() + new_op._collocate = True + outs = [data, label, weight, base_margin] + params = [out.params.copy() for out in outs if out is not None] + 
output_types = [] + j = 0 + for i, out in enumerate(outs): + if out is None: + continue + params[j]["nsplits"] = out.nsplits + params[j]["chunks"] = out_chunkss[j] + params[j]["type"] = types[i] + output_types.append(get_output_types(out)[0]) + j += 1 + new_op.output_types = output_types + return new_op.new_tileables(op.inputs, kws=params) + + @staticmethod + def _order_chunk_index(chunks: List[ChunkType]): + ndim = chunks[0].ndim + for i, c in enumerate(chunks): + if ndim == 2: + c._index = (i, 0) + else: + c._index = (i,) + return chunks + + @classmethod + def tile(cls, op: "MarsDMatrix"): + data, label, weight, base_margin = op.data, op.label, op.weight, op.base_margin + + if has_unknown_shape(data): + yield + if data.chunk_shape[1] > 1: + # make sure data's second dimension has only 1 chunk + data = yield from recursive_tile(data.rechunk({1: data.shape[1]})) + nsplit = data.nsplits[0] + # rechunk label + if label is not None: + label = yield from recursive_tile(label.rechunk({0: nsplit})) + # rechunk weight + if weight is not None: + weight = yield from recursive_tile(weight.rechunk({0: nsplit})) + # rechunk base_margin + if base_margin is not None: + base_margin = yield from recursive_tile(base_margin.rechunk({0: nsplit})) + + collocated = cls._get_collocated(op, data, label, weight, base_margin) + collocated_chunks = list( + itertools.chain.from_iterable(c.chunks for c in collocated) + ) + yield collocated_chunks + collocated + + data = build_fetch(collocated[0]) + has_label = False + if label is not None: + has_label = True + label = build_fetch(collocated[1]) + i_weight = -1 + if weight is not None: + i_weight = 1 if not has_label else 2 + weight = build_fetch(collocated[i_weight]) + if base_margin is not None: + base_margin = build_fetch(collocated[-1]) + + ctx = get_context() + + # for distributed, we should concat the chunks + # which allocated on the same worker into one + data_chunk_metas = ctx.get_chunks_meta( + [c.key for c in data.chunks], fields=["bands"] + ) + data_chunk_workers = [m["bands"][0][0] for m in data_chunk_metas] + worker_to_chunks = dict() + for i, worker in enumerate(data_chunk_workers): + size = 1 + sum(it is not None for it in [label, weight, base_margin]) + if worker not in worker_to_chunks: + worker_to_chunks[worker] = [[] for _ in range(size)] + worker_to_chunks[worker][0].append(data.chunks[i]) + if label is not None: + worker_to_chunks[worker][1].append(label.chunks[i]) + if weight is not None: + worker_to_chunks[worker][i_weight].append(weight.chunks[i]) + if base_margin is not None: + worker_to_chunks[worker][-1].append(base_margin.chunks[i]) + ind = itertools.count(0) + out_chunks = [] + for worker, chunks in worker_to_chunks.items(): + data_chunk = concat_chunks(cls._order_chunk_index(chunks[0])) + inps = [data_chunk] + label_chunk = None + if label is not None: + label_chunk = concat_chunks(cls._order_chunk_index(chunks[1])) + inps.append(label_chunk) + weight_chunk = None + if weight is not None: + weight_chunk = concat_chunks(cls._order_chunk_index(chunks[i_weight])) + inps.append(weight_chunk) + base_margin_chunk = None + if base_margin is not None: + base_margin_chunk = concat_chunks(cls._order_chunk_index(chunks[-1])) + inps.append(base_margin_chunk) + chunk_op = ToDMatrix( + data=data_chunk, + label=label_chunk, + missing=op.missing, + weight=weight_chunk, + base_margin=base_margin_chunk, + feature_names=op.feature_names, + feature_types=op.feature_types, + _output_types=op.output_types, + ) + kws = data_chunk.params + kws["index"] = 
(next(ind), 0) + out_chunks.append(chunk_op.new_chunk(inps, **kws)) + nsplits = (tuple(c.shape[0] for c in out_chunks), (out_chunks[0].shape[1],)) + + new_op = op.copy() + kw = op.outputs[0].params + kw["chunks"] = out_chunks + kw["nsplits"] = nsplits + return new_op.new_tileables(op.inputs, kws=[kw]) + + @staticmethod + def get_xgb_dmatrix(tup, nthread: int = -1): + from xgboost import DMatrix + + data, label, weight, base_margin, missing, feature_names, feature_types = tup + data = data.spmatrix if hasattr(data, "spmatrix") else data + return DMatrix( + ensure_own_data(data), + label=ensure_own_data(label), + missing=missing, + weight=ensure_own_data(weight), + base_margin=base_margin, + feature_names=feature_names, + feature_types=feature_types, + nthread=nthread, + ) + + @staticmethod + def _from_ctx_if_not_none(ctx, chunk): + if chunk is None: + return chunk + return ctx[chunk.key] + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "ToDMatrix"): + if op._collocate: + outs = op.outputs + ctx[outs[0].key] = ctx[op.inputs[0].key] + has_label = False + if op.label is not None: + has_label = True + ctx[outs[1].key] = ctx[op.inputs[1].key] + if op.weight is not None: + i_weight = 1 if not has_label else 2 + ctx[outs[i_weight].key] = ctx[op.inputs[i_weight].key] + if op.base_margin is not None: + ctx[outs[-1].key] = ctx[op.inputs[-1].key] + else: + out = op.outputs[0] + data = cls._from_ctx_if_not_none(ctx, op.data) + if data is None: + data = np.empty((0, out.shape[1])) + ctx[out.key] = ( + data, + cls._from_ctx_if_not_none(ctx, op.label), + cls._from_ctx_if_not_none(ctx, op.weight), + cls._from_ctx_if_not_none(ctx, op.base_margin), + op.missing, + op.feature_names, + op.feature_types, + ) + + +def check_data(data): + data = convert_to_tensor_or_dataframe(data) + if data.ndim != 2: + raise ValueError(f"Expecting 2-d data, got: {data.ndim}-d") + + return data + + +def check_array_like(y: TileableType, name: str) -> TileableType: + if y is None: + return + y = convert_to_tensor_or_dataframe(y) + if isinstance(y, DATAFRAME_TYPE): + y = y.iloc[:, 0] + y = astensor(y) + if y.ndim != 1: + raise ValueError(f"Expecting 1-d {name}, got: {y.ndim}-d") + return y + + +def to_dmatrix( + data, + label=None, + missing=None, + weight=None, + base_margin=None, + feature_names=None, + feature_types=None, +): + data = check_data(data) + label = check_array_like(label, "label") + weight = check_array_like(weight, "weight") + base_margin = check_array_like(base_margin, "base_margin") + + # If not multiple outputs, try to collect the chunks on same worker into one + # to feed the data into XGBoost for training. + op = ToDMatrix( + data=data, + label=label, + missing=missing, + weight=weight, + base_margin=base_margin, + feature_names=feature_names, + feature_types=feature_types, + gpu=data.op.gpu, + _output_types=get_output_types(data), + ) + return op() + + +MarsDMatrix = to_dmatrix diff --git a/python/xorbits/_mars/learn/contrib/xgboost/predict.py b/python/xorbits/_mars/learn/contrib/xgboost/predict.py new file mode 100644 index 000000000..2952d627c --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/predict.py @@ -0,0 +1,204 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle + +import numpy as np +import pandas as pd + +from .... import opcodes as OperandDef +from ....core import recursive_tile +from ....dataframe.core import DATAFRAME_CHUNK_TYPE, SERIES_CHUNK_TYPE +from ....dataframe.utils import parse_index +from ....serialization.serializables import BytesField, DictField, FieldTypes, KeyField +from ....tensor.core import TENSOR_TYPE, TensorOrder +from ....utils import ensure_own_data, has_unknown_shape +from ...operands import LearnOperand, LearnOperandMixin, OutputType +from .dmatrix import ToDMatrix, check_data + + +class XGBPredict(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.XGBOOST_PREDICT + + data = KeyField("data", default=None) + model = BytesField( + "model", on_serialize=pickle.dumps, on_deserialize=pickle.loads, default=None + ) + kwargs = DictField("kwargs", key_type=FieldTypes.string, default_factory=dict) + + def __init__(self, output_types=None, gpu=None, **kw): + super().__init__(_output_types=output_types, gpu=gpu, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.data = self._inputs[0] + + def __call__(self): + num_class = self.model.attr("num_class") + if num_class is not None: + num_class = int(num_class) + if num_class is not None: + shape = (self.data.shape[0], num_class) + else: + shape = (self.data.shape[0],) + inputs = [self.data] + if self.output_types[0] == OutputType.tensor: + # tensor + return self.new_tileable( + inputs, + shape=shape, + dtype=np.dtype(np.float32), + order=TensorOrder.C_ORDER, + ) + elif self.output_types[0] == OutputType.dataframe: + # dataframe + dtypes = pd.DataFrame(np.random.rand(0, num_class), dtype=np.float32).dtypes + return self.new_tileable( + inputs, + shape=shape, + dtypes=dtypes, + columns_value=parse_index(dtypes.index), + index_value=self.data.index_value, + ) + else: + # series + return self.new_tileable( + inputs, + shape=shape, + index_value=self.data.index_value, + name="predictions", + dtype=np.dtype(np.float32), + ) + + @classmethod + def tile(cls, op: "XGBPredict"): + out = op.outputs[0] + out_chunks = [] + data = op.data + if data.chunk_shape[1] > 1: + if has_unknown_shape(op.data): + yield + data = yield from recursive_tile(data.rechunk({1: op.data.shape[1]})) + for in_chunk in data.chunks: + chunk_op = op.copy().reset_key() + chunk_index = (in_chunk.index[0],) + if op.model.attr("num_class"): + chunk_shape = (in_chunk.shape[0], int(op.model.attr("num_class"))) + chunk_index += (0,) + else: + chunk_shape = (in_chunk.shape[0],) + if op.output_types[0] == OutputType.tensor: + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtype=out.dtype, + order=out.order, + index=chunk_index, + ) + elif op.output_types[0] == OutputType.dataframe: + # dataframe chunk + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtypes=data.dtypes, + columns_value=data.columns_value, + index_value=in_chunk.index_value, + index=chunk_index, + ) + else: + # series chunk + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtype=out.dtype, + index_value=in_chunk.index_value, + name=out.name, + 
index=chunk_index, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + nsplits = (data.nsplits[0],) + if out.ndim > 1: + nsplits += ((out.shape[1],),) + params["nsplits"] = nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "XGBPredict"): + from xgboost import DMatrix + + raw_data = data = ctx[op.data.key] + if isinstance(data, tuple): + data = ToDMatrix.get_xgb_dmatrix(ensure_own_data(data)) + else: + data = data.spmatrix if hasattr(data, "spmatrix") else data + data = DMatrix(data) + + # do not pass arguments that are None + kwargs = dict((k, v) for k, v in op.kwargs.items() if v is not None) + result = op.model.predict(data, **kwargs) + + if isinstance(op.outputs[0], DATAFRAME_CHUNK_TYPE): + result = pd.DataFrame(result, index=raw_data.index) + elif isinstance(op.outputs[0], SERIES_CHUNK_TYPE): + result = pd.Series(result, index=raw_data.index, name="predictions") + + ctx[op.outputs[0].key] = result + + +def predict( + model, + data, + output_margin=False, + ntree_limit=None, + validate_features=True, + base_margin=None, + session=None, + run_kwargs=None, + run=True, +): + import xgboost + + data = check_data(data) + if not isinstance(model, xgboost.Booster): + raise TypeError(f"model has to be a xgboost.Booster, got {type(model)} instead") + + num_class = model.attr("num_class") + if isinstance(data, TENSOR_TYPE): + output_types = [OutputType.tensor] + elif num_class is not None: + output_types = [OutputType.dataframe] + else: + output_types = [OutputType.series] + + kwargs = { + "output_margin": output_margin, + "ntree_limit": ntree_limit, + "validate_features": validate_features, + "base_margin": base_margin, + } + op = XGBPredict( + data=data, + model=model, + kwargs=kwargs, + gpu=data.op.gpu, + output_types=output_types, + ) + result = op() + if run: + result.execute(session=session, **(run_kwargs or dict())) + return result diff --git a/python/xorbits/_mars/learn/contrib/xgboost/regressor.py b/python/xorbits/_mars/learn/contrib/xgboost/regressor.py new file mode 100644 index 000000000..caddc2a3e --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/regressor.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ..utils import make_import_error_func +from .core import XGBScikitLearnBase, xgboost + +XGBRegressor = make_import_error_func("xgboost") +if xgboost: + from .core import wrap_evaluation_matrices + from .predict import predict + from .train import train + + class XGBRegressor(XGBScikitLearnBase): + """ + Implementation of the scikit-learn API for XGBoost regressor. 
+ """ + + def fit( + self, + X, + y, + sample_weight=None, + base_margin=None, + eval_set=None, + sample_weight_eval_set=None, + base_margin_eval_set=None, + **kw, + ): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", dict()) + if kw: + raise TypeError( + f"fit got an unexpected keyword argument '{next(iter(kw))}'" + ) + + dtrain, evals = wrap_evaluation_matrices( + None, + X, + y, + sample_weight, + base_margin, + eval_set, + sample_weight_eval_set, + base_margin_eval_set, + ) + params = self.get_xgb_params() + self.evals_result_ = dict() + result = train( + params, + dtrain, + num_boost_round=self.get_num_boosting_rounds(), + evals=evals, + evals_result=self.evals_result_, + session=session, + run_kwargs=run_kwargs, + ) + self._Booster = result + return self + + def predict(self, data, **kw): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", None) + return predict( + self.get_booster(), data, session=session, run_kwargs=run_kwargs, **kw + ) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/start_tracker.py b/python/xorbits/_mars/learn/contrib/xgboost/start_tracker.py new file mode 100644 index 000000000..a90cf632b --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/start_tracker.py @@ -0,0 +1,60 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from threading import Thread + +from .... import opcodes as OperandDef +from ....core import NotSupportTile +from ....serialization.serializables import Int32Field +from ....utils import to_binary +from ...operands import LearnOperand, LearnOperandMixin, OutputType + + +class StartTracker(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.START_TRACKER + _op_module_ = "learn.contrib.xgboost" + + n_workers = Int32Field("n_workers", default=None) + + def __init__(self, output_types=None, pure_depends=None, **kw): + super().__init__( + _output_types=output_types, + _pure_depends=pure_depends, + **kw, + ) + if self.output_types is None: + self.output_types = [OutputType.object] + + @classmethod + def tile(cls, op): + raise NotSupportTile("StartTracker is a chunk op") + + @classmethod + def execute(cls, ctx, op): + """Start Rabit tracker""" + from .tracker import RabitTracker + + env = {"DMLC_NUM_WORKER": op.n_workers} + rabit_context = RabitTracker( + host_ip=ctx.get_local_host_ip(), n_workers=op.n_workers + ) + env.update(rabit_context.worker_envs()) + + rabit_context.start(op.n_workers) + thread = Thread(target=rabit_context.join) + thread.daemon = True + thread.start() + + rabit_args = [to_binary(f"{k}={v}") for k, v in env.items()] + ctx[op.outputs[0].key] = rabit_args diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/__init__.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_classifier.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_classifier.py new file mode 100644 index 000000000..2090ce736 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_classifier.py @@ -0,0 +1,167 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pandas as pd +import pytest + +try: + import xgboost +except ImportError: + xgboost = None + +from ..... import dataframe as md +from ..... import tensor as mt +from ..classifier import XGBClassifier + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X_raw = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y_raw = rs.rand(n_rows, chunk_size=chunk_size) +X_df_raw = md.DataFrame(X_raw) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_classifier(setup): + y = (y_raw * 10).astype(mt.int32) + classifier = XGBClassifier(verbosity=1, n_estimators=2) + classifier.fit(X_raw, y, eval_set=[(X_raw, y)]) + prediction = classifier.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + history = classifier.evals_result() + + assert isinstance(prediction, mt.Tensor) + assert isinstance(history, dict) + + assert list(history)[0] == "validation_0" + # default metrics may differ, see https://github.com/dmlc/xgboost/pull/6183 + eval_metric = list(history["validation_0"])[0] + assert eval_metric in ("merror", "mlogloss") + assert len(history["validation_0"]) == 1 + assert len(history["validation_0"][eval_metric]) == 2 + + prob = classifier.predict_proba(X_raw) + assert prob.shape == X_raw.shape + + # test dataframe + X_df = X_df_raw + classifier = XGBClassifier(verbosity=1, n_estimators=2) + classifier.fit(X_df, y) + prediction = classifier.predict(X_df) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + # test weight + weights = [ + mt.random.rand(X_raw.shape[0]), + md.Series(mt.random.rand(X_raw.shape[0])), + md.DataFrame(mt.random.rand(X_raw.shape[0])), + ] + y_df = md.DataFrame(y) + for weight in weights: + classifier = XGBClassifier(verbosity=1, n_estimators=2) + classifier.fit(X_raw, y_df, sample_weight=weight) + prediction = classifier.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + # should raise error if weight.ndim > 1 + with 
pytest.raises(ValueError): + XGBClassifier(verbosity=1, n_estimators=2).fit( + X_raw, y_df, sample_weight=mt.random.rand(1, 1) + ) + + # test binary classifier + new_y = (y > 0.5).astype(mt.int32) + classifier = XGBClassifier(verbosity=1, n_estimators=2) + classifier.fit(X_raw, new_y) + prediction = classifier.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + # test predict data with unknown shape + X2 = X_raw[X_raw[:, 0] > 0.1].astype(mt.int32) + prediction = classifier.predict(X2) + + assert prediction.ndim == 1 + + # test train with unknown shape + cond = X_raw[:, 0] > 0 + X3 = X_raw[cond] + y3 = y[cond] + classifier = XGBClassifier(verbosity=1, n_estimators=2) + classifier.fit(X3, y3) + prediction = classifier.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + classifier = XGBClassifier(verbosity=1, n_estimators=2) + with pytest.raises(TypeError): + classifier.fit(X_raw, y, wrong_param=1) + classifier.fit(X_raw, y) + with pytest.raises(TypeError): + classifier.predict(X_raw, wrong_param=1) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_classifier_from_to_parquet(setup): + n_rows = 1000 + n_columns = 10 + rs = np.random.RandomState(0) + X = rs.rand(n_rows, n_columns) + y = rs.rand(n_rows) + df = pd.DataFrame(X, columns=[f"c{i}" for i in range(n_columns)]) + df["id"] = [f"i{i}" for i in range(n_rows)] + + booster = xgboost.train({}, xgboost.DMatrix(X, y), num_boost_round=2) + + with tempfile.TemporaryDirectory() as d: + m_name = os.path.join(d, "c.model") + result_dir = os.path.join(d, "result") + os.mkdir(result_dir) + data_dir = os.path.join(d, "data") + os.mkdir(data_dir) + + booster.save_model(m_name) + + df.iloc[:500].to_parquet(os.path.join(d, "data", "data1.parquet")) + df.iloc[500:].to_parquet(os.path.join(d, "data", "data2.parquet")) + + df = md.read_parquet(data_dir).set_index("id") + model = XGBClassifier() + model.load_model(m_name) + result = model.predict(df, run=False) + r = md.DataFrame(result).to_parquet(result_dir) + + # tiles to ensure no iterative tiling exists + r.execute() + + ret = md.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy() + model2 = xgboost.XGBClassifier() + model2.load_model(m_name) + expected = model2.predict(X) + expected = np.stack([1 - expected, expected]).argmax(axis=0) + np.testing.assert_array_equal(ret, expected) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_core.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_core.py new file mode 100644 index 000000000..15b558923 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_core.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +try: + import xgboost +except ImportError: + xgboost = None + + +from ..... 
import tensor as mt + +if xgboost: + from ..core import wrap_evaluation_matrices + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_wrap_evaluation_matrices(): + X = mt.random.rand(100, 3) + y = mt.random.randint(3, size=(100,)) + + eval_set = [(mt.random.rand(10, 3), mt.random.randint(3, size=10))] + with pytest.raises(ValueError): + # sample_weight_eval_set size wrong + wrap_evaluation_matrices(0.0, X, y, None, None, eval_set, [], None) + + with pytest.raises(ValueError): + wrap_evaluation_matrices(0.0, X, y, None, None, None, eval_set, None) + + evals = wrap_evaluation_matrices(0.0, X, y, None, None, eval_set, None, None)[1] + assert len(evals) > 0 diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_predict.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_predict.py new file mode 100644 index 000000000..bfa14a29f --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_predict.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +try: + import xgboost + from xgboost import Booster +except ImportError: + xgboost = None + +from ..... import dataframe as md +from ..... import tensor as mt +from .. import MarsDMatrix, predict, train + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y = rs.rand(n_rows, chunk_size=chunk_size) +X_df = md.DataFrame(X) +y_series = md.Series(y) +x_sparse = np.random.rand(n_rows, n_columns) +x_sparse[np.arange(n_rows), np.random.randint(n_columns, size=n_rows)] = np.nan +X_sparse = mt.tensor(x_sparse, chunk_size=chunk_size).tosparse(missing=np.nan) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_predict_tensor(setup): + dtrain = MarsDMatrix(X, y) + booster = train({}, dtrain, num_boost_round=2) + assert isinstance(booster, Booster) + + prediction = predict(booster, X) + assert isinstance(prediction.to_numpy(), np.ndarray) + + prediction = predict(booster, X_sparse) + assert isinstance(prediction.to_numpy(), np.ndarray) + + prediction = predict(booster, dtrain) + assert isinstance(prediction.fetch(), np.ndarray) + + with pytest.raises(TypeError): + predict(None, X) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_predict_dataframe(setup): + dtrain = MarsDMatrix(X_df, y_series) + booster = train({}, dtrain, num_boost_round=2) + assert isinstance(booster, Booster) + + prediction = predict(booster, X_df) + assert isinstance(prediction.to_pandas(), pd.Series) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_regressor.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_regressor.py new file mode 100644 index 000000000..73ed0dee9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_regressor.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +try: + import xgboost +except ImportError: + xgboost = None + +from ..... import tensor as mt +from ..regressor import XGBRegressor + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y = rs.rand(n_rows, chunk_size=chunk_size) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_regressor(setup): + regressor = XGBRegressor(verbosity=1, n_estimators=2) + regressor.set_params(tree_method="hist") + regressor.fit(X, y, eval_set=[(X, y)]) + prediction = regressor.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + history = regressor.evals_result() + + assert isinstance(prediction, mt.Tensor) + assert isinstance(history, dict) + + assert list(history["validation_0"])[0] == "rmse" + assert len(history["validation_0"]["rmse"]) == 2 + + # test weight + weight = mt.random.rand(X.shape[0]) + classifier = XGBRegressor(verbosity=1, n_estimators=2) + regressor.set_params(tree_method="hist") + classifier.fit(X, y, sample_weight=weight) + prediction = classifier.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + # test wrong params + regressor = XGBRegressor(verbosity=1, n_estimators=2) + with pytest.raises(TypeError): + regressor.fit(X, y, wrong_param=1) + regressor.fit(X, y) + with pytest.raises(TypeError): + regressor.predict(X, wrong_param=1) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_train.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_train.py new file mode 100644 index 000000000..eeeda48db --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_train.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +try: + import xgboost + from xgboost import Booster +except ImportError: + xgboost = None +from ..... import dataframe as md +from ..... import tensor as mt +from .....tests.core import require_ray +from .. 
import MarsDMatrix, train + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y = rs.rand(n_rows, chunk_size=chunk_size) +X_df = md.DataFrame(X) +y_series = md.Series(y) +x_sparse = np.random.rand(n_rows, n_columns) +x_sparse[np.arange(n_rows), np.random.randint(n_columns, size=n_rows)] = np.nan +X_sparse = mt.tensor(x_sparse, chunk_size=chunk_size).tosparse(missing=np.nan) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_train_tensor(setup): + dtrain = MarsDMatrix(X, y) + booster = train({}, dtrain, num_boost_round=2) + assert isinstance(booster, Booster) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_train_sparse_tensor(setup): + dtrain = MarsDMatrix(X_sparse, y) + booster = train({}, dtrain, num_boost_round=2) + assert isinstance(booster, Booster) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_train_dataframe(setup): + dtrain = MarsDMatrix(X_df, y_series) + booster = train({}, dtrain, num_boost_round=2) + assert isinstance(booster, Booster) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +@pytest.mark.parametrize("chunk_size", [n_rows // 5, n_rows]) +def test_train_evals(setup_cluster, chunk_size): + rs = mt.random.RandomState(0) + # keep 1 chunk for X and y + X = rs.rand(n_rows, n_columns, chunk_size=(n_rows, n_columns // 2)) + y = rs.rand(n_rows, chunk_size=n_rows) + base_margin = rs.rand(n_rows, chunk_size=n_rows) + dtrain = MarsDMatrix(X, y, base_margin=base_margin) + eval_x = MarsDMatrix( + rs.rand(n_rows, n_columns, chunk_size=chunk_size), + rs.rand(n_rows, chunk_size=chunk_size), + ) + evals = [(eval_x, "eval_x")] + eval_result = dict() + booster = train( + {}, dtrain, num_boost_round=2, evals=evals, evals_result=eval_result + ) + assert isinstance(booster, Booster) + assert len(eval_result) > 0 + + with pytest.raises(TypeError): + train( + {}, + dtrain, + num_boost_round=2, + evals=[("eval_x", eval_x)], + evals_result=eval_result, + ) + + +@require_ray +def test_train_on_ray_cluster(ray_start_regular, ray_create_mars_cluster): + rs = mt.random.RandomState(0) + # keep 1 chunk for X and y + X = rs.rand(n_rows, n_columns, chunk_size=(n_rows, n_columns // 2)) + y = rs.rand(n_rows, chunk_size=n_rows) + base_margin = rs.rand(n_rows, chunk_size=n_rows) + dtrain = MarsDMatrix(X, y, base_margin=base_margin) + eval_x = MarsDMatrix( + rs.rand(n_rows, n_columns, chunk_size=n_rows // 5), + rs.rand(n_rows, chunk_size=n_rows // 5), + ) + evals = [(eval_x, "eval_x")] + eval_result = dict() + booster = train( + {}, dtrain, num_boost_round=2, evals=evals, evals_result=eval_result + ) + assert isinstance(booster, Booster) + assert len(eval_result) > 0 + + with pytest.raises(TypeError): + train( + {}, + dtrain, + num_boost_round=2, + evals=[("eval_x", eval_x)], + evals_result=eval_result, + ) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tracker.py b/python/xorbits/_mars/learn/contrib/xgboost/tracker.py new file mode 100644 index 000000000..76167eb1e --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tracker.py @@ -0,0 +1,503 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script is a variant of dmlc-core/dmlc_tracker/tracker.py, +which is a specialized version for xgboost tasks. +""" + +import argparse +import logging + +# pylint: disable=invalid-name, missing-docstring, too-many-arguments, too-many-locals +# pylint: disable=too-many-branches, too-many-statements, too-many-instance-attributes +import socket +import struct +import sys +from threading import Thread +from typing import Dict, List, Optional, Set, Tuple, Union + +_RingMap = Dict[int, Tuple[int, int]] +_TreeMap = Dict[int, List[int]] + +logger = logging.getLogger(__name__) + + +class ExSocket: + """ + Extension of socket to handle recv and send of special data + """ + + def __init__(self, sock: socket.socket) -> None: + self.sock = sock + + def recvall(self, nbytes: int) -> bytes: + """Receive number of bytes.""" + res = [] + nread = 0 + while nread < nbytes: + chunk = self.sock.recv(min(nbytes - nread, 1024)) + nread += len(chunk) + res.append(chunk) + return b"".join(res) + + def recvint(self) -> int: + """Receive an integer of 32 bytes""" + return struct.unpack("@i", self.recvall(4))[0] + + def sendint(self, value: int) -> None: + """Send an integer of 32 bytes""" + self.sock.sendall(struct.pack("@i", value)) + + def sendstr(self, value: str) -> None: + """Send a Python string""" + self.sendint(len(value)) + self.sock.sendall(value.encode()) + + def recvstr(self) -> str: + """Receive a Python string""" + slen = self.recvint() + return self.recvall(slen).decode() + + +# magic number used to verify existence of data +MAGIC_NUM = 0xFF99 + + +def get_some_ip(host: str) -> str: + """Get ip from host""" + return socket.getaddrinfo(host, None)[0][4][0] + + +def get_family(addr: str) -> int: + """Get network family from address.""" + return socket.getaddrinfo(addr, None)[0][0] + + +class WorkerEntry: + """Handler to each worker.""" + + def __init__(self, sock: socket.socket, s_addr: Tuple[str, int]): + worker = ExSocket(sock) + self.sock = worker + self.host = get_some_ip(s_addr[0]) + magic = worker.recvint() + assert magic == MAGIC_NUM, f"invalid magic number={magic} from {self.host}" + worker.sendint(MAGIC_NUM) + self.rank = worker.recvint() + self.world_size = worker.recvint() + self.jobid = worker.recvstr() + self.cmd = worker.recvstr() + self.wait_accept = 0 + self.port: Optional[int] = None + + def print(self, use_logger: bool) -> None: + """Execute the print command from worker.""" + msg = self.sock.recvstr() + # On dask we use print to avoid setting global verbosity. 
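+        # In this Mars variant, the tracker launched by StartTracker keeps the
+        # default use_logger=False, so worker messages are relayed via print;
+        # only the standalone tracker (start_rabit_tracker) opts into the logger.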
+ if use_logger: + logger.info(msg.strip()) + else: + print(msg.strip(), flush=True) + + def decide_rank(self, job_map: Dict[str, int]) -> int: + """Get the rank of current entry.""" + if self.rank >= 0: + return self.rank + if self.jobid != "NULL" and self.jobid in job_map: + return job_map[self.jobid] + return -1 + + def assign_rank( + self, + rank: int, + wait_conn: Dict[int, "WorkerEntry"], + tree_map: _TreeMap, + parent_map: Dict[int, int], + ring_map: _RingMap, + ) -> List[int]: + """Assign the rank for current entry.""" + self.rank = rank + nnset = set(tree_map[rank]) + rprev, rnext = ring_map[rank] + self.sock.sendint(rank) + # send parent rank + self.sock.sendint(parent_map[rank]) + # send world size + self.sock.sendint(len(tree_map)) + self.sock.sendint(len(nnset)) + # send the rprev and next link + for r in nnset: + self.sock.sendint(r) + # send prev link + if rprev not in (-1, rank): + nnset.add(rprev) + self.sock.sendint(rprev) + else: + self.sock.sendint(-1) + # send next link + if rnext not in (-1, rank): + nnset.add(rnext) + self.sock.sendint(rnext) + else: + self.sock.sendint(-1) + + return self._get_remote(wait_conn, nnset) + + def _get_remote( + self, wait_conn: Dict[int, "WorkerEntry"], nnset: Set[int] + ) -> List[int]: + while True: + ngood = self.sock.recvint() + goodset = set([]) + for _ in range(ngood): + goodset.add(self.sock.recvint()) + assert goodset.issubset(nnset) + badset = nnset - goodset + conset = [] + for r in badset: + if r in wait_conn: + conset.append(r) + self.sock.sendint(len(conset)) + self.sock.sendint(len(badset) - len(conset)) + for r in conset: + self.sock.sendstr(wait_conn[r].host) + port = wait_conn[r].port + assert port is not None + self.sock.sendint(port) + self.sock.sendint(r) + nerr = self.sock.recvint() + if nerr != 0: + continue + self.port = self.sock.recvint() + rmset = [] + # all connection was successuly setup + for r in conset: + wait_conn[r].wait_accept -= 1 + if wait_conn[r].wait_accept == 0: + rmset.append(r) + for r in rmset: + wait_conn.pop(r, None) + self.wait_accept = len(badset) - len(conset) + return rmset + + +class RabitTracker: + """ + tracker for rabit + """ + + def __init__( + self, host_ip: str, n_workers: int, port: int = 0, use_logger: bool = False + ) -> None: + """A Python implementation of RABIT tracker. + Parameters + .......... + use_logger: + Use logging.info for tracker print command. When set to False, Python print + function is used instead. 
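+        host_ip:
+            IP address for the tracker socket to bind and listen on.
+        n_workers:
+            Number of workers expected to connect to the tracker.
+        port:
+            Port to bind to; the default 0 lets the operating system pick
+            a free port, which is then exposed as ``self.port``.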
+ """ + sock = socket.socket(get_family(host_ip), socket.SOCK_STREAM) + sock.bind((host_ip, port)) + self.port = sock.getsockname()[1] + sock.listen(256) + self.sock = sock + self.host_ip = host_ip + self.thread: Optional[Thread] = None + self.n_workers = n_workers + self._use_logger = use_logger + logger.info("start listen on %s:%d", host_ip, self.port) + + def __del__(self) -> None: + if hasattr(self, "sock"): + self.sock.close() + + @staticmethod + def _get_neighbor(rank: int, n_workers: int) -> List[int]: + rank = rank + 1 + ret = [] + if rank > 1: + ret.append(rank // 2 - 1) + if rank * 2 - 1 < n_workers: + ret.append(rank * 2 - 1) + if rank * 2 < n_workers: + ret.append(rank * 2) + return ret + + def worker_envs(self) -> Dict[str, Union[str, int]]: + """ + get environment variables for workers + can be passed in as args or envs + """ + return {"DMLC_TRACKER_URI": self.host_ip, "DMLC_TRACKER_PORT": self.port} + + def _get_tree(self, n_workers: int) -> Tuple[_TreeMap, Dict[int, int]]: + tree_map: _TreeMap = {} + parent_map: Dict[int, int] = {} + for r in range(n_workers): + tree_map[r] = self._get_neighbor(r, n_workers) + parent_map[r] = (r + 1) // 2 - 1 + return tree_map, parent_map + + def find_share_ring( + self, tree_map: _TreeMap, parent_map: Dict[int, int], rank: int + ) -> List[int]: + """ + get a ring structure that tends to share nodes with the tree + return a list starting from rank + """ + nset = set(tree_map[rank]) + cset = nset - set([parent_map[rank]]) + if not cset: + return [rank] + rlst = [rank] + cnt = 0 + for v in cset: + vlst = self.find_share_ring(tree_map, parent_map, v) + cnt += 1 + if cnt == len(cset): + vlst.reverse() + rlst += vlst + return rlst + + def get_ring(self, tree_map: _TreeMap, parent_map: Dict[int, int]) -> _RingMap: + """ + get a ring connection used to recover local data + """ + assert parent_map[0] == -1 + rlst = self.find_share_ring(tree_map, parent_map, 0) + assert len(rlst) == len(tree_map) + ring_map: _RingMap = {} + n_workers = len(tree_map) + for r in range(n_workers): + rprev = (r + n_workers - 1) % n_workers + rnext = (r + 1) % n_workers + ring_map[rlst[r]] = (rlst[rprev], rlst[rnext]) + return ring_map + + def get_link_map(self, n_workers: int) -> Tuple[_TreeMap, Dict[int, int], _RingMap]: + """ + get the link map, this is a bit hacky, call for better algorithm + to place similar nodes together + """ + tree_map, parent_map = self._get_tree(n_workers) + ring_map = self.get_ring(tree_map, parent_map) + rmap = {0: 0} + k = 0 + for i in range(n_workers - 1): + k = ring_map[k][1] + rmap[k] = i + 1 + + ring_map_: _RingMap = {} + tree_map_: _TreeMap = {} + parent_map_: Dict[int, int] = {} + for k, v in ring_map.items(): + ring_map_[rmap[k]] = (rmap[v[0]], rmap[v[1]]) + for k, tree_nodes in tree_map.items(): + tree_map_[rmap[k]] = [rmap[x] for x in tree_nodes] + for k, parent in parent_map.items(): + if k != 0: + parent_map_[rmap[k]] = rmap[parent] + else: + parent_map_[rmap[k]] = -1 + return tree_map_, parent_map_, ring_map_ + + def accept_workers(self, n_workers: int) -> None: + """Wait for all workers to connect to the tracker.""" + # set of nodes that finishes the job + shutdown: Dict[int, WorkerEntry] = {} + # set of nodes that is waiting for connections + wait_conn: Dict[int, WorkerEntry] = {} + # maps job id to rank + job_map: Dict[str, int] = {} + # list of workers that is pending to be assigned rank + pending: List[WorkerEntry] = [] + # lazy initialize tree_map + tree_map = None + + while len(shutdown) != n_workers: + fd, s_addr = 
self.sock.accept() + s = WorkerEntry(fd, s_addr) + if s.cmd == "print": + s.print(self._use_logger) + continue + if s.cmd == "shutdown": + assert s.rank >= 0 and s.rank not in shutdown + assert s.rank not in wait_conn + shutdown[s.rank] = s + logger.debug("Received %s signal from %d", s.cmd, s.rank) + continue + assert s.cmd in ("start", "recover") + # lazily initialize the workers + if tree_map is None: + assert s.cmd == "start" + if s.world_size > 0: + n_workers = s.world_size + tree_map, parent_map, ring_map = self.get_link_map(n_workers) + # set of nodes that is pending for getting up + todo_nodes = list(range(n_workers)) + else: + assert s.world_size in (-1, n_workers) + if s.cmd == "recover": + assert s.rank >= 0 + + rank = s.decide_rank(job_map) + # batch assignment of ranks + if rank == -1: + assert todo_nodes + pending.append(s) + if len(pending) == len(todo_nodes): + pending.sort(key=lambda x: x.host) + for s in pending: + rank = todo_nodes.pop(0) + if s.jobid != "NULL": + job_map[s.jobid] = rank + s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map) + if s.wait_accept > 0: + wait_conn[rank] = s + logger.debug( + "Received %s signal from %s; assign rank %d", + s.cmd, + s.host, + s.rank, + ) + if not todo_nodes: + logger.info("@tracker All of %d nodes getting started", n_workers) + else: + s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map) + logger.debug("Received %s signal from %d", s.cmd, s.rank) + if s.wait_accept > 0: + wait_conn[rank] = s + logger.info("@tracker All nodes finishes job") + + def start(self, n_workers: int) -> None: + """Start the tracker, it will wait for `n_workers` to connect.""" + + def run() -> None: + self.accept_workers(n_workers) + + self.thread = Thread(target=run, args=(), daemon=True) + self.thread.start() + + def join(self) -> None: + """Wait for the tracker to finish.""" + while self.thread is not None and self.thread.is_alive(): + self.thread.join(100) + + def alive(self) -> bool: + """Whether the tracker thread is alive""" + return self.thread is not None and self.thread.is_alive() + + +def get_host_ip(host_ip: Optional[str] = None) -> str: + """Get the IP address of current host. If `host_ip` is not none then it will be + returned as it's + """ + if host_ip is None or host_ip == "auto": + host_ip = "ip" + + if host_ip == "dns": + host_ip = socket.getfqdn() + elif host_ip == "ip": + from socket import gaierror + + try: + host_ip = socket.gethostbyname(socket.getfqdn()) + except gaierror: + logger.debug( + "gethostbyname(socket.getfqdn()) failed... trying on hostname()" + ) + host_ip = socket.gethostbyname(socket.gethostname()) + if host_ip.startswith("127."): + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + # doesn't have to be reachable + s.connect(("10.255.255.255", 1)) + host_ip = s.getsockname()[0] + + assert host_ip is not None + return host_ip + + +def start_rabit_tracker(args: argparse.Namespace) -> None: + """Standalone function to start rabit tracker. + Parameters + ---------- + args: arguments to start the rabit tracker. 
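+
+    The tracker's connection information is written to stdout between the
+    ``DMLC_TRACKER_ENV_START`` and ``DMLC_TRACKER_ENV_END`` markers, one
+    ``KEY=value`` pair per line (the address and port below are illustrative)::
+
+        DMLC_TRACKER_ENV_START
+        DMLC_NUM_WORKER=4
+        DMLC_NUM_SERVER=0
+        DMLC_TRACKER_URI=192.0.2.10
+        DMLC_TRACKER_PORT=9091
+        DMLC_TRACKER_ENV_END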
+ """ + envs = {"DMLC_NUM_WORKER": args.num_workers, "DMLC_NUM_SERVER": args.num_servers} + rabit = RabitTracker( + host_ip=get_host_ip(args.host_ip), n_workers=args.num_workers, use_logger=True + ) + envs.update(rabit.worker_envs()) + rabit.start(args.num_workers) + sys.stdout.write("DMLC_TRACKER_ENV_START\n") + # simply write configuration to stdout + for k, v in envs.items(): + sys.stdout.write(f"{k}={v}\n") + sys.stdout.write("DMLC_TRACKER_ENV_END\n") + sys.stdout.flush() + rabit.join() + + +def main() -> None: + """Main function if tracker is executed in standalone mode.""" + parser = argparse.ArgumentParser(description="Rabit Tracker start.") + parser.add_argument( + "--num-workers", + required=True, + type=int, + help="Number of worker process to be launched.", + ) + parser.add_argument( + "--num-servers", + default=0, + type=int, + help="Number of server process to be launched. Only used in PS jobs.", + ) + parser.add_argument( + "--host-ip", + default=None, + type=str, + help=( + "Host IP addressed, this is only needed " + + "if the host IP cannot be automatically guessed." + ), + ) + parser.add_argument( + "--log-level", + default="INFO", + type=str, + choices=["INFO", "DEBUG"], + help="Logging level of the logger.", + ) + args = parser.parse_args() + + fmt = "%(asctime)s %(levelname)s %(message)s" + if args.log_level == "INFO": + level = logging.INFO + elif args.log_level == "DEBUG": + level = logging.DEBUG + else: + raise RuntimeError(f"Unknown logging level {args.log_level}") + + logging.basicConfig(format=fmt, level=level) + + if args.num_servers == 0: + start_rabit_tracker(args) + else: + raise RuntimeError("Do not yet support start ps tracker in standalone mode.") + + +if __name__ == "__main__": + main() diff --git a/python/xorbits/_mars/learn/contrib/xgboost/train.py b/python/xorbits/_mars/learn/contrib/xgboost/train.py new file mode 100644 index 000000000..11df52972 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/train.py @@ -0,0 +1,280 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +import pickle +from collections import OrderedDict, defaultdict + +import numpy as np + +from .... 
import opcodes as OperandDef +from ....core import OutputType +from ....core.context import get_context +from ....core.operand import MergeDictOperand +from ....serialization.serializables import DictField, FieldTypes, KeyField, ListField +from ....utils import ensure_own_data +from .dmatrix import ToDMatrix, to_dmatrix +from .start_tracker import StartTracker + +logger = logging.getLogger(__name__) + + +def _on_serialize_evals(evals_val): + if evals_val is None: + return None + return [list(x) for x in evals_val] + + +class XGBTrain(MergeDictOperand): + _op_type_ = OperandDef.XGBOOST_TRAIN + + params = DictField("params", key_type=FieldTypes.string, default=None) + dtrain = KeyField("dtrain", default=None) + evals = ListField("evals", on_serialize=_on_serialize_evals, default=None) + kwargs = DictField("kwargs", key_type=FieldTypes.string, default=None) + tracker = KeyField("tracker", default=None) + + def __init__(self, gpu=None, **kw): + super().__init__(gpu=gpu, **kw) + if self.output_types is None: + self.output_types = [OutputType.object] + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.dtrain = self._inputs[0] + rest = self._inputs[1:] + if self.tracker is not None: + self.tracker = self._inputs[-1] + rest = rest[:-1] + if self.evals is not None: + evals_dict = OrderedDict(self.evals) + new_evals_dict = OrderedDict() + for new_key, val in zip(rest, evals_dict.values()): + new_evals_dict[new_key] = val + self.evals = list(new_evals_dict.items()) + + def __call__(self): + inputs = [self.dtrain] + if self.evals is not None: + inputs.extend(e[0] for e in self.evals) + return self.new_tileable(inputs) + + @staticmethod + def _get_dmatrix_chunks_workers(ctx, dmatrix): + # dmatrix_chunk.inputs is concat, and concat's input is the coallocated chunks + metas = ctx.get_chunks_meta( + [c.inputs[0].inputs[0].key for c in dmatrix.chunks], fields=["bands"] + ) + return [m["bands"][0][0] for m in metas] + + @classmethod + def tile(cls, op: "XGBTrain"): + ctx = get_context() + + inp = op.inputs[0] + in_chunks = inp.chunks + workers = cls._get_dmatrix_chunks_workers(ctx, inp) + worker_to_in_chunks = dict(zip(workers, in_chunks)) + n_chunk = len(in_chunks) + out_chunks = [] + worker_to_evals = defaultdict(dict) + if op.evals is not None: + for dm, ev in op.evals: + ev_workers = cls._get_dmatrix_chunks_workers(ctx, dm) + for ev_worker, ev_chunk in zip(ev_workers, dm.chunks): + worker_to_evals[ev_worker][ev] = ev_chunk + + all_workers = set(workers) + all_workers.update(worker_to_evals) + + i = itertools.count(n_chunk) + tracker_chunk = StartTracker( + n_workers=len(all_workers), pure_depends=[True] * n_chunk + ).new_chunk(in_chunks, shape=()) + for worker in all_workers: + chunk_op = op.copy().reset_key() + chunk_op.expect_worker = worker + chunk_op.tracker = tracker_chunk + if worker in worker_to_in_chunks: + in_chunk = worker_to_in_chunks[worker] + else: + in_chunk_op = ToDMatrix( + data=None, + label=None, + weight=None, + base_margin=None, + missing=inp.op.missing, + feature_names=inp.op.feature_names, + feature_types=inp.op.feature_types, + _output_types=inp.op.output_types, + ) + params = inp.params.copy() + params["index"] = (next(i), 0) + params["shape"] = (0, inp.shape[1]) + in_chunk = in_chunk_op.new_chunk(None, kws=[params]) + chunk_evals = [] + for dm, ev in op.evals: + try: + chunk_evals.append((worker_to_evals[worker][ev], ev)) + except KeyError: + # create a new eval chunk + eval_chunk_op = ToDMatrix( + data=None, + label=None, + weight=None, + base_margin=None, + 
missing=dm.op.missing, + feature_names=dm.op.feature_names, + feature_types=dm.op.feature_types, + _output_types=dm.op.output_types, + ) + params = dm.params.copy() + params["index"] = (0, 0) + params["shape"] = (0, dm.shape[1]) + eval_chunk = eval_chunk_op.new_chunk(None, kws=[params]) + chunk_evals.append((eval_chunk, ev)) + chunk_op.evals = chunk_evals + input_chunks = ( + [in_chunk] + [pair[0] for pair in chunk_evals] + [tracker_chunk] + ) + out_chunk = chunk_op.new_chunk( + input_chunks, shape=(np.nan,), index=in_chunk.index[:1] + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, chunks=out_chunks, nsplits=((np.nan for _ in out_chunks),) + ) + + @classmethod + def execute(cls, ctx, op: "XGBTrain"): + if op.merge: + return super().execute(ctx, op) + + from xgboost import rabit, train + + params = op.params.copy() + + n_threads = 0 + if op.tracker is None: + # non distributed + ctx_n_threads = -1 + else: + # distributed + ctx_n_threads = ctx.get_slots() + + # fix parallelism on nodes + for p in ["nthread", "n_jobs"]: + if ( + params.get(p, None) is not None + and params.get(p, ctx_n_threads) != ctx_n_threads + ): # pragma: no cover + logger.info("Overriding `nthreads` defined in Mars worker.") + n_threads = params[p] + break + if n_threads == 0 or n_threads is None: # pragma: no branch + n_threads = ctx_n_threads + params.update({"nthread": n_threads, "n_jobs": n_threads}) + + dtrain = ToDMatrix.get_xgb_dmatrix( + ensure_own_data(ctx[op.dtrain.key]), nthread=n_threads + ) + evals = tuple() + if op.evals is not None: + eval_dmatrices = [ + ToDMatrix.get_xgb_dmatrix( + ensure_own_data(ctx[t[0].key]), nthread=n_threads + ) + for t in op.evals + ] + evals = tuple((m, ev[1]) for m, ev in zip(eval_dmatrices, op.evals)) + + if op.tracker is None: + # non distributed + local_history = dict() + kwargs = dict() if op.kwargs is None else op.kwargs + bst = train( + params, dtrain, evals=evals, evals_result=local_history, **kwargs + ) + ctx[op.outputs[0].key] = { + "booster": pickle.dumps(bst), + "history": local_history, + } + else: + # distributed + logger.debug("Distributed train params: %r", params) + + rabit_args = ctx[op.tracker.key] + rabit.init( + [ + arg.tobytes() if isinstance(arg, memoryview) else arg + for arg in rabit_args + ] + ) + try: + logger.debug( + "Start to train data, train size: %s, evals sizes: %s", + dtrain.num_row(), + [ev[0].num_row() for ev in evals], + ) + local_history = dict() + bst = train( + params, dtrain, evals=evals, evals_result=local_history, **op.kwargs + ) + ret = {"booster": pickle.dumps(bst), "history": local_history} + if rabit.get_rank() != 0: + ret = {} + ctx[op.outputs[0].key] = ret + finally: + rabit.finalize() + + +def train(params, dtrain, evals=(), **kwargs): + """ + Train XGBoost model in Mars manner. + + Parameters + ---------- + Parameters are the same as `xgboost.train`. 
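+    In addition, a few Mars-specific keyword arguments are consumed here
+    rather than passed through: ``evals_result`` (a dict that is filled in
+    with the evaluation history), plus ``session`` and ``run_kwargs``
+    (forwarded to ``execute`` when the training tileable is run).
+
+    A minimal sketch, patterned after the accompanying tests (``X`` and ``y``
+    are Mars tensors)::
+
+        dtrain = MarsDMatrix(X, y)
+        booster = train({}, dtrain, num_boost_round=2)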
+ + Returns + ------- + results: Booster + """ + + evals_result = kwargs.pop("evals_result", dict()) + session = kwargs.pop("session", None) + run_kwargs = kwargs.pop("run_kwargs", dict()) + + processed_evals = [] + if evals: + for eval_dmatrix, name in evals: + if not isinstance(name, str): + raise TypeError("evals must a list of pairs (DMatrix, string)") + if hasattr(eval_dmatrix, "op") and isinstance(eval_dmatrix.op, ToDMatrix): + processed_evals.append((eval_dmatrix, name)) + else: + processed_evals.append((to_dmatrix(eval_dmatrix), name)) + + op = XGBTrain(params=params, dtrain=dtrain, evals=processed_evals, kwargs=kwargs) + t = op() + ret = t.execute(session=session, **run_kwargs).fetch(session=session) + evals_result.update(ret["history"]) + bst = pickle.loads(ret["booster"]) + num_class = params.get("num_class") + if num_class: + bst.set_attr(num_class=str(num_class)) + return bst diff --git a/python/xorbits/_mars/learn/datasets/__init__.py b/python/xorbits/_mars/learn/datasets/__init__.py new file mode 100644 index 000000000..62a6a122b --- /dev/null +++ b/python/xorbits/_mars/learn/datasets/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .samples_generator import ( + make_blobs, + make_classification, + make_low_rank_matrix, + make_regression, +) diff --git a/python/xorbits/_mars/learn/datasets/samples_generator.py b/python/xorbits/_mars/learn/datasets/samples_generator.py new file mode 100644 index 000000000..b1b14b40c --- /dev/null +++ b/python/xorbits/_mars/learn/datasets/samples_generator.py @@ -0,0 +1,633 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +from collections.abc import Iterable + +import numpy as np + +from ... import tensor as mt +from ...tensor import linalg +from ...tensor.utils import check_random_state +from ..utils import check_array +from ..utils import shuffle as util_shuffle + +# ------------------------------------------------------------------- +# Original implementation is in `sklearn.datasets.samples_generator`. +# ------------------------------------------------------------------- + + +def make_classification( + n_samples=100, + n_features=20, + n_informative=2, + n_redundant=2, + n_repeated=0, + n_classes=2, + n_clusters_per_class=2, + weights=None, + flip_y=0.01, + class_sep=1.0, + hypercube=True, + shift=0.0, + scale=1.0, + shuffle=True, + random_state=None, +): + """Generate a random n-class classification problem. 
+ + This initially creates clusters of points normally distributed (std=1) + about vertices of an ``n_informative``-dimensional hypercube with sides of + length ``2*class_sep`` and assigns an equal number of clusters to each + class. It introduces interdependence between these features and adds + various types of further noise to the data. + + Without shuffling, ``X`` horizontally stacks features in the following + order: the primary ``n_informative`` features, followed by ``n_redundant`` + linear combinations of the informative features, followed by ``n_repeated`` + duplicates, drawn randomly with replacement from the informative and + redundant features. The remaining features are filled with random noise. + Thus, without shuffling, all useful features are contained in the columns + ``X[:, :n_informative + n_redundant + n_repeated]``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, optional (default=100) + The number of samples. + + n_features : int, optional (default=20) + The total number of features. These comprise ``n_informative`` + informative features, ``n_redundant`` redundant features, + ``n_repeated`` duplicated features and + ``n_features-n_informative-n_redundant-n_repeated`` useless features + drawn at random. + + n_informative : int, optional (default=2) + The number of informative features. Each class is composed of a number + of gaussian clusters each located around the vertices of a hypercube + in a subspace of dimension ``n_informative``. For each cluster, + informative features are drawn independently from N(0, 1) and then + randomly linearly combined within each cluster in order to add + covariance. The clusters are then placed on the vertices of the + hypercube. + + n_redundant : int, optional (default=2) + The number of redundant features. These features are generated as + random linear combinations of the informative features. + + n_repeated : int, optional (default=0) + The number of duplicated features, drawn randomly from the informative + and the redundant features. + + n_classes : int, optional (default=2) + The number of classes (or labels) of the classification problem. + + n_clusters_per_class : int, optional (default=2) + The number of clusters per class. + + weights : list of floats or None (default=None) + The proportions of samples assigned to each class. If None, then + classes are balanced. Note that if ``len(weights) == n_classes - 1``, + then the last class weight is automatically inferred. + More than ``n_samples`` samples may be returned if the sum of + ``weights`` exceeds 1. + + flip_y : float, optional (default=0.01) + The fraction of samples whose class are randomly exchanged. Larger + values introduce noise in the labels and make the classification + task harder. + + class_sep : float, optional (default=1.0) + The factor multiplying the hypercube size. Larger values spread + out the clusters/classes and make the classification task easier. + + hypercube : boolean, optional (default=True) + If True, the clusters are put on the vertices of a hypercube. If + False, the clusters are put on the vertices of a random polytope. + + shift : float, array of shape [n_features] or None, optional (default=0.0) + Shift features by the specified value. If None, then features + are shifted by a random value drawn in [-class_sep, class_sep]. + + scale : float, array of shape [n_features] or None, optional (default=1.0) + Multiply features by the specified value. 
If None, then features + are scaled by a random value drawn in [1, 100]. Note that scaling + happens after shifting. + + shuffle : boolean, optional (default=True) + Shuffle the samples and the features. + + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : tensor of shape [n_samples, n_features] + The generated samples. + + y : tensor of shape [n_samples] + The integer labels for class membership of each sample. + + Notes + ----- + The algorithm is adapted from Guyon [1] and was designed to generate + the "Madelon" dataset. + + References + ---------- + .. [1] I. Guyon, "Design of experiments for the NIPS 2003 variable + selection benchmark", 2003. + + See also + -------- + make_blobs: simplified variant + make_multilabel_classification: unrelated generator for multilabel tasks + """ + from sklearn.datasets._samples_generator import _generate_hypercube + + generator = check_random_state(random_state) + np_generator = generator.to_numpy() + + # Count features, clusters and samples + if n_informative + n_redundant + n_repeated > n_features: + raise ValueError( + "Number of informative, redundant and repeated " + "features must sum to less than the number of total" + " features" + ) + # Use log2 to avoid overflow errors + if n_informative < np.log2(n_classes * n_clusters_per_class): + raise ValueError( + "n_classes * n_clusters_per_class must" + " be smaller or equal 2 ** n_informative" + ) + if weights and len(weights) not in [n_classes, n_classes - 1]: + raise ValueError("Weights specified but incompatible with number of classes.") + + n_useless = n_features - n_informative - n_redundant - n_repeated + n_clusters = n_classes * n_clusters_per_class + + if weights and len(weights) == (n_classes - 1): + weights = weights + [1.0 - sum(weights)] + + if weights is None: + weights = [1.0 / n_classes] * n_classes + weights[-1] = 1.0 - sum(weights[:-1]) + + # Distribute samples among clusters by weight + n_samples_per_cluster = [ + int(n_samples * weights[k % n_classes] / n_clusters_per_class) + for k in range(n_clusters) + ] + + for i in range(n_samples - sum(n_samples_per_cluster)): + n_samples_per_cluster[i % n_clusters] += 1 + + # Initialize X and y + X = mt.zeros((n_samples, n_features)) + y = mt.zeros(n_samples, dtype=mt.int) + + # Build the polytope whose vertices become cluster centroids + centroids = _generate_hypercube(n_clusters, n_informative, np_generator).astype( + float, copy=False + ) + centroids *= 2 * class_sep + centroids -= class_sep + if not hypercube: + centroids *= np_generator.rand(n_clusters, 1) + centroids *= np_generator.rand(1, n_informative) + + # Initially draw informative features from the standard normal + X[:, :n_informative] = generator.randn(n_samples, n_informative) + + # Create each cluster; a variant of make_blobs + stop = 0 + for k, centroid in enumerate(centroids): + start, stop = stop, stop + n_samples_per_cluster[k] + y[start:stop] = k % n_classes # assign labels + X_k = X[start:stop, :n_informative] # slice a view of the cluster + + A = 2 * generator.rand(n_informative, n_informative) - 1 + X_k[...] 
= mt.dot(X_k, A) # introduce random covariance + + X_k += centroid # shift the cluster to a vertex + + # Create redundant features + if n_redundant > 0: + B = 2 * generator.rand(n_informative, n_redundant) - 1 + X[:, n_informative : n_informative + n_redundant] = mt.dot( + X[:, :n_informative], B + ) + + # Repeat some features + if n_repeated > 0: + n = n_informative + n_redundant + indices = ((n - 1) * generator.rand(n_repeated) + 0.5).astype(mt.intp) + X[:, n : n + n_repeated] = X[:, indices] + + # Fill useless features + if n_useless > 0: + X[:, -n_useless:] = generator.randn(n_samples, n_useless) + + # Randomly replace labels + if flip_y >= 0.0: + flip_mask = generator.rand(n_samples) < flip_y + y = mt.where(flip_mask, generator.randint(n_classes, size=len(y)), y) + + # Randomly shift and scale + if shift is None: + shift = (2 * generator.rand(n_features) - 1) * class_sep + X += shift + + if scale is None: + scale = 1 + 100 * generator.rand(n_features) + X *= scale + + if shuffle: + # Randomly permute samples + X, y = util_shuffle(X, y, random_state=generator, axes=(0, 1)) + + return X, y + + +def make_regression( + n_samples=100, + n_features=100, + *, + n_informative=10, + n_targets=1, + bias=0.0, + effective_rank=None, + tail_strength=0.5, + noise=0.0, + shuffle=True, + coef=False, + random_state=None, +): + """Generate a random regression problem. + + The input set can either be well conditioned (by default) or have a low + rank-fat tail singular profile. See :func:`make_low_rank_matrix` for + more details. + + The output is generated by applying a (potentially biased) random linear + regression model with `n_informative` nonzero regressors to the previously + generated input and some gaussian centered noise with some adjustable + scale. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=100 + The number of features. + + n_informative : int, default=10 + The number of informative features, i.e., the number of features used + to build the linear model used to generate the output. + + n_targets : int, default=1 + The number of regression targets, i.e., the dimension of the y output + vector associated with a sample. By default, the output is a scalar. + + bias : float, default=0.0 + The bias term in the underlying linear model. + + effective_rank : int, default=None + if not None: + The approximate number of singular vectors required to explain most + of the input data by linear combinations. Using this kind of + singular spectrum in the input allows the generator to reproduce + the correlations often observed in practice. + if None: + The input set is well conditioned, centered and gaussian with + unit variance. + + tail_strength : float, default=0.5 + The relative importance of the fat noisy tail of the singular values + profile if `effective_rank` is not None. When a float, it should be + between 0 and 1. + + noise : float, default=0.0 + The standard deviation of the gaussian noise applied to the output. + + shuffle : bool, default=True + Shuffle the samples and the features. + + coef : bool, default=False + If True, the coefficients of the underlying linear model are returned. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. 
+ + Returns + ------- + X : tensor of shape (n_samples, n_features) + The input samples. + + y : tensor of shape (n_samples,) or (n_samples, n_targets) + The output values. + + coef : tensor of shape (n_features,) or (n_features, n_targets) + The coefficient of the underlying linear model. It is returned only if + coef is True. + """ + n_informative = min(n_features, n_informative) + generator = check_random_state(random_state) + + if effective_rank is None: + # Randomly generate a well conditioned input set + X = generator.randn(n_samples, n_features) + + else: + # Randomly generate a low rank, fat tail input set + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + effective_rank=effective_rank, + tail_strength=tail_strength, + random_state=generator, + ) + + # Generate a ground truth model with only n_informative features being non + # zeros (the other features are not correlated to y and should be ignored + # by a sparsifying regularizers such as L1 or elastic net) + ground_truth = mt.zeros((n_features, n_targets)) + ground_truth[:n_informative, :] = 100 * generator.rand(n_informative, n_targets) + + y = mt.dot(X, ground_truth) + bias + + # Add noise + if noise > 0.0: + y += generator.normal(scale=noise, size=y.shape) + + # Randomly permute samples and features + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + indices = mt.arange(n_features) + generator.shuffle(indices) + X[:, :] = X[:, indices] + ground_truth = ground_truth[indices] + + y = mt.squeeze(y) + + if coef: + return X, y, mt.squeeze(ground_truth) + + else: + return X, y + + +def make_blobs( + n_samples=100, + n_features=2, + centers=None, + cluster_std=1.0, + center_box=(-10.0, 10.0), + shuffle=True, + random_state=None, +): + """Generate isotropic Gaussian blobs for clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int or array-like, optional (default=100) + If int, it is the total number of points equally divided among + clusters. + If array-like, each element of the sequence indicates + the number of samples per cluster. + + n_features : int, optional (default=2) + The number of features for each sample. + + centers : int or array of shape [n_centers, n_features], optional + (default=None) + The number of centers to generate, or the fixed center locations. + If n_samples is an int and centers is None, 3 centers are generated. + If n_samples is array-like, centers must be + either None or an array of length equal to the length of n_samples. + + cluster_std : float or sequence of floats, optional (default=1.0) + The standard deviation of the clusters. + + center_box : pair of floats (min, max), optional (default=(-10.0, 10.0)) + The bounding box for each cluster center when centers are + generated at random. + + shuffle : boolean, optional (default=True) + Shuffle the samples. + + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : tensor of shape [n_samples, n_features] + The generated samples. + + y : tensor of shape [n_samples] + The integer labels for cluster membership of each sample. + + Examples + -------- + >>> from sklearn.datasets import make_blobs + >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2, + ... 
random_state=0) + >>> print(X.shape) + (10, 2) + >>> y + array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0]) + >>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2, + ... random_state=0) + >>> print(X.shape) + (10, 2) + >>> y + array([0, 1, 2, 0, 2, 2, 2, 1, 1, 0]) + + See also + -------- + make_classification: a more intricate variant + """ + from ..utils.checks import AssertAllFinite + + generator = check_random_state(random_state) + + if isinstance(n_samples, numbers.Integral): + # Set n_centers by looking at centers arg + if centers is None: + centers = 3 + + if isinstance(centers, numbers.Integral): + n_centers = centers + centers = generator.uniform( + center_box[0], center_box[1], size=(n_centers, n_features) + ) + + else: + centers = check_array(centers) + n_features = centers.shape[1] + n_centers = centers.shape[0] + + else: + # Set n_centers by looking at [n_samples] arg + n_centers = len(n_samples) + if centers is None: + centers = generator.uniform( + center_box[0], center_box[1], size=(n_centers, n_features) + ) + try: + assert len(centers) == n_centers + except TypeError: + raise ValueError( + f"Parameter `centers` must be array-like. Got {centers!r} instead" + ) + except AssertionError: + raise ValueError( + "Length of `n_samples` not consistent" + f" with number of centers. Got n_samples = {n_samples} " + f"and centers = {centers}" + ) + else: + centers = check_array(centers) + n_features = centers.shape[1] + + # stds: if cluster_std is given as list, it must be consistent + # with the n_centers + if hasattr(cluster_std, "__len__") and len(cluster_std) != n_centers: + if isinstance(centers.op, AssertAllFinite): + centers = centers.op.inputs[0] + raise ValueError( + "Length of `clusters_std` not consistent with " + f"number of centers. Got centers = {centers} " + f"and cluster_std = {cluster_std}" + ) + + if isinstance(cluster_std, numbers.Real): + cluster_std = mt.full(len(centers), cluster_std) + + X = [] + y = [] + + if isinstance(n_samples, Iterable): + n_samples_per_center = n_samples + else: + n_samples_per_center = [int(n_samples // n_centers)] * n_centers + + for i in range(n_samples % n_centers): + n_samples_per_center[i] += 1 + + for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)): + if n == 0: + continue + X.append(generator.normal(loc=centers[i], scale=std, size=(n, n_features))) + y += [i] * n + + X = mt.concatenate(X) + y = mt.array(y) + + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + return X, y + + +def make_low_rank_matrix( + n_samples=100, + n_features=100, + effective_rank=10, + tail_strength=0.5, + random_state=None, + chunk_size=None, +): + """Generate a mostly low rank matrix with bell-shaped singular values + + Most of the variance can be explained by a bell-shaped curve of width + effective_rank: the low rank part of the singular values profile is:: + + (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2) + + The remaining singular values' tail is fat, decreasing as:: + + tail_strength * exp(-0.1 * i / effective_rank). + + The low rank part of the profile can be considered the structured + signal part of the data while the tail can be considered the noisy + part of the data that cannot be summarized by a low number of linear + components (singular vectors). + + This kind of singular profiles is often seen in practice, for instance: + - gray level pictures of faces + - TF-IDF vectors of text documents crawled from the web + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + n_samples : int, optional (default=100) + The number of samples. + + n_features : int, optional (default=100) + The number of features. + + effective_rank : int, optional (default=10) + The approximate number of singular vectors required to explain most of + the data by linear combinations. + + tail_strength : float between 0.0 and 1.0, optional (default=0.5) + The relative importance of the fat noisy tail of the singular values + profile. + + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + X : array of shape [n_samples, n_features] + The matrix. + """ + generator = check_random_state(random_state) + n = min(n_samples, n_features) + + # Random (ortho normal) vectors + u, _ = linalg.qr(generator.randn(n_samples, n, chunk_size=chunk_size)) + v, _ = linalg.qr(generator.randn(n_features, n, chunk_size=chunk_size)) + + # Index of the singular values + singular_ind = mt.arange(n, dtype=mt.float64, chunk_size=chunk_size) + + # Build the singular profile by assembling signal and noise components + low_rank = (1 - tail_strength) * mt.exp(-1.0 * (singular_ind / effective_rank) ** 2) + tail = tail_strength * mt.exp(-0.1 * singular_ind / effective_rank) + s = mt.identity(n) * (low_rank + tail) + + return mt.dot(mt.dot(u, s), v.T) diff --git a/python/xorbits/_mars/learn/datasets/tests/__init__.py b/python/xorbits/_mars/learn/datasets/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/datasets/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/datasets/tests/test_samples_generator.py b/python/xorbits/_mars/learn/datasets/tests/test_samples_generator.py new file mode 100644 index 000000000..8c026a556 --- /dev/null +++ b/python/xorbits/_mars/learn/datasets/tests/test_samples_generator.py @@ -0,0 +1,336 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from functools import partial + +import numpy as np +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_raise_message, + assert_raises, +) + +from .... 
import tensor as mt +from ....tensor.linalg import svd +from ..samples_generator import ( + make_blobs, + make_classification, + make_low_rank_matrix, + make_regression, +) + + +def test_make_classification(setup): + weights = [0.1, 0.25] + X, y = make_classification( + n_samples=100, + n_features=20, + n_informative=5, + n_redundant=1, + n_repeated=1, + n_classes=3, + n_clusters_per_class=1, + hypercube=False, + shift=None, + scale=None, + weights=weights, + random_state=0, + flip_y=-1, + ) + + assert weights == [0.1, 0.25] + assert X.shape == (100, 20) + assert y.shape == (100,) + assert mt.unique(y).to_numpy().shape == (3,) + assert (y == 0).sum().to_numpy() == 10 + assert (y == 1).sum().to_numpy() == 25 + assert (y == 2).sum().to_numpy() == 65 + + # Test for n_features > 30 + X, y = make_classification( + n_samples=2000, + n_features=31, + n_informative=31, + n_redundant=0, + n_repeated=0, + hypercube=True, + scale=0.5, + random_state=0, + ) + + X = X.to_numpy() + assert X.shape == (2000, 31) + assert y.shape == (2000,) + assert ( + np.unique(X.view([("", X.dtype)] * X.shape[1])) + .view(X.dtype) + .reshape(-1, X.shape[1]) + .shape[0] + == 2000 + ) + + +def test_make_classification_informative_features(setup): + """Test the construction of informative features in make_classification + + Also tests `n_clusters_per_class`, `n_classes`, `hypercube` and + fully-specified `weights`. + """ + # Create very separate clusters; check that vertices are unique and + # correspond to classes + class_sep = 1e6 + make = partial( + make_classification, + class_sep=class_sep, + n_redundant=0, + n_repeated=0, + flip_y=0, + shift=0, + scale=1, + shuffle=False, + ) + + for n_informative, weights, n_clusters_per_class in [ + (2, [1], 1), + (2, [1 / 3] * 3, 1), + (2, [1 / 4] * 4, 1), + (2, [1 / 2] * 2, 2), + (2, [3 / 4, 1 / 4], 2), + (10, [1 / 3] * 3, 10), + (np.int_(64), [1], 1), + ]: + n_classes = len(weights) + n_clusters = n_classes * n_clusters_per_class + n_samples = n_clusters * 50 + + for hypercube in (False, True): + generated = make( + n_samples=n_samples, + n_classes=n_classes, + weights=weights, + n_features=n_informative, + n_informative=n_informative, + n_clusters_per_class=n_clusters_per_class, + hypercube=hypercube, + random_state=0, + ) + + X, y = mt.ExecutableTuple(generated).execute().fetch() + assert X.shape == (n_samples, n_informative) + assert y.shape == (n_samples,) + + # Cluster by sign, viewed as strings to allow uniquing + signs = np.sign(X) + signs = signs.view(dtype=f"|S{signs.strides[0]}") + unique_signs, cluster_index = np.unique(signs, return_inverse=True) + + assert len(unique_signs) == n_clusters + + clusters_by_class = defaultdict(set) + for cluster, cls in zip(cluster_index, y): + clusters_by_class[cls].add(cluster) + for clusters in clusters_by_class.values(): + assert len(clusters) == n_clusters_per_class + assert len(clusters_by_class) == n_classes + + assert_array_almost_equal( + np.bincount(y) / len(y) // weights, + [1] * n_classes, + err_msg="Wrong number of samples per class", + ) + + # Ensure on vertices of hypercube + for cluster in range(len(unique_signs)): + centroid = X[cluster_index == cluster].mean(axis=0) + if hypercube: + assert_array_almost_equal( + np.abs(centroid) / class_sep, + np.ones(n_informative), + decimal=5, + err_msg="Clusters are not centered on hypercube vertices", + ) + else: + assert_raises( + AssertionError, + assert_array_almost_equal, + np.abs(centroid) / class_sep, + np.ones(n_informative), + decimal=5, + err_msg="Clusters should not be 
centered " + "on hypercube vertices", + ) + + assert_raises( + ValueError, + make, + n_features=2, + n_informative=2, + n_classes=5, + n_clusters_per_class=1, + ) + assert_raises( + ValueError, + make, + n_features=2, + n_informative=2, + n_classes=3, + n_clusters_per_class=2, + ) + + +def test_make_regression(setup): + X, y, c = make_regression( + n_samples=100, + n_features=10, + n_informative=3, + effective_rank=5, + coef=True, + bias=0.0, + noise=1.0, + random_state=0, + ) + X, y, c = mt.ExecutableTuple((X, y, c)).execute().fetch() + + assert X.shape == (100, 10), "X shape mismatch" + assert y.shape == (100,), "y shape mismatch" + assert c.shape == (10,), "coef shape mismatch" + assert sum(c != 0.0) == 3, "Unexpected number of informative features" + + # Test that y ~= np.dot(X, c) + bias + N(0, 1.0). + assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1) + + # Test with small number of features. + X, y = make_regression(n_samples=100, n_features=1) # n_informative=3 + assert X.shape == (100, 1) + + +def test_make_regression_multitarget(): + X, y, c = make_regression( + n_samples=100, + n_features=10, + n_informative=3, + n_targets=3, + coef=True, + noise=1.0, + random_state=0, + ) + X, y, c = mt.ExecutableTuple((X, y, c)).execute().fetch() + + assert X.shape == (100, 10), "X shape mismatch" + assert y.shape == (100, 3), "y shape mismatch" + assert c.shape == (10, 3), "coef shape mismatch" + np.testing.assert_array_equal( + sum(c != 0.0), 3, "Unexpected number of informative features" + ) + + # Test that y ~= np.dot(X, c) + bias + N(0, 1.0) + assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1) + + +def test_make_blobs(setup): + cluster_stds = np.array([0.05, 0.2, 0.4]) + cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]]) + X, y = make_blobs( + random_state=0, + n_samples=50, + n_features=2, + centers=cluster_centers, + cluster_std=cluster_stds, + ) + X, y = mt.ExecutableTuple((X, y)).execute().fetch() + assert X.shape == (50, 2) + assert y.shape == (50,) + assert np.unique(y).shape == (3,) + for i, (ctr, std) in enumerate(zip(cluster_centers, cluster_stds)): + assert_almost_equal((X[y == i] - ctr).std(), std, 1, "Unexpected std") + + +def test_make_blobs_n_samples_list(setup): + n_samples = [50, 30, 20] + X, y = make_blobs(n_samples=n_samples, n_features=2, random_state=0) + X, y = mt.ExecutableTuple((X, y)).execute().fetch() + + assert X.shape == (sum(n_samples), 2) + assert all(np.bincount(y, minlength=len(n_samples)) == n_samples) is True + + +def test_make_blobs_n_samples_list_with_centers(setup): + n_samples = [20, 20, 20] + centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]]) + cluster_stds = np.array([0.05, 0.2, 0.4]) + X, y = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=cluster_stds, random_state=0 + ) + X, y = mt.ExecutableTuple((X, y)).execute().fetch() + + assert X.shape == (sum(n_samples), 2) + assert all(np.bincount(y, minlength=len(n_samples)) == n_samples) is True + for i, (ctr, std) in enumerate(zip(centers, cluster_stds)): + assert_almost_equal((X[y == i] - ctr).std(), std, 1, "Unexpected std") + + +def test_make_blobs_n_samples_centers_none(setup): + for n_samples in [[5, 3, 0], np.array([5, 3, 0]), tuple([5, 3, 0])]: + centers = None + X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=0) + X, y = mt.ExecutableTuple((X, y)).execute().fetch() + + assert X.shape == (sum(n_samples), 2) + assert all(np.bincount(y, minlength=len(n_samples)) == n_samples) is True + + +def 
test_make_blobs_error(setup): + n_samples = [20, 20, 20] + centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]]) + cluster_stds = np.array([0.05, 0.2, 0.4]) + wrong_centers_msg = ( + "Length of `n_samples` not consistent " + f"with number of centers. Got n_samples = {n_samples} " + f"and centers = {centers[:-1]}" + ) + assert_raise_message( + ValueError, wrong_centers_msg, make_blobs, n_samples, centers=centers[:-1] + ) + wrong_std_msg = ( + "Length of `clusters_std` not consistent with " + f"number of centers. Got centers = {mt.tensor(centers)} " + f"and cluster_std = {cluster_stds[:-1]}" + ) + assert_raise_message( + ValueError, + wrong_std_msg, + make_blobs, + n_samples, + centers=centers, + cluster_std=cluster_stds[:-1], + ) + wrong_type_msg = f"Parameter `centers` must be array-like. Got {3!r} instead" + assert_raise_message(ValueError, wrong_type_msg, make_blobs, n_samples, centers=3) + + +def test_make_low_rank_matrix(setup): + X = make_low_rank_matrix( + n_samples=50, + n_features=25, + effective_rank=5, + tail_strength=0.01, + random_state=0, + ) + + assert X.shape == (50, 25) + + _, s, _ = svd(X) + assert (s.sum() - 5).to_numpy() < 0.1 diff --git a/python/xorbits/_mars/learn/decomposition/__init__.py b/python/xorbits/_mars/learn/decomposition/__init__.py new file mode 100644 index 000000000..0fdc6656e --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._pca import PCA +from ._truncated_svd import TruncatedSVD diff --git a/python/xorbits/_mars/learn/decomposition/_base.py b/python/xorbits/_mars/learn/decomposition/_base.py new file mode 100644 index 000000000..956140950 --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/_base.py @@ -0,0 +1,185 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from abc import ABCMeta, abstractmethod + +from sklearn.base import BaseEstimator, TransformerMixin + +from ... import tensor as mt +from ...tensor import linalg +from ..utils import check_array +from ..utils.validation import check_is_fitted + +# ----------------------------------------------------------- +# Original implementation is in `sklearn.decomposition.base`. +# ----------------------------------------------------------- + + +class _BasePCA(BaseEstimator, TransformerMixin, metaclass=ABCMeta): + """Base class for PCA methods. + + Warning: This class should not be used directly. + Use derived classes instead. 
+ """ + + def get_covariance(self, session=None): + """Compute data covariance with the generative model. + + ``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)`` + where S**2 contains the explained variances, and sigma2 contains the + noise variances. + + Returns + ------- + cov : Tensor, shape=(n_features, n_features) + Estimated covariance of data. + """ + components_ = self.components_ + exp_var = self.explained_variance_ + if self.whiten: + components_ = components_ * mt.sqrt(exp_var[:, mt.newaxis]) + exp_var_diff = mt.maximum(exp_var - self.noise_variance_, 0.0) + cov = mt.dot(components_.T * exp_var_diff, components_) + cov.flat[:: len(cov) + 1] += self.noise_variance_ # modify diag inplace + cov.execute(session=session) + return cov + + def get_precision(self, session=None): + """Compute data precision matrix with the generative model. + + Equals the inverse of the covariance but computed with + the matrix inversion lemma for efficiency. + + Returns + ------- + precision : Tensor, shape=(n_features, n_features) + Estimated precision of data. + """ + n_features = self.components_.shape[1] + + # handle corner cases first + if self.n_components_ == 0: + precision = mt.eye(n_features) / self.noise_variance_ + precision.execute(session=session) + return precision + if self.n_components_ == n_features: + precision = linalg.inv(self.get_covariance()) + precision.execute(session=session) + return precision + + # Get precision using matrix inversion lemma + components_ = self.components_ + exp_var = self.explained_variance_ + if self.whiten: + components_ = components_ * mt.sqrt(exp_var[:, mt.newaxis]) + exp_var_diff = mt.maximum(exp_var - self.noise_variance_, 0.0) + precision = mt.dot(components_, components_.T) / self.noise_variance_ + precision.flat[:: len(precision) + 1] += 1.0 / exp_var_diff + precision = mt.dot(components_.T, mt.dot(linalg.inv(precision), components_)) + precision /= -(self.noise_variance_**2) + precision.flat[:: len(precision) + 1] += 1.0 / self.noise_variance_ + precision.execute(session=session) + return precision + + @abstractmethod + def fit(X, y=None, session=None, run_kwargs=None): + """Placeholder for fit. Subclasses should implement this method! + + Fit the model with X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + self : object + Returns the instance itself. + """ + + def transform(self, X, session=None): + """Apply dimensionality reduction to X. + + X is projected on the first principal components previously extracted + from a training set. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + New data, where n_samples is the number of samples + and n_features is the number of features. 
+ session : session to run + + Returns + ------- + X_new : array-like, shape (n_samples, n_components) + + Examples + -------- + + >>> import numpy as np + >>> from sklearn.decomposition import IncrementalPCA + >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) + >>> ipca = IncrementalPCA(n_components=2, batch_size=3) + >>> ipca.fit(X) + IncrementalPCA(batch_size=3, copy=True, n_components=2, whiten=False) + >>> ipca.transform(X) # doctest: +SKIP + """ + check_is_fitted(self, ["mean_", "components_"], all_or_any=all) + + X = check_array(X) + if self.mean_ is not None: + X = X - self.mean_ + X_transformed = mt.dot(X, self.components_.T) + if self.whiten: + X_transformed /= mt.sqrt(self.explained_variance_) + X_transformed.execute(session=session) + return X_transformed + + def inverse_transform(self, X, session=None): + """Transform data back to its original space. + + In other words, return an input X_original whose transform would be X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_components) + New data, where n_samples is the number of samples + and n_components is the number of components. + session : session to run + + Returns + ------- + X_original array-like, shape (n_samples, n_features) + + Notes + ----- + If whitening is enabled, inverse_transform will compute the + exact inverse operation, which includes reversing whitening. + """ + if self.whiten: + ret = ( + mt.dot( + X, + mt.sqrt(self.explained_variance_[:, mt.newaxis]) * self.components_, + ) + + self.mean_ + ) + else: + ret = mt.dot(X, self.components_) + self.mean_ + ret.execute(session=session) + return ret diff --git a/python/xorbits/_mars/learn/decomposition/_pca.py b/python/xorbits/_mars/learn/decomposition/_pca.py new file mode 100644 index 000000000..6056c42de --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/_pca.py @@ -0,0 +1,644 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numbers +from math import log, sqrt + +import numpy as np +from scipy.special import gammaln +from sklearn.utils.extmath import fast_logdet +from sklearn.utils.validation import check_is_fitted + +from ... import remote as mr +from ... import tensor as mt +from ...core import ExecutableTuple +from ...lib.sparse import issparse +from ...tensor.array_utils import get_array_module +from ...tensor.core import TENSOR_TYPE +from ...tensor.linalg import randomized_svd +from ...tensor.linalg.randomized_svd import svd_flip +from ...tensor.utils import check_random_state +from ..utils import check_array +from ._base import _BasePCA + + +def _assess_dimension(spectrum, rank, n_samples): + """Compute the log-likelihood of a rank ``rank`` dataset. + + The dataset is assumed to be embedded in gaussian noise of shape(n, + dimf) having spectrum ``spectrum``. + + Parameters + ---------- + spectrum : array of shape (n_features) + Data spectrum. + rank : int + Tested rank value. 
It should be strictly lower than n_features, + otherwise the method isn't specified (division by zero in equation + (31) from the paper). + n_samples : int + Number of samples. + + Returns + ------- + ll : float, + The log-likelihood + + Notes + ----- + This implements the method of `Thomas P. Minka: + Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604` + """ + + xp = get_array_module(spectrum, nosparse=True) + + n_features = spectrum.shape[0] + if not 1 <= rank < n_features: # pragma: no cover + raise ValueError("the tested rank should be in [1, n_features - 1]") + + eps = 1e-15 + + if spectrum[rank - 1] < eps: # pragma: no cover + # When the tested rank is associated with a small eigenvalue, there's + # no point in computing the log-likelihood: it's going to be very + # small and won't be the max anyway. Also, it can lead to numerical + # issues below when computing pa, in particular in log((spectrum[i] - + # spectrum[j]) because this will take the log of something very small. + return -np.inf + + pu = -rank * log(2.0) + for i in range(1, rank + 1): + pu += ( + gammaln((n_features - i + 1) / 2.0) + - log(np.pi) * (n_features - i + 1) / 2.0 + ) + + pl = xp.sum(xp.log(spectrum[:rank])) + pl = -pl * n_samples / 2.0 + + v = max(eps, xp.sum(spectrum[rank:]) / (n_features - rank)) + pv = -xp.log(v) * n_samples * (n_features - rank) / 2.0 + + m = n_features * rank - rank * (rank + 1.0) / 2.0 + pp = log(2.0 * np.pi) * (m + rank) / 2.0 + + pa = 0.0 + spectrum_ = spectrum.copy() + spectrum_[rank:n_features] = v + for i in range(rank): + for j in range(i + 1, len(spectrum)): + pa += log( + (spectrum[i] - spectrum[j]) * (1.0 / spectrum_[j] - 1.0 / spectrum_[i]) + ) + log(n_samples) + + ll = pu + pl + pv + pp - pa / 2.0 - rank * log(n_samples) / 2.0 + + return ll + + +def _infer_dimension(spectrum, n_samples): + """Infers the dimension of a dataset with a given spectrum. + + The returned value will be in [1, n_features - 1]. + """ + xp = get_array_module(spectrum, nosparse=True) + + ll = xp.empty_like(spectrum) + ll[0] = -np.inf # we don't want to return n_components = 0 + for rank in range(1, spectrum.shape[0]): + ll[rank] = _assess_dimension(spectrum, rank, n_samples) + return ll.argmax() + + +class PCA(_BasePCA): + """Principal component analysis (PCA) + + Linear dimensionality reduction using Singular Value Decomposition of the + data to project it to a lower dimensional space. The input data is centered + but not scaled for each feature before applying the SVD. + + It uses the LAPACK implementation of the full SVD or a randomized truncated + SVD by the method of Halko et al. 2009, depending on the shape of the input + data and the number of components to extract. + + It can also use the scipy.sparse.linalg ARPACK implementation of the + truncated SVD. + + Notice that this class does not support sparse input. See + :class:`TruncatedSVD` for an alternative with sparse data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, float, None or string + Number of components to keep. + if n_components is not set all components are kept:: + + n_components == min(n_samples, n_features) + + If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's + MLE is used to guess the dimension. Use of ``n_components == 'mle'`` + will interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``. 
+ + If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the + number of components such that the amount of variance that needs to be + explained is greater than the percentage specified by n_components. + + If ``svd_solver == 'arpack'``, the number of components must be + strictly less than the minimum of n_features and n_samples. + + Hence, the None case results in:: + + n_components == min(n_samples, n_features) - 1 + + copy : bool (default True) + If False, data passed to fit are overwritten and running + fit(X).transform(X) will not yield the expected results, + use fit_transform(X) instead. + + whiten : bool, optional (default False) + When True (False by default) the `components_` vectors are multiplied + by the square root of n_samples and then divided by the singular values + to ensure uncorrelated outputs with unit component-wise variances. + + Whitening will remove some information from the transformed signal + (the relative variance scales of the components) but can sometime + improve the predictive accuracy of the downstream estimators by + making their data respect some hard-wired assumptions. + + svd_solver : string {'auto', 'full', 'arpack', 'randomized'} + auto : + the solver is selected by a default policy based on `X.shape` and + `n_components`: if the input data is larger than 500x500 and the + number of components to extract is lower than 80% of the smallest + dimension of the data, then the more efficient 'randomized' + method is enabled. Otherwise the exact full SVD is computed and + optionally truncated afterwards. + full : + run exact full SVD calling the standard LAPACK solver via + `scipy.linalg.svd` and select the components by postprocessing + arpack : + run SVD truncated to n_components calling ARPACK solver via + `scipy.sparse.linalg.svds`. It requires strictly + 0 < n_components < min(X.shape) + randomized : + run randomized SVD by the method of Halko et al. + + tol : float >= 0, optional (default .0) + Tolerance for singular values computed by svd_solver == 'arpack'. + + iterated_power : int >= 0, or 'auto', (default 'auto') + Number of iterations for the power method computed by + svd_solver == 'randomized'. + + random_state : int, RandomState instance or None, optional (default None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. Used when ``svd_solver`` == 'arpack' or 'randomized'. + + Attributes + ---------- + components_ : tensor, shape (n_components, n_features) + Principal axes in feature space, representing the directions of + maximum variance in the data. The components are sorted by + ``explained_variance_``. + + explained_variance_ : tensor, shape (n_components,) + The amount of variance explained by each of the selected components. + + Equal to n_components largest eigenvalues + of the covariance matrix of X. + + explained_variance_ratio_ : tensor, shape (n_components,) + Percentage of variance explained by each of the selected components. + + If ``n_components`` is not set then all components are stored and the + sum of the ratios is equal to 1.0. + + singular_values_ : tensor, shape (n_components,) + The singular values corresponding to each of the selected components. + The singular values are equal to the 2-norms of the ``n_components`` + variables in the lower-dimensional space. 
+ + mean_ : tensor, shape (n_features,) + Per-feature empirical mean, estimated from the training set. + + Equal to `X.mean(axis=0)`. + + n_components_ : int + The estimated number of components. When n_components is set + to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this + number is estimated from input data. Otherwise it equals the parameter + n_components, or the lesser value of n_features and n_samples + if n_components is None. + + noise_variance_ : float + The estimated noise covariance following the Probabilistic PCA model + from Tipping and Bishop 1999. See "Pattern Recognition and + Machine Learning" by C. Bishop, 12.2.1 p. 574 or + http://www.miketipping.com/papers/met-mppca.pdf. It is required to + compute the estimated data covariance and score samples. + + Equal to the average of (min(n_features, n_samples) - n_components) + smallest eigenvalues of the covariance matrix of X. + + References + ---------- + For n_components == 'mle', this class uses the method of *Minka, T. P. + "Automatic choice of dimensionality for PCA". In NIPS, pp. 598-604* + + Implements the probabilistic PCA model from: + Tipping, M. E., and Bishop, C. M. (1999). "Probabilistic principal + component analysis". Journal of the Royal Statistical Society: + Series B (Statistical Methodology), 61(3), 611-622. + via the score and score_samples methods. + See http://www.miketipping.com/papers/met-mppca.pdf + + For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`. + + For svd_solver == 'randomized', see: + *Halko, N., Martinsson, P. G., and Tropp, J. A. (2011). + "Finding structure with randomness: Probabilistic algorithms for + constructing approximate matrix decompositions". + SIAM review, 53(2), 217-288.* and also + *Martinsson, P. G., Rokhlin, V., and Tygert, M. (2011). + "A randomized algorithm for the decomposition of matrices". + Applied and Computational Harmonic Analysis, 30(1), 47-68.* + + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.decomposition import PCA + >>> X = mt.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) + >>> pca = PCA(n_components=2) + >>> pca.fit(X) # doctest: +NORMALIZE_WHITESPACE + PCA(copy=True, iterated_power='auto', n_components=2, random_state=None, + svd_solver='auto', tol=0.0, whiten=False) + >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS + [0.9924... 0.0075...] + >>> print(pca.singular_values_) # doctest: +ELLIPSIS + [6.30061... 0.54980...] + + >>> pca = PCA(n_components=2, svd_solver='full') + >>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + PCA(copy=True, iterated_power='auto', n_components=2, random_state=None, + svd_solver='full', tol=0.0, whiten=False) + >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS + [0.9924... 0.00755...] + >>> print(pca.singular_values_) # doctest: +ELLIPSIS + [6.30061... 0.54980...] + + See also + -------- + KernelPCA + SparsePCA + TruncatedSVD + IncrementalPCA + """ + + def __init__( + self, + n_components=None, + copy=True, + whiten=False, + svd_solver="auto", + tol=0.0, + iterated_power="auto", + random_state=None, + ): + self.n_components = n_components + self.copy = copy + self.whiten = whiten + self.svd_solver = svd_solver + self.tol = tol + self.iterated_power = iterated_power + self.random_state = random_state + + def fit(self, X, y=None, session=None, run_kwargs=None): + """Fit the model with X. 
+ + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data, where n_samples is the number of samples + and n_features is the number of features. + + y : Ignored + + Returns + ------- + self : object + Returns the instance itself. + """ + self._fit(X, session=session, run=True, run_kwargs=run_kwargs) + return self + + def fit_transform(self, X, y=None, session=None): + """Fit the model with X and apply the dimensionality reduction on X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data, where n_samples is the number of samples + and n_features is the number of features. + + y : Ignored + + Returns + ------- + X_new : array-like, shape (n_samples, n_components) + + """ + U, S, _ = self._fit(X, session=session, run=False) + U = U[:, : self.n_components_] + + if self.whiten: + # X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples) + U *= sqrt(X.shape[0] - 1) + else: + # X_new = X * V = U * S * V^T * V = U * S + U *= S[: self.n_components_] + + self._run([U], session=session) + return U + + def _run(self, result, session=None, run_kwargs=None): + to_run_tensors = list(result) + if isinstance(self.noise_variance_, TENSOR_TYPE): + to_run_tensors.append(self.noise_variance_) + to_run_tensors.append(self.components_) + to_run_tensors.append(self.explained_variance_) + to_run_tensors.append(self.explained_variance_ratio_) + to_run_tensors.append(self.singular_values_) + + ExecutableTuple(to_run_tensors).execute(session=session, **(run_kwargs or {})) + + def _fit(self, X, session=None, run=True, run_kwargs=None): + """Dispatch to the right submethod depending on the chosen solver.""" + + # Raise an error for sparse input. + # This is more informative than the generic one raised by check_array. + if (hasattr(X, "issparse") and X.issparse()) or issparse(X): + raise TypeError( + "PCA does not support sparse input. See " + "TruncatedSVD for a possible alternative." 
+ ) + + X = check_array( + X, dtype=[mt.float64, mt.float32], ensure_2d=True, copy=self.copy + ) + + # Handle n_components==None + if self.n_components is None: + if self.svd_solver != "arpack": + n_components = min(X.shape) + else: + n_components = min(X.shape) - 1 + else: + n_components = self.n_components + + # Handle svd_solver + self._fit_svd_solver = self.svd_solver + if self._fit_svd_solver == "auto": + # Small problem or n_components == 'mle', just call full PCA + if max(X.shape) <= 500 or n_components == "mle": + self._fit_svd_solver = "full" + elif n_components >= 1 and n_components < 0.8 * min(X.shape): + self._fit_svd_solver = "randomized" + # This is also the case of n_components in (0,1) + else: + self._fit_svd_solver = "full" + + # Call different fits for either full or truncated SVD + if self._fit_svd_solver == "full": + ret = self._fit_full(X, n_components, session=session) + elif self._fit_svd_solver in ["arpack", "randomized"]: + ret = self._fit_truncated(X, n_components, self._fit_svd_solver) + else: + raise ValueError(f"Unrecognized svd_solver='{self._fit_svd_solver}'") + + if run: + self._run(ret, session=session, run_kwargs=run_kwargs) + return ret + + def _fit_full(self, X, n_components, session=None, run_kwargs=None): + """Fit the model by computing full SVD on X""" + n_samples, n_features = X.shape + + if n_components == "mle": + if n_samples < n_features: + raise ValueError( + "n_components='mle' is only supported if n_samples >= n_features" + ) + elif not 0 <= n_components <= min(n_samples, n_features): + raise ValueError( + "n_components=%r must be between 0 and " + "min(n_samples, n_features)=%r with " + "svd_solver='full'" % (n_components, min(n_samples, n_features)) + ) + elif n_components >= 1: + if not isinstance(n_components, (numbers.Integral, np.integer)): + raise ValueError( + "n_components=%r must be of type int " + "when greater than or equal to 1, " + "was of type=%r" % (n_components, type(n_components)) + ) + + # Center data + self.mean_ = mt.mean(X, axis=0) + X -= self.mean_ + + U, S, V = mt.linalg.svd(X) + # flip eigenvectors' sign to enforce deterministic output + U, V = svd_flip(U, V) + + components_ = V + + # Get variance explained by singular values + explained_variance_ = (S**2) / (n_samples - 1) + total_var = explained_variance_.sum() + explained_variance_ratio_ = explained_variance_ / total_var + singular_values_ = S.copy() # Store the singular values. + + # Postprocess the number of components required + if n_components == "mle": + n_components = mr.spawn( + _infer_dimension, + args=(explained_variance_, n_samples), + resolve_tileable_input=True, + ) + ExecutableTuple([n_components, U, V]).execute( + session=session, **(run_kwargs or dict()) + ) + n_components = n_components.fetch(session=session) + elif 0 < n_components < 1.0: + # number of components for which the cumulated explained + # variance percentage is superior to the desired threshold + # ratio_cumsum = stable_cumsum(explained_variance_ratio_) + ratio_cumsum = explained_variance_ratio_.cumsum() + n_components = (mt.searchsorted(ratio_cumsum, n_components) + 1).to_numpy( + session=session, **(run_kwargs or dict()) + ) + + # Compute noise covariance using Probabilistic PCA model + # The sigma2 maximum likelihood (cf. eq. 
12.46) + if n_components < min(n_features, n_samples): + self.noise_variance_ = explained_variance_[n_components:].mean() + else: + self.noise_variance_ = 0.0 + + self.n_samples_, self.n_features_ = n_samples, n_features + self.components_ = components_[:n_components] + self.n_components_ = n_components + self.explained_variance_ = explained_variance_[:n_components] + self.explained_variance_ratio_ = explained_variance_ratio_[:n_components] + self.singular_values_ = singular_values_[:n_components] + + return U, S, V + + def _fit_truncated(self, X, n_components, svd_solver): + """Fit the model by computing truncated SVD (by ARPACK or randomized) + on X + """ + n_samples, n_features = X.shape + + if isinstance(n_components, str): + raise ValueError( + "n_components=%r cannot be a string " + "with svd_solver='%s'" % (n_components, svd_solver) + ) + elif not 1 <= n_components <= min(n_samples, n_features): + raise ValueError( + "n_components=%r must be between 1 and " + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), svd_solver) + ) + elif not isinstance(n_components, (numbers.Integral, np.integer)): + raise ValueError( + "n_components=%r must be of type int " + "when greater than or equal to 1, was of type=%r" + % (n_components, type(n_components)) + ) + elif svd_solver == "arpack" and n_components == min(n_samples, n_features): + raise ValueError( + "n_components=%r must be strictly less than " + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), svd_solver) + ) + + random_state = check_random_state(self.random_state) + + # Center data + self.mean_ = mt.mean(X, axis=0) + X -= self.mean_ + + if svd_solver == "arpack": + # # random init solution, as ARPACK does it internally + # v0 = random_state.uniform(-1, 1, size=min(X.shape)) + # U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0) + # # svds doesn't abide by scipy.linalg.svd/randomized_svd + # # conventions, so reverse its outputs. + # S = S[::-1] + # # flip eigenvectors' sign to enforce deterministic output + # U, V = svd_flip(U[:, ::-1], V[::-1]) + raise NotImplementedError("Does not support arpack svd_resolver") + + elif svd_solver == "randomized": + # sign flipping is done inside + U, S, V = randomized_svd( + X, + n_components=n_components, + n_iter=self.iterated_power, + flip_sign=True, + random_state=random_state, + ) + + self.n_samples_, self.n_features_ = n_samples, n_features + self.components_ = V + self.n_components_ = n_components + + # Get variance explained by singular values + self.explained_variance_ = (S**2) / (n_samples - 1) + total_var = mt.var(X, ddof=1, axis=0) + self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum() + self.singular_values_ = S.copy() # Store the singular values. + + if self.n_components_ < min(n_features, n_samples): + self.noise_variance_ = total_var.sum() - self.explained_variance_.sum() + self.noise_variance_ /= min(n_features, n_samples) - n_components + else: + self.noise_variance_ = 0.0 + + return U, S, V + + def _score_samples(self, X, session=None): + check_is_fitted(self, "mean_") + + X = check_array(X) + Xr = X - self.mean_ + n_features = X.shape[1] + precision = self.get_precision().fetch(session=session) + log_like = -0.5 * (Xr * (mt.dot(Xr, precision))).sum(axis=1) + log_like -= 0.5 * (n_features * log(2.0 * mt.pi) - fast_logdet(precision)) + return log_like + + def score_samples(self, X, session=None): + """Return the log-likelihood of each sample. 
+ + See. "Pattern Recognition and Machine Learning" + by C. Bishop, 12.2.1 p. 574 + or http://www.miketipping.com/papers/met-mppca.pdf + + Parameters + ---------- + X : tensor, shape(n_samples, n_features) + The data. + + Returns + ------- + ll : tensor, shape (n_samples,) + Log-likelihood of each sample under the current model + """ + log_like = self._score_samples(X, session=session) + log_like.execute(session=session) + return log_like + + def score(self, X, y=None, session=None): + """Return the average log-likelihood of all samples. + + See. "Pattern Recognition and Machine Learning" + by C. Bishop, 12.2.1 p. 574 + or http://www.miketipping.com/papers/met-mppca.pdf + + Parameters + ---------- + X : tensor, shape(n_samples, n_features) + The data. + + y : Ignored + + Returns + ------- + ll : float + Average log-likelihood of the samples under the current model + """ + ret = mt.mean(self._score_samples(X)) + ret.execute(session=session) + return ret diff --git a/python/xorbits/_mars/learn/decomposition/_truncated_svd.py b/python/xorbits/_mars/learn/decomposition/_truncated_svd.py new file mode 100644 index 000000000..1aad878bd --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/_truncated_svd.py @@ -0,0 +1,258 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin + +from ... import tensor as mt +from ...core import ExecutableTuple +from ...tensor.linalg import randomized_svd +from ...tensor.utils import check_random_state +from ..utils import check_array + +__all__ = ["TruncatedSVD"] + + +class TruncatedSVD(BaseEstimator, TransformerMixin): + """Dimensionality reduction using truncated SVD (aka LSA). + + This transformer performs linear dimensionality reduction by means of + truncated singular value decomposition (SVD). Contrary to PCA, this + estimator does not center the data before computing the singular value + decomposition. This means it can work with scipy.sparse matrices + efficiently. + + In particular, truncated SVD works on term count/tf-idf matrices as + returned by the vectorizers in sklearn.feature_extraction.text. In that + context, it is known as latent semantic analysis (LSA). + + This estimator supports two algorithms: a fast randomized SVD solver, and + a "naive" algorithm that uses ARPACK as an eigensolver on (X * X.T) or + (X.T * X), whichever is more efficient. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default = 2 + Desired dimensionality of output data. + Must be strictly less than the number of features. + The default value is useful for visualisation. For LSA, a value of + 100 is recommended. + + algorithm : string, default = "randomized" + SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy + (scipy.sparse.linalg.svds), or "randomized" for the randomized + algorithm due to Halko (2009). + + n_iter : int, optional (default 5) + Number of iterations for randomized SVD solver. 
Not used by ARPACK. + The default is larger than the default in `randomized_svd` to handle + sparse matrices that may have large slowly decaying spectrum. + + random_state : int, RandomState instance or None, optional, default = None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + tol : float, optional + Tolerance for ARPACK. 0 means machine precision. Ignored by randomized + SVD solver. + + Attributes + ---------- + components_ : array, shape (n_components, n_features) + + explained_variance_ : array, shape (n_components,) + The variance of the training samples transformed by a projection to + each component. + + explained_variance_ratio_ : array, shape (n_components,) + Percentage of variance explained by each of the selected components. + + singular_values_ : array, shape (n_components,) + The singular values corresponding to each of the selected components. + The singular values are equal to the 2-norms of the ``n_components`` + variables in the lower-dimensional space. + + Examples + -------- + >>> from mars.learn.decomposition import TruncatedSVD + >>> import mars.tensor as mt + >>> from sklearn.random_projection import sparse_random_matrix + >>> X = mt.tensor(sparse_random_matrix(100, 100, density=0.01, random_state=42)) + >>> svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42) + >>> svd.fit(X) # doctest: +NORMALIZE_WHITESPACE + TruncatedSVD(algorithm='randomized', n_components=5, n_iter=7, + random_state=42, tol=0.0) + >>> print(svd.explained_variance_ratio_) # doctest: +ELLIPSIS + [0.0606... 0.0584... 0.0497... 0.0434... 0.0372...] + >>> print(svd.explained_variance_ratio_.sum()) # doctest: +ELLIPSIS + 0.249... + >>> print(svd.singular_values_) # doctest: +ELLIPSIS + [2.5841... 2.5245... 2.3201... 2.1753... 2.0443...] + + See also + -------- + PCA + + References + ---------- + Finding structure with randomness: Stochastic algorithms for constructing + approximate matrix decompositions + Halko, et al., 2009 (arXiv:909) https://arxiv.org/pdf/0909.4061.pdf + + Notes + ----- + SVD suffers from a problem called "sign indeterminacy", which means the + sign of the ``components_`` and the output from transform depend on the + algorithm and random state. To work around this, fit instances of this + class to data once, then keep the instance around to do transformations. + + """ + + def __init__( + self, + n_components=2, + algorithm="randomized", + n_iter=5, + random_state=None, + tol=0.0, + ): + self.algorithm = algorithm + self.n_components = n_components + self.n_iter = n_iter + self.random_state = random_state + self.tol = tol + + def fit(self, X, y=None, session=None): + """Fit LSI model on training data X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + session : session to run + + y : Ignored + + Returns + ------- + self : object + Returns the transformer object. + """ + self.fit_transform(X, session=session) + return self + + def fit_transform(self, X, y=None, session=None): + """Fit LSI model to X and perform dimensionality reduction on X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + session : session to run + + y : Ignored + + Returns + ------- + X_new : array, shape (n_samples, n_components) + Reduced version of X. This will always be a dense array. 
+ """ + X = check_array(X, accept_sparse=["csr", "csc"], ensure_min_features=2) + random_state = check_random_state(self.random_state) + + if self.algorithm == "arpack": + # U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol) + # # svds doesn't abide by scipy.linalg.svd/randomized_svd + # # conventions, so reverse its outputs. + # Sigma = Sigma[::-1] + # U, VT = svd_flip(U[:, ::-1], VT[::-1]) + raise NotImplementedError("Does not support arpack for truncated_svd") + + elif self.algorithm == "randomized": + k = self.n_components + n_features = X.shape[1] + if k >= n_features: + raise ValueError( + f"n_components must be < n_features; got {k} >= {n_features}" + ) + U, Sigma, VT = randomized_svd( + X, self.n_components, n_iter=self.n_iter, random_state=random_state + ) + else: + raise ValueError(f"unknown algorithm {self.algorithm!r}") + + self.components_ = VT + + # Calculate explained variance & explained variance ratio + X_transformed = U * Sigma + self.explained_variance_ = exp_var = np.var(X_transformed, axis=0) + full_var = mt.var(X, axis=0).sum() + self.explained_variance_ratio_ = exp_var / full_var + self.singular_values_ = Sigma # Store the singular values. + + to_run_tensors = [ + X_transformed, + self.components_, + self.explained_variance_, + self.explained_variance_ratio_, + self.singular_values_, + ] + + ExecutableTuple(to_run_tensors).execute(session=session) + return X_transformed + + def transform(self, X, session=None): + """Perform dimensionality reduction on X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + New data. + session : session to run + + Returns + ------- + X_new : array, shape (n_samples, n_components) + Reduced version of X. This will always be a dense array. + """ + X = check_array(X, accept_sparse="csr") + ret = mt.dot(X, self.components_.T) + ret.execute(session=session) + return ret + + def inverse_transform(self, X, session=None): + """Transform X back to its original space. + + Returns an array X_original whose transform would be X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_components) + New data. + session : session to run + + Returns + ------- + X_original : array, shape (n_samples, n_features) + Note that this is always a dense array. + """ + X = check_array(X) + ret = mt.dot(X, self.components_) + ret.execute(session=session) + return ret diff --git a/python/xorbits/_mars/learn/decomposition/tests/__init__.py b/python/xorbits/_mars/learn/decomposition/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/learn/decomposition/tests/test_pca.py b/python/xorbits/_mars/learn/decomposition/tests/test_pca.py new file mode 100644 index 000000000..b42ff5368 --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/tests/test_pca.py @@ -0,0 +1,730 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import product + +import numpy as np +import pytest +import scipy as sp +from sklearn import datasets +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_raise_message, + assert_raises, + assert_raises_regex, +) + +from .... import tensor as mt +from .._pca import PCA, _assess_dimension, _infer_dimension + +iris = mt.tensor(datasets.load_iris().data) +# solver_list not includes arpack +solver_list = ["full", "randomized", "auto"] + + +def test_pca(setup): + X = iris + + for n_comp in np.arange(X.shape[1]): + pca = PCA(n_components=n_comp, svd_solver="full") + + X_r = pca.fit(X).transform(X).fetch() + np.testing.assert_equal(X_r.shape[1], n_comp) + + X_r2 = pca.fit_transform(X).fetch() + assert_array_almost_equal(X_r, X_r2) + + X_r = pca.transform(X).fetch() + X_r2 = pca.fit_transform(X).fetch() + assert_array_almost_equal(X_r, X_r2) + + # Test get_covariance and get_precision + cov = pca.get_covariance() + precision = pca.get_precision() + assert_array_almost_equal( + mt.dot(cov, precision).to_numpy(), np.eye(X.shape[1]), 12 + ) + + # test explained_variance_ratio_ == 1 with all components + pca = PCA(svd_solver="full") + pca.fit(X) + np.testing.assert_allclose(pca.explained_variance_ratio_.sum().to_numpy(), 1.0, 3) + + +def test_pca_randomized_solver(setup): + # PCA on dense arrays + X = iris + + # Loop excluding the 0, invalid for randomized + for n_comp in np.arange(1, X.shape[1]): + pca = PCA(n_components=n_comp, svd_solver="randomized", random_state=0) + + X_r = pca.fit(X).transform(X) + np.testing.assert_equal(X_r.shape[1], n_comp) + + X_r2 = pca.fit_transform(X) + assert_array_almost_equal(X_r.fetch(), X_r2.fetch()) + + X_r = pca.transform(X) + assert_array_almost_equal(X_r.fetch(), X_r2.fetch()) + + # Test get_covariance and get_precision + cov = pca.get_covariance() + precision = pca.get_precision() + assert_array_almost_equal( + mt.dot(cov, precision).to_numpy(), mt.eye(X.shape[1]).to_numpy(), 12 + ) + + pca = PCA(n_components=0, svd_solver="randomized", random_state=0) + with pytest.raises(ValueError): + pca.fit(X) + + pca = PCA(n_components=0, svd_solver="randomized", random_state=0) + with pytest.raises(ValueError): + pca.fit(X) + # Check internal state + assert ( + pca.n_components + == PCA(n_components=0, svd_solver="randomized", random_state=0).n_components + ) + assert ( + pca.svd_solver + == PCA(n_components=0, svd_solver="randomized", random_state=0).svd_solver + ) + + +def test_whitening(setup): + # Check that PCA output has unit-variance + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 80 + n_components = 30 + rank = 50 + + # some low 
rank data with correlated features + X = mt.dot( + rng.randn(n_samples, rank), + mt.dot(mt.diag(mt.linspace(10.0, 1.0, rank)), rng.randn(rank, n_features)), + ) + # the component-wise variance of the first 50 features is 3 times the + # mean component-wise variance of the remaining 30 features + X[:, :50] *= 3 + + assert X.shape == (n_samples, n_features) + + # the component-wise variance is thus highly varying: + assert X.std(axis=0).std().to_numpy() > 43.8 + + for solver, copy in product(solver_list, (True, False)): + # whiten the data while projecting to the lower dim subspace + X_ = X.copy() # make sure we keep an original across iterations. + pca = PCA( + n_components=n_components, + whiten=True, + copy=copy, + svd_solver=solver, + random_state=0, + iterated_power=7, + ) + # test fit_transform + X_whitened = pca.fit_transform(X_.copy()) + assert X_whitened.shape == (n_samples, n_components) + X_whitened2 = pca.transform(X_) + assert_array_almost_equal(X_whitened.fetch(), X_whitened2.fetch()) + + assert_almost_equal( + X_whitened.std(ddof=1, axis=0).to_numpy(), np.ones(n_components), decimal=6 + ) + assert_almost_equal(X_whitened.mean(axis=0).to_numpy(), np.zeros(n_components)) + + X_ = X.copy() + pca = PCA( + n_components=n_components, whiten=False, copy=copy, svd_solver=solver + ).fit(X_) + X_unwhitened = pca.transform(X_) + assert X_unwhitened.shape == (n_samples, n_components) + + # in that case the output components still have varying variances + assert_almost_equal(X_unwhitened.std(axis=0).std().to_numpy(), 74.1, 1) + # we always center, so no test for non-centering. + + +def test_explained_variance(setup): + # Check that PCA output has unit-variance + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 80 + + X = mt.tensor(rng.randn(n_samples, n_features)) + + pca = PCA(n_components=2, svd_solver="full").fit(X) + rpca = PCA(n_components=2, svd_solver="randomized", random_state=42).fit(X) + assert_array_almost_equal( + pca.explained_variance_.to_numpy(), rpca.explained_variance_.to_numpy(), 1 + ) + assert_array_almost_equal( + pca.explained_variance_ratio_.to_numpy(), + rpca.explained_variance_ratio_.to_numpy(), + 1, + ) + + # compare to empirical variances + expected_result = np.linalg.eig(np.cov(X.to_numpy(), rowvar=False))[0] + expected_result = sorted(expected_result, reverse=True)[:2] + + X_pca = pca.transform(X) + assert_array_almost_equal( + pca.explained_variance_.to_numpy(), mt.var(X_pca, ddof=1, axis=0).to_numpy() + ) + assert_array_almost_equal(pca.explained_variance_.to_numpy(), expected_result) + + X_rpca = rpca.transform(X) + assert_array_almost_equal( + rpca.explained_variance_.to_numpy(), + mt.var(X_rpca, ddof=1, axis=0).to_numpy(), + decimal=1, + ) + assert_array_almost_equal( + rpca.explained_variance_.to_numpy(), expected_result, decimal=1 + ) + + # Same with correlated data + X = datasets.make_classification( + n_samples, n_features, n_informative=n_features - 2, random_state=rng + )[0] + X = mt.tensor(X) + + pca = PCA(n_components=2).fit(X) + rpca = PCA(n_components=2, svd_solver="randomized", random_state=rng).fit(X) + assert_array_almost_equal( + pca.explained_variance_ratio_.to_numpy(), + rpca.explained_variance_ratio_.to_numpy(), + 5, + ) + + +def test_singular_values(setup): + # Check that the PCA output has the correct singular values + + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 80 + + X = mt.tensor(rng.randn(n_samples, n_features)) + + pca = PCA(n_components=2, svd_solver="full", random_state=rng).fit(X) + rpca = 
PCA(n_components=2, svd_solver="randomized", random_state=rng).fit(X) + assert_array_almost_equal( + pca.singular_values_.fetch(), rpca.singular_values_.fetch(), 1 + ) + + # Compare to the Frobenius norm + X_pca = pca.transform(X) + X_rpca = rpca.transform(X) + assert_array_almost_equal( + mt.sum(pca.singular_values_**2.0).to_numpy(), + (mt.linalg.norm(X_pca, "fro") ** 2.0).to_numpy(), + 12, + ) + assert_array_almost_equal( + mt.sum(rpca.singular_values_**2.0).to_numpy(), + (mt.linalg.norm(X_rpca, "fro") ** 2.0).to_numpy(), + 0, + ) + + # Compare to the 2-norms of the score vectors + assert_array_almost_equal( + pca.singular_values_.fetch(), + mt.sqrt(mt.sum(X_pca**2.0, axis=0)).to_numpy(), + 12, + ) + assert_array_almost_equal( + rpca.singular_values_.fetch(), + mt.sqrt(mt.sum(X_rpca**2.0, axis=0)).to_numpy(), + 2, + ) + + # Set the singular values and see what we get back + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 110 + + X = mt.tensor(rng.randn(n_samples, n_features)) + + pca = PCA(n_components=3, svd_solver="full", random_state=rng) + rpca = PCA(n_components=3, svd_solver="randomized", random_state=rng) + X_pca = pca.fit_transform(X) + + X_pca /= mt.sqrt(mt.sum(X_pca**2.0, axis=0)) + X_pca[:, 0] *= 3.142 + X_pca[:, 1] *= 2.718 + + X_hat = mt.dot(X_pca, pca.components_) + pca.fit(X_hat) + rpca.fit(X_hat) + assert_array_almost_equal(pca.singular_values_.fetch(), [3.142, 2.718, 1.0], 14) + assert_array_almost_equal(rpca.singular_values_.fetch(), [3.142, 2.718, 1.0], 14) + + +def test_pca_check_projection(setup): + # Test that the projection of data is correct + rng = np.random.RandomState(0) + n, p = 100, 3 + X = mt.tensor(rng.randn(n, p) * 0.1) + X[:10] += mt.array([3, 4, 5]) + Xt = 0.1 * mt.tensor(rng.randn(1, p)) + mt.array([3, 4, 5]) + + for solver in solver_list: + Yt = PCA(n_components=2, svd_solver=solver).fit(X).transform(Xt) + Yt /= mt.sqrt((Yt**2).sum()) + + assert_almost_equal(mt.abs(Yt[0][0]).to_numpy(), 1.0, 1) + + +def test_pca_inverse(setup): + # Test that the projection of data can be inverted + rng = np.random.RandomState(0) + n, p = 50, 3 + X = mt.tensor(rng.randn(n, p)) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + # same check that we can find the original data from the transformed + # signal (since the data is almost of rank n_components) + pca = PCA(n_components=2, svd_solver="full").fit(X) + Y = pca.transform(X) + Y_inverse = pca.inverse_transform(Y) + assert_almost_equal(X.to_numpy(), Y_inverse.to_numpy(), decimal=3) + + # same as above with whitening (approximate reconstruction) + for solver in solver_list: + pca = PCA(n_components=2, whiten=True, svd_solver=solver) + pca.fit(X) + Y = pca.transform(X) + Y_inverse = pca.inverse_transform(Y) + assert_almost_equal(X.to_numpy(), Y_inverse.to_numpy(), decimal=3) + + +def test_pca_validation(setup): + for solver in solver_list: + # Ensures that solver-specific extreme inputs for the n_components + # parameter raise errors + X = mt.array([[0, 1, 0], [1, 0, 0]]) + smallest_d = 2 # The smallest dimension + lower_limit = {"randomized": 1, "full": 0, "auto": 0} + + # We conduct the same test on X.T so that it is invariant to axis. + for data in [X, X.T]: + for n_components in [-1, 3]: + if solver == "auto": + solver_reported = "full" + else: + solver_reported = solver + + assert_raises_regex( + ValueError, + f"n_components={n_components}L? must be between " + rf"{lower_limit[solver]}L? 
and min\(n_samples, n_features\)=" + f"{smallest_d}L? with svd_solver='{solver_reported}'", + PCA(n_components, svd_solver=solver).fit, + data, + ) + + n_components = 1.0 + type_ncom = type(n_components) + assert_raise_message( + ValueError, + f"n_components={n_components} must be of type int " + f"when greater than or equal to 1, was of type={type_ncom}", + PCA(n_components, svd_solver=solver).fit, + data, + ) + + +def test_n_components_none(setup): + for solver in solver_list: + # Ensures that n_components == None is handled correctly + X = iris + # We conduct the same test on X.T so that it is invariant to axis. + for data in [X, X.T]: + pca = PCA(svd_solver=solver) + pca.fit(data) + assert pca.n_components_ == min(data.shape) + + +def test_randomized_pca_check_projection(setup): + # Test that the projection by randomized PCA on dense data is correct + rng = np.random.RandomState(0) + n, p = 100, 3 + X = mt.tensor(rng.randn(n, p) * 0.1) + X[:10] += mt.array([3, 4, 5]) + Xt = 0.1 * mt.tensor(rng.randn(1, p)) + mt.array([3, 4, 5]) + + Yt = ( + PCA(n_components=2, svd_solver="randomized", random_state=0) + .fit(X) + .transform(Xt) + ) + Yt /= np.sqrt((Yt**2).sum()) + + assert_almost_equal(mt.abs(Yt[0][0]).to_numpy(), 1.0, 1) + + +def test_randomized_pca_check_list(setup): + # Test that the projection by randomized PCA on list data is correct + X = mt.tensor([[1.0, 0.0], [0.0, 1.0]]) + X_transformed = ( + PCA(n_components=1, svd_solver="randomized", random_state=0).fit(X).transform(X) + ) + assert X_transformed.shape == (2, 1) + assert_almost_equal(X_transformed.mean().to_numpy(), 0.00, 2) + assert_almost_equal(X_transformed.std().to_numpy(), 0.71, 2) + + +def test_randomized_pca_inverse(setup): + # Test that randomized PCA is inversible on dense data + rng = np.random.RandomState(0) + n, p = 50, 3 + X = mt.tensor(rng.randn(n, p)) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + # same check that we can find the original data from the transformed signal + # (since the data is almost of rank n_components) + pca = PCA(n_components=2, svd_solver="randomized", random_state=0).fit(X) + Y = pca.transform(X) + Y_inverse = pca.inverse_transform(Y) + assert_almost_equal(X.to_numpy(), Y_inverse.to_numpy(), decimal=2) + + # same as above with whitening (approximate reconstruction) + pca = PCA(n_components=2, whiten=True, svd_solver="randomized", random_state=0).fit( + X + ) + Y = pca.transform(X) + Y_inverse = pca.inverse_transform(Y) + relative_max_delta = (mt.abs(X - Y_inverse) / mt.abs(X).mean()).max() + assert relative_max_delta.to_numpy() < 1e-5 + + +def test_n_components_mle(setup): + # Ensure that n_components == 'mle' doesn't raise error for auto/full + # svd_solver and raises error for arpack/randomized svd_solver + rng = np.random.RandomState(0) + n_samples = 600 + n_features = 10 + X = mt.tensor(rng.randn(n_samples, n_features)) + n_components_dict = {} + for solver in solver_list: + pca = PCA(n_components="mle", svd_solver=solver) + if solver in ["auto", "full"]: + pca.fit(X) + n_components_dict[solver] = pca.n_components_ + else: # arpack/randomized solver + error_message = ( + f"n_components='mle' cannot be a string with svd_solver='{solver}'" + ) + assert_raise_message(ValueError, error_message, pca.fit, X) + assert n_components_dict["auto"] == n_components_dict["full"] + + +def test_pca_dim(setup): + # Check automated dimensionality setting + rng = np.random.RandomState(0) + n, p = 100, 5 + X = 
mt.tensor(rng.randn(n, p) * 0.1) + X[:10] += mt.array([3, 4, 5, 1, 2]) + pca = PCA(n_components="mle", svd_solver="full").fit(X) + assert pca.n_components == "mle" + assert pca.n_components_ == 1 + + +def test_infer_dim_1(setup): + # TODO: explain what this is testing + # Or at least use explicit variable names... + n, p = 1000, 5 + rng = np.random.RandomState(0) + X = ( + mt.tensor(rng.randn(n, p)) * 0.1 + + mt.tensor(rng.randn(n, 1)) * mt.array([3, 4, 5, 1, 2]) + + mt.array([1, 0, 7, 4, 6]) + ) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_.to_numpy() + ll = np.array([_assess_dimension(spect, k, n) for k in range(1, p)]) + assert ll[1] > ll.max() - 0.01 * n + + +def test_infer_dim_2(setup): + # TODO: explain what this is testing + # Or at least use explicit variable names... + n, p = 1000, 5 + rng = np.random.RandomState(0) + X = mt.tensor(rng.randn(n, p) * 0.1) + X[:10] += mt.array([3, 4, 5, 1, 2]) + X[10:20] += mt.array([6, 0, 7, 2, -1]) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_.fetch() + assert _infer_dimension(spect, n) > 1 + + +def test_infer_dim_3(setup): + n, p = 100, 5 + rng = np.random.RandomState(0) + X = mt.tensor(rng.randn(n, p) * 0.1) + X[:10] += mt.array([3, 4, 5, 1, 2]) + X[10:20] += mt.array([6, 0, 7, 2, -1]) + X[30:40] += 2 * mt.array([-1, 1, -1, 1, -1]) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_.fetch() + assert _infer_dimension(spect, n) > 2 + + +def test_infer_dim_by_explained_variance(setup): + X = iris + pca = PCA(n_components=0.95, svd_solver="full") + pca.fit(X) + assert pca.n_components == 0.95 + assert pca.n_components_ == 2 + + pca = PCA(n_components=0.01, svd_solver="full") + pca.fit(X) + assert pca.n_components == 0.01 + assert pca.n_components_ == 1 + + rng = np.random.RandomState(0) + # more features than samples + X = mt.tensor(rng.rand(5, 20)) + pca = PCA(n_components=0.5, svd_solver="full").fit(X) + assert pca.n_components == 0.5 + assert pca.n_components_ == 2 + + +def test_pca_score(setup): + # Test that probabilistic PCA scoring yields a reasonable score + n, p = 1000, 3 + rng = np.random.RandomState(0) + X = mt.tensor(rng.randn(n, p) * 0.1) + mt.array([3, 4, 5]) + for solver in solver_list: + pca = PCA(n_components=2, svd_solver=solver) + pca.fit(X) + ll1 = pca.score(X) + h = -0.5 * mt.log(2 * mt.pi * mt.exp(1) * 0.1**2) * p + np.testing.assert_almost_equal((ll1 / h).to_numpy(), 1, 0) + + +def test_pca_score2(setup): + # Test that probabilistic PCA correctly separated different datasets + n, p = 100, 3 + rng = np.random.RandomState(0) + X = mt.tensor(rng.randn(n, p) * 0.1) + mt.array([3, 4, 5]) + for solver in solver_list: + pca = PCA(n_components=2, svd_solver=solver) + pca.fit(X) + ll1 = pca.score(X) + ll2 = pca.score(mt.tensor(rng.randn(n, p) * 0.2) + mt.array([3, 4, 5])) + assert ll1.fetch() > ll2.fetch() + + # Test that it gives different scores if whiten=True + pca = PCA(n_components=2, whiten=True, svd_solver=solver) + pca.fit(X) + ll2 = pca.score(X) + assert ll1.fetch() > ll2.fetch() + + +def test_pca_score3(setup): + # Check that probabilistic PCA selects the right model + n, p = 200, 3 + rng = np.random.RandomState(0) + Xl = mt.tensor( + rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7]) + ) + Xt = mt.tensor( + rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7]) + ) + ll = mt.zeros(p) + for k in range(p): + pca = PCA(n_components=k, 
svd_solver="full") + pca.fit(Xl) + ll[k] = pca.score(Xt) + + assert ll.argmax().to_numpy() == 1 + + +def test_pca_score_with_different_solvers(setup): + digits = datasets.load_digits() + X_digits = mt.tensor(digits.data) + + pca_dict = { + svd_solver: PCA(n_components=30, svd_solver=svd_solver, random_state=0) + for svd_solver in solver_list + } + + for pca in pca_dict.values(): + pca.fit(X_digits) + # Sanity check for the noise_variance_. For more details see + # https://github.com/scikit-learn/scikit-learn/issues/7568 + # https://github.com/scikit-learn/scikit-learn/issues/8541 + # https://github.com/scikit-learn/scikit-learn/issues/8544 + assert mt.all((pca.explained_variance_ - pca.noise_variance_) >= 0).to_numpy() + + # Compare scores with different svd_solvers + score_dict = { + svd_solver: pca.score(X_digits).to_numpy() + for svd_solver, pca in pca_dict.items() + } + assert_almost_equal(score_dict["full"], score_dict["randomized"], decimal=3) + + +def test_pca_zero_noise_variance_edge_cases(setup): + # ensure that noise_variance_ is 0 in edge cases + # when n_components == min(n_samples, n_features) + n, p = 100, 3 + + rng = np.random.RandomState(0) + X = mt.tensor(rng.randn(n, p) * 0.1) + mt.array([3, 4, 5]) + # arpack raises ValueError for n_components == min(n_samples, + # n_features) + svd_solvers = ["full", "randomized"] + + for svd_solver in svd_solvers: + pca = PCA(svd_solver=svd_solver, n_components=p) + pca.fit(X) + assert pca.noise_variance_ == 0 + + pca.fit(X.T) + assert pca.noise_variance_ == 0 + + +def test_svd_solver_auto(setup): + rng = np.random.RandomState(0) + X = mt.tensor(rng.uniform(size=(1000, 50))) + + # case: n_components in (0,1) => 'full' + pca = PCA(n_components=0.5) + pca.fit(X) + pca_test = PCA(n_components=0.5, svd_solver="full") + pca_test.fit(X) + assert_array_almost_equal( + pca.components_.to_numpy(), pca_test.components_.to_numpy() + ) + + # case: max(X.shape) <= 500 => 'full' + pca = PCA(n_components=5, random_state=0) + Y = X[:10, :] + pca.fit(Y) + pca_test = PCA(n_components=5, svd_solver="full", random_state=0) + pca_test.fit(Y) + assert_array_almost_equal( + pca.components_.to_numpy(), pca_test.components_.to_numpy() + ) + + # case: n_components >= .8 * min(X.shape) => 'full' + pca = PCA(n_components=50) + pca.fit(X) + pca_test = PCA(n_components=50, svd_solver="full") + pca_test.fit(X) + assert_array_almost_equal( + pca.components_.to_numpy(), pca_test.components_.to_numpy() + ) + + # n_components >= 1 and n_components < .8 * min(X.shape) => 'randomized' + pca = PCA(n_components=10, random_state=0) + pca.fit(X) + pca_test = PCA(n_components=10, svd_solver="randomized", random_state=0) + pca_test.fit(X) + assert_array_almost_equal( + pca.components_.to_numpy(), pca_test.components_.to_numpy() + ) + + +def test_pca_sparse_input(setup): + for svd_solver in solver_list: + X = np.random.RandomState(0).rand(5, 4) + X = mt.tensor(sp.sparse.csr_matrix(X)) + assert X.issparse() is True + + pca = PCA(n_components=3, svd_solver=svd_solver) + + assert_raises(TypeError, pca.fit, X) + + +def test_pca_bad_solver(setup): + X = mt.tensor(np.random.RandomState(0).rand(5, 4)) + pca = PCA(n_components=3, svd_solver="bad_argument") + with pytest.raises(ValueError): + pca.fit(X) + + +def test_pca_dtype_preservation(setup): + for svd_solver in solver_list: + _check_pca_float_dtype_preservation(svd_solver) + _check_pca_int_dtype_upcast_to_double(svd_solver) + + +def _check_pca_float_dtype_preservation(svd_solver): + # Ensure that PCA does not upscale the dtype when 
input is float32 + X_64 = mt.tensor( + np.random.RandomState(0).rand(1000, 4).astype(np.float64, copy=False) + ) + X_32 = X_64.astype(np.float32) + + pca_64 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_64) + pca_32 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_32) + + assert pca_64.components_.dtype == np.float64 + assert pca_32.components_.dtype == np.float32 + assert pca_64.transform(X_64).dtype == np.float64 + assert pca_32.transform(X_32).dtype == np.float32 + + # decimal=5 fails on mac with scipy = 1.1.0 + assert_array_almost_equal( + pca_64.components_.to_numpy(), pca_32.components_.to_numpy(), decimal=4 + ) + + +def _check_pca_int_dtype_upcast_to_double(svd_solver): + # Ensure that all int types will be upcast to float64 + X_i64 = mt.tensor(np.random.RandomState(0).randint(0, 1000, (1000, 4))) + X_i64 = X_i64.astype(np.int64, copy=False) + X_i32 = X_i64.astype(np.int32, copy=False) + + pca_64 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_i64) + pca_32 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_i32) + + assert pca_64.components_.dtype == np.float64 + assert pca_32.components_.dtype == np.float64 + assert pca_64.transform(X_i64).dtype == np.float64 + assert pca_32.transform(X_i32).dtype == np.float64 + + assert_array_almost_equal( + pca_64.components_.to_numpy(), pca_32.components_.to_numpy(), decimal=5 + ) + + +def test_pca_deterministic_output(setup): + rng = np.random.RandomState(0) + X = mt.tensor(rng.rand(10, 10)) + + for solver in solver_list: + transformed_X = np.zeros((20, 2)) + for i in range(20): + pca = PCA(n_components=2, svd_solver=solver, random_state=rng) + transformed_X[i, :] = pca.fit_transform(X)[0].fetch() + np.testing.assert_allclose( + transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2) + ) diff --git a/python/xorbits/_mars/learn/decomposition/tests/test_truncated_svd.py b/python/xorbits/_mars/learn/decomposition/tests/test_truncated_svd.py new file mode 100644 index 000000000..6a53e7e0b --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/tests/test_truncated_svd.py @@ -0,0 +1,166 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sp +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_array_almost_equal, assert_array_less + +from .... import tensor as mt +from .. import TruncatedSVD + +# Make an X that looks somewhat like a small tf-idf matrix. +# XXX newer versions of SciPy >0.16 have scipy.sparse.rand for this. 
+shape = 60, 55 +n_samples, n_features = shape +rng = check_random_state(42) +X = rng.randint(-100, 20, np.product(shape)).reshape(shape) +X = sp.csr_matrix(np.maximum(X, 0), dtype=np.float64) +X.data[:] = 1 + np.log(X.data) +Xdense = X.A +n_samples = n_samples +n_features = n_features + + +def test_attributes(setup): + for n_components in (10, 25, 41): + tsvd = TruncatedSVD(n_components).fit(X) + assert tsvd.n_components == n_components + assert tsvd.components_.shape == (n_components, n_features) + + +def test_too_many_components(setup): + for n_components in (n_features, n_features + 1): + tsvd = TruncatedSVD(n_components=n_components, algorithm="randomized") + with pytest.raises(ValueError): + tsvd.fit(X) + + +def test_sparse_formats(setup): + tsvd = TruncatedSVD(n_components=11) + Xtrans = tsvd.fit_transform(Xdense) + assert Xtrans.shape == (n_samples, 11) + Xtrans = tsvd.transform(Xdense) + assert Xtrans.shape == (n_samples, 11) + + +def test_inverse_transform(setup): + # We need a lot of components for the reconstruction to be "almost + # equal" in all positions. XXX Test means or sums instead? + tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm="randomized") + Xt = tsvd.fit_transform(X) + Xinv = tsvd.inverse_transform(Xt) + assert_array_almost_equal(Xinv.fetch(), Xdense, decimal=1) + + +def test_integers(setup): + Xint = X.astype(np.int64) + tsvd = TruncatedSVD(n_components=6) + Xtrans = tsvd.fit_transform(Xint) + assert Xtrans.shape == (n_samples, tsvd.n_components) + + +def test_explained_variance(setup): + # Test sparse data + svd_r_10_sp = TruncatedSVD(10, algorithm="randomized", random_state=42) + svd_r_20_sp = TruncatedSVD(20, algorithm="randomized", random_state=42) + X_trans_r_10_sp = svd_r_10_sp.fit_transform(X) + X_trans_r_20_sp = svd_r_20_sp.fit_transform(X) + + # Test dense data + svd_r_10_de = TruncatedSVD(10, algorithm="randomized", random_state=42) + svd_r_20_de = TruncatedSVD(20, algorithm="randomized", random_state=42) + X_trans_r_10_de = svd_r_10_de.fit_transform(X.toarray()) + X_trans_r_20_de = svd_r_20_de.fit_transform(X.toarray()) + + # helper arrays for tests below + svds = (svd_r_10_sp, svd_r_20_sp, svd_r_10_de, svd_r_20_de) + svds_trans = ( + (svd_r_10_sp, X_trans_r_10_sp), + (svd_r_20_sp, X_trans_r_20_sp), + (svd_r_10_de, X_trans_r_10_de), + (svd_r_20_de, X_trans_r_20_de), + ) + svds_10_v_20 = ( + (svd_r_10_sp, svd_r_20_sp), + (svd_r_10_de, svd_r_20_de), + ) + svds_sparse_v_dense = ( + (svd_r_10_sp, svd_r_10_de), + (svd_r_20_sp, svd_r_20_de), + ) + + # Assert the 1st component is equal + for svd_10, svd_20 in svds_10_v_20: + assert_array_almost_equal( + svd_10.explained_variance_ratio_.to_numpy(), + svd_20.explained_variance_ratio_[:10].to_numpy(), + decimal=4, + ) + + # Assert that 20 components has higher explained variance than 10 + for svd_10, svd_20 in svds_10_v_20: + assert ( + svd_20.explained_variance_ratio_.sum().to_numpy() + > svd_10.explained_variance_ratio_.sum().to_numpy() + ) + + # Assert that all the values are greater than 0 + for svd in svds: + assert_array_less(0.0, svd.explained_variance_ratio_.to_numpy()) + + # Assert that total explained variance is less than 1 + for svd in svds: + assert_array_less(svd.explained_variance_ratio_.sum().to_numpy(), 1.0) + + # Compare sparse vs. 
dense + for svd_sparse, svd_dense in svds_sparse_v_dense: + assert_array_almost_equal( + svd_sparse.explained_variance_ratio_.to_numpy(), + svd_dense.explained_variance_ratio_.to_numpy(), + ) + + # Test that explained_variance is correct + for svd, transformed in svds_trans: + total_variance = mt.var(X.toarray(), axis=0).sum().to_numpy() + variances = mt.var(transformed, axis=0) + true_explained_variance_ratio = variances / total_variance + + assert_array_almost_equal( + svd.explained_variance_ratio_.to_numpy(), + true_explained_variance_ratio.to_numpy(), + ) + + +def test_singular_values(setup): + # Check that the TruncatedSVD output has the correct singular values + + # Set the singular values and see what we get back + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 110 + + X = rng.randn(n_samples, n_features) + + rpca = TruncatedSVD(n_components=3, algorithm="randomized", random_state=rng) + X_rpca = rpca.fit_transform(X) + + X_rpca /= mt.sqrt(mt.sum(X_rpca**2.0, axis=0)) + X_rpca[:, 0] *= 3.142 + X_rpca[:, 1] *= 2.718 + + X_hat_rpca = mt.dot(X_rpca, rpca.components_) + rpca.fit(X_hat_rpca) + assert_array_almost_equal(rpca.singular_values_.to_numpy(), [3.142, 2.718, 1.0], 14) diff --git a/python/xorbits/_mars/learn/ensemble/__init__.py b/python/xorbits/_mars/learn/ensemble/__init__.py new file mode 100644 index 000000000..565282bff --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._bagging import BaggingClassifier, BaggingRegressor +from ._blockwise import BlockwiseVotingClassifier, BlockwiseVotingRegressor +from ._iforest import IsolationForest diff --git a/python/xorbits/_mars/learn/ensemble/_bagging.py b/python/xorbits/_mars/learn/ensemble/_bagging.py new file mode 100644 index 000000000..ed6bf0032 --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/_bagging.py @@ -0,0 +1,1711 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum +import itertools +import warnings +from collections import defaultdict +from typing import Callable, Iterable, List, Optional, Tuple, Union + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin +from sklearn.base import clone as clone_estimator +from sklearn.utils import check_random_state as sklearn_check_random_state + +from ... import opcodes +from ... 
import tensor as mt +from ...core import OutputType, get_output_types, recursive_tile +from ...core.context import Context +from ...core.operand import OperandStage +from ...dataframe.core import DATAFRAME_TYPE +from ...dataframe.utils import parse_index +from ...deploy.oscar.session import execute +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FieldTypes, + Float32Field, + FunctionField, + Int8Field, + Int64Field, + ReferenceField, + TupleField, +) +from ...tensor.core import TENSOR_CHUNK_TYPE +from ...tensor.random import RandomStateField +from ...tensor.utils import gen_random_seeds +from ...typing import TileableType +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin, LearnShuffleProxy +from ..utils import column_or_1d, convert_to_tensor_or_dataframe +from ..utils.multiclass import check_classification_targets +from ..utils.shuffle import LearnShuffle +from ..utils.validation import check_is_fitted + + +def _extract_bagging_io(io_list: Iterable, op: LearnOperand, output: bool = False): + if not isinstance(io_list, Iterable): + io_list = [io_list] + input_iter = iter(io_list) + out = [ + next(input_iter), + next(input_iter) if op.with_labels else None, + next(input_iter) if op.with_weights else None, + next(input_iter) if output and op.with_feature_indices else None, + ] + return out + + +def _get_by_iloc(x, idx, axis=0): + if hasattr(x, "iloc"): + item_getter = x.iloc + else: + item_getter = x + if axis == 0: + return item_getter[idx] + else: + return item_getter[:, idx] + + +def _concat_on_axis(data_list, axis=0, out_chunk=None): + if isinstance(out_chunk, TENSOR_CHUNK_TYPE): + return np.concatenate(data_list, axis=axis) + else: + return pd.concat(data_list, axis=axis) + + +def _concat_by_row(row, out_chunk=None): + arr = np.empty((1,), dtype=object) + arr[0] = _concat_on_axis(row.tolist(), axis=0, out_chunk=out_chunk) + return arr + + +def _set_random_states(estimator, random_state=None): + random_state = sklearn_check_random_state(random_state) + to_set = {} + for key in sorted(estimator.get_params(deep=True)): + if key == "random_state" or key.endswith("__random_state"): + to_set[key] = random_state.randint(np.iinfo(np.int32).max) + + if to_set: + estimator.set_params(**to_set) + + +def _make_estimator(estimator, random_state=None): + """Make and configure a copy of the `base_estimator_` attribute. + + Warning: This method should be used to properly instantiate new + sub-estimators. 
+ """ + estimator = clone_estimator(estimator) + if random_state is not None: + _set_random_states(estimator, random_state) + return estimator + + +class BaggingSample(LearnShuffle, LearnOperandMixin): + _op_type_ = opcodes.BAGGING_SHUFFLE_SAMPLE + + n_estimators: int = Int64Field("n_estimators") + max_samples = AnyField("max_samples") + max_features = AnyField("max_features") + bootstrap: bool = BoolField("bootstrap") + bootstrap_features: bool = BoolField("bootstrap_features") + + random_state = RandomStateField("random_state") + sample_random_state = RandomStateField("sample_random_state") + feature_random_state = RandomStateField("feature_random_state") + + reducer_ratio: float = Float32Field("reducer_ratio") + column_offset: int = Int64Field("column_offset", default=None) + + chunk_shape: Tuple[int] = TupleField("chunk_shape", FieldTypes.int64) + with_labels: bool = BoolField("with_labels") + with_weights: bool = BoolField("with_weights") + with_feature_indices: bool = BoolField("with_feature_indices") + + def __init__( + self, + max_samples: Union[int, float] = 1.0, + max_features: Union[int, float] = 1.0, + bootstrap: bool = True, + bootstrap_features: bool = False, + random_state: np.random.RandomState = None, + reducer_ratio: float = 1.0, + **kw, + ): + super().__init__( + bootstrap=bootstrap, + bootstrap_features=bootstrap_features, + max_samples=max_samples, + max_features=max_features, + reducer_ratio=reducer_ratio, + random_state=random_state, + **kw, + ) + if self.random_state is None: + self.random_state = np.random.RandomState() + + @property + def output_limit(self) -> int: + if self.stage != OperandStage.map: + return 1 + self.with_labels + self.with_weights + self.with_feature_indices + return 1 + + def __call__( + self, + in_sample: TileableType, + in_labels: Optional[TileableType] = None, + in_weights: Optional[TileableType] = None, + ): + self._output_types = get_output_types(in_sample, in_labels, in_weights) + + self.with_labels = in_labels is not None + self.with_weights = in_weights is not None + axis_keep_shape = [ + isinstance(self.max_samples, float) and self.max_samples == 1.0, + isinstance(self.max_features, float) and self.max_features == 1.0, + ] + self.with_feature_indices = not axis_keep_shape[1] or self.bootstrap_features + if self.with_feature_indices: + self._output_types += (OutputType.tensor,) + + new_shape = tuple( + s if keep_shape else np.nan + for s, keep_shape in zip(in_sample.shape, axis_keep_shape) + ) + + kws = [] + + data_params = in_sample.params + data_params["shape"] = new_shape + kws.append(data_params) + + if in_labels is not None: + labels_params = in_labels.params + labels_params["shape"] = (new_shape[0],) + kws.append(labels_params) + + if in_weights is not None: + weights_params = in_weights.params + weights_params["shape"] = (new_shape[0],) + kws.append(weights_params) + + if self.with_feature_indices: + feature_params = { + "shape": (self.n_estimators, new_shape[1]), + "dtype": np.dtype(int), + } + kws.append(feature_params) + + inputs = [in_sample] + if in_labels is not None: + inputs.append(in_labels) + if in_weights is not None: + inputs.append(in_weights) + + return self.new_tileables(inputs, kws=kws) + + @classmethod + def _scatter_samples( + cls, + max_samples: Union[int, float], + nsplits: Tuple[int], + random_state: np.random.RandomState, + n_estimators: int, + ) -> np.ndarray: + nsp_array = np.array(nsplits) + dim_size = nsp_array.sum() + if isinstance(max_samples, int): + expect_sample_count = max_samples + else: + 
expect_sample_count = int(max_samples * nsp_array.sum()) + + if expect_sample_count == dim_size: + return np.array([list(nsplits)] * n_estimators) + + split_probs = nsp_array / dim_size + return random_state.multinomial( + expect_sample_count, split_probs, size=n_estimators + ) + + @classmethod + def tile(cls, op: "BaggingSample"): + in_sample, in_labels, in_weights, _ = _extract_bagging_io( + op.inputs, op, output=False + ) + out_data, out_labels, out_weights, out_feature_indices = _extract_bagging_io( + op.outputs, op, output=True + ) + + # make sure all shapes are computed + if ( + has_unknown_shape(in_sample) + or (in_labels is not None and has_unknown_shape(in_labels)) + or (in_weights is not None and has_unknown_shape(in_weights)) + ): + yield + + to_tile = [] + if in_labels is not None: + in_labels = in_labels.rechunk({0: in_sample.nsplits[0]}) + to_tile.append(in_labels) + if in_weights is not None: + in_weights = in_weights.rechunk({0: in_sample.nsplits[0]}) + to_tile.append(in_weights) + + # tile rechunks + if to_tile: + tiled = yield from recursive_tile(to_tile) + tiled_iter = iter(tiled) + if in_labels is not None: + in_labels = next(tiled_iter) + if in_weights is not None: + in_weights = next(tiled_iter) + + random_seeds = [ + gen_random_seeds(n, op.random_state) for n in in_sample.chunk_shape + ] + + axis_keep_shape = [ + isinstance(op.max_samples, float) + and op.max_samples == 1.0 + and not op.bootstrap, + isinstance(op.max_features, float) + and op.max_features == 1.0 + and not op.bootstrap_features, + ] + + n_reducers = ( + op.n_reducers + if getattr(op, "n_reducers", None) + else max(1, int(in_sample.chunk_shape[0] * op.reducer_ratio)) + ) + + # todo implement sampling without replacements + map_chunks = [] + max_samples_splits = cls._scatter_samples( + op.max_samples, in_sample.nsplits[0], op.random_state, op.n_estimators + ) + max_features_splits = cls._scatter_samples( + op.max_features, in_sample.nsplits[1], op.random_state, op.n_estimators + ) + + column_cum_offset = np.concatenate([[0], np.cumsum(in_sample.nsplits[1])]) + for chunk in in_sample.chunks: + new_op = op.copy().reset_key() + new_op.random_state = None + new_op.sample_random_state = np.random.RandomState( + random_seeds[0][chunk.index[0]] + ) + new_op.feature_random_state = np.random.RandomState( + random_seeds[1][chunk.index[1]] + ) + new_op.stage = OperandStage.map + new_op.max_samples = max_samples_splits[:, chunk.index[0]] + new_op.max_features = max_features_splits[:, chunk.index[1]] + new_op.n_reducers = n_reducers + new_op.column_offset = int(column_cum_offset[chunk.index[1]]) + + if chunk.index[0] != 0: + new_op.with_feature_indices = False + + if chunk.index[1] != in_sample.chunk_shape[1] - 1: + new_op.with_weights = False + new_op.with_labels = False + + params = chunk.params + params["shape"] = tuple( + s if keep_shape else np.nan + for s, keep_shape in zip(chunk.shape, axis_keep_shape) + ) + + input_chunks = [chunk] + if new_op.with_labels: + input_chunks.append(in_labels.cix[chunk.index[0]]) + if new_op.with_weights: + input_chunks.append(in_weights.cix[chunk.index[0]]) + map_chunks.append(new_op.new_chunk(input_chunks, **params)) + + shuffle_op = LearnShuffleProxy(output_types=[OutputType.tensor]).new_chunk( + map_chunks, dtype=np.dtype(int), shape=() + ) + + remain_reducers = op.n_estimators % n_reducers + reduce_data_chunks = [] + reduce_labels_chunks = [] + reduce_weights_chunks = [] + reduce_feature_chunks = [] + for idx in range(n_reducers): + new_op = op.copy().reset_key() + 
new_op.random_state = None + new_op.stage = OperandStage.reduce + new_op.reducer_ordinal = idx + new_op.n_reducers = n_reducers + new_op.chunk_shape = in_sample.chunk_shape + new_op.n_estimators = op.n_estimators // n_reducers + if remain_reducers: + remain_reducers -= 1 + new_op.n_estimators += 1 + + if new_op.n_estimators == 0: + continue + + kws = [] + + data_params = out_data.params + data_params["index"] = (idx, 0) + data_params["shape"] = (np.nan, out_data.shape[1]) + kws.append(data_params) + + if op.with_labels: + labels_params = out_labels.params + labels_params["index"] = (idx,) + labels_params["shape"] = (np.nan,) + kws.append(labels_params) + + if op.with_weights: + weights_params = out_weights.params + weights_params["index"] = (idx,) + weights_params["shape"] = (np.nan,) + kws.append(weights_params) + + if op.with_feature_indices: + feature_params = { + "index": (idx, 0), + "shape": (new_op.n_estimators, out_feature_indices.shape[1]), + "dtype": np.dtype(int), + } + kws.append(feature_params) + + chunks = new_op.new_chunks([shuffle_op], kws=kws) + ( + data_chunk, + labels_chunk, + weights_chunk, + feature_chunk, + ) = _extract_bagging_io(chunks, op, output=True) + + reduce_data_chunks.append(data_chunk) + if labels_chunk is not None: + reduce_labels_chunks.append(labels_chunk) + if weights_chunk is not None: + reduce_weights_chunks.append(weights_chunk) + if feature_chunk is not None: + reduce_feature_chunks.append(feature_chunk) + + new_op = op.copy().reset_key() + + kws = [ + { + "chunks": reduce_data_chunks, + "nsplits": ((np.nan,) * len(reduce_data_chunks), (out_data.shape[1],)), + **out_data.params, + } + ] + if op.with_labels: + kws.append( + { + "chunks": reduce_labels_chunks, + "nsplits": ((np.nan,) * len(reduce_data_chunks),), + **out_labels.params, + } + ) + if op.with_weights: + kws.append( + { + "chunks": reduce_weights_chunks, + "nsplits": ((np.nan,) * len(reduce_data_chunks),), + **out_weights.params, + } + ) + if op.with_feature_indices: + estimator_nsplit = tuple(c.op.n_estimators for c in reduce_data_chunks) + kws.append( + { + "chunks": reduce_feature_chunks, + "nsplits": (estimator_nsplit, (out_feature_indices.shape[1],)), + **out_feature_indices.params, + } + ) + return new_op.new_tileables(op.inputs, kws=kws) + + @classmethod + def _gen_sample_indices( + cls, + max_range: int, + size: int, + random_state: np.random.RandomState, + with_replacement: bool = False, + ): + if not with_replacement: + result = random_state.choice(np.arange(max_range), size, False) + else: + result = random_state.randint(0, max_range - 1, size) + result.sort() + return result + + @classmethod + def _execute_map(cls, ctx, op: "BaggingSample"): + in_sample, in_labels, in_weights, _ = _extract_bagging_io( + op.inputs, op, output=False + ) + in_sample_data = ctx[in_sample.key] + in_labels_data = ctx[in_labels.key] if op.with_labels else None + in_weights_data = ctx[in_weights.key] if op.with_weights else None + out_samples = op.outputs[0] + + remains = op.n_estimators % op.n_reducers + reducer_iters = [ + itertools.repeat(idx, 1 + op.n_estimators // op.n_reducers) + for idx in range(remains) + ] + reducer_iters += [ + itertools.repeat(idx, op.n_estimators // op.n_reducers) + for idx in range(remains, op.n_reducers) + ] + reducer_iter = itertools.chain(*reducer_iters) + + result_store = defaultdict(lambda: ([], [], [], [])) + for est_id in range(op.n_estimators): + sampled_data = in_sample_data + sampled_labels = in_labels_data + sampled_weights = in_weights_data + + if 
op.max_samples[est_id] != in_sample_data.shape[0]: + sample_indices = cls._gen_sample_indices( + in_sample_data.shape[0], + op.max_samples[est_id], + op.sample_random_state, + op.bootstrap, + ) + + sampled_data = _get_by_iloc(sampled_data, sample_indices) + if sampled_labels is not None: + sampled_labels = _get_by_iloc(sampled_labels, sample_indices) + if sampled_weights is not None: + sampled_weights = _get_by_iloc(sampled_weights, sample_indices) + + if op.max_features[est_id] != in_sample_data.shape[1]: + feature_indices = cls._gen_sample_indices( + in_sample_data.shape[1], + op.max_features[est_id], + op.feature_random_state, + op.bootstrap_features, + ) + + sampled_data = _get_by_iloc(sampled_data, feature_indices, axis=1) + if not op.with_feature_indices: + feature_indices = None + else: + feature_indices = None + + samples, labels, weights, feature_idx_array = result_store[ + next(reducer_iter) + ] + samples.append(sampled_data) + if sampled_labels is not None: + labels.append(sampled_labels) + if sampled_weights is not None: + weights.append(sampled_weights) + if feature_indices is not None: + feature_idx_array.append(feature_indices + op.column_offset) + + for ( + reducer_id, + ( + samples, + labels, + weights, + feature_idx_array, + ), + ) in result_store.items(): + ctx[out_samples.key, (reducer_id, 0)] = ( + ctx.get_current_chunk().index, + tuple(samples + labels + weights + feature_idx_array), + ) + + @classmethod + def _execute_reduce(cls, ctx, op: "BaggingSample"): + out_data, out_labels, out_weights, out_feature_indices = _extract_bagging_io( + op.outputs, op, output=True + ) + sample_holder = [ + np.empty(op.chunk_shape, dtype=object) for _ in range(op.n_estimators) + ] + + labels_holder = ( + [np.empty(op.chunk_shape[0], dtype=object) for _ in range(op.n_estimators)] + if op.with_labels + else None + ) + + weights_holder = ( + [np.empty(op.chunk_shape[0], dtype=object) for _ in range(op.n_estimators)] + if op.with_weights + else None + ) + + feature_indices_holder = ( + [np.empty(op.chunk_shape[1], dtype=object) for _ in range(op.n_estimators)] + if op.with_feature_indices + else None + ) + + input_indexes = [idx for idx, _ in op.iter_mapper_data(ctx)] + for input_key, input_idx in zip(op.iter_mapper_keys(), input_indexes): + add_feature_index = input_idx[0] == 0 + add_label_weight = input_idx[1] == op.chunk_shape[1] - 1 + chunk_data = ctx[input_key, out_data.index][-1] + + num_groups = 1 + if add_feature_index and op.with_feature_indices: + # contains feature indices + num_groups += 1 + if add_label_weight: # contains label or weight + num_groups += int(op.with_weights) + int(op.with_labels) + + sample_count = len(chunk_data) // num_groups + assert len(chunk_data) % num_groups == 0 + + group_iter = ( + chunk_data[i * sample_count : (i + 1) * sample_count] + for i in range(num_groups) + ) + + for data_idx, sample in enumerate(next(group_iter)): + sample_holder[data_idx][input_idx] = sample + + if add_label_weight: + if op.with_labels: + for data_idx, label in enumerate(next(group_iter)): + labels_holder[data_idx][input_idx[0]] = label + if op.with_weights: + for data_idx, weight in enumerate(next(group_iter)): + weights_holder[data_idx][input_idx[0]] = weight + + if add_feature_index and op.with_feature_indices: + for data_idx, feature_index in enumerate(next(group_iter)): + feature_indices_holder[data_idx][input_idx[1]] = feature_index + + data_results: List[Optional[np.ndarray]] = [None] * len(sample_holder) + for est_idx, sample_mat in enumerate(sample_holder): + 
row_chunks = np.apply_along_axis( + _concat_by_row, axis=0, arr=sample_mat, out_chunk=out_data + ) + data_results[est_idx] = _concat_on_axis( + row_chunks[0].tolist(), axis=1, out_chunk=out_data + ) + ctx[out_data.key] = tuple(data_results) + + for out, holder in zip( + (out_labels, out_weights, out_feature_indices), + (labels_holder, weights_holder, feature_indices_holder), + ): + if out is None: + continue + results: List[Optional[np.ndarray]] = [None] * len(holder) + for est_idx, labels_vct in enumerate(holder): + results[est_idx] = _concat_on_axis(labels_vct.tolist(), out_chunk=out) + if holder is feature_indices_holder: + ctx[out.key] = np.stack(results) + else: + ctx[out.key] = tuple(results) + + @classmethod + def execute(cls, ctx, op: "BaggingSample"): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) + + +class BaggingSampleReindex(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.BAGGING_SHUFFLE_REINDEX + + n_estimators: int = Int64Field("n_estimators") + feature_indices: TileableType = ReferenceField("feature_indices", default=None) + + start_col_index: int = Int64Field("start_col_index", 0) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.feature_indices is not None: + self.feature_indices = inputs[-1] + + def __call__(self, data: TileableType, feature_indices: TileableType = None): + self._output_types = get_output_types(data) + inputs = [data] + self.feature_indices = feature_indices + params = data.params + if feature_indices is not None: + inputs.append(feature_indices) + params["shape"] = (data.shape[0], np.nan) + if isinstance(data, DATAFRAME_TYPE): + params["index_value"] = parse_index(pd.Index([], dtype=np.int64), data.key) + return self.new_tileable(inputs, **params) + + @classmethod + def tile(cls, op: "BaggingSampleReindex"): + t_data = op.inputs[0] + t_out = op.outputs[0] + t_feature_idxes = op.feature_indices + cum_nsplits = np.cumsum(np.concatenate([[0], t_data.nsplits[1]])) + + if t_feature_idxes is None: + out = t_data + if out.chunk_shape[1] > 1: + out = yield from recursive_tile(out.rechunk({1: (out.shape[1],)})) + return out + + # generate map chunks + map_holder = np.empty( + t_data.chunk_shape + (t_feature_idxes.chunk_shape[0],), + dtype=np.dtype(object), + ) + for chunk in t_data.chunks: + for feature_idx_chunk in t_feature_idxes.chunks: + new_op = op.copy().reset_key() + new_op.stage = OperandStage.map + new_op.start_col_index = int(cum_nsplits[chunk.index[1]]) + params = chunk.params + new_index = params["index"] = chunk.index + ( + feature_idx_chunk.index[0], + ) + if t_feature_idxes.chunk_shape[0] == 1: + new_index = new_index[:-1] + map_holder[new_index] = new_op.new_chunk( + [chunk, feature_idx_chunk], **params + ) + if op.feature_indices.chunk_shape[0] == 1: + chunks = map_holder.reshape((t_data.chunk_shape[0],)).tolist() + else: + + def _gen_combine_chunk(chunks): + new_op = op.copy().reset_key() + new_op.feature_indices = None + new_op.stage = OperandStage.combine + params = chunks[0].params + params["shape"] = (chunks[0].shape[0], op.feature_indices.shape[1]) + params["index"] = (chunks[0].index[0], chunks[0].index[2]) + if isinstance(t_data, DATAFRAME_TYPE): + params["index_value"] = parse_index( + pd.Index([], dtype=np.int64), chunks[0].key + ) + inputs = chunks.tolist() + return new_op.new_chunk(inputs, **params) + + chunks_array = np.apply_along_axis(_gen_combine_chunk, 1, map_holder) + chunks = chunks_array.reshape((chunks_array.size,)).tolist() + + new_op = 
op.copy().reset_key() + new_nsplits = ( + t_data.nsplits[0], + (op.feature_indices.shape[1],) * t_feature_idxes.chunk_shape[0], + ) + return new_op.new_tileables( + op.inputs, chunks=chunks, nsplits=new_nsplits, **t_out.params + ) + + @classmethod + def _execute_map(cls, ctx, op: "BaggingSampleReindex"): + data = ctx[op.inputs[0].key] + feature_idx = ctx[op.feature_indices.key] - op.start_col_index + filtered = [] + for row in feature_idx: + row = row[(row >= 0) & (row < data.shape[1])] + filtered.append(_get_by_iloc(data, row, axis=1)) + ctx[op.outputs[0].key] = tuple(filtered) + + @classmethod + def _execute_combine(cls, ctx, op: "BaggingSampleReindex"): + data_inputs = [ctx[c.key] for c in op.inputs] + concatenated = [] + for data_input in zip(*data_inputs): + concatenated.append(_concat_on_axis(data_input, 1, op.inputs[0])) + ctx[op.outputs[0].key] = tuple(concatenated) + + @classmethod + def execute(cls, ctx, op: "BaggingSampleReindex"): + if op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + else: + cls._execute_map(ctx, op) + + +class BaggingFitOperand(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.BAGGING_FIT + + base_estimator: BaseEstimator = AnyField("base_estimator") + estimator_params: dict = DictField("estimator_params", default=None) + n_estimators: int = Int64Field("n_estimators") + max_samples = AnyField("max_samples", default=1.0) + max_features = AnyField("max_features", default=1.0) + bootstrap: bool = BoolField("bootstrap", default=False) + bootstrap_features: bool = BoolField("bootstrap_features", default=True) + random_state = RandomStateField("random_state", default=None) + + reducer_ratio: float = Float32Field("reducer_ratio") + n_reducers: int = Int64Field("n_reducers") + + labels: TileableType = ReferenceField("labels", default=None) + weights: TileableType = ReferenceField("weights", default=None) + feature_indices: TileableType = ReferenceField("feature_indices", default=None) + with_feature_indices: bool = BoolField("with_feature_indices", default=None) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.random_state is None: + self.random_state = np.random.RandomState() + if self.with_feature_indices is None: + full_features = ( + isinstance(self.max_features, float) and self.max_features == 1.0 + ) + self.with_feature_indices = not full_features or self.bootstrap_features + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + + input_iter = iter(inputs) + next(input_iter) + if self.labels is not None: + self.labels = next(input_iter) + if self.weights is not None: + self.weights = next(input_iter) + if self.feature_indices is not None: + self.feature_indices = next(input_iter) + + def _get_bagging_sample_tileables(self, samples=None): + samples = samples or self.inputs[0] + sample_op = BaggingSample( + n_estimators=self.n_estimators, + max_samples=self.max_samples, + max_features=self.max_features, + bootstrap=self.bootstrap, + bootstrap_features=self.bootstrap_features, + random_state=self.random_state, + reducer_ratio=self.reducer_ratio, + n_reducers=self.n_reducers, + with_weights=self.weights is not None, + with_labels=self.labels is not None, + with_feature_indices=self.with_feature_indices, + ) + return _extract_bagging_io( + sample_op(samples, self.labels, self.weights), sample_op, output=True + ) + + @property + def output_limit(self) -> int: + if self.with_feature_indices: + return 2 + return 1 + + def __call__( + self, + in_data: TileableType, + in_labels: Optional[TileableType] = None, + 
in_weights: Optional[TileableType] = None, + feature_indices: TileableType = None, + ): + self._output_types = [OutputType.tensor] + inputs = [in_data] + + if in_labels is not None: + self.labels = in_labels + inputs.append(in_labels) + if in_weights is not None: + self.weights = in_weights + inputs.append(in_weights) + + if feature_indices is not None: + self.feature_indices = feature_indices + inputs.append(feature_indices) + + kws = [dict(shape=(self.n_estimators,), dtype=np.dtype(object))] + if self.with_feature_indices: + self._output_types.append(OutputType.tensor) + sample_tileables = self._get_bagging_sample_tileables(in_data) + kws.append(sample_tileables[-1].params) + + return self.new_tileables(inputs, kws=kws) + + @classmethod + def tile(cls, op: "BaggingFitOperand"): + out = op.outputs[0] + sample_tileables = op._get_bagging_sample_tileables() + tiled_sample_iter = iter( + ( + yield from recursive_tile( + tuple(t for t in sample_tileables if t is not None) + ) + ) + ) + sampled, labels, weights, feature_indices = ( + t if t is None else next(tiled_sample_iter) for t in sample_tileables + ) + + estimator_nsplits = (tuple(c.op.n_estimators for c in sampled.chunks),) + + label_chunks = itertools.repeat(None) if labels is None else labels.chunks + weight_chunks = itertools.repeat(None) if weights is None else weights.chunks + + out_chunks = [] + seeds = gen_random_seeds(len(sampled.chunks), op.random_state) + for sample_chunk, label_chunk, weight_chunk, n_estimators in zip( + sampled.chunks, label_chunks, weight_chunks, estimator_nsplits[0] + ): + chunk_op = BaggingFitOperand( + base_estimator=op.base_estimator, + estimator_params=op.estimator_params, + labels=label_chunk, + weights=weight_chunk, + n_estimators=n_estimators, + with_feature_indices=False, + random_state=sklearn_check_random_state(seeds[sample_chunk.index[0]]), + ) + chunk_op._output_types = op._output_types + inputs = [ + c for c in [sample_chunk, label_chunk, weight_chunk] if c is not None + ] + out_chunks.append( + chunk_op.new_chunk( + inputs, + index=(sample_chunk.index[0],), + shape=(n_estimators,), + dtype=out.dtype, + ) + ) + + out_op = op.copy().reset_key() + kws = [ + dict(chunks=out_chunks, nsplits=estimator_nsplits, **out.params), + ] + if feature_indices is not None: + kws.append( + dict( + chunks=feature_indices.chunks, + nsplits=feature_indices.nsplits, + **feature_indices.params, + ) + ) + return out_op.new_tileables(op.inputs, kws=kws, output_limit=len(kws)) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "BaggingFitOperand"): + sampled_data = ctx[op.inputs[0].key] + labels_data = ( + ctx[op.labels.key] if op.labels is not None else itertools.repeat(None) + ) + weights_data = ( + ctx[op.weights.key] if op.weights is not None else itertools.repeat(None) + ) + + for k, v in (op.estimator_params or dict()).items(): + setattr(op.base_estimator, k, v) + + new_estimators = [] + seeds = gen_random_seeds(len(sampled_data), op.random_state) + for idx, (sampled, label, weights) in enumerate( + zip(sampled_data, labels_data, weights_data) + ): + estimator = _make_estimator(op.base_estimator, seeds[idx]) + estimator.fit(sampled, y=label, sample_weight=weights) + new_estimators.append(estimator) + ctx[op.outputs[0].key] = np.array(new_estimators, dtype=np.dtype(object)) + + +class PredictionType(enum.Enum): + REGRESSION = 0 + PROBABILITY = 1 + LOG_PROBABILITY = 2 + DECISION_FUNCTION = 3 + + +class BaggingPredictionOperand(LearnOperand, LearnOperandMixin): + _op_type_ = 
opcodes.BAGGING_PREDICTION + + estimators: TileableType = ReferenceField("estimators") + feature_indices: TileableType = ReferenceField("feature_indices", default=None) + n_classes: Optional[int] = Int64Field("n_classes", default=None) + prediction_type: PredictionType = Int8Field( + "prediction_type", + on_serialize=lambda x: x.value, + on_deserialize=PredictionType, + default=PredictionType.PROBABILITY, + ) + decision_function: Callable = FunctionField("decision_function", default=None) + calc_means: bool = BoolField("calc_means", default=True) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(inputs[1:]) + self.estimators = next(input_iter) + if self.feature_indices is not None: + self.feature_indices = next(input_iter) + + def __call__( + self, + instances: TileableType, + estimators: TileableType, + feature_indices: TileableType = None, + ) -> TileableType: + self._output_types = [OutputType.tensor] + self.estimators = estimators + self.feature_indices = feature_indices + + if self.n_classes is not None: + shape = (instances.shape[0], estimators.shape[0], self.n_classes) + else: + shape = (instances.shape[0], estimators.shape[0]) + if self.calc_means: + shape = (shape[0],) + shape[2:] + + params = {"dtype": np.dtype(float), "shape": shape} + inputs = [instances, estimators] + if feature_indices is not None: + inputs.append(feature_indices) + return self.new_tileable(inputs, **params) + + def _get_class_shape(self): + if self.n_classes and self.n_classes > 2: + return self.n_classes + elif self.prediction_type == PredictionType.DECISION_FUNCTION: + return None + else: + return self.n_classes + + @classmethod + def _build_chunks_without_feature_indices( + cls, op: "BaggingPredictionOperand", t_instances: TileableType + ): + class_shape = op._get_class_shape() + chunks = [] + for c_instance in t_instances.chunks: + for c_estimator in op.estimators.chunks: + if class_shape is not None: + params = { + "dtype": np.dtype(float), + "shape": ( + c_instance.shape[0], + class_shape, + c_estimator.shape[0], + ), + "index": (c_instance.index[0], 0, c_estimator.index[0]), + } + else: + params = { + "dtype": np.dtype(float), + "shape": (c_instance.shape[0], c_estimator.shape[0]), + "index": (c_instance.index[0], c_estimator.index[0]), + } + new_op = op.copy().reset_key() + new_op.feature_indices = None + chunks.append(new_op.new_chunk([c_instance, c_estimator], **params)) + return chunks + + @classmethod + def _build_chunks_with_feature_indices( + cls, op: "BaggingPredictionOperand", t_instances: TileableType + ): + class_shape = op._get_class_shape() + chunks = [] + for c in t_instances.chunks: + estimator_chunk = op.estimators.chunks[c.index[1]] + + if class_shape is not None: + params = { + "dtype": np.dtype(float), + "shape": (c.shape[0], class_shape, estimator_chunk.shape[0]), + "index": (c.index[0], 0, c.index[1]), + } + else: + params = { + "dtype": np.dtype(float), + "shape": (c.shape[0], estimator_chunk.shape[0]), + "index": c.index, + } + + new_op = op.copy().reset_key() + new_op.feature_indices = None + chunks.append(new_op.new_chunk([c, estimator_chunk], **params)) + return chunks + + @classmethod + def tile(cls, op: "BaggingPredictionOperand"): + n_estimators = op.estimators.shape[0] + reindex_op = BaggingSampleReindex(n_estimators=n_estimators) + t_instances = yield from recursive_tile( + reindex_op(op.inputs[0], op.feature_indices) + ) + + # for classifiers, form instance-class-estimator array + # for regressors, form instance-estimator array + # and 
then sum over estimator axis + + if op.feature_indices is None: + chunks = cls._build_chunks_without_feature_indices(op, t_instances) + else: + chunks = cls._build_chunks_with_feature_indices(op, t_instances) + + new_op = op.copy().reset_key() + class_shape = op._get_class_shape() + if class_shape is not None: + params = { + "dtype": np.dtype(float), + "shape": (t_instances.shape[0], class_shape, n_estimators), + } + nsplits = (t_instances.nsplits[0], (class_shape,), op.estimators.nsplits[0]) + else: + params = { + "dtype": np.dtype(float), + "shape": (t_instances.shape[0], n_estimators), + } + nsplits = (t_instances.nsplits[0], op.estimators.nsplits[0]) + estimator_probas = new_op.new_tileable( + op.inputs, chunks=chunks, nsplits=nsplits, **params + ) + + if not op.calc_means: + return estimator_probas + elif op.prediction_type != PredictionType.LOG_PROBABILITY: + return [ + ( + yield from recursive_tile( + mt.sum(estimator_probas, axis=-1) / n_estimators + ) + ) + ] + else: + return [ + ( + yield from recursive_tile( + mt.log(mt.exp(estimator_probas).sum(axis=-1)) + - np.log(n_estimators) + ) + ) + ] + + @classmethod + def _predict_proba(cls, instance, estimator, n_classes): + n_samples = instance.shape[0] + proba = np.zeros((n_samples, n_classes)) + + if hasattr(estimator, "predict_proba"): + proba_estimator = estimator.predict_proba(instance) + if n_classes == len(estimator.classes_): + proba += proba_estimator + + else: + proba[:, estimator.classes_] += proba_estimator[ + :, range(len(estimator.classes_)) + ] + else: + # Resort to voting + predictions = estimator.predict(instance) + for i in range(n_samples): + proba[i, predictions[i]] += 1 + return proba + + @classmethod + def _predict_log_proba(cls, instance, estimator, n_classes): + """Private function used to compute log probabilities within a job.""" + if not hasattr(estimator, "predict_log_proba"): + return np.log(cls._predict_proba(instance, estimator, n_classes)) + + n_samples = instance.shape[0] + log_proba = np.empty((n_samples, n_classes)) + log_proba.fill(-np.inf) + all_classes = np.arange(n_classes, dtype=int) + + log_proba_estimator = estimator.predict_log_proba(instance) + + if n_classes == len(estimator.classes_): + log_proba = np.logaddexp(log_proba, log_proba_estimator) + else: # pragma: no cover + log_proba[:, estimator.classes_] = np.logaddexp( + log_proba[:, estimator.classes_], + log_proba_estimator[:, range(len(estimator.classes_))], + ) + missing = np.setdiff1d(all_classes, estimator.classes_) + log_proba[:, missing] = np.logaddexp(log_proba[:, missing], -np.inf) + return log_proba + + @classmethod + def _decision_function(cls, instance, estimator, func=None): + if func is not None: + return func(instance, estimator) + else: + return estimator.decision_function(instance) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "BaggingPredictionOperand"): + instances = ctx[op.inputs[0].key] + estimators = ctx[op.estimators.key] + if not isinstance(instances, tuple): + instances = [instances] * len(estimators) + + estimate_results = [] + for instance, estimator in zip(instances, estimators): + # classifier + if op.prediction_type == PredictionType.PROBABILITY: + estimate_results.append( + cls._predict_proba(instance, estimator, op.n_classes) + ) + elif op.prediction_type == PredictionType.LOG_PROBABILITY: + estimate_results.append( + cls._predict_log_proba(instance, estimator, op.n_classes) + ) + elif op.prediction_type == PredictionType.DECISION_FUNCTION: + estimate_results.append( + 
cls._decision_function(instance, estimator, op.decision_function) + ) + else: + estimate_results.append(estimator.predict(instance)) + + out = op.outputs[0] + ctx[out.key] = np.stack(estimate_results, axis=out.ndim - 1) + + +class BaseBagging: + def __init__( + self, + base_estimator=None, + n_estimators=10, + *, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + n_jobs=None, + random_state=None, + verbose=0, + reducers=1.0, + ): + self.base_estimator = base_estimator + self.n_estimators = n_estimators + + self.max_samples = max_samples + self.max_features = max_features + self.bootstrap = bootstrap + self.bootstrap_features = bootstrap_features + self.oob_score = oob_score + self.warm_start = warm_start + self.n_jobs = n_jobs + self.random_state = ( + np.random.RandomState(random_state) + if isinstance(random_state, int) + else random_state + ) + self.verbose = verbose + self.reducers = reducers + + self.estimators_ = None + self.estimator_features_ = None + + def _validate_y(self, y, session=None, run_kwargs=None): + if len(y.shape) == 1 or y.shape[1] == 1: + return column_or_1d(y, warn=True) + else: + return y + + def _fit( + self, + X, + y=None, + sample_weight=None, + max_samples=None, + estimator_params=None, + session=None, + run_kwargs=None, + ): + estimator_features, feature_indices = None, None + n_more_estimators = self.n_estimators + + X = convert_to_tensor_or_dataframe(X) + y = convert_to_tensor_or_dataframe(y) if y is not None else None + sample_weight = ( + convert_to_tensor_or_dataframe(sample_weight) + if sample_weight is not None + else None + ) + + y = self._validate_y(y) + + if self.warm_start: + feature_indices = self.estimator_features_ + if self.estimators_ is not None: + exist_estimators = self.estimators_.shape[0] + # move random states to skip duplicated results + self.random_state.rand(exist_estimators) + n_more_estimators = self.n_estimators - exist_estimators + + if n_more_estimators < 0: + raise ValueError( + "n_estimators=%d must be larger or equal to " + "len(estimators_)=%d when warm_start==True" + % (self.n_estimators, self.estimators_.shape[0]) + ) + elif n_more_estimators == 0: + warnings.warn( + "Warm-start fitting without increasing n_estimators does not " + "fit new trees." + ) + return self + + fit_op = BaggingFitOperand( + base_estimator=self.base_estimator, + estimator_params=estimator_params, + n_estimators=n_more_estimators, + max_samples=max_samples or self.max_samples, + max_features=self.max_features, + bootstrap=self.bootstrap, + bootstrap_features=self.bootstrap_features, + random_state=self.random_state, + reducer_ratio=self.reducers if isinstance(self.reducers, float) else None, + n_reducers=self.reducers if isinstance(self.reducers, int) else None, + ) + tileables = fit_op(X, y, sample_weight, feature_indices) + ret = execute(*tileables, session=session, **(run_kwargs or dict())) + + if len(ret) == 2: + estimators, estimator_features = ret + else: + estimators = ret + + if self.estimators_ is not None: + estimators = mt.concatenate([self.estimators_, estimators]) + if self.estimator_features_ is not None: + estimator_features = mt.concatenate( + [self.estimator_features_, estimator_features] + ) + + self.estimators_, self.estimator_features_ = estimators, estimator_features + return self + + def fit(self, X, y=None, sample_weight=None, session=None, run_kwargs=None): + """ + Build a Bagging ensemble of estimators from the training set (X, y). 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + y : array-like of shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if the base estimator supports + sample weighting. + + Returns + ------- + self : object + Fitted estimator. + """ + return self._fit( + X, y, sample_weight=sample_weight, session=session, run_kwargs=run_kwargs + ) + + +class BaggingClassifier(ClassifierMixin, BaseBagging): + """ + A Bagging classifier. + + A Bagging classifier is an ensemble meta-estimator that fits base + classifiers each on random subsets of the original dataset and then + aggregate their individual predictions (either by voting or by averaging) + to form a final prediction. Such a meta-estimator can typically be used as + a way to reduce the variance of a black-box estimator (e.g., a decision + tree), by introducing randomization into its construction procedure and + then making an ensemble out of it. + + This algorithm encompasses several works from the literature. When random + subsets of the dataset are drawn as random subsets of the samples, then + this algorithm is known as Pasting [1]_. If samples are drawn with + replacement, then the method is known as Bagging [2]_. When random subsets + of the dataset are drawn as random subsets of the features, then the method + is known as Random Subspaces [3]_. Finally, when base estimators are built + on subsets of both samples and features, then the method is known as + Random Patches [4]_. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + base_estimator : object, default=None + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a + :class:`~sklearn.tree.DecisionTreeClassifier`. + + n_estimators : int, default=10 + The number of base estimators in the ensemble. + + max_samples : int or float, default=1.0 + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). + + - If int, then draw `max_features` features. + - If float, then draw `max_features * X.shape[1]` features. + + bootstrap : bool, default=True + Whether samples are drawn with replacement. If False, sampling + without replacement is performed. + + bootstrap_features : bool, default=False + Whether features are drawn with replacement. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit + a whole new ensemble. See :term:`the Glossary `. + + random_state : int, RandomState instance or None, default=None + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. 
+ Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + base_estimator_ : estimator + The base estimator from which the ensemble is grown. + + estimators_ : list of estimators + The collection of fitted base estimators. + + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_classes_ : int or list + The number of classes. + + See Also + -------- + BaggingRegressor : A Bagging regressor. + + References + ---------- + + .. [1] L. Breiman, "Pasting small votes for classification in large + databases and on-line", Machine Learning, 36(1), 85-103, 1999. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, + 1996. + + .. [3] T. Ho, "The random subspace method for constructing decision + forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, + 1998. + + .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine + Learning and Knowledge Discovery in Databases, 346-361, 2012. + + Examples + -------- + >>> from sklearn.svm import SVC + >>> from mars.learn.ensemble import BaggingClassifier + >>> from mars.learn.datasets import make_classification + >>> X, y = make_classification(n_samples=100, n_features=4, + ... n_informative=2, n_redundant=0, + ... random_state=0, shuffle=False) + >>> clf = BaggingClassifier(base_estimator=SVC(), + ... n_estimators=10, random_state=0).fit(X, y) + >>> clf.predict([[0, 0, 0, 0]]) + array([1]) + """ + + def _validate_y(self, y, session=None, run_kwargs=None): + to_run = [check_classification_targets(y)] + y = column_or_1d(y, warn=True) + to_run.extend(mt.unique(y, return_inverse=True)) + _, self.classes_, y = execute( + *to_run, session=session, **(run_kwargs or dict()) + ) + self.n_classes_ = len(self.classes_) + + return y + + def _predict_proba(self, X): + check_is_fitted(self) + X = convert_to_tensor_or_dataframe(X) + predict_op = BaggingPredictionOperand( + n_classes=self.n_classes_, + prediction_type=PredictionType.PROBABILITY, + ) + return predict_op(X, self.estimators_, self.estimator_features_) + + def predict(self, X, session=None, run_kwargs=None): + """ + Predict class for X. + + The predicted class of an input sample is computed as the class with + the highest mean predicted probability. If base estimators do not + implement a ``predict_proba`` method, then it resorts to voting. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted classes. + """ + probas = self._predict_proba(X) + y = self.classes_.take(mt.argmax(probas, axis=1), axis=0) + return execute(y, session=session, **(run_kwargs or dict())) + + def predict_proba(self, X, session=None, run_kwargs=None): + """ + Predict class probabilities for X. + + The predicted class probabilities of an input sample is computed as + the mean predicted class probabilities of the base estimators in the + ensemble. If base estimators do not implement a ``predict_proba`` + method, then it resorts to voting and the predicted class probabilities + of an input sample represents the proportion of estimators predicting + each class. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. 
Sparse matrices are accepted only if + they are supported by the base estimator. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + probas = self._predict_proba(X) + return execute(probas, session=session, **(run_kwargs or dict())) + + def predict_log_proba(self, X, session=None, run_kwargs=None): + """ + Predict class log-probabilities for X. + + The predicted class log-probabilities of an input sample is computed as + the log of the mean predicted class probabilities of the base + estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class log-probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + check_is_fitted(self) + X = convert_to_tensor_or_dataframe(X) + predict_op = BaggingPredictionOperand( + n_classes=self.n_classes_, + prediction_type=PredictionType.LOG_PROBABILITY, + ) + probas = predict_op(X, self.estimators_, self.estimator_features_) + return execute(probas, session=session, **(run_kwargs or dict())) + + def decision_function(self, X, session=None, run_kwargs=None): + """ + Average of the decision functions of the base classifiers. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + Returns + ------- + score : ndarray of shape (n_samples, k) + The decision function of the input samples. The columns correspond + to the classes in sorted order, as they appear in the attribute + ``classes_``. Regression and binary classification are special + cases with ``k == 1``, otherwise ``k==n_classes``. + """ + check_is_fitted(self) + X = convert_to_tensor_or_dataframe(X) + predict_op = BaggingPredictionOperand( + n_classes=self.n_classes_, + prediction_type=PredictionType.DECISION_FUNCTION, + ) + result = predict_op(X, self.estimators_, self.estimator_features_) + return execute(result, session=session, **(run_kwargs or dict())) + + +class BaggingRegressor(RegressorMixin, BaseBagging): + """ + A Bagging regressor. + + A Bagging regressor is an ensemble meta-estimator that fits base + regressors each on random subsets of the original dataset and then + aggregate their individual predictions (either by voting or by averaging) + to form a final prediction. Such a meta-estimator can typically be used as + a way to reduce the variance of a black-box estimator (e.g., a decision + tree), by introducing randomization into its construction procedure and + then making an ensemble out of it. + + This algorithm encompasses several works from the literature. When random + subsets of the dataset are drawn as random subsets of the samples, then + this algorithm is known as Pasting [1]_. If samples are drawn with + replacement, then the method is known as Bagging [2]_. When random subsets + of the dataset are drawn as random subsets of the features, then the method + is known as Random Subspaces [3]_. Finally, when base estimators are built + on subsets of both samples and features, then the method is known as + Random Patches [4]_. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + base_estimator : object, default=None + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a + :class:`~sklearn.tree.DecisionTreeRegressor`. + + n_estimators : int, default=10 + The number of base estimators in the ensemble. + + max_samples : int or float, default=1.0 + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). + + - If int, then draw `max_features` features. + - If float, then draw `max_features * X.shape[1]` features. + + bootstrap : bool, default=True + Whether samples are drawn with replacement. If False, sampling + without replacement is performed. + + bootstrap_features : bool, default=False + Whether features are drawn with replacement. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit + a whole new ensemble. See :term:`the Glossary `. + + random_state : int, RandomState instance or None, default=None + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + base_estimator_ : estimator + The base estimator from which the ensemble is grown. + + estimators_ : list of estimators + The collection of fitted sub-estimators. + + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + + See Also + -------- + BaggingClassifier : A Bagging classifier. + + References + ---------- + + .. [1] L. Breiman, "Pasting small votes for classification in large + databases and on-line", Machine Learning, 36(1), 85-103, 1999. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, + 1996. + + .. [3] T. Ho, "The random subspace method for constructing decision + forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, + 1998. + + .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine + Learning and Knowledge Discovery in Databases, 346-361, 2012. + + Examples + -------- + >>> from sklearn.svm import SVR + >>> from mars.learn.ensemble import BaggingRegressor + >>> from mars.learn.datasets import make_regression + >>> X, y = make_regression(n_samples=100, n_features=4, + ... n_informative=2, n_targets=1, + ... random_state=0, shuffle=False) + >>> regr = BaggingRegressor(base_estimator=SVR(), + ... n_estimators=10, random_state=0).fit(X, y) + >>> regr.predict([[0, 0, 0, 0]]) + array([-2.8720...]) + """ + + def predict(self, X, session=None, run_kwargs=None): + """ + Predict regression target for X. + + The predicted regression target of an input sample is computed as the + mean predicted regression targets of the estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. 
+ + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted values. + """ + check_is_fitted(self) + X = convert_to_tensor_or_dataframe(X) + predict_op = BaggingPredictionOperand( + prediction_type=PredictionType.REGRESSION, + ) + probas = predict_op(X, self.estimators_, self.estimator_features_) + return execute(probas, session=session, **(run_kwargs or dict())) diff --git a/python/xorbits/_mars/learn/ensemble/_blockwise.py b/python/xorbits/_mars/learn/ensemble/_blockwise.py new file mode 100644 index 000000000..b0fbe6db5 --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/_blockwise.py @@ -0,0 +1,389 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union + +import numpy as np +from sklearn.base import BaseEstimator as SklearnBaseEstimator +from sklearn.base import clone +from sklearn.utils.validation import check_is_fitted + +from ... import execute, opcodes +from ... import tensor as mt +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...core.context import Context +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FieldTypes, + Int64Field, + KeyField, + ListField, + StringField, +) +from ...tensor.core import Tensor, TensorOrder +from ...tensor.utils import decide_unify_split +from ...typing import SessionType +from ..base import BaseEstimator, ClassifierMixin, RegressorMixin +from ..operands import LearnOperand, LearnOperandMixin +from ..utils import check_array + + +class BlockwiseEnsembleFit(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.BLOCKWISE_ENSEMBLE_FIT + + x = KeyField("x") + y = KeyField("y") + estimator = AnyField("estimator") + kwargs = DictField("kwargs", default_factory=dict) + + def __call__(self): + self._output_types = [OutputType.object] + return self.new_tileable([self.x, self.y]) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.x = self._inputs[0] + self.y = self._inputs[1] + + @classmethod + def tile(cls, op: "BlockwiseEnsembleFit"): + X, y = op.x, op.y + x_split = X.nsplits[0] + y_split = y.nsplits[0] + out = op.outputs[0] + + if any(np.isnan(s) for s in x_split + y_split) or np.isnan( + X.shape[1] + ): # pragma: no cover + yield + + if x_split != y_split or X.chunk_shape[1] > 1: + x_split = y_split = decide_unify_split(x_split, y_split) + X = X.rechunk({0: x_split, 1: X.shape[1]}) + y = y.rechunk({0: y_split}) + X, y = yield from recursive_tile(X, y) + + out_chunks = [] + for i, _ in enumerate(x_split): + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [X.cix[i, 0], y.cix[(i,)]], + index=(i,), + ) + out_chunks.append(out_chunk) + + params = out.params.copy() + params["chunks"] = out_chunks + params["nsplits"] = ((np.nan,) * len(x_split),) + return op.copy().new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "BlockwiseEnsembleFit"): + x, y = ctx[op.inputs[0].key], ctx[op.inputs[1].key] + estimator = clone(op.estimator) + 
ctx[op.outputs[0].key] = estimator.fit(x, y, **op.kwargs) + + +class BlockwiseEnsemblePredict(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.BLOCKWISE_ENSEMBLE_PREDICT + + x = KeyField("x") + estimators = ListField("estimators", FieldTypes.key) + voting = StringField("voting", default="hard") + proba = BoolField("proba", default=None) + is_classifier = BoolField("is_classifier") + n_classes = Int64Field("n_classes") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.x = self._inputs[0] + self.estimators = self._inputs[1:] + + def __call__(self): + self._output_types = [OutputType.tensor] + x_len = self.x.shape[0] + if self.is_classifier: + shape = (x_len, self.n_classes) if self.proba else (x_len,) + dtype = np.dtype(np.float64) if self.proba else np.dtype(np.int64) + else: + shape = (x_len,) + dtype = np.dtype(np.float64) + return self.new_tileable( + [self.x] + self.estimators, + shape=shape, + dtype=dtype, + order=TensorOrder.C_ORDER, + ) + + @classmethod + def tile(cls, op: "BlockwiseEnsemblePredict"): + out = op.outputs[0] + x = op.x + estimators = op.estimators[0] + estimators_chunks = estimators.chunks + + out_chunks = [] + for chunk in x.chunks: + chunk_op = op.copy().reset_key() + if out.ndim == 2: + chunk_shape = (chunk.shape[0], out.shape[1]) + chunk_index = (chunk.index[0], 0) + else: + chunk_shape = (chunk.shape[0],) + chunk_index = (chunk.index[0],) + out_chunk = chunk_op.new_chunk( + [chunk] + estimators_chunks, + shape=chunk_shape, + dtype=out.dtype, + order=out.order, + index=chunk_index, + ) + out_chunks.append(out_chunk) + + if out.ndim == 2: + nsplits = (x.nsplits[0], (out.shape[1],)) + else: + nsplits = (x.nsplits[0],) + params = out.params.copy() + params["nsplits"] = nsplits + params["chunks"] = out_chunks + return op.copy().new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "BlockwiseEnsemblePredict"): + x = ctx[op.inputs[0].key] + estimators = [ctx[inp.key] for inp in op.inputs[1:]] + if op.proba or op.voting == "soft": + predictions = [estimator.predict_proba(x) for estimator in estimators] + else: + predictions = [estimator.predict(x) for estimator in estimators] + + if op.is_classifier: + if not op.proba: + result = cls._execute_classifier_predict(predictions, op) + else: + result = cls._execute_classifier_predict_proba(predictions, op) + else: + result = cls._execute_regressor_predict(predictions) + ctx[op.outputs[0].key] = result + + @classmethod + def _execute_classifier_predict( + cls, predictions: List[np.ndarray], op: "BlockwiseEnsemblePredict" + ): + if op.voting == "soft": + prob = np.average(np.stack(predictions), axis=0) + return np.argmax(prob, axis=1) + else: + + def vote(x: np.ndarray): + return np.argmax(np.bincount(x)) + + # hard voting + prediction = np.vstack(predictions).T + return np.apply_along_axis(vote, 1, prediction) + + @classmethod + def _execute_classifier_predict_proba( + cls, predictions: List[np.ndarray], op: "BlockwiseEnsemblePredict" + ): + assert op.voting == "soft" + return np.average(np.stack(predictions), axis=0) + + @classmethod + def _execute_regressor_predict(cls, predictions: List[np.ndarray]): + return np.average(np.vstack(predictions), axis=0) + + +class BlockwiseBaseEstimator(BaseEstimator): + def __init__(self, estimator: SklearnBaseEstimator): + self.estimator = estimator + + def _fit(self, X, y, **kwargs): + X = check_array(X) + op = BlockwiseEnsembleFit(x=X, y=y, estimator=self.estimator, kwargs=kwargs) + self.estimators_ = op() + + 
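Before the classifier and regressor wrappers below, a minimal standalone sketch of the aggregation that BlockwiseEnsemblePredict.execute performs on each chunk may help: hard voting takes the most frequent predicted label across the fitted sub-estimators, while soft voting averages their probability outputs before the argmax. The arrays here are hypothetical stand-ins for illustration, not data or APIs from this changeset.

import numpy as np

def aggregate_hard(predictions):
    # predictions: one (n_samples,) integer label array per fitted sub-estimator
    stacked = np.vstack(predictions).T  # shape (n_samples, n_estimators)
    return np.apply_along_axis(lambda row: np.argmax(np.bincount(row)), 1, stacked)

def aggregate_soft(probabilities):
    # probabilities: one (n_samples, n_classes) probability array per sub-estimator
    return np.argmax(np.average(np.stack(probabilities), axis=0), axis=1)

preds = [np.array([0, 1, 1]), np.array([0, 1, 0]), np.array([0, 0, 1])]
print(aggregate_hard(preds))  # -> [0 1 1]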
+class BlockwiseVotingClassifier(ClassifierMixin, BlockwiseBaseEstimator): + """ + Blockwise training and ensemble voting classifier. + + This classifier trains on blocks / partitions of tensors or DataFrames. + A cloned version of `estimator` will be fit *independently* on each block + or partition of the data. This is useful when the sub estimator + only works on small in-memory data structures like a NumPy array or pandas + DataFrame. + + Prediction is done by the *ensemble* of learned models. + + .. warning:: + + Ensure that your data are sufficiently shuffled prior to training! + If the values of the various blocks / partitions of your dataset are not + distributed similarly, the classifier will give poor results. + + Parameters + ---------- + estimator : Estimator + voting : str, {'hard', 'soft'} (default='hard') + If 'hard', uses predicted class labels for majority rule voting. + Else if 'soft', predicts the class label based on the argmax of + the sums of the predicted probabilities, which is recommended for + an ensemble of well-calibrated classifiers. + classes : list-like, optional + The set of classes that `y` can take. This can also be provided as + a fit param if the underlying estimator requires `classes` at fit time. + + Attributes + ---------- + estimators_ : list of classifiers + The collection of fitted sub-estimators that are `estimator` fitted + on each partition / block of the inputs. + + classes_ : array-like, shape (n_predictions,) + The class labels. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.ensemble import BlockwiseVotingClassifier + >>> from sklearn.linear_model import RidgeClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=100_000) + >>> X, y = mt.tensor(X, chunk_size=10_0000), mt.tensor(y, chunk_size=10_0000) + >>> subestimator = RidgeClassifier(random_state=0) + >>> clf = BlockwiseVotingClassifier(subestimator) + >>> clf.fit(X, y) + """ + + def __init__( + self, + estimator: SklearnBaseEstimator, + voting: str = "hard", + classes: Union[np.ndarray, list, Tensor] = None, + ): + super().__init__(estimator=estimator) + if voting not in ("hard", "soft"): # pragma: no cover + raise ValueError("`voting` could be hard or soft") + self.voting = voting + self.classes = None + if classes is not None: + self.classes = mt.tensor(classes) + + def fit( + self, + X, + y, + classes: Union[np.ndarray, list, Tensor] = None, + session: SessionType = None, + run_kwargs: dict = None, + **kwargs, + ): + if not isinstance(y, ENTITY_TYPE): + y = mt.tensor(y) + if classes is None: + classes = self.classes + to_execute = [] + if classes is None: + classes = mt.unique(y) + to_execute.append(classes) + super()._fit(X, y, **kwargs) + to_execute.append(self.estimators_) + execute(to_execute, session=session, **(run_kwargs or dict())) + self.n_classes_ = len(classes) + + def predict(self, X, session: SessionType = None, run_kwargs: dict = None): + check_is_fitted(self, attributes=["estimators_"]) + X = check_array(X) + op = BlockwiseEnsemblePredict( + x=X, + estimators=[self.estimators_], + voting=self.voting, + proba=False, + is_classifier=True, + n_classes=self.n_classes_, + ) + return op().execute(session=session, **(run_kwargs or dict())) + + def predict_proba(self, X, session: SessionType = None, run_kwargs: dict = None): + if self.voting == "hard": + raise AttributeError(f'predict_proba is not available when voting="hard"') + + check_is_fitted(self, attributes=["estimators_"]) + X = 
check_array(X) + op = BlockwiseEnsemblePredict( + x=X, + estimators=[self.estimators_], + voting=self.voting, + proba=True, + is_classifier=True, + n_classes=self.n_classes_, + ) + return op().execute(session=session, **(run_kwargs or dict())) + + +class BlockwiseVotingRegressor(RegressorMixin, BlockwiseBaseEstimator): + """ + Blockwise training and ensemble voting regressor. + + This regressor trains on blocks / partitions of tensors or DataFrames. + A cloned version of `estimator` will be fit *independently* on each block + or partition of the data. + + Prediction is done by the *ensemble* of learned models. + + .. warning:: + Ensure that your data are sufficiently shuffled prior to training! + If the values of the various blocks / partitions of your dataset are not + distributed similarly, the regressor will give poor results. + + Parameters + ---------- + estimator : Estimator + + Attributes + ---------- + estimators_ : list of regressors + The collection of fitted sub-estimators that are `estimator` fitted + on each partition / block of the inputs. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.ensemble import BlockwiseVotingRegressor + >>> from sklearn.linear_model import LinearRegression + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=100_000) + >>> X, y = mt.tensor(X, chunk_size=10_0000), mt.tensor(y, chunk_size=10_0000) + >>> subestimator = LinearRegression() + >>> clf = BlockwiseVotingRegressor(subestimator) + >>> clf.fit(X, y) + """ + + def fit(self, X, y, session: SessionType = None, run_kwargs: dict = None, **kwargs): + if not isinstance(y, ENTITY_TYPE): + y = mt.tensor(y) + super()._fit(X, y, **kwargs) + self.estimators_.execute(session=session, **(run_kwargs or dict())) + + def predict(self, X, session: SessionType = None, run_kwargs: dict = None): + check_is_fitted(self, attributes=["estimators_"]) + X = check_array(X) + op = BlockwiseEnsemblePredict( + x=X, estimators=[self.estimators_], is_classifier=False + ) + return op().execute(session=session, **(run_kwargs or dict())) diff --git a/python/xorbits/_mars/learn/ensemble/_iforest.py b/python/xorbits/_mars/learn/ensemble/_iforest.py new file mode 100644 index 000000000..46985e0f8 --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/_iforest.py @@ -0,0 +1,472 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +import warnings +from typing import Union + +import numpy as np +from sklearn.base import OutlierMixin +from sklearn.tree import ExtraTreeRegressor +from sklearn.utils import check_array as sklearn_check_array +from sklearn.utils import gen_batches as sklearn_gen_batches + +from ... 
import tensor as mt +from ...deploy.oscar.session import execute +from ...lib.sparse import issparse +from ...tensor.utils import check_random_state +from ..utils import convert_to_tensor_or_dataframe, get_chunk_n_rows +from ..utils.validation import _num_samples, check_is_fitted +from ._bagging import BaggingPredictionOperand, BaseBagging, PredictionType + + +def _average_path_length(n_samples_leaf): + """ + The average path length in a n_samples iTree, which is equal to + the average path length of an unsuccessful BST search since the + latter has the same structure as an isolation tree. + Parameters + ---------- + n_samples_leaf : array-like of shape (n_samples,) + The number of training samples in each test sample leaf, for + each estimators. + + Returns + ------- + average_path_length : ndarray of shape (n_samples,) + """ + + n_samples_leaf = sklearn_check_array(n_samples_leaf, ensure_2d=False) + + n_samples_leaf_shape = n_samples_leaf.shape + n_samples_leaf = n_samples_leaf.reshape((1, -1)) + average_path_length = np.zeros(n_samples_leaf.shape) + + mask_1 = n_samples_leaf <= 1 + mask_2 = n_samples_leaf == 2 + not_mask = ~np.logical_or(mask_1, mask_2) + + average_path_length[mask_1] = 0.0 + average_path_length[mask_2] = 1.0 + average_path_length[not_mask] = ( + 2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma) + - 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask] + ) + + return average_path_length.reshape(n_samples_leaf_shape) + + +def _tree_decision_function(instance, estimator): + n_samples = _num_samples(instance) + + # We get as many rows as possible within our working_memory budget + # (defined by sklearn.get_config()['working_memory']) to store + # self._max_features in each row during computation. + # + # Note: + # - this will get at least 1 row, even if 1 row of score will + # exceed working_memory. + # - this does only account for temporary memory usage while loading + # the data needed to compute the scores -- the returned scores + # themselves are 1D. + + chunk_n_rows = get_chunk_n_rows( + row_bytes=16 * instance.shape[1], max_n_rows=n_samples + ) + slices = sklearn_gen_batches(n_samples, chunk_n_rows) + + scores = np.zeros(n_samples, order="f") + + for sl in slices: + # compute score on the slices of test samples: + scores[sl] = _compute_score_samples(instance[sl], estimator) + + return scores + + +def _compute_score_samples(instance, estimator): + leaves_index = estimator.apply(instance) + node_indicator = estimator.decision_path(instance) + n_samples_leaf = estimator.tree_.n_node_samples[leaves_index] + + return ( + np.ravel(node_indicator.sum(axis=1)) + + _average_path_length(n_samples_leaf) + - 1.0 + ) + + +class IsolationForest(OutlierMixin, BaseBagging): + """ + Isolation Forest Algorithm. + + Return the anomaly score of each sample using the IsolationForest algorithm + + The IsolationForest 'isolates' observations by randomly selecting a feature + and then randomly selecting a split value between the maximum and minimum + values of the selected feature. + + Since recursive partitioning can be represented by a tree structure, the + number of splittings required to isolate a sample is equivalent to the path + length from the root node to the terminating node. + + This path length, averaged over a forest of such random trees, is a + measure of normality and our decision function. + + Random partitioning produces noticeably shorter paths for anomalies. 
+ Hence, when a forest of random trees collectively produce shorter path + lengths for particular samples, they are highly likely to be anomalies. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + The number of base estimators in the ensemble. + + max_samples : "auto", int or float, default="auto" + The number of samples to draw from X to train each base estimator. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + - If "auto", then `max_samples=min(256, n_samples)`. + + If max_samples is larger than the number of samples provided, + all samples will be used for all trees (no sampling). + + contamination : 'auto' or float, default='auto' + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. Used when fitting to define the threshold + on the scores of the samples. + + - If 'auto', the threshold is determined as in the + original paper. + - If float, the contamination should be in the range (0, 0.5]. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator. + + - If int, then draw `max_features` features. + - If float, then draw `max_features * X.shape[1]` features. + + bootstrap : bool, default=False + If True, individual trees are fit on random subsets of the training + data sampled with replacement. If False, sampling without replacement + is performed. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo-randomness of the selection of the feature + and split values for each branching step and each tree in the forest. + + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`the Glossary `. + + Attributes + ---------- + base_estimator_ : ExtraTreeRegressor instance + The child estimator template used to create the collection of + fitted sub-estimators. + + estimators_ : list of ExtraTreeRegressor instances + The collection of fitted sub-estimators. + + estimators_features_ : list of ndarray + The subset of drawn features for each base estimator. + + max_samples_ : int + The actual number of samples. + + offset_ : float + Offset used to define the decision function from the raw scores. We + have the relation: ``decision_function = score_samples - offset_``. + ``offset_`` is defined as follows. When the contamination parameter is + set to "auto", the offset is equal to -0.5 as the scores of inliers are + close to 0 and the scores of outliers are close to -1. When a + contamination parameter different than "auto" is provided, the offset + is defined in such a way we obtain the expected number of outliers + (samples with decision function < 0) in training. + + Notes + ----- + The implementation is based on an ensemble of ExtraTreeRegressor. The + maximum depth of each tree is set to ``ceil(log_2(n))`` where + :math:`n` is the number of samples used to build the tree + (see (Liu et al., 2008) for more details). + + References + ---------- + .. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest." + Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on. + .. [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based + anomaly detection." 
ACM Transactions on Knowledge Discovery from + Data (TKDD) 6.1 (2012): 3. + + See Also + ---------- + sklearn.covariance.EllipticEnvelope : An object for detecting outliers in a + Gaussian distributed dataset. + sklearn.svm.OneClassSVM : Unsupervised Outlier Detection. + Estimate the support of a high-dimensional distribution. + The implementation is based on libsvm. + sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection + using Local Outlier Factor (LOF). + + Examples + -------- + >>> from mars.learn.ensemble import IsolationForest + >>> X = [[-1.1], [0.3], [0.5], [100]] + >>> clf = IsolationForest(random_state=0).fit(X) + >>> clf.predict([[0.1], [0], [90]]) + array([ 1, 1, -1]) + """ + + contamination: Union[str, float] + + def __init__( + self, + *, + n_estimators=100, + max_samples="auto", + contamination="auto", + max_features=1.0, + bootstrap=False, + random_state=None, + warm_start=False, + ): + super().__init__( + base_estimator=ExtraTreeRegressor( + max_features=1, splitter="random", random_state=random_state + ), + # here above max_features has no links with self.max_features + bootstrap=bootstrap, + bootstrap_features=False, + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + warm_start=warm_start, + random_state=random_state, + ) + self.contamination = contamination + + def fit( + self, X, y=None, sample_weight=None, session=None, run_kwargs=None + ) -> "IsolationForest": + """ + Fit estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Use ``dtype=np.float32`` for maximum + efficiency. Sparse matrices are also supported, use sparse + ``csc_matrix`` for maximum efficiency. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + + Returns + ------- + self : IsolationForest + Fitted estimator. + """ + run_kwargs = run_kwargs or dict() + X = convert_to_tensor_or_dataframe(X) + if issparse(X): # pragma: no cover + raise NotImplementedError + + if np.isnan(X.shape[0]): + execute(X, session=session, **run_kwargs) + + rnd = check_random_state(self.random_state) + y = rnd.uniform(size=X.shape[0]) + + # ensure that max_sample is in [1, n_samples]: + n_samples = X.shape[0] + + if self.contamination != "auto": + if not (0.0 < self.contamination <= 0.5): + raise ValueError( + "contamination must be in (0, 0.5], got: %f" % self.contamination + ) + + if isinstance(self.max_samples, str): + if self.max_samples == "auto": + max_samples = min(256, n_samples) + else: + raise ValueError( + "max_samples (%s) is not supported." + 'Valid choices are: "auto", int or' + "float" % self.max_samples + ) + + elif isinstance(self.max_samples, numbers.Integral): + if self.max_samples > n_samples: + warnings.warn( + "max_samples (%s) is greater than the " + "total number of samples (%s). max_samples " + "will be set to n_samples for estimation." 
+ % (self.max_samples, n_samples) + ) + max_samples = n_samples + else: + max_samples = self.max_samples + else: # float + if not 0.0 < self.max_samples <= 1.0: + raise ValueError( + "max_samples must be in (0, 1], got %r" % self.max_samples + ) + max_samples = int(self.max_samples * X.shape[0]) + + self.max_samples_ = max_samples + max_depth = int(np.ceil(np.log2(max(max_samples, 2)))) + super()._fit( + X, + y, + sample_weight=sample_weight, + max_samples=max_samples, + estimator_params=dict(max_samples=max_samples, max_depth=max_depth), + ) + + if self.contamination == "auto": + # 0.5 plays a special role as described in the original paper. + # we take the opposite as we consider the opposite of their score. + self.offset_ = -0.5 + return self + + # else, define offset_ wrt contamination parameter + self.offset_ = execute( + mt.percentile(self._score_samples(X), 100.0 * self.contamination), + session=session, + **(run_kwargs or dict()), + ).fetch(session=session, **(run_kwargs or dict())) + + return self + + def predict(self, X, session=None, run_kwargs=None): + """ + Predict if a particular sample is an outlier or not. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + For each observation, tells whether or not (+1 or -1) it should + be considered as an inlier according to the fitted model. + """ + check_is_fitted(self) + is_inlier = mt.ones(X.shape[0], dtype=int) + is_inlier[self._decision_function(X) < 0] = -1 + return execute(is_inlier, session=session, **(run_kwargs or dict())) + + def _decision_function(self, X): + return self._score_samples(X) - self.offset_ + + def decision_function(self, X, session=None, run_kwargs=None): + """ + Average anomaly score of X of the base classifiers. + + The anomaly score of an input sample is computed as + the mean anomaly score of the trees in the forest. + + The measure of normality of an observation given a tree is the depth + of the leaf containing this observation, which is equivalent to + the number of splittings required to isolate this point. In case of + several observations n_left in the leaf, the average path length of + a n_left samples isolation tree is added. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + scores : ndarray of shape (n_samples,) + The anomaly score of the input samples. + The lower, the more abnormal. Negative scores represent outliers, + positive scores represent inliers. 
+ """ + # We subtract self.offset_ to make 0 be the threshold value for being + # an outlier: + + decision_func = self._decision_function(X) + return execute(decision_func, session=session, **(run_kwargs or dict())) + + def _score_samples(self, X): + check_is_fitted(self) + X = convert_to_tensor_or_dataframe(X) + predict_op = BaggingPredictionOperand( + prediction_type=PredictionType.DECISION_FUNCTION, + decision_function=_tree_decision_function, + calc_means=False, + ) + depths = predict_op(X, self.estimators_, self.estimator_features_).sum(axis=1) + denominator = self.estimators_.shape[0] * _average_path_length( + [self.max_samples_] + ) + return -( + 2 + ** ( + # For a single training sample, denominator and depth are 0. + # Therefore, we set the score manually to 1. + -mt.divide( + depths, + denominator, + out=mt.ones_like(depths), + where=denominator != 0, + ) + ) + ) + + def score_samples(self, X, session=None, run_kwargs=None): + """ + Opposite of the anomaly score defined in the original paper. + + The anomaly score of an input sample is computed as + the mean anomaly score of the trees in the forest. + + The measure of normality of an observation given a tree is the depth + of the leaf containing this observation, which is equivalent to + the number of splittings required to isolate this point. In case of + several observations n_left in the leaf, the average path length of + a n_left samples isolation tree is added. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + scores : ndarray of shape (n_samples,) + The anomaly score of the input samples. + The lower, the more abnormal. + """ + scores = self._score_samples(X) + return execute(scores, session=session, **(run_kwargs or dict())) diff --git a/python/xorbits/_mars/learn/ensemble/tests/__init__.py b/python/xorbits/_mars/learn/ensemble/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/ensemble/tests/test_bagging.py b/python/xorbits/_mars/learn/ensemble/tests/test_bagging.py new file mode 100644 index 000000000..cf24dcb07 --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/tests/test_bagging.py @@ -0,0 +1,322 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import os + +import numpy as np +import pandas as pd +import pytest +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.svm import SVC + +from .... import dataframe as md +from .... import execute +from .... import tensor as mt +from ....conftest import MARS_CI_BACKEND +from ....core import enter_mode +from ....services.task.execution.api import Fetcher +from .._bagging import ( + BaggingClassifier, + BaggingRegressor, + BaggingSample, + BaggingSampleReindex, + _extract_bagging_io, +) + + +def _get_tileable_chunk_data(sync_session, tileable): + @enter_mode(build=True) + async def _async_fetch(): + tuples = [] + async_session = sync_session._session + meta_api = async_session._meta_api + + t, indexes = async_session._get_to_fetch_tileable(tileable) + fetcher = Fetcher.create( + MARS_CI_BACKEND, get_storage_api=async_session._get_storage_api + ) + + get_metas = [] + for chunk in t.chunks: + get_metas.append( + meta_api.get_chunk_meta.delay( + chunk.key, fields=fetcher.required_meta_keys + ) + ) + metas = await meta_api.get_chunk_meta.batch(*get_metas) + + for chunk, meta in zip(t.chunks, metas): + await fetcher.append(chunk.key, meta) + all_data = await fetcher.get() + + for chunk, data in zip(t.chunks, all_data): + tuples.append((t, chunk, data)) + return tuples + + future = asyncio.run_coroutine_threadsafe( + _async_fetch(), sync_session._isolation.loop + ) + return future.result(120 if "CI" in os.environ else None) + + +@pytest.mark.parametrize( + "use_dataframe, max_samples, max_features, with_labels, with_weights", + [ + (False, 10, 1.0, False, False), + (False, 10, 0.5, True, True), + (True, 10, 1.0, False, False), + (True, 10, 0.5, True, True), + ], +) +def test_bagging_sample_execution( + setup, use_dataframe, max_samples, max_features, with_labels, with_weights +): + rs = np.random.RandomState(0) + + raw_data = rs.randint(100, size=(100, 50)) + if not use_dataframe: + t = mt.tensor(raw_data, chunk_size=20) + else: + raw_data = pd.DataFrame(raw_data) + t = md.DataFrame(raw_data, chunk_size=20) + + raw_labels = rs.choice([0, 1, 2], size=100) + raw_weights = rs.random(100) + labels = mt.tensor(raw_labels, chunk_size=20) if with_labels else None + weights = mt.tensor(raw_weights, chunk_size=20) if with_weights else None + + sample_op = BaggingSample( + n_estimators=10, + max_samples=max_samples, + max_features=max_features, + random_state=rs, + ) + result_tuple = execute(*sample_op(t, labels, weights)) + t_sampled, t_labels, t_weights, t_feature_indices = _extract_bagging_io( + result_tuple, sample_op, output=True + ) + + label_chunks, weights_chunks, feature_idx_chunks = dict(), dict(), dict() + + for t, chunks_dict in zip((t_labels, t_weights), (label_chunks, weights_chunks)): + if t is None: + continue + for tiled, chunk, chunk_data in _get_tileable_chunk_data(setup, t): + assert len(tiled.chunks) == 5 + chunks_dict[chunk.index] = chunk_data + for d in chunk_data: + assert d.shape == (10,) + + if t_feature_indices is not None: + for tiled, chunk, chunk_data in _get_tileable_chunk_data( + setup, t_feature_indices + ): + assert len(tiled.chunks) == 5 + feature_idx_chunks[chunk.index] = chunk_data + assert chunk_data.shape == (2, int(max_features * raw_data.shape[1])) + + for tiled, chunk, chunk_data in _get_tileable_chunk_data(setup, t_sampled): + assert len(tiled.chunks) == 5 + assert len(chunk_data) == 2 + for est_id, d in enumerate(chunk_data): + assert d.shape == 
(10, int(max_features * raw_data.shape[1])) + + if use_dataframe: + raw_sliced = raw_data.loc[d.index] + if label_chunks: + label_chunk = label_chunks[(chunk.index[0],)][est_id] + np.testing.assert_array_equal(raw_labels[d.index], label_chunk) + if weights_chunks: + weights_chunk = weights_chunks[(chunk.index[0],)][est_id] + np.testing.assert_array_equal(raw_weights[d.index], weights_chunk) + + if feature_idx_chunks: + feature_indices_chunk = feature_idx_chunks[chunk.index][est_id] + raw_sliced = raw_sliced.iloc[:, feature_indices_chunk] + pd.testing.assert_frame_equal(raw_sliced, d) + + +@pytest.mark.parametrize( + "use_dataframe, max_samples, max_features, column_split", + [ + (False, 10, 1.0, 50), + (False, 10, 0.5, 50), + (True, 10, 1.0, 20), + (True, 10, 0.5, 20), + ], +) +def test_bagging_sample_reindex( + setup, use_dataframe, max_samples, max_features, column_split +): + rs = np.random.RandomState(0) + + raw_insts = rs.randint(100, size=(100, 50)) + raw_data = rs.randint(100, size=(200, 50)) + if not use_dataframe: + t_insts = mt.tensor(raw_insts, chunk_size=column_split) + t_data = mt.tensor(raw_data, chunk_size=column_split) + else: + raw_insts = pd.DataFrame(raw_insts) + raw_data = pd.DataFrame(raw_data) + t_insts = md.DataFrame(raw_insts, chunk_size=column_split) + t_data = md.DataFrame(raw_data, chunk_size=column_split) + + sample_op = BaggingSample( + n_estimators=10, + max_samples=max_samples, + max_features=max_features, + random_state=rs, + ) + result_tuple = execute(*sample_op(t_insts)) + _t_sampled, _label, _weights, t_feature_indices = _extract_bagging_io( + result_tuple, sample_op, output=True + ) + + reindex_op = BaggingSampleReindex(n_estimators=10) + reindexed = execute( + reindex_op(t_data, t_feature_indices), extra_config={"check_dtypes": False} + ) + + for tiled, _chunk, chunk_data in _get_tileable_chunk_data(setup, reindexed): + if t_feature_indices is None: + assert len(tiled.chunks) == np.ceil(raw_data.shape[0] / column_split) + assert chunk_data.shape[1] == 50 + else: + row_chunks = np.ceil(raw_insts.shape[0] / column_split) + assert len(tiled.chunks) == row_chunks * np.ceil( + raw_data.shape[0] / column_split + ) + assert isinstance(chunk_data, tuple) + for chunk_data_piece in chunk_data: + assert chunk_data_piece.shape[1] == 25 + + +@pytest.mark.parametrize( + "use_dataframe, max_samples, max_features, with_weights, base_estimator_cls", + [ + (False, 10, 0.5, False, LogisticRegression), + (True, 10, 1.0, True, SVC), + ], +) +def test_bagging_classifier( + setup, use_dataframe, max_samples, max_features, with_weights, base_estimator_cls +): + rs = np.random.RandomState(0) + + raw_x, raw_y = make_classification( + n_samples=100, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=rs, + shuffle=False, + ) + + if not use_dataframe: + t_x = mt.tensor(raw_x, chunk_size=20) + else: + raw_x = pd.DataFrame(raw_x) + t_x = md.DataFrame(raw_x, chunk_size=20) + + raw_weights = rs.random(100) + t_y = mt.tensor(raw_y, chunk_size=20) + t_weights = mt.tensor(raw_weights, chunk_size=20) if with_weights else None + + clf = BaggingClassifier( + base_estimator=base_estimator_cls(), + n_estimators=10, + max_samples=max_samples, + max_features=max_features, + random_state=rs, + warm_start=True, + ) + clf.fit(t_x, t_y, sample_weight=t_weights) + + for _tiled, _chunk, chunk_data in _get_tileable_chunk_data(setup, clf.estimators_): + assert len(chunk_data) == 2 + assert all(isinstance(c, base_estimator_cls) for c in chunk_data) + + if max_features < 1.0: + assert 
clf.estimator_features_ is not None + + with pytest.warns(Warning): + clf.fit(t_x, t_y, sample_weight=t_weights) + with pytest.raises(ValueError): + clf.n_estimators = 5 + clf.fit(t_x, t_y, sample_weight=t_weights) + + clf.n_estimators = 20 + clf.fit(t_x, t_y, sample_weight=t_weights) + assert clf.estimators_.shape[0] == 20 + + proba = clf.predict_proba(t_x) + proba_array = proba.fetch() + assert np.all((proba_array >= 0) & (proba_array <= 1)) + assert np.allclose(np.sum(proba_array, axis=1), 1.0) + + log_proba = clf.predict_log_proba(t_x) + exp_log_proba_array = np.exp(log_proba.fetch()) + assert np.all((exp_log_proba_array >= 0) & (exp_log_proba_array <= 1)) + assert np.allclose(np.sum(exp_log_proba_array, axis=1), 1.0) + + y = clf.predict(t_x) + y_array = y.fetch() + assert np.all((y_array == 0) | (y_array == 1)) + + decision_fun = clf.decision_function(t_x) + decision_fun_array = decision_fun.fetch() + assert decision_fun_array.shape == (y_array.shape[0],) + + +@pytest.mark.parametrize( + "use_dataframe, max_samples, max_features, with_weights", + [ + (False, 10, 0.5, False), + (True, 10, 1.0, True), + ], +) +def test_bagging_regressor( + setup, use_dataframe, max_samples, max_features, with_weights +): + rs = np.random.RandomState(0) + + raw_x, raw_y = make_regression( + n_samples=100, n_features=4, n_informative=2, random_state=rs, shuffle=False + ) + + if not use_dataframe: + t_x = mt.tensor(raw_x, chunk_size=20) + else: + raw_x = pd.DataFrame(raw_x) + t_x = md.DataFrame(raw_x, chunk_size=20) + + raw_weights = rs.random(100) + t_y = mt.tensor(raw_y, chunk_size=20) + t_weights = mt.tensor(raw_weights, chunk_size=20) if with_weights else None + + clf = BaggingRegressor( + base_estimator=LinearRegression(), + n_estimators=10, + max_samples=max_samples, + max_features=max_features, + random_state=rs, + warm_start=True, + ) + clf.fit(t_x, t_y, sample_weight=t_weights) + + predict_y = clf.predict(t_x) + predict_y_array = predict_y.fetch() + assert predict_y_array.shape == raw_y.shape diff --git a/python/xorbits/_mars/learn/ensemble/tests/test_blockwise.py b/python/xorbits/_mars/learn/ensemble/tests/test_blockwise.py new file mode 100644 index 000000000..b8599b19d --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/tests/test_blockwise.py @@ -0,0 +1,109 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.datasets import make_classification +from sklearn.linear_model import LogisticRegression + +from .... import dataframe as md +from .... import tensor as mt +from .. 
import BlockwiseVotingClassifier, BlockwiseVotingRegressor + +fit_raw_X, fit_raw_y = make_classification() +fit_X, fit_y = mt.tensor(fit_raw_X, chunk_size=25), mt.tensor(fit_raw_y, chunk_size=25) +fit_df_X = md.DataFrame(fit_X) +predict_raw_X, predict_raw_y = make_classification() +predict_X, predict_y = ( + mt.tensor(predict_raw_X, chunk_size=20), + mt.tensor(predict_raw_y, chunk_size=20), +) +predict_df_X = md.DataFrame(predict_X) + + +@pytest.mark.parametrize( + "fit_X, fit_y, predict_X, predict_y", + [ + (fit_X, fit_y, predict_X, predict_y), + (fit_raw_X, fit_raw_y, predict_raw_X, predict_raw_y), + (fit_df_X, fit_raw_y, predict_df_X, predict_raw_y), + ], +) +def test_blockwise_voting_classifier_hard(setup, fit_X, fit_y, predict_X, predict_y): + clf = BlockwiseVotingClassifier(LogisticRegression(solver="lbfgs")) + clf.fit(fit_X, fit_y) + estimators = clf.estimators_.fetch() + if not isinstance(fit_X, np.ndarray): + assert len(estimators) == 4 + + clf.predict(predict_X) + score = clf.score(predict_X, predict_y) + assert isinstance(score.fetch(), float) + + with pytest.raises(AttributeError, match="hard"): + clf.predict_proba(predict_X) + + +@pytest.mark.parametrize( + "fit_X, fit_y, predict_X, predict_y", + [ + (fit_X, fit_y, predict_X, predict_y), + (fit_raw_X, fit_raw_y, predict_raw_X, predict_raw_y), + (fit_df_X, fit_raw_y, predict_df_X, predict_raw_y), + ], +) +def test_blockwise_voting_classifier_soft(setup, fit_X, fit_y, predict_X, predict_y): + clf = BlockwiseVotingClassifier( + LogisticRegression(solver="lbfgs"), + voting="soft", + classes=[0, 1], + ) + clf.fit(fit_X, fit_y) + estimators = clf.estimators_.fetch() + if not isinstance(fit_X, np.ndarray): + assert len(estimators) == 4 + + result = clf.predict(predict_X) + assert result.dtype == np.dtype("int64") + assert result.shape == (predict_X.shape[0],) + + result = clf.predict_proba(predict_X) + assert result.dtype == np.dtype("float64") + assert result.shape == (predict_X.shape[0], 2) + + score = clf.score(predict_X, predict_y) + assert isinstance(score.fetch(), float) + + +@pytest.mark.parametrize( + "fit_X, fit_y, predict_X, predict_y", + [ + (fit_X, fit_y, predict_X, predict_y), + (fit_raw_X, fit_raw_y, predict_raw_X, predict_raw_y), + (fit_df_X, fit_raw_y, predict_df_X, predict_raw_y), + ], +) +def test_blockwise_voting_regressor(setup, fit_X, fit_y, predict_X, predict_y): + est = BlockwiseVotingRegressor(LogisticRegression()) + est.fit(fit_X, fit_y) + estimators = est.estimators_.fetch() + if not isinstance(fit_X, np.ndarray): + assert len(estimators) == 4 + + result = est.predict(predict_X) + assert result.dtype == np.dtype("float64") + assert result.shape == (predict_X.shape[0],) + + score = est.score(predict_X, predict_y) + assert isinstance(score.fetch(), float) diff --git a/python/xorbits/_mars/learn/ensemble/tests/test_iforest.py b/python/xorbits/_mars/learn/ensemble/tests/test_iforest.py new file mode 100644 index 000000000..088a8707e --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/tests/test_iforest.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
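As a usage note for the bagging estimators exercised in the tests above, the sketch below shows the intended call pattern outside the test harness. It is illustrative only: the absolute import paths (xorbits._mars.tensor, xorbits._mars.learn.ensemble) and the presence of a default local session behind fetch() are assumptions, since the tests rely on the setup fixture for session handling.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from xorbits._mars import tensor as mt                      # assumed import path
from xorbits._mars.learn.ensemble import BaggingClassifier  # assumed import path

raw_X, raw_y = make_classification(n_samples=100, n_features=4, random_state=0)
X = mt.tensor(raw_X, chunk_size=20)   # five row chunks, as in the tests above
y = mt.tensor(raw_y, chunk_size=20)

clf = BaggingClassifier(
    base_estimator=LogisticRegression(),
    n_estimators=10,
    max_samples=10,       # rows drawn for each estimator
    max_features=0.5,     # fraction of columns drawn for each estimator
    random_state=np.random.RandomState(0),
)
clf.fit(X, y)
proba = clf.predict_proba(X).fetch()
print(proba.shape, np.allclose(proba.sum(axis=1), 1.0))  # rows sum to 1, as the tests assert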
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.datasets import load_iris + +from .... import tensor as mt +from .._iforest import IsolationForest + + +@pytest.mark.parametrize("max_samples", [0.5, 1.0, 10]) +def test_iforest(setup, max_samples): + rs = np.random.RandomState(0) + raw_train = rs.poisson(size=(100, 10)) + t_train = mt.tensor(raw_train, chunk_size=20) + raw_test = rs.poisson(size=(200, 10)) + t_test = mt.tensor(raw_test, chunk_size=20) + + clf = IsolationForest(random_state=rs, n_estimators=10, max_samples=max_samples) + clf.fit(t_train).predict(t_test) + clf.score_samples(t_test) + + +@pytest.mark.parametrize("contamination", [0.25, "auto"]) +def test_iforest_works(setup, contamination): + rs = np.random.RandomState(0) + # toy sample (the last two samples are outliers) + raw = np.array( + [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]] + ) + t = mt.tensor(raw, chunk_size=4) + + # Test IsolationForest + clf = IsolationForest(random_state=rs, contamination=contamination) + clf.fit(t) + decision_func = -clf.decision_function(t).execute().fetch() + pred = clf.predict(t).execute().fetch() + # assert detect outliers: + assert np.min(decision_func[-2:]) > np.max(decision_func[:-2]) + np.testing.assert_array_equal(pred, 6 * [1] + 2 * [-1]) + + +def test_iforest_error(): + """Test that it gives proper exception on deficient input.""" + iris = load_iris() + X = iris.data + + # Test max_samples + with pytest.raises(ValueError): + IsolationForest(max_samples=-1).fit(X) + with pytest.raises(ValueError): + IsolationForest(max_samples=0.0).fit(X) + with pytest.raises(ValueError): + IsolationForest(max_samples=2.0).fit(X) + + with pytest.raises(ValueError): + IsolationForest(max_samples="foobar").fit(X) + with pytest.raises(ValueError): + IsolationForest(max_samples=1.5).fit(X) + + # test X_test n_features match X_train one: + with pytest.raises(ValueError): + IsolationForest().fit(X).predict(X[:, 1:]) diff --git a/python/xorbits/_mars/learn/glm/__init__.py b/python/xorbits/_mars/learn/glm/__init__.py new file mode 100644 index 000000000..14a15022b --- /dev/null +++ b/python/xorbits/_mars/learn/glm/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._logistic import LogisticRegression + +__all__ = [ + "LogisticRegression", +] diff --git a/python/xorbits/_mars/learn/glm/_logistic.py b/python/xorbits/_mars/learn/glm/_logistic.py new file mode 100644 index 000000000..fdfb19f53 --- /dev/null +++ b/python/xorbits/_mars/learn/glm/_logistic.py @@ -0,0 +1,307 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
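Similarly, a minimal sketch of the IsolationForest workflow covered by the tests above, reusing the toy sample whose last two rows are outliers; the same assumptions about import paths and an available local session apply.

import numpy as np

from xorbits._mars import tensor as mt                     # assumed import path
from xorbits._mars.learn.ensemble import IsolationForest   # assumed import path

raw = np.array(
    [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]
)
t = mt.tensor(raw, chunk_size=4)

clf = IsolationForest(contamination=0.25, random_state=np.random.RandomState(0)).fit(t)
pred = clf.predict(t).execute().fetch()                 # +1 for inliers, -1 for outliers
scores = clf.decision_function(t).execute().fetch()     # higher means more normal
print(pred)                                             # expected: six 1s, then two -1s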
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers + +from sklearn.utils.validation import _deprecate_positional_args + +from ... import tensor as mt +from ..base import BaseEstimator +from ..linear_model._base import LinearClassifierMixin +from ..utils.extmath import softmax +from ..utils.multiclass import check_classification_targets +from ..utils.validation import check_is_fitted +from ._optimizers import gradient_descent + + +def _check_solver(solver): + all_solvers = ["SGD"] + if solver not in all_solvers: + raise ValueError( + "Logistic Regression supports only solvers in %s, got" + " %s." % (all_solvers, solver) + ) + return solver + + +def _check_multi_class(multi_class, solver, n_classes): + if multi_class == "auto": + return "multinomial" + if multi_class == "ovr": + if n_classes == 2: + return "multinomial" + else: + raise ValueError( + "Solver %s does not support " + "an ovr backend with number of classes " + "larger than 2." % solver + ) + if multi_class == "multinomial": + return "multinomial" + + raise ValueError( + "multi_class should be 'multinomial', 'ovr' or 'auto'. Got %s." % multi_class + ) + + +class LogisticRegression(LinearClassifierMixin, BaseEstimator): + """ + Logistic Regression (aka logit, MaxEnt) classifier. + + In the multiclass case, the training algorithm uses the one-vs-rest (OvR) + scheme if the 'multi_class' option is set to 'ovr', and uses the + cross-entropy loss if the 'multi_class' option is set to 'multinomial'. + (Currently the 'multinomial' option is supported only by the 'lbfgs', + 'sag', 'saga' and 'newton-cg' solvers.) + + This class implements regularized logistic regression using the + 'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note + that regularization is applied by default**. It can handle both dense + and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit + floats for optimal performance; any other input format will be converted + (and copied). + + The 'newton-cg', 'sag', and 'lbfgs' solvers support only L2 regularization + with primal formulation, or no regularization. The 'liblinear' solver + supports both L1 and L2 regularization, with a dual formulation only for + the L2 penalty. The Elastic-Net regularization is only supported by the + 'saga' solver. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2' + Used to specify the norm used in the penalization. The 'newton-cg', + 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is + only supported by the 'saga' solver. If 'none' (not supported by the + liblinear solver), no regularization is applied. + + .. versionadded:: 0.19 + l1 penalty with SAGA solver (allowing 'multinomial' + L1) + + C : float, default=1.0 + Inverse of regularization strength; must be a positive float. + Like in support vector machines, smaller values specify stronger + regularization. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the decision function. 
+ + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the + data. See :term:`Glossary ` for details. + + solver : SGD (stochastic gradient descent) + + max_iter : int, default=100 + Maximum number of iterations taken for the solvers to converge. + + multi_class : {'auto', 'ovr', 'multinomial'}, default='auto' + If the option chosen is 'ovr', then a binary problem is fit for each + label. For 'multinomial' the loss minimised is the multinomial loss fit + across the entire probability distribution, *even when the data is + binary*. 'multinomial' is unavailable when solver='liblinear'. + 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', + and otherwise selects 'multinomial'. + + .. versionadded:: 0.18 + Stochastic Average Gradient descent solver for 'multinomial' case. + .. versionchanged:: 0.22 + Default changed from 'ovr' to 'auto' in 0.22. + + verbose : int, default=0 + For the liblinear and lbfgs solvers set verbose to any positive + number for verbosity. + + Attributes + ---------- + + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) + Coefficient of the features in the decision function. + + `coef_` is of shape (1, n_features) when the given problem is binary. + In particular, when `multi_class='multinomial'`, `coef_` corresponds + to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False). + + intercept_ : ndarray of shape (1,) or (n_classes,) + Intercept (a.k.a. bias) added to the decision function. + + If `fit_intercept` is set to False, the intercept is set to zero. + `intercept_` is of shape (1,) when the given problem is binary. + In particular, when `multi_class='multinomial'`, `intercept_` + corresponds to outcome 1 (True) and `-intercept_` corresponds to + outcome 0 (False). + + See Also + -------- + SGDClassifier : Incrementally trained logistic regression (when given + the parameter ``loss="log"``). + LogisticRegressionCV : Logistic regression with built-in cross validation. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = load_iris(return_X_y=True) + >>> clf = LogisticRegression(random_state=0).fit(X, y) + >>> clf.predict(X[:2, :]) + array([0, 0]) + """ + + @_deprecate_positional_args + def __init__( + self, + penalty="l2", + fit_intercept=False, + C=100, + batch_size=20, + learning_rate=0.1, + random_state=None, + solver="SGD", + max_iter=300, + multi_class="auto", + verbose=0, + ): + self.penalty = penalty + self.fit_intercept = fit_intercept + self.C = C + self.batch_size = batch_size + self.learning_rate = learning_rate + self.random_state = random_state + self.solver = solver + self.max_iter = max_iter + self.multi_class = multi_class + self.verbose = verbose + + def fit(self, X, y): + """ + Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + Returns + ------- + self + Fitted estimator. 
+ """ + # ========== Pre-check ============= + if self.penalty not in ["l2"]: + raise NotImplementedError("Only support L2 penalty.") + + solver = _check_solver(self.solver) + + if not isinstance(self.C, numbers.Number) or self.C < 0: + raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) + + if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: + raise ValueError( + "Maximum number of iteration must be positive;" + " got (max_iter=%r)" % self.max_iter + ) + + _dtype = [mt.float64, mt.float32] + + X, y = self._validate_data(X, y, accept_sparse="csr", dtype=_dtype, order="C") + + check_classification_targets(y) + + self.classes_ = mt.unique(y) + n_classes = self.classes_.execute().shape[0] + multi_class = _check_multi_class(self.multi_class, solver, n_classes) + + # ========== Fit solver ============ + # Only support stochastic gradient descent for now + if multi_class == "multinomial": + if solver == "SGD": + self.coef_ = gradient_descent( + X, + y, + learning_rate=self.learning_rate, + reg=(1 / self.C), + max_epochs=self.max_iter, + batch_size=self.batch_size, + fit_intercept=self.fit_intercept, + verbose=self.verbose, + ) + self.coef_ = self.coef_.T + + # ========== Post processing ======= + if self.fit_intercept: + self.intercept_ = self.coef_[:, -1] + self.coef_ = self.coef_[:, :-1] + else: + self.intercept_ = mt.zeros(n_classes) + + return self + + def predict_proba(self, X): + """ + Probability estimates. + + The returned estimates for all classes are ordered by the + label of classes. + + For a multi_class problem, if multi_class is set to be "multinomial" + the softmax function is used to find the predicted probability of + each class. + Else use a one-vs-rest approach, i.e calculate the probability + of each class assuming it to be positive using the logistic function. + and normalize these values across all the classes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Vector to be scored, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + T : array-like of shape (n_samples, n_classes) + Returns the probability of the sample for each class in the model, + where classes are ordered as they are in ``self.classes_``. + """ + check_is_fitted(self) + decision = self.decision_function(X) + + return softmax(decision, copy=False).execute() + + def predict_log_proba(self, X): + """ + Predict logarithm of probability estimates. + + The returned estimates for all classes are ordered by the + label of classes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Vector to be scored, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + T : array-like of shape (n_samples, n_classes) + Returns the log-probability of the sample for each class in the + model, where classes are ordered as they are in ``self.classes_``. + """ + return mt.log(self.predict_proba(X)).execute() diff --git a/python/xorbits/_mars/learn/glm/_optimizers.py b/python/xorbits/_mars/learn/glm/_optimizers.py new file mode 100644 index 000000000..929691c95 --- /dev/null +++ b/python/xorbits/_mars/learn/glm/_optimizers.py @@ -0,0 +1,91 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np + +from ... import execute +from ... import tensor as mt +from ...tensor.datasource import tensor as astensor + + +def softmax_loss_and_grad(W, X, y, reg): + N, D = X.shape + K = W.shape[1] + + y_obs = mt.zeros(shape=(N, K)) + for i in range(N): + y_obs[i] = mt.eye(K)[y[i]] + + loss = -1 / N * mt.sum( + y_obs * mt.log(mt.exp(X @ W) / mt.sum(mt.exp(X @ W), axis=1).reshape(-1, 1)) + ) + 0.5 * reg * mt.sum(mt.square(W)) + + dW = mt.zeros(shape=(D, K)) + + # Matrix approach + dW = ( + -1 + / N + * X.T + @ (y_obs - (mt.exp(X @ W) / mt.sum(mt.exp(X @ W), axis=1).reshape(-1, 1))) + + reg * W + ) + + execute(loss, dW) + + return loss, dW + + +def gradient_descent( + X, + y, + learning_rate=1e-3, + reg=1e-5, + max_epochs=100, + batch_size=20, + fit_intercept=True, + verbose=0, +): + # assume y takes values 0...K-1 where K is number of classes + num_classes = (mt.max(y) + 1).to_numpy() + + num_train, dim = X.shape + num_iters_per_epoch = int(math.floor(1.0 * num_train / batch_size)) + + # need extra entries if fit_intercept + if fit_intercept: + X = mt.hstack((X, mt.ones((num_train, 1)))) + W = 0.001 * mt.random.randn(dim + 1, num_classes).execute() + else: + X = astensor(X) + W = 0.001 * mt.random.randn(dim, num_classes).execute() + + for _ in range(max_epochs): + # perform mini-batch SGD update + perm_idx = np.random.permutation(num_train) + for it in range(num_iters_per_epoch): + # print(it, num_iters_per_epoch) + idx = perm_idx[it * batch_size : (it + 1) * batch_size] + batch_x = X[idx] + batch_y = y[idx] + + # evaluate loss and gradient + _, grad = softmax_loss_and_grad(W, batch_x, batch_y, reg) + + # update parameters + W = W - learning_rate * grad + + return W diff --git a/python/xorbits/_mars/learn/glm/tests/__init__.py b/python/xorbits/_mars/learn/glm/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/glm/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/glm/tests/test_logistic.py b/python/xorbits/_mars/learn/glm/tests/test_logistic.py new file mode 100644 index 000000000..0c6b340c8 --- /dev/null +++ b/python/xorbits/_mars/learn/glm/tests/test_logistic.py @@ -0,0 +1,113 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
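For reference, the quantities computed by softmax_loss_and_grad above are the standard L2-regularized multinomial cross-entropy and its gradient. With Y the one-hot encoding of y, P = softmax(X W) applied row-wise, N the mini-batch size, and reg the regularization strength (fit() passes reg = 1 / C):

    loss = -(1 / N) * sum(Y * log P) + 0.5 * reg * ||W||^2
    dW   = -(1 / N) * X^T (Y - P) + reg * W

gradient_descent then performs plain mini-batch SGD, W <- W - learning_rate * dW, over max_epochs shuffled passes through the data, appending a column of ones to X when fit_intercept is True so that the last row of W acts as the intercept.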
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import pytest +from sklearn.datasets import load_iris + +from .._logistic import LogisticRegression, _check_multi_class, _check_solver + +# general data load +X, y = load_iris(return_X_y=True) + + +def test_check_solver(setup): + all_solvers = ["SGD"] + for solver in all_solvers: + checked_solver = _check_solver(solver) + assert checked_solver == solver + + invalid_solver = "Newton" + error_msg = re.escape( + "Logistic Regression supports only solvers in %s, " + "got %s." % (all_solvers, invalid_solver) + ) + + with pytest.raises(ValueError, match=error_msg): + _check_solver(invalid_solver) + + +def test_check_multi_class(setup): + all_multi_class = ["auto", "multinomial", "ovr"] + solver = "SGD" + + for multi_class in all_multi_class: + checked_multi_class = _check_multi_class(multi_class, solver, 2) + assert checked_multi_class == "multinomial" + + error_msg = re.escape( + "Solver %s does not support " + "an ovr backend with number of classes " + "larger than 2." % solver + ) + with pytest.raises(ValueError, match=error_msg): + _check_multi_class("ovr", solver, 3) + + invalid_multi_class = "multiovr" + error_msg = re.escape( + "multi_class should be 'multinomial', " + "'ovr' or 'auto'. Got %s." % invalid_multi_class + ) + with pytest.raises(ValueError, match=error_msg): + _check_multi_class(invalid_multi_class, solver, 3) + + +def test_invalid_penalty(setup): + error_msg = re.escape("Only support L2 penalty.") + + with pytest.raises(NotImplementedError, match=error_msg): + model = LogisticRegression(penalty="l1") + model.fit(X, y) + + +def test_invalid_C(setup): + invalid_C = -1 + error_msg = re.escape("Penalty term must be positive; got (C=%r)" % invalid_C) + + with pytest.raises(ValueError, match=error_msg): + model = LogisticRegression(C=invalid_C) + model.fit(X, y) + + +def test_invalid_max_iter(setup): + invalid_max_iter = -1 + error_msg = re.escape( + "Maximum number of iteration must be positive;" + " got (max_iter=%r)" % invalid_max_iter + ) + + with pytest.raises(ValueError, match=error_msg): + model = LogisticRegression(max_iter=invalid_max_iter) + model.fit(X, y) + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_logistic_regression_no_converge(setup, fit_intercept): + # quite slow in local tests, so set max_iter=1 + # suggested max_iter >= 10 + model = LogisticRegression(fit_intercept=fit_intercept, max_iter=1) + model.fit(X, y) + model.predict(X) + model.score(X, y) + model.predict_proba(X) + model.predict_log_proba(X) + + error_msg = re.escape( + "X has %d features per sample; expecting %d" + % (X.shape[1], model.coef_.shape[1] - 1) + ) + model.coef_ = model.coef_[:, :-1] + with pytest.raises(ValueError, match=error_msg): + model.predict(X) diff --git a/python/xorbits/_mars/learn/linear_model/__init__.py b/python/xorbits/_mars/learn/linear_model/__init__.py new file mode 100644 index 000000000..c5c19b92e --- /dev/null +++ b/python/xorbits/_mars/learn/linear_model/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._base import LinearRegression + +__all__ = [ + "LinearRegression", +] diff --git a/python/xorbits/_mars/learn/linear_model/_base.py b/python/xorbits/_mars/learn/linear_model/_base.py new file mode 100644 index 000000000..6485732dd --- /dev/null +++ b/python/xorbits/_mars/learn/linear_model/_base.py @@ -0,0 +1,367 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +from abc import ABCMeta, abstractmethod + +from numpy.linalg import LinAlgError +from sklearn.base import MultiOutputMixin +from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted + +from ... import execute +from ... import tensor as mt +from ...tensor.datasource import tensor as astensor +from ..base import BaseEstimator, ClassifierMixin, RegressorMixin +from ..preprocessing import normalize as f_normalize +from ..utils.validation import FLOAT_DTYPES, _check_sample_weight, check_array + + +def _preprocess_data( + X, + y, + fit_intercept, + normalize=False, + copy=True, + sample_weight=None, + return_mean=False, + check_input=True, +): + """Center and scale data. + + Centers data to have mean zero along axis 0. If fit_intercept=False or if + the X is a sparse matrix, no centering is done, but normalization can still + be applied. The function returns the statistics necessary to reconstruct + the input data, which are X_offset, y_offset, X_scale, such that the output + + X = (X - X_offset) / X_scale + + X_scale is the L2 norm of X - X_offset. If sample_weight is not None, + then the weighted mean of X and y is zero, and not the mean itself. If + return_mean=True, the mean, eventually weighted, is returned, independently + of whether X was centered (option used for optimization with sparse data in + coordinate_descend). + + This is here because nearly all linear models will want their data to be + centered. 
This function also systematically makes y consistent with X.dtype + """ + if isinstance(sample_weight, numbers.Number): + sample_weight = None + if sample_weight is not None: + sample_weight = astensor(sample_weight) + + X = astensor(X) + y = astensor(y, dtype=X.dtype) + + if check_input: + X = check_array(X, copy=copy, accept_sparse=["csr", "csc"], dtype=FLOAT_DTYPES) + elif copy: + if X.issparse(): + X = X.copy() + else: + X = X.copy(order="K") + + if fit_intercept: + if X.issparse(): + raise NotImplementedError("Does not support sparse input!") + else: + X_offset = mt.average(X, axis=0, weights=sample_weight) + X = X - X_offset + if normalize: + X, X_scale = f_normalize(X, axis=0, copy=False, return_norm=True) + else: + X_scale = mt.ones(X.shape[1], dtype=X.dtype) + y_offset = mt.average(y, axis=0, weights=sample_weight) + y = y - y_offset + else: + if X.issparse(): + raise NotImplementedError("Does not support sparse input!") + X_offset = mt.zeros(X.shape[1], dtype=X.dtype) + X_scale = mt.ones(X.shape[1], dtype=X.dtype) + if y.ndim == 1: + y_offset = X.dtype.type(0) + else: + y_offset = mt.zeros(y.shape[1], dtype=X.dtype) + + return X, y, X_offset, y_offset, X_scale + + +def _rescale_data(X, y, sample_weight): + """Rescale data sample-wise by square root of sample_weight. + + For many linear models, this enables easy support for sample_weight. + + Returns + ------- + X_rescaled : {array-like, sparse matrix} + + y_rescaled : {array-like, sparse matrix} + """ + n_samples = X.shape[0] + sample_weight = mt.asarray(sample_weight) + if sample_weight.ndim == 0: + sample_weight = mt.full(n_samples, sample_weight, dtype=sample_weight.dtype) + sample_weight = mt.sqrt(sample_weight) + sw_matrix = mt.diag(sample_weight, sparse=True) + X = mt.dot(sw_matrix, X) + y = mt.dot(sw_matrix, y) + return X, y + + +class LinearModel(BaseEstimator, metaclass=ABCMeta): + """Base class for Linear Models""" + + @abstractmethod + def fit(self, X, y, sample_weight=None): + """Fit model.""" + + def _decision_function(self, X): + check_is_fitted(self) + + X = self._validate_data( + X, y="no_validation", accept_sparse=["csr", "csc", "coo"], reset=False + ) + return mt.dot(X, self.coef_.T) + self.intercept_ + + def predict(self, X): + """ + Predict using the linear model. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. + + Returns + ------- + C : array, shape (n_samples,) + Returns predicted values. + """ + return self._decision_function(X) + + _preprocess_data = staticmethod(_preprocess_data) + + def _set_intercept(self, X_offset, y_offset, X_scale): + """Set the intercept_""" + if self.fit_intercept: + self.coef_ = self.coef_ / X_scale + self.intercept_ = y_offset - mt.dot(X_offset, self.coef_.T) + execute(self.coef_, self.intercept_) + else: + self.intercept_ = mt.tensor(0.0) + self.intercept_.execute() + + def _more_tags(self): # noqa: R0201 # pylint: disable=no-self-use + return {"requires_y": True} + + +class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): + """ + Ordinary least squares Linear Regression. + + LinearRegression fits a linear model with coefficients w = (w1, ..., wp) + to minimize the residual sum of squares between the observed targets in + the dataset, and the targets predicted by the linear approximation. + + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). 
+ + normalize : bool, default=False + This parameter is ignored when ``fit_intercept`` is set to False. + If True, the regressors X will be normalized before regression by + subtracting the mean and dividing by the l2-norm. + If you wish to standardize, please use + :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` + on an estimator with ``normalize=False``. + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. This + option is only supported for dense arrays. + + Attributes + ---------- + coef_ : array of shape (n_features, ) or (n_targets, n_features) + Estimated coefficients for the linear regression problem. + If multiple targets are passed during the fit (y 2D), this + is a 2D array of shape (n_targets, n_features), while if only + one target is passed, this is a 1D array of length n_features. + + rank_ : int + Rank of matrix `X`. Only available when `X` is dense. + + singular_ : array of shape (min(X, y),) + Singular values of `X`. Only available when `X` is dense. + + intercept_ : float or array of shape (n_targets,) + Independent term in the linear model. Set to 0.0 if + `fit_intercept = False`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + See Also + -------- + Ridge : Ridge regression addresses some of the + problems of Ordinary Least Squares by imposing a penalty on the + size of the coefficients with l2 regularization. + Lasso : The Lasso is a linear model that estimates + sparse coefficients with l1 regularization. + ElasticNet : Elastic-Net is a linear regression + model trained with both l1 and l2 -norm regularization of the + coefficients. + """ + + @_deprecate_positional_args + def __init__( + self, + *, + fit_intercept=True, + normalize=False, + copy_X=True, + positive=False, + ): + self.fit_intercept = fit_intercept + self.normalize = normalize + self.copy_X = copy_X + self.positive = positive + + def fit(self, X, y, sample_weight=None): + """ + Fit linear model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + Returns + ------- + self : object + Fitted Estimator. + """ + accept_sparse = False if self.positive else ["csr", "csc", "coo"] + + X, y = self._validate_data( + X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True + ) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + X, y, X_offset, y_offset, X_scale = self._preprocess_data( + X, + y, + fit_intercept=self.fit_intercept, + normalize=self.normalize, + copy=self.copy_X, + sample_weight=sample_weight, + return_mean=True, + ) + + if sample_weight is not None: + # Sample weight can be implemented via a simple rescaling. 
+ X, y = _rescale_data(X, y, sample_weight) + + if self.positive: + # TODO: implement optimize.nnls first + raise NotImplementedError("Does not support positive coefficients!") + elif X.issparse(): + # TODO: implement sparse.linalg.lsqr first + raise NotImplementedError("Does not support sparse input!") + else: + try: + # In numpy: + # Mat mul does NOT always satisfy associative law + # Tyipical mistake: + # (mt.linalg.inv(X.T @ X) @ (X.T @ y)).T + self.coef_ = (mt.linalg.inv(X.T @ X) @ X.T @ y).T + self.coef_.execute() + except LinAlgError: + # TODO: implement linalg.lstsq first + raise NotImplementedError("Does not support sigular matrix!") + + if y.ndim == 1: + self.coef_ = mt.ravel(self.coef_) + self.coef_.execute() + self._set_intercept(X_offset, y_offset, X_scale) + return self + + +class LinearClassifierMixin(ClassifierMixin): + """Mixin for linear classifiers. + + Handles prediction for sparse and dense X. + """ + + def decision_function(self, X): + """ + Predict confidence scores for samples. + + The confidence score for a sample is proportional to the signed + distance of that sample to the hyperplane. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. + + Returns + ------- + array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes) + Confidence scores per (sample, class) combination. In the binary + case, confidence score for self.classes_[1] where >0 means this + class would be predicted. + """ + check_is_fitted(self) + + X = check_array(X, accept_sparse="csr") + + n_features = self.coef_.shape[1] + if X.shape[1] != n_features: + raise ValueError( + "X has %d features per sample; expecting %d" % (X.shape[1], n_features) + ) + + scores = mt.dot(X, self.coef_.T) + self.intercept_ + return scores + + def predict(self, X): + """ + Predict class labels for samples in X. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. + + Returns + ------- + C : array, shape [n_samples] + Predicted class label per sample. + """ + scores = self.decision_function(X) + indices = scores.argmax(axis=1) + return self.classes_[indices].execute() diff --git a/python/xorbits/_mars/learn/linear_model/tests/__init__.py b/python/xorbits/_mars/learn/linear_model/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/linear_model/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/linear_model/tests/test_base.py b/python/xorbits/_mars/learn/linear_model/tests/test_base.py new file mode 100644 index 000000000..2d5fdfbac --- /dev/null +++ b/python/xorbits/_mars/learn/linear_model/tests/test_base.py @@ -0,0 +1,698 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
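The dense branch of LinearRegression.fit above solves the normal equations directly, coef = (X^T X)^(-1) X^T y, which is also why a singular X^T X currently raises NotImplementedError rather than falling back to a least-squares solver. A NumPy-only sketch on hypothetical data, included just to make the algebra concrete:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.01 * rng.randn(50)

coef_normal_eq = np.linalg.inv(X.T @ X) @ X.T @ y     # what fit() computes, via mt.linalg
coef_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)    # reference least-squares solution
print(np.allclose(coef_normal_eq, coef_lstsq))        # True for well-conditioned X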
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import numpy as np +import pytest +from numpy.testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from scipy import linalg, sparse +from sklearn.datasets import load_iris, make_regression, make_sparse_uncorrelated +from sklearn.linear_model import LinearRegression as sklearn_LR +from sklearn.linear_model._base import make_dataset +from sklearn.utils import check_random_state + +from .. import LinearRegression +from .._base import _preprocess_data, _rescale_data + +rng = np.random.RandomState(0) +rtol = 1e-6 + + +def test_linear_regression(setup): + # Regular model fitting, #samples > 2, #features >= 2 + X = [[1, 1.5], [1.8, 2], [4, 5]] + Y = [1, 2, 3] + + reg = LinearRegression() + reg.fit(X, Y) + + model = sklearn_LR() + model.fit(X, Y) + + assert_array_almost_equal(reg.coef_, model.coef_) + assert_array_almost_equal(reg.intercept_, model.intercept_) + assert_array_almost_equal(reg.predict(X), model.predict(X)) + + # Regular model fitting, #samples <= 2, # features < 2 + error_msg = re.escape("Does not support sigular matrix!") + + X = [[1], [2]] + Y = [1, 2] + + reg = LinearRegression() + reg.fit(X, Y) + + model = sklearn_LR() + model.fit(X, Y) + + assert_array_almost_equal(reg.coef_, model.coef_) + assert_array_almost_equal(reg.intercept_, model.intercept_) + assert_array_almost_equal(reg.predict(X), model.predict(X)) + + # Extra case #1: singular matrix, degenerate input + error_msg = re.escape("Does not support sigular matrix!") + + X = [[1]] + Y = [0] + + reg = LinearRegression() + with pytest.raises(NotImplementedError, match=error_msg): + reg.fit(X, Y) + + # # Extra case #2: algebrically singular matrix but algorithmically not + # # Works locally but not work in github checks + # # May be because the inverse is super large + # X = [[1, 1.5], [1.8, 2]] + # Y = [1, 2] + + # reg = LinearRegression() + # reg.fit(X, Y) + + # model = sklearn_LR() + # model.fit(X, Y) + + # with pytest.raises(AssertionError): + # assert_array_almost_equal(reg.coef_, model.coef_) + + +def test_linear_regression_sample_weights(setup): + # TODO: loop over sparse data as well + + rng = np.random.RandomState(0) + + # It would not work with under-determined systems + for n_samples, n_features in ((6, 5),): + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + sample_weight = 1.0 + rng.rand(n_samples) + + for intercept in (True, False): + # LinearRegression with explicit sample_weight + reg = LinearRegression(fit_intercept=intercept) + reg.fit(X, y, sample_weight=sample_weight) + coefs1 = reg.coef_ + inter1 = reg.intercept_ + + assert reg.coef_.shape == (X.shape[1],) # sanity checks + assert reg.score(X, y).to_numpy() > 0.5 + + # Closed form of the weighted least square + # theta = (X^T W X)^(-1) * X^T W y + W = np.diag(sample_weight) + if intercept is False: + X_aug = X + else: + dummy_column = np.ones(shape=(n_samples, 1)) + X_aug = np.concatenate((dummy_column, X), axis=1) + + coefs2 = linalg.solve(X_aug.T.dot(W).dot(X_aug), X_aug.T.dot(W).dot(y)) + + if intercept is False: + assert_array_almost_equal(coefs1, 
coefs2) + else: + assert_array_almost_equal(coefs1, coefs2[1:]) + assert_almost_equal(inter1.to_numpy(), coefs2[0]) + + +def test_raises_value_error_if_positive_and_sparse(setup): + error_msg = re.escape( + "A sparse tensor was passed, but dense " + "data is required. Use X.todense() to " + "convert to a dense tensor." + ) + # X must not be sparse if positive == True + X = sparse.eye(10) + y = np.ones(10) + + reg = LinearRegression(positive=True) + + with pytest.raises(TypeError, match=error_msg): + reg.fit(X, y) + + +def test_raises_value_error_if_sample_weights_greater_than_1d(setup): + error_msg = re.escape("Sample weights must be 1D array or scalar") + + X = rng.randn(10, 5) + y = rng.randn(10) + sample_weights_2D = rng.randn(10, 2) ** 2 + 1 + + reg = LinearRegression() + + with pytest.raises(ValueError, match=error_msg): + reg.fit(X, y, sample_weights_2D) + + +def test_fit_intercept(setup): + # Test assertions on betas shape. + X2 = np.array([[0.38349978, 0.61650022], [0.58853682, 0.41146318]]) + X3 = np.array( + [[0.27677969, 0.70693172, 0.01628859], [0.08385139, 0.20692515, 0.70922346]] + ) + y = np.array([1, 1]) + + lr2_without_intercept = LinearRegression(fit_intercept=False).fit(X2, y) + lr2_with_intercept = LinearRegression().fit(X2, y) + + lr3_without_intercept = LinearRegression(fit_intercept=False).fit(X3, y) + lr3_with_intercept = LinearRegression().fit(X3, y) + + assert lr2_with_intercept.coef_.shape == lr2_without_intercept.coef_.shape + assert lr3_with_intercept.coef_.shape == lr3_without_intercept.coef_.shape + assert lr2_without_intercept.coef_.ndim == lr3_without_intercept.coef_.ndim + + +def test_linear_regression_sparse(setup, random_state=0): + # Test that linear regression also works with sparse data + random_state = check_random_state(random_state) + for i in range(10): + n = 100 + X = sparse.eye(n, n) + beta = random_state.rand(n) + y = X * beta[:, np.newaxis] + ols = LinearRegression() + + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + ols.fit(X, y.ravel()) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_linear_regression_sparse_equal_dense(setup, normalize, fit_intercept): + # Test that linear regression agrees between sparse and dense + rng = check_random_state(0) + n_samples = 200 + n_features = 2 + X = rng.randn(n_samples, n_features) + X[X < 0.1] = 0.0 + Xcsr = sparse.csr_matrix(X) + y = rng.rand(n_samples) + params = dict(normalize=normalize, fit_intercept=fit_intercept) + clf_dense = LinearRegression(**params) + clf_sparse = LinearRegression(**params) + clf_dense.fit(X, y) + + if fit_intercept is False: + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + clf_sparse.fit(Xcsr, y) + else: + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + clf_sparse.fit(Xcsr, y) + + +def test_linear_regression_multiple_outcome(setup, random_state=0): + # Test multiple-outcome linear regressions + X, y = make_regression(random_state=random_state) + + Y = np.vstack((y, y)).T + n_features = X.shape[1] + + reg = LinearRegression() + reg.fit((X), Y) + assert reg.coef_.shape == (2, n_features) + Y_pred = reg.predict(X) + reg.fit(X, y) + y_pred = reg.predict(X) + assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3) + + +def test_linear_regression_sparse_multiple_outcome(setup, 
random_state=0): + # Test multiple-outcome linear regressions with sparse data + random_state = check_random_state(random_state) + X, y = make_sparse_uncorrelated(random_state=random_state) + X = sparse.coo_matrix(X) + Y = np.vstack((y, y)).T + + ols = LinearRegression() + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + ols.fit(X, Y) + + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + ols.fit(X, y.ravel()) + + +# # When optimize.nnls is implemented, one can utilize this test case +# def test_linear_regression_positive(setup): +# # Test nonnegative LinearRegression on a simple dataset. +# X = [[1], [2]] +# y = [1, 2] + +# reg = LinearRegression(positive=True) +# reg.fit(X, y) + +# assert_array_almost_equal(reg.coef_, [1]) +# assert_array_almost_equal(reg.intercept_, [0]) +# assert_array_almost_equal(reg.predict(X), [1, 2]) + +# # test it also for degenerate input +# X = [[1]] +# y = [0] + +# reg = LinearRegression(positive=True) +# reg.fit(X, y) +# assert_allclose(reg.coef_, [0]) +# assert_allclose(reg.intercept_, [0]) +# assert_allclose(reg.predict(X), [0]) + + +# # When optimize.nnls is implemented, one can utilize this test case +# def test_linear_regression_positive_multiple_outcome(setup, random_state=0): +# # Test multiple-outcome nonnegative linear regressions +# random_state = check_random_state(random_state) +# X, y = make_sparse_uncorrelated(random_state=random_state) +# Y = np.vstack((y, y)).T +# n_features = X.shape[1] + +# ols = LinearRegression(positive=True) +# ols.fit(X, Y) +# assert ols.coef_.shape == (2, n_features) +# assert np.all(ols.coef_.to_numpy() >= 0.) +# Y_pred = ols.predict(X) +# ols.fit(X, y.ravel()) +# y_pred = ols.predict(X) +# assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred) + + +def test_linear_regression_positive_vs_nonpositive(setup): + # Test differences with LinearRegression when positive=False. + X, y = make_sparse_uncorrelated(random_state=0) + + # reg = LinearRegression(positive=True) + reg = sklearn_LR(positive=True) + reg.fit(X, y) + regn = LinearRegression(positive=False) + regn.fit(X, y) + + assert np.mean(((reg.coef_ - regn.coef_) ** 2).to_numpy()) > 1e-3 + + +def test_linear_regression_positive_vs_nonpositive_when_positive(setup): + # Test LinearRegression fitted coefficients + # when the problem is positive. + n_samples = 200 + n_features = 4 + X = rng.rand(n_samples, n_features) + y = X[:, 0] + 2 * X[:, 1] + 3 * X[:, 2] + 1.5 * X[:, 3] + + # reg = LinearRegression(positive=True) + reg = sklearn_LR(positive=True) + reg.fit(X, y) + regn = LinearRegression(positive=False) + regn.fit(X, y) + + assert np.mean(((reg.coef_ - regn.coef_) ** 2).to_numpy()) < 1e-6 + + +# # Failed: DID NOT WARN. +# # No such warning "pandas.DataFrame with sparse columns found." 
+# def test_linear_regression_pd_sparse_dataframe_warning(): +# pd = pytest.importorskip('pandas') +# # restrict the pd versions < '0.24.0' +# # as they have a bug in is_sparse func +# if parse_version(pd.__version__) < parse_version('0.24.0'): +# pytest.skip("pandas 0.24+ required.") + +# # Warning is raised only when some of the columns is sparse +# df = pd.DataFrame({'0': np.random.randn(10)}) +# for col in range(1, 4): +# arr = np.random.randn(10) +# arr[:8] = 0 +# # all columns but the first column is sparse +# if col != 0: +# arr = pd.arrays.SparseArray(arr, fill_value=0) +# df[str(col)] = arr + +# msg = "pandas.DataFrame with sparse columns found." +# with pytest.warns(UserWarning, match=msg): +# reg = LinearRegression() +# reg.fit(df.iloc[:, 0:2], df.iloc[:, 3]) + +# # does not warn when the whole dataframe is sparse +# df['0'] = pd.arrays.SparseArray(df['0'], fill_value=0) +# assert hasattr(df, "sparse") + +# with pytest.warns(None) as record: +# reg.fit(df.iloc[:, 0:2], df.iloc[:, 3]) +# assert not record + + +def test_preprocess_data(setup): + n_samples = 200 + n_features = 2 + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + expected_X_mean = np.mean(X, axis=0) + expected_X_norm = np.std(X, axis=0) * np.sqrt(X.shape[0]) + expected_y_mean = np.mean(y, axis=0) + + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=False, normalize=False + ) + assert_array_almost_equal(X_mean, np.zeros(n_features)) + assert_array_almost_equal(y_mean, 0) + assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(Xt, X) + assert_array_almost_equal(yt, y) + + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=True, normalize=False + ) + assert_array_almost_equal(X_mean, expected_X_mean) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(Xt, X - expected_X_mean) + assert_array_almost_equal(yt, y - expected_y_mean) + + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=True, normalize=True + ) + assert_array_almost_equal(X_mean, expected_X_mean) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(X_norm, expected_X_norm) + assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) + assert_array_almost_equal(yt, y - expected_y_mean) + + +def test_preprocess_data_multioutput(setup): + n_samples = 200 + n_features = 3 + n_outputs = 2 + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples, n_outputs) + expected_y_mean = np.mean(y, axis=0) + + # case 1 + _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=False, normalize=False) + assert_array_almost_equal(y_mean, np.zeros(n_outputs)) + assert_array_almost_equal(yt, y) + + _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True, normalize=False) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(yt, y - y_mean) + + _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True, normalize=True) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(yt, y - y_mean) + + # case 2 + X = sparse.csc_matrix(X) + error_msg = "Does not support sparse input!" 
+ with pytest.raises(NotImplementedError, match=error_msg): + _, yt, _, y_mean, _ = _preprocess_data( + X, y, fit_intercept=False, normalize=False + ) + + with pytest.raises(NotImplementedError, match=error_msg): + _, yt, _, y_mean, _ = _preprocess_data( + X, y, fit_intercept=True, normalize=False + ) + + with pytest.raises(NotImplementedError, match=error_msg): + _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True, normalize=True) + + +def test_preprocess_data_weighted(setup): + n_samples = 200 + n_features = 2 + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + sample_weight = rng.rand(n_samples) + expected_X_mean = np.average(X, axis=0, weights=sample_weight) + expected_y_mean = np.average(y, axis=0, weights=sample_weight) + + # XXX: if normalize=True, should we expect a weighted standard deviation? + # Currently not weighted, but calculated with respect to weighted mean + expected_X_norm = ( + np.sqrt(X.shape[0]) * np.mean((X - expected_X_mean) ** 2, axis=0) ** 0.5 + ) + + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=True, normalize=False, sample_weight=sample_weight + ) + assert_array_almost_equal(X_mean, expected_X_mean) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(Xt, X - expected_X_mean) + assert_array_almost_equal(yt, y - expected_y_mean) + + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=True, normalize=True, sample_weight=sample_weight + ) + assert_array_almost_equal(X_mean, expected_X_mean) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(X_norm, expected_X_norm) + assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) + assert_array_almost_equal(yt, y - expected_y_mean) + + +def test_sparse_preprocess_data_with_return_mean(setup): + n_samples = 200 + n_features = 2 + # random_state not supported yet in sparse.rand + X = sparse.rand(n_samples, n_features, density=0.5) # , random_state=rng + X = sparse.csr_matrix(X) + y = rng.rand(n_samples) + + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=False, normalize=False, return_mean=True + ) + + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, + y, + fit_intercept=True, + normalize=False, + return_mean=True, + check_input=False, + ) + + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, + y, + fit_intercept=True, + normalize=True, + return_mean=True, + check_input=False, + ) + + +# # AttributeError: 'TensorData' object has no attribute 'getformat' +# def test_csr_preprocess_data(): +# # Test output format of _preprocess_data, when input is csr +# X, y = make_regression() +# X[X < 2.5] = 0.0 +# csr = sparse.csr_matrix(X) +# csr_, y, _, _, _ = _preprocess_data(csr, y, True) +# assert csr_.getformat() == 'csr' + + +@pytest.mark.parametrize("is_sparse", (True, False)) +@pytest.mark.parametrize("to_copy", (True, False)) +def test_preprocess_copy_data_no_checks(setup, is_sparse, to_copy): + X, y = make_regression() + X[X < 2.5] = 0.0 + + if is_sparse: + X = sparse.csr_matrix(X) + error_msg = re.escape("Does not support sparse input!") + 
with pytest.raises(NotImplementedError, match=error_msg): + X_, y_, _, _, _ = _preprocess_data( + X, y, True, copy=to_copy, check_input=False + ) + else: + X_, y_, _, _, _ = _preprocess_data(X, y, True, copy=to_copy, check_input=False) + + if to_copy and is_sparse: + assert not np.may_share_memory(X_.data, X.data) + elif to_copy: + assert not np.may_share_memory(X_.to_numpy(), X) + elif is_sparse: + assert np.may_share_memory(X_.data, X.data) + # else: # fake pass + # assert np.may_share_memory(X_.to_numpy(), X) + + +def test_dtype_preprocess_data(setup): + n_samples = 200 + n_features = 2 + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + + X_32 = np.asarray(X, dtype=np.float32) + y_32 = np.asarray(y, dtype=np.float32) + X_64 = np.asarray(X, dtype=np.float64) + y_64 = np.asarray(y, dtype=np.float64) + + for fit_intercept in [True, False]: + for normalize in [True, False]: + Xt_32, yt_32, X_mean_32, y_mean_32, X_norm_32 = _preprocess_data( + X_32, + y_32, + fit_intercept=fit_intercept, + normalize=normalize, + return_mean=True, + ) + + Xt_64, yt_64, X_mean_64, y_mean_64, X_norm_64 = _preprocess_data( + X_64, + y_64, + fit_intercept=fit_intercept, + normalize=normalize, + return_mean=True, + ) + + Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_norm_3264 = _preprocess_data( + X_32, + y_64, + fit_intercept=fit_intercept, + normalize=normalize, + return_mean=True, + ) + + Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_norm_6432 = _preprocess_data( + X_64, + y_32, + fit_intercept=fit_intercept, + normalize=normalize, + return_mean=True, + ) + + assert Xt_32.dtype == np.float32 + assert yt_32.dtype == np.float32 + assert X_mean_32.dtype == np.float32 + assert y_mean_32.dtype == np.float32 + assert X_norm_32.dtype == np.float32 + + assert Xt_64.dtype == np.float64 + assert yt_64.dtype == np.float64 + assert X_mean_64.dtype == np.float64 + assert y_mean_64.dtype == np.float64 + assert X_norm_64.dtype == np.float64 + + assert Xt_3264.dtype == np.float32 + assert yt_3264.dtype == np.float32 + assert X_mean_3264.dtype == np.float32 + assert y_mean_3264.dtype == np.float32 + assert X_norm_3264.dtype == np.float32 + + assert Xt_6432.dtype == np.float64 + assert yt_6432.dtype == np.float64 + assert X_mean_6432.dtype == np.float64 + assert y_mean_6432.dtype == np.float64 + assert X_norm_6432.dtype == np.float64 + + assert X_32.dtype == np.float32 + assert y_32.dtype == np.float32 + assert X_64.dtype == np.float64 + assert y_64.dtype == np.float64 + + assert_array_almost_equal(Xt_32, Xt_64) + assert_array_almost_equal(yt_32, yt_64) + assert_array_almost_equal(X_mean_32, X_mean_64) + assert_array_almost_equal(y_mean_32, y_mean_64) + assert_array_almost_equal(X_norm_32, X_norm_64) + + +@pytest.mark.parametrize("n_targets", [None, 2]) +def test_rescale_data_dense(setup, n_targets): + n_samples = 200 + n_features = 2 + + sample_weight = 1.0 + rng.rand(n_samples) + X = rng.rand(n_samples, n_features) + if n_targets is None: + y = rng.rand(n_samples) + else: + y = rng.rand(n_samples, n_targets) + rescaled_X, rescaled_y = _rescale_data(X, y, sample_weight) + rescaled_X2 = X * np.sqrt(sample_weight)[:, np.newaxis] + if n_targets is None: + rescaled_y2 = y * np.sqrt(sample_weight) + else: + rescaled_y2 = y * np.sqrt(sample_weight)[:, np.newaxis] + assert_array_almost_equal(rescaled_X, rescaled_X2) + assert_array_almost_equal(rescaled_y, rescaled_y2) + + +def test_fused_types_make_dataset(setup): + iris = load_iris() + + X_32 = iris.data.astype(np.float32) + y_32 = iris.target.astype(np.float32) + 
X_csr_32 = sparse.csr_matrix(X_32) + sample_weight_32 = np.arange(y_32.size, dtype=np.float32) + + X_64 = iris.data.astype(np.float64) + y_64 = iris.target.astype(np.float64) + X_csr_64 = sparse.csr_matrix(X_64) + sample_weight_64 = np.arange(y_64.size, dtype=np.float64) + + # array + dataset_32, _ = make_dataset(X_32, y_32, sample_weight_32) + dataset_64, _ = make_dataset(X_64, y_64, sample_weight_64) + xi_32, yi_32, _, _ = dataset_32._next_py() + xi_64, yi_64, _, _ = dataset_64._next_py() + xi_data_32, _, _ = xi_32 + xi_data_64, _, _ = xi_64 + + assert xi_data_32.dtype == np.float32 + assert xi_data_64.dtype == np.float64 + assert_allclose(yi_64, yi_32, rtol=rtol) + + # csr + datasetcsr_32, _ = make_dataset(X_csr_32, y_32, sample_weight_32) + datasetcsr_64, _ = make_dataset(X_csr_64, y_64, sample_weight_64) + xicsr_32, yicsr_32, _, _ = datasetcsr_32._next_py() + xicsr_64, yicsr_64, _, _ = datasetcsr_64._next_py() + xicsr_data_32, _, _ = xicsr_32 + xicsr_data_64, _, _ = xicsr_64 + + assert xicsr_data_32.dtype == np.float32 + assert xicsr_data_64.dtype == np.float64 + + assert_allclose(xicsr_data_64, xicsr_data_32, rtol=rtol) + assert_allclose(yicsr_64, yicsr_32, rtol=rtol) + + assert_array_equal(xi_data_32, xicsr_data_32) + assert_array_equal(xi_data_64, xicsr_data_64) + assert_array_equal(yi_32, yicsr_32) + assert_array_equal(yi_64, yicsr_64) + + +def test_raise_notimplemented_when_positive(setup): + error_msg = re.escape("Does not support positive coefficients!") + + X = [[1], [2]] + y = [1, 2] + + reg = LinearRegression(positive=True) + with pytest.raises(NotImplementedError, match=error_msg): + reg.fit(X, y) diff --git a/python/xorbits/_mars/learn/metrics/__init__.py b/python/xorbits/_mars/learn/metrics/__init__.py new file mode 100644 index 000000000..5c7c792b1 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._classification import ( + accuracy_score, + f1_score, + fbeta_score, + log_loss, + multilabel_confusion_matrix, + precision_recall_fscore_support, + precision_score, + recall_score, +) +from ._ranking import auc, roc_auc_score, roc_curve +from ._regresssion import r2_score +from ._scorer import get_scorer +from .pairwise import euclidean_distances, pairwise_distances, pairwise_distances_topk diff --git a/python/xorbits/_mars/learn/metrics/_base.py b/python/xorbits/_mars/learn/metrics/_base.py new file mode 100644 index 000000000..337915669 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_base.py @@ -0,0 +1,131 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import combinations + +from ... import tensor as mt +from ..utils import check_array, check_consistent_length +from ..utils.multiclass import type_of_target + + +def _average_binary_score( + binary_metric, + y_true, + y_score, + average, + sample_weight=None, + session=None, + run_kwargs=None, +): + average_options = (None, "micro", "macro", "weighted", "samples") + if average not in average_options: # pragma: no cover + raise ValueError("average has to be one of {0}".format(average_options)) + + y_type = type_of_target(y_true).to_numpy(session=session, **(run_kwargs or dict())) + if y_type not in ("binary", "multilabel-indicator"): # pragma: no cover + raise ValueError("{0} format is not supported".format(y_type)) + + if y_type == "binary": + return binary_metric(y_true, y_score, sample_weight=sample_weight) + + check_consistent_length( + y_true, y_score, sample_weight, session=session, run_kwargs=run_kwargs + ) + y_true = check_array(y_true) + y_score = check_array(y_score) + + not_average_axis = 1 + score_weight = sample_weight + average_weight = None + + if average == "micro": + if score_weight is not None: # pragma: no cover + score_weight = mt.repeat(score_weight, y_true.shape[1]) + y_true = y_true.ravel() + y_score = y_score.ravel() + + elif average == "weighted": + if score_weight is not None: # pragma: no cover + average_weight = mt.sum( + mt.multiply(y_true, mt.reshape(score_weight, (-1, 1))), axis=0 + ) + else: + average_weight = mt.sum(y_true, axis=0) + if mt.isclose(average_weight.sum(), 0.0).to_numpy( + session=session, **(run_kwargs or dict()) + ): + return 0 + + elif average == "samples": + # swap average_weight <-> score_weight + average_weight = score_weight + score_weight = None + not_average_axis = 0 + + if y_true.ndim == 1: + y_true = y_true.reshape((-1, 1)) + + if y_score.ndim == 1: + y_score = y_score.reshape((-1, 1)) + + n_classes = y_score.shape[not_average_axis] + score = mt.zeros((n_classes,)) + for c in range(n_classes): + y_true_c = y_true.take([c], axis=not_average_axis).ravel() + y_score_c = y_score.take([c], axis=not_average_axis).ravel() + score[c] = binary_metric(y_true_c, y_score_c, sample_weight=score_weight) + + # Average the results + if average is not None: + if average_weight is not None: + # Scores with 0 weights are forced to be 0, preventing the average + # score from being affected by 0-weighted NaN elements. 
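+            # Coerce average_weight to a Mars tensor first so the zero-weight
+            # mask assignment below operates on tensors.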
+ average_weight = mt.asarray(average_weight) + score[average_weight == 0] = 0 + return mt.average(score, weights=average_weight) + else: + return score + + +def _average_multiclass_ovo_score( + binary_metric, y_true, y_score, average="macro", session=None, run_kwargs=None +): + check_consistent_length(y_true, y_score, session=session, run_kwargs=run_kwargs) + + y_true_unique = mt.unique(y_true).to_numpy() + n_classes = y_true_unique.shape[0] + n_pairs = n_classes * (n_classes - 1) // 2 + pair_scores = mt.empty(n_pairs) + + is_weighted = average == "weighted" + prevalence = mt.empty(n_pairs) if is_weighted else None + + # Compute scores treating a as positive class and b as negative class, + # then b as positive class and a as negative class + for ix, (a, b) in enumerate(combinations(y_true_unique, 2)): + a_mask = y_true == a + b_mask = y_true == b + ab_mask = mt.logical_or(a_mask, b_mask) + + if is_weighted: + prevalence[ix] = mt.average(ab_mask) + + a_true = a_mask[ab_mask] + b_true = b_mask[ab_mask] + + a_true_score = binary_metric(a_true, y_score[ab_mask, a]) + b_true_score = binary_metric(b_true, y_score[ab_mask, b]) + pair_scores[ix] = (a_true_score + b_true_score) / 2 + + return mt.average(pair_scores, weights=prevalence) diff --git a/python/xorbits/_mars/learn/metrics/_check_targets.py b/python/xorbits/_mars/learn/metrics/_check_targets.py new file mode 100644 index 000000000..e9d58bf53 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_check_targets.py @@ -0,0 +1,196 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + from sklearn.metrics._classification import _check_targets as sklearn_check_targets +except ImportError: # pragma: no cover + # sklearn < 0.22 + from sklearn.metrics.classification import _check_targets as sklearn_check_targets + +from ... import opcodes as OperandDef +from ... 
import tensor as mt +from ...core import ENTITY_TYPE, ExecutableTuple, recursive_tile +from ...core.context import get_context +from ...serialization.serializables import AnyField +from ...tensor.core import TENSOR_TYPE, TensorOrder +from ..operands import LearnOperand, LearnOperandMixin, OutputType +from ..utils import check_consistent_length, column_or_1d +from ..utils.multiclass import type_of_target + + +class CheckTargets(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.CHECK_TARGETS + + y_true = AnyField("y_true") + y_pred = AnyField("y_pred") + + @property + def output_limit(self): + return 3 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + if isinstance(self.y_true, ENTITY_TYPE): + self.y_true = next(inputs_iter) + if isinstance(self.y_pred, ENTITY_TYPE): + self.y_pred = next(inputs_iter) + + def __call__(self, y_true, y_pred): + # scalar(y_type), y_true, y_pred + self.output_types = [OutputType.tensor] * 3 + + inputs = [] + if isinstance(y_true, ENTITY_TYPE): + inputs.append(y_true) + if isinstance(y_pred, ENTITY_TYPE): + inputs.append(y_pred) + + kws = list() + kws.append( + {"shape": (), "dtype": np.dtype(object), "order": TensorOrder.C_ORDER} + ) + kws.extend([y.params for y in (mt.tensor(y_true), mt.tensor(y_pred))]) + kws[1]["shape"] = kws[2]["shape"] = (np.nan,) + return ExecutableTuple(self.new_tileables(inputs, kws=kws)) + + @classmethod + def tile(cls, op): + y_true, y_pred = op.y_true, op.y_pred + if isinstance(y_true, ENTITY_TYPE): + y_true = mt.tensor(y_true) + if isinstance(y_pred, ENTITY_TYPE): + y_pred = mt.tensor(y_pred) + + if len(op.inputs) == 0: + # no entity input + type_true, y_true, y_pred = sklearn_check_targets(y_true, y_pred) + new_op = op.copy() + outs = yield from recursive_tile( + mt.tensor(type_true), mt.tensor(y_true), mt.tensor(y_pred) + ) + params = [out.params.copy() for out in op.outputs] + for param, out in zip(params, outs): + param["nsplits"] = out.nsplits + param["chunks"] = out.chunks + param["shape"] = out.shape + return new_op.new_tileables(op.inputs, kws=params) + + check_consistent_length(y_true, y_pred) + + type_true, type_pred = type_of_target(y_true), type_of_target(y_pred) + y_true, y_pred = mt.tensor(y_true), mt.tensor(y_pred) + tileables = y_true, y_pred, type_true, type_pred = yield from recursive_tile( + y_true, y_pred, type_true, type_pred + ) + yield [c for t in tileables for c in t.chunks] + + ctx = get_context() + type_true, type_pred = [ + d.item() if hasattr(d, "item") else d + for d in ctx.get_chunks_result( + [type_true.chunks[0].key, type_pred.chunks[0].key] + ) + ] + + y_type = {type_true, type_pred} + if y_type == {"binary", "multiclass"}: + y_type = {"multiclass"} + + if len(y_type) > 1: + raise ValueError( + f"Classification metrics can't handle a mix of {type_true} " + f"and {type_pred} targets" + ) + + # We can't have more than one value on y_type => The set is no more needed + y_type = y_type.pop() + + # No metrics support "multiclass-multioutput" format + if y_type not in ["binary", "multiclass", "multilabel-indicator"]: + raise ValueError(f"{y_type} is not supported") + + if y_type in ["binary", "multiclass"]: + y_true = column_or_1d(y_true) + y_pred = column_or_1d(y_pred) + if y_type == "binary": + unique_values = mt.union1d(y_true, y_pred) + y_type = mt.where( + mt.count_nonzero(unique_values) > 2, "multiclass", y_type + ) + elif y_type.startswith("multilabel"): + y_true = mt.tensor(y_true).tosparse() + y_pred = mt.tensor(y_pred).tosparse() + y_type 
= "multilabel-indicator" + + if not isinstance(y_true, ENTITY_TYPE): + y_true = mt.tensor(y_true) + if not isinstance(y_pred, ENTITY_TYPE): + y_pred = mt.tensor(y_pred) + if not isinstance(y_type, TENSOR_TYPE): + y_type = mt.tensor(y_type, dtype=object) + + y_type, y_true, y_pred = yield from recursive_tile(y_type, y_true, y_pred) + + kws = [out.params for out in op.outputs] + kws[0].update(dict(nsplits=(), chunks=[y_type.chunks[0]])) + kws[1].update( + dict( + nsplits=y_true.nsplits, + chunks=y_true.chunks, + shape=tuple(sum(sp) for sp in y_true.nsplits), + ) + ) + kws[2].update( + dict( + nsplits=y_pred.nsplits, + chunks=y_pred.chunks, + shape=tuple(sum(sp) for sp in y_pred.nsplits), + ) + ) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=kws) + + +def _check_targets(y_true, y_pred): + """Check that y_true and y_pred belong to the same classification task + + This converts multiclass or binary types to a common shape, and raises a + ValueError for a mix of multilabel and multiclass targets, a mix of + multilabel formats, for the presence of continuous-valued or multioutput + targets, or for targets of different lengths. + + Column vectors are squeezed to 1d, while multilabel formats are returned + as CSR sparse label indicators. + + Parameters + ---------- + y_true : array-like + + y_pred : array-like + + Returns + ------- + type_true : one of {'multilabel-indicator', 'multiclass', 'binary'} + The type of the true target data, as output by + ``utils.multiclass.type_of_target`` + + y_true : Tensor + + y_pred : Tensor + """ + op = CheckTargets(y_true=y_true, y_pred=y_pred) + return op(y_true, y_pred) diff --git a/python/xorbits/_mars/learn/metrics/_classification.py b/python/xorbits/_mars/learn/metrics/_classification.py new file mode 100644 index 000000000..b7c8a46eb --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_classification.py @@ -0,0 +1,1475 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import numpy as np +from sklearn.exceptions import UndefinedMetricWarning + +from ... import execute, fetch +from ... import opcodes as OperandDef +from ... 
import tensor as mt +from ...core import ENTITY_TYPE, recursive_tile +from ...core.context import get_context +from ...serialization.serializables import AnyField, BoolField, KeyField +from ...tensor.core import TensorOrder +from ..operands import LearnOperand, LearnOperandMixin, OutputType +from ..preprocessing import LabelBinarizer, LabelEncoder +from ..utils import check_array, check_consistent_length, column_or_1d +from ..utils.multiclass import unique_labels +from ..utils.sparsefuncs import count_nonzero +from ._check_targets import _check_targets + + +class AccuracyScore(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.ACCURACY_SCORE + + _y_true = AnyField("y_true") + _y_pred = AnyField("y_pred") + _normalize = BoolField("normalize") + _sample_weight = AnyField("sample_weight") + _type_true = KeyField("type_true") + + def __init__( + self, + y_true=None, + y_pred=None, + normalize=None, + sample_weight=None, + type_true=None, + **kw + ): + super().__init__( + _y_true=y_true, + _y_pred=y_pred, + _normalize=normalize, + _sample_weight=sample_weight, + _type_true=type_true, + **kw + ) + self.output_types = [OutputType.tensor] + + @property + def y_true(self): + return self._y_true + + @property + def y_pred(self): + return self._y_pred + + @property + def normalize(self): + return self._normalize + + @property + def sample_weight(self): + return self._sample_weight + + @property + def type_true(self): + return self._type_true + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + if self._y_true is not None: + self._y_true = next(inputs_iter) + if self._y_pred is not None: + self._y_pred = next(inputs_iter) + if self._type_true is not None: + self._type_true = next(inputs_iter) + if isinstance(self._sample_weight, ENTITY_TYPE): + self._sample_weight = next(inputs_iter) + + def __call__(self, y_true, y_pred): + type_true, y_true, y_pred = _check_targets(y_true, y_pred) + self._type_true = type_true + inputs = [y_true, y_pred, type_true] + if isinstance(self._sample_weight, ENTITY_TYPE): + inputs.append(self._sample_weight) + + dtype = np.dtype(float) if self._normalize else np.result_type(y_true, y_pred) + return self.new_tileable( + inputs, dtype=dtype, shape=(), order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + # make sure type_true executed first + chunks = [op.type_true.chunks[0]] + yield chunks + + ctx = get_context() + type_true = ctx.get_chunks_result([chunks[0].key])[0] + + y_true, y_pred = op.y_true, op.y_pred + if type_true.item().startswith("multilabel"): + differing_labels = mt.count_nonzero(y_true - y_pred, axis=1) + score = mt.equal(differing_labels, 0) + else: + score = mt.equal(y_true, y_pred) + + result = _weighted_sum(score, op.sample_weight, op.normalize) + return [(yield from recursive_tile(result))] + + +def _weighted_sum(sample_score, sample_weight, normalize=False): + if normalize: + return mt.average(sample_score, weights=sample_weight) + elif sample_weight is not None: + return mt.dot(sample_score, sample_weight) + else: + return sample_score.sum() + + +def accuracy_score( + y_true, y_pred, normalize=True, sample_weight=None, session=None, run_kwargs=None +): + """Accuracy classification score. + + In multilabel classification, this function computes subset accuracy: + the set of labels predicted for a sample must *exactly* match the + corresponding set of labels in y_true. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + y_true : 1d array-like, or label indicator tensor / sparse tensor + Ground truth (correct) labels. + + y_pred : 1d array-like, or label indicator tensor / sparse tensor + Predicted labels, as returned by a classifier. + + normalize : bool, optional (default=True) + If ``False``, return the number of correctly classified samples. + Otherwise, return the fraction of correctly classified samples. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + If ``normalize == True``, return the fraction of correctly + classified samples (float), else returns the number of correctly + classified samples (int). + + The best performance is 1 with ``normalize == True`` and the number + of samples with ``normalize == False``. + + See also + -------- + jaccard_score, hamming_loss, zero_one_loss + + Notes + ----- + In binary and multiclass classification, this function is equal + to the ``jaccard_score`` function. + + Examples + -------- + >>> from mars.learn.metrics import accuracy_score + >>> y_pred = [0, 2, 1, 3] + >>> y_true = [0, 1, 2, 3] + >>> accuracy_score(y_true, y_pred).execute() + 0.5 + >>> accuracy_score(y_true, y_pred, normalize=False).execute() + 2 + + In the multilabel case with binary label indicators: + + >>> import mars.tensor as mt + >>> accuracy_score(mt.array([[0, 1], [1, 1]]), mt.ones((2, 2))).execute() + 0.5 + """ + + # Compute accuracy for each possible representation + op = AccuracyScore( + y_true=y_true, y_pred=y_pred, normalize=normalize, sample_weight=sample_weight + ) + score = op(y_true, y_pred) + return score.execute(session=session, **(run_kwargs or dict())) + + +def log_loss( + y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, labels=None +): + r"""Log loss, aka logistic loss or cross-entropy loss. + + This is the loss function used in (multinomial) logistic regression + and extensions of it such as neural networks, defined as the negative + log-likelihood of a logistic model that returns ``y_pred`` probabilities + for its training data ``y_true``. + The log loss is only defined for two or more labels. + For a single sample with true label :math:`y \in \{0,1\}` and + and a probability estimate :math:`p = \operatorname{Pr}(y = 1)`, the log + loss is: + + .. math:: + L_{\log}(y, p) = -(y \log (p) + (1 - y) \log (1 - p)) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels for n_samples samples. + + y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. If ``y_pred.shape = (n_samples,)`` + the probabilities provided are assumed to be that of the + positive class. The labels in ``y_pred`` are assumed to be + ordered alphabetically, as done by + :class:`preprocessing.LabelBinarizer`. + + eps : float, default=1e-15 + Log loss is undefined for p=0 or p=1, so probabilities are + clipped to max(eps, min(1 - eps, p)). + + normalize : bool, default=True + If true, return the mean loss per sample. + Otherwise, return the sum of the per-sample losses. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If ``labels`` + is ``None`` and ``y_pred`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. 
+ + Returns + ------- + loss : float + + Notes + ----- + The logarithm used is the natural logarithm (base-e). + + Examples + -------- + >>> from mars.learn.metrics import log_loss + >>> log_loss(["spam", "ham", "ham", "spam"], + ... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]]) + 0.21616... + + References + ---------- + C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer, + p. 209. + """ + y_pred = check_array(y_pred, ensure_2d=False) + check_consistent_length(y_pred, y_true, sample_weight) + + lb = LabelBinarizer() + + if labels is not None: + lb.fit(labels) + else: + lb.fit(y_true) + + if len(lb.classes_) == 1: + if labels is None: + raise ValueError( + "y_true contains only one label ({0}). Please " + "provide the true labels explicitly through the " + "labels argument.".format(lb.classes_[0].fetch()) + ) + else: + raise ValueError( + "The labels array needs to contain at least two " + "labels for log_loss, " + "got {0}.".format(lb.classes_.fetch()) + ) + + transformed_labels = lb.transform(y_true) + + if transformed_labels.shape[1] == 1: + transformed_labels = mt.append( + 1 - transformed_labels, transformed_labels, axis=1 + ) + + # Clipping + y_pred = mt.clip(y_pred, eps, 1 - eps) + + # If y_pred is of single dimension, assume y_true to be binary + # and then check. + if y_pred.ndim == 1: # pragma: no cover + y_pred = y_pred[:, mt.newaxis] + if y_pred.shape[1] == 1: # pragma: no cover + y_pred = mt.append(1 - y_pred, y_pred, axis=1) + + # Check if dimensions are consistent. + transformed_labels = check_array(transformed_labels) + if len(lb.classes_) != y_pred.shape[1]: + if labels is None: + raise ValueError( + "y_true and y_pred contain different number of " + "classes {0}, {1}. Please provide the true " + "labels explicitly through the labels argument. " + "Classes found in " + "y_true: {2}".format( + transformed_labels.shape[1], y_pred.shape[1], lb.classes_.fetch() + ) + ) + else: + raise ValueError( + "The number of classes in labels is different " + "from that in y_pred. Classes found in " + "labels: {0}".format(lb.classes_.fetch()) + ) + + # Renormalize + y_pred /= y_pred.sum(axis=1)[:, mt.newaxis] + loss = -(transformed_labels * mt.log(y_pred)).sum(axis=1) + + return _weighted_sum(loss, sample_weight, normalize).execute() + + +def multilabel_confusion_matrix( + y_true, + y_pred, + *, + sample_weight=None, + labels=None, + samplewise=False, + session=None, + run_kwargs=None +): + """ + Compute a confusion matrix for each class or sample. + + Compute class-wise (default) or sample-wise (samplewise=True) multilabel + confusion matrix to evaluate the accuracy of a classification, and output + confusion matrices for each class or sample. + + In multilabel confusion matrix :math:`MCM`, the count of true negatives + is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`, + true positives is :math:`MCM_{:,1,1}` and false positives is + :math:`MCM_{:,0,1}`. + + Multiclass data will be treated as if binarized under a one-vs-rest + transformation. Returned confusion matrices will be in the order of + sorted unique labels in the union of (y_true, y_pred). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \ + (n_samples,) + Ground truth (correct) target values. + + y_pred : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \ + (n_samples,) + Estimated targets as returned by a classifier. 
+ + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like of shape (n_classes,), default=None + A list of classes or column indices to select some (or to force + inclusion of classes absent from the data). + + samplewise : bool, default=False + In the multilabel case, this calculates a confusion matrix per sample. + + Returns + ------- + multi_confusion : ndarray of shape (n_outputs, 2, 2) + A 2x2 confusion matrix corresponding to each output in the input. + When calculating class-wise multi_confusion (default), then + n_outputs = n_labels; when calculating sample-wise multi_confusion + (samplewise=True), n_outputs = n_samples. If ``labels`` is defined, + the results will be returned in the order specified in ``labels``, + otherwise the results will be returned in sorted order by default. + + See Also + -------- + confusion_matrix : Compute confusion matrix to evaluate the accuracy of a + classifier. + + Notes + ----- + The `multilabel_confusion_matrix` calculates class-wise or sample-wise + multilabel confusion matrices, and in multiclass tasks, labels are + binarized under a one-vs-rest way; while + :func:`~sklearn.metrics.confusion_matrix` calculates one confusion matrix + for confusion between every two classes. + + Examples + -------- + Multiclass case: + + >>> import mars.tensor as mt + >>> from mars.learn.metrics import multilabel_confusion_matrix + >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"] + >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"] + >>> multilabel_confusion_matrix(y_true, y_pred, + ... labels=["ant", "bird", "cat"]) + array([[[3, 1], + [0, 2]], + + [[5, 0], + [1, 0]], + + [[2, 1], + [1, 2]]]) + + Multilabel-indicator case not implemented yet. + """ + exec_kw = dict(session=session, **(run_kwargs or dict())) + + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + execute(y_type, y_true, y_pred, **exec_kw) + y_type = y_type.fetch() + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + check_consistent_length(y_true, y_pred, sample_weight, **exec_kw) + + if y_type not in ("binary", "multiclass", "multilabel-indicator"): + raise ValueError("%s is not supported" % y_type) + + present_labels = unique_labels(y_true, y_pred) + if labels is None: + labels = present_labels + n_labels = None + else: + labels = mt.tensor(labels) + n_labels = labels.shape[0] + # todo simplify this when mt.setdiff1d is implemented. + labels = labels.rechunk(((np.nan,),)).map_chunk( + lambda l, pl: np.hstack([l, np.setdiff1d(pl, l, assume_unique=True)]), + args=(present_labels,), + dtype=labels.dtype, + shape=(np.nan,), + ) + + if y_true.ndim == 1: + if samplewise: + raise ValueError( + "Samplewise metrics are not available outside of " + "multilabel classification." 
+ ) + + le = LabelEncoder() + le.fit(labels, execute=False) + y_true = le.transform(y_true, execute=False) + y_pred = le.transform(y_pred, execute=False) + sorted_labels = le.classes_ + + # labels are now from 0 to len(labels) - 1 -> use bincount + tp = y_true == y_pred + tp_bins = y_true[tp] + execute(labels, y_true, y_pred, tp_bins, **exec_kw) + if sample_weight is not None: + tp_bins_weights = mt.asarray(sample_weight)[tp] + else: + tp_bins_weights = None + + if tp_bins.shape[0]: + tp_sum = mt.bincount( + tp_bins, weights=tp_bins_weights, minlength=labels.shape[0] + ) + else: + # Pathological case + true_sum = pred_sum = tp_sum = mt.zeros(labels.shape[0]) + if y_pred.shape[0]: + pred_sum = mt.bincount( + y_pred, weights=sample_weight, minlength=labels.shape[0] + ) + if y_true.shape[0]: + true_sum = mt.bincount( + y_true, weights=sample_weight, minlength=labels.shape[0] + ) + + # Retain only selected labels + indices = mt.searchsorted(sorted_labels, labels[:n_labels]) + tp_sum = tp_sum[indices] + true_sum = true_sum[indices] + pred_sum = pred_sum[indices] + + else: + sum_axis = 1 if samplewise else 0 + + def _check_labels(labels, present_labels): + # All labels are index integers for multilabel. + # Select labels: + if not np.array_equal(labels, present_labels): + if np.max(labels) > np.max(present_labels): + raise ValueError( + "All labels must be in [0, n labels) for " + "multilabel targets. " + "Got %d > %d" % (np.max(labels), np.max(present_labels)) + ) + if np.min(labels) < 0: + raise ValueError( + "All labels must be in [0, n labels) for " + "multilabel targets. " + "Got %d < 0" % np.min(labels) + ) + return labels + + labels = labels.map_chunk( + _check_labels, + args=(present_labels,), + dtype=labels.dtype, + shape=labels.shape, + ) + + if n_labels is not None: + y_true = y_true[:, labels[:n_labels]] + y_pred = y_pred[:, labels[:n_labels]] + + # calculate weighted counts + true_and_pred = mt.multiply(y_true, y_pred) + tp_sum = count_nonzero( + true_and_pred, axis=sum_axis, sample_weight=sample_weight + ) + pred_sum = count_nonzero(y_pred, axis=sum_axis, sample_weight=sample_weight) + true_sum = count_nonzero(y_true, axis=sum_axis, sample_weight=sample_weight) + + fp = pred_sum - tp_sum + fn = true_sum - tp_sum + tp = tp_sum + + # we need to obtain correct shape of y_true for further computation + executables = (fp, fn, tp, y_true) + execute(*executables, **exec_kw) + + if sample_weight is not None and samplewise: + sample_weight = mt.asarray(sample_weight) + tp = mt.asarray(tp) + fp = mt.asarray(fp) + fn = mt.asarray(fn) + tn = sample_weight * y_true.shape[1] - tp - fp - fn + elif sample_weight is not None: + tn = sum(sample_weight) - tp - fp - fn + elif samplewise: + tn = y_true.shape[1] - tp - fp - fn + else: + tn = y_true.shape[0] - tp - fp - fn + + ret = mt.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2) + return ret.execute(**exec_kw) + + +def _check_zero_division(zero_division): # pragma: no cover + if isinstance(zero_division, str) and zero_division == "warn": + return + elif isinstance(zero_division, (int, float)) and zero_division in [0, 1]: + return + raise ValueError( + "Got zero_division={0}." ' Must be one of ["warn", 0, 1]'.format(zero_division) + ) + + +def _warn_prf(average, modifier, msg_start, result_size): # pragma: no cover + axis0, axis1 = "sample", "label" + if average == "samples": + axis0, axis1 = axis1, axis0 + msg = ( + "{0} ill-defined and being set to 0.0 {{0}} " + "no {1} {2}s. 
Use `zero_division` parameter to control" + " this behavior.".format(msg_start, modifier, axis0) + ) + if result_size == 1: + msg = msg.format("due to") + else: + msg = msg.format("in {0}s with".format(axis1)) + warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) + + +def _prf_divide( + numerator, denominator, metric, modifier, average, warn_for, zero_division="warn" +): # pragma: no cover + """Performs division and handles divide-by-zero. + + On zero-division, sets the corresponding result elements equal to + 0 or 1 (according to ``zero_division``). Plus, if + ``zero_division != "warn"`` raises a warning. + + The metric, modifier and average arguments are used only for determining + an appropriate warning. + """ + mask = denominator == 0.0 + denominator = denominator.copy() + denominator[mask] = 1 # avoid infs/nans + result = numerator / denominator + + # if ``zero_division=1``, set those with denominator == 0 equal to 1 + result[mask] = 0.0 if zero_division in ["warn", 0] else 1.0 + + # the user will be removing warnings if zero_division is set to something + # different than its default value. If we are computing only f-score + # the warning will be raised only if precision and recall are ill-defined + if zero_division != "warn" or metric not in warn_for: + return result + + # build appropriate warning + # E.g. "Precision and F-score are ill-defined and being set to 0.0 in + # labels with no predicted samples. Use ``zero_division`` parameter to + # control this behavior." + + if metric in warn_for and "f-score" in warn_for: + msg_start = "{0} and F-score are".format(metric.title()) + elif metric in warn_for: + msg_start = "{0} is".format(metric.title()) + elif "f-score" in warn_for: + msg_start = "F-score is" + else: + return result + + _warn_prf(average, modifier, msg_start, len(result)) + + return result + + +def _check_set_wise_labels( + y_true, y_pred, average, labels, pos_label, session=None, run_kwargs=None +): # pragma: no cover + """Validation associated with set-wise metrics + + Returns identified labels + """ + exec_kwargs = dict(session=session, **(run_kwargs or dict())) + average_options = (None, "micro", "macro", "weighted", "samples") + if average not in average_options and average != "binary": + raise ValueError("average has to be one of " + str(average_options)) + + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + present_labels = unique_labels(y_true, y_pred) + execute(y_type, y_true, y_pred, **exec_kwargs) + y_type = y_type.fetch(**exec_kwargs) + + if average == "binary": + if y_type == "binary": + t_pos_in_labels = mt.any(mt.isin(present_labels, pos_label)) + execute(t_pos_in_labels, present_labels, **exec_kwargs) + pos_in_labels = t_pos_in_labels.fetch(**exec_kwargs) + if pos_in_labels: + if present_labels.shape[0] >= 2: + raise ValueError( + "pos_label=%r is not a valid label: " + "%r" % (pos_label, present_labels) + ) + labels = [pos_label] + else: + average_options = list(average_options) + if y_type == "multiclass": + average_options.remove("samples") + raise ValueError( + "Target is %s but average='binary'. Please " + "choose another average setting, one of %r." % (y_type, average_options) + ) + elif pos_label not in (None, 1): + warnings.warn( + "Note that pos_label (set to %r) is ignored when " + "average != 'binary' (got %r). You may use " + "labels=[pos_label] to specify a single positive class." 
+ % (pos_label, average), + UserWarning, + ) + return labels + + +def precision_recall_fscore_support( + y_true, + y_pred, + *, + beta=1.0, + labels=None, + pos_label=1, + average=None, + warn_for=("precision", "recall", "f-score"), + sample_weight=None, + zero_division="warn", + session=None, + run_kwargs=None +): + """Compute precision, recall, F-measure and support for each class + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The F-beta score can be interpreted as a weighted harmonic mean of + the precision and recall, where an F-beta score reaches its best + value at 1 and worst score at 0. + + The F-beta score weights recall more than precision by a factor of + ``beta``. ``beta == 1.0`` means recall and precision are equally important. + + The support is the number of occurrences of each class in ``y_true``. + + If ``pos_label is None`` and in binary classification, this function + returns the average precision, recall and F-measure if ``average`` + is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + beta : float, 1.0 by default + The strength of recall versus precision in the F-score. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ + 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. 
+ ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division: + - recall: when there are no positive labels + - precision: when there are no positive predictions + - f-score: both + + If set to "warn", this acts as 0, but warnings are also raised. + + Returns + ------- + precision : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + + recall : float (if average is not None) or array of float, , shape =\ + [n_unique_labels] + + fbeta_score : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + + support : None (if average is not None) or array of int, shape =\ + [n_unique_labels] + The number of occurrences of each label in ``y_true``. + + References + ---------- + .. [1] `Wikipedia entry for the Precision and recall + `_ + + .. [2] `Wikipedia entry for the F1-score + `_ + + .. [3] `Discriminative Methods for Multi-labeled Classification Advances + in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu + Godbole, Sunita Sarawagi + `_ + + Examples + -------- + >>> import numpy as np + >>> from mars.learn.metrics import precision_recall_fscore_support + >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) + >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) + >>> precision_recall_fscore_support(y_true, y_pred, average='macro') + (0.22..., 0.33..., 0.26..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='micro') + (0.33..., 0.33..., 0.33..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') + (0.22..., 0.33..., 0.26..., None) + + It is possible to compute per-label precisions, recalls, F1-scores and + supports instead of averaging: + + >>> precision_recall_fscore_support(y_true, y_pred, average=None, + ... labels=['pig', 'dog', 'cat']) + (array([0. , 0. , 0.66...]), + array([0., 0., 1.]), array([0. , 0. , 0.8]), + array([2, 2, 2])) + + Notes + ----- + When ``true positive + false positive == 0``, precision is undefined; + When ``true positive + false negative == 0``, recall is undefined. + In such cases, by default the metric will be set to 0, as will f-score, + and ``UndefinedMetricWarning`` will be raised. This behavior can be + modified with ``zero_division``. 
+ """ + exec_kw = dict(session=session, **(run_kwargs or dict())) + + _check_zero_division(zero_division) + if beta < 0: + raise ValueError("beta should be >=0 in the F-beta score") + labels = _check_set_wise_labels( + y_true, + y_pred, + average, + labels, + pos_label, + session=session, + run_kwargs=run_kwargs, + ) + + # Calculate tp_sum, pred_sum, true_sum ### + samplewise = average == "samples" + MCM = multilabel_confusion_matrix( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + samplewise=samplewise, + session=session, + run_kwargs=run_kwargs, + ) + tp_sum = MCM[:, 1, 1] + pred_sum = tp_sum + MCM[:, 0, 1] + true_sum = tp_sum + MCM[:, 1, 0] + + if average == "micro": + tp_sum = mt.array([tp_sum.sum()]) + pred_sum = mt.array([pred_sum.sum()]) + true_sum = mt.array([true_sum.sum()]) + + execute(true_sum, **exec_kw) + + # Finally, we have all our sufficient statistics. Divide! # + beta2 = beta**2 + + # Divide, and on zero-division, set scores and/or warn according to + # zero_division: + precision = _prf_divide( + tp_sum, pred_sum, "precision", "predicted", average, warn_for, zero_division + ) + recall = _prf_divide( + tp_sum, true_sum, "recall", "true", average, warn_for, zero_division + ) + + # warn for f-score only if zero_division is warn, it is in warn_for + # and BOTH prec and rec are ill-defined + if zero_division == "warn" and ("f-score",) == warn_for: + any_pred_sum_zero = ( + (pred_sum[true_sum == 0] == 0).any().execute(**exec_kw).fetch(**exec_kw) + ) + if any_pred_sum_zero: + _warn_prf(average, "true nor predicted", "F-score is", len(true_sum)) + + # if tp == 0 F will be 1 only if all predictions are zero, all labels are + # zero, and zero_division=1. In all other case, 0 + if np.isposinf(beta): + f_score = recall + else: + denom = beta2 * precision + recall + + denom[denom == 0.0] = 1 # avoid division by 0 + f_score = (1 + beta2) * precision * recall / denom + + # Average the results + if average == "weighted": + weights = true_sum + sum_weights, sum_pred_sum = fetch( + execute(weights.sum(), pred_sum.sum(), **exec_kw), **exec_kw + ) + if sum_weights == 0: + zero_division_value = 0.0 if zero_division in ["warn", 0] else 1.0 + # precision is zero_division if there are no positive predictions + # recall is zero_division if there are no positive labels + # fscore is zero_division if all labels AND predictions are + # negative + return ( + mt.scalar(zero_division_value if sum_pred_sum == 0 else 0), + mt.scalar(zero_division_value), + mt.scalar(zero_division_value if sum_pred_sum == 0 else 0), + None, + ) + + elif average == "samples": + weights = sample_weight + else: + weights = None + + if average is not None: + assert average != "binary" or len(precision) == 1 + precision = mt.average(precision, weights=weights) + recall = mt.average(recall, weights=weights) + f_score = mt.average(f_score, weights=weights) + true_sum = None # return no support + + return precision, recall, f_score, true_sum + + +def precision_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn" +): + """Compute the precision + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The best value is 1 and the worst value is 0. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ + 'weighted'] + This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division. If set to + "warn", this acts as 0, but warnings are also raised. + + Returns + ------- + precision : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + Precision of the positive class in binary classification or weighted + average of the precision of each class for the multiclass task. + + See also + -------- + precision_recall_fscore_support, multilabel_confusion_matrix + + Examples + -------- + >>> from mars.learn.metrics import precision_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> precision_score(y_true, y_pred, average='macro') + 0.22... + >>> precision_score(y_true, y_pred, average='micro') + 0.33... + >>> precision_score(y_true, y_pred, average='weighted') + 0.22... + >>> precision_score(y_true, y_pred, average=None) + array([0.66..., 0. , 0. ]) + >>> y_pred = [0, 0, 0, 0, 0, 0] + >>> precision_score(y_true, y_pred, average=None) + array([0.33..., 0. , 0. ]) + >>> precision_score(y_true, y_pred, average=None, zero_division=1) + array([0.33..., 1. , 1. ]) + + Notes + ----- + When ``true positive + false positive == 0``, precision returns 0 and + raises ``UndefinedMetricWarning``. 
This behavior can be + modified with ``zero_division``. + + """ + p, _, _, _ = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("precision",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return p + + +def recall_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn" +): + """Compute the recall + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The best value is 1 and the worst value is 0. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ + 'weighted'] + This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division. If set to + "warn", this acts as 0, but warnings are also raised. + + Returns + ------- + recall : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + Recall of the positive class in binary classification or weighted + average of the recall of each class for the multiclass task. 
+ + See also + -------- + precision_recall_fscore_support, balanced_accuracy_score, + multilabel_confusion_matrix + + Examples + -------- + >>> from mars.learn.metrics import recall_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> recall_score(y_true, y_pred, average='macro') + 0.33... + >>> recall_score(y_true, y_pred, average='micro') + 0.33... + >>> recall_score(y_true, y_pred, average='weighted') + 0.33... + >>> recall_score(y_true, y_pred, average=None) + array([1., 0., 0.]) + >>> y_true = [0, 0, 0, 0, 0, 0] + >>> recall_score(y_true, y_pred, average=None) + array([0.5, 0. , 0. ]) + >>> recall_score(y_true, y_pred, average=None, zero_division=1) + array([0.5, 1. , 1. ]) + + Notes + ----- + When ``true positive + false negative == 0``, recall returns 0 and raises + ``UndefinedMetricWarning``. This behavior can be modified with + ``zero_division``. + """ + _, r, _, _ = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("recall",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return r + + +def f1_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn" +): + """Compute the F1 score, also known as balanced F-score or F-measure + + The F1 score can be interpreted as a weighted average of the precision and + recall, where an F1 score reaches its best value at 1 and worst score at 0. + The relative contribution of precision and recall to the F1 score are + equal. The formula for the F1 score is:: + + F1 = 2 * (precision * recall) / (precision + recall) + + In the multi-class and multi-label case, this is the average of + the F1 score of each class with weighting depending on the ``average`` + parameter. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ + 'weighted'] + This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. 
This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division, i.e. when all + predictions and labels are negative. If set to "warn", this acts as 0, + but warnings are also raised. + + Returns + ------- + f1_score : float or array of float, shape = [n_unique_labels] + F1 score of the positive class in binary classification or weighted + average of the F1 scores of each class for the multiclass task. + + See also + -------- + fbeta_score, precision_recall_fscore_support, jaccard_score, + multilabel_confusion_matrix + + References + ---------- + .. [1] `Wikipedia entry for the F1-score + `_ + + Examples + -------- + >>> from mars.learn.metrics import f1_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> f1_score(y_true, y_pred, average='macro') + 0.26... + >>> f1_score(y_true, y_pred, average='micro') + 0.33... + >>> f1_score(y_true, y_pred, average='weighted') + 0.26... + >>> f1_score(y_true, y_pred, average=None) + array([0.8, 0. , 0. ]) + >>> y_true = [0, 0, 0, 0, 0, 0] + >>> y_pred = [0, 0, 0, 0, 0, 0] + >>> f1_score(y_true, y_pred, zero_division=1) + 1.0... + + Notes + ----- + When ``true positive + false positive == 0``, precision is undefined; + When ``true positive + false negative == 0``, recall is undefined. + In such cases, by default the metric will be set to 0, as will f-score, + and ``UndefinedMetricWarning`` will be raised. This behavior can be + modified with ``zero_division``. + """ + return fbeta_score( + y_true, + y_pred, + beta=1, + labels=labels, + pos_label=pos_label, + average=average, + sample_weight=sample_weight, + zero_division=zero_division, + ) + + +def fbeta_score( + y_true, + y_pred, + *, + beta, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn" +): + """Compute the F-beta score + + The F-beta score is the weighted harmonic mean of precision and recall, + reaching its optimal value at 1 and its worst value at 0. + + The `beta` parameter determines the weight of recall in the combined + score. ``beta < 1`` lends more weight to precision, while ``beta > 1`` + favors recall (``beta -> 0`` considers only precision, ``beta -> +inf`` + only recall). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + beta : float + Determines the weight of recall in the combined score. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. 
For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ + 'weighted'] + This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division, i.e. when all + predictions and labels are negative. If set to "warn", this acts as 0, + but warnings are also raised. + + Returns + ------- + fbeta_score : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + F-beta score of the positive class in binary classification or weighted + average of the F-beta score of each class for the multiclass task. + + See also + -------- + precision_recall_fscore_support, multilabel_confusion_matrix + + References + ---------- + .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011). + Modern Information Retrieval. Addison Wesley, pp. 327-328. + + .. [2] `Wikipedia entry for the F1-score + `_ + + Examples + -------- + >>> from mars.learn.metrics import fbeta_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) + 0.23... + >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) + 0.33... + >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) + 0.23... + >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) + array([0.71..., 0. , 0. ]) + + Notes + ----- + When ``true positive + false positive == 0`` or + ``true positive + false negative == 0``, f-score returns 0 and raises + ``UndefinedMetricWarning``. This behavior can be + modified with ``zero_division``. 
+ """ + + _, _, f, _ = precision_recall_fscore_support( + y_true, + y_pred, + beta=beta, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("f-score",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return f diff --git a/python/xorbits/_mars/learn/metrics/_ranking.py b/python/xorbits/_mars/learn/metrics/_ranking.py new file mode 100644 index 000000000..2a36233ed --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_ranking.py @@ -0,0 +1,791 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from functools import partial + +import numpy as np + +from ... import execute as _execute +from ... import fetch as _fetch +from ... import tensor as mt +from ...utils import cache_tileables +from ..preprocessing import label_binarize +from ..utils._encode import _encode, _unique +from ..utils.checks import assert_all_finite +from ..utils.core import sort_by +from ..utils.multiclass import type_of_target +from ..utils.validation import check_array, check_consistent_length, column_or_1d +from ._base import _average_binary_score, _average_multiclass_ovo_score + + +def auc(x, y, session=None, run_kwargs=None): + """Compute Area Under the Curve (AUC) using the trapezoidal rule + + This is a general function, given points on a curve. For computing the + area under the ROC-curve, see :func:`roc_auc_score`. For an alternative + way to summarize a precision-recall curve, see + :func:`average_precision_score`. + + Parameters + ---------- + x : tensor, shape = [n] + x coordinates. These must be either monotonic increasing or monotonic + decreasing. + y : tensor, shape = [n] + y coordinates. 
+ + Returns + ------- + auc : tensor, with float value + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn import metrics + >>> y = mt.array([1, 1, 2, 2]) + >>> pred = mt.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2) + >>> metrics.auc(fpr, tpr) + 0.75 + + See also + -------- + roc_auc_score : Compute the area under the ROC curve + average_precision_score : Compute average precision from prediction scores + precision_recall_curve : + Compute precision-recall pairs for different probability thresholds + """ + check_consistent_length(x, y) + x = column_or_1d(x) + y = column_or_1d(y) + + if x.shape[0] < 2: + raise ValueError( + "At least 2 points are needed to compute" + f" area under curve, but x.shape = {x.shape}" + ) + + direction = 1 + dx = mt.diff(x) + any_dx_lt_0 = mt.any(dx < 0) + all_dx_le_0 = mt.all(dx <= 0) + mt.ExecutableTuple([x, any_dx_lt_0, all_dx_le_0]).execute( + session=session, **(run_kwargs or dict()) + ) + if any_dx_lt_0.fetch(session=session): + if all_dx_le_0.fetch(session=session): + direction = -1 + else: + x_data = x.fetch(session=session) + raise ValueError(f"x is neither increasing nor decreasing : {x_data}.") + + area = direction * mt.trapz(y, x) + return area.execute(session=session, **(run_kwargs or dict())) + + +def _binary_clf_curve( + y_true, y_score, pos_label=None, sample_weight=None, session=None, run_kwargs=None +): + """Calculate true and false positives per binary classification threshold. + + Parameters + ---------- + y_true : tensor, shape = [n_samples] + True targets of binary classification + + y_score : tensor, shape = [n_samples] + Estimated probabilities or decision function + + pos_label : int or str, default=None + The label of the positive class + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + fps : tensor, shape = [n_thresholds] + A count of false positives, at index i being the number of negative + samples assigned a score >= thresholds[i]. The total number of + negative samples is equal to fps[-1] (thus true negatives are given by + fps[-1] - fps). + + tps : tensor, shape = [n_thresholds <= len(mt.unique(y_score))] + An increasing count of true positives, at index i being the number + of positive samples assigned a score >= thresholds[i]. The total + number of positive samples is equal to tps[-1] (thus false negatives + are given by tps[-1] - tps). + + thresholds : tensor, shape = [n_thresholds] + Decreasing score values. + """ + y_type = type_of_target(y_true).to_numpy(session=session, **(run_kwargs or dict())) + if not (y_type == "binary" or (y_type == "multiclass" and pos_label is not None)): + raise ValueError(f"{y_type} format is not supported") + + check_consistent_length( + y_true, y_score, sample_weight, session=session, **(run_kwargs or dict()) + ) + y_true = column_or_1d(y_true) + y_score = column_or_1d(y_score) + y_true = assert_all_finite(y_true, check_only=False) + y_score = assert_all_finite(y_score, check_only=False) + + cache_tileables(y_true, y_score) + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + + # ensure binary classification if pos_label is not specified + # classes.dtype.kind in ('O', 'U', 'S') is required to avoid + # triggering a FutureWarning by calling np.array_equal(a, b) + # when elements in the two arrays are not comparable. 
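    # Worked example of the outputs (illustrative values, not a doctest):
    # with y_true = [1, 1, 0, 1], y_score = [0.9, 0.8, 0.7, 0.6] and
    # pos_label=1, sorting by decreasing score gives
    #   thresholds = [0.9, 0.8, 0.7, 0.6]
    #   tps        = [1,   2,   2,   3  ]
    #   fps        = [0,   0,   1,   1  ]
    # i.e. at threshold 0.7 two positives and one negative score >= 0.7.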
+ classes = mt.unique(y_true, aggregate_size=1).to_numpy( + session=session, **(run_kwargs or dict()) + ) + if pos_label is None and ( + classes.dtype.kind in ("O", "U", "S") + or not ( + np.array_equal(classes, [0, 1]) + or np.array_equal(classes, [-1, 1]) + or np.array_equal(classes, [0]) + or np.array_equal(classes, [-1]) + or np.array_equal(classes, [1]) + ) + ): + classes_repr = ", ".join(repr(c) for c in classes) + raise ValueError( + f"y_true takes value in {{{classes_repr}}} and " + "pos_label is not specified: either make y_true " + "take value in {{0, 1}} or {{-1, 1}} or " + "pass pos_label explicitly." + ) + elif pos_label is None: + pos_label = 1.0 + + # make y_true a boolean vector + y_true = y_true == pos_label + + # sort scores and corresponding truth values + # original implementation adopted from sklearn: + # """ + # desc_score_indices = mt.argsort(y_score, kind="mergesort")[::-1] + # y_score = y_score[desc_score_indices] + # y_true = y_true[desc_score_indices] + # if sample_weight is not None: + # weight = sample_weight[desc_score_indices] + # else: + # weight = 1.0 + # """ + # since fancy indexing is a heavy operation, we try to use DataFrame to sort + to_sort = [y_score, y_true] + if sample_weight is not None: + to_sort.append(sample_weight) + to_sort = sort_by(to_sort, y_score, ascending=False) + y_score, y_true = to_sort[:2] + if sample_weight is not None: + weight = to_sort[-1] + else: + weight = 1.0 + + # y_score typically has many tied values. Here we extract + # the indices associated with the distinct values. We also + # concatenate a value for the end of the curve. + distinct_value_indices = mt.where(mt.diff(y_score))[0] + threshold_idxs = mt.r_[distinct_value_indices, y_true.size - 1] + + # accumulate the true positives with decreasing threshold + # raw tps from sklearn implementation + # we try to perform only one fancy index + # tps = (y_true * weight).cumsum()[threshold_idxs] + temp_tps = (y_true * weight).cumsum() + if sample_weight is not None: + # express fps as a cumsum to ensure fps is increasing even in + # the presence of floating point errors + # fps = ((1 - y_true) * weight).cumsum()[threshold_idxs] + temp_fps = ((1 - y_true) * weight).cumsum() + tps, fps, thresholds = mt.stack([temp_tps, temp_fps, y_score])[ + :, threshold_idxs + ] + + else: + tps, thresholds = mt.stack([temp_tps, y_score])[:, threshold_idxs] + fps = 1 + threshold_idxs - tps + return _execute([fps, tps, thresholds], session=session, **(run_kwargs or dict())) + + +def _binary_roc_auc_score( + y_true, y_score, sample_weight=None, max_fpr=None, session=None, run_kwargs=None +): + """Binary roc auc score.""" + + from numpy import interp + + if len(mt.unique(y_true).execute()) != 2: + raise ValueError( + "Only one class present in y_true. ROC AUC score " + "is not defined in that case." 
+ ) + + fpr, tpr, _ = roc_curve( + y_true, + y_score, + sample_weight=sample_weight, + session=session, + run_kwargs=run_kwargs, + ) + fpr, tpr = mt.ExecutableTuple([fpr, tpr]).fetch(session=session) + + if max_fpr is None or max_fpr == 1: + return auc(fpr, tpr, session=session, run_kwargs=run_kwargs).fetch( + session=session + ) + if max_fpr <= 0 or max_fpr > 1: + raise ValueError(f"Expected max_fpr in range (0, 1], got: {max_fpr}") + + # Add a single point at max_fpr by linear interpolation + stop = ( + mt.searchsorted(fpr, max_fpr, "right") + .execute(session=session, **(run_kwargs or dict())) + .fetch(session=session) + ) + x_interp = [fpr[stop - 1], fpr[stop]] + y_interp = [tpr[stop - 1], tpr[stop]] + tpr = list(tpr[:stop]) + tpr.append(interp(max_fpr, x_interp, y_interp)) + fpr = list(fpr[:stop]) + fpr.append(max_fpr) + partial_auc = auc(fpr, tpr, session=session, run_kwargs=run_kwargs) + + # McClish correction: standardize result to be 0.5 if non-discriminant + # and 1 if maximal + min_area = 0.5 * max_fpr**2 + max_area = max_fpr + return 0.5 * ( + 1 + (partial_auc.fetch(session=session) - min_area) / (max_area - min_area) + ) + + +def roc_auc_score( + y_true, + y_score, + *, + average="macro", + sample_weight=None, + max_fpr=None, + multi_class="raise", + labels=None, + session=None, + run_kwargs=None, +): + """ + Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) + from prediction scores. + + Note: this implementation can be used with binary, multiclass and + multilabel classification, but some restrictions apply (see Parameters). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_classes) + True labels or binary label indicators. The binary and multiclass cases + expect labels with shape (n_samples,) while the multilabel case expects + binary label indicators with shape (n_samples, n_classes). + + y_score : array-like of shape (n_samples,) or (n_samples, n_classes) + Target scores. + + * In the binary case, it corresponds to an array of shape + `(n_samples,)`. Both probability estimates and non-thresholded + decision values can be provided. The probability estimates correspond + to the **probability of the class with the greater label**, + i.e. `estimator.classes_[1]` and thus + `estimator.predict_proba(X, y)[:, 1]`. The decision values + corresponds to the output of `estimator.decision_function(X, y)`. + See more information in the :ref:`User guide `; + * In the multiclass case, it corresponds to an array of shape + `(n_samples, n_classes)` of probability estimates provided by the + `predict_proba` method. The probability estimates **must** + sum to 1 across the possible classes. In addition, the order of the + class scores must correspond to the order of ``labels``, + if provided, or else to the numerical or lexicographical order of + the labels in ``y_true``. See more information in the + :ref:`User guide `; + * In the multilabel case, it corresponds to an array of shape + `(n_samples, n_classes)`. Probability estimates are provided by the + `predict_proba` method and the non-thresholded decision values by + the `decision_function` method. The probability estimates correspond + to the **probability of the class with the greater label for each + output** of the classifier. See more information in the + :ref:`User guide `. + + average : {'micro', 'macro', 'samples', 'weighted'} or None, \ + default='macro' + If ``None``, the scores for each class are returned. 
Otherwise, + this determines the type of averaging performed on the data: + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages. + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + max_fpr : float > 0 and <= 1, default=None + If not ``None``, the standardized partial AUC [2]_ over the range + [0, max_fpr] is returned. For the multiclass case, ``max_fpr``, + should be either equal to ``None`` or ``1.0`` as AUC ROC partial + computation currently is not supported for multiclass. + + multi_class : {'raise', 'ovr', 'ovo'}, default='raise' + Only used for multiclass targets. Determines the type of configuration + to use. The default value raises an error, so either + ``'ovr'`` or ``'ovo'`` must be passed explicitly. + + ``'ovr'``: + Stands for One-vs-rest. Computes the AUC of each class + against the rest [3]_ [4]_. This + treats the multiclass case in the same way as the multilabel case. + Sensitive to class imbalance even when ``average == 'macro'``, + because class imbalance affects the composition of each of the + 'rest' groupings. + ``'ovo'``: + Stands for One-vs-one. Computes the average AUC of all + possible pairwise combinations of classes [5]_. + Insensitive to class imbalance when + ``average == 'macro'``. + + labels : array-like of shape (n_classes,), default=None + Only used for multiclass targets. List of labels that index the + classes in ``y_score``. If ``None``, the numerical or lexicographical + order of the labels in ``y_true`` is used. + + Returns + ------- + auc : float + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] `Analyzing a portion of the ROC curve. McClish, 1989 + `_ + + .. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving + probability estimation trees (Section 6.2), CeDER Working Paper + #IS-00-04, Stern School of Business, New York University. + + .. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern + Recognition Letters, 27(8), 861-874. + `_ + + .. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area + Under the ROC Curve for Multiple Class Classification Problems. + Machine Learning, 45(2), 171-186. + `_ + + See Also + -------- + average_precision_score : Area under the precision-recall curve. + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic + (ROC) curve given an estimator and some data. + RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic + (ROC) curve given the true and predicted values. 
+ + Examples + -------- + Binary case: + + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from mars.learn.metrics import roc_auc_score + >>> X, y = load_breast_cancer(return_X_y=True) + >>> clf = LogisticRegression(solver="liblinear", random_state=0).fit(X, y) + >>> roc_auc_score(y, clf.predict_proba(X)[:, 1]) + 0.99... + >>> roc_auc_score(y, clf.decision_function(X)) + 0.99... + + Multiclass case: + + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> clf = LogisticRegression(solver="liblinear").fit(X, y) + >>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr') + 0.99... + + Multilabel case: + + >>> import numpy as np + >>> from sklearn.datasets import make_multilabel_classification + >>> from sklearn.multioutput import MultiOutputClassifier + >>> X, y = make_multilabel_classification(random_state=0) + >>> clf = MultiOutputClassifier(clf).fit(X, y) + >>> # get a list of n_output containing probability arrays of shape + >>> # (n_samples, n_classes) + >>> y_pred = clf.predict_proba(X) + >>> # extract the positive columns for each output + >>> y_pred = np.transpose([pred[:, 1] for pred in y_pred]) + >>> roc_auc_score(y, y_pred, average=None) + array([0.82..., 0.86..., 0.94..., 0.85... , 0.94...]) + >>> from sklearn.linear_model import RidgeClassifierCV + >>> clf = RidgeClassifierCV().fit(X, y) + >>> roc_auc_score(y, clf.decision_function(X), average=None) + array([0.81..., 0.84... , 0.93..., 0.87..., 0.94...]) + """ + + cache_tileables(y_true, y_score) + + y_type = type_of_target(y_true) + y_true = check_array(y_true, ensure_2d=False, dtype=None) + y_score = check_array(y_score, ensure_2d=False) + _execute([y_type, y_true, y_score], session=session, **(run_kwargs or dict())) + y_type = y_type.fetch(session=session) + + def execute(*args): + result = [None] * len(args) + to_execute = dict() + for i, arg in enumerate(args): + if hasattr(arg, "op"): + to_execute[i] = arg + else: + result[i] = arg + if to_execute: + _execute(*to_execute.values(), session=session, **(run_kwargs or dict())) + for i, e in to_execute.items(): + if e.isscalar(): + e = e.fetch(session=session) + result[i] = e + return result[0] if len(result) == 1 else result + + if y_type == "multiclass" or ( + y_type == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2 + ): + # do not support partial ROC computation for multiclass + if max_fpr is not None and max_fpr != 1.0: + raise ValueError( + "Partial AUC computation not available in " + "multiclass setting, 'max_fpr' must be" + " set to `None`, received `max_fpr={0}` " + "instead".format(max_fpr) + ) + if multi_class == "raise": + raise ValueError("multi_class must be in ('ovo', 'ovr')") + return execute( + _multiclass_roc_auc_score( + y_true, y_score, labels, multi_class, average, sample_weight + ) + ) + elif y_type == "binary": + labels = mt.unique(y_true).execute(session=session, **(run_kwargs or dict())) + y_true = label_binarize(y_true, classes=labels, execute=False)[:, 0] + cache_tileables(y_true) + return execute( + _average_binary_score( + partial(_binary_roc_auc_score, max_fpr=max_fpr), + y_true, + y_score, + average, + sample_weight=sample_weight, + ) + ) + else: # multilabel-indicator + return execute( + _average_binary_score( + partial(_binary_roc_auc_score, max_fpr=max_fpr), + y_true, + y_score, + average, + sample_weight=sample_weight, + ) + ) + + +def _multiclass_roc_auc_score( + y_true, + y_score, + labels, + multi_class, + average, + 
sample_weight, + session=None, + run_kwargs=None, +): + # validation of the input y_score + if not mt.allclose(1, y_score.sum(axis=1)).to_numpy( + session=session, **(run_kwargs or dict()) + ): # pragma: no cover + raise ValueError( + "Target scores need to be probabilities for multiclass " + "roc_auc, i.e. they should sum up to 1.0 over classes" + ) + + # validation for multiclass parameter specifications + average_options = ("macro", "weighted") + if average not in average_options: + raise ValueError( + "average must be one of {0} for multiclass problems".format(average_options) + ) + + multiclass_options = ("ovo", "ovr") + if multi_class not in multiclass_options: + raise ValueError( + "multi_class='{0}' is not supported " + "for multiclass ROC AUC, multi_class must be " + "in {1}".format(multi_class, multiclass_options) + ) + + if labels is not None: + labels = column_or_1d(labels).to_numpy( + session=session, **(run_kwargs or dict()) + ) + classes = _unique(labels).to_numpy(session=session, **(run_kwargs or dict())) + if len(classes) != len(labels): + raise ValueError("Parameter 'labels' must be unique") + if not np.array_equal(classes, labels): + raise ValueError("Parameter 'labels' must be ordered") + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of given labels, {0}, not equal to the number " + "of columns in 'y_score', {1}".format(len(classes), y_score.shape[1]) + ) + if len( + mt.setdiff1d(y_true, classes).execute( + session=session, **(run_kwargs or dict()) + ) + ): + raise ValueError("'y_true' contains labels not in parameter 'labels'") + else: + classes = _unique(y_true).execute(session=session, **(run_kwargs or dict())) + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of classes in y_true not equal to the number of " + "columns in 'y_score'" + ) + + if multi_class == "ovo": + if sample_weight is not None: + raise ValueError( + "sample_weight is not supported " + "for multiclass one-vs-one ROC AUC, " + "'sample_weight' must be None in this case." + ) + y_true_encoded = _encode(y_true, uniques=classes) + # Hand & Till (2001) implementation (ovo) + return _average_multiclass_ovo_score( + _binary_roc_auc_score, + y_true_encoded, + y_score, + average=average, + session=session, + run_kwargs=run_kwargs, + ) + else: + # ovr is same as multi-label + y_true_multilabel = label_binarize(y_true, classes=classes, execute=False) + return _average_binary_score( + _binary_roc_auc_score, + y_true_multilabel, + y_score, + average, + sample_weight=sample_weight, + session=session, + run_kwargs=run_kwargs, + ) + + +def roc_curve( + y_true, + y_score, + pos_label=None, + sample_weight=None, + drop_intermediate=True, + session=None, + run_kwargs=None, +): + """Compute Receiver operating characteristic (ROC) + + Note: this implementation is restricted to the binary classification task. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + y_true : tensor, shape = [n_samples] + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + y_score : tensor, shape = [n_samples] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + pos_label : int or str, default=None + The label of the positive class. + When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. 
+ + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : boolean, optional (default=True) + Whether to drop some suboptimal thresholds which would not appear + on a plotted ROC curve. This is useful in order to create lighter + ROC curves. + + .. versionadded:: 0.17 + parameter *drop_intermediate*. + + Returns + ------- + fpr : tensor, shape = [>2] + Increasing false positive rates such that element i is the false + positive rate of predictions with score >= thresholds[i]. + + tpr : tensor, shape = [>2] + Increasing true positive rates such that element i is the true + positive rate of predictions with score >= thresholds[i]. + + thresholds : tensor, shape = [n_thresholds] + Decreasing thresholds on the decision function used to compute + fpr and tpr. `thresholds[0]` represents no instances being predicted + and is arbitrarily set to `max(y_score) + 1`. + + See also + -------- + roc_auc_score : Compute the area under the ROC curve + + Notes + ----- + Since the thresholds are sorted from low to high values, they + are reversed upon returning them to ensure they correspond to both ``fpr`` + and ``tpr``, which are sorted in reversed order during their calculation. + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition + Letters, 2006, 27(8):861-874. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn import metrics + >>> y = mt.array([1, 1, 2, 2]) + >>> scores = mt.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2) + >>> fpr + array([0. , 0. , 0.5, 0.5, 1. ]) + >>> tpr + array([0. , 0.5, 0.5, 1. , 1. ]) + >>> thresholds + array([1.8 , 0.8 , 0.4 , 0.35, 0.1 ]) + + """ + from sklearn.exceptions import UndefinedMetricWarning + + cache_tileables(y_true, y_score) + + fps, tps, thresholds = _binary_clf_curve( + y_true, + y_score, + pos_label=pos_label, + sample_weight=sample_weight, + session=session, + run_kwargs=run_kwargs, + ) + + # Attempt to drop thresholds corresponding to points in between and + # collinear with other points. These are always suboptimal and do not + # appear on a plotted ROC curve (and thus do not affect the AUC). + # Here mt.diff(_, 2) is used as a "second derivative" to tell if there + # is a corner at the point. Both fps and tps must be tested to handle + # thresholds with multiple data points (which are combined in + # _binary_clf_curve). This keeps all cases where the point should be kept, + # but does not drop more complicated cases like fps = [1, 3, 7], + # tps = [1, 2, 4]; there is no harm in keeping too many thresholds. 
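    # Worked example of the corner test (illustrative values): for
    #   fps = [0, 1, 2, 3], tps = [1, 2, 3, 4]
    # mt.diff(fps, 2) and mt.diff(tps, 2) are both [0, 0], so the two
    # interior points are collinear and only indices [0, 3] are kept;
    # a point is retained whenever either second difference is non-zero.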
+ if drop_intermediate and len(fps) > 2: + optimal_idxs = mt.where( + mt.r_[True, mt.logical_or(mt.diff(fps, 2), mt.diff(tps, 2)), True] + )[0] + # original implementation of sklearn: + # """ + # fps = fps[optimal_idxs] + # tps = tps[optimal_idxs] + # thresholds = thresholds[optimal_idxs] + # """ + # however, it's really a heavy operation to perform fancy index, + # thus we put them together + stacked = mt.stack([fps, tps, thresholds]) + fps, tps, thresholds = stacked[:, optimal_idxs] + + # Add an extra threshold position + # to make sure that the curve starts at (0, 0) + tps = mt.r_[0, tps] + fps = mt.r_[0, fps] + thresholds = mt.r_[thresholds[0] + 1, thresholds] + + last_fps = fps[-1] + last_tps = tps[-1] + _execute( + [tps, fps, last_fps, last_tps, thresholds], + session=session, + **(run_kwargs or dict()), + ) + last_fps, last_tps = _fetch([last_fps, last_tps], session=session) + + if last_fps <= 0: + warnings.warn( + "No negative samples in y_true, " + "false positive value should be meaningless", + UndefinedMetricWarning, + ) + fpr = mt.repeat(mt.nan, fps.shape) + else: + fpr = fps / last_fps + + if last_tps <= 0: + warnings.warn( + "No positive samples in y_true, " + "true positive value should be meaningless", + UndefinedMetricWarning, + ) + tpr = mt.repeat(mt.nan, tps.shape) + else: + tpr = tps / last_tps + + ret = mt.ExecutableTuple([fpr, tpr, thresholds]).execute( + session=session, **(run_kwargs or dict()) + ) + return ret diff --git a/python/xorbits/_mars/learn/metrics/_regresssion.py b/python/xorbits/_mars/learn/metrics/_regresssion.py new file mode 100644 index 000000000..bdbd0f7c1 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_regresssion.py @@ -0,0 +1,248 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import numpy as np +from sklearn.exceptions import UndefinedMetricWarning + +from ... import execute +from ... import tensor as mt +from ..utils.validation import ( + _num_samples, + check_array, + check_consistent_length, + column_or_1d, +) + + +def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"): + """Check that y_true and y_pred belong to the same regression task. + + Parameters + ---------- + y_true : array-like + + y_pred : array-like + + multioutput : array-like or string in ['raw_values', uniform_average', + 'variance_weighted'] or None + None is accepted due to backward compatibility of r2_score(). + + Returns + ------- + type_true : one of {'continuous', continuous-multioutput'} + The type of the true target data, as output by + 'utils.multiclass.type_of_target'. + + y_true : array-like of shape (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples, n_outputs) + Estimated target values. 
+ + multioutput : array-like of shape (n_outputs) or string in ['raw_values', + uniform_average', 'variance_weighted'] or None + Custom output weights if ``multioutput`` is array-like or + just the corresponding argument if ``multioutput`` is a + correct keyword. + + dtype : str or list, default="numeric" + the dtype argument passed to check_array. + """ + check_consistent_length(y_true, y_pred) + y_true = check_array(y_true, ensure_2d=False, dtype=dtype) + y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype) + + if y_true.ndim == 1: + y_true = y_true.reshape((-1, 1)) + + if y_pred.ndim == 1: + y_pred = y_pred.reshape((-1, 1)) + + if y_true.shape[1] != y_pred.shape[1]: + raise ValueError( + "y_true and y_pred have different number of output " + "({0}!={1})".format(y_true.shape[1], y_pred.shape[1]) + ) + + n_outputs = y_true.shape[1] + allowed_multioutput_str = ("raw_values", "uniform_average", "variance_weighted") + if isinstance(multioutput, str): + if multioutput not in allowed_multioutput_str: + raise ValueError( + "Allowed 'multioutput' string values are {}. " + "You provided multioutput={!r}".format( + allowed_multioutput_str, multioutput + ) + ) + elif multioutput is not None: + multioutput = check_array(multioutput, ensure_2d=False) + if n_outputs == 1: + raise ValueError("Custom weights are useful only in multi-output cases.") + elif n_outputs != len(multioutput): + raise ValueError( + ("There must be equally many custom weights (%d) as outputs (%d).") + % (len(multioutput), n_outputs) + ) + y_type = "continuous" if n_outputs == 1 else "continuous-multioutput" + + return y_type, y_true, y_pred, multioutput + + +def r2_score( + y_true, + y_pred, + *, + sample_weight=None, + multioutput="uniform_average", + session=None, + run_kwargs=None +): + """:math:`R^2` (coefficient of determination) regression score function. + + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). A constant model that always + predicts the expected value of y, disregarding the input features, + would get a :math:`R^2` score of 0.0. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average', 'variance_weighted'}, \ + array-like of shape (n_outputs,) or None, default='uniform_average' + + Defines aggregating of multiple output scores. + Array-like value defines weights used to average scores. + Default is "uniform_average". + + 'raw_values' : + Returns a full set of scores in case of multioutput input. + + 'uniform_average' : + Scores of all outputs are averaged with uniform weight. + + 'variance_weighted' : + Scores of all outputs are averaged, weighted by the variances + of each individual output. + + Returns + ------- + z : float or tensor of floats + The :math:`R^2` score or ndarray of scores if 'multioutput' is + 'raw_values'. + + Notes + ----- + This is not a symmetric function. + + Unlike most other scores, :math:`R^2` score may be negative (it need not + actually be the square of a quantity R). + + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + + References + ---------- + .. 
[1] `Wikipedia entry on the Coefficient of determination + `_ + + Examples + -------- + >>> from mars.learn.metrics import r2_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> r2_score(y_true, y_pred) + 0.948... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> r2_score(y_true, y_pred, + ... multioutput='variance_weighted') + 0.938... + >>> y_true = [1, 2, 3] + >>> y_pred = [1, 2, 3] + >>> r2_score(y_true, y_pred) + 1.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [2, 2, 2] + >>> r2_score(y_true, y_pred) + 0.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [3, 2, 1] + >>> r2_score(y_true, y_pred) + -3.0 + """ + _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput) + check_consistent_length(y_true, y_pred, sample_weight) + + if _num_samples(y_pred) < 2: + msg = "R^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + weight = sample_weight[:, np.newaxis] + else: + weight = 1.0 + + numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64) + denominator = ( + weight * (y_true - mt.average(y_true, axis=0, weights=sample_weight)) ** 2 + ).sum(axis=0, dtype=np.float64) + nonzero_denominator = denominator != 0 + nonzero_numerator = numerator != 0 + valid_score = nonzero_denominator & nonzero_numerator + output_scores = mt.ones((y_true.shape[1],)) + output_scores[valid_score] = 1 - (numerator[valid_score] / denominator[valid_score]) + # arbitrary set to zero to avoid -inf scores, having a constant + # y_true is not interesting for scoring a regression anyway + output_scores[nonzero_numerator & ~nonzero_denominator] = 0.0 + if isinstance(multioutput, str): + if multioutput == "raw_values": + # return scores individually + return output_scores + elif multioutput == "uniform_average": + # passing None as weights results is uniform mean + avg_weights = None + elif multioutput == "variance_weighted": + avg_weights = denominator + # avoid fail on constant y or one-element arrays + cond1 = mt.any(nonzero_denominator) + execute( + cond1, nonzero_denominator, session=session, **(run_kwargs or dict()) + ) + if not cond1.fetch(): + if not mt.any(nonzero_numerator).to_numpy( + session=session, **(run_kwargs or dict()) + ): + return 1.0 + else: + return 0.0 + else: + avg_weights = multioutput + + return mt.average(output_scores, weights=avg_weights).execute( + session=session, **(run_kwargs or dict()) + ) diff --git a/python/xorbits/_mars/learn/metrics/_scorer.py b/python/xorbits/_mars/learn/metrics/_scorer.py new file mode 100644 index 000000000..61e01dfa7 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_scorer.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Union + +from sklearn.metrics import make_scorer + +from . 
import accuracy_score, log_loss, r2_score + +accuracy_score = make_scorer(accuracy_score) +r2_score = make_scorer(r2_score) +neg_log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True) + + +SCORERS = dict( + r2=r2_score, + accuracy=accuracy_score, + neg_log_loss=neg_log_loss_scorer, +) + + +def get_scorer(score_func: Union[str, Callable], **kwargs) -> Callable: + """ + Get a scorer from string + + Parameters + ---------- + score_func : str | callable + scoring method as string. If callable it is returned as is. + + Returns + ------- + scorer : callable + The scorer. + """ + if isinstance(score_func, str): + try: + scorer = SCORERS[score_func] + except KeyError: + raise ValueError( + "{} is not a valid scoring value. " + "Valid options are {}".format(score_func, sorted(SCORERS)) + ) + return scorer + else: + return make_scorer(score_func, **kwargs) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/__init__.py b/python/xorbits/_mars/learn/metrics/pairwise/__init__.py new file mode 100644 index 000000000..b554de019 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .cosine import cosine_distances, cosine_similarity +from .euclidean import euclidean_distances +from .haversine import haversine_distances +from .manhattan import manhattan_distances +from .pairwise import PAIRWISE_DISTANCE_FUNCTIONS, pairwise_distances +from .pairwise_distances_topk import pairwise_distances_topk +from .rbf_kernel import rbf_kernel diff --git a/python/xorbits/_mars/learn/metrics/pairwise/core.py b/python/xorbits/_mars/learn/metrics/pairwise/core.py new file mode 100644 index 000000000..37c0a81e6 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/core.py @@ -0,0 +1,184 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ....core import recursive_tile +from ....serialization.serializables import Int64Field +from ....tensor import tensor as astensor +from ....tensor.operands import TensorOperand, TensorOperandMixin +from ....utils import has_unknown_shape +from ...utils import check_array + + +class PairwiseDistances(TensorOperand, TensorOperandMixin): + _op_module_ = "learn" + + chunk_store_limit = Int64Field("chunk_store_limit") + + @staticmethod + def _return_float_dtype(X, Y): + """ + 1. If dtype of X and Y is float32, then dtype float32 is returned. + 2. 
Else dtype float is returned. + """ + + X = astensor(X) + + if Y is None: + Y_dtype = X.dtype + else: + Y = astensor(Y) + Y_dtype = Y.dtype + + if X.dtype == Y_dtype == np.float32: + dtype = np.float32 + else: + dtype = float + + return X, Y, dtype + + @staticmethod + def check_pairwise_arrays(X, Y, precomputed=False, dtype=None): + X, Y, dtype_float = PairwiseDistances._return_float_dtype(X, Y) + + estimator = "check_pairwise_arrays" + if dtype is None: + dtype = dtype_float + + if Y is X or Y is None: + X = Y = check_array(X, accept_sparse=True, dtype=dtype, estimator=estimator) + else: + X = check_array(X, accept_sparse=True, dtype=dtype, estimator=estimator) + Y = check_array(Y, accept_sparse=True, dtype=dtype, estimator=estimator) + + if precomputed: + if X.shape[1] != Y.shape[0]: + raise ValueError( + "Precomputed metric requires shape " + f"(n_queries, n_indexed). Got ({X.shape[0]}, {X.shape[1]}) " + f"for {Y.shape[0]} indexed." + ) + elif X.shape[1] != Y.shape[1]: + raise ValueError( + "Incompatible dimension for X and Y matrices: " + f"X.shape[1] == {X.shape[1]} while Y.shape[1] == {Y.shape[1]}" + ) + + return X, Y + + @classmethod + def _tile_one_chunk(cls, op): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk = chunk_op.new_chunk( + [op.x.chunks[0], op.y.chunks[0]], + shape=out.shape, + order=out.order, + index=(0, 0), + ) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=[chunk], + nsplits=tuple((s,) for s in out.shape), + ) + + @classmethod + def _tile_chunks(cls, op, x, y): + out = op.outputs[0] + out_chunks = [] + for idx in itertools.product(range(x.chunk_shape[0]), range(y.chunk_shape[0])): + xi, yi = idx + + chunk_op = op.copy().reset_key() + chunk_inputs = [x.cix[xi, 0], y.cix[yi, 0]] + out_chunk = chunk_op.new_chunk( + chunk_inputs, + shape=( + chunk_inputs[0].shape[0], + chunk_inputs[1].shape[0], + ), + order=out.order, + index=idx, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=out_chunks, + nsplits=(x.nsplits[0], y.nsplits[0]), + ) + + @classmethod + def _rechunk_cols_into_one(cls, x, y): + y_is_x = y is x + if x.chunk_shape[1] != 1 or y.chunk_shape[1] != 1: + if has_unknown_shape([x, y]): + yield + + x = yield from recursive_tile(x.rechunk({1: x.shape[1]})) + if y_is_x: + y = x + else: + y = yield from recursive_tile(y.rechunk({1: y.shape[1]})) + + return x, y + + @classmethod + def _adjust_chunk_sizes(cls, op, X, Y, out): + max_x_chunk_size = max(X.nsplits[0]) + max_y_chunk_size = max(Y.nsplits[0]) + itemsize = out.dtype.itemsize + max_chunk_bytes = max_x_chunk_size * max_y_chunk_size * itemsize + chunk_store_limit = op.chunk_store_limit * 2 # scale 2 times + if max_chunk_bytes > chunk_store_limit: + adjust_succeeded = False + # chunk is too huge, try to rechunk X and Y + if X.shape[0] > Y.shape[0]: + # y is smaller, rechunk y is more efficient + expected_y_chunk_size = max( + int(chunk_store_limit / itemsize / max_x_chunk_size), 1 + ) + if ( + max_x_chunk_size * expected_y_chunk_size * itemsize + <= chunk_store_limit + ): + adjust_succeeded = True + Y = yield from recursive_tile(Y.rechunk({0: expected_y_chunk_size})) + else: + # x is smaller, rechunk x is more efficient + expected_x_chunk_size = max( + int(chunk_store_limit / itemsize / max_y_chunk_size), 1 + ) + if ( + max_y_chunk_size * expected_x_chunk_size * itemsize + <= chunk_store_limit + ): + adjust_succeeded = True + X = yield from 
recursive_tile(X.rechunk({0: expected_x_chunk_size})) + + if not adjust_succeeded: + expected_chunk_size = max(int(np.sqrt(chunk_store_limit / itemsize)), 1) + X = yield from recursive_tile(X.rechunk({0: expected_chunk_size})) + Y = yield from recursive_tile(Y.rechunk({0: expected_chunk_size})) + + return X, Y diff --git a/python/xorbits/_mars/learn/metrics/pairwise/cosine.py b/python/xorbits/_mars/learn/metrics/pairwise/cosine.py new file mode 100644 index 000000000..8d47beee0 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/cosine.py @@ -0,0 +1,138 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .... import opcodes as OperandDef +from .... import tensor as mt +from ....core import recursive_tile +from ....serialization.serializables import KeyField +from ....tensor.core import TensorOrder +from ...preprocessing import normalize +from .core import PairwiseDistances + + +class CosineDistances(PairwiseDistances): + _op_type_ = OperandDef.PAIRWISE_COSINE_DISTANCES + + _x = KeyField("x") + _y = KeyField("y") + + def __init__(self, x=None, y=None, **kw): + super().__init__(_x=x, _y=y, **kw) + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._x = self._inputs[0] + self._y = self._inputs[1] + + def __call__(self, x, y=None): + x, y = self.check_pairwise_arrays(x, y) + return self.new_tensor( + [x, y], shape=(x.shape[0], y.shape[0]), order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + x, y = op.x, op.y + if x is y: + S = cosine_similarity(x) + else: + S = cosine_similarity(x, y) + S = (S * -1) + 1 + S = mt.clip(S, 0, 2) + if x is y: + mt.fill_diagonal(S, 0.0) + return [(yield from recursive_tile(S))] + + +def cosine_similarity(X, Y=None, dense_output=True): + """Compute cosine similarity between samples in X and Y. + + Cosine similarity, or the cosine kernel, computes similarity as the + normalized dot product of X and Y: + + K(X, Y) = / (||X||*||Y||) + + On L2-normalized data, this function is equivalent to linear_kernel. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : Tensor or sparse tensor, shape: (n_samples_X, n_features) + Input data. + + Y : Tensor or sparse tensor, shape: (n_samples_Y, n_features) + Input data. If ``None``, the output will be the pairwise + similarities between all samples in ``X``. + + dense_output : boolean (optional), default True + Whether to return dense output even when the input is sparse. If + ``False``, the output is sparse if both input tensors are sparse. + + Returns + ------- + kernel matrix : Tensor + A tensor with shape (n_samples_X, n_samples_Y). 
+ """ + X, Y = PairwiseDistances.check_pairwise_arrays(X, Y) + + X_normalized = normalize(X, copy=True) + if X is Y: + Y_normalized = X_normalized + else: + Y_normalized = normalize(Y, copy=True) + + K = X_normalized.dot(Y_normalized.T) + if dense_output: + K = K.todense() + return K + + +def cosine_distances(X, Y=None): + """Compute cosine distance between samples in X and Y. + + Cosine distance is defined as 1.0 minus the cosine similarity. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array_like, sparse matrix + with shape (n_samples_X, n_features). + + Y : array_like, sparse matrix (optional) + with shape (n_samples_Y, n_features). + + Returns + ------- + distance matrix : Tensor + A tensor with shape (n_samples_X, n_samples_Y). + + See also + -------- + mars.learn.metrics.pairwise.cosine_similarity + mars.tensor.spatial.distance.cosine : dense matrices only + """ + op = CosineDistances(x=X, y=Y, dtype=np.dtype(np.float64)) + return op(X, y=Y) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/euclidean.py b/python/xorbits/_mars/learn/metrics/pairwise/euclidean.py new file mode 100644 index 000000000..2b265d201 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/euclidean.py @@ -0,0 +1,253 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .... import opcodes as OperandDef +from .... 
import tensor as mt +from ....config import options +from ....core import recursive_tile +from ....serialization.serializables import BoolField, KeyField +from ....tensor.core import TensorOrder +from ....utils import has_unknown_shape +from ...utils import check_array +from ...utils.extmath import row_norms +from .core import PairwiseDistances + + +class EuclideanDistances(PairwiseDistances): + _op_type_ = OperandDef.PAIRWISE_EUCLIDEAN_DISTANCES + + _x = KeyField("X") + _y = KeyField("Y") + _x_norm_squared = KeyField("X_norm_squared") + _y_norm_squared = KeyField("Y_norm_squared") + _squared = BoolField("squared") + + def __init__( + self, + x=None, + y=None, + x_norm_squared=None, + y_norm_squared=None, + squared=None, + **kw + ): + super().__init__( + _x=x, + _y=y, + _x_norm_squared=x_norm_squared, + _y_norm_squared=y_norm_squared, + _squared=squared, + **kw + ) + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + @property + def x_norm_squared(self): + return self._x_norm_squared + + @property + def y_norm_squared(self): + return self._y_norm_squared + + @property + def squared(self): + return self._squared + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(self._inputs) + self._x = next(input_iter) + if self._y is not None: + self._y = next(input_iter) + if self._x_norm_squared is not None: + self._x_norm_squared = next(input_iter) + if self._y_norm_squared is not None: + self._y_norm_squared = next(input_iter) + + def __call__(self, X, Y=None, Y_norm_squared=None, X_norm_squared=None): + # If norms are passed as float32, they are unused. If arrays are passed as + # float32, norms needs to be recomputed on upcast chunks. + # TODO: use a float64 accumulator in row_norms to avoid the latter. 
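        # Shape handling below: X_norm_squared may arrive as (1, n_samples_X)
        # or (n_samples_X, 1); the former is transposed so that XX broadcasts
        # as a column against the (n_samples_X, n_samples_Y) distance matrix,
        # while float32 norms are discarded and recomputed from upcast chunks.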
+ if X_norm_squared is not None: + XX = check_array(X_norm_squared) + if XX.shape == (1, X.shape[0]): + XX = XX.T + elif XX.shape != (X.shape[0], 1): + raise ValueError("Incompatible dimensions for X and X_norm_squared") + if XX.dtype == np.float32: + XX = self._x_norm_squared = None + else: + XX = None + + if X is Y and XX is not None: + # shortcut in the common case euclidean_distances(X, X) + YY = XX.T + elif Y_norm_squared is not None: + YY = mt.atleast_2d(Y_norm_squared) + + if YY.shape != (1, Y.shape[0]): + raise ValueError("Incompatible dimensions for Y and Y_norm_squared") + if YY.dtype == np.float32: + YY = self._y_norm_squared = None + else: + YY = None + + inputs = [X, Y] + if XX is not None: + inputs.append(XX) + if YY is not None: + inputs.append(YY) + return self.new_tensor( + inputs, shape=(X.shape[0], Y.shape[0]), order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + X, Y = op.x, op.y + out = op.outputs[0] + + if X.dtype == np.float32: + if has_unknown_shape(X, Y): + yield + # rechunk + new_nsplit = max(max(X.nsplits[0]) // 2, 1) + X = yield from recursive_tile(X.rechunk({0: new_nsplit}).astype(np.float64)) + if Y is not X: + new_nsplit = max(max(Y.nsplits[0]) // 2, 1) + Y = yield from recursive_tile( + Y.rechunk({0: new_nsplit}).astype(np.float64) + ) + + XX = op.x_norm_squared + if XX is None: + XX = row_norms(X, squared=True)[:, np.newaxis] + YY = op.y_norm_squared + if YY is None: + YY = row_norms(Y, squared=True)[np.newaxis, :] + + X, Y = yield from cls._adjust_chunk_sizes(op, X, Y, out) + + distances = -2 * X.dot(Y.T) + if distances.issparse(): + distances = distances.todense() + distances += XX + distances += YY + distances = mt.maximum(distances, 0) + + if X is Y or X.key == Y.key: + mt.fill_diagonal(distances, 0) + + distances = distances if op.squared else mt.sqrt(distances) + distances = distances.astype(out.dtype, copy=False) + return [(yield from recursive_tile(distances))] + + +def euclidean_distances( + X, Y=None, Y_norm_squared=None, squared=False, X_norm_squared=None +): + """ + Considering the rows of X (and Y=X) as vectors, compute the + distance matrix between each pair of vectors. + + For efficiency reasons, the euclidean distance between a pair of row + vector x and y is computed as:: + + dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) + + This formulation has two advantages over other ways of computing distances. + First, it is computationally efficient when dealing with sparse data. + Second, if one argument varies but the other remains unchanged, then + `dot(x, x)` and/or `dot(y, y)` can be pre-computed. + + However, this is not the most precise way of doing this computation, and + the distance matrix returned by this function may not be exactly + symmetric as required by, e.g., ``scipy.spatial.distance`` functions. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples_1, n_features) + + Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) + + Y_norm_squared : array-like, shape (n_samples_2, ), optional + Pre-computed dot-products of vectors in Y (e.g., + ``(Y**2).sum(axis=1)``) + May be ignored in some cases, see the note below. + + squared : boolean, optional + Return squared Euclidean distances. + + X_norm_squared : array-like, shape = [n_samples_1], optional + Pre-computed dot-products of vectors in X (e.g., + ``(X**2).sum(axis=1)``) + May be ignored in some cases, see the note below. 
+ + Notes + ----- + To achieve better accuracy, `X_norm_squared` and `Y_norm_squared` may be + unused if they are passed as ``float32``. + + Returns + ------- + distances : tensor, shape (n_samples_1, n_samples_2) + + Examples + -------- + >>> from mars.learn.metrics.pairwise import euclidean_distances + >>> X = [[0, 1], [1, 1]] + >>> # distance between rows of X + >>> euclidean_distances(X, X).execute() + array([[0., 1.], + [1., 0.]]) + >>> # get distance to origin + >>> euclidean_distances(X, [[0, 0]]).execute() + array([[1. ], + [1.41421356]]) + + See also + -------- + paired_distances : distances betweens pairs of elements of X and Y. + """ + if X.dtype == np.float32: + if Y is None: + dtype = X.dtype + elif Y.dtype == np.float32: + dtype = np.float32 + else: + dtype = np.float64 + else: + dtype = np.float64 + + X, Y = EuclideanDistances.check_pairwise_arrays(X, Y) + op = EuclideanDistances( + x=X, + y=Y, + x_norm_squared=X_norm_squared, + y_norm_squared=Y_norm_squared, + squared=squared, + dtype=np.dtype(dtype), + chunk_store_limit=options.chunk_store_limit, + ) + return op(X, Y=Y, Y_norm_squared=Y_norm_squared, X_norm_squared=X_norm_squared) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/haversine.py b/python/xorbits/_mars/learn/metrics/pairwise/haversine.py new file mode 100644 index 000000000..8d6d0d534 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/haversine.py @@ -0,0 +1,156 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + from sklearn.neighbors import DistanceMetric as SklearnDistanceMetric +except ImportError: # pragma: no cover + SklearnDistanceMetric = None + +from .... 
import opcodes as OperandDef +from ....core import recursive_tile +from ....serialization.serializables import BoolField, KeyField +from ....tensor.array_utils import as_same_device, device +from ....tensor.core import TensorOrder +from ....tensor.indexing import fill_diagonal +from .core import PairwiseDistances + + +class HaversineDistances(PairwiseDistances): + _op_type_ = OperandDef.PAIRWISE_HAVERSINE_DISTANCES + + _x = KeyField("x") + _y = KeyField("y") + # for test purpose + _use_sklearn = BoolField("use_sklearn") + + def __init__(self, x=None, y=None, use_sklearn=None, **kw): + super().__init__(_x=x, _y=y, _use_sklearn=use_sklearn, **kw) + if self._use_sklearn is None: + # if not set use_sklearn, will try to use sklearn by default + self._use_sklearn = True + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + @property + def use_sklearn(self): + return self._use_sklearn + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._x = self._inputs[0] + self._y = self._inputs[1] + + def __call__(self, X, Y=None): + X, Y = self.check_pairwise_arrays(X, Y) + if self._y is None: + self._y = Y + + if X.shape[1] != 2 or Y.shape[1] != 2: + raise ValueError("Haversine distance only valid in 2 dimensions") + if X.issparse() or Y.issparse(): + raise TypeError("Haversine distance requires inputs dense") + + return self.new_tensor( + [X, Y], shape=(X.shape[0], Y.shape[0]), order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + x, y = op.x, op.y + y_is_x = y is x + + if len(x.chunks) == 1 and len(y.chunks) == 1: + return cls._tile_one_chunk(op) + + x, y = yield from cls._rechunk_cols_into_one(x, y) + (ret,) = cls._tile_chunks(op, x, y) + if y_is_x: + fill_diagonal(ret, 0) + return [(yield from recursive_tile(ret))] + + @classmethod + def execute(cls, ctx, op): + (x, y), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if xp is np and op.use_sklearn and SklearnDistanceMetric is not None: + # CPU and sklearn installed, delegate computation to sklearn + d = SklearnDistanceMetric.get_metric("haversine").pairwise(x, y) + else: + # try to leverage xp(np, cp) to perform computation + sin_0 = xp.sin(0.5 * (x[:, [0]] - y[:, 0])) + sin_1 = xp.sin(0.5 * (x[:, [1]] - y[:, 1])) + d = 2 * xp.arcsin( + xp.sqrt( + sin_0 * sin_0 + + xp.cos(x[:, [0]]) * xp.cos(y[:, 0]) * sin_1 * sin_1 + ) + ) + + ctx[op.outputs[0].key] = d + + +def haversine_distances(X, Y=None): + """Compute the Haversine distance between samples in X and Y + + The Haversine (or great circle) distance is the angular distance between + two points on the surface of a sphere. The first distance of each point is + assumed to be the latitude, the second is the longitude, given in radians. + The dimension of the data must be 2. + + .. math:: + D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2) + + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}] + + Parameters + ---------- + X : array_like, shape (n_samples_1, 2) + + Y : array_like, shape (n_samples_2, 2), optional + + Returns + ------- + distance : {Tensor}, shape (n_samples_1, n_samples_2) + + Notes + ----- + As the Earth is nearly spherical, the haversine formula provides a good + approximation of the distance between two points of the Earth surface, with + a less than 1% error on average. 
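A plain-NumPy sketch of the formula used in the `execute` branch above (illustrative only; rows are `[latitude, longitude]` pairs in radians):

```python
import numpy as np

def haversine_sketch(X, Y):
    # 2 * arcsin(sqrt(hav(dlat) + cos(lat1) * cos(lat2) * hav(dlon)))
    sin_0 = np.sin(0.5 * (X[:, [0]] - Y[:, 0]))   # half latitude differences, (n_x, n_y)
    sin_1 = np.sin(0.5 * (X[:, [1]] - Y[:, 1]))   # half longitude differences
    return 2 * np.arcsin(
        np.sqrt(sin_0 * sin_0 + np.cos(X[:, [0]]) * np.cos(Y[:, 0]) * sin_1 * sin_1)
    )

rng = np.random.default_rng(0)
pts = np.c_[rng.uniform(-np.pi / 2, np.pi / 2, 4), rng.uniform(-np.pi, np.pi, 4)]
D = haversine_sketch(pts, pts)
assert np.allclose(D, D.T) and np.allclose(np.diag(D), 0)  # symmetric, zero diagonal
```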
+ + Examples + -------- + We want to calculate the distance between the Ezeiza Airport + (Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris, France) + + >>> from mars.learn.metrics.pairwise import haversine_distances + >>> bsas = [-34.83333, -58.5166646] + >>> paris = [49.0083899664, 2.53844117956] + >>> result = haversine_distances([bsas, paris]) + >>> (result * 6371000/1000).execute() # multiply by Earth radius to get kilometers + array([[ 0. , 11279.45379464], + [11279.45379464, 0. ]]) + """ + op = HaversineDistances(x=X, y=Y, dtype=np.dtype(np.float64)) + return op(X, Y=Y) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py b/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py new file mode 100644 index 000000000..26c45facb --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py @@ -0,0 +1,180 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + from sklearn.metrics.pairwise import ( + manhattan_distances as sklearn_manhattan_distances, + ) +except ImportError: # pragma: no cover + sklearn_manhattan_distances = None + +from .... import opcodes as OperandDef +from ....core import recursive_tile +from ....serialization.serializables import BoolField, KeyField +from ....tensor.arithmetic import abs as mt_abs +from ....tensor.array_utils import as_same_device, device +from ....tensor.core import TensorOrder +from ....tensor.spatial.distance import cdist +from ....utils import ensure_own_data +from .core import PairwiseDistances + + +class ManhattanDistances(PairwiseDistances): + _op_type_ = OperandDef.PAIRWISE_MANHATTAN_DISTANCES + + _x = KeyField("x") + _y = KeyField("y") + _sum_over_features = BoolField("sum_over_features") + + def __init__(self, x=None, y=None, sum_over_features=None, use_sklearn=None, **kw): + super().__init__( + _x=x, + _y=y, + _sum_over_features=sum_over_features, + _use_sklearn=use_sklearn, + **kw, + ) + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + @property + def sum_over_features(self): + return self._sum_over_features + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._x = self._inputs[0] + self._y = self._inputs[1] + + def __call__(self, X, Y=None): + X, Y = self.check_pairwise_arrays(X, Y) + if self._y is None: + self._y = Y + + if (X.issparse() or Y.issparse()) and not self._sum_over_features: + raise TypeError( + f"sum_over_features={self._sum_over_features} not supported" + " for sparse matrices" + ) + + if not self._sum_over_features: + shape = (X.shape[0] * Y.shape[0], X.shape[1]) + else: + shape = (X.shape[0], Y.shape[0]) + + return self.new_tensor([X, Y], shape=shape, order=TensorOrder.C_ORDER) + + @classmethod + def tile(cls, op): + x, y = op.x, op.y + + if len(x.chunks) == 1 and len(y.chunks) == 1: + return cls._tile_one_chunk(op) + + if x.issparse() or y.issparse(): + assert op.sum_over_features + return cls._tile_chunks(op, x, y) + elif op.sum_over_features: + # 
if x, y are not sparse and `sum_over_features` is True + # just use cdist + return [(yield from recursive_tile(cdist(x, y, "cityblock")))] + else: + d = x[:, np.newaxis, :] - y[np.newaxis, :, :] + d = mt_abs(d) + d = d.reshape((-1, x.shape[1])) + return [(yield from recursive_tile(d))] + + @classmethod + def execute(cls, ctx, op): + (x, y), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + out = op.outputs[0] + + with device(device_id): + if sklearn_manhattan_distances is not None: + ctx[out.key] = sklearn_manhattan_distances( + ensure_own_data(x), + ensure_own_data(y), + sum_over_features=op.sum_over_features, + ) + else: # pragma: no cover + # we cannot support sparse + raise NotImplementedError( + "cannot support calculate manhattan distances on GPU" + ) + + +def manhattan_distances(X, Y=None, sum_over_features=True): + """ Compute the L1 distances between the vectors in X and Y. + + With sum_over_features equal to False it returns the componentwise + distances. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array_like + A tensor with shape (n_samples_X, n_features). + + Y : array_like, optional + A tensor with shape (n_samples_Y, n_features). + + sum_over_features : bool, default=True + If True the function returns the pairwise distance matrix + else it returns the componentwise L1 pairwise-distances. + Not supported for sparse matrix inputs. + + Returns + ------- + D : Tensor + If sum_over_features is False shape is + (n_samples_X * n_samples_Y, n_features) and D contains the + componentwise L1 pairwise-distances (ie. absolute difference), + else shape is (n_samples_X, n_samples_Y) and D contains + the pairwise L1 distances. + + Examples + -------- + >>> from mars.learn.metrics.pairwise import manhattan_distances + >>> manhattan_distances([[3]], [[3]]).execute() #doctest:+ELLIPSIS + array([[0.]]) + >>> manhattan_distances([[3]], [[2]]).execute() #doctest:+ELLIPSIS + array([[1.]]) + >>> manhattan_distances([[2]], [[3]]).execute() #doctest:+ELLIPSIS + array([[1.]]) + >>> manhattan_distances([[1, 2], [3, 4]],\ + [[1, 2], [0, 3]]).execute() #doctest:+ELLIPSIS + array([[0., 2.], + [4., 4.]]) + >>> import mars.tensor as mt + >>> X = mt.ones((1, 2)) + >>> y = mt.full((2, 2), 2.) + >>> manhattan_distances(X, y, sum_over_features=False).execute() #doctest:+ELLIPSIS + array([[1., 1.], + [1., 1.]]) + """ + op = ManhattanDistances( + x=X, y=Y, sum_over_features=sum_over_features, dtype=np.dtype(np.float64) + ) + return op(X, Y=Y) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/pairwise.py b/python/xorbits/_mars/learn/metrics/pairwise/pairwise.py new file mode 100644 index 000000000..5db1b7848 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/pairwise.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
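The `sum_over_features=False` branch of `ManhattanDistances.tile` above relies on a broadcast-and-reshape trick; a small plain-NumPy sketch of the same idea (illustrative only):

```python
import numpy as np

X, Y = np.random.rand(3, 5), np.random.rand(4, 5)

# componentwise L1 distances, shape (n_x * n_y, n_features)
D = np.abs(X[:, np.newaxis, :] - Y[np.newaxis, :, :]).reshape(-1, X.shape[1])

# summing the components back recovers the usual (n_x, n_y) distance matrix
assert np.allclose(D.reshape(3, 4, 5).sum(axis=2),
                   np.abs(X[:, None, :] - Y[None, :, :]).sum(axis=2))
```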
+ +import warnings +from functools import partial + +try: + from sklearn.exceptions import DataConversionWarning +except ImportError: # pragma: no cover + DataConversionWarning = None + +from ....tensor.spatial import distance +from ...utils.validation import check_non_negative +from .core import PairwiseDistances +from .cosine import cosine_distances +from .euclidean import euclidean_distances +from .haversine import haversine_distances +from .manhattan import manhattan_distances + +_VALID_METRICS = [ + "euclidean", + "l2", + "l1", + "manhattan", + "cityblock", + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "cosine", + "dice", + "hamming", + "jaccard", + "kulsinski", + "mahalanobis", + "matching", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalmichener", + "sokalsneath", + "sqeuclidean", + "yule", + "wminkowski", + "haversine", +] + +# Helper functions - distance +PAIRWISE_DISTANCE_FUNCTIONS = { + # If updating this dictionary, update the doc in both distance_metrics() + # and also in pairwise_distances()! + "cityblock": manhattan_distances, + "cosine": cosine_distances, + "euclidean": euclidean_distances, + "haversine": haversine_distances, + "l2": euclidean_distances, + "l1": manhattan_distances, + "manhattan": manhattan_distances, + "precomputed": None, # HACK: precomputed is always allowed, never called +} + +# These distances recquire boolean tensors, when using mars.tensor.spatial.distance +PAIRWISE_BOOLEAN_FUNCTIONS = [ + "dice", + "jaccard", + "kulsinski", + "matching", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", + "yule", +] + + +def pairwise_distances(X, Y=None, metric="euclidean", **kwds): + if ( + metric not in _VALID_METRICS + and not callable(metric) + and metric != "precomputed" + ): + raise ValueError( + f"Unknown metric {metric}. Valid metrics are {_VALID_METRICS}, " + "or 'precomputed', or a callable" + ) + + if metric == "precomputed": + X, _ = PairwiseDistances.check_pairwise_arrays(X, Y, precomputed=True) + + whom = ( + "`pairwise_distances`. Precomputed distance " + " need to have non-negative values." + ) + X = check_non_negative(X, whom=whom) + return X + elif metric in PAIRWISE_DISTANCE_FUNCTIONS: + func = PAIRWISE_DISTANCE_FUNCTIONS[metric] + else: + # including when metric is callable + dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None + + if ( + dtype == bool + and (X.dtype != bool or (Y is not None and Y.dtype != bool)) + and DataConversionWarning is not None + ): + msg = f"Data was converted to boolean for metric {metric}" + warnings.warn(msg, DataConversionWarning) + + X, Y = PairwiseDistances.check_pairwise_arrays(X, Y, dtype=dtype) + if X is Y: + return distance.squareform(distance.pdist(X, metric=metric, **kwds)) + func = partial(distance.cdist, metric=metric, **kwds) + + return func(X, Y, **kwds) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/pairwise_distances_topk.py b/python/xorbits/_mars/learn/metrics/pairwise/pairwise_distances_topk.py new file mode 100644 index 000000000..8e261f7cc --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/pairwise_distances_topk.py @@ -0,0 +1,506 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial + +import numpy as np + +from .... import opcodes, options +from ....core import recursive_tile +from ....core.operand import OperandStage +from ....serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int64Field, + KeyField, +) +from ....tensor.array_utils import as_same_device, device, get_array_module +from ....tensor.core import TensorOrder +from ....tensor.merge import TensorConcatenate +from ....utils import ensure_own_data, has_unknown_shape +from ...utils import gen_batches, get_chunk_n_rows +from ...utils.validation import _num_samples +from .core import PairwiseDistances + + +def _precompute_metric_params(X, Y, xp, metric=None, **kwds): # pragma: no cover + """Precompute data-derived metric parameters if not provided""" + if metric == "seuclidean" and "V" not in kwds: + if X is Y: + V = xp.var(X, axis=0, ddof=1) + else: + V = xp.var(xp.vstack([X, Y]), axis=0, ddof=1) + return {"V": V} + if metric == "mahalanobis" and "VI" not in kwds: + if X is Y: + VI = xp.linalg.inv(xp.cov(X.T)).T + else: + VI = xp.linalg.inv(xp.cov(xp.vstack([X, Y]).T)).T + return {"VI": VI} + return {} + + +def _check_chunk_size(reduced, chunk_size): # pragma: no cover + """Checks chunk is a sequence of expected size or a tuple of same""" + if reduced is None: + return + is_tuple = isinstance(reduced, tuple) + if not is_tuple: + reduced = (reduced,) + if any(isinstance(r, tuple) or not hasattr(r, "__iter__") for r in reduced): + raise TypeError( + "reduce_func returned %r. " + "Expected sequence(s) of length %d." + % (reduced if is_tuple else reduced[0], chunk_size) + ) + if any(_num_samples(r) != chunk_size for r in reduced): + actual_size = tuple(_num_samples(r) for r in reduced) + raise ValueError( + "reduce_func returned object of length %s. " + "Expected same length as input: %d." + % (actual_size if is_tuple else actual_size[0], chunk_size) + ) + + +def _pariwise_distance_chunked( + X, Y, reduce_func=None, metric="euclidean", working_memory=None, xp=None, **kwds +): + if xp is np: + from sklearn.metrics import pairwise_distances + else: # pragma: no cover + from cuml.metrics import pairwise_distances + + n_samples_X = _num_samples(X) + if metric == "precomputed": # pragma: no cover + slices = (slice(0, n_samples_X),) + else: + # We get as many rows as possible within our working_memory budget to + # store len(Y) distances in each row of output. + # + # Note: + # - this will get at least 1 row, even if 1 row of distances will + # exceed working_memory. + # - this does not account for any temporary memory usage while + # calculating distances (e.g. difference of vectors in manhattan + # distance. 
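+        #   As an illustration (assuming sklearn's convention that working_memory
+        #   is a budget in MiB): with 10_000 samples in Y each output row costs
+        #   8 * 10_000 = 80_000 bytes, so a 64 MiB budget allows roughly
+        #   64 * 2**20 // 80_000 = 838 rows per chunk.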
+ chunk_n_rows = get_chunk_n_rows( + row_bytes=8 * _num_samples(Y), + max_n_rows=n_samples_X, + working_memory=working_memory, + ) + slices = gen_batches(n_samples_X, chunk_n_rows) + + # precompute data-derived metric params + params = _precompute_metric_params(X, Y, xp, metric=metric, **kwds) + kwds.update(**params) + + for sl in slices: + if sl.start == 0 and sl.stop == n_samples_X: + X_chunk = X # enable optimised paths for X is Y + else: + X_chunk = X[sl] + # call pairwise op's execute method to get the result + D_chunk = pairwise_distances( + ensure_own_data(X_chunk), ensure_own_data(Y), metric=metric, **kwds + ) + if (X is Y or Y is None) and metric == "euclidean": + # zeroing diagonal, taking care of aliases of "euclidean", + # i.e. "l2" + D_chunk.flat[ + sl.start :: _num_samples(X) + 1 + ] = 0 # pylint: disable=invalid-slice-index + if reduce_func is not None: + chunk_size = D_chunk.shape[0] + D_chunk = reduce_func(D_chunk, sl.start) + _check_chunk_size(D_chunk, chunk_size) + yield D_chunk + + +class PairwiseDistancesTopk(PairwiseDistances): + _op_type_ = opcodes.PAIRWISE_DISTANCES_TOPK + + _x = KeyField("x") + _y = KeyField("y") + _k = Int64Field("k") + _metric = AnyField("metric") + _metric_kwargs = DictField("metric_kwargs") + _return_index = BoolField("return_index") + _working_memory = AnyField("working_memory") + # for chunks + _y_offset = Int64Field("y_offset") + + def __init__( + self, + x=None, + y=None, + k=None, + metric=None, + metric_kwargs=None, + return_index=None, + working_memory=None, + **kw + ): + super().__init__( + _x=x, + _y=y, + _k=k, + _metric=metric, + _metric_kwargs=metric_kwargs, + _return_index=return_index, + _working_memory=working_memory, + **kw + ) + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + @property + def k(self): + return self._k + + @property + def metric(self): + return self._metric + + @property + def metric_kwargs(self): + return self._metric_kwargs + + @property + def return_index(self): + return self._return_index + + @property + def working_memory(self): + return self._working_memory + + @property + def y_offset(self): + return self._y_offset + + @property + def output_limit(self): + return 1 if not self._return_index or self.stage == OperandStage.map else 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.stage != OperandStage.agg: + self._x = self._inputs[0] + self._y = self._inputs[1] + else: + self._x = self._y = None + + def __call__(self, X, Y): + from .pairwise import pairwise_distances + + # leverage pairwise_distances for checks + d = pairwise_distances(X, Y, metric=self._metric, **self._metric_kwargs) + + if self._k > Y.shape[0]: + self._k = Y.shape[0] + + X, Y = d.op.inputs + + shape_list = [X.shape[0]] + shape_list.append(min(Y.shape[0], self._k)) + shape = tuple(shape_list) + kws = [ + { + "shape": shape, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.float64), + "_type_": "distance", + }, + ] + if self._return_index: + kws.append( + { + "shape": shape, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "_type_": "index", + } + ) + return self.new_tensors([X, Y], kws=kws) + else: + return self.new_tensors([X, Y], kws=kws)[0] + + @classmethod + def _gen_out_chunks(cls, x_chunk, y_chunk, chunk_op): + k = chunk_op.k + i, j = x_chunk.index[0], y_chunk.index[0] + + distance_chunk_params = { + "shape": (x_chunk.shape[0], k), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.float64), + "index": (i, j), + "_type_": "distance", + } 
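+        # Alongside the distance chunk (float64), a second chunk tagged
+        # "_type_": "index" (int64) is produced only when return_index is set,
+        # matching the two outputs declared in __call__.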
+ if chunk_op.return_index: + index_chunk_params = { + "shape": (x_chunk.shape[0], k), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "index": (i, j), + "_type_": "index", + } + distance_chunk, index_chunk = chunk_op.new_chunks( + [x_chunk, y_chunk], kws=[distance_chunk_params, index_chunk_params] + ) + return distance_chunk, index_chunk + else: + return chunk_op.new_chunks([x_chunk, y_chunk], kws=[distance_chunk_params])[ + 0 + ] + + @classmethod + def tile(cls, op: "PairwiseDistancesTopk"): + X, Y = op.x, op.y + k = op.k + + if X.chunk_shape[1] > 1: + X = yield from recursive_tile(X.rechunk({1: X.shape[1]})) + + if has_unknown_shape(Y): + yield + if Y.chunk_shape[1] > 1: + Y = yield from recursive_tile(Y.rechunk({1: Y.shape[1]})) + + out_distance_chunks, out_index_chunks = [], [] + y_acc_chunk_shapes = [0] + np.cumsum(Y.nsplits[0]).tolist() + for i in range(len(range(X.chunk_shape[0]))): + x_chunk = X.cix[i, 0] + y_chunk_shape = Y.chunk_shape[0] + + if y_chunk_shape == 1: + chunk_op = op.copy().reset_key() + y_chunk = Y.chunks[0] + o = cls._gen_out_chunks(x_chunk, y_chunk, chunk_op) + if chunk_op.return_index: + out_distance_chunks.append(o[0]) + out_index_chunks.append(o[1]) + else: + out_distance_chunks.append(o) + else: + to_concat_chunks = [] + concat_size = 0 + for j in range(y_chunk_shape): + y_chunk = Y.cix[j, 0] + chunk_op = op.copy().reset_key() + chunk_op._y_offset = y_acc_chunk_shapes[j] + chunk_op.stage = OperandStage.map + size = min(k, y_chunk.shape[0]) + o = chunk_op.new_chunk( + [x_chunk, y_chunk], + shape=(x_chunk.shape[0], size), + order=TensorOrder.C_ORDER, + index=(i, j), + ) + to_concat_chunks.append(o) + concat_size += size + + concat_op = TensorConcatenate(axis=1, dtype=to_concat_chunks[0].dtype) + concat = concat_op.new_chunk( + to_concat_chunks, + shape=(x_chunk.shape[0], concat_size), + order=TensorOrder.C_ORDER, + index=(i, 0), + ) + + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.agg + distance_params = { + "shape": (x_chunk.shape[0], k), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.float64), + "index": (i, 0), + "_type_": "distance", + } + if op.return_index: + index_params = { + "shape": (x_chunk.shape[0], k), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "index": (i, 0), + "_type": "index", + } + distance_chunk, index_chunk = chunk_op.new_chunks( + [concat], kws=[distance_params, index_params] + ) + out_distance_chunks.append(distance_chunk) + out_index_chunks.append(index_chunk) + else: + out_distance_chunks.append( + chunk_op.new_chunk([concat], kws=[distance_params]) + ) + + new_op = op.copy() + nsplits = (tuple(c.shape[0] for c in out_distance_chunks), (k,)) + params = [o.params for o in op.outputs] + params[0]["chunks"] = out_distance_chunks + params[0]["nsplits"] = nsplits + if op.return_index: + params[1]["chunks"] = out_index_chunks + params[1]["nsplits"] = nsplits + return new_op.new_tensors(op.inputs, kws=params) + + @classmethod + def _topk_reduce_func(cls, dist, start, topk, xp, metric): + """Reduce a chunk of distances to topk + + Parameters + ---------- + dist : array of shape (n_samples_chunk, n_samples) + start : int + The index in X which the first row of dist corresponds to. 
+ topk : int + + Returns + ------- + dist : array of shape (n_samples_chunk, n_neighbors) + neigh : array of shape (n_samples_chunk, n_neighbors) + """ + sample_range = xp.arange(dist.shape[0])[:, None] + if topk - 1 >= dist.shape[1]: + neigh_ind = xp.repeat( + xp.arange(dist.shape[1]).reshape(1, -1), dist.shape[0], axis=0 + ) + else: + neigh_ind = xp.argpartition(dist, topk - 1, axis=1) + neigh_ind = neigh_ind[:, :topk] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[sample_range, xp.argsort(dist[sample_range, neigh_ind])] + return dist[sample_range, neigh_ind], neigh_ind + + @classmethod + def _calcuate_topk_distances(cls, x, y, op, xp): + metric = op.metric + reduce_func = partial(cls._topk_reduce_func, topk=op.k, xp=xp, metric=op.metric) + kwds = op.metric_kwargs or dict() + need_sqrt = False + if metric == "euclidean" and not kwds.get("squared", False): + need_sqrt = True + kwds["squared"] = True + chunked_results = _pariwise_distance_chunked( + x, + y, + reduce_func=reduce_func, + metric=op.metric, + working_memory=op.working_memory, + xp=xp, + **kwds + ) + neigh_dist, neigh_ind = zip(*chunked_results) + dist, ind = np.vstack(neigh_dist), np.vstack(neigh_ind) + if metric == "euclidean" and need_sqrt: + dist = xp.sqrt(dist) + if getattr(op, "y_offset", None) is not None: + ind += op.y_offset + return dist, ind + + @classmethod + def _execute_map(cls, ctx, op: "PairwiseDistancesTopk"): + (x, y), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + xp = get_array_module(x, nosparse=True) + ctx[op.outputs[0].key] = cls._calcuate_topk_distances(x, y, op, xp) + + @classmethod + def _execute_agg(cls, ctx, op: "PairwiseDistancesTopk"): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + inputs = inputs[0] + distances = inputs[0] + + with device(device_id): + dist, ind = cls._topk_reduce_func(distances, 0, op.k, xp, op.metric) + ctx[op.outputs[0].key] = dist + if op.return_index: + inds = inputs[1] + ind_result = xp.empty_like(ind) + for i in range( + len(ind_result) + ): # pylint: disable=consider-using-enumerate + ind_result[i] = inds[i][ind[i]] + ctx[op.outputs[1].key] = ind_result + + @classmethod + def _execute(cls, ctx, op: "PairwiseDistancesTopk"): + (x, y), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + xp = get_array_module(x, nosparse=True) + dist, ind = cls._calcuate_topk_distances(x, y, op, xp) + dist, ind_on_ind = cls._topk_reduce_func(dist, 0, op.k, xp, op.metric) + ctx[op.outputs[0].key] = dist + if op.return_index: + ind_result = xp.empty_like(ind_on_ind) + for i in range( + len(ind_on_ind) + ): # pylint: disable=consider-using-enumerate + ind_result[i] = ind[i][ind_on_ind[i]] + ctx[op.outputs[1].key] = ind_result + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + elif op.stage == OperandStage.agg: + return cls._execute_agg(ctx, op) + else: + return cls._execute(ctx, op) + + +def pairwise_distances_topk( + X, + Y=None, + k=None, + metric="euclidean", + return_index=True, + axis=1, + working_memory=None, + **kwds +): + if k is None: # pragma: no cover + raise ValueError("`k` has to be specified") + + if Y is None: + Y = X + if axis == 0: + X, Y = Y, X + if working_memory is None: + working_memory = options.learn.working_memory + op 
= PairwiseDistancesTopk( + x=X, + y=Y, + k=k, + metric=metric, + metric_kwargs=kwds, + return_index=return_index, + working_memory=working_memory, + ) + return op(X, Y) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/rbf_kernel.py b/python/xorbits/_mars/learn/metrics/pairwise/rbf_kernel.py new file mode 100644 index 000000000..700bbf87d --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/rbf_kernel.py @@ -0,0 +1,51 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import tensor as mt +from .core import PairwiseDistances +from .euclidean import euclidean_distances + + +def rbf_kernel(X, Y=None, gamma=None): + """ + Compute the rbf (gaussian) kernel between X and Y:: + + K(x, y) = exp(-gamma ||x-y||^2) + + for each pair of rows x in X and y in Y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : tensor of shape (n_samples_X, n_features) + + Y : tensor of shape (n_samples_Y, n_features) + + gamma : float, default None + If None, defaults to 1.0 / n_features + + Returns + ------- + kernel_matrix : tensor of shape (n_samples_X, n_samples_Y) + """ + + X, Y = PairwiseDistances.check_pairwise_arrays(X, Y) + if gamma is None: + gamma = 1.0 / X.shape[1] + + K = euclidean_distances(X, Y, squared=True) + K *= -gamma + K = mt.exp(K) + return K diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/__init__.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_cosine_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_cosine_distances.py new file mode 100644 index 000000000..ad20c032e --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_cosine_distances.py @@ -0,0 +1,50 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps +from sklearn.metrics.pairwise import cosine_distances as sk_cosine_distances + +from ..... import tensor as mt +from .. import cosine_distances + +raw_dense_x = np.random.rand(25, 10) +raw_dense_y = np.random.rand(17, 10) + +raw_sparse_x = sps.random(25, 10, density=0.5, format="csr", random_state=0) +raw_sparse_y = sps.random(17, 10, density=0.4, format="csr", random_state=1) + +raw_x_ys = [(raw_dense_x, raw_dense_y), (raw_sparse_x, raw_sparse_y)] + + +@pytest.mark.parametrize("raw_x, raw_y", raw_x_ys) +@pytest.mark.parametrize("chunk_size", [25, 6]) +def test_cosine_distances_execution(setup, raw_x, raw_y, chunk_size): + x = mt.tensor(raw_x, chunk_size=chunk_size) + y = mt.tensor(raw_y, chunk_size=chunk_size) + + d = cosine_distances(x, y) + + result = d.execute().fetch() + expected = sk_cosine_distances(raw_x, raw_y) + + np.testing.assert_almost_equal(np.asarray(result), expected) + + d = cosine_distances(x) + + result = d.execute().fetch() + expected = sk_cosine_distances(raw_x) + + np.testing.assert_almost_equal(np.asarray(result), expected) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_euclidean_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_euclidean_distances.py new file mode 100644 index 000000000..175c87e51 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_euclidean_distances.py @@ -0,0 +1,131 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps +from sklearn.metrics import euclidean_distances as sk_euclidean_distances + +from ..... import tensor as mt +from .....config import option_context +from .....lib.sparse import SparseNDArray +from ....utils import check_array +from ... 
import euclidean_distances + + +def test_euclidean_distances_op(): + x = mt.random.rand(10, 3) + xx = mt.random.rand(1, 10) + y = mt.random.rand(11, 3) + + d = euclidean_distances(x, X_norm_squared=xx) + assert d.op.x_norm_squared.key == check_array(xx).T.key + + d = euclidean_distances( + x, + y, + X_norm_squared=mt.random.rand(10, 1, dtype=mt.float32), + Y_norm_squared=mt.random.rand(1, 11, dtype=mt.float32), + ) + assert d.op.x_norm_squared is None + assert d.op.y_norm_squared is None + + # XX shape incompatible + with pytest.raises(ValueError): + euclidean_distances(x, X_norm_squared=mt.random.rand(10)) + + # XX shape incompatible + with pytest.raises(ValueError): + euclidean_distances(x, X_norm_squared=mt.random.rand(11, 1)) + + # YY shape incompatible + with pytest.raises(ValueError): + euclidean_distances(x, y, Y_norm_squared=mt.random.rand(10)) + + +def test_euclidean_distances_execution(setup): + dense_raw_x = np.random.rand(30, 10) + dense_raw_y = np.random.rand(40, 10) + sparse_raw_x = SparseNDArray(sps.random(30, 10, density=0.5, format="csr")) + sparse_raw_y = SparseNDArray(sps.random(40, 10, density=0.5, format="csr")) + + for raw_x, raw_y in [(dense_raw_x, dense_raw_y), (sparse_raw_x, sparse_raw_y)]: + x = mt.tensor(raw_x, chunk_size=9) + y = mt.tensor(raw_y, chunk_size=7) + + distance = euclidean_distances(x, y) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(raw_x, Y=raw_y) + np.testing.assert_almost_equal(result, expected) + + x_norm = x.sum(axis=1)[..., np.newaxis] + y_norm = y.sum(axis=1)[np.newaxis, ...] + distance = euclidean_distances( + x, y, X_norm_squared=x_norm, Y_norm_squared=y_norm + ) + x_raw_norm = raw_x.sum(axis=1)[..., np.newaxis] + y_raw_norm = raw_y.sum(axis=1)[np.newaxis, ...] + + result = distance.execute().fetch() + expected = sk_euclidean_distances( + raw_x, raw_y, X_norm_squared=x_raw_norm, Y_norm_squared=y_raw_norm + ) + np.testing.assert_almost_equal(result, expected) + + x_sq = (x**2).astype(np.float32) + y_sq = (y**2).astype(np.float32) + + distance = euclidean_distances(x_sq, y_sq, squared=True) + + x_raw_sq = (raw_x**2).astype(np.float32) + y_raw_sq = (raw_y**2).astype(np.float32) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(x_raw_sq, y_raw_sq, squared=True) + np.testing.assert_almost_equal(result, expected, decimal=6) + + # test x is y + distance = euclidean_distances(x) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(raw_x) + + np.testing.assert_almost_equal(result, expected) + + # test size adjust + raw1 = np.random.rand(12, 4) + raw2 = np.random.rand(18, 4) + + t1 = mt.tensor(raw1, chunk_size=4) + t2 = mt.tensor(raw2, chunk_size=6) + with option_context({"chunk_store_limit": 80}): + distance = euclidean_distances(t1, t2) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(raw1, raw2) + np.testing.assert_almost_equal(result, expected) + + distance = euclidean_distances(t2, t1) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(raw2, raw1) + np.testing.assert_almost_equal(result, expected) + + with option_context({"chunk_store_limit": 20}): + distance = euclidean_distances(t1, t2) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(raw1, raw2) + np.testing.assert_almost_equal(result, expected) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_haversine_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_haversine_distances.py new file mode 100644 
index 000000000..7b78e42ad --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_haversine_distances.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.metrics.pairwise import haversine_distances as sk_haversine_distances + +from ..... import tensor as mt +from .. import haversine_distances + + +def test_haversine_distances_op(): + # shape[1] != 2 + with pytest.raises(ValueError): + haversine_distances(mt.random.rand(10, 3)) + + # shape[1] != 2 + with pytest.raises(ValueError): + haversine_distances(mt.random.rand(10, 2), mt.random.rand(11, 3)) + + # cannot support sparse + with pytest.raises(TypeError): + haversine_distances(mt.random.randint(10, size=(10, 2), density=0.5)) + + +raw_x = np.random.rand(30, 2) +raw_y = np.random.rand(21, 2) + +# one chunk +x1 = mt.tensor(raw_x, chunk_size=30) +y1 = mt.tensor(raw_y, chunk_size=30) + +# multiple chunks +x2 = mt.tensor(raw_x, chunk_size=(11, 1)) +y2 = mt.tensor(raw_y, chunk_size=(17, 1)) + + +@pytest.mark.parametrize("x, y", [(x1, y1), (x2, y2)]) +@pytest.mark.parametrize("use_sklearn", [True, False]) +def test_haversine_distances_execution(setup, x, y, use_sklearn): + distance = haversine_distances(x, y) + distance.op._use_sklearn = use_sklearn + + result = distance.execute().fetch() + expected = sk_haversine_distances(raw_x, raw_y) + np.testing.assert_almost_equal(result, expected) + + # test x is y + distance = haversine_distances(x) + distance.op._use_sklearn = use_sklearn + + result = distance.execute().fetch() + expected = sk_haversine_distances(raw_x, raw_x) + np.testing.assert_almost_equal(result, expected) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py new file mode 100644 index 000000000..70541eea7 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps +from sklearn.metrics.pairwise import manhattan_distances as sk_manhattan_distances + +from ..... import tensor as mt +from .. 
import manhattan_distances + + +def test_manhattan_distances(): + x = mt.random.randint(10, size=(10, 3), density=0.4) + y = mt.random.randint(10, size=(11, 3), density=0.5) + + with pytest.raises(TypeError): + manhattan_distances(x, y, sum_over_features=False) + + x = x.todense() + y = y.todense() + + d = manhattan_distances(x, y, sum_over_features=True) + assert d.shape == (10, 11) + d = manhattan_distances(x, y, sum_over_features=False) + assert d.shape == (110, 3) + + +raw_x = np.random.rand(20, 5) +raw_y = np.random.rand(21, 5) + +x1 = mt.tensor(raw_x, chunk_size=30) +y1 = mt.tensor(raw_y, chunk_size=30) + +x2 = mt.tensor(raw_x, chunk_size=11) +y2 = mt.tensor(raw_y, chunk_size=12) + +raw_sparse_x = sps.random(20, 5, density=0.4, format="csr", random_state=0) +raw_sparse_y = sps.random(21, 5, density=0.3, format="csr", random_state=0) + +x3 = mt.tensor(raw_sparse_x, chunk_size=30) +y3 = mt.tensor(raw_sparse_y, chunk_size=30) + +x4 = mt.tensor(raw_sparse_x, chunk_size=11) +y4 = mt.tensor(raw_sparse_y, chunk_size=12) + + +@pytest.mark.parametrize( + "x, y, is_sparse", + [(x1, y1, False), (x2, y2, False), (x3, y3, True), (x4, y4, True)], +) +def test_manhattan_distances_execution(setup, x, y, is_sparse): + if is_sparse: + rx, ry = raw_sparse_x, raw_sparse_y + else: + rx, ry = raw_x, raw_y + + sv = [True, False] if not is_sparse else [True] + + for sum_over_features in sv: + d = manhattan_distances(x, y, sum_over_features) + + result = d.execute().fetch() + expected = sk_manhattan_distances(rx, ry, sum_over_features=sum_over_features) + + np.testing.assert_almost_equal(result, expected) + + d = manhattan_distances(x, sum_over_features=sum_over_features) + + result = d.execute().fetch() + expected = sk_manhattan_distances(rx, sum_over_features=sum_over_features) + + np.testing.assert_almost_equal(result, expected) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_pariwise_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_pariwise_distances.py new file mode 100644 index 000000000..5b61f3683 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_pariwise_distances.py @@ -0,0 +1,114 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import pairwise_distances as sk_pairwise_distances +from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors + +from ..... import tensor as mt +from .....session import execute, fetch +from ... 
import pairwise_distances, pairwise_distances_topk + + +def test_pairwise_distances_execution(setup): + raw_x = np.random.rand(20, 5) + raw_y = np.random.rand(21, 5) + + x = mt.tensor(raw_x, chunk_size=11) + y = mt.tensor(raw_y, chunk_size=12) + + d = pairwise_distances(x, y) + result = d.execute().fetch() + expected = sk_pairwise_distances(raw_x, raw_y) + np.testing.assert_almost_equal(result, expected) + + # test precomputed + d2 = d.copy() + d2[0, 0] = -1 + d2 = pairwise_distances(d2, y, metric="precomputed") + with pytest.raises(ValueError): + _ = d2.execute().fetch() + + # test cdist + weight = np.random.rand(5) + d = pairwise_distances(x, y, metric="wminkowski", p=3, w=weight) + result = d.execute().fetch() + expected = sk_pairwise_distances(raw_x, raw_y, metric="minkowski", p=3, w=weight) + np.testing.assert_almost_equal(result, expected) + + # test pdist + d = pairwise_distances(x, metric="hamming") + result = d.execute().fetch() + expected = sk_pairwise_distances(raw_x, metric="hamming") + np.testing.assert_almost_equal(result, expected) + + # test function metric + m = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + d = pairwise_distances(x, y, metric=m) + result = d.execute().fetch() + expected = sk_pairwise_distances(raw_x, raw_y, metric=m) + np.testing.assert_almost_equal(result, expected) + + with pytest.warns(DataConversionWarning): + pairwise_distances(x, y, metric="jaccard") + + with pytest.raises(ValueError): + _ = pairwise_distances(x, y, metric="unknown") + + +def test_pairwise_distances_topk_execution(setup): + rs = np.random.RandomState(0) + raw_x = rs.rand(20, 5) + raw_y = rs.rand(21, 5) + + x = mt.tensor(raw_x, chunk_size=11) + y = mt.tensor(raw_y, chunk_size=12) + + d, i = pairwise_distances_topk(x, y, 3, metric="euclidean", return_index=True) + result = fetch(*execute(d, i)) + nn = SkNearestNeighbors(n_neighbors=3, algorithm="brute", metric="euclidean") + nn.fit(raw_y) + expected = nn.kneighbors(raw_x, return_distance=True) + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_array_equal(result[1], expected[1]) + + x = mt.tensor(raw_x, chunk_size=(11, 3)) + + d = pairwise_distances_topk(x, k=4, metric="euclidean", return_index=False) + result = d.execute().fetch() + nn = SkNearestNeighbors(n_neighbors=3, algorithm="brute", metric="euclidean") + nn.fit(raw_x) + expected = nn.kneighbors(return_distance=True)[0] + np.testing.assert_almost_equal(result[:, 1:], expected) + + y = mt.tensor(raw_y, chunk_size=21) + + d, i = pairwise_distances_topk( + x, y, 3, metric="cosine", return_index=True, working_memory="168" + ) + result = fetch(*execute(d, i)) + nn = SkNearestNeighbors(n_neighbors=3, algorithm="brute", metric="cosine") + nn.fit(raw_y) + expected = nn.kneighbors(raw_x, return_distance=True) + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_array_equal(result[1], expected[1]) + + d = pairwise_distances_topk(x, y, 3, metric="cosine", axis=0, return_index=False) + result = d.execute().fetch() + nn = SkNearestNeighbors(n_neighbors=3, algorithm="brute", metric="cosine") + nn.fit(raw_x) + expected = nn.kneighbors(raw_y, return_distance=True)[0] + np.testing.assert_almost_equal(result, expected) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_rbf_kernel.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_rbf_kernel.py new file mode 100644 index 000000000..510ae679d --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_rbf_kernel.py @@ -0,0 +1,30 @@ +# Copyright 2022-2023 
XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from sklearn.metrics.pairwise import rbf_kernel as sklearn_rbf_kernel + +from .. import rbf_kernel + + +def test_rbf_kernel(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 4) + raw_Y = rs.rand(11, 4) + + r = rbf_kernel(raw_X, raw_Y) + result = r.to_numpy() + expected = sklearn_rbf_kernel(raw_X, raw_Y) + + np.testing.assert_almost_equal(result, expected) diff --git a/python/xorbits/_mars/learn/metrics/tests/__init__.py b/python/xorbits/_mars/learn/metrics/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/metrics/tests/test_classification.py b/python/xorbits/_mars/learn/metrics/tests/test_classification.py new file mode 100644 index 000000000..6cb95cf96 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/tests/test_classification.py @@ -0,0 +1,591 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn import datasets, svm +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.metrics import accuracy_score as sklearn_accuracy_score +from sklearn.utils import check_random_state +from sklearn.utils._mocking import MockDataFrame +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +from .... import execute, fetch +from .... import tensor as mt +from ....lib.sparse import SparseNDArray +from .. 
import accuracy_score, log_loss +from .._classification import ( + _check_targets, + f1_score, + fbeta_score, + multilabel_confusion_matrix, + precision_recall_fscore_support, + precision_score, + recall_score, +) + +IND = "multilabel-indicator" +MC = "multiclass" +BIN = "binary" +CNT = "continuous" +MMC = "multiclass-multioutput" +MCN = "continuous-multioutput" +# all of length 3 +EXAMPLES = [ + (IND, np.array([[0, 1, 1], [1, 0, 0], [0, 0, 1]])), + # must not be considered binary + (IND, np.array([[0, 1], [1, 0], [1, 1]])), + (MC, [2, 3, 1]), + (BIN, [0, 1, 1]), + (CNT, [0.0, 1.5, 1.0]), + (MC, np.array([[2], [3], [1]])), + (BIN, np.array([[0], [1], [1]])), + (CNT, np.array([[0.0], [1.5], [1.0]])), + (MMC, np.array([[0, 2], [1, 3], [2, 3]])), + (MCN, np.array([[0.5, 2.0], [1.1, 3.0], [2.0, 3.0]])), +] +# expected type given input types, or None for error +# (types will be tried in either order) +EXPECTED = { + (IND, IND): IND, + (MC, MC): MC, + (BIN, BIN): BIN, + (MC, IND): None, + (BIN, IND): None, + (BIN, MC): MC, + # Disallowed types + (CNT, CNT): None, + (MMC, MMC): None, + (MCN, MCN): None, + (IND, CNT): None, + (MC, CNT): None, + (BIN, CNT): None, + (MMC, CNT): None, + (MCN, CNT): None, + (IND, MMC): None, + (MC, MMC): None, + (BIN, MMC): None, + (MCN, MMC): None, + (IND, MCN): None, + (MC, MCN): None, + (BIN, MCN): None, +} + + +############################################################################### +# Utilities for testing + + +def make_prediction(dataset=None, binary=False): + """Make some classification predictions on a toy dataset using a SVC + + If binary is True restrict to a binary classification problem instead of a + multiclass classification problem + """ + + if dataset is None: + # import some data to play with + dataset = datasets.load_iris() + + X = dataset.data + y = dataset.target + + if binary: + # restrict to a binary classification task + X, y = X[y < 2], y[y < 2] + + n_samples, n_features = X.shape + p = np.arange(n_samples) + + rng = check_random_state(37) + rng.shuffle(p) + X, y = X[p], y[p] + half = int(n_samples / 2) + + # add noisy features to make the problem harder and avoid perfect results + rng = np.random.RandomState(0) + X = np.c_[X, rng.randn(n_samples, 200 * n_features)] + + # run classifier, get class probabilities and label predictions + clf = svm.SVC(kernel="linear", probability=True, random_state=0) + probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) + + if binary: + # only interested in probabilities of the positive case + # XXX: do we really want a special API for the binary case? + probas_pred = probas_pred[:, 1] + + y_pred = clf.predict(X[half:]) + y_true = y[half:] + return y_true, y_pred, probas_pred + + +@pytest.mark.parametrize("type1, y1", EXAMPLES) +@pytest.mark.parametrize("type2, y2", EXAMPLES) +def test__check_targets(setup, type1, y1, type2, y2): + # Check that _check_targets correctly merges target types, squeezes + # output and fails if input lengths differ. 
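+    # For example, EXPECTED[(BIN, MC)] == MC: binary targets are promoted to
+    # multiclass when mixed with multiclass ones, while EXPECTED[(MC, IND)] is
+    # None, i.e. mixing multiclass with multilabel-indicator must raise ValueError.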
+ try: + expected = EXPECTED[type1, type2] + except KeyError: + expected = EXPECTED[type2, type1] + if expected is None: + with pytest.raises(ValueError): + _check_targets(y1, y2).execute() + + if type1 != type2: + with pytest.raises(ValueError): + _check_targets(y1, y2).execute() + + else: + if type1 not in (BIN, MC, IND): + with pytest.raises(ValueError): + _check_targets(y1, y2).execute() + + else: + merged_type, y1out, y2out = _check_targets(y1, y2).execute().fetch() + assert merged_type == expected + if merged_type.startswith("multilabel"): + assert isinstance(y1out, SparseNDArray) + assert isinstance(y2out, SparseNDArray) + else: + np.testing.assert_array_equal(y1out, np.squeeze(y1)) + np.testing.assert_array_equal(y2out, np.squeeze(y2)) + with pytest.raises(ValueError): + _check_targets(y1[:-1], y2).execute() + + +def test_accuracy_score(setup): + y_pred = [0, 2, 1, 3] + y_true = [0, 1, 2, 3] + + score = accuracy_score(y_true, y_pred) + result = score.execute().fetch() + expected = sklearn_accuracy_score(y_true, y_pred) + assert pytest.approx(result) == expected + + score = accuracy_score(y_true, y_pred, normalize=False) + result = score.execute().fetch() + expected = sklearn_accuracy_score(y_true, y_pred, normalize=False) + assert pytest.approx(result) == expected + + y_pred = np.array([[0, 1], [1, 1]]) + y_true = np.ones((2, 2)) + score = accuracy_score(y_true, y_pred) + result = score.execute().fetch() + expected = sklearn_accuracy_score(y_true, y_pred) + assert pytest.approx(result) == expected + + sample_weight = [0.7, 0.3] + score = accuracy_score(y_true, y_pred, sample_weight=sample_weight) + result = score.execute().fetch() + expected = sklearn_accuracy_score(y_true, y_pred, sample_weight=sample_weight) + assert pytest.approx(result) == expected + + score = accuracy_score( + mt.tensor(y_true), + mt.tensor(y_pred), + sample_weight=mt.tensor(sample_weight), + normalize=False, + ) + result = score.execute().fetch() + expected = sklearn_accuracy_score( + y_true, y_pred, sample_weight=sample_weight, normalize=False + ) + assert pytest.approx(result) == expected + + +def test_log_loss(setup): + # binary case with symbolic labels ("no" < "yes") + y_true = ["no", "no", "no", "yes", "yes", "yes"] + y_pred = mt.array( + [[0.5, 0.5], [0.1, 0.9], [0.01, 0.99], [0.9, 0.1], [0.75, 0.25], [0.001, 0.999]] + ) + loss = log_loss(y_true, y_pred).fetch() + assert_almost_equal(loss, 1.8817971) + + # multiclass case; adapted from http://bit.ly/RJJHWA + y_true = [1, 0, 2] + y_pred = [[0.2, 0.7, 0.1], [0.6, 0.2, 0.2], [0.6, 0.1, 0.3]] + loss = log_loss(y_true, y_pred, normalize=True).fetch() + assert_almost_equal(loss, 0.6904911) + + # check that we got all the shapes and axes right + # by doubling the length of y_true and y_pred + y_true *= 2 + y_pred *= 2 + loss = log_loss(y_true, y_pred, normalize=False).fetch() + assert_almost_equal(loss, 0.6904911 * 6, decimal=6) + + # check eps and handling of absolute zero and one probabilities + y_pred = np.asarray(y_pred) > 0.5 + loss = log_loss(y_true, y_pred, normalize=True, eps=0.1).fetch() + assert_almost_equal(loss, log_loss(y_true, np.clip(y_pred, 0.1, 0.9)).fetch()) + + # raise error if number of classes are not equal. 
+ y_true = [1, 0, 2] + y_pred = [[0.2, 0.7], [0.6, 0.5], [0.4, 0.1]] + with pytest.raises(ValueError): + log_loss(y_true, y_pred) + with pytest.raises(ValueError): + log_loss(y_true, y_pred, labels=[0, 1, 2]) + + # case when y_true is a string array object + y_true = ["ham", "spam", "spam", "ham"] + y_pred = [[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]] + loss = log_loss(y_true, y_pred).fetch() + assert_almost_equal(loss, 1.0383217, decimal=6) + + # test labels option + + y_true = [2, 2] + y_pred = [[0.2, 0.7], [0.6, 0.5]] + y_score = np.array([[0.1, 0.9], [0.1, 0.9]]) + error_str = ( + r"y_true contains only one label \(2\). Please provide " + r"the true labels explicitly through the labels argument." + ) + with pytest.raises(ValueError, match=error_str): + log_loss(y_true, y_pred) + error_str = ( + r"The labels array needs to contain at least two " + r"labels for log_loss, got \[1\]." + ) + with pytest.raises(ValueError, match=error_str): + log_loss(y_true, y_pred, labels=[1]) + + # works when the labels argument is used + + true_log_loss = -np.mean(np.log(y_score[:, 1])) + calculated_log_loss = log_loss(y_true, y_score, labels=[1, 2]).fetch() + assert_almost_equal(calculated_log_loss, true_log_loss) + + # ensure labels work when len(np.unique(y_true)) != y_pred.shape[1] + y_true = [1, 2, 2] + y_score2 = [[0.2, 0.7, 0.3], [0.6, 0.5, 0.3], [0.3, 0.9, 0.1]] + loss = log_loss(y_true, y_score2, labels=[1, 2, 3]).fetch() + assert_almost_equal(loss, 1.0630345, decimal=6) + + +def test_log_loss_pandas_input(setup): + # case when input is a pandas series and dataframe gh-5715 + y_tr = np.array(["ham", "spam", "spam", "ham"]) + y_pr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]]) + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TrueInputType, PredInputType in types: + # y_pred dataframe, y_true series + y_true, y_pred = TrueInputType(y_tr), PredInputType(y_pr) + loss = log_loss(y_true, y_pred).fetch() + assert_almost_equal(loss, 1.0383217, decimal=6) + + +def test_multilabel_confusion_matrix_binary(setup): + # Test multilabel confusion matrix - binary classification case + y_true, y_pred, _ = make_prediction(binary=True) + y_true = mt.tensor(y_true, chunk_size=40) + y_pred = mt.tensor(y_pred, chunk_size=40) + + def run_test(y_true, y_pred): + cm = multilabel_confusion_matrix(y_true, y_pred).fetch() + assert_array_equal(cm, [[[17, 8], [3, 22]], [[22, 3], [8, 17]]]) + + run_test(y_true, y_pred) + run_test(y_true.astype(str), y_pred.astype(str)) + + +def test_multilabel_confusion_matrix_multiclass(setup): + # Test multilabel confusion matrix - multi-class case + y_true, y_pred, _ = make_prediction(binary=False) + y_true = mt.tensor(y_true, chunk_size=40) + y_pred = mt.tensor(y_pred, chunk_size=40) + + def run_test(y_true, y_pred, string_type=False): + # compute confusion matrix with default labels introspection + cm = multilabel_confusion_matrix(y_true, y_pred).fetch() + assert_array_equal( + cm, [[[47, 4], [5, 19]], [[38, 6], [28, 3]], [[30, 25], [2, 18]]] + ) + + # compute confusion matrix with explicit label ordering + labels = ["0", "2", "1"] if string_type else [0, 2, 1] + cm = multilabel_confusion_matrix(y_true, y_pred, labels=labels).fetch() + assert_array_equal( + cm, [[[47, 4], [5, 19]], [[30, 25], [2, 18]], [[38, 6], [28, 3]]] + ) + + # compute confusion matrix with super set of present labels + labels = ["0", "2", "1", "3"] if string_type else [0, 2, 1, 3] + cm 
= multilabel_confusion_matrix(y_true, y_pred, labels=labels).fetch() + assert_array_equal( + cm, + [ + [[47, 4], [5, 19]], + [[30, 25], [2, 18]], + [[38, 6], [28, 3]], + [[75, 0], [0, 0]], + ], + ) + + run_test(y_true, y_pred) + run_test(y_true.astype(str), y_pred.astype(str), string_type=True) + + +def test_multilabel_confusion_matrix_multilabel(setup): + # Test multilabel confusion matrix - multilabel-indicator case + from scipy.sparse import csc_matrix, csr_matrix + + y_true = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]]) + y_pred = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]]) + y_true_csr = csr_matrix(y_true) + y_pred_csr = csr_matrix(y_pred) + y_true_csc = csc_matrix(y_true) + y_pred_csc = csc_matrix(y_pred) + + y_true_t = mt.tensor(y_true) + y_pred_t = mt.tensor(y_pred) + + # cross test different types + sample_weight = np.array([2, 1, 3]) + real_cm = [[[1, 0], [1, 1]], [[1, 0], [1, 1]], [[0, 2], [1, 0]]] + trues = [y_true_t, y_true_csr, y_true_csc] + preds = [y_pred_t, y_pred_csr, y_pred_csc] + + for y_true_tmp in trues: + for y_pred_tmp in preds: + cm = multilabel_confusion_matrix(y_true_tmp, y_pred_tmp).fetch() + assert_array_equal(cm, real_cm) + + # test support for samplewise + cm = multilabel_confusion_matrix(y_true_t, y_pred_t, samplewise=True).fetch() + assert_array_equal(cm, [[[1, 0], [1, 1]], [[1, 1], [0, 1]], [[0, 1], [2, 0]]]) + + # test support for labels + cm = multilabel_confusion_matrix(y_true_t, y_pred_t, labels=[2, 0]).fetch() + assert_array_equal(cm, [[[0, 2], [1, 0]], [[1, 0], [1, 1]]]) + + # test support for labels with samplewise + cm = multilabel_confusion_matrix( + y_true_t, y_pred_t, labels=[2, 0], samplewise=True + ).fetch() + assert_array_equal(cm, [[[0, 0], [1, 1]], [[1, 1], [0, 0]], [[0, 1], [1, 0]]]) + + # test support for sample_weight with sample_wise + cm = multilabel_confusion_matrix( + y_true_t, y_pred_t, sample_weight=sample_weight, samplewise=True + ).fetch() + assert_array_equal(cm, [[[2, 0], [2, 2]], [[1, 1], [0, 1]], [[0, 3], [6, 0]]]) + + +def test_multilabel_confusion_matrix_errors(setup): + y_true = mt.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]]) + y_pred = mt.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]]) + + # Bad sample_weight + with pytest.raises(ValueError, match="inconsistent numbers of samples"): + multilabel_confusion_matrix(y_true, y_pred, sample_weight=[1, 2]) + with pytest.raises(ValueError, match="should be a 1d array"): + multilabel_confusion_matrix( + y_true, y_pred, sample_weight=[[1, 2, 3], [2, 3, 4], [3, 4, 5]] + ) + + # Bad labels + err_msg = r"All labels must be in \[0, n labels\)" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix(y_true, y_pred, labels=[-1]) + err_msg = r"All labels must be in \[0, n labels\)" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix(y_true, y_pred, labels=[3]) + + # Using samplewise outside multilabel + with pytest.raises(ValueError, match="Samplewise metrics"): + multilabel_confusion_matrix([0, 1, 2], [1, 2, 0], samplewise=True) + + # Bad y_type + err_msg = "multiclass-multioutput is not supported" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix([[0, 1, 2], [2, 1, 0]], [[1, 2, 0], [1, 0, 2]]) + + +@pytest.mark.parametrize("average", ["macro", "micro", "weighted", "samples"]) +def test_precision_recall_f1_no_labels_check_warnings(setup, average): + y_true = mt.zeros((20, 3)) + y_pred = mt.zeros_like(y_true) + + func = precision_recall_fscore_support + with pytest.warns(UndefinedMetricWarning): + p, r, f, s = func(y_true, 
y_pred, average=average, beta=1.0) + p, r, f = fetch(execute(p, r, f)) + + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert s is None + + with pytest.warns(UndefinedMetricWarning): + fbeta = fetch(execute(fbeta_score(y_true, y_pred, average=average, beta=1.0))) + + assert_almost_equal(fbeta, 0) + + +def test_precision_recall_f1_score_multiclass(setup): + # Test Precision Recall and F1 Score for multiclass classification task + y_true, y_pred, _ = make_prediction(binary=False) + y_true = mt.tensor(y_true, chunk_size=40) + y_pred = mt.tensor(y_pred, chunk_size=40) + + # compute scores with default labels introspection + p, r, f, s = fetch( + execute(precision_recall_fscore_support(y_true, y_pred, average=None)) + ) + assert_array_almost_equal(p, [0.83, 0.33, 0.42], 2) + assert_array_almost_equal(r, [0.79, 0.09, 0.90], 2) + assert_array_almost_equal(f, [0.81, 0.15, 0.57], 2) + assert_array_equal(s, [24, 31, 20]) + + # averaging tests + ps = fetch(execute(precision_score(y_true, y_pred, pos_label=1, average="micro"))) + assert_array_almost_equal(ps, 0.53, 2) + + rs = fetch(execute(recall_score(y_true, y_pred, average="micro"))) + assert_array_almost_equal(rs, 0.53, 2) + + fs = fetch(execute(f1_score(y_true, y_pred, average="micro"))) + assert_array_almost_equal(fs, 0.53, 2) + + ps = fetch(execute(precision_score(y_true, y_pred, average="macro"))) + assert_array_almost_equal(ps, 0.53, 2) + + rs = fetch(execute(recall_score(y_true, y_pred, average="macro"))) + assert_array_almost_equal(rs, 0.60, 2) + + fs = fetch(execute(f1_score(y_true, y_pred, average="macro"))) + assert_array_almost_equal(fs, 0.51, 2) + + ps = fetch(execute(precision_score(y_true, y_pred, average="weighted"))) + assert_array_almost_equal(ps, 0.51, 2) + + rs = fetch(execute(recall_score(y_true, y_pred, average="weighted"))) + assert_array_almost_equal(rs, 0.53, 2) + + fs = fetch(execute(f1_score(y_true, y_pred, average="weighted"))) + assert_array_almost_equal(fs, 0.47, 2) + + with pytest.raises(ValueError): + precision_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + recall_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + f1_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + fbeta_score(y_true, y_pred, average="samples", beta=0.5) + + # same prediction but with and explicit label ordering + p, r, f, s = fetch( + execute( + precision_recall_fscore_support( + y_true, y_pred, labels=[0, 2, 1], average=None + ) + ) + ) + assert_array_almost_equal(p, [0.83, 0.41, 0.33], 2) + assert_array_almost_equal(r, [0.79, 0.90, 0.10], 2) + assert_array_almost_equal(f, [0.81, 0.57, 0.15], 2) + assert_array_equal(s, [24, 20, 31]) + + +@pytest.mark.parametrize("average", ["samples", "micro", "macro", "weighted", None]) +def test_precision_refcall_f1_score_multilabel_unordered_labels(setup, average): + # test that labels need not be sorted in the multilabel case + y_true = mt.array([[1, 1, 0, 0]]) + y_pred = mt.array([[0, 0, 1, 1]]) + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, labels=[3, 0, 1, 2], warn_for=[], average=average + ) + p, r, f = fetch(execute(p, r, f)) + assert_array_equal(p, 0) + assert_array_equal(r, 0) + assert_array_equal(f, 0) + if average is None: + assert_array_equal(s, [0, 1, 1, 0]) + + +def test_precision_recall_f1_score_binary_averaged(setup): + y_true = mt.array([0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1], chunk_size=10) + y_pred = mt.array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 
1, 0, 1], chunk_size=10) + + # compute scores with default labels introspection + ps, rs, fs, _ = precision_recall_fscore_support(y_true, y_pred, average=None) + ps, rs, fs = fetch(execute(ps, rs, fs)) + p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average="macro") + p, r, f = fetch(execute(p, r, f)) + assert p == np.mean(ps) + assert r == np.mean(rs) + assert f == np.mean(fs) + p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted") + p, r, f = fetch(execute(p, r, f)) + support = np.bincount(y_true).execute().fetch() + assert p == np.average(ps, weights=support) + assert r == np.average(rs, weights=support) + assert f == np.average(fs, weights=support) + + +def test_zero_precision_recall(setup): + # Check that pathological cases do not bring NaNs + + old_error_settings = np.seterr(all="raise") + + try: + y_true = mt.array([0, 1, 2, 0, 1, 2], chunk_size=4) + y_pred = mt.array([2, 0, 1, 1, 2, 0], chunk_size=4) + + assert_almost_equal( + precision_score(y_true, y_pred, average="macro").execute().fetch(), 0.0, 2 + ) + assert_almost_equal( + recall_score(y_true, y_pred, average="macro").execute().fetch(), 0.0, 2 + ) + assert_almost_equal( + f1_score(y_true, y_pred, average="macro").execute().fetch(), 0.0, 2 + ) + + finally: + np.seterr(**old_error_settings) + + +def test_precision_recall_f_binary_single_class(setup): + # Test precision, recall and F-scores behave with a single positive or + # negative class + # Such a case may occur with non-stratified cross-validation + assert 1.0 == fetch(execute(precision_score([1, 1], [1, 1]))) + assert 1.0 == fetch(execute(recall_score([1, 1], [1, 1]))) + assert 1.0 == fetch(execute(f1_score([1, 1], [1, 1]))) + assert 1.0 == fetch(execute(fbeta_score([1, 1], [1, 1], beta=0))) + + assert 0.0 == fetch(execute(precision_score([-1, -1], [-1, -1]))) + assert 0.0 == fetch(execute(recall_score([-1, -1], [-1, -1]))) + assert 0.0 == fetch(execute(f1_score([-1, -1], [-1, -1]))) + assert 0.0 == fetch(execute(fbeta_score([-1, -1], [-1, -1], beta=float("inf")))) + assert fetch( + execute(fbeta_score([-1, -1], [-1, -1], beta=float("inf"))) + ) == pytest.approx(fetch(execute(fbeta_score([-1, -1], [-1, -1], beta=1e5)))) diff --git a/python/xorbits/_mars/learn/metrics/tests/test_ranking.py b/python/xorbits/_mars/learn/metrics/tests/test_ranking.py new file mode 100644 index 000000000..9ce5d73c8 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/tests/test_ranking.py @@ -0,0 +1,699 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re +import warnings + +import numpy as np +import pandas as pd +import pytest +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.metrics import accuracy_score as sklearn_accuracy_score +from sklearn.metrics import auc as sklearn_auc +from sklearn.metrics import roc_curve as sklearn_roc_curve +from sklearn.metrics.tests.test_ranking import _auc, make_prediction +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_almost_equal, assert_array_almost_equal + +from .... import dataframe as md +from .... import tensor as mt +from ...utils.extmath import softmax +from .. import accuracy_score, auc, roc_auc_score, roc_curve + + +def _partial_roc_auc_score(y_true, y_predict, max_fpr): + """Alternative implementation to check for correctness of `roc_auc_score` + with `max_fpr` set. + """ + + def _partial_roc(y_true, y_predict, max_fpr): + fpr, tpr, _ = sklearn_roc_curve(y_true, y_predict) + new_fpr = fpr[fpr <= max_fpr] + new_fpr = np.append(new_fpr, max_fpr) + new_tpr = tpr[fpr <= max_fpr] + idx_out = np.argmax(fpr > max_fpr) + idx_in = idx_out - 1 + x_interp = [fpr[idx_in], fpr[idx_out]] + y_interp = [tpr[idx_in], tpr[idx_out]] + new_tpr = np.append(new_tpr, np.interp(max_fpr, x_interp, y_interp)) + return (new_fpr, new_tpr) + + new_fpr, new_tpr = _partial_roc(y_true, y_predict, max_fpr) + partial_auc = sklearn_auc(new_fpr, new_tpr) + + # Formula (5) from McClish 1989 + fpr1 = 0 + fpr2 = max_fpr + min_area = 0.5 * (fpr2 - fpr1) * (fpr2 + fpr1) + max_area = fpr2 - fpr1 + return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) + + +@pytest.mark.parametrize("drop", [True, False]) +def test_roc_curve(setup, drop): + # Test Area under Receiver Operating Characteristic (ROC) curve + y_true, _, probas_pred = make_prediction(binary=True) + expected_auc = _auc(y_true, probas_pred) + + fpr, tpr, thresholds = ( + roc_curve(y_true, probas_pred, drop_intermediate=drop).execute().fetch() + ) + roc_auc = auc(fpr, tpr).to_numpy() + np.testing.assert_array_almost_equal(roc_auc, expected_auc, decimal=2) + np.testing.assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_end_points(setup): + # Make sure that roc_curve returns a curve start at 0 and ending and + # 1 even in corner cases + rng = np.random.RandomState(0) + y_true = np.array([0] * 50 + [1] * 50) + y_pred = rng.randint(3, size=100) + fpr, tpr, thr = roc_curve(y_true, y_pred, drop_intermediate=True).fetch() + assert fpr[0] == 0 + assert fpr[-1] == 1 + assert fpr.shape == tpr.shape + assert fpr.shape == thr.shape + + +def test_roc_returns_consistency(setup): + # Test whether the returned threshold matches up with tpr + # make small toy dataset + y_true, _, probas_pred = make_prediction(binary=True) + fpr, tpr, thresholds = roc_curve(y_true, probas_pred).fetch() + + # use the given thresholds to determine the tpr + tpr_correct = [] + for t in thresholds: + tp = np.sum((probas_pred >= t) & y_true) + p = np.sum(y_true) + tpr_correct.append(1.0 * tp / p) + + # compare tpr and tpr_correct to see if the thresholds' order was correct + np.testing.assert_array_almost_equal(tpr, tpr_correct, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_multi(setup): + # roc_curve not applicable for multi-class problems + y_true, _, probas_pred = make_prediction(binary=False) + + with pytest.raises(ValueError): + roc_curve(y_true, probas_pred) + 
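Editorial note: the helper _partial_roc_auc_score above rescales the raw partial area with Formula (5) from McClish 1989, so that 0.5 always corresponds to chance and 1.0 to a perfect ranking regardless of max_fpr. A minimal sketch with made-up numbers, included only to illustrate that rescaling (the values are hypothetical and not part of the patch):

    # Hypothetical inputs; they mirror the arithmetic in _partial_roc_auc_score
    # above but are not taken from any test in this file.
    max_fpr = 0.5
    partial_auc = 0.4                    # raw area under the ROC curve for fpr <= max_fpr
    min_area = 0.5 * max_fpr * max_fpr   # chance-level area up to max_fpr (here 0.125)
    max_area = max_fpr                   # perfect-classifier area up to max_fpr (here 0.5)
    standardized = 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area))
    print(round(standardized, 4))        # 0.8667; 0.5 would mean chance, 1.0 perfect

The tests further down (test_partial_roc_auc_score) check that roc_auc_score(..., max_fpr=...) agrees with this alternative computation.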
+ +def test_roc_curve_confidence(setup): + # roc_curve for confidence scores + y_true, _, probas_pred = make_prediction(binary=True) + + fpr, tpr, thresholds = roc_curve(y_true, probas_pred - 0.5) + roc_auc = auc(fpr, tpr).fetch() + np.testing.assert_array_almost_equal(roc_auc, 0.90, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_hard(setup): + # roc_curve for hard decisions + y_true, pred, probas_pred = make_prediction(binary=True) + + # always predict one + trivial_pred = np.ones(y_true.shape) + fpr, tpr, thresholds = roc_curve(y_true, trivial_pred) + roc_auc = auc(fpr, tpr).fetch() + np.testing.assert_array_almost_equal(roc_auc, 0.50, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # always predict zero + trivial_pred = np.zeros(y_true.shape) + fpr, tpr, thresholds = roc_curve(y_true, trivial_pred) + roc_auc = auc(fpr, tpr).fetch() + np.testing.assert_array_almost_equal(roc_auc, 0.50, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # hard decisions + fpr, tpr, thresholds = roc_curve(y_true, pred) + roc_auc = auc(fpr, tpr).fetch() + np.testing.assert_array_almost_equal(roc_auc, 0.78, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_one_label(setup): + y_true = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + y_pred = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + # assert there are warnings + w = UndefinedMetricWarning + with pytest.warns(w): + fpr, tpr, thresholds = roc_curve(y_true, y_pred) + # all true labels, all fpr should be nan + np.testing.assert_array_equal(fpr.fetch(), np.full(len(thresholds), np.nan)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # assert there are warnings + with pytest.warns(w): + fpr, tpr, thresholds = roc_curve([1 - x for x in y_true], y_pred) + # all negative labels, all tpr should be nan + np.testing.assert_array_equal(tpr.fetch(), np.full(len(thresholds), np.nan)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_toydata(setup): + # Binary classification + y_true = [0, 1] + y_score = [0, 1] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 0, 1]) + assert_array_almost_equal(fpr, [0, 1, 1]) + assert_almost_equal(roc_auc, 1.0) + + y_true = [0, 1] + y_score = [1, 0] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1, 1]) + assert_array_almost_equal(fpr, [0, 0, 1]) + assert_almost_equal(roc_auc, 0.0) + + y_true = [1, 0] + y_score = [1, 1] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1]) + assert_array_almost_equal(fpr, [0, 1]) + assert_almost_equal(roc_auc, 0.5) + + y_true = [1, 0] + y_score = [1, 0] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 0, 1]) + assert_array_almost_equal(fpr, [0, 1, 1]) + assert_almost_equal(roc_auc, 1.0) + + y_true = [1, 0] + y_score = [0.5, 0.5] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1]) + assert_array_almost_equal(fpr, [0, 1]) + assert_almost_equal(roc_auc, 0.5) + + y_true = [0, 0] + y_score = [0.25, 0.75] + # assert UndefinedMetricWarning because of no positive sample in y_true + expected_message = ( + "No positive 
samples in y_true, true positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + tpr, fpr, _ = roc_curve(y_true, y_score) + + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0.0, 0.5, 1.0]) + assert_array_almost_equal(fpr, [np.nan, np.nan, np.nan]) + + y_true = [1, 1] + y_score = [0.25, 0.75] + # assert UndefinedMetricWarning because of no negative sample in y_true + expected_message = ( + "No negative samples in y_true, false positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + tpr, fpr, _ = roc_curve(y_true, y_score) + + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [np.nan, np.nan, np.nan]) + assert_array_almost_equal(fpr, [0.0, 0.5, 1.0]) + + # Multi-label classification task + y_true = np.array([[0, 1], [0, 1]]) + y_score = np.array([[0, 1], [0, 1]]) + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score, average="macro") + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score, average="weighted") + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 1.0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 1.0) + + y_true = np.array([[0, 1], [0, 1]]) + y_score = np.array([[0, 1], [1, 0]]) + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score, average="macro") + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score, average="weighted") + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5) + + y_true = np.array([[1, 0], [0, 1]]) + y_score = np.array([[0, 1], [1, 0]]) + assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0) + + y_true = np.array([[1, 0], [0, 1]]) + y_score = np.array([[0.5, 0.5], [0.5, 0.5]]) + assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5) + + +def test_roc_curve_drop_intermediate(setup): + # Test that drop_intermediate drops the correct thresholds + y_true = [0, 0, 0, 0, 1, 1] + y_score = [0.0, 0.2, 0.5, 0.6, 0.7, 1.0] + tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True) + np.testing.assert_array_almost_equal(thresholds.fetch(), [2.0, 1.0, 0.7, 0.0]) + + # Test dropping thresholds with repeating scores + y_true = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + y_score = [0.0, 0.1, 0.6, 0.6, 0.7, 0.8, 0.9, 0.6, 0.7, 0.8, 0.9, 0.9, 1.0] + tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True) + np.testing.assert_array_almost_equal( + thresholds.fetch(), [2.0, 1.0, 0.9, 0.7, 0.6, 0.0] + ) + + +def test_roc_curve_fpr_tpr_increasing(setup): + # Ensure that fpr and tpr returned by roc_curve are increasing. + # Construct an edge case with float y_score and sample_weight + # when some adjacent values of fpr and tpr are actually the same. 
+ y_true = [0, 0, 1, 1, 1] + y_score = [0.1, 0.7, 0.3, 0.4, 0.5] + sample_weight = np.repeat(0.2, 5) + fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight) + assert ((mt.diff(fpr) < 0).sum() == 0).to_numpy() + assert ((mt.diff(tpr) < 0).sum() == 0).to_numpy() + + +def test_auc(setup): + # Test Area Under Curve (AUC) computation + x = [0, 1] + y = [0, 1] + np.testing.assert_array_almost_equal(auc(x, y).fetch(), 0.5) + x = [1, 0] + y = [0, 1] + np.testing.assert_array_almost_equal(auc(x, y).fetch(), 0.5) + x = [1, 0, 0] + y = [0, 1, 1] + np.testing.assert_array_almost_equal(auc(x, y).fetch(), 0.5) + x = [0, 1] + y = [1, 1] + np.testing.assert_array_almost_equal(auc(x, y).fetch(), 1) + x = [0, 0.5, 1] + y = [0, 0.5, 1] + np.testing.assert_array_almost_equal(auc(x, y).fetch(), 0.5) + + +def test_auc_errors(setup): + # Incompatible shapes + with pytest.raises(ValueError): + auc([0.0, 0.5, 1.0], [0.1, 0.2]) + + # Too few x values + with pytest.raises(ValueError): + auc([0.0], [0.1]) + + # x is not in order + x = [2, 1, 3, 4] + y = [5, 6, 7, 8] + error_message = f"x is neither increasing nor decreasing : {np.array(x)}" + with pytest.raises(ValueError, match=re.escape(error_message)): + auc(x, y) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 1, 0, 2]), [0, 1, 2]), + (np.array([0, 1, 0, 2]), None), + (["a", "b", "a", "c"], ["a", "b", "c"]), + (["a", "b", "a", "c"], None), + ], +) +def test_multiclass_ovo_roc_auc_toydata(setup, y_true, labels): + # Tests the one-vs-one multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]] + ) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) + average_score_01 = (score_01 + score_10) / 2 + + # Consider labels 0 and 2: + score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) + score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) + average_score_02 = (score_02 + score_20) / 2 + + # Consider labels 1 and 2: + score_12 = roc_auc_score([1, 0], [0.4, 0.2]) + score_21 = roc_auc_score([0, 1], [0.3, 0.8]) + average_score_12 = (score_12 + score_21) / 2 + + # Unweighted, one-vs-one multiclass ROC AUC algorithm + ovo_unweighted_score = (average_score_01 + average_score_02 + average_score_12) / 3 + assert_almost_equal( + roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo"), + ovo_unweighted_score, + ) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + # Each term is weighted by the prevalence for the positive label. + pair_scores = [average_score_01, average_score_02, average_score_12] + prevalence = [0.75, 0.75, 0.50] + ovo_weighted_score = np.average(pair_scores, weights=prevalence) + assert_almost_equal( + roc_auc_score( + y_true, y_scores, labels=labels, multi_class="ovo", average="weighted" + ), + ovo_weighted_score, + ) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 2, 0, 2]), [0, 1, 2]), + (np.array(["a", "d", "a", "d"]), ["a", "b", "d"]), + ], +) +def test_multiclass_ovo_roc_auc_toydata_binary(setup, y_true, labels): + # Tests the one-vs-one multiclass ROC AUC algorithm for binary y_true + # + # on a small example, representative of an expected use case. 
+ y_scores = np.array( + [[0.2, 0.0, 0.8], [0.6, 0.0, 0.4], [0.55, 0.0, 0.45], [0.4, 0.0, 0.6]] + ) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1, 0], [0.2, 0.6, 0.55, 0.4]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0, 1], [0.8, 0.4, 0.45, 0.6]) + ovo_score = (score_01 + score_10) / 2 + + assert_almost_equal( + roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo"), ovo_score + ) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + assert_almost_equal( + roc_auc_score( + y_true, y_scores, labels=labels, multi_class="ovo", average="weighted" + ), + ovo_score, + ) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 1, 2, 2]), None), + (["a", "b", "c", "c"], None), + ([0, 1, 2, 2], [0, 1, 2]), + (["a", "b", "c", "c"], ["a", "b", "c"]), + ], +) +def test_multiclass_ovr_roc_auc_toydata(setup, y_true, labels): + # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_scores = np.array( + [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]] + ) + # Compute the expected result by individually computing the 'one-vs-rest' + # ROC AUC scores for classes 0, 1, and 2. + out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) + result_unweighted = (out_0 + out_1 + out_2) / 3.0 + + assert_almost_equal( + roc_auc_score(y_true, y_scores, multi_class="ovr", labels=labels), + result_unweighted, + ) + + # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm + # on the same input (Provost & Domingos, 2000) + result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal( + roc_auc_score( + y_true, y_scores, multi_class="ovr", labels=labels, average="weighted" + ), + result_weighted, + ) + + +@pytest.mark.parametrize( + "msg, y_true, labels", + [ + ("Parameter 'labels' must be unique", np.array([0, 1, 2, 2]), [0, 2, 0]), + ( + "Parameter 'labels' must be unique", + np.array(["a", "b", "c", "c"]), + ["a", "a", "b"], + ), + ( + "Number of classes in y_true not equal to the number of columns " + "in 'y_score'", + np.array([0, 2, 0, 2]), + None, + ), + ( + "Parameter 'labels' must be ordered", + np.array(["a", "b", "c", "c"]), + ["a", "c", "b"], + ), + ( + "Number of given labels, 2, not equal to the number of columns in " + "'y_score', 3", + np.array([0, 1, 2, 2]), + [0, 1], + ), + ( + "Number of given labels, 2, not equal to the number of columns in " + "'y_score', 3", + np.array(["a", "b", "c", "c"]), + ["a", "b"], + ), + ( + "Number of given labels, 4, not equal to the number of columns in " + "'y_score', 3", + np.array([0, 1, 2, 2]), + [0, 1, 2, 3], + ), + ( + "Number of given labels, 4, not equal to the number of columns in " + "'y_score', 3", + np.array(["a", "b", "c", "c"]), + ["a", "b", "c", "d"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array(["a", "b", "c", "e"]), + ["a", "b", "c"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array(["a", "b", "c", "d"]), + ["a", "b", "c"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array([0, 1, 2, 3]), + [0, 1, 2], + ), + ], +) +@pytest.mark.parametrize("multi_class", ["ovo", "ovr"]) +def test_roc_auc_score_multiclass_labels_error(setup, msg, y_true, labels, multi_class): + y_scores = np.array( 
+ [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]] + ) + + with pytest.raises(ValueError, match=msg): + roc_auc_score(y_true, y_scores, labels=labels, multi_class=multi_class) + + +@pytest.mark.parametrize( + "msg, kwargs", + [ + ( + ( + r"average must be one of \('macro', 'weighted'\) for " + r"multiclass problems" + ), + {"average": "samples", "multi_class": "ovo"}, + ), + ( + ( + r"average must be one of \('macro', 'weighted'\) for " + r"multiclass problems" + ), + {"average": "micro", "multi_class": "ovr"}, + ), + ( + ( + r"sample_weight is not supported for multiclass one-vs-one " + r"ROC AUC, 'sample_weight' must be None in this case" + ), + {"multi_class": "ovo", "sample_weight": []}, + ), + ( + ( + r"Partial AUC computation not available in multiclass setting, " + r"'max_fpr' must be set to `None`, received `max_fpr=0.5` " + r"instead" + ), + {"multi_class": "ovo", "max_fpr": 0.5}, + ), + ( + ( + r"multi_class='ovp' is not supported for multiclass ROC AUC, " + r"multi_class must be in \('ovo', 'ovr'\)" + ), + {"multi_class": "ovp"}, + ), + (r"multi_class must be in \('ovo', 'ovr'\)", {}), + ], +) +def test_roc_auc_score_multiclass_error(setup, msg, kwargs): + # Test that roc_auc_score function returns an error when trying + # to compute multiclass AUC for parameters where an output + # is not defined. + rng = check_random_state(404) + y_score = rng.rand(20, 3) + y_prob = softmax(y_score) + y_true = rng.randint(0, 3, size=20) + with pytest.raises(ValueError, match=msg): + roc_auc_score(y_true, y_prob, **kwargs) + + +def test_auc_score_non_binary_class(setup): + # Test that roc_auc_score function returns an error when trying + # to compute AUC for non-binary class values. + rng = check_random_state(404) + y_pred = rng.rand(10) + # y_true contains only one class value + y_true = np.zeros(10, dtype="int") + err_msg = "ROC AUC score is not defined" + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + y_true = np.ones(10, dtype="int") + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + y_true = np.full(10, -1, dtype="int") + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + + with warnings.catch_warnings(record=True): + rng = check_random_state(404) + y_pred = rng.rand(10) + # y_true contains only one class value + y_true = np.zeros(10, dtype="int") + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + y_true = np.ones(10, dtype="int") + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + y_true = np.full(10, -1, dtype="int") + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + + +def test_binary_clf_curve_multiclass_error(setup): + rng = check_random_state(404) + y_true = rng.randint(0, 3, size=10) + y_pred = rng.rand(10) + msg = "multiclass format is not supported" + + with pytest.raises(ValueError, match=msg): + roc_curve(y_true, y_pred) + + +def test_dataframe_roc_curve_auc(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame({"a": rs.randint(0, 10, (10,)), "b": rs.rand(10)}) + + df = md.DataFrame(raw) + y = df["a"].to_tensor().astype("int") + pred = df["b"].to_tensor().astype("float") + fpr, tpr, thresholds = roc_curve(y, pred, pos_label=2) + m = auc(fpr, tpr) + + sk_fpr, sk_tpr, sk_threshod = sklearn_roc_curve( + raw["a"].to_numpy().astype("int"), + raw["b"].to_numpy().astype("float"), + pos_label=2, + ) + expect_m = sklearn_auc(sk_fpr, sk_tpr) + assert pytest.approx(m.fetch()) 
== expect_m + + +def test_dataframe_accuracy_score(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame({"a": rs.randint(0, 10, (10,)), "b": rs.randint(0, 10, (10,))}) + + df = md.DataFrame(raw) + y = df["a"].to_tensor().astype("int") + pred = df["b"].astype("int") + + score = accuracy_score(y, pred) + expect = sklearn_accuracy_score( + raw["a"].to_numpy().astype("int"), raw["b"].to_numpy().astype("int") + ) + assert pytest.approx(score.fetch()) == expect + + +def test_partial_roc_auc_score(setup): + # Check `roc_auc_score` for max_fpr != `None` + y_true = np.array([0, 0, 1, 1]) + assert roc_auc_score(y_true, y_true, max_fpr=1) == 1 + assert roc_auc_score(y_true, y_true, max_fpr=0.001) == 1 + with pytest.raises(ValueError): + assert roc_auc_score(y_true, y_true, max_fpr=-0.1) + with pytest.raises(ValueError): + assert roc_auc_score(y_true, y_true, max_fpr=1.1) + with pytest.raises(ValueError): + assert roc_auc_score(y_true, y_true, max_fpr=0) + + y_scores = np.array([0.1, 0, 0.1, 0.01]) + roc_auc_with_max_fpr_one = roc_auc_score(y_true, y_scores, max_fpr=1) + unconstrained_roc_auc = roc_auc_score(y_true, y_scores) + assert roc_auc_with_max_fpr_one == unconstrained_roc_auc + assert roc_auc_score(y_true, y_scores, max_fpr=0.3) == 0.5 + + y_true, y_pred, _ = make_prediction(binary=True) + for max_fpr in np.linspace(1e-4, 1, 5): + assert_almost_equal( + roc_auc_score(y_true, y_pred, max_fpr=max_fpr), + _partial_roc_auc_score(y_true, y_pred, max_fpr), + ) diff --git a/python/xorbits/_mars/learn/metrics/tests/test_regression.py b/python/xorbits/_mars/learn/metrics/tests/test_regression.py new file mode 100644 index 000000000..28183873f --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/tests/test_regression.py @@ -0,0 +1,138 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import product + +import numpy as np +import pytest +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +from .... import tensor as mt +from .. 
import r2_score +from .._regresssion import _check_reg_targets + + +def test__check_reg_targets(setup): + # All of length 3 + EXAMPLES = [ + ("continuous", [1, 2, 3], 1), + ("continuous", [[1], [2], [3]], 1), + ("continuous-multioutput", [[1, 1], [2, 2], [3, 1]], 2), + ("continuous-multioutput", [[5, 1], [4, 2], [3, 1]], 2), + ("continuous-multioutput", [[1, 3, 4], [2, 2, 2], [3, 1, 1]], 3), + ] + + for (type1, y1, n_out1), (type2, y2, n_out2) in product(EXAMPLES, repeat=2): + if type1 == type2 and n_out1 == n_out2: + y_type, y_check1, y_check2, multioutput = _check_reg_targets(y1, y2, None) + assert type1 == y_type + if type1 == "continuous": + assert_array_equal(y_check1, np.reshape(y1, (-1, 1))) + assert_array_equal(y_check2, np.reshape(y2, (-1, 1))) + else: + assert_array_equal(y_check1, y1) + assert_array_equal(y_check2, y2) + else: + with pytest.raises(ValueError): + _check_reg_targets(y1, y2, None) + + +def test__check_reg_targets_exception(setup): + invalid_multioutput = "this_value_is_not_valid" + expected_message = ( + "Allowed 'multioutput' string values are.+" + "You provided multioutput={!r}".format(invalid_multioutput) + ) + with pytest.raises(ValueError, match=expected_message): + _check_reg_targets([1, 2, 3], [[1], [2], [3]], invalid_multioutput) + + with pytest.raises(ValueError): + _check_reg_targets([1, 2], [[1], [2]], multioutput=[0.4, 0.6]) + with pytest.raises(ValueError): + _check_reg_targets([[1, 2], [3, 4]], [[1, 2], [3, 4]], multioutput=[0.4]) + + +def test_r2_score(setup, n_samples=50): + y_true = mt.arange(n_samples) + y_pred = y_true + 1 + + assert_almost_equal(r2_score(y_true, y_pred).fetch(), 0.995, 2) + + y_true = mt.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) + y_pred = mt.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]]) + + error = r2_score(y_true, y_pred, multioutput="variance_weighted") + assert_almost_equal(error.fetch(), 1.0 - 5.0 / 2) + error = r2_score(y_true, y_pred, multioutput="uniform_average") + assert_almost_equal(error.fetch(), -0.875) + + assert_almost_equal(r2_score([0.0, 1], [0.0, 1]).fetch(), 1.00, 2) + assert_almost_equal( + r2_score([0.0, 1], [0.0, 1], sample_weight=[0.5, 0.5]).fetch(), 1.00, 2 + ) + + y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] + y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] + + r = r2_score(y_true, y_pred, multioutput="raw_values") + + assert_array_almost_equal(r, [0.95, 0.93], decimal=2) + + # mean_absolute_error and mean_squared_error are equal because + # it is a binary problem. + y_true = [[0, 0]] * 4 + y_pred = [[1, 1]] * 4 + r = r2_score(y_true, y_pred, multioutput="raw_values") + assert_array_almost_equal(r, [0.0, 0.0], decimal=2) + + r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="raw_values") + assert_array_almost_equal(r, [0, -3.5], decimal=2) + assert ( + np.mean(r.fetch()) + == r2_score( + [[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="uniform_average" + ).fetch() + ) + + # Checking for the condition in which both numerator and denominator is + # zero. 
+ y_true = [[1, 3], [-1, 2]] + y_pred = [[1, 4], [-1, 1]] + r2 = r2_score(y_true, y_pred, multioutput="raw_values") + assert_array_almost_equal(r2, [1.0, -3.0], decimal=2) + assert ( + np.mean(r2.fetch()) + == r2_score(y_true, y_pred, multioutput="uniform_average").fetch() + ) + + y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] + y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] + + rw = r2_score(y_true, y_pred, multioutput=[0.4, 0.6]) + + assert_almost_equal(rw.fetch(), 0.94, decimal=2) + + y_true = [0] + y_pred = [1] + warning_msg = "not well-defined with less than two samples." + + # Trigger the warning + with pytest.warns(UndefinedMetricWarning, match=warning_msg): + score = r2_score(y_true, y_pred) + assert np.isnan(score) diff --git a/python/xorbits/_mars/learn/metrics/tests/test_scorer.py b/python/xorbits/_mars/learn/metrics/tests/test_scorer.py new file mode 100644 index 000000000..3f30691de --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/tests/test_scorer.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from sklearn.metrics import r2_score + +from .. import get_scorer + + +def test_get_scorer(): + with pytest.raises(ValueError): + get_scorer("unknown") + + assert get_scorer("r2") is not None + assert get_scorer(r2_score) is not None diff --git a/python/xorbits/_mars/learn/model_selection/__init__.py b/python/xorbits/_mars/learn/model_selection/__init__.py new file mode 100644 index 000000000..2df377453 --- /dev/null +++ b/python/xorbits/_mars/learn/model_selection/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._split import KFold, train_test_split diff --git a/python/xorbits/_mars/learn/model_selection/_split.py b/python/xorbits/_mars/learn/model_selection/_split.py new file mode 100644 index 000000000..faf31984c --- /dev/null +++ b/python/xorbits/_mars/learn/model_selection/_split.py @@ -0,0 +1,459 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +from abc import ABCMeta, abstractmethod +from itertools import chain +from math import ceil, floor + +import numpy as np + +from ... import tensor as mt +from ...core import ExecutableTuple +from ...tensor.utils import check_random_state +from ..utils import shuffle as shuffle_arrays +from ..utils.validation import _num_samples, indexable + + +def train_test_split(*arrays, **options): + """Split arrays or matrices into random train and test subsets + + Parameters + ---------- + *arrays : sequence of indexables with same length / shape[0] + Allowed inputs are lists, numpy arrays, scipy-sparse + matrices or pandas dataframes. + + test_size : float, int or None, optional (default=None) + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.25. + + train_size : float, int, or None, (default=None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + shuffle : boolean, optional (default=True) + Whether or not to shuffle the data before splitting. If shuffle=False + then stratify must be None. + + stratify : array-like or None (default=None) + If not None, data is split in a stratified fashion, using this as + the class labels. + + Returns + ------- + splitting : list, length=2 * len(arrays) + List containing train-test split of inputs. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.model_selection import train_test_split + >>> X, y = mt.arange(10).reshape((5, 2)), range(5) + >>> X.execute() + array([[0, 1], + [2, 3], + [4, 5], + [6, 7], + [8, 9]]) + >>> list(y) + [0, 1, 2, 3, 4] + + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.33, random_state=42) + ... 
+ >>> X_train.execute() + array([[8, 9], + [0, 1], + [4, 5]]) + >>> y_train.execute() + array([4, 0, 2]) + >>> X_test.execute() + array([[2, 3], + [6, 7]]) + >>> y_test.execute() + array([1, 3]) + + >>> train_test_split(y, shuffle=False) + [array([0, 1, 2]), array([3, 4])] + + """ + + n_arrays = len(arrays) + if n_arrays == 0: + raise ValueError("At least one array required as input") + test_size = options.pop("test_size", None) + train_size = options.pop("train_size", None) + random_state = options.pop("random_state", None) + stratify = options.pop("stratify", None) + shuffle = options.pop("shuffle", True) + session = options.pop("session", None) + run_kwargs = options.pop("run_kwargs", None) + + if options: + raise TypeError(f"Invalid parameters passed: {options}") + + arrays = indexable(*arrays, session=session, run_kwargs=run_kwargs) + + n_samples = _num_samples(arrays[0]) + n_train, n_test = _validate_shuffle_split( + n_samples, test_size, train_size, default_test_size=0.25 + ) + + if shuffle is False: + if stratify is not None: # pragma: no cover + raise ValueError( + "Stratified train/test split is not implemented for shuffle=False" + ) + + iterables = ((a[:n_train], a[n_train : n_train + n_test]) for a in arrays) + else: + if stratify is not None: # pragma: no cover + raise NotImplementedError("stratify is not implemented yet") + else: + shuffled_arrays = shuffle_arrays(*arrays, random_state=random_state) + if not isinstance(shuffled_arrays, tuple): + shuffled_arrays = (shuffled_arrays,) + iterables = ( + (a[:n_train], a[n_train : n_train + n_test]) for a in shuffled_arrays + ) + + return list( + ExecutableTuple(chain.from_iterable(iterables)).execute( + session=session, **(run_kwargs or dict()) + ) + ) + + +def _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=None): + """ + Validation helper to check if the test/test sizes are meaningful wrt to the + size of the data (n_samples) + """ + if test_size is None and train_size is None: + test_size = default_test_size + + test_size_type = np.asarray(test_size).dtype.kind + train_size_type = np.asarray(train_size).dtype.kind + + if ( + test_size_type == "i" + and (test_size >= n_samples or test_size <= 0) + or test_size_type == "f" + and (test_size <= 0 or test_size >= 1) + ): + raise ValueError( + f"test_size={test_size} should be either positive and smaller" + f" than the number of samples {n_samples} or a float in the " + "(0, 1) range" + ) + + if ( + train_size_type == "i" + and (train_size >= n_samples or train_size <= 0) + or train_size_type == "f" + and (train_size <= 0 or train_size >= 1) + ): + raise ValueError( + f"train_size={train_size} should be either positive and smaller" + f" than the number of samples {n_samples} or a float in the " + "(0, 1) range" + ) + + if train_size is not None and train_size_type not in ("i", "f"): # pragma: no cover + raise ValueError(f"Invalid value for train_size: {train_size}") + if test_size is not None and test_size_type not in ("i", "f"): + raise ValueError(f"Invalid value for test_size: {test_size}") + + if train_size_type == "f" and test_size_type == "f" and train_size + test_size > 1: + raise ValueError( + f"The sum of test_size and train_size = {train_size + test_size}, " + "should be in the (0, 1) range. Reduce test_size and/or train_size." 
+ ) + + if test_size_type == "f": + n_test = ceil(test_size * n_samples) + elif test_size_type == "i": + n_test = float(test_size) + + if train_size_type == "f": + n_train = floor(train_size * n_samples) + elif train_size_type == "i": # pragma: no cover + n_train = float(train_size) + + if train_size is None: + n_train = n_samples - n_test + elif test_size is None: + n_test = n_samples - n_train + + if n_train + n_test > n_samples: # pragma: no cover + raise ValueError( + f"The sum of train_size and test_size = {n_train + n_test}, " + f"should be smaller than the number of samples {n_samples}. " + "Reduce test_size and/or train_size." + ) + + n_train, n_test = int(n_train), int(n_test) + + if n_train == 0: # pragma: no cover + raise ValueError( + f"With n_samples={n_samples}, test_size={test_size} and " + f"train_size={train_size}, the resulting train set will " + f"be empty. Adjust any of the aforementioned parameters." + ) + + return n_train, n_test + + +class BaseCrossValidator(metaclass=ABCMeta): + """Base class for all cross-validators + + Implementations must define `_iter_test_masks` or `_iter_test_indices`. + """ + + def split(self, X, y=None, groups=None): # pragma: no cover + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where n_samples is the number of samples + and n_features is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + indices = mt.arange(_num_samples(X)) + for test_index in self._iter_test_masks(X, y, groups): + train_index = indices[mt.logical_not(test_index)] + test_index = indices[test_index] + yield train_index, test_index + + # Since subclasses must implement either _iter_test_masks or + # _iter_test_indices, neither can be abstract. + def _iter_test_masks(self, X=None, y=None, groups=None): # pragma: no cover + """Generates boolean masks corresponding to test sets. + + By default, delegates to _iter_test_indices(X, y, groups) + """ + for test_index in self._iter_test_indices(X, y, groups): + test_mask = mt.zeros(_num_samples(X), dtype=bool) + test_mask[test_index] = True + yield test_mask + + def _iter_test_indices(self, X=None, y=None, groups=None): # pragma: no cover + """Generates integer indices corresponding to test sets.""" + raise NotImplementedError + + @abstractmethod + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator""" + + +class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): + """Base class for KFold, GroupKFold, and StratifiedKFold""" + + @abstractmethod + def __init__(self, n_splits, *, shuffle, random_state): + if not isinstance(n_splits, numbers.Integral): + raise ValueError( + "The number of folds must be of Integral type. " + "%s of type %s was passed." 
% (n_splits, type(n_splits)) + ) + n_splits = int(n_splits) + + if n_splits <= 1: + raise ValueError( + "k-fold cross-validation requires at least one" + " train/test split by setting n_splits=2 or more," + " got n_splits={0}.".format(n_splits) + ) + + if not isinstance(shuffle, bool): + raise TypeError("shuffle must be True or False; got {0}".format(shuffle)) + + if not shuffle and random_state is not None: # None is the default + raise ValueError( + "Setting a random_state has no effect since shuffle is " + "False. You should leave " + "random_state to its default (None), or set shuffle=True.", + ) + + self.n_splits = n_splits + self.shuffle = shuffle + self.random_state = random_state + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + +class KFold(_BaseKFold): + """K-Folds cross-validator + + Provides train/test indices to split data in train/test sets. Split + dataset into k consecutive folds (without shuffling by default). + + Each fold is then used once as a validation while the k - 1 remaining + folds form the training set. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle the data before splitting into batches. + Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.model_selection import KFold + >>> X = mt.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = mt.array([1, 2, 3, 4]) + >>> kf = KFold(n_splits=2) + >>> kf.get_n_splits(X) + 2 + >>> print(kf) + KFold(n_splits=2, random_state=None, shuffle=False) + >>> for train_index, test_index in kf.split(X): + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [2 3] TEST: [0 1] + TRAIN: [0 1] TEST: [2 3] + + Notes + ----- + The first ``n_samples % n_splits`` folds have size + ``n_samples // n_splits + 1``, other folds have size + ``n_samples // n_splits``, where ``n_samples`` is the number of samples. + + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + StratifiedKFold : Takes group information into account to avoid building + folds with imbalanced class distributions (for binary or multiclass + classification tasks). + + GroupKFold : K-fold iterator variant with non-overlapping groups. + + RepeatedKFold : Repeats K-Fold n times. 
+ """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where n_samples is the number of samples + and n_features is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + n_samples = _num_samples(X) + if self.n_splits > n_samples: + raise ValueError( + ( + "Cannot have number of splits n_splits={0} greater" + " than the number of samples: n_samples={1}." + ).format(self.n_splits, n_samples) + ) + + indices = mt.arange(n_samples) + if self.shuffle: + check_random_state(self.random_state).shuffle(indices) + + n_splits = self.n_splits + fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int) + fold_sizes[: n_samples % n_splits] += 1 + current = 0 + for fold_size in fold_sizes: + start, stop = current, current + fold_size + train_index = mt.concatenate([indices[:start], indices[stop:]]) + yield train_index, indices[start:stop] + current = stop diff --git a/python/xorbits/_mars/learn/model_selection/tests/__init__.py b/python/xorbits/_mars/learn/model_selection/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/model_selection/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/model_selection/tests/test_split.py b/python/xorbits/_mars/learn/model_selection/tests/test_split.py new file mode 100644 index 000000000..ccdf99370 --- /dev/null +++ b/python/xorbits/_mars/learn/model_selection/tests/test_split.py @@ -0,0 +1,317 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pandas as pd +import pytest + +try: + import scipy.sparse as sps +except ImportError: # pragma: no cover + sps = None + +from .... import dataframe as md +from .... 
import tensor as mt +from ....dataframe.core import DATAFRAME_TYPE +from ....lib.sparse import SparseNDArray +from ...utils.validation import _num_samples +from .. import KFold, train_test_split + + +def test_train_test_split_errors(setup): + pytest.raises(ValueError, train_test_split) + + pytest.raises(ValueError, train_test_split, range(3), train_size=1.1) + + pytest.raises(ValueError, train_test_split, range(3), test_size=0.6, train_size=0.6) + pytest.raises( + ValueError, + train_test_split, + range(3), + test_size=np.float32(0.6), + train_size=np.float32(0.6), + ) + pytest.raises(ValueError, train_test_split, range(3), test_size="wrong_type") + pytest.raises(ValueError, train_test_split, range(3), test_size=2, train_size=4) + pytest.raises(TypeError, train_test_split, range(3), some_argument=1.1) + pytest.raises(ValueError, train_test_split, range(3), range(42)) + pytest.raises(ValueError, train_test_split, range(10), shuffle=False, stratify=True) + + with pytest.raises( + ValueError, + match=r"train_size=11 should be either positive and " + r"smaller than the number of samples 10 or a " + r"float in the \(0, 1\) range", + ): + train_test_split(range(10), train_size=11, test_size=1) + + +def test_train_test_split_invalid_sizes1(setup): + for train_size, test_size in [ + (1.2, 0.8), + (1.0, 0.8), + (0.0, 0.8), + (-0.2, 0.8), + (0.8, 1.2), + (0.8, 1.0), + (0.8, 0.0), + (0.8, -0.2), + ]: + with pytest.raises(ValueError, match=r"should be .* in the \(0, 1\) range"): + train_test_split(range(10), train_size=train_size, test_size=test_size) + + +def test_train_test_split_invalid_sizes2(setup): + for train_size, test_size in [ + (-10, 0.8), + (0, 0.8), + (11, 0.8), + (0.8, -10), + (0.8, 0), + (0.8, 11), + ]: + with pytest.raises(ValueError, match=r"should be .* in the \(0, 1\) range"): + train_test_split(range(10), train_size=train_size, test_size=test_size) + + +def test_train_test_split(setup): + X = np.arange(100).reshape((10, 10)) + y = np.arange(10) + + # simple test + split = train_test_split(X, y, test_size=None, train_size=0.5) + X_train, X_test, y_train, y_test = split + assert len(y_test) == len(y_train) + # test correspondence of X and y + np.testing.assert_array_equal(X_train[:, 0], y_train * 10) + np.testing.assert_array_equal(X_test[:, 0], y_test * 10) + + # allow nd-arrays + X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) + y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) + split = train_test_split(X_4d, y_3d) + assert split[0].shape == (7, 5, 3, 2) + assert split[1].shape == (3, 5, 3, 2) + assert split[2].shape == (7, 7, 11) + assert split[3].shape == (3, 7, 11) + + # test unshuffled split + y = np.arange(10) + for test_size in [2, 0.2]: + train, test = train_test_split(y, shuffle=False, test_size=test_size) + np.testing.assert_array_equal(test, [8, 9]) + np.testing.assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7]) + + +def test_train_test_split_dataframe(setup): + X = np.ones(10) + types = [pd.DataFrame, md.DataFrame] + for InputFeatureType in types: + # X dataframe + X_df = InputFeatureType(X) + X_train, X_test = train_test_split(X_df) + assert isinstance(X_train, DATAFRAME_TYPE) + assert isinstance(X_test, DATAFRAME_TYPE) + + +@pytest.mark.skipif(sps is None, reason="scipy not installed") +def test_train_test_split_sparse(setup): + # check that train_test_split converts scipy sparse matrices + # to csr, as stated in the documentation + X = np.arange(100).reshape((10, 10)) + sparse_types = [sps.csr_matrix, sps.csc_matrix, sps.coo_matrix] + for InputFeatureType in 
sparse_types:
+        X_s = InputFeatureType(X)
+        for x in (X_s, mt.tensor(X_s, chunk_size=(2, 5))):
+            X_train, X_test = train_test_split(x)
+            assert isinstance(X_train.fetch(), SparseNDArray)
+            assert isinstance(X_test.fetch(), SparseNDArray)
+
+
+def test_train_test_split_list_input(setup):
+    # Check that when y is a list / list of string labels, it works.
+    X = np.ones(7)
+    y1 = ["1"] * 4 + ["0"] * 3
+    y2 = np.hstack((np.ones(4), np.zeros(3)))
+    y3 = y2.tolist()
+
+    for stratify in (False,):
+        X_train1, X_test1, y_train1, y_test1 = train_test_split(
+            X, y1, stratify=y1 if stratify else None, random_state=0
+        )
+        X_train2, X_test2, y_train2, y_test2 = train_test_split(
+            X, y2, stratify=y2 if stratify else None, random_state=0
+        )
+        X_train3, X_test3, y_train3, y_test3 = train_test_split(
+            X, y3, stratify=y3 if stratify else None, random_state=0
+        )
+
+        np.testing.assert_equal(X_train1, X_train2)
+        np.testing.assert_equal(y_train2, y_train3)
+        np.testing.assert_equal(X_test1, X_test3)
+        np.testing.assert_equal(y_test3, y_test2)
+
+
+def test_mixed_input_type_train_test_split(setup):
+    rs = np.random.RandomState(0)
+    df_raw = pd.DataFrame(rs.rand(10, 4))
+    df = md.DataFrame(df_raw, chunk_size=5)
+    X, y = df.iloc[:, :-1], df.iloc[:, -1]
+
+    for x_to_tensor, y_to_tensor in itertools.product(range(1), range(1)):
+        x = X
+        if x_to_tensor:
+            x = mt.tensor(x)
+        yy = y
+        if y_to_tensor:
+            yy = mt.tensor(yy)
+
+        x_train, x_test, y_train, y_test = train_test_split(
+            x, y, random_state=0, run_kwargs={"extra_config": {"check_nsplits": False}}
+        )
+        assert isinstance(x_train, type(x))
+        assert isinstance(x_test, type(x))
+        assert isinstance(y_train, type(yy))
+        assert isinstance(y_test, type(yy))
+
+
+def test_kfold_valueerrors():
+    X1 = np.array([[1, 2], [3, 4], [5, 6]])
+    # Check that errors are raised if there are not enough samples
+    with pytest.raises(ValueError):
+        next(KFold(4).split(X1))
+
+    # Error when number of folds is <= 1
+    with pytest.raises(ValueError):
+        KFold(0)
+    with pytest.raises(ValueError):
+        KFold(1)
+
+    # When n_splits is not integer:
+    with pytest.raises(ValueError):
+        KFold(1.5)
+    with pytest.raises(ValueError):
+        KFold(2.0)
+
+    # When shuffle is not a bool:
+    with pytest.raises(TypeError):
+        KFold(n_splits=4, shuffle=None)
+
+
+def check_valid_split(train, test, n_samples=None):
+    # Use python sets to get more informative assertion failure messages
+    train = train.execute().to_numpy()
+    test = test.execute().to_numpy()
+    train, test = set(train), set(test)
+
+    # Train and test split should not overlap
+    assert train.intersection(test) == set()
+
+    if n_samples is not None:
+        # Check that the union of train and test split covers all the indices
+        assert train.union(test) == set(range(n_samples))
+
+
+def check_cv_coverage(cv, X, y, groups, expected_n_splits):
+    n_samples = _num_samples(X)
+    # Check that all the samples appear at least once in a test fold
+    assert cv.get_n_splits(X, y, groups) == expected_n_splits
+
+    collected_test_samples = set()
+    iterations = 0
+    for train, test in cv.split(X, y, groups):
+        check_valid_split(train, test, n_samples=n_samples)
+        iterations += 1
+        collected_test_samples.update(test.execute().to_numpy())
+
+    # Check that the accumulated test samples cover the whole dataset
+    assert iterations == expected_n_splits
+    if n_samples is not None:
+        assert collected_test_samples == set(range(n_samples))
+
+
+def test_kfold_indices(setup):
+    # Check all indices are returned in the test folds
+    X1 = np.ones(18)
+    kf = KFold(3)
+    check_cv_coverage(kf, X1, 
y=None, groups=None, expected_n_splits=3) + + # Check all indices are returned in the test folds even when equal-sized + # folds are not possible + X2 = np.ones(17) + kf = KFold(3) + check_cv_coverage(kf, X2, y=None, groups=None, expected_n_splits=3) + + # Check if get_n_splits returns the number of folds + assert 5 == KFold(5).get_n_splits(X2) + + +def test_kfold_no_shuffle(setup): + # Manually check that KFold preserves the data ordering on toy datasets + X2 = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + + splits = KFold(2).split(X2[:-1]) + train, test = next(splits) + np.testing.assert_array_equal(test.execute().fetch(), [0, 1]) + np.testing.assert_array_equal(train.execute().fetch(), [2, 3]) + + train, test = next(splits) + np.testing.assert_array_equal(test.execute().fetch(), [2, 3]) + np.testing.assert_array_equal(train.execute().fetch(), [0, 1]) + + splits = KFold(2).split(X2) + train, test = next(splits) + np.testing.assert_array_equal(test.execute().fetch(), [0, 1, 2]) + np.testing.assert_array_equal(train.execute().fetch(), [3, 4]) + + train, test = next(splits) + np.testing.assert_array_equal(test.execute().fetch(), [3, 4]) + np.testing.assert_array_equal(train.execute().fetch(), [0, 1, 2]) + + +def test_kfold_balance(setup): + # Check that KFold returns folds with balanced sizes + for i in range(11, 17): + kf = KFold(5).split(X=np.ones(i)) + sizes = [len(test) for _, test in kf] + + assert (np.max(sizes) - np.min(sizes)) <= 1 + assert np.sum(sizes) == i + + +def test_shuffle_kfold(setup): + # Check the indices are shuffled properly + kf = KFold(3) + kf2 = KFold(3, shuffle=True, random_state=0) + kf3 = KFold(3, shuffle=True, random_state=1) + + X = mt.ones(300) + + all_folds = np.zeros(300) + for (tr1, te1), (tr2, te2), (tr3, te3) in zip( + kf.split(X), kf2.split(X), kf3.split(X) + ): + for tr_a, tr_b in itertools.combinations((tr1, tr2, tr3), 2): + # Assert that there is no complete overlap + tr_a = tr_a.execute().fetch() + tr_b = tr_b.execute().fetch() + assert len(np.intersect1d(tr_a, tr_b)) != len(tr1) + + # Set all test indices in successive iterations of kf2 to 1 + all_folds[te2.execute().fetch()] = 1 + + # Check that all indices are returned in the different test folds + assert sum(all_folds) == 300 diff --git a/python/xorbits/_mars/learn/neighbors/__init__.py b/python/xorbits/_mars/learn/neighbors/__init__.py new file mode 100644 index 000000000..860298dbb --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
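For orientation, a minimal usage sketch of the `NearestNeighbors` estimator that this package exposes; it mirrors the doctest in `KNeighborsMixin.kneighbors` further below. It assumes the vendored package is importable as `xorbits._mars` and that a default session is available (the unit tests obtain one through the `setup` fixture):

    import numpy as np
    from xorbits._mars.learn.neighbors import NearestNeighbors

    samples = np.array([[0.0, 0.0, 0.0], [0.0, 0.5, 0.0], [1.0, 1.0, 0.5]])
    nn = NearestNeighbors(n_neighbors=1, algorithm="brute")
    nn.fit(samples)
    # kneighbors executes the graph and returns (distances, indices) tensors
    dist, ind = nn.kneighbors([[1.0, 1.0, 1.0]])
    print(dist.fetch(), ind.fetch())  # expected: [[0.5]] [[2]]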
+ +try: + from .unsupervised import NearestNeighbors +except ImportError: # pragma: no cover + pass + + +def register_op(): + from ._ball_tree import BallTree, BallTreeQuery + from ._kd_tree import KDTree, KDTreeQuery + + del BallTree, BallTreeQuery, KDTree, KDTreeQuery diff --git a/python/xorbits/_mars/learn/neighbors/_ball_tree.py b/python/xorbits/_mars/learn/neighbors/_ball_tree.py new file mode 100644 index 000000000..48b850e99 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/_ball_tree.py @@ -0,0 +1,59 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from sklearn.neighbors import BallTree as SklearnBallTree +except ImportError: # pragma: no cover + SklearnBallTree = None + +from ... import opcodes as OperandDef +from ...utils import require_not_none +from .tree import TreeBase, TreeObject, TreeQueryBase + + +class BallTree(TreeObject): + pass + + +@require_not_none(SklearnBallTree) +class _BallTree(TreeBase): + _op_type_ = OperandDef.BALL_TREE_TRAIN + _tree_type = SklearnBallTree + + def __call__(self, a): + result = super().__call__(a) + return BallTree(result.data) + + +@require_not_none(SklearnBallTree) +class BallTreeQuery(TreeQueryBase): + _op_type_ = OperandDef.BALL_TREE_QUERY + _tree_type = SklearnBallTree + + +@require_not_none(SklearnBallTree) +def ball_tree_query(tree, data, n_neighbors, return_distance): + op = BallTreeQuery( + tree=tree, n_neighbors=n_neighbors, return_distance=return_distance + ) + ret = op(data) + if not return_distance: + return ret[0] + return ret + + +@require_not_none(SklearnBallTree) +def create_ball_tree(X, leaf_size, metric=None, **metric_params): + op = _BallTree(leaf_size=leaf_size, metric=metric, **metric_params) + return op(X) diff --git a/python/xorbits/_mars/learn/neighbors/_faiss.py b/python/xorbits/_mars/learn/neighbors/_faiss.py new file mode 100644 index 000000000..167c13c5e --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/_faiss.py @@ -0,0 +1,806 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +import os +import tempfile +from enum import Enum + +import numpy as np + +try: + import faiss +except ImportError: # pragma: no cover + faiss = None + +from ... 
import opcodes as OperandDef +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + BoolField, + Int8Field, + Int32Field, + Int64Field, + KeyField, + StringField, +) +from ...tensor import tensor as astensor +from ...tensor.array_utils import as_same_device, device +from ...tensor.core import TensorOrder +from ...tensor.random import RandomState +from ...tensor.utils import check_random_state, gen_random_seeds +from ...utils import has_unknown_shape, require_not_none +from ..operands import LearnOperand, LearnOperandMixin, OutputType + + +class MemoryRequirementGrade(Enum): + minimum = 0 + low = 1 + high = 2 + maximum = 3 + + +if faiss is not None: + METRIC_TO_FAISS_METRIC_TYPE = { + "l2": faiss.METRIC_L2, + "euclidean": faiss.METRIC_L2, + "innerproduct": faiss.METRIC_INNER_PRODUCT, + "cosine": faiss.METRIC_INNER_PRODUCT, + } +else: # pragma: no cover + METRIC_TO_FAISS_METRIC_TYPE = {} + + +@require_not_none(faiss) +class FaissBuildIndex(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.FAISS_BUILD_INDEX + + _input = KeyField("input") + _metric = StringField("metric") + _faiss_index = StringField("faiss_index") + _n_sample = Int64Field("n_sample") + _seed = Int32Field("seed") + _same_distribution = BoolField("same_distribution") + _accuracy = BoolField("accuracy") + _memory_require = Int8Field( + "memory_require", + on_serialize=operator.attrgetter("value"), + on_deserialize=MemoryRequirementGrade, + ) + + def __init__( + self, + metric=None, + faiss_index=None, + n_sample=None, + seed=None, + same_distribution=None, + accuracy=None, + memory_require=None, + output_types=None, + **kw, + ): + super().__init__( + _metric=metric, + _faiss_index=faiss_index, + _n_sample=n_sample, + _seed=seed, + _same_distribution=same_distribution, + _accuracy=accuracy, + _memory_require=memory_require, + _output_types=output_types, + **kw, + ) + if self.output_types is None: + self.output_types = [OutputType.object] + + @property + def input(self): + return self._input + + @property + def metric(self): + return self._metric + + @property + def faiss_metric_type(self): + return METRIC_TO_FAISS_METRIC_TYPE[self._metric] + + @property + def faiss_index(self): + return self._faiss_index + + @property + def n_sample(self): + return self._n_sample + + @property + def seed(self): + return self._seed + + @property + def same_distribution(self): + return self._same_distribution + + @property + def accuracy(self): + return self._accuracy + + @property + def memory_require(self): + return self._memory_require + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, X): + return self.new_tileable([X]) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + + in_tensor = yield from recursive_tile(astensor(op.input, np.dtype(np.float32))) + if op.faiss_index == "auto": + faiss_index, n_sample = _gen_index_string_and_sample_count( + in_tensor.shape, + op.n_sample, + op.accuracy, + op.memory_require, + gpu=op.gpu, + **op.extra_params, + ) + op._n_sample = n_sample + else: + faiss_index, n_sample = op.faiss_index, op.n_sample + + if len(in_tensor.chunks) == 1: + return cls._tile_one_chunk(op, faiss_index, n_sample) + + if in_tensor.chunk_shape[1] != 1: + # make sure axis 1 has 1 chunk + in_tensor = yield from recursive_tile( + in_tensor.rechunk({1: in_tensor.shape[1]}) + ) + return (yield from cls._tile_chunks(op, in_tensor, faiss_index, n_sample)) + + 
@classmethod
+    def _tile_one_chunk(cls, op, faiss_index, n_sample):
+        in_chunk = op.input.chunks[0]
+        chunk_op = op.copy().reset_key()
+        chunk_op._faiss_index = faiss_index
+        chunk_op._n_sample = n_sample
+        chunk = chunk_op.new_chunk([in_chunk], index=in_chunk.index)
+
+        new_op = op.copy()
+        kw = op.outputs[0].params
+        kw["chunks"] = [chunk]
+        kw["nsplits"] = ((1,),)
+        return new_op.new_tileables(op.inputs, kws=[kw])
+
+    @classmethod
+    def _tile_chunks(cls, op, in_tensor, faiss_index, n_sample):
+        """
+        If the distribution on each chunk is the same,
+        refer to:
+        https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-distribute-index-building-on-several-machines
+
+        1. train an IndexIVF* on a representative sample of the data, store it.
+        2. for each node, load the trained index, add the local data to it, store the resulting populated index
+        3. on a central node, load all the populated indexes and merge them.
+        """
+        faiss_index_ = faiss.index_factory(
+            in_tensor.shape[1], faiss_index, op.faiss_metric_type
+        )
+        # Train on sample data when two conditions are met:
+        # 1. the index type requires training, e.g. Flat does not
+        # 2. distributions of chunks are the same; if not,
+        #    train separately on each chunk's data
+        need_sample_train = not faiss_index_.is_trained and op.same_distribution
+        need_merge_index = (
+            hasattr(faiss_index_, "merge_from") if need_sample_train else False
+        )
+
+        train_chunk = None
+        if need_sample_train:
+            # sample data to train
+            rs = RandomState(op.seed)
+            sampled_index = rs.choice(
+                in_tensor.shape[0], size=n_sample, replace=False, chunk_size=n_sample
+            )
+            sample_tensor = yield from recursive_tile(in_tensor[sampled_index])
+            assert len(sample_tensor.chunks) == 1
+            sample_chunk = sample_tensor.chunks[0]
+            train_op = FaissTrainSampledIndex(faiss_index=faiss_index, metric=op.metric)
+            train_chunk = train_op.new_chunk([sample_chunk])
+        elif op.gpu:  # pragma: no cover
+            # if no training is needed and running on gpu, just merge data into one chunk
+            in_tensor = yield from recursive_tile(in_tensor.rechunk(in_tensor.shape))
+
+        # build index for each input chunk
+        build_index_chunks = []
+        for i, chunk in enumerate(in_tensor.chunks):
+            build_index_op = op.copy().reset_key()
+            build_index_op.stage = OperandStage.map
+            build_index_op._faiss_index = faiss_index
+            if train_chunk is not None:
+                build_index_chunk = build_index_op.new_chunk(
+                    [chunk, train_chunk], index=(i,)
+                )
+            else:
+                build_index_chunk = build_index_op.new_chunk([chunk], index=(i,))
+            build_index_chunks.append(build_index_chunk)
+
+        out_chunks = []
+        if need_merge_index:
+            assert op.n_sample is not None
+            # merge all indices into one; do this only when trained on sample data
+            out_chunk_op = op.copy().reset_key()
+            out_chunk_op._faiss_index = faiss_index
+            out_chunk_op.stage = OperandStage.agg
+            out_chunk = out_chunk_op.new_chunk(build_index_chunks, index=(0,))
+            out_chunks.append(out_chunk)
+        else:
+            out_chunks.extend(build_index_chunks)
+
+        new_op = op.copy()
+        return new_op.new_tileables(
+            op.inputs, chunks=out_chunks, nsplits=((len(out_chunks),),)
+        )
+
+    @classmethod
+    def _execute_one_chunk(cls, ctx, op):
+        (inp,), device_id, xp = as_same_device(
+            [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True
+        )
+
+        with device(device_id):
+            inp = inp.astype(np.float32, copy=False)
+            # create index
+            index = faiss.index_factory(
+                inp.shape[1], op.faiss_index, op.faiss_metric_type
+            )
+            # GPU
+            if device_id >= 0:  # pragma: no cover
+                index = _index_to_gpu(index, device_id)
+
+            # train 
index
+            if not index.is_trained:
+                assert op.n_sample is not None
+                sample_indices = xp.random.choice(
+                    inp.shape[0], size=op.n_sample, replace=False
+                )
+                sampled = inp[sample_indices]
+                index.train(sampled)
+
+            if op.metric == "cosine":
+                # faiss does not support cosine distances directly,
+                # data needs to be normalized before adding to the index,
+                # refer to:
+                # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance
+                faiss.normalize_L2(inp)
+            # add vectors to index
+            if device_id >= 0:  # pragma: no cover
+                # gpu
+                index.add_c(inp.shape[0], _swig_ptr_from_cupy_float32_array(inp))
+            else:
+                index.add(inp)
+
+            ctx[op.outputs[0].key] = _store_index(index, device_id)
+
+    @classmethod
+    def _execute_map(cls, ctx, op):
+        (data,), device_id, xp = as_same_device(
+            [ctx[op.inputs[0].key]], device=op.device, ret_extra=True
+        )
+        index = ctx[op.inputs[1].key] if len(op.inputs) == 2 else None
+
+        with device(device_id):
+            data = xp.ascontiguousarray(data)
+            if index is not None:
+                # fetch the trained index
+                trained_index = _load_index(index, device_id)
+            else:
+                trained_index = faiss.index_factory(
+                    data.shape[1], op.faiss_index, op.faiss_metric_type
+                )
+                if op.same_distribution:
+                    # no need to train, just create index
+                    pass
+                else:
+                    # distributions are not the same, train on each chunk
+                    trained_index.train(data)
+
+            if device_id >= 0:  # pragma: no cover
+                trained_index = _index_to_gpu(trained_index, device_id)
+            if op.metric == "cosine":
+                # faiss does not support cosine distances directly,
+                # data needs to be normalized before adding to the index,
+                # refer to:
+                # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance
+                faiss.normalize_L2(data)
+
+            # add data into index
+            if device_id >= 0:  # pragma: no cover
+                # gpu
+                trained_index.add_c(
+                    data.shape[0], _swig_ptr_from_cupy_float32_array(data)
+                )
+            else:
+                trained_index.add(data)
+
+            ctx[op.outputs[0].key] = _store_index(trained_index, device_id)
+
+    @classmethod
+    def _execute_agg(cls, ctx, op):
+        device_id = op.device
+        if device_id is None:
+            device_id = -1
+        inputs = [ctx[inp.key] for inp in op.inputs]
+
+        with device(device_id):
+            merged_index = None
+            indexes = []
+            for index in inputs:
+                index = _load_index(index, device_id)
+                indexes.append(index)
+                assert hasattr(index, "merge_from")
+                if merged_index is None:
+                    merged_index = index
+                else:
+                    merged_index.merge_from(index, index.ntotal)
+
+            ctx[op.outputs[0].key] = _store_index(merged_index, device_id)
+
+    @classmethod
+    def execute(cls, ctx, op):
+        if op.stage == OperandStage.map:
+            cls._execute_map(ctx, op)
+        elif op.stage == OperandStage.agg:
+            cls._execute_agg(ctx, op)
+        else:
+            assert op.stage is None
+            cls._execute_one_chunk(ctx, op)
+
+
+def _store_index(index, device_id):
+    if device_id >= 0:  # pragma: no cover
+        # for gpu, convert to cpu first
+        index = faiss.index_gpu_to_cpu(index)
+    # distributed, save to file, then return in-memory bytes
+    fn = tempfile.mkstemp(".index", prefix="faiss_")[1]
+    faiss.write_index(index, fn)
+    try:
+        with open(fn, "rb") as f:
+            return f.read()
+    finally:
+        os.remove(fn)
+
+
+def _load_index(index, device_id):
+    # distributed
+    fn = tempfile.mkstemp(".index", prefix="faiss_")[1]
+    with open(fn, "wb") as f:
+        f.write(index)
+    index = faiss.read_index(f.name)
+    if device_id >= 0:  # pragma: no cover
+        index = _index_to_gpu(index, device_id)
+    return index
+
+
+def _index_to_gpu(index, device_id):  # pragma: no cover
+    res = faiss.StandardGpuResources()
+    return 
faiss.index_cpu_to_gpu(res, device_id, index) + + +def _swig_ptr_from_cupy_float32_array(x): # pragma: no cover + assert x.flags.c_contiguous + assert x.dtype == np.float32 + data_ptr = x.__cuda_array_interface__["data"][0] + return faiss.cast_integer_to_float_ptr(data_ptr) + + +def _swig_ptr_from_cupy_int64_array(x): # pragma: no cover + assert x.flags.c_contiguous + assert x.dtype == np.int64 + data_ptr = x.__cuda_array_interface__["data"][0] + return faiss.cast_integer_to_long_ptr(data_ptr) + + +@require_not_none(faiss) +class FaissTrainSampledIndex(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.FAISS_TRAIN_SAMPLED_INDEX + + _input = KeyField("input") + _metric = StringField("metric") + _faiss_index = StringField("faiss_index") + + def __init__(self, faiss_index=None, metric=None, output_types=None, **kw): + super().__init__( + _faiss_index=faiss_index, _metric=metric, _output_types=output_types, **kw + ) + if self.output_types is None: + self.output_types = [OutputType.object] + + @property + def input(self): + return self._input + + @property + def metric(self): + return self._metric + + @property + def faiss_metric_type(self): + return METRIC_TO_FAISS_METRIC_TYPE[self.metric] + + @property + def faiss_index(self): + return self._faiss_index + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def execute(cls, ctx, op): + (data,), device_id, _ = as_same_device( + [ctx[op.input.key]], device=op.device, ret_extra=True + ) + + with device(device_id): + index = faiss.index_factory( + data.shape[1], op.faiss_index, op.faiss_metric_type + ) + + if device_id >= 0: # pragma: no cover + # GPU + index = _index_to_gpu(index, device_id) + index.train_c(data.shape[0], _swig_ptr_from_cupy_float32_array(data)) + else: + index.train(data) + + ctx[op.outputs[0].key] = _store_index(index, device_id) + + +def _gen_index_string_and_sample_count( + shape, n_sample, accuracy, memory_require, gpu=None, **kw +): + """ + Generate index string and sample count according to guidance of faiss: + https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index + """ + size, dim = shape + memory_require = _get_memory_require(memory_require) + + if accuracy or size < 10**5: + # Flat is the only index that guarantees exact results + # no need to train, thus sample count is None + return "Flat", None + + if memory_require == MemoryRequirementGrade.maximum and not gpu: + x = kw.get("M", 32) # get medium number by default + if x < 4 or x > 64: + raise ValueError(f"HNSWx requires M that between 4 and 64, got {x}") + return f"HNSW{x}", None + + if memory_require in (MemoryRequirementGrade.high, MemoryRequirementGrade.maximum): + basement = "{},Flat" + elif memory_require == MemoryRequirementGrade.low: + x = kw.get("dim", dim // 2) + basement = f"PCAR{x},{{}},SQ8" + elif memory_require == MemoryRequirementGrade.minimum: + x = kw.get("M", min(64, dim // 2)) + if x > 64: + raise ValueError(f"PQx requires M <= 64, got {x}") + y = kw.get("dim", None) + if y is not None and y % x != 0: + raise ValueError( + f"OPQx_y requires dim is a multiple of M({x}), got dim: {y}" + ) + y = min(dim, 4 * x) + y = x * (y // x) # make sure y is a multiple of x + basement = f"OPQ{x}_{y},{{}},PQ{x}" + else: # pragma: no cover + raise ValueError("unknown memory require") + + # now choose the clustering options + if size < 10**6 or (size < 10**7 and gpu): + # < 1M, or <10M but need GPU + k = kw.get("k", 5 * int(np.sqrt(size))) + if k < 4 * int(np.sqrt(size)) or 
k > 16 * int(np.sqrt(size)): + raise ValueError( + f"k should be between 4 * sqrt(N) and 16 * sqrt(N), got {k}" + ) + index_str = basement.format(f"IVF{k}") + if n_sample is None: + # 30 * k - 256 * k + n_sample = min(30 * k, size) + elif size < 10**7 and not gpu: + # 1M - 10M + index_str = basement.format("IVF65536_HNSW32") + if n_sample is None: + # between 30 * 65536 and 256 * 65536 + n_sample = 32 * 65536 + elif size < 10**8: + index_str = basement.format("IVF65536_HNSW32") + n_sample = 64 * 65536 if n_sample is None else n_sample + else: + index_str = basement.format("IVF1048576_HNSW32") + n_sample = 64 * 65536 if n_sample is None else n_sample + + return index_str, n_sample + + +def _get_memory_require(memory_require): + if isinstance(memory_require, str): + return getattr(MemoryRequirementGrade, memory_require) + elif isinstance(memory_require, MemoryRequirementGrade): + return memory_require + return MemoryRequirementGrade(memory_require) + + +@require_not_none(faiss) +def build_faiss_index( + X, + index_name="auto", + n_sample=None, + metric="euclidean", + random_state=None, + same_distribution=True, + accuracy=False, + memory_require=None, + **kw, +): + X = astensor(X) + + if metric not in METRIC_TO_FAISS_METRIC_TYPE: + raise ValueError(f"unknown metric: {metric}") + if index_name != "auto": + try: + faiss.index_factory( + X.shape[1], index_name, METRIC_TO_FAISS_METRIC_TYPE[metric] + ) + except RuntimeError: + raise ValueError(f"illegal faiss index: {index_name}") + + rs = check_random_state(random_state) + if isinstance(rs, RandomState): + rs = rs.to_numpy() + seed = gen_random_seeds(1, rs)[0] + if memory_require is None: + memory_require = MemoryRequirementGrade.low + else: + memory_require = _get_memory_require(memory_require) + op = FaissBuildIndex( + faiss_index=index_name, + metric=metric, + n_sample=n_sample, + gpu=X.op.gpu, + seed=seed, + same_distribution=same_distribution, + accuracy=accuracy, + memory_require=memory_require, + **kw, + ) + return op(X) + + +class FaissQuery(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.FAISS_QUERY + + _input = KeyField("input") + _faiss_index = KeyField("faiss_index") + _metric = StringField("metric") + _n_neighbors = Int32Field("n_neighbors") + _return_distance = BoolField("return_distance") + _nprobe = Int64Field("nprobe") + + def __init__( + self, + faiss_index=None, + metric=None, + n_neighbors=None, + return_distance=None, + nprobe=None, + output_types=None, + gpu=None, + **kw, + ): + super().__init__( + _faiss_index=faiss_index, + _n_neighbors=n_neighbors, + _metric=metric, + _return_distance=return_distance, + _output_types=output_types, + _nprobe=nprobe, + gpu=gpu, + **kw, + ) + if self.output_types is None: + self.output_types = [OutputType.tensor] * self.output_limit + + @property + def input(self): + return self._input + + @property + def faiss_index(self): + return self._faiss_index + + @property + def metric(self): + return self._metric + + @property + def n_neighbors(self): + return self._n_neighbors + + @property + def nprobe(self): + return self._nprobe + + @property + def return_distance(self): + return self._return_distance + + @property + def output_limit(self): + return 2 if self._return_distance else 1 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if self._faiss_index is not None: + self._faiss_index = self._inputs[1] + + def __call__(self, y): + kws = [] + if self._return_distance: + kws.append( + { + "shape": (y.shape[0], self._n_neighbors), + 
"dtype": np.dtype(np.float32), + "order": TensorOrder.C_ORDER, + "type": "distance", + } + ) + kws.append( + { + "shape": (y.shape[0], self._n_neighbors), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "type": "indices", + } + ) + return self.new_tileables([y, self._faiss_index], kws=kws) + + @classmethod + def tile(cls, op): + in_tensor = astensor(op.input) + + if in_tensor.chunk_shape[1] != 1: + if has_unknown_shape(in_tensor): + yield + in_tensor = yield from recursive_tile( + in_tensor.rechunk({1: in_tensor.shape[1]}) + ) + + out_chunks = [], [] + for chunk in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk_kws = [] + if op.return_distance: + chunk_kws.append( + { + "shape": (chunk.shape[0], op.n_neighbors), + "dtype": np.dtype(np.float32), + "order": TensorOrder.C_ORDER, + "index": chunk.index, + "type": "distance", + } + ) + chunk_kws.append( + { + "shape": (chunk.shape[0], op.n_neighbors), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "index": chunk.index, + "type": "indices", + } + ) + in_chunks = [chunk] + in_chunks.extend(op.faiss_index.chunks) + chunks = chunk_op.new_chunks(in_chunks, kws=chunk_kws) + if op.return_distance: + out_chunks[0].append(chunks[0]) + out_chunks[1].append(chunks[-1]) + + new_op = op.copy() + kws = [out.params for out in op.outputs] + if op.return_distance: + kws[0]["chunks"] = out_chunks[0] + kws[0]["nsplits"] = (in_tensor.nsplits[0], (op.n_neighbors,)) + kws[-1]["chunks"] = out_chunks[1] + kws[-1]["nsplits"] = (in_tensor.nsplits[0], (op.n_neighbors,)) + return new_op.new_tileables(op.inputs, kws=kws) + + @classmethod + def execute(cls, ctx, op): + (y,), device_id, xp = as_same_device( + [ctx[op.input.key]], device=op.device, ret_extra=True + ) + indexes = [_load_index(ctx[index.key], device_id) for index in op.inputs[1:]] + + with device(device_id): + y = xp.ascontiguousarray(y, dtype=np.float32) + + if len(indexes) == 1: + index = indexes[0] + else: + index = faiss.IndexShards(indexes[0].d) + [index.add_shard(ind) for ind in indexes] + + if op.metric == "cosine": + # faiss does not support cosine distances directly, + # data needs to be normalize before searching, + # refer to: + # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance + faiss.normalize_L2(y) + + if op.nprobe is not None: + index.nprobe = op.nprobe + + if device_id >= 0: # pragma: no cover + n = y.shape[0] + k = op.n_neighbors + distances = xp.empty((n, k), dtype=xp.float32) + indices = xp.empty((n, k), dtype=xp.int64) + index.search_c( + n, + _swig_ptr_from_cupy_float32_array(y), + k, + _swig_ptr_from_cupy_float32_array(distances), + _swig_ptr_from_cupy_int64_array(indices), + ) + else: + distances, indices = index.search(y, op.n_neighbors) + if op.return_distance: + if index.metric_type == faiss.METRIC_L2: + # make it equivalent to `pairwise.euclidean_distances` + distances = xp.sqrt(distances, out=distances) + elif op.metric == "cosine": + # make it equivalent to `pairwise.cosine_distances` + distances = xp.subtract(1, distances, out=distances) + ctx[op.outputs[0].key] = distances + ctx[op.outputs[-1].key] = indices + + +@require_not_none(faiss) +def faiss_query(faiss_index, data, n_neighbors, return_distance=True, nprobe=None): + data = astensor(data) + op = FaissQuery( + faiss_index=faiss_index, + n_neighbors=n_neighbors, + metric=faiss_index.op.metric, + return_distance=return_distance, + nprobe=nprobe, + gpu=data.op.gpu, + ) + ret = op(data) + if not return_distance: + return ret[0] + 
return ret diff --git a/python/xorbits/_mars/learn/neighbors/_kd_tree.py b/python/xorbits/_mars/learn/neighbors/_kd_tree.py new file mode 100644 index 000000000..f810adf72 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/_kd_tree.py @@ -0,0 +1,61 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from sklearn.neighbors import KDTree as SklearnKDTree +except ImportError: # pragma: no cover + SklearnKDTree = None + +from ... import opcodes as OperandDef +from ...utils import require_not_none +from .tree import TreeBase, TreeObject, TreeQueryBase + + +class KDTree(TreeObject): + pass + + +@require_not_none(SklearnKDTree) +class _KDTree(TreeBase): + _op_type_ = OperandDef.KD_TREE_TRAIN + _tree_type = SklearnKDTree + + def __call__(self, a): + result = super().__call__(a) + return KDTree(result.data) + + +@require_not_none(SklearnKDTree) +class KDTreeQuery(TreeQueryBase): + _op_type_ = OperandDef.KD_TREE_QUERY + _tree_type = SklearnKDTree + + +@require_not_none(SklearnKDTree) +def kd_tree_query(tree, data, n_neighbors, return_distance): + op = KDTreeQuery( + tree=tree, n_neighbors=n_neighbors, return_distance=return_distance + ) + ret = op(data) + if not return_distance: + return ret[0] + return ret + + +@require_not_none(SklearnKDTree) +def create_kd_tree(X, leaf_size, metric=None, **metric_params): + # kd_tree cannot accept callable metric + assert not callable(metric) + op = _KDTree(leaf_size=leaf_size, metric=metric, **metric_params) + return op(X) diff --git a/python/xorbits/_mars/learn/neighbors/_kneighbors_graph.py b/python/xorbits/_mars/learn/neighbors/_kneighbors_graph.py new file mode 100644 index 000000000..a5364524d --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/_kneighbors_graph.py @@ -0,0 +1,134 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core import recursive_tile +from ...lib.sparse.array import SparseNDArray, get_sparse_module +from ...serialization.serializables import Int64Field, KeyField +from ...tensor.array_utils import as_same_device, device +from ...tensor.core import TensorOrder +from ...tensor.utils import decide_unify_split +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin, OutputType + + +class KNeighborsGraph(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.KNEIGHBORS_GRAPH + + _a_data = KeyField("a_data") + _a_ind = KeyField("a_ind") + _n_neighbors = Int64Field("n_neighbors") + + def __init__(self, a_data=None, a_ind=None, n_neighbors=None, **kw): + super().__init__(_a_data=a_data, _a_ind=a_ind, _n_neighbors=n_neighbors, **kw) + self.output_types = [OutputType.tensor] + + @property + def a_data(self): + return self._a_data + + @property + def a_ind(self): + return self._a_ind + + @property + def n_neighbors(self): + return self._n_neighbors + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._a_data is not None: + self._a_data = self._inputs[0] + self._a_ind = self._inputs[-1] + + def __call__(self, A_data, A_ind, shape): + inputs = [] + if A_data is not None: + inputs.append(A_data) + inputs.append(A_ind) + return self.new_tileable( + inputs, dtype=np.dtype(np.float64), shape=shape, order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + A_data, A_ind = op.a_data, op.a_ind + out = op.outputs[0] + + shape1 = A_ind.shape[1] + if A_data is not None: + # mode == 'distance' + axis0_chunk_sizes = decide_unify_split(A_data.nsplits[0], A_ind.nsplits[0]) + A_data = yield from recursive_tile( + A_data.rechunk({0: axis0_chunk_sizes, 1: shape1}) + ) + A_ind = yield from recursive_tile( + A_ind.rechunk({0: axis0_chunk_sizes, 1: shape1}) + ) + else: + # mode == 'connectivity' + A_ind = yield from recursive_tile(A_ind.rechunk({1: shape1})) + + out_chunks = [] + for i, ind_c in enumerate(A_ind.chunks): + chunk_op = op.copy().reset_key() + chunk_inputs = [ind_c] + if A_data is not None: + data_c = A_data.cix[i, 0] + chunk_inputs.insert(0, data_c) + out_chunk = chunk_op.new_chunk( + chunk_inputs, + dtype=out.dtype, + shape=(ind_c.shape[0], out.shape[1]), + order=out.order, + index=(i, 0), + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = (A_ind.nsplits[0], (out.shape[1],)) + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + out = op.outputs[0] + n_samples1, n_samples2 = out.shape + n_neighbors = op.n_neighbors + n_nonzero = n_samples1 * n_neighbors + + with device(device_id): + A_ind = inputs[-1] + A_indptr = xp.arange(0, n_nonzero + 1, n_neighbors) + + if op.a_data is None: + # mode == 'connectivity + A_data = xp.ones(n_samples1 * n_neighbors) + else: + # mode == 'distance' + A_data = xp.ravel(inputs[0]) + + xps = get_sparse_module(A_ind) + graph = xps.csr_matrix( + (A_data, A_ind.ravel(), A_indptr), shape=(n_samples1, n_samples2) + ) + ctx[out.key] = SparseNDArray(graph) diff --git a/python/xorbits/_mars/learn/neighbors/_proxima.py b/python/xorbits/_mars/learn/neighbors/_proxima.py new file mode 100644 index 000000000..7ead22086 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/_proxima.py @@ -0,0 
+1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..proxima.simple_index import build_index, search_index + +METRIC_TO_PROXIMA_METRIC_TYPE = { + "l2": "Euclidean", + "euclidean": "Euclidean", + "canberra": "Canberra", + "chebyshev": "Chebyshev", + "sqeuclidean": "SquaredEuclidean", + "innerproduct": "InnerProduct", + "manhattan": "Manhattan", +} + +build_proxima_index = build_index +proxima_query = search_index diff --git a/python/xorbits/_mars/learn/neighbors/base.py b/python/xorbits/_mars/learn/neighbors/base.py new file mode 100644 index 000000000..39f74a45b --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/base.py @@ -0,0 +1,575 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from abc import ABCMeta, abstractmethod + +import numpy as np +from sklearn.base import BaseEstimator, MultiOutputMixin + +from ... 
import tensor as mt +from ...tensor.reshape.reshape import _reshape as reshape_unchecked +from ..metrics import pairwise_distances_topk +from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..utils import check_array +from ..utils.validation import check_is_fitted +from ._ball_tree import SklearnBallTree, ball_tree_query, create_ball_tree +from ._faiss import METRIC_TO_FAISS_METRIC_TYPE, build_faiss_index, faiss_query +from ._kd_tree import SklearnKDTree, create_kd_tree, kd_tree_query +from ._kneighbors_graph import KNeighborsGraph +from ._proxima import METRIC_TO_PROXIMA_METRIC_TYPE, build_proxima_index, proxima_query + +VALID_METRICS = dict( + ball_tree=SklearnBallTree.valid_metrics, + kd_tree=SklearnKDTree.valid_metrics, + # The following list comes from the + # sklearn.metrics.pairwise doc string + brute=( + list(PAIRWISE_DISTANCE_FUNCTIONS.keys()) + + [ + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "cosine", + "dice", + "hamming", + "jaccard", + "kulsinski", + "mahalanobis", + "matching", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalmichener", + "sokalsneath", + "sqeuclidean", + "yule", + "wminkowski", + ] + ), + faiss=list(METRIC_TO_FAISS_METRIC_TYPE), + proxima=list(METRIC_TO_PROXIMA_METRIC_TYPE), +) + + +VALID_METRICS_SPARSE = dict( + ball_tree=[], kd_tree=[], brute=(PAIRWISE_DISTANCE_FUNCTIONS.keys() - {"haversine"}) +) + + +class NeighborsBase(BaseEstimator, MultiOutputMixin, metaclass=ABCMeta): + """Base class for nearest neighbors estimators.""" + + @abstractmethod + def __init__( + self, + n_neighbors=None, + radius=None, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + self.n_neighbors = n_neighbors + self.radius = radius + self.algorithm = algorithm + self.leaf_size = leaf_size + self.metric = metric + self.metric_params = metric_params + self.p = p + self.n_jobs = n_jobs + self._check_algorithm_metric() + + def _check_algorithm_metric(self): + if self.algorithm not in [ + "auto", + "brute", + "kd_tree", + "ball_tree", + "faiss", + "proxima", + ]: + raise ValueError(f"unrecognized algorithm: '{self.algorithm}'") + + if self.algorithm == "auto": + if self.metric == "precomputed": + alg_check = "brute" + elif callable(self.metric) or self.metric in VALID_METRICS["ball_tree"]: + alg_check = "ball_tree" + else: + alg_check = "brute" + else: + alg_check = self.algorithm + + if callable(self.metric): + if self.algorithm == "kd_tree": + # callable metric is only valid for brute force and ball_tree + raise ValueError( + "kd_tree algorithm does not support callable metric '%s'" + % self.metric + ) + elif self.metric not in VALID_METRICS[alg_check]: + raise ValueError( + "Metric '%s' not valid. Use " + "sorted(sklearn.neighbors.VALID_METRICS['%s']) " + "to get valid options. " + "Metric can also be a callable function." % (self.metric, alg_check) + ) + + if self.metric_params is not None and "p" in self.metric_params: + warnings.warn( + "Parameter p is found in metric_params. 
" + "The corresponding parameter from __init__ " + "is ignored.", + SyntaxWarning, + stacklevel=3, + ) + effective_p = self.metric_params["p"] + else: + effective_p = self.p + + if self.metric in ["wminkowski", "minkowski"] and effective_p < 1: + raise ValueError("p must be greater than one for minkowski metric") + + def _fit(self, X, session=None, run_kwargs=None): + self._check_algorithm_metric() + if self.metric_params is None: + self.effective_metric_params_ = {} + else: + self.effective_metric_params_ = self.metric_params.copy() + + effective_p = self.effective_metric_params_.get("p", self.p) + if self.metric in ["wminkowski", "minkowski"]: + self.effective_metric_params_["p"] = effective_p + + self.effective_metric_ = self.metric + # For minkowski distance, use more efficient methods where available + if self.metric == "minkowski": + p = self.effective_metric_params_.pop("p", 2) + if p < 1: # pragma: no cover + raise ValueError("p must be greater than one for minkowski metric") + elif p == 1: + self.effective_metric_ = "manhattan" + elif p == 2: + self.effective_metric_ = "euclidean" + elif p == np.inf: + self.effective_metric_ = "chebyshev" + else: + self.effective_metric_params_["p"] = p + + if isinstance(X, NeighborsBase): + self._fit_X = X._fit_X + self._tree = X._tree + self._fit_method = X._fit_method + return self + + elif isinstance(X, SklearnBallTree): + self._fit_X = mt.tensor(X.data) + self._tree = X + self._fit_method = "ball_tree" + return self + + elif isinstance(X, SklearnKDTree): + self._fit_X = mt.tensor(X.data) + self._tree = X + self._fit_method = "kd_tree" + return self + + X = check_array(X, accept_sparse=True) + + if np.isnan(X.size): + # if X has unknown shape, execute it first + X.execute(session=session, **(run_kwargs or dict())) + + if X.issparse(): + if self.algorithm not in ("auto", "brute"): + warnings.warn("cannot use tree with sparse input: using brute force") + if self.effective_metric_ not in VALID_METRICS_SPARSE[ + "brute" + ] and not callable(self.effective_metric_): + raise ValueError( + "Metric '%s' not valid for sparse input. " + "Use sorted(sklearn.neighbors." + "VALID_METRICS_SPARSE['brute']) " + "to get valid options. " + "Metric can also be a callable function." 
% (self.effective_metric_) + ) + self._fit_X = X.copy() + self._tree = None + self._fit_method = "brute" + return self + + self._fit_method = self.algorithm + self._fit_X = X + + if self._fit_method == "auto": + # A tree approach is better for small number of neighbors, + # and KDTree is generally faster when available + if ( + self.n_neighbors is None or self.n_neighbors < self._fit_X.shape[0] // 2 + ) and self.metric != "precomputed": + if self.effective_metric_ in VALID_METRICS["kd_tree"]: + self._fit_method = "kd_tree" + elif ( + callable(self.effective_metric_) + or self.effective_metric_ in VALID_METRICS["ball_tree"] + ): + self._fit_method = "ball_tree" + else: + self._fit_method = "brute" + else: + self._fit_method = "brute" + + if self._fit_method == "ball_tree": + self._tree = tree = create_ball_tree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + tree.execute(session=session, **(run_kwargs or dict())) + elif self._fit_method == "kd_tree": + self._tree = tree = create_kd_tree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + tree.execute(session=session, **(run_kwargs or dict())) + elif self._fit_method == "brute": + self._tree = None + elif self._fit_method == "faiss": + faiss_index = build_faiss_index( + X, metric=self.effective_metric_, **self.effective_metric_params_ + ) + faiss_index.execute(session=session, **(run_kwargs or dict())) + self._faiss_index = faiss_index + elif self._fit_method == "proxima": # pragma: no cover + proxima_metric = METRIC_TO_PROXIMA_METRIC_TYPE[self.effective_metric_] + proxima_index = build_proxima_index( + X, + distance_metric=proxima_metric, + topk=self.n_neighbors, + session=session, + run_kwargs=run_kwargs, + **self.effective_metric_params_, + ) + self._proxima_index = proxima_index + else: # pragma: no cover + raise ValueError("algorithm = '%s' not recognized" % self.algorithm) + + if self.n_neighbors is not None: + if self.n_neighbors <= 0: + raise ValueError(f"Expected n_neighbors > 0. Got {self.n_neighbors}") + else: + if not np.issubdtype(type(self.n_neighbors), np.integer): + raise TypeError( + f"n_neighbors does not take {type(self.n_neighbors)} value, " + "enter integer value" + ) + + return self + + +class KNeighborsMixin: + """Mixin for k-neighbors searches""" + + def kneighbors( + self, + X=None, + n_neighbors=None, + return_distance=True, + session=None, + run_kwargs=None, + **kw, + ): + """Finds the K-neighbors of a point. + Returns indices of and distances to the neighbors of each point. + + Parameters + ---------- + X : array-like, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + n_neighbors : int + Number of neighbors to get (default is the value + passed to the constructor). + + return_distance : boolean, optional. Defaults to True. + If False, distances will not be returned + + Returns + ------- + dist : Tensor + Array representing the lengths to points, only present if + return_distance=True + + ind : Tensor + Indices of the nearest points in the population matrix. 
+ + Examples + -------- + In the following example, we construct a NeighborsClassifier + class from a tensor representing our data set and ask who's + the closest point to [1,1,1] + + >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] + >>> from mars.learn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=1) + >>> neigh.fit(samples) # doctest: +ELLIPSIS + NearestNeighbors(algorithm='auto', leaf_size=30, ...) + >>> print(neigh.kneighbors([[1., 1., 1.]])) # doctest: +ELLIPSIS + (array([[0.5]]), array([[2]])) + + As you can see, it returns [[0.5]], and [[2]], which means that the + element is at distance 0.5 and is the third element of samples + (indexes start at 0). You can also query for multiple points: + + >>> X = [[0., 1., 0.], [1., 0., 1.]] + >>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS + array([[1], + [2]]...) + + """ + check_is_fitted(self, ["_fit_method", "_fit_X"], all_or_any=any) + + if n_neighbors is None: + n_neighbors = self.n_neighbors + elif n_neighbors <= 0: + raise ValueError(f"Expected n_neighbors > 0. Got {n_neighbors}") + else: + if not np.issubdtype(type(n_neighbors), np.integer): + raise TypeError( + f"n_neighbors does not take {type(n_neighbors)} value, " + "enter integer value" + ) + + if X is not None: + query_is_train = False + X = check_array(X, accept_sparse=True) + else: + query_is_train = True + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + n_neighbors += 1 + + if X.key == self._fit_X.key and X is not self._fit_X: + X = self._fit_X + if np.isnan(X.size): + # has unknown size, execute first + X.execute(session=session, **(run_kwargs or dict())) + + train_size = self._fit_X.shape[0] + if n_neighbors > train_size: + raise ValueError( + "Expected n_neighbors <= n_samples, " + f"but n_samples = {train_size}, n_neighbors = {n_neighbors}" + ) + n_samples, _ = X.shape + sample_range = mt.arange(n_samples)[:, None] + + if self._fit_method == "brute": + # for efficiency, use squared euclidean distances + kwds = ( + {"squared": True} + if self.effective_metric_ == "euclidean" + else self.effective_metric_params_ + ) + + neigh_dist, neigh_ind = pairwise_distances_topk( + X, self._fit_X, k=n_neighbors, metric=self.effective_metric_, **kwds + ) + if return_distance: + if self.effective_metric_ == "euclidean": + result = mt.sqrt(neigh_dist), neigh_ind + else: + result = neigh_dist, neigh_ind + else: + result = neigh_ind + elif self._fit_method in ["ball_tree", "kd_tree"]: + if X.issparse(): + raise ValueError( + f"{self._fit_method} does not work with sparse matrices. " + "Densify the data, or set algorithm='brute'" + ) + + query = ( + ball_tree_query if self._fit_method == "ball_tree" else kd_tree_query + ) + result = query(self._tree, X, n_neighbors, return_distance) + elif self._fit_method == "faiss": + if X.issparse(): + raise ValueError( + f"{self._fit_method} does not work with sparse matrices. " + "Densify the data, or set algorithm='brute'" + ) + result = faiss_query( + self._faiss_index, X, n_neighbors, return_distance, **kw + ) + elif self._fit_method == "proxima": # pragma: no cover + if X.issparse(): + raise ValueError( + f"{self._fit_method} does not work with sparse matrices. 
" + "Densify the data, or set algorithm='brute'" + ) + ind, dis = proxima_query( + X, n_neighbors, index=self._proxima_index, run=False, **kw + ) + if not return_distance: + result = ind + else: + result = (dis, ind) + else: # pragma: no cover + raise ValueError("internal: _fit_method not recognized") + + if not query_is_train: + if isinstance(result, (tuple, list)): + result = mt.ExecutableTuple(result) + result.execute(session=session, **(run_kwargs or dict())) + return result + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + dist, neigh_ind = result + else: + neigh_ind = result + + sample_mask = neigh_ind != sample_range + + # Corner case: When the number of duplicates are more + # than the number of neighbors, the first NN will not + # be the sample, but a duplicate. + # In that case mask the first duplicate. + dup_gr_nbrs = mt.all(sample_mask, axis=1) + sample_mask[:, 0] = mt.where(dup_gr_nbrs, False, sample_mask[:, 0]) + + neigh_ind = reshape_unchecked( + neigh_ind[sample_mask], (n_samples, n_neighbors - 1) + ) + + if return_distance: + dist = reshape_unchecked( + dist[sample_mask], (n_samples, n_neighbors - 1) + ) + ret = mt.ExecutableTuple([dist, neigh_ind]) + ret.execute(session=session, **(run_kwargs or dict())) + return ret + neigh_ind.execute(session=session, **(run_kwargs or dict())) + return neigh_ind + + def kneighbors_graph( + self, + X=None, + n_neighbors=None, + mode="connectivity", + session=None, + run_kwargs=None, + ): + """Computes the (weighted) graph of k-Neighbors for points in X + + Parameters + ---------- + X : array-like, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + n_neighbors : int + Number of neighbors for each sample. + (default is value passed to the constructor). + + mode : {'connectivity', 'distance'}, optional + Type of returned matrix: 'connectivity' will return the + connectivity matrix with ones and zeros, in 'distance' the + edges are Euclidean distance between points. + + Returns + ------- + A : SparseTensor, shape = [n_samples, n_samples_fit] + n_samples_fit is the number of samples in the fitted data + A[i, j] is assigned the weight of edge that connects i to j. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from mars.learn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=2) + >>> neigh.fit(X) # doctest: +ELLIPSIS + NearestNeighbors(algorithm='auto', leaf_size=30, ...) + >>> A = neigh.kneighbors_graph(X) + >>> A.fetch().toarray() + array([[1., 0., 1.], + [0., 1., 1.], + [1., 0., 1.]]) + + See also + -------- + NearestNeighbors.radius_neighbors_graph + """ + check_is_fitted(self, ["_fit_method", "_fit_X"], all_or_any=any) + if n_neighbors is None: + n_neighbors = self.n_neighbors + + # kneighbors does the None handling. 
+ if X is not None: + X = check_array(X, accept_sparse=True) + n_samples1 = X.shape[0] + else: + n_samples1 = self._fit_X.shape[0] + + n_samples2 = self._fit_X.shape[0] + + if mode == "connectivity": + A_data = None + A_ind = self.kneighbors(X, n_neighbors, return_distance=False) + + elif mode == "distance": + A_data, A_ind = self.kneighbors(X, n_neighbors, return_distance=True) + + else: + raise ValueError( + 'Unsupported mode, must be one of "connectivity" ' + f'or "distance" but got {mode} instead' + ) + + op = KNeighborsGraph( + a_data=A_data, a_ind=A_ind, n_neighbors=n_neighbors, sparse=True + ) + graph = op(A_data, A_ind, shape=(n_samples1, n_samples2)) + graph.execute(session=session, **(run_kwargs or dict())) + return graph + + +class UnsupervisedMixin: + def fit(self, X, y=None, session=None, run_kwargs=None): + """Fit the model using X as training data + + Parameters + ---------- + X : {array-like, tensor, BallTree, KDTree} + Training data. If tensor, shape [n_samples, n_features], + or [n_samples, n_samples] if metric='precomputed'. + """ + return self._fit(X, session=session, run_kwargs=run_kwargs) diff --git a/python/xorbits/_mars/learn/neighbors/tests/__init__.py b/python/xorbits/_mars/learn/neighbors/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/neighbors/tests/test_faiss.py b/python/xorbits/_mars/learn/neighbors/tests/test_faiss.py new file mode 100644 index 000000000..8628fb1ef --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/tests/test_faiss.py @@ -0,0 +1,241 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +try: + import faiss +except ImportError: # pragma: no cover + faiss = None + +from .... import tensor as mt +from ....core import tile +from ....session import execute, fetch +from .. 
import NearestNeighbors +from .._faiss import ( + _gen_index_string_and_sample_count, + _load_index, + build_faiss_index, + faiss_query, +) + + +@pytest.mark.skipif(faiss is None, reason="faiss not installed") +def test_manual_build_faiss_index(setup): + d = 8 + n = 50 + n_test = 10 + x = np.random.RandomState(0).rand(n, d).astype(np.float32) + y = np.random.RandomState(0).rand(n_test, d).astype(np.float32) + + nn = NearestNeighbors(algorithm="kd_tree") + nn.fit(x) + _, expected_indices = nn.kneighbors(y, 5) + + # test brute-force search + X = mt.tensor(x, chunk_size=10) + index = build_faiss_index(X, "Flat", None, random_state=0, same_distribution=True) + faiss_index = index.execute().fetch() + + index_shards = faiss.IndexShards(d) + for ind in faiss_index: + shard = _load_index(ind, -1) + index_shards.add_shard(shard) + faiss_index = index_shards + + faiss_index.nprob = 10 + _, indices = faiss_index.search(y, k=5) + + np.testing.assert_array_equal(indices, expected_indices.fetch()) + + # test one chunk, brute force + X = mt.tensor(x, chunk_size=50) + index = build_faiss_index(X, "Flat", None, random_state=0, same_distribution=True) + faiss_index = _load_index(index.execute().fetch(), -1) + + faiss_index.nprob = 10 + _, indices = faiss_index.search(y, k=5) + + np.testing.assert_array_equal(indices, expected_indices.fetch()) + + # test train, same distribution + X = mt.tensor(x, chunk_size=10) + index = build_faiss_index( + X, "IVF30,Flat", 30, random_state=0, same_distribution=True + ) + faiss_index = _load_index(index.execute().fetch(), -1) + + assert isinstance(faiss_index, faiss.IndexIVFFlat) + assert faiss_index.ntotal == n + assert len(tile(index).chunks) == 1 + + # test train, distributions are variant + X = mt.tensor(x, chunk_size=10) + index = build_faiss_index( + X, "IVF10,Flat", None, random_state=0, same_distribution=False + ) + faiss_index = index.execute().fetch() + + assert len(faiss_index) == 5 + for ind in faiss_index: + ind = _load_index(ind, -1) + assert isinstance(ind, faiss.IndexIVFFlat) + assert ind.ntotal == 10 + + # test more index type + index = build_faiss_index(X, "PCAR6,IVF8_HNSW32,SQ8", 10, random_state=0) + faiss_index = index.execute().fetch() + + assert len(faiss_index) == 5 + for ind in faiss_index: + ind = _load_index(ind, -1) + assert isinstance(ind, faiss.IndexPreTransform) + assert ind.ntotal == 10 + + # test one chunk, train + X = mt.tensor(x, chunk_size=50) + index = build_faiss_index( + X, "IVF30,Flat", 30, random_state=0, same_distribution=True + ) + faiss_index = _load_index(index.execute().fetch(), -1) + + assert isinstance(faiss_index, faiss.IndexIVFFlat) + assert faiss_index.ntotal == n + + # test wrong index + with pytest.raises(ValueError): + build_faiss_index(X, "unknown_index", None) + + # test unknown metric + with pytest.raises(ValueError): + build_faiss_index(X, "Flat", None, metric="unknown_metric") + + +d = 8 +n = 50 +n_test = 10 +x = np.random.RandomState(0).rand(n, d).astype(np.float32) +y = np.random.RandomState(1).rand(n_test, d).astype(np.float32) + + +@pytest.mark.skipif(faiss is None, reason="faiss not installed") +@pytest.mark.parametrize( + "X, Y", + [ + # multi chunks + (mt.tensor(x, chunk_size=(20, 5)), mt.tensor(y, chunk_size=5)), + # one chunk + (mt.tensor(x, chunk_size=50), mt.tensor(y, chunk_size=10)), + ], +) +@pytest.mark.parametrize("metric", ["l2", "cosine"]) +def test_faiss_query(setup, X, Y, metric): + faiss_index = build_faiss_index(X, "Flat", None, metric=metric, random_state=0) + d, i = faiss_query(faiss_index, 
Y, 5, nprobe=10) + distance, indices = fetch(*execute(d, i)) + + nn = NearestNeighbors(metric=metric) + nn.fit(x) + expected_distance, expected_indices = nn.kneighbors(y, 5) + + np.testing.assert_array_equal(indices, expected_indices.fetch()) + np.testing.assert_almost_equal(distance, expected_distance.fetch(), decimal=4) + + # test other index + X2 = X.astype(np.float64) + Y2 = y.astype(np.float64) + faiss_index = build_faiss_index( + X2, "PCAR6,IVF8_HNSW32,SQ8", 10, random_state=0, return_index_type="object" + ) + d, i = faiss_query(faiss_index, Y2, 5, nprobe=10) + # test execute only + execute(d, i) + + +@pytest.mark.skipif(faiss is None, reason="faiss not installed") +def test_gen_index_string_and_sample_count(setup): + d = 32 + + # accuracy=True, could be Flat only + ret = _gen_index_string_and_sample_count((10**9, d), None, True, "minimum") + assert ret == ("Flat", None) + + # no memory concern + ret = _gen_index_string_and_sample_count((10**5, d), None, False, "maximum") + assert ret == ("HNSW32", None) + index = faiss.index_factory(d, ret[0]) + assert index.is_trained is True + + # memory concern not much + ret = _gen_index_string_and_sample_count((10**5, d), None, False, "high") + assert ret == ("IVF1580,Flat", 47400) + index = faiss.index_factory(d, ret[0]) + assert index.is_trained is False + + # memory quite important + ret = _gen_index_string_and_sample_count((5 * 10**6, d), None, False, "low") + assert ret == ("PCAR16,IVF65536_HNSW32,SQ8", 32 * 65536) + index = faiss.index_factory(d, ret[0]) + assert index.is_trained is False + + # memory very important + ret = _gen_index_string_and_sample_count((10**8, d), None, False, "minimum") + assert ret == ("OPQ16_32,IVF1048576_HNSW32,PQ16", 64 * 65536) + index = faiss.index_factory(d, ret[0]) + assert index.is_trained is False + + ret = _gen_index_string_and_sample_count((10**10, d), None, False, "low") + assert ret == ("PCAR16,IVF1048576_HNSW32,SQ8", 64 * 65536) + index = faiss.index_factory(d, ret[0]) + assert index.is_trained is False + + with pytest.raises(ValueError): + # M > 64 raise error + _gen_index_string_and_sample_count((10**5, d), None, False, "maximum", M=128) + + with pytest.raises(ValueError): + # M > 64 + _gen_index_string_and_sample_count((10**5, d), None, False, "minimum", M=128) + + with pytest.raises(ValueError): + # dim should be multiple of M + _gen_index_string_and_sample_count( + (10**5, d), None, False, "minimum", M=16, dim=17 + ) + + with pytest.raises(ValueError): + _gen_index_string_and_sample_count((10**5, d), None, False, "low", k=5) + + +@pytest.mark.skipif(faiss is None, reason="faiss not installed") +def test_auto_index(setup): + d = 8 + n = 50 + n_test = 10 + x = np.random.RandomState(0).rand(n, d).astype(np.float32) + y = np.random.RandomState(1).rand(n_test, d).astype(np.float32) + + for chunk_size in (50, 20): + X = mt.tensor(x, chunk_size=chunk_size) + + faiss_index = build_faiss_index(X, random_state=0, return_index_type="object") + d, i = faiss_query(faiss_index, y, 5, nprobe=10) + indices = i.execute().fetch() + + nn = NearestNeighbors() + nn.fit(x) + expected_indices = nn.kneighbors(y, 5, return_distance=False) + + np.testing.assert_array_equal(indices, expected_indices) diff --git a/python/xorbits/_mars/learn/neighbors/tests/test_nearest_neighbors.py b/python/xorbits/_mars/learn/neighbors/tests/test_nearest_neighbors.py new file mode 100644 index 000000000..423918a1e --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/tests/test_nearest_neighbors.py @@ -0,0 +1,414 @@ +# Copyright 
2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +try: + import faiss +except ImportError: # pragma: no cover + faiss = None +try: + from sklearn.neighbors import BallTree as SkBallTree + from sklearn.neighbors import KDTree as SkKDTree + from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors +except ImportError: # pragma: no cover + SkNearestNeighbors = None + +from .... import tensor as mt +from ....core import tile +from ....lib.sparse import SparseNDArray +from ....tests.core import require_cupy +from ....utils import lazy_import +from ...proxima.core import proxima +from .. import NearestNeighbors + +cupy = lazy_import("cupy") + + +def test_nearest_neighbors(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 5) + raw_Y = rs.rand(8, 5) + + X = mt.tensor(raw_X) + Y = mt.tensor(raw_Y) + + raw_sparse_x = sps.random(10, 5, density=0.5, format="csr", random_state=rs) + raw_sparse_y = sps.random(8, 5, density=0.4, format="csr", random_state=rs) + + X_sparse = mt.tensor(raw_sparse_x) + Y_sparse = mt.tensor(raw_sparse_y) + + metric_func = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + + _ = NearestNeighbors(algorithm="auto", metric="precomputed", metric_params={}) + + with pytest.raises(ValueError): + _ = NearestNeighbors(algorithm="unknown") + + with pytest.raises(ValueError): + _ = NearestNeighbors(algorithm="kd_tree", metric=metric_func) + + with pytest.raises(ValueError): + _ = NearestNeighbors(algorithm="auto", metric="unknown") + + with pytest.warns(SyntaxWarning): + NearestNeighbors(metric_params={"p": 1}) + + with pytest.raises(ValueError): + _ = NearestNeighbors(metric="wminkowski", p=0) + + with pytest.raises(ValueError): + _ = NearestNeighbors(algorithm="auto", metric="minkowski", p=0) + + nn = NearestNeighbors(algorithm="auto", metric="minkowski", p=1) + nn.fit(X) + assert nn.effective_metric_ == "manhattan" + + nn = NearestNeighbors(algorithm="auto", metric="minkowski", p=2) + nn.fit(X) + assert nn.effective_metric_ == "euclidean" + + nn = NearestNeighbors(algorithm="auto", metric="minkowski", p=np.inf) + nn.fit(X) + assert nn.effective_metric_ == "chebyshev" + + nn2 = NearestNeighbors(algorithm="auto", metric="minkowski") + nn2.fit(nn) + assert nn2._fit_method == nn._fit_method + + nn = NearestNeighbors(algorithm="auto", metric="minkowski") + ball_tree = SkBallTree(raw_X) + nn.fit(ball_tree) + assert nn._fit_method == "ball_tree" + + nn = NearestNeighbors(algorithm="auto", metric="minkowski") + kd_tree = SkKDTree(raw_X) + nn.fit(kd_tree) + assert nn._fit_method == "kd_tree" + + with pytest.raises(ValueError): + nn = NearestNeighbors() + nn.fit(np.random.rand(0, 10)) + + nn = NearestNeighbors(algorithm="ball_tree") + with pytest.warns(UserWarning): + nn.fit(X_sparse) + + nn = NearestNeighbors(metric="haversine") + with pytest.raises(ValueError): + nn.fit(X_sparse) + + nn = NearestNeighbors(metric=metric_func, n_neighbors=1) + nn.fit(X) + assert nn._fit_method == "ball_tree" + 
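The test_faiss.py module above exercises the lower-level faiss helpers directly; as a rough sketch (assuming the optional faiss dependency is installed, a default local session exists, and the same module path the tests import from), build_faiss_index and faiss_query can be combined like this:

import numpy as np
from xorbits._mars import tensor as mt
from xorbits._mars.learn.neighbors._faiss import build_faiss_index, faiss_query

x = np.random.RandomState(0).rand(50, 8).astype(np.float32)
y = np.random.RandomState(1).rand(5, 8).astype(np.float32)

# Build a brute-force ("Flat") index over a chunked tensor, then query it lazily.
index = build_faiss_index(mt.tensor(x, chunk_size=20), "Flat", None,
                          metric="l2", random_state=0)
dist, ind = faiss_query(index, y, 5, nprobe=10)
print(ind.execute().fetch())    # indices of the 5 nearest training samples per query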
+ nn = NearestNeighbors(metric="sqeuclidean", n_neighbors=1) + nn.fit(X) + assert nn._fit_method == "brute" + + with pytest.raises(ValueError): + nn = NearestNeighbors(n_neighbors=-1) + nn.fit(X) + + with pytest.raises(TypeError): + nn = NearestNeighbors(n_neighbors=1.3) + nn.fit(X) + + nn = NearestNeighbors() + nn.fit(X) + with pytest.raises(ValueError): + nn.kneighbors(Y, n_neighbors=-1) + with pytest.raises(TypeError): + nn.kneighbors(Y, n_neighbors=1.3) + with pytest.raises(ValueError): + nn.kneighbors(Y, n_neighbors=11) + + nn = NearestNeighbors(algorithm="ball_tree") + nn.fit(X) + with pytest.raises(ValueError): + nn.kneighbors(Y_sparse) + + +def test_nearest_neighbors_execution(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 5) + raw_Y = rs.rand(8, 5) + + X = mt.tensor(raw_X, chunk_size=7) + Y = mt.tensor(raw_Y, chunk_size=(5, 3)) + + for algo in ["brute", "ball_tree", "kd_tree", "auto"]: + for metric in ["minkowski", "manhattan"]: + nn = NearestNeighbors(n_neighbors=3, algorithm=algo, metric=metric) + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3, algorithm=algo, metric=metric) + snn.fit(raw_X) + expected = snn.kneighbors(raw_Y) + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + if nn._tree is not None: + assert isinstance(nn._tree.fetch(), type(snn._tree)) + + # test return_distance=False + ret = nn.kneighbors(Y, return_distance=False) + + result = ret.fetch() + np.testing.assert_almost_equal(result, expected[1]) + + # test y is x + ret = nn.kneighbors() + + expected = snn.kneighbors() + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test y is x, and return_distance=False + ret = nn.kneighbors(return_distance=False) + + result = ret.fetch() + np.testing.assert_almost_equal(result, expected[1]) + + # test callable metric + metric = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + for algo in ["brute", "ball_tree"]: + nn = NearestNeighbors(n_neighbors=3, algorithm=algo, metric=metric) + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3, algorithm=algo, metric=metric) + snn.fit(raw_X) + expected = snn.kneighbors(raw_Y) + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test sparse + raw_sparse_x = sps.random(10, 5, density=0.5, format="csr", random_state=rs) + raw_sparse_y = sps.random(8, 5, density=0.4, format="csr", random_state=rs) + + X = mt.tensor(raw_sparse_x, chunk_size=7) + Y = mt.tensor(raw_sparse_y, chunk_size=5) + + nn = NearestNeighbors(n_neighbors=3) + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3) + snn.fit(raw_sparse_x) + expected = snn.kneighbors(raw_sparse_y) + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test input with unknown shape + X = mt.tensor(raw_X, chunk_size=7) + X = X[X[:, 0] > 0.1] + Y = mt.tensor(raw_Y, chunk_size=(5, 3)) + Y = Y[Y[:, 0] > 0.1] + + nn = NearestNeighbors(n_neighbors=3) + nn.fit(X) + + ret = nn.kneighbors(Y) + + x2 = raw_X[raw_X[:, 0] > 0.1] + y2 = raw_Y[raw_Y[:, 0] > 0.1] + snn = SkNearestNeighbors(n_neighbors=3) + snn.fit(x2) + expected = snn.kneighbors(y2) + + result = ret.fetch() + assert nn._fit_method == 
snn._fit_method + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test fit a sklearn tree + nn = NearestNeighbors(n_neighbors=3) + nn.fit(snn._tree) + + ret = nn.kneighbors(Y) + result = ret.fetch() + assert nn._fit_method == snn._fit_method + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + +def test_k_neighbors_graph_execution(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 5) + raw_Y = rs.rand(8, 5) + + X = mt.tensor(raw_X, chunk_size=7) + Y = mt.tensor(raw_Y, chunk_size=(5, 3)) + + neigh = NearestNeighbors(n_neighbors=3) + neigh.fit(X) + sklearn_neigh = SkNearestNeighbors(n_neighbors=3) + sklearn_neigh.fit(raw_X) + + for mode in ["connectivity", "distance"]: + graph = neigh.kneighbors_graph(Y, mode=mode) + result = graph.fetch() + + assert isinstance(result, SparseNDArray) + assert len(tile(graph).chunks) > 1 + + expected = sklearn_neigh.kneighbors_graph(raw_Y, mode=mode) + + np.testing.assert_array_equal(result.toarray(), expected.toarray()) + + graph2 = neigh.kneighbors_graph(mode=mode) + result2 = graph2.fetch() + + assert isinstance(result2, SparseNDArray) + + expected2 = sklearn_neigh.kneighbors_graph(mode=mode) + + np.testing.assert_array_equal(result2.toarray(), expected2.toarray()) + + X = [[0], [3], [1]] + + neigh = NearestNeighbors(n_neighbors=2) + sklearn_neigh = SkNearestNeighbors(n_neighbors=2) + neigh.fit(X) + sklearn_neigh.fit(X) + + A = neigh.kneighbors_graph(X).fetch() + expected_A = sklearn_neigh.kneighbors_graph(X) + np.testing.assert_array_equal(A.toarray(), expected_A.toarray()) + + # test wrong mode + with pytest.raises(ValueError): + _ = neigh.kneighbors_graph(mode="unknown") + + +@pytest.mark.skipif(faiss is None, reason="faiss not installed") +def test_faiss_nearest_neighbors_execution(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 5) + raw_Y = rs.rand(8, 5) + + # test faiss execution + X = mt.tensor(raw_X, chunk_size=7) + Y = mt.tensor(raw_Y, chunk_size=(5, 3)) + + nn = NearestNeighbors(n_neighbors=3, algorithm="faiss", metric="l2") + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3, algorithm="auto", metric="l2") + snn.fit(raw_X) + expected = snn.kneighbors(raw_Y) + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0], decimal=6) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test return_distance=False + ret = nn.kneighbors(Y, return_distance=False) + + result = ret.fetch() + np.testing.assert_almost_equal(result, expected[1]) + + # test y is x + ret = nn.kneighbors() + + expected = snn.kneighbors() + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0], decimal=5) + np.testing.assert_almost_equal(result[1], expected[1]) + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def test_proxima_nearest_neighbors_execution(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 5).astype("float32") + raw_Y = rs.rand(8, 5).astype("float32") + + # test faiss execution + X = mt.tensor(raw_X, chunk_size=6) + Y = mt.tensor(raw_Y, chunk_size=(5, 3)) + + nn = NearestNeighbors(n_neighbors=3, algorithm="proxima", metric="l2") + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3, algorithm="auto", metric="l2") + snn.fit(raw_X) + expected = snn.kneighbors(raw_Y) + + result = [r.fetch() for r in ret] + 
np.testing.assert_almost_equal(result[0], expected[0], decimal=6) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test return_distance=False + ret = nn.kneighbors(Y, return_distance=False) + + result = ret.fetch() + np.testing.assert_almost_equal(result, expected[1]) + + # test y is x + ret = nn.kneighbors() + + expected = snn.kneighbors() + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0], decimal=5) + np.testing.assert_almost_equal(result[1], expected[1]) + + +@require_cupy +@pytest.mark.skipif( + cupy is None or faiss is None, reason="either cupy or faiss not installed" +) +def test_gpu_faiss_nearest_neighbors_execution(setup_gpu): + rs = np.random.RandomState(0) + + raw_X = rs.rand(10, 5) + raw_Y = rs.rand(8, 5) + + # test faiss execution + X = mt.tensor(raw_X, chunk_size=7).to_gpu() + Y = mt.tensor(raw_Y, chunk_size=8).to_gpu() + + nn = NearestNeighbors(n_neighbors=3, algorithm="faiss", metric="l2") + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3, algorithm="auto", metric="l2") + snn.fit(raw_X) + expected = snn.kneighbors(raw_Y) + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0].get(), expected[0], decimal=6) + np.testing.assert_almost_equal(result[1].get(), expected[1]) diff --git a/python/xorbits/_mars/learn/neighbors/tree.py b/python/xorbits/_mars/learn/neighbors/tree.py new file mode 100644 index 000000000..a6ae58999 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/tree.py @@ -0,0 +1,280 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
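tree.py below implements the ball_tree/kd_tree query path that the tests above exercise; a small sketch of selecting it explicitly, under the same session and import-path assumptions as before (a prebuilt scikit-learn tree may also be handed to fit(), as the tests show):

import numpy as np
from sklearn.neighbors import KDTree
from xorbits._mars import tensor as mt
from xorbits._mars.learn.neighbors import NearestNeighbors

raw = np.random.RandomState(0).rand(30, 3)

nn = NearestNeighbors(n_neighbors=2, algorithm="kd_tree")
nn.fit(mt.tensor(raw, chunk_size=10))
dist, ind = nn.kneighbors(raw[:4])     # queries may be plain numpy arrays
print(ind.fetch())

nn2 = NearestNeighbors(n_neighbors=2).fit(KDTree(raw))   # reuse an existing sklearn KDTree
print(nn2.kneighbors(raw[:4], return_distance=False).fetch())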
+ +import cloudpickle +import numpy as np + +from ...core import OBJECT_CHUNK_TYPE, OBJECT_TYPE, Object, recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + KeyField, +) +from ...tensor.core import TensorOrder +from ...utils import has_unknown_shape, tokenize +from ..operands import LearnOperand, LearnOperandMixin, OutputType + + +class TreeObject(Object): + def fetch(self, session=None, **kw): + result = self._data.fetch(session=session, **kw) + return cloudpickle.loads(result) if isinstance(result, bytes) else result + + +class TreeBase(LearnOperand, LearnOperandMixin): + _input = KeyField("input") + _leaf_size = Int32Field("leaf_size") + _metric = AnyField("metric") + + _metric_params = DictField("metric_params") + + def __init__( + self, leaf_size=None, metric=None, metric_params=None, output_types=None, **kw + ): + super().__init__( + _leaf_size=leaf_size, + _metric=metric, + _metric_params=metric_params, + _output_types=output_types, + **kw + ) + if self.output_types is None: + self.output_types = [OutputType.object] + + @property + def input(self): + return self._input + + @property + def leaf_size(self): + return self._leaf_size + + @property + def metric(self): + return self._metric + + @property + def metric_params(self): + return self._metric_params + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + return self.new_tileable([a]) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + + # ball tree and kd tree requires the full data, + # thus rechunk input tensor into 1 chunk + inp = op.input.rechunk({ax: s for ax, s in enumerate(op.input.shape)}) + inp = yield from recursive_tile(inp) + out = op.outputs[0] + + chunk_op = op.copy().reset_key() + kw = out.params + kw["index"] = inp.chunks[0].index + chunk = chunk_op.new_chunk([inp.chunks[0]], kws=[kw]) + + new_op = op.copy() + tileable_kw = out.params + tileable_kw["nsplits"] = ((1,),) + tileable_kw["chunks"] = [chunk] + return new_op.new_tileables(op.inputs, kws=[tileable_kw]) + + @classmethod + def execute(cls, ctx, op): + if op.gpu: # pragma: no cover + raise NotImplementedError( + "Does not support tree-based nearest neighbors on GPU" + ) + + a = ctx[op.input.key] + tree = cls._tree_type( + a, op.leaf_size, metric=op.metric, **(op.metric_params or dict()) + ) + ctx[op.outputs[0].key] = tree + + +def _on_serialize_tree(tree): + return cloudpickle.dumps(tree) if not hasattr(tree, "key") else tree + + +def _on_deserialize_tree(ser): + return cloudpickle.loads(ser) if isinstance(ser, bytes) else ser + + +class TreeQueryBase(LearnOperand, LearnOperandMixin): + _input = KeyField("input") + _tree = AnyField( + "tree", on_serialize=_on_serialize_tree, on_deserialize=_on_deserialize_tree + ) + _n_neighbors = Int32Field("n_neighbors") + _return_distance = BoolField("return_distance") + + def __init__( + self, tree=None, n_neighbors=None, return_distance=None, output_types=None, **kw + ): + super().__init__( + _tree=tree, + _n_neighbors=n_neighbors, + _return_distance=return_distance, + _output_types=output_types, + **kw + ) + if self.output_types is None: + self.output_types = [OutputType.tensor] * self.output_limit + + @property + def input(self): + return self._input + + @property + def tree(self): + return self._tree + + @property + def n_neighbors(self): + return self._n_neighbors + + @property + def return_distance(self): + return self._return_distance + + @property + 
def output_limit(self): + return 2 if self._return_distance else 1 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if isinstance(self._tree, (OBJECT_TYPE, OBJECT_CHUNK_TYPE)): + self._tree = self._inputs[1] + + def _update_key(self): + values = [] + for value in self._values_: + if isinstance(value, self._tree_type): + values.append(cloudpickle.dumps(value)) + else: + values.append(value) + self._obj_set("_key", tokenize(type(self).__name__, *values)) + return self + + def __call__(self, x): + kws = [] + if self._return_distance: + kws.append( + { + "shape": (x.shape[0], self._n_neighbors), + "dtype": np.dtype(np.float64), + "order": x.order, + "type": "distance", + } + ) + kws.append( + { + "shape": (x.shape[0], self._n_neighbors), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "type": "indices", + } + ) + inputs = [x] + if isinstance(self._tree, OBJECT_TYPE): + inputs.append(self._tree) + return self.new_tileables(inputs, kws=kws, output_limit=len(kws)) + + @classmethod + def tile(cls, op): + inp = op.input + + if inp.chunk_shape[1] != 1: + if has_unknown_shape(inp): + yield + inp = yield from recursive_tile(inp.rechunk({1: inp.shape[1]})) + + tree_chunk = None + if isinstance(op.tree, OBJECT_TYPE): + tree_chunk = op.tree.chunks[0] + out_chunks = [[] for _ in range(len(op.outputs))] + for chunk in inp.chunks: + chunk_op = op.copy().reset_key() + if tree_chunk is not None: + chunk_op._tree = tree_chunk + chunk_kws = [] + if op.return_distance: + chunk_kws.append( + { + "shape": (chunk.shape[0], op.n_neighbors), + "dtype": np.dtype(np.float64), + "order": chunk.order, + "index": chunk.index, + "type": "distance", + } + ) + chunk_kws.append( + { + "shape": (chunk.shape[0], op.n_neighbors), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "index": chunk.index, + "type": "indices", + } + ) + chunk_inputs = [chunk] + if tree_chunk is not None: + chunk_inputs.append(tree_chunk) + chunks = chunk_op.new_chunks( + chunk_inputs, kws=chunk_kws, output_limit=len(chunk_kws) + ) + for cs, c in zip(out_chunks, chunks): + cs.append(c) + + kws = [o.params for o in op.outputs] + nsplits = list(inp.nsplits) + nsplits[1] = (op.n_neighbors,) + if op.return_distance: + kws[0]["chunks"] = out_chunks[0] + kws[0]["nsplits"] = tuple(nsplits) + kws[-1]["chunks"] = out_chunks[-1] + kws[-1]["nsplits"] = tuple(nsplits) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=kws, output_limit=len(kws)) + + @classmethod + def execute(cls, ctx, op): + if op.gpu: # pragma: no cover + raise NotImplementedError( + "Does not support tree-based nearest neighbors on GPU" + ) + + x = ctx[op.input.key] + if len(op.inputs) == 2: + tree = ctx[op.tree.key] + else: + tree = op.tree + tree = cloudpickle.loads(tree) if isinstance(tree, bytes) else tree + ret = tree.query(x, op.n_neighbors, op.return_distance) + if op.return_distance: + ctx[op.outputs[0].key] = ret[0] + ctx[op.outputs[1].key] = ret[1] + else: + ctx[op.outputs[0].key] = ret diff --git a/python/xorbits/_mars/learn/neighbors/unsupervised.py b/python/xorbits/_mars/learn/neighbors/unsupervised.py new file mode 100644 index 000000000..bb949cd31 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/unsupervised.py @@ -0,0 +1,39 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import KNeighborsMixin, NeighborsBase, UnsupervisedMixin + + +class NearestNeighbors(NeighborsBase, KNeighborsMixin, UnsupervisedMixin): + def __init__( + self, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + **kwargs + ): + super().__init__( + n_neighbors=n_neighbors, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + **kwargs + ) diff --git a/python/xorbits/_mars/learn/operands.py b/python/xorbits/_mars/learn/operands.py new file mode 100644 index 000000000..0c5e007c3 --- /dev/null +++ b/python/xorbits/_mars/learn/operands.py @@ -0,0 +1,88 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..core import OutputType +from ..core.operand import ( + Fuse, + FuseChunkMixin, + Operand, + ShuffleProxy, + TileableOperandMixin, +) +from ..dataframe.core import CHUNK_TYPE as DATAFRAME_CHUNK_TYPE +from ..dataframe.core import TILEABLE_TYPE as DATAFRAME_TYPE +from ..dataframe.operands import DataFrameFuseChunk, DataFrameOperandMixin +from ..tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ..tensor.fuse import TensorFuseChunk +from ..tensor.operands import TensorOperandMixin + +LearnOperand = Operand + + +class LearnOperandMixin(TileableOperandMixin): + __slots__ = () + _op_module_ = "learn" + + @classmethod + def concat_tileable_chunks(cls, tileable): + if isinstance(tileable, TENSOR_TYPE): + return TensorOperandMixin.concat_tileable_chunks(tileable) + elif isinstance(tileable, DATAFRAME_TYPE): + return DataFrameOperandMixin.concat_tileable_chunks(tileable) + else: + # op has to implement its logic of `concat_tileable_chunks` + raise NotImplementedError + + @classmethod + def create_tileable_from_chunks(cls, chunks, inputs=None, **kw): + if isinstance(chunks[0], TENSOR_CHUNK_TYPE): + return TensorOperandMixin.create_tileable_from_chunks( + chunks, inputs=inputs, **kw + ) + elif isinstance(chunks[0], DATAFRAME_CHUNK_TYPE): + return DataFrameOperandMixin.create_tileable_from_chunks( + chunks, inputs=inputs, **kw + ) + else: + # op has to implement its logic of `create_tileable_from_chunks` + raise NotImplementedError + + def get_fuse_op_cls(self, obj): + if isinstance(obj, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + return TensorFuseChunk + elif isinstance(obj, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE)): + return DataFrameFuseChunk + else: + return LearnObjectFuseChunk + + +class LearnObjectFuseChunkMixin(FuseChunkMixin, LearnOperandMixin): + __slots__ = () + + _output_type_ = OutputType.object + + +class 
LearnObjectFuseChunk(LearnObjectFuseChunkMixin, Fuse): + pass + + +class LearnShuffleProxy(ShuffleProxy, LearnOperandMixin): + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + if not self.output_types: + self.output_types = [OutputType.object] + + @classmethod + def execute(cls, ctx, op): + pass diff --git a/python/xorbits/_mars/learn/preprocessing/__init__.py b/python/xorbits/_mars/learn/preprocessing/__init__.py new file mode 100644 index 000000000..7162b0e27 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._data import MinMaxScaler, minmax_scale +from ._label import LabelBinarizer, LabelEncoder, label_binarize +from .normalize import normalize diff --git a/python/xorbits/_mars/learn/preprocessing/_data.py b/python/xorbits/_mars/learn/preprocessing/_data.py new file mode 100644 index 000000000..7b33815e8 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/_data.py @@ -0,0 +1,400 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from sklearn.base import TransformerMixin +from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted + +from ... import tensor as mt +from ...tensor.core import TENSOR_TYPE +from ..base import BaseEstimator +from ..utils.validation import check_array + + +def _handle_zeros_in_scale(scale, copy=True): + """Makes sure that whenever scale is zero, we handle it correctly. + + This happens in most scalers when we have constant features. + """ + + # if we are fitting on 1D arrays, scale might be a scalar + if np.isscalar(scale): # pragma: no cover + if scale == 0.0: + scale = 1.0 + return scale + elif hasattr(scale, "ndim") and scale.ndim == 0: # pragma: no cover + # scalar that is tensor + return mt.where(scale == 0.0, 1.0, scale) + elif isinstance(scale, (np.ndarray, TENSOR_TYPE)): + if copy: + # New array to avoid side-effects + scale = scale.copy() + scale[scale == 0.0] = 1.0 + return scale + + +class MinMaxScaler(TransformerMixin, BaseEstimator): + """Transform features by scaling each feature to a given range. + + This estimator scales and translates each feature individually such + that it is in the given range on the training set, e.g. between + zero and one. 
+ + The transformation is given by:: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + copy : bool, default=True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array). + + clip: bool, default=False + Set to True to clip transformed values of held-out data to + provided `feature range`. + + Attributes + ---------- + min_ : Tensor of shape (n_features,) + Per feature adjustment for minimum. Equivalent to + ``min - X.min(axis=0) * self.scale_`` + + scale_ : Tensor of shape (n_features,) + Per feature relative scaling of the data. Equivalent to + ``(max - min) / (X.max(axis=0) - X.min(axis=0))`` + + data_min_ : ndarray of shape (n_features,) + Per feature minimum seen in the data + + data_max_ : ndarray of shape (n_features,) + Per feature maximum seen in the data + + data_range_ : ndarray of shape (n_features,) + Per feature range ``(data_max_ - data_min_)`` seen in the data + + n_samples_seen_ : int + The number of samples processed by the estimator. + It will be reset on new calls to fit, but increments across + ``partial_fit`` calls. + + Examples + -------- + >>> from mars.learn.preprocessing import MinMaxScaler + >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]] + >>> scaler = MinMaxScaler() + >>> print(scaler.fit(data)) + MinMaxScaler() + >>> print(scaler.data_max_) + [ 1. 18.] + >>> print(scaler.transform(data)) + [[0. 0. ] + [0.25 0.25] + [0.5 0.5 ] + [1. 1. ]] + >>> print(scaler.transform([[2, 2]])) + [[1.5 0. ]] + + See Also + -------- + minmax_scale : Equivalent function without the estimator API. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + For a comparison of the different scalers, transformers, and normalizers, + see :ref:`examples/preprocessing/plot_all_scaling.py + `. + """ + + def __init__(self, feature_range=(0, 1), copy=True, clip=False): + self.feature_range = feature_range + self.copy = copy + self.clip = clip + + def _reset(self): # pragma: no cover + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. + """ + + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.min_ + del self.n_samples_seen_ + del self.data_min_ + del self.data_max_ + del self.data_range_ + + def fit(self, X, y=None, session=None, run_kwargs=None): + """Compute the minimum and maximum to be used for later scaling. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y, session=session, run_kwargs=run_kwargs) + + def partial_fit(self, X, y=None, session=None, run_kwargs=None): + """Online computation of min and max on X for later scaling. + + All of X is processed as a single batch. 
This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + feature_range = self.feature_range + if feature_range[0] >= feature_range[1]: + raise ValueError( + "Minimum of desired feature range must be smaller" + " than maximum. Got %s." % str(feature_range) + ) + + if mt.tensor(X).issparse(): # pragma: no cover + raise TypeError( + "MinMaxScaler does not support sparse input. " + "Consider using MaxAbsScaler instead." + ) + + first_pass = not hasattr(self, "n_samples_seen_") + X = self._validate_data( + X, + reset=first_pass, + estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite="allow-nan", + ) + + if np.isnan(X.shape[0]): # pragma: no cover + X.execute(session=session, **(run_kwargs or dict())) + + data_min = mt.nanmin(X, axis=0) + data_max = mt.nanmax(X, axis=0) + + if first_pass: + self.n_samples_seen_ = X.shape[0] + else: + data_min = mt.minimum( + self.data_min_, data_min + ) # pylint: disable=access-member-before-definition + data_max = mt.maximum( + self.data_max_, data_max + ) # pylint: disable=access-member-before-definition + self.n_samples_seen_ += X.shape[0] + + data_range = data_max - data_min + self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale( + data_range + ) + self.min_ = feature_range[0] - data_min * self.scale_ + self.data_min_ = data_min + self.data_max_ = data_max + self.data_range_ = data_range + mt.ExecutableTuple( + [self.scale_, self.min_, self.data_min_, self.data_max_, self.data_range_] + ).execute(session=session, **(run_kwargs or dict())) + return self + + def transform(self, X, session=None, run_kwargs=None): + """Scale features of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + + X = self._validate_data( + X, + copy=self.copy, + dtype=FLOAT_DTYPES, + force_all_finite="allow-nan", + reset=False, + ) + + X *= self.scale_ + X += self.min_ + if self.clip: + X = mt.clip(X, self.feature_range[0], self.feature_range[1]) + return X.execute(session=session, **(run_kwargs or dict())) + + def inverse_transform(self, X, session=None, run_kwargs=None): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. It cannot be sparse. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + + X = check_array( + X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan" + ) + + X -= self.min_ + X /= self.scale_ + return X.execute(session=session, **(run_kwargs or dict())) + + def _more_tags(self): # pylint: disable=no-self-use + return {"allow_nan": True} + + +def minmax_scale( + X, feature_range=(0, 1), *, axis=0, copy=True, session=None, run_kwargs=None +): + """Transform features by scaling each feature to a given range. + + This estimator scales and translates each feature individually such + that it is in the given range on the training set, i.e. between + zero and one. 
+ + The transformation is given by (when ``axis=0``):: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + The transformation is calculated as (when ``axis=0``):: + + X_scaled = scale * X + min - X.min(axis=0) * scale + where scale = (max - min) / (X.max(axis=0) - X.min(axis=0)) + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + *minmax_scale* function interface + to :class:`~sklearn.preprocessing.MinMaxScaler`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + axis : int, default=0 + Axis used to scale along. If 0, independently scale each feature, + otherwise (if 1) scale each sample. + + copy : bool, default=True + Set to False to perform inplace scaling and avoid a copy (if the input + is already a numpy array). + + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + The transformed data. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.MinMaxScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`. + + See Also + -------- + MinMaxScaler : Performs scaling to a given range using the Transformer + API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + For a comparison of the different scalers, transformers, and normalizers, + see :ref:`examples/preprocessing/plot_all_scaling.py + `. + """ # noqa + # Unlike the scaler object, this function allows 1d input. + # If copy is required, it will be done inside the scaler object. + X = check_array( + X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan" + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = MinMaxScaler(feature_range=feature_range, copy=copy) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X.execute(session=session, **(run_kwargs or dict())) diff --git a/python/xorbits/_mars/learn/preprocessing/_label.py b/python/xorbits/_mars/learn/preprocessing/_label.py new file mode 100644 index 000000000..83d537233 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/_label.py @@ -0,0 +1,864 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
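The _data.py code above also supports incremental fitting: partial_fit merges per-feature minima and maxima across batches, and minmax_scale (unlike the scaler object) accepts 1-D input. A small sketch under the same local-session assumption:

import numpy as np
from xorbits._mars import tensor as mt
from xorbits._mars.learn.preprocessing import MinMaxScaler, minmax_scale

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.partial_fit(mt.tensor(np.array([[-1.0, 2.0], [-0.5, 6.0]])))   # first batch
scaler.partial_fit(mt.tensor(np.array([[0.0, 10.0], [1.0, 18.0]])))   # second batch merges min/max
print(scaler.data_min_.fetch(), scaler.data_max_.fetch())             # [-1.  2.] [ 1. 18.]
print(scaler.transform([[2.0, 2.0]]).fetch())                         # [[1.5 0. ]]; no clipping by default

print(minmax_scale([1.0, 5.0, 9.0]).fetch())                          # [0.  0.5 1. ]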
+ +from typing import Union + +import numpy as np +import scipy.sparse as sp +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.preprocessing import label_binarize as sklearn_label_binarize +from sklearn.utils.sparsefuncs import min_max_axis + +from ... import execute as execute_tileable +from ... import fetch as fetch_tileable +from ... import opcodes +from ... import tensor as mt +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...core.context import Context, get_context +from ...lib.sparse import SparseNDArray +from ...serialization.serializables import AnyField, BoolField, Int32Field, StringField +from ...tensor.core import TensorOrder +from ...typing import TileableType +from ..operands import LearnOperand, LearnOperandMixin +from ..utils import column_or_1d +from ..utils._encode import _encode, _unique +from ..utils.multiclass import type_of_target, unique_labels +from ..utils.validation import _num_samples, check_array, check_is_fitted + + +class LabelEncoder(TransformerMixin, BaseEstimator): + """Encode target labels with value between 0 and n_classes-1. + + This transformer should be used to encode target values, *i.e.* `y`, and + not the input `X`. + + Read more in the :ref:`User Guide `. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + See Also + -------- + OrdinalEncoder : Encode categorical features using an ordinal encoding + scheme. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + + Examples + -------- + `LabelEncoder` can be used to normalize labels. + + >>> from sklearn import preprocessing + >>> le = preprocessing.LabelEncoder() + >>> le.fit([1, 2, 2, 6]) + LabelEncoder() + >>> le.classes_ + array([1, 2, 6]) + >>> le.transform([1, 1, 2, 6]) + array([0, 0, 1, 2]...) + >>> le.inverse_transform([0, 0, 1, 2]) + array([1, 1, 2, 6]) + + It can also be used to transform non-numerical labels (as long as they are + hashable and comparable) to numerical labels. + + >>> le = preprocessing.LabelEncoder() + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder() + >>> list(le.classes_) + ['amsterdam', 'paris', 'tokyo'] + >>> le.transform(["tokyo", "tokyo", "paris"]) + array([2, 2, 1]...) + >>> list(le.inverse_transform([2, 2, 1])) + ['tokyo', 'tokyo', 'paris'] + """ + + def fit(self, y, session=None, run_kwargs=None, execute=True): + """Fit label encoder. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : returns an instance of self. + Fitted label encoder. + """ + y = column_or_1d(y, warn=True) + self.classes_ = _unique(y) + if execute: + self.classes_ = execute_tileable( + self.classes_, session=session, **(run_kwargs or dict()) + ) + return self + + def fit_transform(self, y, session=None, run_kwargs=None): + """Fit label encoder and return encoded labels. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Encoded labels. + """ + y = column_or_1d(y, warn=True) + self.classes_, y = execute_tileable( + _unique(y, return_inverse=True), session=session, **(run_kwargs or dict()) + ) + return y + + def transform(self, y, session=None, run_kwargs=None, execute=True): + """Transform labels to normalized encoding. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Labels as normalized encodings. 
+ """ + check_is_fitted(self) + y = column_or_1d(y, warn=True) + # transform of empty array is empty array + if _num_samples(y) == 0: + return mt.array([]) + + t = _encode(y, uniques=self.classes_) + if execute: + t = t.execute(session=session, **(run_kwargs or dict())) + return t + + def inverse_transform(self, y, session=None, run_kwargs=None): + """Transform labels back to original encoding. + + Parameters + ---------- + y : ndarray of shape (n_samples,) + Target values. + + Returns + ------- + y : ndarray of shape (n_samples,) + Original encoding. + """ + check_is_fitted(self) + y = column_or_1d(y, warn=True) + # inverse transform of empty array is empty array + if _num_samples(y) == 0: + return mt.array([]) + + def _class_checker(chunk_data, classes_data): + diff = np.setdiff1d(chunk_data, np.arange(len(classes_data))) + if len(diff): + raise ValueError("y contains previously unseen labels: %s" % str(diff)) + return chunk_data + + y = mt.asarray(y).map_chunk(_class_checker, args=(self.classes_,)) + return self.classes_[y].execute(session=session, **(run_kwargs or dict())) + + def _more_tags(self): + return {"X_types": ["1dlabels"]} + + +class LabelBinarizer(TransformerMixin, BaseEstimator): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + At learning time, this simply consists in learning one regressor + or binary classifier per class. In doing so, one needs to convert + multi-class labels to binary labels (belong or does not belong + to the class). LabelBinarizer makes this process easy with the + transform method. + + At prediction time, one assigns the class for which the corresponding + model gave the greatest confidence. LabelBinarizer makes this easy + with the inverse_transform method. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + neg_label : int, default=0 + Value with which negative labels must be encoded. + + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False + True if the returned array from transform is desired to be in sparse + CSR format. + + Attributes + ---------- + + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + y_type_ : str + Represents the type of the target data as evaluated by + utils.multiclass.type_of_target. Possible type are 'continuous', + 'continuous-multioutput', 'binary', 'multiclass', + 'multiclass-multioutput', 'multilabel-indicator', and 'unknown'. + + sparse_input_ : bool + True if the input data to transform is given as a sparse matrix, False + otherwise. 
+ + Examples + -------- + >>> from mars.learn import preprocessing + >>> lb = preprocessing.LabelBinarizer() + >>> lb.fit([1, 2, 6, 4, 2]) + LabelBinarizer() + >>> lb.classes_ + array([1, 2, 4, 6]) + >>> lb.transform([1, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + Binary targets transform to a column vector + + >>> lb = preprocessing.LabelBinarizer() + >>> lb.fit_transform(['yes', 'no', 'no', 'yes']) + array([[1], + [0], + [0], + [1]]) + + Passing a 2D matrix for multilabel classification + + >>> import numpy as np + >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]])) + LabelBinarizer() + >>> lb.classes_ + array([0, 1, 2]) + >>> lb.transform([0, 1, 2, 1]) + array([[1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [0, 1, 0]]) + + See Also + -------- + label_binarize : Function to perform the transform operation of + LabelBinarizer with fixed classes. + OneHotEncoder : Encode categorical features using a one-hot aka one-of-K + scheme. + """ + + def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): + if neg_label >= pos_label: + raise ValueError( + "neg_label={0} must be strictly less than " + "pos_label={1}.".format(neg_label, pos_label) + ) + + if sparse_output and (pos_label == 0 or neg_label != 0): + raise ValueError( + "Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(pos_label, neg_label) + ) + + self.neg_label = neg_label + self.pos_label = pos_label + self.sparse_output = sparse_output + + def fit(self, y, session=None, run_kwargs=None): + """Fit label binarizer. + + Parameters + ---------- + y : ndarray of shape (n_samples,) or (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. + + Returns + ------- + self : returns an instance of self. + """ + self.y_type_ = fetch_tileable( + execute_tileable( + type_of_target(y), session=session, **(run_kwargs or dict()) + ) + ) + if "multioutput" in self.y_type_: + raise ValueError( + "Multioutput target data is not supported with label binarization" + ) + if _num_samples(y) == 0: # pragma: no cover + raise ValueError("y has 0 samples: %r" % y) + + self.sparse_input_ = mt.tensor(y).issparse() + self.classes_ = unique_labels(y).execute( + session=session, **(run_kwargs or dict()) + ) + return self + + def fit_transform(self, y, session=None, run_kwargs=None): + """Fit label binarizer and transform multi-class labels to binary + labels. + + The output of transform is sometimes referred to as + the 1-of-K coding scheme. + + Parameters + ---------- + y : {ndarray, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. Sparse matrix can be + CSR, CSC, COO, DOK, or LIL. + + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix + will be of CSR format. + """ + return self.fit(y, session=session, run_kwargs=run_kwargs).transform( + y, session=session, run_kwargs=run_kwargs + ) + + def transform(self, y, session=None, run_kwargs=None): + """Transform multi-class labels to binary labels. + + The output of transform is sometimes referred to by some authors as + the 1-of-K coding scheme. + + Parameters + ---------- + y : {array, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. 
Sparse matrix can be + CSR, CSC, COO, DOK, or LIL. + + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix + will be of CSR format. + """ + check_is_fitted(self) + + target = fetch_tileable( + execute_tileable( + type_of_target(y), session=session, **(run_kwargs or dict()) + ) + ) + y_is_multilabel = target.startswith("multilabel") + if y_is_multilabel and not self.y_type_.startswith("multilabel"): + raise ValueError("The object was not fitted with multilabel input.") + + return label_binarize( + y, + classes=self.classes_, + pos_label=self.pos_label, + neg_label=self.neg_label, + sparse_output=self.sparse_output, + ) + + def inverse_transform(self, Y, threshold=None): + """Transform binary labels back to multi-class labels. + + Parameters + ---------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Target values. All sparse matrices are converted to CSR before + inverse transformation. + + threshold : float, default=None + Threshold used in the binary and multi-label cases. + + Use 0 when ``Y`` contains the output of decision_function + (classifier). + Use 0.5 when ``Y`` contains the output of predict_proba. + + If None, the threshold is assumed to be half way between + neg_label and pos_label. + + Returns + ------- + y : {ndarray, sparse matrix} of shape (n_samples,) + Target values. Sparse matrix will be of CSR format. + + Notes + ----- + In the case when the binary labels are fractional + (probabilistic), inverse_transform chooses the class with the + greatest value. Typically, this allows to use the output of a + linear model's decision_function method directly as the input + of inverse_transform. + """ + check_is_fitted(self) + + if threshold is None: + threshold = (self.pos_label + self.neg_label) / 2.0 + + Y = mt.asarray(Y) + if self.y_type_ == "multiclass": + y_inv = Y.map_chunk( + _inverse_binarize_multiclass, + args=(self.classes_,), + dtype=self.classes_.dtype, + shape=(Y.shape[0],), + ) + else: + shape = (Y.shape[0],) if self.y_type_ != "multilabel-indicator" else Y.shape + y_inv = Y.map_chunk( + _inverse_binarize_thresholding, + args=(self.y_type_, self.classes_, threshold), + dtype=self.classes_.dtype, + shape=shape, + ) + + if self.sparse_input_: + y_inv = y_inv.tosparse() + elif y_inv.issparse(): + y_inv = y_inv.todense() + + return y_inv + + def _more_tags( + self, + ): # pragma: no cover # noqa: R0201 # pylint: disable=no-self-use + return {"X_types": ["1dlabels"]} + + +class LabelBinarize(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.LABEL_BINARIZE + + y = AnyField("y") + classes = AnyField("classes") + neg_label = Int32Field("neg_label") + pos_label = Int32Field("pos_label") + sparse_output = BoolField("sparse_output") + # for chunk + y_type = StringField("y_type") + pos_switch = BoolField("pos_switch") + + def __call__(self, y: TileableType, classes: TileableType): + inputs = [] + if not isinstance(y, list): + # XXX Workaround that will be removed when list of list format is + # dropped + self.y = y = check_array(y, accept_sparse=True, ensure_2d=False, dtype=None) + if isinstance(y, ENTITY_TYPE): + inputs.append(y) + if isinstance(classes, ENTITY_TYPE): + inputs.append(classes) + self.sparse = self.sparse_output + self.output_types = [OutputType.tensor] + if len(classes) == 2: + n_dim1 = 1 + else: + n_dim1 = len(classes) + return self.new_tileable( + inputs, + shape=(np.nan, n_dim1), + dtype=np.dtype(int), + order=TensorOrder.C_ORDER, + ) + + def 
_set_inputs(self, inputs): + super()._set_inputs(inputs) + if isinstance(self.y, ENTITY_TYPE): + self.y = self._inputs[0] + if isinstance(self.classes, ENTITY_TYPE): + self.classes = self._inputs[-1] + + @classmethod + def tile(cls, op: "LabelBinarize"): + y = op.y + classes = op.classes + neg_label = op.neg_label + pos_label = op.pos_label + sparse_output = op.sparse_output + out = op.outputs[0] + ctx = get_context() + + if isinstance(y, list): + if _num_samples(y) == 0: + raise ValueError("y has 0 samples: %r" % y) + + if len(op.inputs) == 0: + # no entity input + r = sklearn_label_binarize( + op.y, + classes=op.classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + return (yield from recursive_tile(mt.tensor(r))) + else: + # trigger execution + yield + + if neg_label >= pos_label: + raise ValueError( + "neg_label={0} must be strictly less than " + "pos_label={1}.".format(neg_label, pos_label) + ) + + if sparse_output and (pos_label == 0 or neg_label != 0): + raise ValueError( + "Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(pos_label, neg_label) + ) + + # To account for pos_label == 0 in the dense case + pos_switch = pos_label == 0 + if pos_switch: + pos_label = -neg_label + + y_type = yield from recursive_tile(type_of_target(y)) + yield y_type.chunks + y_type = ctx.get_chunks_result([y_type.chunks[0].key])[0] + y_type = y_type.item() if hasattr(y_type, "item") else y_type + if "multioutput" in y_type: + raise ValueError( + "Multioutput target data is not supported with label binarization" + ) + if y_type == "unknown": + raise ValueError("The type of target data is not known") + + n_samples = mt.tensor(y).shape[0] + n_classes = len(classes) + + if y_type == "binary": + if n_classes == 1: + if sparse_output: + return ( + yield from recursive_tile( + mt.zeros((n_samples, 1), dtype=int, sparse=True) + ) + ) + else: + Y = mt.zeros((len(y), 1), dtype=int) + Y += neg_label + return (yield from recursive_tile(Y)) + elif len(classes) >= 3: + y_type = "multiclass" + + if y_type == "multilabel-indicator": + y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0]) + if mt.tensor(classes).size != y_n_classes: + raise ValueError( + "classes {0} mismatch with the labels {1}" + " found in the data".format(classes, unique_labels(y)) + ) + + if y_type in ("binary", "multiclass"): + y = yield from recursive_tile(column_or_1d(y)) + if y_type == "binary": + out_shape = (n_samples, 1) + else: + out_shape = (n_samples, n_classes) + elif y_type == "multilabel-indicator": + out_shape = y.shape + else: + raise ValueError( + "%s target data is not supported with label binarization" % y_type + ) + + out_chunks = [] + for y_chunk in y.chunks: + chunk_inputs = [y_chunk] + classes_chunk = classes + if isinstance(classes, ENTITY_TYPE): + chunk_inputs.append(classes.chunks[0]) + classes_chunk = classes.chunks[0] + chunk_op = LabelBinarize( + y=y_chunk, + classes=classes_chunk, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + y_type=y_type, + pos_switch=pos_switch, + _output_types=op.output_types, + ) + if len(out_shape) == 2: + chunk_shape = (y_chunk.shape[0], out_shape[1]) + chunk_index = (y_chunk.index[0], 0) + else: # pragma: no cover + chunk_shape = (y_chunk.shape[0],) + chunk_index = (y_chunk.index[0],) + out_chunk = chunk_op.new_chunk( + chunk_inputs, + shape=chunk_shape, + dtype=out.dtype, + order=out.order, + index=chunk_index, + ) + 
out_chunks.append(out_chunk) + + params = out.params.copy() + params["chunks"] = out_chunks + params["shape"] = out_shape + if len(out_shape) == 2: + nsplits = (y.nsplits[0], (out_shape[1],)) + else: # pragma: no cover + nsplits = (y.nsplits[0],) + params["nsplits"] = nsplits + return op.copy().new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "LabelBinarize"): + y = ctx[op.y.key] + if hasattr(y, "raw"): + # SparseNDArray + y = y.raw + if isinstance(op.classes, ENTITY_TYPE): + classes = ctx[op.classes.key] + else: + classes = op.classes + y_type = op.y_type + sparse_output = op.sparse_output + pos_label = op.pos_label + neg_label = op.neg_label + pos_switch = op.pos_switch + + n_samples = y.shape[0] if sp.issparse(y) else len(y) + n_classes = len(classes) + sorted_class = np.sort(classes) + + if y_type in ("binary", "multiclass"): + # pick out the known labels from y + y_in_classes = np.in1d(y, classes) + y_seen = y[y_in_classes] + indices = np.searchsorted(sorted_class, y_seen) + indptr = np.hstack((0, np.cumsum(y_in_classes))) + + data = np.empty_like(indices) + data.fill(pos_label) + Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes)) + elif y_type == "multilabel-indicator": + Y = sp.csr_matrix(y) + if pos_label != 1: + data = np.empty_like(Y.data) + data.fill(pos_label) + Y.data = data + else: # pragma: no cover + raise ValueError( + "%s target data is not supported with label binarization" % y_type + ) + + if not sparse_output: + Y = Y.toarray() + Y = Y.astype(int, copy=False) + + if neg_label != 0: + Y[Y == 0] = neg_label + + if pos_switch: + Y[Y == pos_label] = 0 + else: + Y.data = Y.data.astype(int, copy=False) + + # preserve label ordering + if np.any(classes != sorted_class): + indices = np.searchsorted(sorted_class, classes) + Y = Y[:, indices] + + if y_type == "binary": + if sparse_output: + Y = Y.getcol(-1) + else: + Y = Y[:, -1].reshape((-1, 1)) + + if sp.issparse(Y): + Y = SparseNDArray(Y) + ctx[op.outputs[0].key] = Y + + +def label_binarize( + y, *, classes, neg_label=0, pos_label=1, sparse_output=False, execute=True +): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + This function makes it possible to compute this transformation for a + fixed set of class labels known ahead of time. + + Parameters + ---------- + y : array-like + Sequence of integer labels or multilabel data to encode. + + classes : array-like of shape (n_classes,) + Uniquely holds the label for each class. + + neg_label : int, default=0 + Value with which negative labels must be encoded. + + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False, + Set to true if output binary array is desired in CSR sparse format. + + Returns + ------- + Y : {tensor, sparse tensor} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. 
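+        Sparse tensor will be of CSR format.
+
+    Notes
+    -----
+    When ``execute`` is False (it defaults to True), the binarized tensor is
+    returned without being executed, so it can be chained with further
+    operations before computation is triggered.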
+ + Examples + -------- + >>> from mars.learn.preprocessing import label_binarize + >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + The class ordering is preserved: + + >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) + array([[1, 0, 0, 0], + [0, 1, 0, 0]]) + + Binary targets transform to a column vector + + >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes']) + array([[1], + [0], + [0], + [1]]) + + See Also + -------- + LabelBinarizer : Class used to wrap the functionality of label_binarize and + allow for fitting to classes independently of the transform operation. + """ + op = LabelBinarize( + y=y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + result = op(y, classes) + return result.execute() if execute else result + + +def _inverse_binarize_multiclass(y, classes): # pragma: no cover + """Inverse label binarization transformation for multiclass. + + Multiclass uses the maximal score instead of a threshold. + """ + classes = np.asarray(classes) + + if sp.issparse(y): + # Find the argmax for each row in y where y is a CSR matrix + + y = y.tocsr() + n_samples, n_outputs = y.shape + outputs = np.arange(n_outputs) + row_max = min_max_axis(y, 1)[1] + row_nnz = np.diff(y.indptr) + + y_data_repeated_max = np.repeat(row_max, row_nnz) + # picks out all indices obtaining the maximum per row + y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data) + + # For corner case where last row has a max of 0 + if row_max[-1] == 0: + y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)]) + + # Gets the index of the first argmax in each row from y_i_all_argmax + index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1]) + # first argmax of each row + y_ind_ext = np.append(y.indices, [0]) + y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]] + # Handle rows of all 0 + y_i_argmax[np.where(row_nnz == 0)[0]] = 0 + + # Handles rows with max of 0 that contain negative numbers + samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)] + for i in samples: + ind = y.indices[y.indptr[i] : y.indptr[i + 1]] + y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0] + + return classes[y_i_argmax] + else: + return classes.take(y.argmax(axis=1), mode="clip") + + +def _inverse_binarize_thresholding( + y, output_type, classes, threshold +): # pragma: no cover + """Inverse label binarization transformation using thresholding.""" + + if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2: + raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape)) + + if output_type != "binary" and y.shape[1] != len(classes): + raise ValueError( + "The number of class is not equal to the number of dimension of y." 
+ ) + + classes = np.asarray(classes) + + # Perform thresholding + if sp.issparse(y): + if threshold > 0: + if y.format not in ("csr", "csc"): + y = y.tocsr() + y.data = np.array(y.data > threshold, dtype=int) + y.eliminate_zeros() + else: + y = np.array(y.toarray() > threshold, dtype=int) + else: + y = np.array(y > threshold, dtype=int) + + # Inverse transform data + if output_type == "binary": + if sp.issparse(y): + y = y.toarray() + if y.ndim == 2 and y.shape[1] == 2: + return classes[y[:, 1]] + else: + if len(classes) == 1: + return np.repeat(classes[0], len(y)) + else: + return classes[y.ravel()] + + elif output_type == "multilabel-indicator": + return y + + else: + raise ValueError("{0} format is not supported".format(output_type)) diff --git a/python/xorbits/_mars/learn/preprocessing/normalize.py b/python/xorbits/_mars/learn/preprocessing/normalize.py new file mode 100644 index 000000000..6108cec32 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/normalize.py @@ -0,0 +1,345 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + from sklearn.preprocessing import normalize as sklearn_normalize +except ImportError: # pragma: no cover + sklearn_normalize = None + +from ... import opcodes as OperandDef +from ... 
import tensor as mt +from ...core import ExecutableTuple, recursive_tile +from ...serialization.serializables import BoolField, Int32Field, KeyField, StringField +from ...tensor.array_utils import as_same_device, device, issparse, sparse +from ...tensor.core import TensorOrder +from ...tensor.operands import TensorOperand, TensorOperandMixin +from ..utils import check_array + + +class TensorNormalize(TensorOperand, TensorOperandMixin): + _op_module_ = "learn" + _op_type_ = OperandDef.NORMALIZE + + _input = KeyField("input") + _norm = StringField("norm") + _axis = Int32Field("axis") + _return_norm = BoolField("return_norm") + # for test purpose + _use_sklearn = BoolField("use_sklearn") + + def __init__(self, norm=None, axis=None, return_norm=None, use_sklearn=None, **kw): + super().__init__( + _norm=norm, + _axis=axis, + _return_norm=return_norm, + _use_sklearn=use_sklearn, + **kw, + ) + if self._use_sklearn is None: + # force to use sklearn if not specified + self._use_sklearn = True + + @property + def input(self): + return self._input + + @property + def norm(self): + return self._norm + + @property + def axis(self): + return self._axis + + @property + def return_norm(self): + return self._return_norm + + @property + def use_sklearn(self): + return self._use_sklearn + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @property + def output_limit(self): + return 2 if self._return_norm else 1 + + def __call__(self, x, copy=True): + x = check_array( + x, + accept_sparse=True, + estimator="the normalize function", + dtype=(np.float64, np.float32, np.float16), + ) + + normed = None + if not self._return_norm: + res = self.new_tensor([x], shape=x.shape, order=x.order) + else: + kws = [ + {"shape": x.shape, "order": x.order}, + { + "shape": (x.shape[0] if self._axis == 1 else x.shape[1],), + "order": TensorOrder.C_ORDER, + }, + ] + res, normed = self.new_tensors([x], kws=kws, output_limit=2) + + if not copy and self._axis == 1: + # follow the behaviour of sklearn + x.data = res.data + + if normed is None: + return res + return ExecutableTuple([res, normed]) + + @classmethod + def _tile_one_chunk(cls, op): + outs = op.outputs + chunk_op = op.copy().reset_key() + kws = [{"shape": outs[0].shape, "order": outs[0].order, "index": (0, 0)}] + if len(outs) == 2: + kws.append({"shape": outs[1].shape, "order": outs[1].order, "index": (0,)}) + chunks = chunk_op.new_chunks( + [op.input.chunks[0]], kws=kws, output_limit=len(outs) + ) + + tensor_kws = [ + { + "shape": outs[0].shape, + "order": outs[0].order, + "chunks": [chunks[0]], + "nsplits": tuple((s,) for s in outs[0].shape), + } + ] + if len(outs) == 2: + tensor_kws.append( + { + "shape": outs[1].shape, + "order": outs[1].order, + "chunks": [chunks[1]], + "nsplits": tuple((s,) for s in outs[1].shape), + } + ) + + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=tensor_kws, output_limit=len(outs)) + + @classmethod + def _need_tile_into_chunks(cls, op): + # if true, try to tile into chunks + # whose implementation is based on sklearn itself + x = op.input + if op.gpu: # pragma: no cover + return False + if x.issparse() and op.return_norm and op.norm in ("l1", "l2"): + # sklearn cannot handle + return False + if x.chunk_shape[op.axis] > 1: + return False + return True + + @classmethod + def _tile_chunks(cls, op): + assert op.input.chunk_shape[op.axis] == 1 + x = op.input + axis = op.axis + outs = op.outputs + + out_chunks = [], [] + for i, c in enumerate(x.chunks): + chunk_op = 
op.copy().reset_key() + kws = [{"shape": c.shape, "order": c.order, "index": c.index}] + if op.return_norm: + kws.append( + { + "shape": (c.shape[1 - axis],), + "order": TensorOrder.C_ORDER, + "index": (i,), + } + ) + chunks = chunk_op.new_chunks([c], kws=kws, output_limit=op.output_limit) + out_chunks[0].append(chunks[0]) + if len(chunks) == 2: + out_chunks[1].append(chunks[1]) + + tensor_kws = [ + { + "shape": outs[0].shape, + "order": outs[0].order, + "chunks": out_chunks[0], + "nsplits": x.nsplits, + } + ] + if len(outs) == 2: + tensor_kws.append( + { + "shape": outs[1].shape, + "order": outs[1].order, + "chunks": out_chunks[1], + "nsplits": (x.nsplits[1 - axis],), + } + ) + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=tensor_kws, output_limit=len(outs)) + + @classmethod + def tile(cls, op): + x = op.input + norm = op.norm + axis = op.axis + + if len(x.chunks) == 1: + return cls._tile_one_chunk(op) + + if cls._need_tile_into_chunks(op): + return cls._tile_chunks(op) + else: + if norm == "l1": + norms = mt.abs(x).sum(axis=axis) + elif norm == "l2": + norms = mt.sqrt((x**2).sum(axis=axis)) + else: + assert norm == "max" + # sparse.max will still be a sparse, + # force to convert to dense + norms = mt.max(x, axis=axis).todense() + norms = mt.where(mt.equal(norms, 0.0), 1.0, norms) + if axis == 1: + x = x / norms[:, mt.newaxis] + else: + x = x / norms[mt.newaxis, :] + + ret = [(yield from recursive_tile(x))] + if op.return_norm: + ret.append((yield from recursive_tile(norms))) + + new_op = op.copy() + kws = [out.params for out in op.outputs] + for i, r in enumerate(ret): + kws[i]["chunks"] = r.chunks + kws[i]["nsplits"] = r.nsplits + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def execute(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + axis = op.axis + return_norm = op.return_norm + norm = op.norm + outs = op.outputs + + with device(device_id): + if device_id < 0 and op.use_sklearn and sklearn_normalize is not None: + # no GPU + try: + if xp is sparse: + if axis == 0: + xm = x.raw.tocsc() + else: + xm = x.raw + else: + xm = x + ret = sklearn_normalize( + xm, norm=norm, axis=axis, return_norm=return_norm + ) + normed = None + if return_norm: + ret, normed = ret + if issparse(ret): + ret = sparse.SparseNDArray(ret) + ctx[outs[0].key] = ret + if normed is not None: + ctx[outs[1].key] = normed + return + except NotImplementedError: + pass + + # fall back + if axis == 0: + x = x.T + + if norm == "l1": + norms = xp.abs(x).sum(axis=1) + elif norm == "l2": + norms = xp.sqrt((x**2).sum(axis=1)) + else: + norms = xp.max(x, axis=1) + if issparse(norms): + norms = norms.toarray() + norms[norms == 0.0] = 1.0 + x = x / norms[:, np.newaxis] + + if axis == 0: + x = x.T + + ctx[outs[0].key] = x + if return_norm: + ctx[outs[1].key] = norms + + +def normalize(X, norm="l2", axis=1, copy=True, return_norm=False): + """ + Scale input vectors individually to unit norm (vector length). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape [n_samples, n_features] + The data to normalize, element by element. + scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. + + norm : 'l1', 'l2', or 'max', optional ('l2' by default) + The norm to use to normalize each non zero sample (or each non-zero + feature if axis is 0). + + axis : 0 or 1, optional (1 by default) + axis used to normalize the data along. 
If 1, independently normalize + each sample, otherwise (if 0) normalize each feature. + + copy : boolean, optional, default True + set to False to perform inplace row normalization and avoid a + copy (if the input is already a tensor and if axis is 1). + + return_norm : boolean, default False + whether to return the computed norms + + Returns + ------- + X : {array-like, sparse matrix}, shape [n_samples, n_features] + Normalized input X. + + norms : Tensor, shape [n_samples] if axis=1 else [n_features] + A tensor of norms along given axis for X. + When X is sparse, a NotImplementedError will be raised + for norm 'l1' or 'l2'. + + See also + -------- + Normalizer: Performs normalization using the ``Transformer`` API + (e.g. as part of a preprocessing :class:`mars.learn.pipeline.Pipeline`). + """ + if norm not in ("l1", "l2", "max"): + raise ValueError(f"'{norm}' is not a supported norm") + if axis not in (0, 1): + raise ValueError(f"'{axis}' is not a supported axis") + op = TensorNormalize(norm=norm, axis=axis, return_norm=return_norm, dtype=X.dtype) + return op(X, copy=copy) diff --git a/python/xorbits/_mars/learn/preprocessing/tests/__init__.py b/python/xorbits/_mars/learn/preprocessing/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/preprocessing/tests/test_data.py b/python/xorbits/_mars/learn/preprocessing/tests/test_data.py new file mode 100644 index 000000000..fe33b9256 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/tests/test_data.py @@ -0,0 +1,216 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from sklearn.datasets import load_iris +from sklearn.utils import gen_batches +from sklearn.utils._testing import assert_allclose, assert_array_almost_equal + +from .... import tensor as mt +from .. 
import MinMaxScaler, minmax_scale + + +def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen): + if batch_stop != n: + assert (i + 1) * chunk_size == n_samples_seen + else: + assert i * chunk_size + (batch_stop - batch_start) == n_samples_seen + + +def _check_dim_1axis(a): + return mt.asarray(a).shape[0] + + +rng = mt.random.RandomState(0) +n_features = 30 +n_samples = 1000 +offsets = rng.uniform(-1, 1, size=n_features) +scales = rng.uniform(1, 10, size=n_features) +X_2d = rng.randn(n_samples, n_features) * scales + offsets +X_1row = X_2d[0, :].reshape(1, n_features) +X_1col = X_2d[:, 0].reshape(n_samples, 1) +iris = mt.tensor(load_iris().data) + + +@pytest.mark.parametrize("chunk_size", [200, X_2d.shape[0], X_2d.shape[0] + 42]) +def test_min_max_scaler_partial_fit(setup, chunk_size): + # Test if partial_fit run over many batches of size 1 and 50 + # gives the same results as fit + X = X_2d + n = X.shape[0] + + # Test mean at the end of the process + scaler_batch = MinMaxScaler().fit(X) + + scaler_incr = MinMaxScaler() + for batch in gen_batches(n_samples, chunk_size): + scaler_incr = scaler_incr.partial_fit(X[batch]) + + assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) + assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) + + # Test std after 1 step + batch0 = slice(0, chunk_size) + scaler_batch = MinMaxScaler().fit(X[batch0]) + scaler_incr = MinMaxScaler().partial_fit(X[batch0]) + + assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) + assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) + + # Test std until the end of partial fits, and + _ = MinMaxScaler().fit(X) + scaler_incr = MinMaxScaler() # Clean estimator + for i, batch in enumerate(gen_batches(n_samples, chunk_size)): + scaler_incr = scaler_incr.partial_fit(X[batch]) + assert_correct_incr( + i, + batch_start=batch.start, + batch_stop=batch.stop, + n=n, + chunk_size=chunk_size, + n_samples_seen=scaler_incr.n_samples_seen_, + ) + + +def test_min_max_scaler_iris(setup): + X = iris + scaler = MinMaxScaler() + # default params + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), 0) + assert_array_almost_equal(X_trans.max(axis=0), 1) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # not default params: min=1, max=2 + scaler = MinMaxScaler(feature_range=(1, 2)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), 1) + assert_array_almost_equal(X_trans.max(axis=0), 2) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # min=-.5, max=.6 + scaler = MinMaxScaler(feature_range=(-0.5, 0.6)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), -0.5) + assert_array_almost_equal(X_trans.max(axis=0), 0.6) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, 
X_trans_inv) + + # raises on invalid range + scaler = MinMaxScaler(feature_range=(2, 1)) + with pytest.raises(ValueError): + scaler.fit(X) + + +def test_min_max_scaler_zero_variance_features(setup): + # Check min max scaler on toy data with zero variance features + X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]] + + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + + # default params + scaler = MinMaxScaler() + X_trans = scaler.fit_transform(X) + X_expected_0_1 = [[0.0, 0.0, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]] + assert_array_almost_equal(X_trans, X_expected_0_1) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + X_trans_new = scaler.transform(X_new) + X_expected_0_1_new = [[+0.0, 1.0, 0.500], [-1.0, 0.0, 0.083], [+0.0, 0.0, 1.333]] + assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2) + + # not default params + scaler = MinMaxScaler(feature_range=(1, 2)) + X_trans = scaler.fit_transform(X) + X_expected_1_2 = [[1.0, 1.0, 1.5], [1.0, 1.0, 1.0], [1.0, 1.0, 2.0]] + assert_array_almost_equal(X_trans, X_expected_1_2) + + # function interface + X_trans = minmax_scale(X) + assert_array_almost_equal(X_trans, X_expected_0_1) + X_trans = minmax_scale(X, feature_range=(1, 2)) + assert_array_almost_equal(X_trans, X_expected_1_2) + + +def test_minmax_scale_axis1(setup): + X = iris + X_trans = minmax_scale(X, axis=1) + assert_array_almost_equal(mt.min(X_trans, axis=1), 0) + assert_array_almost_equal(mt.max(X_trans, axis=1), 1) + + +def test_min_max_scaler1d(setup): + X_list_1row = X_1row.to_numpy().tolist() + X_list_1col = X_1col.to_numpy().tolist() + + # Test scaling of dataset along single axis + for X in [X_1row, X_1col, X_list_1row, X_list_1col]: + scaler = MinMaxScaler(copy=True) + X_scaled = scaler.fit(X).transform(X) + + if isinstance(X, list): + X = mt.array(X) # cast only after scaling done + + if _check_dim_1axis(X) == 1: + assert_array_almost_equal(X_scaled.min(axis=0), mt.zeros(n_features)) + assert_array_almost_equal(X_scaled.max(axis=0), mt.zeros(n_features)) + else: + assert_array_almost_equal(X_scaled.min(axis=0), 0.0) + assert_array_almost_equal(X_scaled.max(axis=0), 1.0) + assert scaler.n_samples_seen_ == X.shape[0] + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_array_almost_equal(X_scaled_back, X) + + # Constant feature + X = mt.ones((5, 1)) + scaler = MinMaxScaler() + X_scaled = scaler.fit(X).transform(X) + assert X_scaled.min().to_numpy() >= 0.0 + assert X_scaled.max().to_numpy() <= 1.0 + assert scaler.n_samples_seen_ == X.shape[0] + + # Function interface + X_1d = X_1row.ravel() + min_ = X_1d.min() + max_ = X_1d.max() + assert_array_almost_equal( + (X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True) + ) + + +@pytest.mark.parametrize("feature_range", [(0, 1), (-10, 10)]) +def test_minmax_scaler_clip(setup, feature_range): + # test behaviour of the parameter 'clip' in MinMaxScaler + X = iris + scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X) + X_min, X_max = mt.min(X, axis=0), mt.max(X, axis=0) + X_test = [mt.r_[X_min[:2] - 10, X_max[2:] + 10]] + X_transformed = scaler.transform(X_test) + assert_allclose( + X_transformed, + [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]], + ) diff --git a/python/xorbits/_mars/learn/preprocessing/tests/test_label.py b/python/xorbits/_mars/learn/preprocessing/tests/test_label.py new file mode 100644 index 000000000..1d4cf0425 --- /dev/null +++ 
b/python/xorbits/_mars/learn/preprocessing/tests/test_label.py @@ -0,0 +1,381 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sp +from sklearn.preprocessing._label import ( + _inverse_binarize_multiclass, + _inverse_binarize_thresholding, +) +from sklearn.utils._testing import assert_array_equal, ignore_warnings +from sklearn.utils.multiclass import type_of_target + +from .... import tensor as mt +from .. import LabelBinarizer, LabelEncoder, label_binarize + + +def test_label_binarizer(setup): + # one-class case defaults to negative label + # For dense case: + inp = ["pos", "pos", "pos", "pos"] + lb = LabelBinarizer(sparse_output=False) + expected = np.array([[0, 0, 0, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + # For sparse case: + lb = LabelBinarizer(sparse_output=True) + got = lb.fit_transform(inp) + assert got.issparse() + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got.fetch().toarray()) + assert_array_equal(lb.inverse_transform(got.todense()), inp) + + lb = LabelBinarizer(sparse_output=False) + # two-class case + inp = ["neg", "pos", "pos", "neg"] + expected = np.array([[0, 1, 1, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["neg", "pos"]) + assert_array_equal(expected, got) + + to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]]) + assert_array_equal(lb.inverse_transform(to_invert), inp) + + # multi-class case + inp = ["spam", "ham", "eggs", "ham", "0"] + expected = np.array( + [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]] + ) + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +def test_label_binarizer_set_label_encoding(setup): + lb = LabelBinarizer(neg_label=-2, pos_label=0) + + # two-class case with pos_label=0 + inp = np.array([0, 1, 1, 0]) + expected = np.array([[-2, 0, 0, -2]]).T + got = lb.fit_transform(mt.tensor(inp)) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + lb = LabelBinarizer(neg_label=-2, pos_label=2) + + # multi-class case + inp = np.array([3, 2, 1, 2, 0]) + expected = np.array( + [ + [-2, -2, -2, +2], + [-2, -2, +2, -2], + [-2, +2, -2, -2], + [-2, -2, +2, -2], + [+2, -2, -2, -2], + ] + ) + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +@ignore_warnings +def test_label_binarizer_errors(setup): + # Check that invalid arguments yield ValueError + one_class = np.array([0, 0, 0, 0]) + lb = LabelBinarizer().fit(one_class) + + multi_label = [(2, 3), (0,), (0, 2)] + with pytest.raises(ValueError): + lb.transform(multi_label) + + lb = LabelBinarizer() + with pytest.raises(ValueError): + lb.transform([]) + with 
pytest.raises(ValueError): + lb.inverse_transform([]) + + with pytest.raises(ValueError): + LabelBinarizer(neg_label=2, pos_label=1) + with pytest.raises(ValueError): + LabelBinarizer(neg_label=2, pos_label=2) + + with pytest.raises(ValueError): + LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True) + + # Sequence of seq type should raise ValueError + y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]] + with pytest.raises(ValueError): + LabelBinarizer().fit_transform(y_seq_of_seqs) + + # Fail on multioutput data + with pytest.raises(ValueError): + LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) + with pytest.raises(ValueError): + label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3]) + + +def test_label_binarize_with_class_order(setup): + out = label_binarize([1, 6], classes=[1, 2, 4, 6]) + expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]]) + assert_array_equal(out, expected) + + # Modified class order + out = label_binarize([1, 6], classes=[1, 6, 4, 2]) + expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]]) + assert_array_equal(out, expected) + + out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1]) + expected = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]]) + assert_array_equal(out, expected) + + +def toarray(a): + if hasattr(a, "toarray"): + a = a.toarray() + return a + + +def check_binarized_results(y, classes, pos_label, neg_label, expected): + for sparse_output in [True, False]: + if (pos_label == 0 or neg_label != 0) and sparse_output: + with pytest.raises(ValueError): + label_binarize( + y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + continue + + # check label_binarize + binarized = label_binarize( + y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + binarized = binarized.fetch() + if hasattr(binarized, "raw"): + binarized = binarized.raw + assert_array_equal(toarray(binarized), expected) + assert sp.issparse(binarized) == sparse_output + + # check inverse + y_type = type_of_target(y) + if y_type == "multiclass": + inversed = _inverse_binarize_multiclass(binarized, classes=classes) + + else: + inversed = _inverse_binarize_thresholding( + binarized.copy(), # https://github.com/mars-project/mars/issues/3268 + output_type=y_type, + classes=classes, + threshold=((neg_label + pos_label) / 2.0), + ) + + assert_array_equal(toarray(inversed), toarray(y)) + + # Check label binarizer + lb = LabelBinarizer( + neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output + ) + binarized = lb.fit_transform(y) + assert_array_equal(toarray(binarized), expected) + assert binarized.issparse() == sparse_output + inverse_output = lb.inverse_transform(binarized) + assert_array_equal(toarray(inverse_output), toarray(y)) + assert inverse_output.issparse() == sp.issparse(y) + + +def test_label_binarize_binary(setup): + y = [0, 1, 0] + classes = [0, 1] + pos_label = 2 + neg_label = -1 + expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + # Binary case where sparse_output = True will not result in a ValueError + y = [0, 1, 0] + classes = [0, 1] + pos_label = 3 + neg_label = 0 + expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + +def test_label_binarize_multiclass(setup): + y = [0, 1, 2] + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = 2 * np.eye(3) + 
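+    # pos_label=2 and neg_label=0 encode each class as 2 on the diagonal and
+    # 0 elsewhere, hence the expected matrix 2 * np.eye(3)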
+ check_binarized_results(y, classes, pos_label, neg_label, expected) + + with pytest.raises(ValueError): + label_binarize( + y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True + ) + + +def test_label_binarize_multilabel(setup): + y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]]) + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = pos_label * y_ind + y_sparse = [sp.csr_matrix(y_ind)] + + for y in [y_ind] + y_sparse: + check_binarized_results(y, classes, pos_label, neg_label, expected) + + with pytest.raises(ValueError): + label_binarize( + y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True + ) + + +def test_invalid_input_label_binarize(setup): + with pytest.raises(ValueError): + label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1) + with pytest.raises(ValueError, match="continuous target data is not "): + label_binarize([1.2, 2.7], classes=[0, 1]) + with pytest.raises(ValueError, match="mismatch with the labels"): + label_binarize([[1, 3]], classes=[1, 2, 3]) + + +@pytest.mark.parametrize( + "values, classes, unknown", + [ + ( + np.array([2, 1, 3, 1, 3], dtype="int64"), + np.array([1, 2, 3], dtype="int64"), + np.array([4], dtype="int64"), + ), + ( + np.array(["b", "a", "c", "a", "c"], dtype=object), + np.array(["a", "b", "c"], dtype=object), + np.array(["d"], dtype=object), + ), + ( + np.array(["b", "a", "c", "a", "c"]), + np.array(["a", "b", "c"]), + np.array(["d"]), + ), + ], + ids=["int64", "object", "str"], +) +def test_label_encoder(setup, values, classes, unknown): + # Test LabelEncoder's transform, fit_transform and + # inverse_transform methods + values_t = mt.tensor(values) + + le = LabelEncoder() + le.fit(values_t) + assert_array_equal(le.classes_.fetch(), classes) + assert_array_equal(le.transform(values_t).fetch(), [1, 0, 2, 0, 2]) + assert_array_equal(le.inverse_transform(mt.tensor([1, 0, 2, 0, 2])).fetch(), values) + + le = LabelEncoder() + ret = le.fit_transform(values) + assert_array_equal(ret.fetch(), [1, 0, 2, 0, 2]) + + with pytest.raises(ValueError, match="unseen labels"): + le.transform(unknown) + + +def test_label_encoder_missing_values_numeric(setup): + values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float) + values_t = mt.tensor(values) + le = LabelEncoder() + assert_array_equal(le.fit_transform(values_t).fetch(), [1, 0, 3, 2, 1, 3]) + + +def test_label_encoder_negative_ints(setup): + le = LabelEncoder() + le.fit(mt.tensor([1, 1, 4, 5, -1, 0])) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal( + le.transform(mt.tensor([0, 1, 4, 4, 5, -1, -1])), [1, 2, 3, 3, 4, 0, 0] + ) + assert_array_equal( + le.inverse_transform(mt.tensor([1, 2, 3, 3, 4, 0, 0])), [0, 1, 4, 4, 5, -1, -1] + ) + with pytest.raises(ValueError): + le.transform(mt.tensor([0, 6])) + + +@pytest.mark.parametrize("dtype", ["str", "object"]) +def test_label_encoder_str_bad_shape(setup, dtype): + le = LabelEncoder() + le.fit(mt.tensor(np.array(["apple", "orange"], dtype=dtype))) + msg = "should be a 1d array" + with pytest.raises(ValueError, match=msg): + le.transform("apple") + + +def test_label_encoder_errors(setup): + # Check that invalid arguments yield ValueError + le = LabelEncoder() + with pytest.raises(ValueError): + le.transform([]) + with pytest.raises(ValueError): + le.inverse_transform([]) + + # Fail on unseen labels + le = LabelEncoder() + le.fit(mt.tensor([1, 2, 3, -1, 1])) + msg = "contains previously unseen labels" + with pytest.raises(ValueError, match=msg): + 
le.inverse_transform(mt.tensor([-2])) + with pytest.raises(ValueError, match=msg): + le.inverse_transform(mt.tensor([-2, -3, -4])) + + # Fail on inverse_transform("") + msg = r"should be a 1d array.+shape \(\)" + with pytest.raises(ValueError, match=msg): + le.inverse_transform("") + + +@pytest.mark.parametrize( + "values", + [ + np.array([2, 1, 3, 1, 3], dtype="int64"), + np.array(["b", "a", "c", "a", "c"], dtype=object), + np.array(["b", "a", "c", "a", "c"]), + ], + ids=["int64", "object", "str"], +) +def test_label_encoder_empty_array(setup, values): + values_t = mt.tensor(values) + + le = LabelEncoder() + le.fit(values_t) + # test empty transform + transformed = le.transform(mt.array([])) + assert_array_equal(np.array([]), transformed) + # test empty inverse transform + inverse_transformed = le.inverse_transform([]) + assert_array_equal(np.array([]), inverse_transformed) diff --git a/python/xorbits/_mars/learn/preprocessing/tests/test_normalize.py b/python/xorbits/_mars/learn/preprocessing/tests/test_normalize.py new file mode 100644 index 000000000..017a81b38 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/tests/test_normalize.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps +from sklearn.preprocessing import normalize as sk_normalize + +from .... import tensor as mt +from .. 
import normalize + + +def test_normalize_op(): + with pytest.raises(ValueError): + normalize(mt.random.random(10, 3), norm="unknown") + + with pytest.raises(ValueError): + normalize(mt.random.random(10, 3), axis=-1) + + with pytest.raises(ValueError): + normalize(mt.random.rand(10, 3, 3)) + + +def test_normalize_execution(setup): + raw_dense = np.random.rand(10, 10) + raw_sparse = sps.random(10, 10, density=0.4, format="csr") + + for chunk_size in [10, 6, (10, 6), (6, 10)]: + for raw, x in [ + (raw_dense, mt.tensor(raw_dense, chunk_size=chunk_size)), + (raw_sparse, mt.tensor(raw_sparse, chunk_size=chunk_size)), + ]: + for norm in ["l1", "l2", "max"]: + for axis in (0, 1): + for use_sklearn in [True, False]: + n = normalize(x, norm=norm, axis=axis, return_norm=False) + n.op._use_sklearn = use_sklearn + + result = n.execute().fetch() + expected = sk_normalize( + raw, norm=norm, axis=axis, return_norm=False + ) + + if sps.issparse(expected): + expected = expected.A + np.testing.assert_almost_equal(np.asarray(result), expected) + + raw_dense = np.random.rand(10, 10) + raw_sparse = sps.random(10, 10, density=0.4, format="csr") + + # test copy and return_normalize + for axis in (0, 1): + for chunk_size in (10, 6, (6, 10)): + for raw in (raw_dense, raw_sparse): + x = mt.tensor(raw, chunk_size=chunk_size) + n = normalize(x, axis=axis, copy=False, return_norm=True) + + results = n.execute().fetch() + raw_copy = raw.copy() + try: + expects = sk_normalize( + raw_copy, axis=axis, copy=False, return_norm=True + ) + except NotImplementedError: + continue + + if sps.issparse(expects[0]): + expected = expects[0].A + else: + expected = expects[0] + np.testing.assert_almost_equal(np.asarray(results[0]), expected) + np.testing.assert_almost_equal(results[1], expects[1]) diff --git a/python/xorbits/_mars/learn/proxima/__init__.py b/python/xorbits/_mars/learn/proxima/__init__.py new file mode 100644 index 000000000..ea47818fc --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def register_op(): + from .simple_index.builder import ProximaBuilder + from .simple_index.searcher import ProximaSearcher + + del ProximaBuilder + del ProximaSearcher diff --git a/python/xorbits/_mars/learn/proxima/core.py b/python/xorbits/_mars/learn/proxima/core.py new file mode 100644 index 000000000..4ac67d904 --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/core.py @@ -0,0 +1,178 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + import pyproxima2 as proxima +except ImportError: # pragma: no cover + proxima = None + +from ... import tensor as mt +from ...tensor.indexing import TensorSlice +from ...tensor.merge import TensorConcatenate + +available_numpy_dtypes = [ + np.dtype(np.float16), + np.dtype(np.float32), + np.dtype(np.int8), + np.dtype(np.int16), +] + + +if proxima: + _proxima_types = [ + proxima.IndexMeta.FT_FP16, + proxima.IndexMeta.FT_FP32, + proxima.IndexMeta.FT_INT8, + proxima.IndexMeta.FT_INT16, + ] + assert len(_proxima_types) == len(available_numpy_dtypes) + _type_mapping = { + numpy_dtype: proxima_type + for numpy_dtype, proxima_type in zip(available_numpy_dtypes, _proxima_types) + } + + +def rechunk_tensor(tensor, chunk_size): + # TODO(hks): Provide a unify rechunk logic with mmap. + cur_chunks = [] + + out_nchunks = tensor.shape[0] // chunk_size + row_nsplits = [chunk_size] * out_nchunks + rest = tensor.shape[0] % chunk_size + if rest >= out_nchunks: + row_nsplits.append(rest) + else: + for i in range(tensor.shape[0] % chunk_size): + row_nsplits[-i - 1] += 1 + + tensor_cumnrows = np.cumsum([0] + list(tensor.nsplits[0])) + offset = 0 + out_groups = [] + for split in row_nsplits: + start_chunk_index = int(tensor_cumnrows.searchsorted(offset)) + start_chunk_index = start_chunk_index - 1 if start_chunk_index != 0 else 0 + end_chunk_index = int(tensor_cumnrows.searchsorted(offset + split) - 1) + if start_chunk_index == end_chunk_index: + t = tensor.chunks[start_chunk_index] + slice_op = TensorSlice( + [ + slice( + offset - tensor_cumnrows[start_chunk_index], + split + offset - tensor_cumnrows[end_chunk_index], + ), + slice(None), + ], + dtype=t.dtype, + ) + out_groups.append( + [ + slice_op.new_chunk( + [t], + shape=(split, t.shape[1]), + index=(len(cur_chunks), 0), + order=t.order, + ) + ] + ) + else: + chunks = [] + start_chunk = tensor.chunks[start_chunk_index] + start_slice = int(offset - tensor_cumnrows[start_chunk_index]) + slice_op = TensorSlice( + [slice(start_slice, None), slice(None)], dtype=start_chunk.dtype + ) + chunks.append( + slice_op.new_chunk( + [start_chunk], + shape=(start_chunk.shape[0] - start_slice, start_chunk.shape[1]), + index=(0, 0), + order=start_chunk.order, + ) + ) + chunks.extend(tensor.chunks[start_chunk_index + 1 : end_chunk_index]) + end_chunk = tensor.chunks[end_chunk_index] + end_slice = int(split + offset - tensor_cumnrows[end_chunk_index]) + slice_op_end = TensorSlice( + [slice(None, end_slice), slice(None)], dtype=start_chunk.dtype + ) + chunks.append( + slice_op_end.new_chunk( + [end_chunk], + shape=(end_slice, end_chunk.shape[1]), + index=(end_chunk_index - start_chunk_index, 0), + order=end_chunk.order, + ) + ) + out_groups.append(chunks) + + offset += split + + return out_groups + + +def build_mmap_chunks(chunks, worker, file_prefix): + write_mmap_chunks = [] + nrows = sum(c.shape[0] for c in chunks) + array_shape = (nrows, chunks[0].shape[1]) + array_dtype = chunks[0].dtype + create_mmap_op = TensorConcatenate( + mmap=True, + create_mmap_file=True, + total_shape=array_shape, + file_prefix=file_prefix, + dtype=array_dtype, + ) + create_mmap_op.expect_worker = worker + create_mmap_chunk = create_mmap_op.new_chunk( + None, index=(0,), shape=(), dtype=array_dtype + ) + start_index = 0 + for j, chk in enumerate(chunks): + s = slice(start_index, start_index + chk.shape[0]) + start_index += chk.shape[0] + write_mmap_op = 
TensorConcatenate( + mmap=True, + create_mmap_file=False, + total_shape=array_shape, + partition_slice=s, + dtype=array_dtype, + ) + write_mmap_op.expect_worker = worker + write_mmap_chunk = write_mmap_op.new_chunk( + [create_mmap_chunk, chk], index=(j + 1, 0), shape=(), dtype=array_dtype + ) + write_mmap_chunks.append(write_mmap_chunk) + return write_mmap_chunks + + +def validate_tensor(tensor): + if hasattr(tensor, "to_tensor"): + tensor = tensor.to_tensor() + else: + tensor = mt.tensor(tensor) + if tensor.ndim != 2: + raise ValueError("Input tensor should be 2-d") + return tensor + + +def get_proxima_type(np_dtype): + try: + return _type_mapping[np_dtype] + except KeyError: + raise TypeError( + f"Does not support {np_dtype}, available types include " + f"{', '.join(t.name for t in _type_mapping)}" + ) diff --git a/python/xorbits/_mars/learn/proxima/simple_index/__init__.py b/python/xorbits/_mars/learn/proxima/simple_index/__init__.py new file mode 100644 index 000000000..42818f9a0 --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .builder import build_index +from .searcher import search_index +from .recall import recall # isort: skip diff --git a/python/xorbits/_mars/learn/proxima/simple_index/builder.py b/python/xorbits/_mars/learn/proxima/simple_index/builder.py new file mode 100644 index 000000000..1ab2f37fc --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/builder.py @@ -0,0 +1,438 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +import os +import pickle # nosec # pylint: disable=import_pickle +import tempfile +import uuid + +import numpy as np + +from .... import opcodes +from .... 
import tensor as mt +from ....core import OutputType +from ....core.context import get_context +from ....core.operand import OperandStage +from ....lib.filesystem import get_fs +from ....serialization.serializables import ( + BytesField, + DataTypeField, + DictField, + Int32Field, + Int64Field, + StringField, + TupleField, +) +from ....utils import Timer, has_unknown_shape +from ...operands import LearnOperand, LearnOperandMixin +from ..core import ( + available_numpy_dtypes, + build_mmap_chunks, + get_proxima_type, + proxima, + rechunk_tensor, + validate_tensor, +) + +logger = logging.getLogger(__name__) + +DEFAULT_INDEX_SIZE = 5 * 10**6 + + +class ProximaBuilder(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.PROXIMA_SIMPLE_BUILDER + + _distance_metric = StringField("distance_metric") + _dimension = Int32Field("dimension") + _column_number = Int64Field("column_number") + _index_path = StringField("index_path") + _index_builder = StringField("index_builder") + _index_builder_params = DictField("index_builder_params") + _index_converter = StringField("index_converter") + _index_converter_params = DictField("index_converter_params") + _topk = Int32Field("topk") + _storage_options = BytesField( + "storage_options", on_serialize=pickle.dumps, on_deserialize=pickle.loads + ) + + # only for chunk + _array_shape = TupleField("array_shape") + _array_dtype = DataTypeField("array_dtype") + _offset = Int64Field("offset") + + def __init__( + self, + distance_metric=None, + index_path=None, + dimension=None, + column_number=None, + index_builder=None, + index_builder_params=None, + index_converter=None, + index_converter_params=None, + array_shape=None, + array_dtype=None, + offset=None, + topk=None, + storage_options=None, + output_types=None, + **kw, + ): + super().__init__( + _distance_metric=distance_metric, + _index_path=index_path, + _dimension=dimension, + _column_number=column_number, + _index_builder=index_builder, + _index_builder_params=index_builder_params, + _array_shape=array_shape, + _array_dtype=array_dtype, + _offset=offset, + _index_converter=index_converter, + _index_converter_params=index_converter_params, + _topk=topk, + _storage_options=storage_options, + _output_types=output_types, + **kw, + ) + if self._output_types is None: + self._output_types = [OutputType.object] + + @property + def distance_metric(self): + return self._distance_metric + + @property + def column_number(self): + return self._column_number + + @property + def index_path(self): + return self._index_path + + @property + def dimension(self): + return self._dimension + + @property + def index_builder(self): + return self._index_builder + + @property + def index_builder_params(self): + return self._index_builder_params + + @property + def index_converter(self): + return self._index_converter + + @property + def index_converter_params(self): + return self._index_converter_params + + @property + def topk(self): + return self._topk + + @property + def storage_options(self): + return self._storage_options + + @property + def array_shape(self): + return self._array_shape + + @property + def array_dtype(self): + return self._array_dtype + + @property + def offset(self): + return self._offset + + def __call__(self, tensor): + return self.new_tileable([tensor]) + + @classmethod + def _get_atleast_topk_nsplit(cls, nsplit, topk): + new_nsplit = [] + i = 0 + while i < len(nsplit): + cur = nsplit[i] + i += 1 + if cur >= topk: + new_nsplit.append(cur) + else: + while i < len(nsplit): + cur += nsplit[i] + i += 1 + if 
cur >= topk: + break + if cur < topk and len(new_nsplit) > 0: + new_nsplit[-1] += cur + elif cur >= topk: + new_nsplit.append(cur) + new_nsplit = tuple(new_nsplit) + assert sum(new_nsplit) == sum( + nsplit + ), f"sum of nsplit not equal, old: {nsplit}, new: {new_nsplit}" + + return new_nsplit + + @classmethod + def tile(cls, op): + tensor = op.inputs[0] + out = op.outputs[0] + index_path = op.index_path + ctx = get_context() + fs = None + if index_path is not None: + fs = get_fs(index_path, op.storage_options) + + if index_path is not None: + # check if the index path is empty + try: + files = [f for f in fs.ls(index_path) if "proxima_" in f] + if files: + raise ValueError( + f"Directory {index_path} contains built proxima index, " + f"clean them to perform new index building" + ) + except FileNotFoundError: + # if not exist, create directory + fs.mkdir(index_path) + + # make sure all inputs have known chunk sizes + if has_unknown_shape(*op.inputs): + yield + + if op.column_number: + index_chunk_size = op.inputs[0].shape[0] // op.column_number + else: + worker_num = len(ctx.get_worker_addresses() or []) + if worker_num > 0: + index_chunk_size = max( + op.inputs[0].shape[0] // worker_num, DEFAULT_INDEX_SIZE + ) + else: + index_chunk_size = DEFAULT_INDEX_SIZE + + if op.topk is not None: + index_chunk_size = cls._get_atleast_topk_nsplit(index_chunk_size, op.topk) + + # build chunks for writing tensors to mmap files. + worker_iter = iter(itertools.cycle(ctx.get_worker_addresses() or [None])) + chunk_groups = rechunk_tensor(tensor, index_chunk_size) + out_chunks = [] + offsets = [] + offset = 0 + for chunk_group in chunk_groups: + offsets.append(offset) + file_prefix = f"proxima-build-{str(uuid.uuid4())}" + out_chunks.append( + build_mmap_chunks( + chunk_group, next(worker_iter), file_prefix=file_prefix + ) + ) + offset += sum(c.shape[0] for c in chunk_group) + + final_out_chunks = [] + for j, chunks in enumerate(out_chunks): + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + chunk_op.expect_worker = chunks[0].op.expect_worker + chunk_op._array_shape = chunks[0].op.total_shape + chunk_op._array_dtype = chunks[0].dtype + chunk_op._offset = offsets[j] + out_chunk = chunk_op.new_chunk(chunks, index=(j,)) + final_out_chunks.append(out_chunk) + + logger.warning(f"index chunks count: {len(final_out_chunks)} ") + + params = out.params + params["chunks"] = final_out_chunks + params["nsplits"] = ((1,) * len(final_out_chunks),) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _execute_map(cls, ctx, op: "ProximaBuilder"): + mmap_path = ctx[op.inputs[0].key] + out = op.outputs[0] + + data = np.memmap( + mmap_path, dtype=op.array_dtype, mode="r", shape=op.array_shape + ) + + proxima_type = get_proxima_type(op.array_dtype) + offset = op.offset + + # holder + with Timer() as timer: + holder = proxima.IndexHolder( + type=proxima_type, dimension=op.dimension, shallow=True + ) + holder.mount(data, key_base=offset) + + logger.warning(f"Holder({op.key}) costs {timer.duration} seconds") + + # converter + meta = proxima.IndexMeta( + proxima_type, dimension=op.dimension, measure_name=op.distance_metric + ) + if op.index_converter is not None: + with Timer() as timer: + converter = proxima.IndexConverter( + name=op.index_converter, meta=meta, params=op.index_converter_params + ) + converter.train_and_transform(holder) + holder = converter.result() + meta = converter.meta() + + logger.warning(f"Converter({op.key}) costs {timer.duration} seconds") + 
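+        # When a converter is configured, ``holder`` and ``meta`` now refer to
+        # the converted data; the builder below therefore always consumes the
+        # (possibly transformed) holder.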
+ # builder + with Timer() as timer: + builder = proxima.IndexBuilder( + name=op.index_builder, meta=meta, params=op.index_builder_params + ) + builder = builder.train_and_build(holder) + + logger.warning(f"Builder({op.key}) costs {timer.duration} seconds") + + # remove mmap file + os.remove(mmap_path) + + # dumper + with Timer() as timer: + path = tempfile.mkstemp(prefix="proxima-", suffix=".index")[1] + dumper = proxima.IndexDumper(name="FileDumper", path=path) + builder.dump(dumper) + dumper.close() + + logger.warning(f"Dumper({op.key}) costs {timer.duration} seconds") + + if op.index_path is None: + ctx[out.key] = path + else: + # write to external file + with Timer() as timer: + fs = get_fs(op.index_path, op.storage_options) + filename = f"proxima_{out.index[0]}_index" + out_path = f'{op.index_path.rstrip("/")}/{filename}' + + def write_index(): + with fs.open(out_path, "wb") as out_f: + with open(path, "rb") as in_f: + # 128M + chunk_bytes = 128 * 1024**2 + while True: + data = in_f.read(chunk_bytes) + if data: + out_f.write(data) + else: + break + + # retry 3 times + for _ in range(3): + try: + write_index() + break + except: # noqa: E722 # nosec # pylint: disable=bare-except + fs.delete(out_path) + continue + + logger.warning( + f"WritingToVolume({op.key}), out path: {out_path}, " + f"size {os.path.getsize(path)}, " + f"costs {timer.duration} seconds " + f"speed {round(os.path.getsize(path) / (1024 ** 2) / timer.duration, 2)} MB/s" + ) + + ctx[out.key] = filename + + @classmethod + def _execute_agg(cls, ctx, op: "ProximaBuilder"): + paths = [ctx[inp.key] for inp in op.inputs] + ctx[op.outputs[0].key] = paths + + @classmethod + def execute(cls, ctx, op: "ProximaBuilder"): + if op.stage != OperandStage.agg: + return cls._execute_map(ctx, op) + else: + return cls._execute_agg(ctx, op) + + @classmethod + def concat_tileable_chunks(cls, tileable): + assert not tileable.is_coarse() + + op = cls(stage=OperandStage.agg) + chunk = cls(stage=OperandStage.agg).new_chunk(tileable.chunks) + return op.new_tileable([tileable], chunks=[chunk], nsplits=((1,),)) + + +def build_index( + tensor, + dimension=None, + index_path=None, + column_number=None, + need_shuffle=False, + distance_metric="SquaredEuclidean", + index_builder="SsgBuilder", + index_builder_params=None, + index_converter=None, + index_converter_params=None, + topk=None, + storage_options=None, + run=True, + session=None, + run_kwargs=None, +): + tensor = validate_tensor(tensor) + if tensor.dtype not in available_numpy_dtypes: + raise ValueError( + f"Dtype to build index should be one of {available_numpy_dtypes}, " + f"got {tensor.dtype}" + ) + + if dimension is None: + dimension = tensor.shape[1] + if index_builder_params is None: + index_builder_params = {} + if index_converter_params is None: + index_converter_params = {} + + if need_shuffle: + tensor = mt.random.permutation(tensor) + + op = ProximaBuilder( + distance_metric=distance_metric, + index_path=index_path, + dimension=dimension, + column_number=column_number, + index_builder=index_builder, + index_builder_params=index_builder_params, + index_converter=index_converter, + index_converter_params=index_converter_params, + topk=topk, + storage_options=storage_options, + ) + result = op(tensor) + if run: + return result.execute(session=session, **(run_kwargs or dict())) + else: + return result diff --git a/python/xorbits/_mars/learn/proxima/simple_index/knn.py b/python/xorbits/_mars/learn/proxima/simple_index/knn.py new file mode 100644 index 000000000..1c3fe1b40 --- /dev/null 
+++ b/python/xorbits/_mars/learn/proxima/simple_index/knn.py @@ -0,0 +1,140 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import pandas as pd + +from .... import dataframe as md +from .... import tensor as mt +from . import build_index, search_index + + +def sample_data(query, sample_count=10000): + if sample_count > query.shape[0]: + sample_count = query.shape[0] + + idx = random.sample(range(query.shape[0]), sample_count) + sample_query = query[idx, :] + return sample_query, idx + + +def linear_build_and_search( + doc, + query, + topk, + column_number=None, + row_number=None, + dimension=None, + measure_name=None, + threads=4, +): + if measure_name is None: + measure_name = "SquaredEuclidean" + if dimension is None: + dimension = doc.shape[1] + + index = build_index( + tensor=doc, + dimension=dimension, + column_number=column_number, + distance_metric=measure_name, + index_builder="LinearBuilder", + ) + + pk_l, distance_l = search_index( + tensor=query, + threads=threads, + row_number=row_number, + distance_metric=measure_name, + dimension=dimension, + topk=topk, + index=index, + ) + + return pk_l, distance_l + + +def build_and_search( + doc, + query, + topk, + doc_chunk, + query_chunk, + index_path=None, + threads=4, + dimension=None, + measure_name=None, + need_shuffle=False, + storage_options=None, + index_builder=None, + builder_params=None, + index_converter=None, + index_converter_params=None, + index_searcher=None, + searcher_params=None, + index_reformer=None, + index_reformer_params=None, +): + if measure_name is None: + measure_name = "SquaredEuclidean" + if dimension is None: + dimension = doc.shape[1] + if index_builder is None: + index_builder = "SsgBuilder" + if builder_params is None: + builder_params = {} + if index_converter_params is None: + index_converter_params = {} + if index_searcher is None: + index_searcher = "" + if searcher_params is None: + searcher_params = {} + if index_reformer is None: + index_reformer = "" + if index_reformer_params is None: + index_reformer_params = {} + + doc = md.DataFrame(pd.DataFrame(doc), chunk_size=(doc_chunk, dimension)) + query = mt.tensor(query, chunk_size=(query_chunk, dimension)) + + index = build_index( + doc, + dimension, + index_path, + need_shuffle, + measure_name, + index_builder, + builder_params, + index_converter, + index_converter_params, + topk, + storage_options, + ) + + pk2, distance = search_index( + query, + topk, + index, + threads, + dimension, + measure_name, + index_searcher, + searcher_params, + index_reformer, + index_reformer_params, + storage_options, + ) + + return pk2, distance diff --git a/python/xorbits/_mars/learn/proxima/simple_index/recall.py b/python/xorbits/_mars/learn/proxima/simple_index/recall.py new file mode 100644 index 000000000..b43099065 --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/recall.py @@ -0,0 +1,153 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np + +from .... import remote as mr +from .knn import linear_build_and_search, sample_data + + +def recall_one(linear_score, ann_score, topk_ids, epsilon=1e-6): + topk_matchs = {} + for ids in topk_ids: + topk_matchs[ids] = 0 + length = len(linear_score) + match = 0 + idx, ann_item = 0, 0 + while idx < length: + cur_topk = idx + 1 + if ann_item < len(ann_score): + if math.fabs(linear_score[idx] - ann_score[ann_item]) < epsilon: + ann_item += 1 + idx += 1 + match += 1 + else: + if linear_score[idx] < ann_score[ann_item]: + idx += 1 # linear + else: + ann_item += 1 # ann + else: + idx += 1 + + if cur_topk in topk_ids: + topk_matchs[cur_topk] = match / cur_topk + + return topk_matchs + + +def recall_one_byid(linear_key, ann_key, ann_score, topk_ids): + idx, length = 0, len(linear_key) + topk_matchs, result_topk_matchs = {}, {} + + for ids in topk_ids: + topk_matchs[ids] = 0 + result_topk_matchs[ids] = 0 + + while idx < length: + for k in topk_ids: + dynamic_size = k + while dynamic_size + 1 < length: + if math.isclose(ann_score[dynamic_size - 1], ann_score[dynamic_size]): + dynamic_size += 1 + else: + break + + items = 0 + while items < len(ann_score) and items < dynamic_size: + if linear_key[idx] == ann_key[items]: + topk_matchs[k] += 1 + break + else: + items += 1 + + idx += 1 + if idx in topk_ids: + result_topk_matchs[idx] = topk_matchs[idx] / idx + + return result_topk_matchs + + +def compute_recall( + pk_l, distance_l, pk_p, distance_p, topk_ids, method="BYID", epsilon=1e-6 +): + pk_l, distance_l, pk_p, distance_p = ( + np.array(pk_l), + np.array(distance_l), + np.array(pk_p), + np.array(distance_p), + ) + topk_matchs = {} + for ids in topk_ids: + topk_matchs[ids] = 0 + for linear_res_k, linear_res_s, knn_res_k, knn_res_s in zip( + pk_l, distance_l, pk_p, distance_p + ): + if method == "BYID": + res_t = recall_one_byid(linear_res_k, knn_res_k, knn_res_s, topk_ids) + else: + res_t = recall_one(linear_res_s, knn_res_s, topk_ids, epsilon) + for k, v in res_t.items(): + topk_matchs[k] += v + + length = len(pk_l) + for k, v in topk_matchs.items(): + topk_matchs[k] = min(v / length, 1) + return topk_matchs + + +def recall( + doc, + query, + topk, + sample_count, + pk_p, + distance_p, + row_number=None, + column_number=None, + topk_ids=None, + method=None, + epsilon=1e-6, + session=None, + run_kwargs=None, +): + if topk_ids is None: + topk_ids = [topk] + if method is None: + method = "BYSCORE" + + query_sample, idx = sample_data(query=query, sample_count=sample_count) + pk_p_sample, distance_p_sample = pk_p[idx, :], distance_p[idx, :] + pk_l, distance_l = linear_build_and_search( + doc=doc, + query=query_sample, + topk=topk, + row_number=row_number, + column_number=column_number, + ) + + r = mr.spawn( + compute_recall, + args=( + pk_l, + distance_l, + pk_p_sample, + distance_p_sample, + topk_ids, + method, + epsilon, + ), + ) + return r.execute(session=session, **(run_kwargs or dict())).fetch() diff --git 
a/python/xorbits/_mars/learn/proxima/simple_index/searcher.py b/python/xorbits/_mars/learn/proxima/simple_index/searcher.py new file mode 100644 index 000000000..76ec1ad3e --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/searcher.py @@ -0,0 +1,553 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +import os +import pickle # nosec # pylint: disable=import_pickle +import random +from collections import defaultdict +from hashlib import md5 + +import numpy as np + +from .... import opcodes +from .... import tensor as mt +from ....config import options +from ....core import ENTITY_TYPE, OutputType, recursive_tile +from ....core.context import get_context +from ....core.operand import OperandStage +from ....lib.filesystem import FileSystem, get_fs +from ....serialization.serializables import ( + AnyField, + BoolField, + BytesField, + DictField, + Int32Field, + Int64Field, + KeyField, + StringField, +) +from ....tensor.core import TensorOrder +from ....utils import Timer, ceildiv, has_unknown_shape +from ...operands import LearnOperand, LearnOperandMixin +from ..core import get_proxima_type, proxima, validate_tensor + +logger = logging.getLogger(__name__) + + +class ProximaSearcher(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.PROXIMA_SIMPLE_SEARCHER + _tensor = KeyField("tensor") + _distance_metric = StringField("distance_metric") + _dimension = Int32Field("dimension") + _row_number = Int64Field("row_number") + _topk = Int32Field("topk") + _threads = Int32Field("threads") + _index = AnyField("index") + _index_searcher = StringField("index_searcher") + _index_searcher_params = DictField("index_searcher_params") + _index_reformer = StringField("index_reformer") + _index_reformer_params = DictField("index_reformer_params") + _download_index = BoolField("download_index") + _storage_options = BytesField( + "storage_options", on_serialize=pickle.dumps, on_deserialize=pickle.loads + ) + + def __init__( + self, + tensor=None, + distance_metric=None, + dimension=None, + row_number=None, + topk=None, + index=None, + threads=None, + index_searcher=None, + index_searcher_params=None, + index_reformer=None, + index_reformer_params=None, + download_index=None, + storage_options=None, + output_types=None, + stage=None, + **kw, + ): + super().__init__( + _tensor=tensor, + _distance_metric=distance_metric, + _row_number=row_number, + _dimension=dimension, + _index=index, + _threads=threads, + _index_searcher=index_searcher, + _index_searcher_params=index_searcher_params, + _index_reformer=index_reformer, + _index_reformer_params=index_reformer_params, + _download_index=download_index, + _output_types=output_types, + _topk=topk, + _storage_options=storage_options, + **kw, + ) + if self._output_types is None: + self._output_types = [OutputType.tensor, OutputType.tensor] + + @property + def tensor(self): + return self._tensor + + @property + def distance_metric(self): + return self._distance_metric + + @property + def 
dimension(self): + return self._dimension + + @property + def row_number(self): + return self._row_number + + @property + def topk(self): + return self._topk + + @property + def threads(self): + return self._threads + + @property + def index(self): + return self._index + + @property + def index_searcher(self): + return self._index_searcher + + @property + def index_searcher_params(self): + return self._index_searcher_params + + @property + def index_reformer(self): + return self._index_reformer + + @property + def index_reformer_params(self): + return self._index_reformer_params + + @property + def download_index(self): + return self._download_index + + @property + def storage_options(self): + return self._storage_options + + @property + def output_limit(self): + return 1 if self._download_index else 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.stage != OperandStage.agg and not self._download_index: + self._tensor = self._inputs[0] + if isinstance(self._index, ENTITY_TYPE): + self._index = self._inputs[-1] + + def __call__(self, tensor, index): + kws = [ + { + "dtype": np.dtype(np.uint64), + "shape": (tensor.shape[0], self._topk), + "order": TensorOrder.C_ORDER, + }, + { + "dtype": np.dtype(np.float32), + "shape": (tensor.shape[0], self._topk), + "order": TensorOrder.C_ORDER, + }, + ] + inputs = [tensor] + if hasattr(index, "op"): + inputs.append(index) + return mt.ExecutableTuple(self.new_tileables(inputs, kws=kws)) + + @classmethod + def _build_download_chunks(cls, op, indexes): + ctx = get_context() + workers = ctx.get_worker_addresses() or [None] + if len(workers) < len(indexes): + workers = [workers[i % len(workers)] for i in range(len(indexes))] + indexes_iter = iter(itertools.cycle(indexes)) + + download_chunks = defaultdict(list) + for i, worker in enumerate(workers): + download_op = op.copy().reset_key() + download_op.stage = OperandStage.map + download_op.expect_worker = worker + download_op._download_index = True + download_op._tensor = None + download_op._index = next(indexes_iter) + download_chunks[i % len(indexes)].append( + download_op.new_chunk( + None, index=(i,), shape=(), dtype=op.inputs[0].dtype + ) + ) + return download_chunks + + @classmethod + def tile(cls, op: "ProximaSearcher"): + tensor = op.tensor + index = op.index + topk = op.topk + outs = op.outputs + row_number = op.row_number + + ctx = get_context() + + # make sure all inputs have known chunk sizes + if has_unknown_shape(*op.inputs): + yield + + rechunk_size = dict() + if tensor.chunk_shape[1] > 1: + rechunk_size[1] = tensor.shape[1] + if row_number is not None: + rechunk_size[0] = tensor.shape[0] // row_number + if len(rechunk_size) > 0: + tensor = yield from recursive_tile(tensor.rechunk(rechunk_size)) + + logger.warning(f"query chunks count: {len(tensor.chunks)} ") + + if hasattr(index, "op"): + built_indexes = [index.chunks] * len(tensor.chunks) + else: + # index path + fs: FileSystem = get_fs(index, op.storage_options) + index_paths = [ + f for f in fs.ls(index) if f.rsplit("/", 1)[-1].startswith("proxima_") + ] + download_chunks = cls._build_download_chunks(op, index_paths) + iters = [iter(itertools.cycle(i)) for i in download_chunks.values()] + built_indexes = [] + for _ in range(len(tensor.chunks)): + built_indexes.append([next(it) for it in iters]) + + if hasattr(index, "op"): + index_chunks_workers = [ + m["bands"][0][0] + for m in ctx.get_chunks_meta( + [c.key for c in index.chunks], fields=["bands"] + ) + ] + else: + index_chunks_workers = [None] * 
len(built_indexes[0]) + + out_chunks = [], [] + for i, tensor_chunk in enumerate(tensor.chunks): + pk_chunks, distance_chunks = [], [] + for j, chunk_index, worker in zip( + itertools.count(), built_indexes[i], index_chunks_workers + ): + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + if hasattr(index, "op"): + chunk_op.expect_worker = worker + else: + chunk_op.expect_worker = chunk_index.op.expect_worker + chunk_op._index = chunk_index + chunk_op._tensor = None + chunk_kws = [ + { + "index": (tensor_chunk.index[0], j), + "dtype": outs[0].dtype, + "shape": (tensor_chunk.shape[0], topk), + "order": TensorOrder.C_ORDER, + }, + { + "index": (tensor_chunk.index[0], j), + "dtype": outs[1].dtype, + "shape": (tensor_chunk.shape[0], topk), + "order": TensorOrder.C_ORDER, + }, + ] + chunk_inputs = [tensor_chunk, chunk_index] + pk_chunk, distance_chunk = chunk_op.new_chunks( + chunk_inputs, kws=chunk_kws + ) + pk_chunks.append(pk_chunk) + distance_chunks.append(distance_chunk) + + if len(pk_chunks) == 1: + out_chunks[0].append(pk_chunks[0]) + out_chunks[1].append(distance_chunks[0]) + continue + + # combine topk results + combine_size = options.combine_size + + tensor_out_chunks = [pk_chunks, distance_chunks] + while True: + chunk_size = ceildiv(len(tensor_out_chunks[0]), combine_size) + cur_out_chunks = [[], []] + for k in range(chunk_size): + to_combine_pks = tensor_out_chunks[0][ + k * combine_size : (k + 1) * combine_size + ] + to_combine_distances = tensor_out_chunks[1][ + k * combine_size : (k + 1) * combine_size + ] + + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.agg + chunk_op._tensor = None + chunk_op._index = None + agg_chunk_kws = [ + { + "index": (i, 0), + "dtype": outs[0].dtype, + "shape": (tensor_chunk.shape[0], topk), + "order": outs[0].order, + }, + { + "index": (i, 0), + "dtype": outs[1].dtype, + "shape": (tensor_chunk.shape[0], topk), + "order": outs[1].order, + }, + ] + pk_result_chunk, distance_result_chunk = chunk_op.new_chunks( + to_combine_pks + to_combine_distances, kws=agg_chunk_kws + ) + cur_out_chunks[0].append(pk_result_chunk) + cur_out_chunks[1].append(distance_result_chunk) + tensor_out_chunks = cur_out_chunks + if len(tensor_out_chunks[0]) == 1: + break + out_chunks[0].append(tensor_out_chunks[0][0]) + out_chunks[1].append(tensor_out_chunks[1][0]) + + kws = [] + pk_params = outs[0].params + pk_params["chunks"] = out_chunks[0] + pk_params["nsplits"] = (tensor.nsplits[0], (topk,)) + kws.append(pk_params) + distance_params = outs[1].params + distance_params["chunks"] = out_chunks[1] + distance_params["nsplits"] = (tensor.nsplits[0], (topk,)) + kws.append(distance_params) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=kws) + + @classmethod + def _execute_download(cls, ctx, op: "ProximaSearcher"): + index_path = op.index + with Timer() as timer: + fs = get_fs(index_path, op.storage_options) + + # TODO + dirs = os.environ.get("MARS_SPILL_DIRS") + if dirs: + temp_dir = random.choice(dirs.split(":")) + else: + temp_dir = "/tmp/proxima-index/" + + local_path = os.path.join( + temp_dir, md5(str(index_path).encode("utf-8")).hexdigest() + ) # noqa: B303 # nosec + exist_state = True + if not os.path.exists(local_path): + exist_state = False + if not os.path.exists(local_path.rsplit("/", 1)[0]): + os.mkdir(local_path.rsplit("/", 1)[0]) + with open(local_path, "wb") as out_f: + with fs.open(index_path, "rb") as in_f: + # 32M + chunk_bytes = 32 * 1024**2 + while True: + data = in_f.read(chunk_bytes) + if data: + 
out_f.write(data) + else: + break + + logger.warning( + f"ReadingFromVolume({op.key}), index path: {index_path}, " + f"local_path {local_path}" + f"size {os.path.getsize(local_path)}, " + f"already exist {exist_state}, " + f"costs {timer.duration} seconds " + f"speed {round(os.path.getsize(local_path) / (1024 ** 2) / timer.duration, 2)} MB/s" + ) + ctx[op.outputs[0].key] = local_path + + @classmethod + def _execute_map(cls, ctx, op: "ProximaSearcher"): + if op.download_index: + cls._execute_download(ctx, op) + return + + inp = ctx[op.tensor.key] + index_path = ctx[op.inputs[-1].key] + + with Timer() as timer: + flow = proxima.IndexFlow( + container_name="MMapFileContainer", + container_params={}, + searcher_name=op.index_searcher, + searcher_params=op.index_searcher_params, + measure_name="", + measure_params={}, + reformer_name=op.index_reformer, + reformer_params=op.index_reformer_params, + ) + + flow.load(index_path) + vecs = np.ascontiguousarray(inp) + + logger.warning( + f"LoadIndex({op.key}) index path: {index_path} costs {timer.duration} seconds" + ) + logger.warning(f"threads count:{op.threads} vecs count:{len(vecs)}") + + with Timer() as timer: + batch = 10000 + s_idx = 0 + e_idx = min(s_idx + batch, len(vecs)) + result_pks, result_distances = None, None + while s_idx < len(vecs): + with Timer() as timer_s: + tp = get_proxima_type(vecs.dtype) + result_pks_b, result_distances_b = proxima.IndexUtility.ann_search( + searcher=flow, + type=tp, + query=vecs[s_idx:e_idx], + topk=op.topk, + threads=op.threads, + ) + if result_pks is None: + result_pks = np.asarray(result_pks_b) + result_distances = np.asarray(result_distances_b) + else: + result_pks = np.concatenate( + (result_pks, np.asarray(result_pks_b)) + ) + result_distances = np.concatenate( + (result_distances, np.asarray(result_distances_b)) + ) + + s_idx = e_idx + e_idx = min(s_idx + batch, len(vecs)) + logger.warning( + f"Search({op.key}) count {s_idx}/{len(vecs)}:{round(s_idx * 100 / len(vecs), 2)}%" + f" costs {round(timer_s.duration, 2)} seconds" + ) + logger.warning(f"Search({op.key}) costs {timer.duration} seconds") + + ctx[op.outputs[0].key] = np.asarray(result_pks) + ctx[op.outputs[1].key] = np.asarray(result_distances) + + @classmethod + def _execute_agg(cls, ctx, op: "ProximaSearcher"): + inputs_data = [ctx[inp.key] for inp in op.inputs] + + chunk_num = len(inputs_data) // 2 + pks = np.concatenate(inputs_data[:chunk_num], axis=1) + distances = np.concatenate(inputs_data[chunk_num:], axis=1) + + n_doc = len(pks) + topk = op.topk + + # calculate topk on rows + if op.distance_metric == "InnerProduct": + inds = np.argsort(distances, axis=1)[:, -1 : -topk - 1 : -1] + else: + inds = np.argsort(distances, axis=1)[:, :topk] + + result_pks = np.empty((n_doc, topk), dtype=pks.dtype) + result_distances = np.empty((n_doc, topk), dtype=distances.dtype) + rng = np.arange(n_doc) + for i in range(topk): + ind = inds[:, i] + result_pks[:, i] = pks[rng, ind] + result_distances[:, i] = distances[rng, ind] + del rng + + ctx[op.outputs[0].key] = result_pks + ctx[op.outputs[1].key] = result_distances + + @classmethod + def execute(cls, ctx, op: "ProximaSearcher"): + if op.stage != OperandStage.agg: + return cls._execute_map(ctx, op) + else: + return cls._execute_agg(ctx, op) + + +def search_index( + tensor, + topk, + index, + threads=4, + row_number=None, + dimension=None, + distance_metric=None, + index_searcher=None, + index_searcher_params=None, + index_reformer=None, + index_reformer_params=None, + storage_options=None, + run=True, + 
session=None, + run_kwargs=None, +): + tensor = validate_tensor(tensor) + + if dimension is None: + dimension = tensor.shape[1] + if index_searcher is None: + index_searcher = "" + if index_searcher_params is None: + index_searcher_params = {} + if index_reformer is None: + index_reformer = "" + if index_reformer_params is None: + index_reformer_params = {} + if distance_metric is None: + distance_metric = "" + if hasattr(index, "op") and index.op.index_path is not None: + storage_options = storage_options or index.op.storage_options + index = index.op.index_path + + op = ProximaSearcher( + tensor=tensor, + distance_metric=distance_metric, + dimension=dimension, + row_number=row_number, + topk=topk, + index=index, + threads=threads, + index_searcher=index_searcher, + index_searcher_params=index_searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + storage_options=storage_options, + ) + result = op(tensor, index) + if run: + return result.execute(session=session, **(run_kwargs or dict())) + else: + return result diff --git a/python/xorbits/_mars/learn/proxima/simple_index/tests/__init__.py b/python/xorbits/_mars/learn/proxima/simple_index/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/proxima/simple_index/tests/test_simple_index.py b/python/xorbits/_mars/learn/proxima/simple_index/tests/test_simple_index.py new file mode 100644 index 000000000..1746c9ed2 --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/tests/test_simple_index.py @@ -0,0 +1,799 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from ..... import tensor as mt +from ...core import proxima +from .. 
import build_index, recall, search_index + + +def proxima_build_and_query( + doc, + query, + topk, + measure_name=None, + dimension=None, + index_builder=None, + builder_params=None, + index_converter=None, + index_converter_params=None, + index_searcher=None, + searcher_params=None, + index_reformer=None, + index_reformer_params=None, +): + if measure_name is None: + measure_name = "SquaredEuclidean" + if dimension is None: + dimension = doc.shape[1] + if index_builder is None: + index_builder = "SsgBuilder" + if builder_params is None: + builder_params = {} + if index_converter_params is None: + index_converter_params = {} + if index_searcher is None: + index_searcher = "" + if searcher_params is None: + searcher_params = {} + if index_reformer is None: + index_reformer = "" + if index_reformer_params is None: + index_reformer_params = {} + + map_dtype = { + np.dtype(np.float32): proxima.IndexMeta.FT_FP32, + np.dtype(np.int16): proxima.IndexMeta.FT_INT16, + } + # holder + holder = proxima.IndexHolder(type=map_dtype[doc.dtypes[0]], dimension=dimension) + holder.mount(np.array(doc)) # add batch data, pk starts from 0 + + # converter + meta = proxima.IndexMeta( + map_dtype[doc.dtypes[0]], dimension=dimension, measure_name=measure_name + ) + if index_converter is not None: + if index_converter == "MipsConverter": + measure_name = "" + converter = proxima.IndexConverter( + name=index_converter, meta=meta, params=index_converter_params + ) + converter.train_and_transform(holder) + holder = converter.result() + meta = converter.meta() + + # builder && dumper + builder = proxima.IndexBuilder(name=index_builder, meta=meta, params=builder_params) + builder = builder.train_and_build(holder) + dumper = proxima.IndexDumper(name="MemoryDumper", path="test.index") + builder.dump(dumper) + dumper.close() + + # indexflow for search + flow = proxima.IndexFlow( + container_name="MemoryContainer", + container_params={}, + searcher_name=index_searcher, + searcher_params=searcher_params, + measure_name=measure_name, + measure_params={}, + reformer_name=index_reformer, + reformer_params=index_reformer_params, + ) + flow.load("test.index") + keys, scores = proxima.IndexUtility.ann_search( + searcher=flow, query=query, topk=topk, threads=1 + ) + return np.asarray(keys), np.asarray(scores) + + +def gen_data(doc_count, query_count, dimension, dtype=np.float32): + if dtype == np.float32: + rs = np.random.RandomState(0) + doc = pd.DataFrame(rs.rand(doc_count, dimension).astype(dtype)) + query = rs.rand(query_count, dimension).astype(dtype) + elif dtype == np.int32: + rs = np.random.RandomState(0) + doc = pd.DataFrame((rs.rand(doc_count, dimension) * 1000).astype(dtype)) + query = (rs.rand(query_count, dimension) * 1000).astype(dtype) + else: + raise ValueError(f"Unsupported dtype {dtype}") + return doc, query + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def build_and_query( + doc, + query, + topk, + column_number, + row_number, + threads=1, + dimension=None, + measure_name=None, + index_builder=None, + builder_params=None, + index_converter=None, + index_converter_params=None, + index_searcher=None, + searcher_params=None, + index_reformer=None, + index_reformer_params=None, +): + if measure_name is None: + measure_name = "SquaredEuclidean" + if dimension is None: + dimension = doc.shape[1] + if index_builder is None: + index_builder = "SsgBuilder" + if builder_params is None: + builder_params = {} + if index_converter_params is None: + index_converter_params = {} + if index_searcher is 
None: + index_searcher = "" + if searcher_params is None: + searcher_params = {} + if index_reformer is None: + index_reformer = "" + if index_reformer_params is None: + index_reformer_params = {} + + doc = md.DataFrame(pd.DataFrame(doc)) + query = mt.tensor(query) + + index = build_index( + tensor=doc, + need_shuffle=False, + column_number=column_number, + distance_metric=measure_name, + dimension=dimension, + index_builder=index_builder, + index_builder_params=builder_params, + index_converter=index_converter, + index_converter_params=index_converter_params, + ) + paths = index.fetch() + if not isinstance(paths, list): + paths = [paths] + + try: + for path in paths: + with open(path, "rb") as f: + assert len(f.read()) > 0 + + pk2, distance = search_index( + tensor=query, + threads=threads, + row_number=row_number, + distance_metric=measure_name, + dimension=dimension, + topk=topk, + index=index, + index_searcher=index_searcher, + index_searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + ) + assert pk2.shape == (len(query), topk) + assert distance.shape == (len(query), topk) + return pk2, distance + finally: + for path in paths: + os.remove(path) + + +def consistency_checking( + doc, + query, + dimension, + topk, + measure_name, + column_number, + row_number, + threads, + index_builder, + builder_params, + index_converter, + index_converter_params, + index_searcher, + searcher_params, + index_reformer, + index_reformer_params, + decimal=6, +): + # proxima_data + pk_p, distance_p = proxima_build_and_query( + doc=doc, + query=query, + dimension=dimension, + topk=topk, + measure_name=measure_name, + index_builder=index_builder, + builder_params=builder_params, + index_converter=index_converter, + index_converter_params=index_converter_params, + index_searcher=index_searcher, + searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + ) + + # mars_data + pk_m, distance_m = build_and_query( + doc, + query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params, + index_converter=index_converter, + index_converter_params=index_converter_params, + index_searcher=index_searcher, + searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + ) + + # testing + np.testing.assert_array_equal(pk_p, pk_m) + np.testing.assert_array_almost_equal(distance_p, distance_m, decimal=decimal) + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def test_build_and_search_index(setup): + # for now, test SquaredEuclidean and Euclidean only, + # TODO: add more tests for "Canberra", "Chebyshev" + # "Manhattan" when ready + + # L2 space + # params + doc_count, query_count, dimension, topk = 200, 15, 5, 3 + threads, column_number, row_number = 4, 2, 2 + measure_name_lists = ["SquaredEuclidean", "Euclidean"] + index_builder_lists = [ + "SsgBuilder", + "HnswBuilder", + "LinearBuilder", + "ClusteringBuilder", + "GcBuilder", + "QcBuilder", + ] + builder_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + {"proxima.qc.builder.centroid_count": "16"}, + ] + index_searcher_lists = [ + "SsgSearcher", + "HnswSearcher", + "LinearSearcher", + "ClusteringSearcher", + "GcSearcher", + "QcSearcher", + ] + 
searcher_params = {} + index_converter, index_converter_params = None, {} + index_reformer, index_reformer_params = "", {} + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + # test + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc=doc, + query=query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter, + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + ) + + # L2 space with HalfFloatConverter + # params + doc_count, query_count, dimension, topk = 200, 15, 5, 3 + threads, column_number, row_number = 4, 2, 2 + measure_name_lists = ["SquaredEuclidean", "Euclidean"] + index_builder_lists = [ + "SsgBuilder", + "HnswBuilder", + "LinearBuilder", + "ClusteringBuilder", + "GcBuilder", + "QcBuilder", + ] + builder_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + {"proxima.qc.builder.centroid_count": "16"}, + ] + index_searcher_lists = [ + "SsgSearcher", + "HnswSearcher", + "LinearSearcher", + "ClusteringSearcher", + "GcSearcher", + "QcSearcher", + ] + index_converter_lists = [ + "HalfFloatConverter", + "HalfFloatConverter", + "HalfFloatConverter", + "HalfFloatConverter", + "HalfFloatConverter", + "HalfFloatConverter", + ] + searcher_params = {} + index_converter, index_converter_params = None, {} + index_reformer, index_reformer_params = "", {} + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + # test + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc=doc, + query=query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter_lists[i], + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + decimal=7, + ) + + # L2 space with Int8QuantizerConverter + # params + doc_count, query_count, dimension, topk = 2000, 1, 32, 5 + threads, column_number, row_number = 4, 2, 1 + + measure_name_lists = ["SquaredEuclidean", "Euclidean"] + index_builder_lists = [ + "SsgBuilder", + "HnswBuilder", + "LinearBuilder", + "ClusteringBuilder", + "GcBuilder", + "QcBuilder", + ] + builder_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + { + "proxima.qc.builder.centroid_count": "16", + "proxima.qc.builder.quantizer_class": "Int8QuantizerConverter", + }, + ] + index_searcher_lists = [ + "SsgSearcher", + "HnswSearcher", + "LinearSearcher", + "ClusteringSearcher", + "GcSearcher", + "QcSearcher", + ] + searcher_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.searcher.scan_ratio": 1}, + {"proxima.gc.searcher.scan_ratio": 1}, + {"proxima.qc.searcher.scan_ratio": 1}, + ] + index_converter_lists = 
[ + "Int8QuantizerConverter", + "Int8QuantizerConverter", + "Int8QuantizerConverter", + "Int8QuantizerConverter", + "Int8QuantizerConverter", + None, + ] + index_converter_params = {} + index_reformer, index_reformer_params = "", {} + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + # test + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc=doc, + query=query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter_lists[i], + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params_lists[i], + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + decimal=2, + ) + + # L2 space with Int4QuantizerConverter + # params + doc_count, query_count, dimension, topk = 2000, 1, 32, 5 + threads, column_number, row_number = 4, 2, 1 + + measure_name_lists = ["SquaredEuclidean", "Euclidean"] + index_builder_lists = [ + "SsgBuilder", + "HnswBuilder", + "LinearBuilder", + "ClusteringBuilder", + "GcBuilder", + "QcBuilder", + ] + builder_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + { + "proxima.qc.builder.centroid_count": "16", + "proxima.qc.builder.quantizer_class": "Int4QuantizerConverter", + }, + ] + index_searcher_lists = [ + "SsgSearcher", + "HnswSearcher", + "LinearSearcher", + "ClusteringSearcher", + "GcSearcher", + "QcSearcher", + ] + searcher_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.searcher.scan_ratio": 1}, + {"proxima.gc.searcher.scan_ratio": 1}, + {"proxima.qc.searcher.scan_ratio": 1}, + ] + index_converter_lists = [ + "Int4QuantizerConverter", + "Int4QuantizerConverter", + "Int4QuantizerConverter", + "Int4QuantizerConverter", + "Int4QuantizerConverter", + None, + ] + index_converter_params = {} + index_reformer, index_reformer_params = "", {} + + # data + doc, query = gen_data( + doc_count=doc_count, + query_count=query_count, + dimension=dimension, + dtype=np.float32, + ) + + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc=doc, + query=query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter_lists[i], + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params_lists[i], + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + decimal=2, + ) + + # L2 space with NormalizeConverter + # params + doc_count, query_count, dimension, topk = 2000, 1, 32, 5 + threads, column_number, row_number = 4, 2, 1 + + measure_name_lists = ["SquaredEuclidean", "Euclidean"] + index_builder_lists = [ + "SsgBuilder", + "HnswBuilder", + "LinearBuilder", + "ClusteringBuilder", + "GcBuilder", + "QcBuilder", + ] + builder_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + {"proxima.qc.builder.centroid_count": "16"}, + ] + index_searcher_lists = [ + 
"SsgSearcher", + "HnswSearcher", + "LinearSearcher", + "ClusteringSearcher", + "GcSearcher", + "QcSearcher", + ] + searcher_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.searcher.scan_ratio": 1}, + {"proxima.gc.searcher.scan_ratio": 1}, + {"proxima.qc.searcher.scan_ratio": 1}, + ] + index_converter_lists = [ + "NormalizeConverter", + "NormalizeConverter", + "NormalizeConverter", + "NormalizeConverter", + "NormalizeConverter", + "NormalizeConverter", + ] + index_converter_params = {} + index_reformer, index_reformer_params = "", {} + # data + doc, query = gen_data( + doc_count=doc_count, + query_count=query_count, + dimension=dimension, + dtype=np.float32, + ) + + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc, + query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter_lists[i], + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params_lists[i], + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + decimal=2, + ) + + # InnerProduct space + # params + doc_count, query_count, dimension, topk = 200, 15, 5, 2 + threads, column_number, row_number = 4, 2, 2 + + measure_name_lists = ["InnerProduct"] + index_builder_lists = [ + "LinearBuilder", + "QcBuilder", + "HnswBuilder", + "SsgBuilder", + "ClusteringBuilder", + "GcBuilder", + ] + builder_params_lists = [ + {}, + {"proxima.qc.builder.centroid_count": "16"}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + ] + index_searcher_lists = [ + "LinearSearcher", + "QcSearcher", + "HnswSearcher", + "SsgSearcher", + "ClusteringSearcher", + "GcSearcher", + ] + index_converter_lists = [ + None, + None, + "MipsConverter", + "MipsConverter", + "MipsConverter", + "MipsConverter", + ] + + searcher_params = {} + index_converter_params = {} + index_reformer, index_reformer_params = "", {} + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc, + query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter_lists[i], + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + decimal=5, + ) + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def test_build_and_search_index_with_filesystem(setup): + with tempfile.TemporaryDirectory() as f: + # params + doc_count, query_count, dimension = 2000, 50, 10 + topk = 10 + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + df = md.DataFrame(pd.DataFrame(doc)) + q = mt.tensor(query) + + index = build_index(tensor=df, index_path=f, column_number=2) + + assert len(os.listdir(f)) > 0 + + # proxima_data + pk_p, distance_p = proxima_build_and_query(doc, query, topk) + pk_m, distance_m = 
search_index(tensor=q, topk=topk, index=index, row_number=5) + + # testing + np.testing.assert_array_equal(pk_p, pk_m) + np.testing.assert_array_equal(distance_p, distance_m) + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def test_build_and_search_index_with_filesystem_download(setup): + with tempfile.TemporaryDirectory() as f: + # params + doc_count, query_count, dimension = 2000, 15, 10 + topk = 10 + doc_chunk, query_chunk = 1000, 5 + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + df = md.DataFrame(pd.DataFrame(doc), chunk_size=(doc_chunk, dimension)) + q = mt.tensor(query, chunk_size=(query_chunk, dimension)) + + index = build_index(tensor=df, index_path=f, column_number=2) + + assert len(os.listdir(f)) > 0 + + search_index(q[0:5], topk, index) + search_index(q[5:10], topk, index) + search_index(q[10:15], topk, index) + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def test_recall(setup): + # params + doc_count, query_count, dimension = 2000, 150, 20 + topk = 100 + sample_count = 100 + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + # proxima_data + pk_p, distance_p = build_and_query( + doc, + query, + dimension=dimension, + topk=topk, + threads=1, + column_number=2, + row_number=3, + ) + assert isinstance( + recall( + doc=doc, + query=query, + topk=topk, + sample_count=sample_count, + pk_p=pk_p, + distance_p=distance_p, + column_number=2, + row_number=2, + ), + dict, + ) diff --git a/python/xorbits/_mars/learn/semi_supervised/__init__.py b/python/xorbits/_mars/learn/semi_supervised/__init__.py new file mode 100644 index 000000000..919f15491 --- /dev/null +++ b/python/xorbits/_mars/learn/semi_supervised/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._label_propagation import LabelPropagation diff --git a/python/xorbits/_mars/learn/semi_supervised/_label_propagation.py b/python/xorbits/_mars/learn/semi_supervised/_label_propagation.py new file mode 100644 index 000000000..a7c9904d9 --- /dev/null +++ b/python/xorbits/_mars/learn/semi_supervised/_label_propagation.py @@ -0,0 +1,368 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from abc import ABCMeta, abstractmethod + +from sklearn.base import BaseEstimator +from sklearn.exceptions import ConvergenceWarning + +from ... 
import tensor as mt +from ...core import ExecutableTuple +from ..base import ClassifierMixin +from ..metrics.pairwise import rbf_kernel +from ..neighbors.unsupervised import NearestNeighbors +from ..utils import check_array +from ..utils.multiclass import check_classification_targets +from ..utils.validation import check_is_fitted, check_X_y + + +class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for label propagation module. + + Parameters + ---------- + kernel : {'knn', 'rbf', callable} + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. The function + passed should take two inputs, each of shape [n_samples, n_features], + and return a [n_samples, n_samples] shaped weight matrix + + gamma : float + Parameter for rbf kernel + + n_neighbors : integer > 0 + Parameter for knn kernel + + alpha : float + Clamping factor + + max_iter : integer + Change maximum number of iterations allowed + + tol : float + Convergence tolerance: threshold to consider the system at steady + state + """ + + def __init__( + self, kernel="rbf", gamma=20, n_neighbors=7, alpha=1, max_iter=30, tol=1e-3 + ): + self.max_iter = max_iter + self.tol = tol + + # kernel parameters + self.kernel = kernel + self.gamma = gamma + self.n_neighbors = n_neighbors + + # clamping factor + self.alpha = alpha + + self.nn_fit = None + + def _get_kernel(self, X, y=None): + if self.kernel == "rbf": + if y is None: + return rbf_kernel(X, X, gamma=self.gamma) + else: + return rbf_kernel(X, y, gamma=self.gamma) + elif self.kernel == "knn": + if self.nn_fit is None: + self.nn_fit = NearestNeighbors(self.n_neighbors).fit(X) + if y is None: + return self.nn_fit.kneighbors_graph( + self.nn_fit._fit_X, self.n_neighbors, mode="connectivity" + ) + else: + return self.nn_fit.kneighbors(y, return_distance=False) + elif callable(self.kernel): + if y is None: + return self.kernel(X, X) + else: + return self.kernel(X, y) + else: # pragma: no cover + raise ValueError( + f"{self.kernel} is not a valid kernel. Only rbf and knn" + " or an explicit function " + " are supported at this time." + ) + + @abstractmethod + def _build_graph(self): # pragma: no cover + raise NotImplementedError( + "Graph construction must be implemented" + " to fit a label propagation model." + ) + + def predict(self, X, session=None, run_kwargs=None): + """Performs inductive inference across the model. + + Parameters + ---------- + X : array_like, shape = [n_samples, n_features] + + Returns + ------- + y : array_like, shape = [n_samples] + Predictions for input data + """ + probas = self.predict_proba(X, session=session, run_kwargs=run_kwargs) + result = mt.tensor(self.classes_)[mt.argmax(probas, axis=1)].ravel() + result.execute(session=session, **(run_kwargs or dict())) + return result + + def predict_proba(self, X, session=None, run_kwargs=None): + """Predict probability for each possible outcome. + + Compute the probability estimates for each single sample in X + and each possible outcome seen during training (categorical + distribution). 
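+
+        With the ``knn`` kernel, the estimate for each sample is the sum of
+        the label distributions of its nearest training points; with ``rbf``
+        or a callable kernel, it is the kernel-weighted combination of the
+        training label distributions. Rows are normalized to sum to one.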
+ + Parameters + ---------- + X : array_like, shape = [n_samples, n_features] + + Returns + ------- + probabilities : Tensor, shape = [n_samples, n_classes] + Normalized probability distributions across + class labels + """ + + check_is_fitted(self, "X_") + + X_2d = check_array(X, accept_sparse=True) + weight_matrices = self._get_kernel(self.X_, X_2d) + if self.kernel == "knn": + probabilities = mt.array( + [ + mt.sum(self.label_distributions_[weight_matrix], axis=0) + for weight_matrix in weight_matrices + ] + ) + else: + weight_matrices = weight_matrices.T + probabilities = mt.dot(weight_matrices, self.label_distributions_) + normalizer = mt.atleast_2d(mt.sum(probabilities, axis=1)).T + probabilities /= normalizer + probabilities.execute(session=session, **(run_kwargs or dict())) + return probabilities + + def fit(self, X, y, session=None, run_kwargs=None): + """Fit a semi-supervised label propagation model based + + All the input data is provided matrix X (labeled and unlabeled) + and corresponding label matrix y with a dedicated marker value for + unlabeled samples. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + A {n_samples by n_samples} size matrix will be created from this + + y : array_like, shape = [n_samples] + n_labeled_samples (unlabeled points are marked as -1) + All unlabeled samples will be transductively assigned labels + + Returns + ------- + self : returns an instance of self. + """ + X, y = check_X_y(X, y) + self.X_ = X + to_run = [check_classification_targets(y)] + + # actual graph construction (implementations should override this) + graph_matrix = self._build_graph() + + # label construction + # construct a categorical distribution for classification only + classes = mt.unique(y, aggregate_size=1).to_numpy( + session=session, **(run_kwargs or dict()) + ) + classes = classes[classes != -1] + self.classes_ = classes + + n_samples, n_classes = len(y), len(classes) + + alpha = self.alpha + # add check when we support LabelSpreading + # if self._variant == 'spreading' and \ + # (alpha is None or alpha <= 0.0 or alpha >= 1.0): + # raise ValueError('alpha=%s is invalid: it must be inside ' + # 'the open interval (0, 1)' % alpha) + y = mt.asarray(y) + unlabeled = y == -1 + + # initialize distributions + self.label_distributions_ = mt.zeros((n_samples, n_classes)) + for label in classes: + self.label_distributions_[y == label, classes == label] = 1 + + y_static = mt.copy(self.label_distributions_) + if self._variant == "propagation": + # LabelPropagation + y_static[unlabeled] = 0 + else: # pragma: no cover + # LabelSpreading + y_static *= 1 - alpha + + l_previous = mt.zeros((self.X_.shape[0], n_classes)) + + unlabeled = unlabeled[:, mt.newaxis] + + for self.n_iter_ in range(self.max_iter): + cond = mt.abs(self.label_distributions_ - l_previous).sum() < self.tol + + to_run.append(cond) + ExecutableTuple(to_run).execute(session=session, **(run_kwargs or dict())) + # clear + to_run = [] + + if cond.fetch(session=session): + break + + l_previous = self.label_distributions_ + self.label_distributions_ = graph_matrix.dot(self.label_distributions_) + + if self._variant == "propagation": + normalizer = mt.sum(self.label_distributions_, axis=1)[:, mt.newaxis] + self.label_distributions_ /= normalizer + self.label_distributions_ = mt.where( + unlabeled, self.label_distributions_, y_static + ) + else: # pragma: no cover + # clamp + self.label_distributions_ = ( + mt.multiply(alpha, self.label_distributions_) + y_static + ) + + 
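+            # Defer execution: the freshly updated distributions are queued so
+            # that they are evaluated together with the next iteration's
+            # convergence check in a single batch.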
to_run.append(self.label_distributions_) + else: + warnings.warn( + f"max_iter={self.max_iter} was reached without convergence.", + category=ConvergenceWarning, + ) + self.n_iter_ += 1 + + normalizer = mt.sum(self.label_distributions_, axis=1)[:, mt.newaxis] + self.label_distributions_ /= normalizer + + # set the transduction item + transduction = mt.tensor(self.classes_)[ + mt.argmax(self.label_distributions_, axis=1) + ] + self.transduction_ = transduction.ravel() + ExecutableTuple([self.label_distributions_, self.transduction_]).execute( + session=session, **(run_kwargs or dict()) + ) + return self + + +class LabelPropagation(BaseLabelPropagation): + """Label Propagation classifier + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : {'knn', 'rbf', callable} + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. The function + passed should take two inputs, each of shape [n_samples, n_features], + and return a [n_samples, n_samples] shaped weight matrix. + + gamma : float + Parameter for rbf kernel + + n_neighbors : integer > 0 + Parameter for knn kernel + + max_iter : integer + Change maximum number of iterations allowed + + tol : float + Convergence tolerance: threshold to consider the system at steady + state + + Attributes + ---------- + X_ : array, shape = [n_samples, n_features] + Input array. + + classes_ : array, shape = [n_classes] + The distinct labels used in classifying instances. + + label_distributions_ : array, shape = [n_samples, n_classes] + Categorical distribution for each item. + + transduction_ : array, shape = [n_samples] + Label assigned to each item via the transduction. + + n_iter_ : int + Number of iterations run. + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> from mars.learn.semi_supervised import LabelPropagation + >>> label_prop_model = LabelPropagation() + >>> iris = datasets.load_iris() + >>> rng = np.random.RandomState(42) + >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3 + >>> labels = np.copy(iris.target) + >>> labels[random_unlabeled_points] = -1 + >>> label_prop_model.fit(iris.data, labels) + LabelPropagation(...) + + References + ---------- + Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled data + with label propagation. Technical Report CMU-CALD-02-107, Carnegie Mellon + University, 2002 http://pages.cs.wisc.edu/~jerryzhu/pub/CMU-CALD-02-107.pdf + + See Also + -------- + LabelSpreading : Alternate label propagation strategy more robust to noise + """ + + _variant = "propagation" + + def __init__(self, kernel="rbf", gamma=20, n_neighbors=7, max_iter=1000, tol=1e-3): + super().__init__( + kernel=kernel, + gamma=gamma, + n_neighbors=n_neighbors, + max_iter=max_iter, + tol=tol, + alpha=None, + ) + + def _build_graph(self): + """Matrix representing a fully connected graph between each sample + + This basic implementation creates a non-stochastic affinity matrix, so + class distributions will exceed 1 (normalization may be desired). 
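+ + Returns + ------- + affinity_matrix : Tensor of shape (n_samples, n_samples) + The normalized affinity matrix used to propagate labels.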
+ """ + if self.kernel == "knn": + self.nn_fit = None + affinity_matrix = self._get_kernel(self.X_) + normalizer = affinity_matrix.sum(axis=0) + affinity_matrix /= normalizer[:, mt.newaxis] + return affinity_matrix + + def fit(self, X, y, session=None, run_kwargs=None): + return super().fit(X, y, session=session, run_kwargs=run_kwargs) diff --git a/python/xorbits/_mars/learn/semi_supervised/tests/__init__.py b/python/xorbits/_mars/learn/semi_supervised/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/semi_supervised/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/semi_supervised/tests/test_label_propagation.py b/python/xorbits/_mars/learn/semi_supervised/tests/test_label_propagation.py new file mode 100644 index 000000000..494b2447d --- /dev/null +++ b/python/xorbits/_mars/learn/semi_supervised/tests/test_label_propagation.py @@ -0,0 +1,144 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.datasets import make_classification +from sklearn.exceptions import ConvergenceWarning +from sklearn.model_selection import train_test_split +from sklearn.utils._testing import assert_no_warnings + +from .... import tensor as mt +from ...metrics.pairwise import rbf_kernel +from ...neighbors import NearestNeighbors +from .. 
import LabelPropagation + +estimators = [ + (LabelPropagation, {"kernel": "rbf"}), + (LabelPropagation, {"kernel": "knn", "n_neighbors": 2}), + (LabelPropagation, {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)}), +] + + +@pytest.mark.parametrize("estimator, parameters", estimators) +def test_fit_transduction(setup, estimator, parameters): + samples = [[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]] + labels = [0, 1, -1] + clf = estimator(**parameters).fit(samples, labels) + assert clf.transduction_[2].fetch() == 1 + + +@pytest.mark.parametrize("estimator, parameters", estimators) +def test_distribution(setup, estimator, parameters): + samples = [[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]] + labels = [0, 1, -1] + clf = estimator(**parameters).fit(samples, labels) + if parameters["kernel"] == "knn": + return # unstable test; changes in k-NN ordering break it + else: + np.testing.assert_array_almost_equal( + np.asarray(clf.label_distributions_[2]), np.array([0.5, 0.5]), 2 + ) + + +@pytest.mark.parametrize("estimator, parameters", estimators) +def test_predict(setup, estimator, parameters): + samples = [[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]] + labels = [0, 1, -1] + clf = estimator(**parameters).fit(samples, labels) + np.testing.assert_array_equal(clf.predict([[0.5, 2.5]]).fetch(), np.array([1])) + + +@pytest.mark.parametrize("estimator, parameters", estimators) +def test_predict_proba(setup, estimator, parameters): + samples = [[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]] + labels = [0, 1, -1] + clf = estimator(**parameters).fit(samples, labels) + np.testing.assert_almost_equal( + clf.predict_proba([[1.0, 1.0]]).fetch(), np.array([[0.5, 0.5]]) + ) + + +def test_label_propagation_closed_form(setup): + n_classes = 2 + X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + y[::3] = -1 + Y = np.zeros((len(y), n_classes + 1)) + Y[np.arange(len(y)), y] = 1 + unlabelled_idx = Y[:, (-1,)].nonzero()[0] + labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0] + + clf = LabelPropagation(max_iter=10000, gamma=0.1) + clf.fit(X, y) + # adopting notation from Zhu et al 2002 + T_bar = clf._build_graph().to_numpy() + Tuu = T_bar[tuple(np.meshgrid(unlabelled_idx, unlabelled_idx, indexing="ij"))] + Tul = T_bar[tuple(np.meshgrid(unlabelled_idx, labelled_idx, indexing="ij"))] + Y = Y[:, :-1] + Y_l = Y[labelled_idx, :] + Y_u = np.dot(np.dot(np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu), Tul), Y_l) + + expected = Y.copy() + expected[unlabelled_idx, :] = Y_u + expected /= expected.sum(axis=1)[:, np.newaxis] + + np.testing.assert_array_almost_equal(expected, clf.label_distributions_.fetch(), 4) + + +def test_convergence_warning(setup): + # This is a non-regression test for #5774 + X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]]) + y = np.array([0, 1, -1]) + + mdl = LabelPropagation(kernel="rbf", max_iter=1) + with pytest.warns(ConvergenceWarning): + mdl.fit(X, y) + assert mdl.n_iter_ == mdl.max_iter + + mdl = LabelPropagation(kernel="rbf", max_iter=500) + assert_no_warnings(mdl.fit, X, y) + + +def test_predict_sparse_callable_kernel(setup): + # This is a non-regression test for #15866 + + # Custom sparse kernel (top-K RBF) + def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5): + nn = NearestNeighbors(n_neighbors=10, metric="euclidean", n_jobs=-1) + nn.fit(X) + W = -1 * mt.power(nn.kneighbors_graph(Y, mode="distance"), 2) * gamma + W = mt.exp(W) + assert W.issparse() + return W.T + + n_classes = 4 + n_samples = 500 + n_test = 10 + X, y = make_classification( + n_classes=n_classes, + n_samples=n_samples, + n_features=20, + 
n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0, + ) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=0 + ) + + model = LabelPropagation(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test).fetch() >= 0.9 diff --git a/python/xorbits/_mars/learn/tests/__init__.py b/python/xorbits/_mars/learn/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/tests/test_wrappers.py b/python/xorbits/_mars/learn/tests/test_wrappers.py new file mode 100644 index 000000000..c0707ae58 --- /dev/null +++ b/python/xorbits/_mars/learn/tests/test_wrappers.py @@ -0,0 +1,107 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.datasets import make_classification +from sklearn.decomposition import PCA +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.linear_model import LinearRegression, LogisticRegression + +from ... import tensor as mt +from ..wrappers import ParallelPostFit + + +def test_parallel_post_fit_basic(setup): + raw_x, raw_y = make_classification(n_samples=1000) + X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100) + clf = ParallelPostFit(GradientBoostingClassifier()) + clf.fit(X, y) + + assert isinstance(clf.predict(X), mt.Tensor) + assert isinstance(clf.predict_proba(X), mt.Tensor) + + result = clf.score(X, y) + expected = clf.estimator.score(X, y) + assert result.fetch() == expected + + clf = ParallelPostFit(LinearRegression()) + clf.fit(X, y) + with pytest.raises( + AttributeError, match="The wrapped estimator (.|\n)* 'predict_proba' method." 
+ ): + clf.predict_proba(X) + + +def test_parallel_post_fit_predict(setup): + raw_x, raw_y = make_classification(n_samples=1000) + X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100) + base = LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs") + wrap = ParallelPostFit(LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")) + + base.fit(X, y) + wrap.fit(X, y) + + result = wrap.predict(X) + expected = base.predict(X) + np.testing.assert_allclose(result, expected) + + result = wrap.predict_proba(X) + expected = base.predict_proba(X) + np.testing.assert_allclose(result, expected) + + result = wrap.predict_log_proba(X) + expected = base.predict_log_proba(X) + np.testing.assert_allclose(result, expected) + + +def test_parallel_post_fit_transform(setup): + raw_x, raw_y = make_classification(n_samples=1000) + X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100) + base = PCA(random_state=0) + wrap = ParallelPostFit(PCA(random_state=0)) + + base.fit(raw_x, raw_y) + wrap.fit(X, y) + + result = base.transform(X) + expected = wrap.transform(X) + np.testing.assert_allclose(result, expected, atol=0.1) + + +def test_parallel_post_fit_multiclass(setup): + raw_x, raw_y = make_classification(n_samples=1000) + X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100) + raw_x, raw_y = make_classification(n_classes=3, n_informative=4) + X, y = mt.tensor(raw_x, chunk_size=50), mt.tensor(raw_y, chunk_size=50) + + clf = ParallelPostFit( + LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs", multi_class="auto") + ) + + clf.fit(X, y) + result = clf.predict(X) + expected = clf.estimator.predict(X) + + np.testing.assert_allclose(result, expected) + + result = clf.predict_proba(X) + expected = clf.estimator.predict_proba(X) + + np.testing.assert_allclose(result, expected) + + result = clf.predict_log_proba(X) + expected = clf.estimator.predict_log_proba(X) + + np.testing.assert_allclose(result, expected) diff --git a/python/xorbits/_mars/learn/utils/__init__.py b/python/xorbits/_mars/learn/utils/__init__.py new file mode 100644 index 000000000..107fc1d3a --- /dev/null +++ b/python/xorbits/_mars/learn/utils/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
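+ +# Re-export the commonly used helpers of the learn.utils package; gen_batches +# comes directly from scikit-learn, the remaining utilities are local modules.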
+ +# noinspection PyUnresolvedReferences +from sklearn.utils import gen_batches + +from .collect_ports import collect_ports +from .core import ( + concat_chunks, + convert_to_tensor_or_dataframe, + copy_learned_attributes, + get_chunk_n_rows, +) +from .shuffle import shuffle +from .validation import ( + assert_all_finite, + check_array, + check_consistent_length, + check_X_y, + column_or_1d, +) diff --git a/python/xorbits/_mars/learn/utils/_cython_blas.pxd b/python/xorbits/_mars/learn/utils/_cython_blas.pxd new file mode 100644 index 000000000..3667d2889 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/_cython_blas.pxd @@ -0,0 +1,41 @@ +from cython cimport floating + + +cpdef enum BLAS_Order: + RowMajor # C contiguous + ColMajor # Fortran contiguous + + +cpdef enum BLAS_Trans: + NoTrans = 110 # correspond to 'n' + Trans = 116 # correspond to 't' + + +# BLAS Level 1 ################################################################ +cdef floating _dot(int, floating*, int, floating*, int) nogil + +cdef floating _asum(int, floating*, int) nogil + +cdef void _axpy(int, floating, floating*, int, floating*, int) nogil + +cdef floating _nrm2(int, floating*, int) nogil + +cdef void _copy(int, floating*, int, floating*, int) nogil + +cdef void _scal(int, floating, floating*, int) nogil + +cdef void _rotg(floating*, floating*, floating*, floating*) nogil + +cdef void _rot(int, floating*, int, floating*, int, floating, floating) nogil + +# BLAS Level 2 ################################################################ +cdef void _gemv(BLAS_Order, BLAS_Trans, int, int, floating, floating*, int, + floating*, int, floating, floating*, int) nogil + +cdef void _ger(BLAS_Order, int, int, floating, floating*, int, floating*, int, + floating*, int) nogil + +# BLASLevel 3 ################################################################ +cdef void _gemm(BLAS_Order, BLAS_Trans, BLAS_Trans, int, int, int, floating, + floating*, int, floating*, int, floating, floating*, + int) nogil diff --git a/python/xorbits/_mars/learn/utils/_cython_blas.pyx b/python/xorbits/_mars/learn/utils/_cython_blas.pyx new file mode 100644 index 000000000..c15e66ee0 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/_cython_blas.pyx @@ -0,0 +1,226 @@ +from cython cimport floating + +from scipy.linalg.cython_blas cimport sdot, ddot +from scipy.linalg.cython_blas cimport sasum, dasum +from scipy.linalg.cython_blas cimport saxpy, daxpy +from scipy.linalg.cython_blas cimport snrm2, dnrm2 +from scipy.linalg.cython_blas cimport scopy, dcopy +from scipy.linalg.cython_blas cimport sscal, dscal +from scipy.linalg.cython_blas cimport srotg, drotg +from scipy.linalg.cython_blas cimport srot, drot +from scipy.linalg.cython_blas cimport sgemv, dgemv +from scipy.linalg.cython_blas cimport sger, dger +from scipy.linalg.cython_blas cimport sgemm, dgemm + + +################ +# BLAS Level 1 # +################ + +cdef floating _dot(int n, floating *x, int incx, + floating *y, int incy) nogil: + """x.T.y""" + if floating is float: + return sdot(&n, x, &incx, y, &incy) + else: + return ddot(&n, x, &incx, y, &incy) + + +cpdef _dot_memview(floating[::1] x, floating[::1] y): + return _dot(x.shape[0], &x[0], 1, &y[0], 1) + + +cdef floating _asum(int n, floating *x, int incx) nogil: + """sum(|x_i|)""" + if floating is float: + return sasum(&n, x, &incx) + else: + return dasum(&n, x, &incx) + + +cpdef _asum_memview(floating[::1] x): + return _asum(x.shape[0], &x[0], 1) + + +cdef void _axpy(int n, floating alpha, floating *x, int incx, + floating *y, int incy) 
nogil: + """y := alpha * x + y""" + if floating is float: + saxpy(&n, &alpha, x, &incx, y, &incy) + else: + daxpy(&n, &alpha, x, &incx, y, &incy) + + +cpdef _axpy_memview(floating alpha, floating[::1] x, floating[::1] y): + _axpy(x.shape[0], alpha, &x[0], 1, &y[0], 1) + + +cdef floating _nrm2(int n, floating *x, int incx) nogil: + """sqrt(sum((x_i)^2))""" + if floating is float: + return snrm2(&n, x, &incx) + else: + return dnrm2(&n, x, &incx) + + +cpdef _nrm2_memview(floating[::1] x): + return _nrm2(x.shape[0], &x[0], 1) + + +cdef void _copy(int n, floating *x, int incx, floating *y, int incy) nogil: + """y := x""" + if floating is float: + scopy(&n, x, &incx, y, &incy) + else: + dcopy(&n, x, &incx, y, &incy) + + +cpdef _copy_memview(floating[::1] x, floating[::1] y): + _copy(x.shape[0], &x[0], 1, &y[0], 1) + + +cdef void _scal(int n, floating alpha, floating *x, int incx) nogil: + """x := alpha * x""" + if floating is float: + sscal(&n, &alpha, x, &incx) + else: + dscal(&n, &alpha, x, &incx) + + +cpdef _scal_memview(floating alpha, floating[::1] x): + _scal(x.shape[0], alpha, &x[0], 1) + + +cdef void _rotg(floating *a, floating *b, floating *c, floating *s) nogil: + """Generate plane rotation""" + if floating is float: + srotg(a, b, c, s) + else: + drotg(a, b, c, s) + + +cpdef _rotg_memview(floating a, floating b, floating c, floating s): + _rotg(&a, &b, &c, &s) + return a, b, c, s + + +cdef void _rot(int n, floating *x, int incx, floating *y, int incy, + floating c, floating s) nogil: + """Apply plane rotation""" + if floating is float: + srot(&n, x, &incx, y, &incy, &c, &s) + else: + drot(&n, x, &incx, y, &incy, &c, &s) + + +cpdef _rot_memview(floating[::1] x, floating[::1] y, floating c, floating s): + _rot(x.shape[0], &x[0], 1, &y[0], 1, c, s) + + +################ +# BLAS Level 2 # +################ + +cdef void _gemv(BLAS_Order order, BLAS_Trans ta, int m, int n, floating alpha, + floating *A, int lda, floating *x, int incx, + floating beta, floating *y, int incy) nogil: + """y := alpha * op(A).x + beta * y""" + cdef char ta_ = ta + if order == RowMajor: + ta_ = NoTrans if ta == Trans else Trans + if floating is float: + sgemv(&ta_, &n, &m, &alpha, A, &lda, x, &incx, &beta, y, &incy) + else: + dgemv(&ta_, &n, &m, &alpha, A, &lda, x, &incx, &beta, y, &incy) + else: + if floating is float: + sgemv(&ta_, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy) + else: + dgemv(&ta_, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy) + + +cpdef _gemv_memview(BLAS_Trans ta, floating alpha, floating[:, :] A, + floating[::1] x, floating beta, floating[::1] y): + cdef: + int m = A.shape[0] + int n = A.shape[1] + BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor + int lda = m if order == ColMajor else n + + _gemv(order, ta, m, n, alpha, &A[0, 0], lda, &x[0], 1, beta, &y[0], 1) + + +cdef void _ger(BLAS_Order order, int m, int n, floating alpha, floating *x, + int incx, floating *y, int incy, floating *A, int lda) nogil: + """A := alpha * x.y.T + A""" + if order == RowMajor: + if floating is float: + sger(&n, &m, &alpha, y, &incy, x, &incx, A, &lda) + else: + dger(&n, &m, &alpha, y, &incy, x, &incx, A, &lda) + else: + if floating is float: + sger(&m, &n, &alpha, x, &incx, y, &incy, A, &lda) + else: + dger(&m, &n, &alpha, x, &incx, y, &incy, A, &lda) + + +cpdef _ger_memview(floating alpha, floating[::1] x, floating[::] y, + floating[:, :] A): + cdef: + int m = A.shape[0] + int n = A.shape[1] + BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor + int lda = 
m if order == ColMajor else n + + _ger(order, m, n, alpha, &x[0], 1, &y[0], 1, &A[0, 0], lda) + + +################ +# BLAS Level 3 # +################ + +cdef void _gemm(BLAS_Order order, BLAS_Trans ta, BLAS_Trans tb, int m, int n, + int k, floating alpha, floating *A, int lda, floating *B, + int ldb, floating beta, floating *C, int ldc) nogil: + """C := alpha * op(A).op(B) + beta * C""" + cdef: + char ta_ = ta + char tb_ = tb + if order == RowMajor: + if floating is float: + sgemm(&tb_, &ta_, &n, &m, &k, &alpha, B, + &ldb, A, &lda, &beta, C, &ldc) + else: + dgemm(&tb_, &ta_, &n, &m, &k, &alpha, B, + &ldb, A, &lda, &beta, C, &ldc) + else: + if floating is float: + sgemm(&ta_, &tb_, &m, &n, &k, &alpha, A, + &lda, B, &ldb, &beta, C, &ldc) + else: + dgemm(&ta_, &tb_, &m, &n, &k, &alpha, A, + &lda, B, &ldb, &beta, C, &ldc) + + +cpdef _gemm_memview(BLAS_Trans ta, BLAS_Trans tb, floating alpha, + floating[:, :] A, floating[:, :] B, floating beta, + floating[:, :] C): + cdef: + int m = A.shape[0] if ta == NoTrans else A.shape[1] + int n = B.shape[1] if tb == NoTrans else B.shape[0] + int k = A.shape[1] if ta == NoTrans else A.shape[0] + int lda, ldb, ldc + BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor + + if order == RowMajor: + lda = k if ta == NoTrans else m + ldb = n if tb == NoTrans else k + ldc = n + else: + lda = m if ta == NoTrans else k + ldb = k if tb == NoTrans else n + ldc = m + + _gemm(order, ta, tb, m, n, k, alpha, &A[0, 0], + lda, &B[0, 0], ldb, beta, &C[0, 0], ldc) diff --git a/python/xorbits/_mars/learn/utils/_encode.py b/python/xorbits/_mars/learn/utils/_encode.py new file mode 100644 index 000000000..da5e4c417 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/_encode.py @@ -0,0 +1,300 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import NamedTuple + +import numpy as np + +from ... import dataframe as md +from ... import tensor as mt +from .core import is_scalar_nan + + +def _unique(values, *, return_inverse=False): + """Helper function to find unique values with support for python objects. + + Uses pure python method for object dtype, and numpy method for + all other dtypes. + + Parameters + ---------- + values : ndarray + Values to check for unknowns. + + return_inverse : bool, default=False + If True, also return the indices of the unique values. + + Returns + ------- + unique : ndarray + The sorted unique values. + + unique_inverse : ndarray + The indices to reconstruct the original array from the unique array. + Only provided if `return_inverse` is True. 
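+ + Notes + ----- + Duplicate missing values are collapsed so that at most one NaN is kept at + the end of `uniques`; when `return_inverse` is True the inverse indices + are remapped accordingly.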
+ """ + if values.dtype == object: + series_unique = md.Series(md.Series(values).unique()).sort_values().values + if return_inverse: + return series_unique, _map_to_integer(values, series_unique) + else: + return series_unique + + out = mt.unique(values, return_inverse=return_inverse) + + if return_inverse: + uniques, inverse = out + else: + uniques = out + + # np.unique will have duplicate missing values at the end of `uniques` + # here we clip the nans and remove it from uniques + uniques = uniques.rechunk(tuple((s,) for idx, s in enumerate(uniques.shape))) + nan_idx = mt.searchsorted(uniques, mt.nan) + uniques = uniques.map_chunk( + lambda c, idx: c[: idx + 1], + args=(nan_idx,), + dtype=uniques.dtype, + shape=(np.nan,) * uniques.ndim, + ) + if return_inverse: + + def inv_mapper(c, idx): + if c.flags.writeable: + c[c > idx] = idx + else: # pragma: no cover + # If c is got from the shared memory, it is immutable. + c = np.select([c <= idx], [c], idx) + return c + + inverse = inverse.map_chunk( + inv_mapper, + args=(nan_idx,), + dtype=inverse.dtype, + shape=((np.nan,),) * inverse.ndim, + ) + + return uniques, inverse + return uniques + + +class MissingValues(NamedTuple): # pragma: no cover + """Data class for missing data information""" + + nan: bool + none: bool + + def to_list(self): + """Convert tuple to a list where None is always first.""" + output = [] + if self.none: + output.append(None) + if self.nan: + output.append(np.nan) + return output + + +def _extract_missing(values): # pragma: no cover + """Extract missing values from `values`. + + Parameters + ---------- + values: set + Set of values to extract missing from. + + Returns + ------- + output: set + Set with missing values extracted. + + missing_values: MissingValues + Object with missing value information. 
+ """ + missing_values_set = { + value for value in values if value is None or is_scalar_nan(value) + } + + if not missing_values_set: + return values, MissingValues(nan=False, none=False) + + if None in missing_values_set: + if len(missing_values_set) == 1: + output_missing_values = MissingValues(nan=False, none=True) + else: + # If there is more than one missing value, then it has to be + # float('nan') or np.nan + output_missing_values = MissingValues(nan=True, none=True) + else: + output_missing_values = MissingValues(nan=True, none=False) + + # create set without the missing values + output = values - missing_values_set + return output, output_missing_values + + +class _nandict(dict): # pragma: no cover + """Dictionary with support for nans.""" + + def __init__(self, mapping): + super().__init__(mapping) + for key, value in mapping.items(): + if is_scalar_nan(key): + self.nan_value = value + break + + def __missing__(self, key): + if hasattr(self, "nan_value") and is_scalar_nan(key): + return self.nan_value + raise KeyError(key) + + +def _map_to_integer(values, uniques, check_unknown=True): + """Map values based on its position in uniques.""" + + def mapper(values_data, uniques_data): + if values_data.dtype.kind in "OUS": + try: + table = _nandict({val: i for i, val in enumerate(uniques_data)}) + return np.array([table[v] for v in values_data]) + except KeyError as e: + raise ValueError(f"y contains previously unseen labels: {str(e)}") + else: + if check_unknown: + diff = _check_unknown(values_data, uniques_data) + if diff: + raise ValueError( + f"y contains previously unseen labels: {str(diff)}" + ) + return np.searchsorted(uniques_data, values_data) + + return values.map_chunk( + mapper, args=(uniques,), dtype=np.dtype(np.int64), shape=values.shape + ) + + +def _check_unknown(values, known_values, return_mask=False): # pragma: no cover + """ + Helper function to check for unknowns in values to be encoded. + + Uses pure python method for object dtype, and numpy method for + all other dtypes. + + Parameters + ---------- + values : array + Values to check for unknowns. + known_values : array + Known values. Must be unique. + return_mask : bool, default=False + If True, return a mask of the same shape as `values` indicating + the valid values. + + Returns + ------- + diff : list + The unique values present in `values` and not in `know_values`. + valid_mask : boolean array + Additionally returned if ``return_mask=True``. 
+ + """ + valid_mask = None + + if values.dtype.kind in "OUS": + values_set = set(values) + values_set, missing_in_values = _extract_missing(values_set) + + uniques_set = set(known_values) + uniques_set, missing_in_uniques = _extract_missing(uniques_set) + diff = values_set - uniques_set + + nan_in_diff = missing_in_values.nan and not missing_in_uniques.nan + none_in_diff = missing_in_values.none and not missing_in_uniques.none + + def is_valid(value): + return ( + value in uniques_set + or missing_in_uniques.none + and value is None + or missing_in_uniques.nan + and is_scalar_nan(value) + ) + + if return_mask: + if diff or nan_in_diff or none_in_diff: + valid_mask = np.array([is_valid(value) for value in values]) + else: + valid_mask = np.ones(len(values), dtype=bool) + + diff = list(diff) + if none_in_diff: + diff.append(None) + if nan_in_diff: + diff.append(np.nan) + else: + unique_values = np.unique(values) + diff = np.setdiff1d(unique_values, known_values, assume_unique=True) + if return_mask: + if diff.size: + valid_mask = np.in1d(values, known_values) + else: + valid_mask = np.ones(len(values), dtype=bool) + + # check for nans in the known_values + if np.isnan(known_values).any(): + diff_is_nan = np.isnan(diff) + if diff_is_nan.any(): + # removes nan from valid_mask + if diff.size and return_mask: + is_nan = np.isnan(values) + valid_mask[is_nan] = 1 + + # remove nan from diff + diff = diff[~diff_is_nan] + diff = list(diff) + + if return_mask: + return diff, valid_mask + return diff + + +def _encode(values, *, uniques, check_unknown=True): + """Helper function to encode values into [0, n_uniques - 1]. + + Uses pure python method for object dtype, and numpy method for + all other dtypes. + The numpy method has the limitation that the `uniques` need to + be sorted. Importantly, this is not checked but assumed to already be + the case. The calling method needs to ensure this for all non-object + values. + + Parameters + ---------- + values : tensor + Values to encode. + uniques : tensor + The unique values in `values`. If the dtype is not object, then + `uniques` needs to be sorted. + check_unknown : bool, default=True + If True, check for values in `values` that are not in `unique` + and raise an error. This is ignored for object dtype, and treated as + True in this case. This parameter is useful for + _BaseEncoder._transform() to avoid calling _check_unknown() + twice. + + Returns + ------- + encoded : tensor + Encoded values + """ + return _map_to_integer(values, uniques, check_unknown=check_unknown) diff --git a/python/xorbits/_mars/learn/utils/checks.py b/python/xorbits/_mars/learn/utils/checks.py new file mode 100644 index 000000000..bd436a35e --- /dev/null +++ b/python/xorbits/_mars/learn/utils/checks.py @@ -0,0 +1,474 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + from sklearn import get_config as get_sklearn_config +except ImportError: # pragma: no cover + get_sklearn_config = None + +from ... 
import opcodes as OperandDef +from ... import tensor as mt +from ...config import options +from ...core import ENTITY_TYPE, get_output_types, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + BoolField, + DataTypeField, + KeyField, + StringField, +) +from ...tensor.array_utils import as_same_device, device, get_array_module, issparse +from ...tensor.core import TENSOR_CHUNK_TYPE, TensorOrder +from ...utils import ceildiv +from ..operands import LearnOperand, LearnOperandMixin, OutputType + + +class CheckBase(LearnOperand, LearnOperandMixin): + _input = KeyField("input") + _value = KeyField("value") + _err_msg = StringField("err_msg") + + def __init__(self, input=None, value=None, err_msg=None, output_types=None, **kw): + super().__init__( + _input=input, + _value=value, + _err_msg=err_msg, + _output_types=output_types, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def value(self): + return self._value + + @property + def err_msg(self): + return self._err_msg + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._input is not None: + self._input = self._inputs[0] + if self._value is not None: + self._value = self._inputs[-1] + + def __call__(self, x, value=None): + # output input if value not specified + self._value = value = value if value is not None else x + self.output_types = get_output_types(value) + self.stage = OperandStage.agg + return self.new_tileable([x, value], kws=[value.params]) + + @classmethod + def tile(cls, op): + combine_size = options.combine_size + x, value = op.input, op.value + check_chunks = [] + for i, chunk in enumerate(x.chunks): + chunk_op = cls( + err_msg=op.err_msg, + stage=OperandStage.map, + output_types=[OutputType.tensor], + ) + check_chunk = chunk_op.new_chunk( + [chunk], + shape=(), + index=(i,), + dtype=np.dtype(bool), + order=TensorOrder.C_ORDER, + ) + check_chunks.append(check_chunk) + + while len(check_chunks) > 1: + prev_check_chunks = check_chunks + check_chunks = [] + chunk_size = ceildiv(len(prev_check_chunks), combine_size) + for i in range(chunk_size): + chunks = prev_check_chunks[i * combine_size : (i + 1) * combine_size] + chunk_op = cls( + err_msg=op.err_msg, + stage=OperandStage.combine, + output_types=[OutputType.tensor], + ) + check_chunk = chunk_op.new_chunk( + chunks, + shape=(), + index=(i,), + dtype=np.dtype(bool), + order=TensorOrder.C_ORDER, + ) + check_chunks.append(check_chunk) + + check_chunk = check_chunks[0] + out_chunks = [] + for val_chunk in value.chunks: + chunk_op = cls( + value=val_chunk, + err_msg=op.err_msg, + stage=OperandStage.agg, + output_types=op.output_types, + ) + out_chunk = chunk_op.new_chunk( + [check_chunk, val_chunk], kws=[val_chunk.params] + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + kw = op.outputs[0].params + kw["chunks"] = out_chunks + kw["nsplits"] = value.nsplits + return new_op.new_tileables(op.inputs, kws=[kw]) + + +class CheckNonNegative(CheckBase): + _op_type_ = OperandDef.CHECK_NON_NEGATIVE + + _whom = StringField("whom") + + def __init__( + self, + input=None, + value=None, + whom=None, + err_msg=None, + stage=None, + gpu=None, + output_types=None, + **kw, + ): + super().__init__( + input=input, + value=value, + _whom=whom, + err_msg=err_msg, + stage=stage, + output_types=output_types, + gpu=gpu, + **kw, + ) + if self._err_msg is None and self._whom is not None: + self._err_msg = f"Negative values in data passed to {self._whom}" + + @property + def whom(self): + return 
self._whom + + @classmethod + def _execute_tensor(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if issparse(x) and x.nnz == 0: + x_min = 0 + else: + x_min = xp.min(x) + + if x_min < 0: + raise ValueError(op.err_msg) + + ctx[op.outputs[0].key] = np.array(True) + + @classmethod + def _execute_df(cls, ctx, op): + x = ctx[op.inputs[0].key] + x_min = x.min().min() + if x_min < 0: + raise ValueError(op.err_msg) + + ctx[op.outputs[0].key] = np.array(True) + + @classmethod + def _execute_map(cls, ctx, op): + if isinstance(op.inputs[0], TENSOR_CHUNK_TYPE): + return cls._execute_tensor(ctx, op) + else: + return cls._execute_df(ctx, op) + + @classmethod + def _execute_combine(cls, ctx, op): + # just pass value cuz all inputs executed successfully + ctx[op.outputs[0].key] = np.array(True) + + @classmethod + def _execute_agg(cls, ctx, op): + ctx[op.outputs[0].key] = ctx[op.value.key] + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + return cls._execute_combine(ctx, op) + else: + assert op.stage == OperandStage.agg + return cls._execute_agg(ctx, op) + + +def check_non_negative_then_return_value(to_check, value, whom): + op = CheckNonNegative(input=to_check, value=value, whom=whom) + return op(to_check, value) + + +class AssertAllFinite(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.ASSERT_ALL_FINITE + + _x = KeyField("x") + _allow_nan = BoolField("allow_nan") + _msg_dtype = DataTypeField("msg_dtype") + _check_only = BoolField("check_only") + # chunks + _is_finite = KeyField("is_finite") + _check_nan = KeyField("check_nan") + + def __init__( + self, + x=None, + allow_nan=None, + msg_dtype=None, + check_only=None, + is_finite=None, + check_nan=None, + output_types=None, + **kw, + ): + super().__init__( + _x=x, + _allow_nan=allow_nan, + _msg_dtype=msg_dtype, + _check_only=check_only, + _is_finite=is_finite, + _check_nan=check_nan, + _output_types=output_types, + **kw, + ) + + @property + def x(self): + return self._x + + @property + def allow_nan(self): + return self._allow_nan + + @property + def msg_dtype(self): + return self._msg_dtype + + @property + def check_only(self): + return self._check_only + + @property + def is_finite(self): + return self._is_finite + + @property + def check_nan(self): + return self._check_nan + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + for attr in ("_x", "_is_finite", "_check_nan"): + if getattr(self, attr) is not None: + setattr(self, attr, next(inputs_iter)) + + @classmethod + def _assume_finite(cls): + assume_finite = options.learn.assume_finite + if assume_finite is None and get_sklearn_config is not None: + # get config from scikit-learn + assume_finite = get_sklearn_config()["assume_finite"] + if assume_finite is None: # pragma: no cover + assume_finite = False + + return assume_finite + + def __call__(self, x): + if self._assume_finite(): + # skip check + if self._check_only: + return + else: + return x + + if self._check_only: + return self.new_tileable( + [x], dtype=np.dtype(bool), shape=(), order=TensorOrder.C_ORDER + ) + else: + return self.new_tileable([x], kws=[x.params]) + + @classmethod + def tile(cls, op): + from .extmath import _safe_accumulator_op + + x = op.x + out = op.outputs[0] + is_float = x.dtype.kind in "fc" + combine_size = options.combine_size + + 
is_finite_chunk = check_nan_chunk = None + if is_float: + is_finite_chunk = ( + yield from recursive_tile(mt.isfinite(_safe_accumulator_op(mt.sum, x))) + ).chunks[0] + elif x.dtype == np.dtype(object) and not op.allow_nan: + check_nan_chunk = (yield from recursive_tile((x != x).any())).chunks[0] + + map_chunks = [] + for c in x.chunks: + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + chunk_op._is_finite = is_finite_chunk + chunk_op._check_nan = check_nan_chunk + chunk_inputs = [c] + if is_finite_chunk is not None: + chunk_inputs.append(is_finite_chunk) + if check_nan_chunk is not None: + chunk_inputs.append(check_nan_chunk) + chunk_params = c.params + if op.check_only: + chunk_params["dtype"] = np.dtype(bool) + chunk_params["shape"] = () + if len(x.chunks) == 1: + chunk_params["index"] = () + map_chunk = chunk_op.new_chunk(chunk_inputs, kws=[chunk_params]) + map_chunks.append(map_chunk) + + new_op = op.copy() + if not op.check_only: + params = out.params + params["nsplits"] = x.nsplits + params["chunks"] = map_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + out_chunks = map_chunks + # if check only, we use tree reduction to aggregate to one chunk + while len(out_chunks) > 1: + size = ceildiv(len(out_chunks), combine_size) + new_out_chunks = [] + for i in range(size): + chunk_op = AssertAllFinite( + check_only=True, + output_types=op.output_types, + stage=OperandStage.combine if size > 1 else OperandStage.agg, + ) + chunk_index = (i,) if size > 1 else () + out_chunk = chunk_op.new_chunk( + out_chunks[combine_size * i : combine_size * (i + 1)], + dtype=out.dtype, + shape=(), + index=chunk_index, + order=out.order, + ) + new_out_chunks.append(out_chunk) + out_chunks = new_out_chunks + + params = out.params + params["nsplits"] = () + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _execute_map(cls, ctx, op): + allow_nan = op.allow_nan + msg_dtype = op.msg_dtype + raw = x = ctx[op.x.key] + xp = get_array_module(x, nosparse=True) + + if issparse(x): + x = x.data + # First try an O(n) time, O(1) space solution for the common case that + # everything is finite; fall back to O(n) space np.isfinite to prevent + # false positives from overflow in sum method. The sum is also calculated + # safely to reduce dtype induced overflows. + is_float = x.dtype.kind in "fc" + if is_float and ctx[op.is_finite.key]: + pass + elif is_float: + msg_err = "Input contains {} or a value too large for {!r}." 
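+ # the pre-computed global finiteness flag is False here, so scan the chunk + # explicitly to tell NaN from infinity apart and raise a precise error message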
+ if ( + allow_nan + and xp.isinf(x).any() + or not allow_nan + and not xp.isfinite(x).all() + ): + type_err = "infinity" if allow_nan else "NaN, infinity" + raise ValueError( + msg_err.format( + type_err, msg_dtype if msg_dtype is not None else x.dtype + ) + ) + # for object dtype data, we only check for NaNs + elif x.dtype == np.dtype(object) and not allow_nan: + if ctx[op.check_nan.key]: + raise ValueError("Input contains NaN") + + if op.check_only: + result = np.array(True) + else: + result = raw + ctx[op.outputs[0].key] = result + + @classmethod + def _execute_combine_reduce(cls, ctx, op): + # just return True + ctx[op.outputs[0].key] = np.array(True) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + else: + assert op.stage in (OperandStage.combine, OperandStage.agg) + return cls._execute_combine_reduce(ctx, op) + + +def assert_all_finite(X, allow_nan=False, msg_dtype=None, check_only=True): + if not isinstance(X, ENTITY_TYPE): + X = mt.asarray(X) + + if ( + isinstance(X.op, AssertAllFinite) + and X.op.allow_nan == allow_nan + and X.op.msg_dtype == msg_dtype + and X.op.check_only == check_only + ): + return X + + if check_only: + output_types = [OutputType.tensor] + sparse = False + else: + output_types = get_output_types(X) + sparse = X.issparse() + + op = AssertAllFinite( + x=X, + allow_nan=allow_nan, + msg_dtype=msg_dtype, + check_only=check_only, + sparse=sparse, + output_types=output_types, + ) + return op(X) diff --git a/python/xorbits/_mars/learn/utils/collect_ports.py b/python/xorbits/_mars/learn/utils/collect_ports.py new file mode 100644 index 000000000..0bdfa959f --- /dev/null +++ b/python/xorbits/_mars/learn/utils/collect_ports.py @@ -0,0 +1,111 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import socket + +import numpy as np + +from ... 
import opcodes +from ...core.operand import OperandStage +from ...serialization.serializables import ( + FieldTypes, + Int32Field, + ListField, + StringField, +) +from ...tensor.merge import TensorConcatenate +from ...utils import get_next_port +from ..operands import LearnOperand, LearnOperandMixin, OutputType + + +class CollectPorts(LearnOperand, LearnOperandMixin): + _op_code_ = opcodes.COLLECT_PORTS + + _socket_type = Int32Field("socket_type") + _index = Int32Field("index") + _workers = ListField("workers", FieldTypes.string) + _tileable_key = StringField("tileable_key") + + def __init__( + self, workers=None, socket_type=None, tileable_key=None, index=None, **kw + ): + super().__init__( + _socket_type=socket_type, + _workers=workers, + _tileable_key=tileable_key, + _index=index, + _pure_depends=[True], + **kw + ) + + @property + def socket_type(self): + return self._socket_type + + @property + def workers(self): + return self._workers + + @property + def tileable_key(self): + return self._tileable_key + + def __call__(self, dep=None): + self._output_types = [OutputType.tensor] + if dep: + deps = [dep] + else: + deps = None + return self.new_tileable(deps, shape=(len(self.workers),), dtype=np.dtype(int)) + + @classmethod + def tile(cls, op: "CollectPorts"): + chunks = [] + if op.inputs: + chunk_iter = itertools.cycle(op.inputs[0].chunks) + else: + chunk_iter = itertools.repeat(None) + for idx, (worker, inp) in enumerate(zip(op.workers, chunk_iter)): + new_op = op.copy().reset_key() + new_op._workers = [worker] + new_op.expect_worker = worker + new_op.stage = OperandStage.map + new_op._tileable_key = op.outputs[0].key + new_op._index = idx + new_op._pure_depends = [True] + inps = [inp] if inp else None + chunks.append( + new_op.new_chunk(inps, index=(idx,), shape=(1,), dtype=np.dtype(int)) + ) + + concat_op = TensorConcatenate(axis=0, dtype=chunks[0].dtype) + concat_chunk = concat_op.new_chunk(chunks, shape=(len(op.workers),), index=(0,)) + + new_op = op.copy().reset_key() + params = op.outputs[0].params + params.update(dict(chunks=[concat_chunk], nsplits=((len(op.workers),),))) + return new_op.new_tileables(op.inputs, **params) + + @classmethod + def execute(cls, ctx, op): + assert ctx.band[0] == op.expect_worker + socket_type = op.socket_type or socket.SOCK_STREAM + port_num = get_next_port(socket_type, occupy=False) + ctx[op.outputs[0].key] = np.array([port_num], dtype=int) + + +def collect_ports(workers, input_tileable=None): + op = CollectPorts(workers=workers) + return op(input_tileable) diff --git a/python/xorbits/_mars/learn/utils/core.py b/python/xorbits/_mars/learn/utils/core.py new file mode 100644 index 000000000..e12c0b3aa --- /dev/null +++ b/python/xorbits/_mars/learn/utils/core.py @@ -0,0 +1,155 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +import numbers +import warnings +from typing import List + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator + +try: + from sklearn import get_config as sklearn_get_config +except ImportError: # pragma: no cover + sklearn_get_config = None + +from ... import options +from ...core import enter_mode +from ...dataframe import DataFrame, Series +from ...dataframe.core import DATAFRAME_TYPE, SERIES_TYPE +from ...tensor import tensor as astensor +from ...typing import TileableType +from ...utils import parse_readable_size + + +def convert_to_tensor_or_dataframe(item): + if isinstance(item, (DATAFRAME_TYPE, pd.DataFrame)): + item = DataFrame(item) + elif isinstance(item, (SERIES_TYPE, pd.Series)): + item = Series(item) + else: + item = astensor(item) + return item + + +def concat_chunks(chunks): + tileable = chunks[0].op.create_tileable_from_chunks(chunks) + return tileable.op.concat_tileable_chunks(tileable).chunks[0] + + +def copy_learned_attributes(from_estimator: BaseEstimator, to_estimator: BaseEstimator): + attrs = { + k: v + for k, v in vars(from_estimator).items() + if k.endswith("_") or k.startswith("_") + } + for k, v in attrs.items(): + setattr(to_estimator, k, v) + + +def is_scalar_nan(x): + """Tests if x is NaN. + + This function is meant to overcome the issue that np.isnan does not allow + non-numerical types as input, and that np.nan is not float('nan'). + + Parameters + ---------- + x : any type + + Returns + ------- + boolean + + Examples + -------- + >>> is_scalar_nan(np.nan) + True + >>> is_scalar_nan(float("nan")) + True + >>> is_scalar_nan(None) + False + >>> is_scalar_nan("") + False + >>> is_scalar_nan([np.nan]) + False + """ + return isinstance(x, numbers.Real) and math.isnan(x) + + +def get_chunk_n_rows(row_bytes, max_n_rows=None, working_memory=None): + """Calculates how many rows can be processed within working_memory + + Parameters + ---------- + row_bytes : int + The expected number of bytes of memory that will be consumed + during the processing of each row. + max_n_rows : int, optional + The maximum return value. + working_memory : int or float, optional + The number of rows to fit inside this number of MiB will be returned. + When None (default), the value of + ``sklearn.get_config()['working_memory']`` is used. + + Returns + ------- + int or the value of n_samples + + Warns + ----- + Issues a UserWarning if ``row_bytes`` exceeds ``working_memory`` MiB. + """ + + if working_memory is None: # pragma: no cover + working_memory = options.learn.working_memory + if working_memory is None and sklearn_get_config is not None: + working_memory = sklearn_get_config()["working_memory"] + elif working_memory is None: + working_memory = 1024 + + if isinstance(working_memory, int): + working_memory *= 2**20 + else: + working_memory = parse_readable_size(working_memory)[0] + + chunk_n_rows = int(working_memory // row_bytes) + if max_n_rows is not None: + chunk_n_rows = min(chunk_n_rows, max_n_rows) + if chunk_n_rows < 1: # pragma: no cover + warnings.warn( + "Could not adhere to working_memory config. " + "Currently %.0fMiB, %.0fMiB required." 
+ % (working_memory, np.ceil(row_bytes * 2**-20)) + ) + chunk_n_rows = 1 + return chunk_n_rows + + +@enter_mode(build=True) +def sort_by( + tensors: List[TileableType], by: TileableType, ascending: bool = True +) -> List[TileableType]: + # sort tensors by another tensor + i_to_tensors = {i: t for i, t in enumerate(tensors)} + if by not in tensors: + by_name = len(i_to_tensors) + i_to_tensors[by_name] = by + else: + by_name = tensors.index(by) + df = DataFrame(i_to_tensors) + sorted_df = df.sort_values(by_name, ascending=ascending) + return [sorted_df[i].to_tensor() for i in range(len(tensors))] diff --git a/python/xorbits/_mars/learn/utils/extmath.py b/python/xorbits/_mars/learn/utils/extmath.py new file mode 100644 index 000000000..76c8d07fc --- /dev/null +++ b/python/xorbits/_mars/learn/utils/extmath.py @@ -0,0 +1,107 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import tensor as mt + + +# Use at least float64 for the accumulating functions to avoid precision issue +# see https://github.com/numpy/numpy/issues/9393. The float64 is also retained +# as it is in case the float overflows +def _safe_accumulator_op(op, x, *args, **kwargs): + """ + This function provides numpy accumulator functions with a float64 dtype + when used on a floating point input. This prevents accumulator overflow on + smaller floating point dtypes. + + Parameters + ---------- + op : function + A accumulator function such as np.mean or np.sum + x : numpy array + A tensor to apply the accumulator function + *args : positional arguments + Positional arguments passed to the accumulator function after the + input x + **kwargs : keyword arguments + Keyword arguments passed to the accumulator function + + Returns + ------- + result : The output of the accumulator function passed to this function + """ + if np.issubdtype(x.dtype, np.floating) and x.dtype.itemsize < 8: + result = op(x, *args, **kwargs, dtype=np.float64) + else: + result = op(x, *args, **kwargs) + return result + + +def row_norms(X, squared=False): + """Row-wise (squared) Euclidean norm of X. + + Performs no input validation. + + Parameters + ---------- + X : array_like + The input tensor + squared : bool, optional (default = False) + If True, return squared norms. + + Returns + ------- + array_like + The row-wise (squared) Euclidean norm of X. + """ + + norms = (X * X).sum(axis=1) + if not squared: + norms = mt.sqrt(norms) + return norms + + +def softmax(X, copy=True): + """ + Calculate the softmax function. + + The softmax function is calculated by + np.exp(X) / np.sum(np.exp(X), axis=1) + + This will cause overflow when large values are exponentiated. + Hence the largest value in each row is subtracted from each data + point to prevent this. + + Parameters + ---------- + X : array-like of float of shape (M, N) + Argument to the logistic function. + + copy : bool, default=True + Copy X or not. 
+ + Returns + ------- + out : ndarray of shape (M, N) + Softmax function evaluated at every point in x. + """ + if copy: + X = mt.copy(X) + max_prob = mt.max(X, axis=1).reshape((-1, 1)) + X = X - max_prob + X = mt.exp(X) + sum_prob = mt.sum(X, axis=1).reshape((-1, 1)) + X = X / sum_prob + return X diff --git a/python/xorbits/_mars/learn/utils/multiclass.py b/python/xorbits/_mars/learn/utils/multiclass.py new file mode 100644 index 000000000..854e37211 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/multiclass.py @@ -0,0 +1,465 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections.abc import Sequence +from typing import List + +import numpy as np +from scipy.sparse.base import spmatrix +from sklearn.utils.multiclass import is_multilabel as sklearn_is_multilabel +from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target + +from ... import opcodes as OperandDef +from ... import tensor as mt +from ...core import ENTITY_TYPE, TILEABLE_TYPE, recursive_tile +from ...core.context import get_context +from ...serialization.serializables import AnyField, ListField +from ...tensor.core import TENSOR_TYPE, TensorOrder +from ...typing import TileableType +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin, OutputType +from ..utils import assert_all_finite +from .validation import check_array + + +def _unique_multiclass(y): + if hasattr(y, "__array__") or hasattr(y, "__mars_tensor__"): + return mt.unique(mt.asarray(y)) + else: + return set(y) + + +def _unique_indicator(y): + return mt.arange(check_array(y, accept_sparse=True).shape[1]) + + +_FN_UNIQUE_LABELS = { + "binary": _unique_multiclass, + "multiclass": _unique_multiclass, + "multilabel-indicator": _unique_indicator, +} + + +class UniqueLabels(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.UNIQUE_LABELS + + ys = ListField("ys") + + def __call__(self, ys: List[TileableType]): + self._output_types = [OutputType.tensor] + inputs = [y for y in ys if isinstance(y, TILEABLE_TYPE)] + return self.new_tileable( + inputs, + shape=(np.nan,), + dtype=mt.tensor(ys[0]).dtype, + order=TensorOrder.C_ORDER, + ) + + @classmethod + def tile(cls, op: "UniqueLabels"): + ys = op.ys + ctx = get_context() + + target_types = yield from recursive_tile([type_of_target(x) for x in ys]) + # yield chunks of target_types for execution + chunks = list(itertools.chain(*(t.chunks for t in target_types))) + yield chunks + + ys_types = set( + [it.item() for it in ctx.get_chunks_result([c.key for c in chunks])] + ) + if ys_types == {"binary", "multiclass"}: + ys_types = {"multiclass"} + + if len(ys_types) > 1: + raise ValueError("Mix type of y not allowed, got types %s" % ys_types) + + label_type = ys_types.pop() + + # Check consistency for the indicator format + if label_type == "multilabel-indicator": + check_arrays = [] + chunks = [] + for y in ys: + arr = yield from recursive_tile(check_array(y, accept_sparse=True)) + 
check_arrays.append(arr) + chunks.extend(arr.chunks) + yield check_arrays + chunks + if len(set(arr.shape[1] for arr in check_arrays)) > 1: + raise ValueError( + "Multi-label binary indicator input with " + "different numbers of labels" + ) + + # Get the unique set of labels + _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) + if not _unique_labels: + raise ValueError("Unknown label type: %s" % repr(ys)) + + labels = [_unique_labels(y) for y in ys] + labels_chunks = [] + ys_labels = set() + for label in labels: + if isinstance(label, ENTITY_TYPE): + label = yield from recursive_tile(label) + labels_chunks.extend(label.chunks) + else: + ys_labels.update(label) + yield labels_chunks + ys_labels.update( + itertools.chain.from_iterable( + ctx.get_chunks_result([c.key for c in labels_chunks]) + ) + ) + + # Check that we don't mix string type with number type + if len(set(isinstance(label, str) for label in ys_labels)) > 1: + raise ValueError("Mix of label input types (string and number)") + + return (yield from recursive_tile(mt.array(sorted(ys_labels)))) + + +def unique_labels(*ys): + """ + Extract an ordered array of unique labels. + + We don't allow: + - mix of multilabel and multiclass (single label) targets + - mix of label indicator matrix and anything else, + because there are no explicit labels) + - mix of label indicator matrices of different sizes + - mix of string and integer labels + + At the moment, we also don't allow "multiclass-multioutput" input type. + + Parameters + ---------- + *ys : array-likes + + Returns + ------- + out : ndarray of shape (n_unique_labels,) + An ordered array of unique labels. + + Examples + -------- + >>> from mars.learn.utils.multiclass import unique_labels + >>> unique_labels([3, 5, 5, 5, 7, 7]).execute() + array([3, 5, 7]) + >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4]).execute() + array([1, 2, 3, 4]) + >>> unique_labels([1, 2, 10], [5, 11]).execute() + array([ 1, 2, 5, 10, 11]) + """ + if not ys: + raise ValueError("No argument has been passed.") + + ys = list(ys) + op = UniqueLabels(ys=ys) + return op(ys) + + +class IsMultilabel(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.IS_MULTILABEL + + y = AnyField("y") + + def __call__(self, y): + self._output_types = [OutputType.tensor] + inputs = [y] if isinstance(y, ENTITY_TYPE) else [] + return self.new_tileable( + inputs, shape=(), dtype=np.dtype(bool), order=TensorOrder.C_ORDER + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._inputs: + self.y = self._inputs[0] + + @classmethod + def _tile(cls, op: "IsMultilabel"): + y = op.y + + if not isinstance(y, ENTITY_TYPE): + return sklearn_is_multilabel(y) + + ctx = get_context() + + if has_unknown_shape(y): # pragma: no cover + yield + + if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1): + return False + + labels = yield from recursive_tile(mt.unique(y)) + yield labels.chunks + [labels] + + if len(labels) < 3: + if y.dtype.kind in "biu": + return True + if y.dtype.kind == "f": + is_integral_float = yield from recursive_tile( + mt.all(mt.equal(y.astype(int), y)) + ) + yield is_integral_float.chunks + is_integral_float = ctx.get_chunks_result( + [is_integral_float.chunks[0].key] + )[0] + if is_integral_float: + return True + + return False + + @classmethod + def tile(cls, op: "IsMultilabel"): + result = yield from cls._tile(op) + return (yield from recursive_tile(mt.array(result))) + + +def is_multilabel(y): + """ + Check if ``y`` is in a multilabel format. 
+ + Parameters + ---------- + y : numpy array of shape [n_samples] + Target values. + + Returns + ------- + out : bool, + Return ``True``, if ``y`` is in a multilabel format, else ```False``. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.utils.multiclass import is_multilabel + >>> is_multilabel([0, 1, 0, 1]).execute() + False + >>> is_multilabel([[1], [0, 2], []]).execute() + False + >>> is_multilabel(mt.array([[1, 0], [0, 0]])).execute() + True + >>> is_multilabel(mt.array([[1], [0], [0]])).execute() + False + >>> is_multilabel(mt.array([[1, 0, 0]])).execute() + True + """ + if not isinstance(y, ENTITY_TYPE): + if hasattr(y, "__array__") or isinstance(y, Sequence): + y = np.asarray(y) + yt = None + else: + yt = y = mt.tensor(y) + + op = IsMultilabel(y=y) + return op(yt) + + +class TypeOfTarget(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.TYPE_OF_TARGET + + y = AnyField("y") + + def __call__(self, y: TileableType): + self._output_types = [OutputType.tensor] + inputs = [y] if isinstance(y, ENTITY_TYPE) else [] + return self.new_tileable( + inputs, shape=(), order=TensorOrder.C_ORDER, dtype=np.dtype(object) + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._inputs: + self.y = self._inputs[0] + + @classmethod + def _tile(cls, op: "TypeOfTarget"): + y = op.y + + # y is ndarray + if not isinstance(y, ENTITY_TYPE): + return sklearn_type_of_target(y) + else: + # make sure y executed + yield + + ctx = get_context() + + multilabel = yield from recursive_tile(is_multilabel(y)) + yield multilabel.chunks + multilabel = ctx.get_chunks_result([multilabel.chunks[0].key])[0] + if multilabel: + return "multilabel-indicator" + + # Invalid inputs + if y.ndim > 2: + return "unknown" + if y.dtype == object and len(y): + # [[[1, 2]]] or [obj_1] and not ["label_1"] + first_val = ctx.get_chunks_result([y.chunks[0].key])[0].flat[0] + if not isinstance(first_val, str): + return "unknown" + + if y.ndim == 2 and y.shape[1] == 0: + return "unknown" # [[]] + + if y.ndim == 2 and y.shape[1] > 1: + suffix = "-multioutput" # [[1, 2], [1, 2]] + else: + suffix = "" # [1, 2, 3] or [[1], [2], [3]] + + if y.dtype.kind == "f": + # check float and contains non-integer float values + contain_float_values = yield from recursive_tile(mt.any(y != y.astype(int))) + yield contain_float_values.chunks + contain_float_values = ctx.get_chunks_result( + [contain_float_values.chunks[0].key] + )[0] + # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] + if contain_float_values: + yield from recursive_tile(assert_all_finite(y)) + return "continuous" + suffix + + unique_y = yield from recursive_tile(mt.unique(y)) + yield unique_y.chunks + [unique_y] + if (len(unique_y) > 2) or (y.ndim >= 2 and len(y[0]) > 1): + return "multiclass" + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] + else: + return "binary" # [1, 2] or [["a"], ["b"]] + + @classmethod + def tile(cls, op: "TypeOfTarget"): + result = yield from cls._tile(op) + return (yield from recursive_tile(mt.array(result))) + + +def type_of_target(y): + """ + Determine the type of data indicated by the target. + + Note that this type is the most specific type that can be inferred. + For example: + + * ``binary`` is more specific but compatible with ``multiclass``. + * ``multiclass`` of integers is more specific but compatible with + ``continuous``. + * ``multilabel-indicator`` is more specific but compatible with + ``multiclass-multioutput``. 
+ + Parameters + ---------- + y : array-like + + Returns + ------- + target_type : string + One of: + + * 'continuous': `y` is an array-like of floats that are not all + integers, and is 1d or a column vector. + * 'continuous-multioutput': `y` is a 2d tensor of floats that are + not all integers, and both dimensions are of size > 1. + * 'binary': `y` contains <= 2 discrete values and is 1d or a column + vector. + * 'multiclass': `y` contains more than two discrete values, is not a + sequence of sequences, and is 1d or a column vector. + * 'multiclass-multioutput': `y` is a 2d tensor that contains more + than two discrete values, is not a sequence of sequences, and both + dimensions are of size > 1. + * 'multilabel-indicator': `y` is a label indicator matrix, a tensor + of two dimensions with at least two columns, and at most 2 unique + values. + * 'unknown': `y` is array-like but none of the above, such as a 3d + tensor, sequence of sequences, or a tensor of non-sequence objects. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.utils.multiclass import type_of_target + >>> type_of_target([0.1, 0.6]).execute() + 'continuous' + >>> type_of_target([1, -1, -1, 1]).execute() + 'binary' + >>> type_of_target(['a', 'b', 'a']).execute() + 'binary' + >>> type_of_target([1.0, 2.0]).execute() + 'binary' + >>> type_of_target([1, 0, 2]).execute() + 'multiclass' + >>> type_of_target([1.0, 0.0, 3.0]).execute() + 'multiclass' + >>> type_of_target(['a', 'b', 'c']).execute() + 'multiclass' + >>> type_of_target(mt.array([[1, 2], [3, 1]])).execute() + 'multiclass-multioutput' + >>> type_of_target([[1, 2]]).execute() + 'multiclass-multioutput' + >>> type_of_target(mt.array([[1.5, 2.0], [3.0, 1.6]])).execute() + 'continuous-multioutput' + >>> type_of_target(mt.array([[0, 1], [1, 1]])).execute() + 'multilabel-indicator' + """ + if isinstance(y, TENSOR_TYPE): + y = mt.tensor(y) + + valid_types = (Sequence, spmatrix) if spmatrix is not None else (Sequence,) + valid = ( + isinstance(y, valid_types) + or hasattr(y, "__array__") + or hasattr(y, "__mars_tensor__") + ) and not isinstance(y, str) + + if not valid: + raise ValueError(f"Expected array-like (array or non-string sequence), got {y}") + + sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"] + if sparse_pandas: # pragma: no cover + raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'") + + if isinstance(y, ENTITY_TYPE): + y = mt.tensor(y) + + op = TypeOfTarget(y=y) + return op(y) + + +def check_classification_targets(y): + """ + Ensure that target y is of a non-regression type. + + Only the following target types (as defined in type_of_target) are allowed: + 'binary', 'multiclass', 'multiclass-multioutput', + 'multilabel-indicator', 'multilabel-sequences' + + Parameters + ---------- + y : array-like + """ + y_type = type_of_target(y) + + def check(t): + if t not in [ + "binary", + "multiclass", + "multiclass-multioutput", + "multilabel-indicator", + "multilabel-sequences", + ]: + raise ValueError("Unknown label type: %r" % t) + return t + + y_type = y_type.map_chunk(check, dtype=y_type.dtype) + return y_type diff --git a/python/xorbits/_mars/learn/utils/shuffle.py b/python/xorbits/_mars/learn/utils/shuffle.py new file mode 100644 index 000000000..8b8a3d646 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/shuffle.py @@ -0,0 +1,490 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections import defaultdict +from collections.abc import Iterable +from functools import reduce + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import ExecutableTuple, get_output_types, recursive_tile +from ...core.operand import MapReduceOperand, OperandStage +from ...dataframe.utils import parse_index +from ...lib import sparse +from ...serialization.serializables import FieldTypes, KeyField, TupleField +from ...tensor.array_utils import get_array_module +from ...tensor.utils import ( + check_random_state, + decide_unify_split, + gen_random_seeds, + validate_axis, +) +from ...utils import has_unknown_shape, lazy_import, tokenize +from ..operands import LearnOperandMixin, LearnShuffleProxy, OutputType +from ..utils import convert_to_tensor_or_dataframe + +cudf = lazy_import("cudf") + + +def _shuffle_index_value(op, index_value, chunk_index=None): + key = tokenize((op._values_, chunk_index, index_value.key)) + return parse_index(pd.Index([], index_value.to_pandas().dtype), key=key) + + +def _safe_slice(obj, slc, output_type): + if output_type == OutputType.tensor: + return obj[slc] + else: + return obj.iloc[slc] + + +class LearnShuffle(MapReduceOperand, LearnOperandMixin): + _op_type_ = OperandDef.PERMUTATION + + _axes = TupleField("axes", FieldTypes.int32) + _seeds = TupleField("seeds", FieldTypes.uint32) + + _input = KeyField("input") + _reduce_sizes = TupleField("reduce_sizes", FieldTypes.uint32) + + def __init__( + self, axes=None, seeds=None, output_types=None, reduce_sizes=None, **kw + ): + super().__init__( + _axes=axes, + _seeds=seeds, + _output_types=output_types, + _reduce_sizes=reduce_sizes, + **kw, + ) + + @property + def axes(self): + return self._axes + + @property + def seeds(self): + return self._seeds + + @property + def input(self): + return self._input + + @property + def reduce_sizes(self): + return self._reduce_sizes + + @property + def output_limit(self): + if self.stage is None: + return len(self.output_types) + return 1 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, arrays): + params = self._calc_params([ar.params for ar in arrays]) + return self.new_tileables(arrays, kws=params) + + def _shuffle_index_value(self, index_value): + return _shuffle_index_value(self, index_value) + + def _shuffle_dtypes(self, dtypes): + seed = self.seeds[self.axes.index(1)] + rs = np.random.RandomState(seed) + shuffled_dtypes = dtypes[rs.permutation(np.arange(len(dtypes)))] + return shuffled_dtypes + + def _calc_params(self, params): + axes = set(self.axes) + for i, output_type, param in zip(itertools.count(0), self.output_types, params): + if output_type == OutputType.dataframe: + if 0 in axes: + param["index_value"] = self._shuffle_index_value( + param["index_value"] + ) + if 1 in axes: + dtypes = param["dtypes"] = self._shuffle_dtypes(param["dtypes"]) + param["columns_value"] = 
parse_index(dtypes.index, store_data=True) + elif output_type == OutputType.series: + if 0 in axes: + param["index_value"] = self._shuffle_index_value( + param["index_value"] + ) + param["_position_"] = i + return params + + @staticmethod + def _safe_rechunk(tileable, ax_nsplit): + do_rechunk = False + for ax, nsplit in ax_nsplit.items(): + if ax >= tileable.ndim: + continue + if tuple(tileable.nsplits[ax]) != tuple(nsplit): + do_rechunk = True + if do_rechunk: + return (yield from recursive_tile(tileable.rechunk(ax_nsplit))) + else: + return tileable + + @classmethod + def _calc_chunk_params( + cls, + in_chunk, + axes, + chunk_shape, + output, + output_type, + chunk_op, + no_shuffle: bool, + ): + params = {"index": in_chunk.index} + if output_type == OutputType.tensor: + shape_c = list(in_chunk.shape) + for ax in axes: + if not no_shuffle and chunk_shape[ax] > 1: + shape_c[ax] = np.nan + params["shape"] = tuple(shape_c) + params["dtype"] = in_chunk.dtype + params["order"] = output.order + elif output_type == OutputType.dataframe: + shape_c = list(in_chunk.shape) + if 0 in axes: + if not no_shuffle and chunk_shape[0] > 1: + shape_c[0] = np.nan + params["shape"] = tuple(shape_c) + if 1 not in axes: + params["dtypes"] = in_chunk.dtypes + params["columns_value"] = in_chunk.columns_value + else: + params["dtypes"] = output.dtypes + params["columns_value"] = output.columns_value + params["index_value"] = _shuffle_index_value( + chunk_op, in_chunk.index_value, in_chunk.index + ) + else: + assert output_type == OutputType.series + if no_shuffle: + params["shape"] = in_chunk.shape + else: + params["shape"] = (np.nan,) + params["name"] = in_chunk.name + params["index_value"] = _shuffle_index_value( + chunk_op, in_chunk.index_value, in_chunk.index + ) + params["dtype"] = in_chunk.dtype + return params + + @classmethod + def tile(cls, op): + inputs = op.inputs + if has_unknown_shape(inputs): + yield + axis_to_nsplits = defaultdict(list) + has_dataframe = any( + output_type == OutputType.dataframe for output_type in op.output_types + ) + for ax in op.axes: + if has_dataframe and ax == 1: + # if DataFrame exists, for the columns axis, + # we only allow 1 chunk to ensure the columns consistent + axis_to_nsplits[ax].append((inputs[0].shape[ax],)) + continue + for inp in inputs: + if ax < inp.ndim: + axis_to_nsplits[ax].append(inp.nsplits[ax]) + ax_nsplit = {ax: decide_unify_split(*ns) for ax, ns in axis_to_nsplits.items()} + rechunked_inputs = [] + for inp in inputs: + inp_ax_nsplit = {ax: ns for ax, ns in ax_nsplit.items() if ax < inp.ndim} + inp = yield from cls._safe_rechunk(inp, inp_ax_nsplit) + rechunked_inputs.append(inp) + inputs = rechunked_inputs + + mapper_seeds = [None] * len(op.axes) + reducer_seeds = [None] * len(op.axes) + for i, ax in enumerate(op.axes): + rs = np.random.RandomState(op.seeds[i]) + size = len(ax_nsplit[ax]) + if size > 1: + mapper_seeds[i] = gen_random_seeds(size, rs) + reducer_seeds[i] = gen_random_seeds(size, rs) + else: + mapper_seeds[i] = reducer_seeds[i] = [op.seeds[i]] * size + out_chunks = [] + out_nsplits = [] + for output_type, inp, oup in zip(op.output_types, inputs, op.outputs): + inp_axes = tuple(ax for ax in op.axes if ax < inp.ndim) + reduce_sizes = tuple(inp.chunk_shape[ax] for ax in inp_axes) + output_types = [output_type] + + if len(inp_axes) == 0: + continue + + nsplits = list(inp.nsplits) + for ax in inp_axes: + cs = len(nsplits[ax]) + if cs > 1: + nsplits[ax] = (np.nan,) * cs + out_nsplits.append(tuple(nsplits)) + + if all(reduce_size == 1 for 
reduce_size in reduce_sizes): + # no need to do shuffle + chunks = [] + for c in inp.chunks: + chunk_op = LearnShuffle( + axes=inp_axes, + seeds=op.seeds[: len(inp_axes)], + output_types=output_types, + ) + params = cls._calc_chunk_params( + c, inp_axes, inp.chunk_shape, oup, output_type, chunk_op, True + ) + out_chunk = chunk_op.new_chunk([c], kws=[params]) + chunks.append(out_chunk) + out_chunks.append(chunks) + continue + + if inp.ndim > 1: + left_chunk_shape = [ + s for ax, s in enumerate(inp.chunk_shape) if ax not in inp_axes + ] + idx_iter = itertools.product(*[range(s) for s in left_chunk_shape]) + else: + idx_iter = [()] + reduce_chunks = [] + out_chunks.append(reduce_chunks) + for idx in idx_iter: + map_chunks = [] + for reducer_inds in itertools.product( + *[range(s) for s in reduce_sizes] + ): + inp_index = list(idx) + for ax, reducer_ind in zip(inp_axes, reducer_inds): + inp_index.insert(ax, reducer_ind) + inp_index = tuple(inp_index) + in_chunk = inp.cix[inp_index] + params = in_chunk.params + map_chunk_op = LearnShuffle( + stage=OperandStage.map, + output_types=output_types, + axes=inp_axes, + seeds=tuple( + mapper_seeds[j][in_chunk.index[ax]] + for j, ax in enumerate(inp_axes) + ), + reduce_sizes=reduce_sizes, + ) + map_chunk = map_chunk_op.new_chunk([in_chunk], **params) + map_chunks.append(map_chunk) + + map_chunk_kw = {} + if output_type == OutputType.tensor: + map_chunk_kw = {"dtype": inp.dtype, "shape": ()} + proxy_chunk = LearnShuffleProxy( + _tileable_keys=[inp.key], output_types=[output_type] + ).new_chunk(map_chunks, **map_chunk_kw) + + reduce_axes = tuple( + ax for j, ax in enumerate(inp_axes) if reduce_sizes[j] > 1 + ) + reduce_sizes_ = tuple(rs for rs in reduce_sizes if rs > 1) + for c in map_chunks: + chunk_op = LearnShuffle( + stage=OperandStage.reduce, + output_types=output_types, + axes=reduce_axes, + seeds=tuple( + reducer_seeds[j][c.index[ax]] + for j, ax in enumerate(inp_axes) + if reduce_sizes[j] > 1 + ), + reduce_sizes=reduce_sizes_, + n_reducers=len(map_chunks), + ) + params = cls._calc_chunk_params( + c, inp_axes, inp.chunk_shape, oup, output_type, chunk_op, False + ) + reduce_chunk = chunk_op.new_chunk([proxy_chunk], kws=[params]) + reduce_chunks.append(reduce_chunk) + + new_op = op.copy() + params = [out.params for out in op.outputs] + if len(out_chunks) < len(op.outputs): + # axes are all higher than its ndim + for i, inp in enumerate(op.inputs): + if all(ax >= inp.ndim for ax in op.axes): + out_chunks.insert(i, inp.chunks) + out_nsplits.insert(i, inp.nsplits) + assert len(out_chunks) == len(op.outputs) + for i, param, chunks, ns in zip( + itertools.count(), params, out_chunks, out_nsplits + ): + param["chunks"] = chunks + param["nsplits"] = ns + param["_position_"] = i + return new_op.new_tileables(op.inputs, kws=params) + + @classmethod + def execute_single(cls, ctx, op): + x = ctx[op.inputs[0].key] + conv = lambda x: x + if op.output_types[0] == OutputType.tensor: + xp = get_array_module(x) + if xp is sparse: + conv = lambda x: x + else: + conv = ( + xp.ascontiguousarray + if op.outputs[0].order.value == "C" + else xp.asfortranarray + ) + + for axis, seed in zip(op.axes, op.seeds): + size = x.shape[axis] + ind = np.random.RandomState(seed).permutation(np.arange(size)) + slc = (slice(None),) * axis + (ind,) + x = _safe_slice(x, slc, op.output_types[0]) + + ctx[op.outputs[0].key] = conv(x) + + @classmethod + def execute_map(cls, ctx, op): + out = op.outputs[0] + x = ctx[op.input.key] + axes, seeds, reduce_sizes = op.axes, op.seeds, op.reduce_sizes 
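+        # Map-phase overview: an axis whose reduce size is 1 (a single chunk
+        # along that shuffle axis) is permuted locally right here, while every
+        # remaining shuffle axis gets a pseudo-random reducer id per position;
+        # the selected slices are then emitted per reducer so that the reduce
+        # phase can concatenate them and apply the final permutation.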
+ if 1 in set(op.reduce_sizes): + # if chunk size on shuffle axis == 0 + inds = [slice(None) for _ in range(x.ndim)] + extra_axes, extra_seeds, extra_reduce_sizes = [], [], [] + for ax, seed, reduce_size in zip(axes, seeds, reduce_sizes): + rs = np.random.RandomState(seed) + if reduce_size == 1: + inds[ax] = rs.permutation(np.arange(x.shape[ax])) + else: + extra_axes.append(ax) + extra_seeds.append(seed) + extra_reduce_sizes.append(reduce_size) + # for the reduce == 1 + # do shuffle on the map phase + x = _safe_slice(x, tuple(inds), op.output_types[0]) + axes, seeds, reduce_sizes = extra_axes, extra_seeds, extra_reduce_sizes + + to_hash_inds = [] + for ax, seed, reduce_size in zip(axes, seeds, reduce_sizes): + rs = np.random.RandomState(seed) + to_hash_inds.append(rs.randint(reduce_size, size=x.shape[ax])) + + for reduce_index in itertools.product(*(range(rs) for rs in reduce_sizes)): + index = list(out.index) + for ax, ind in zip(axes, reduce_index): + index[ax] = ind + selected = x + for ax, to_hash_ind in zip(axes, to_hash_inds): + slc = (slice(None),) * ax + (to_hash_ind == index[ax],) + selected = _safe_slice(selected, slc, op.output_types[0]) + ctx[out.key, tuple(index)] = (ctx.get_current_chunk().index, selected) + + @classmethod + def execute_reduce(cls, ctx, op: "LearnShuffle"): + inputs_grid = np.empty(op.reduce_sizes, dtype=object) + for input_index, inp in op.iter_mapper_data(ctx): + reduce_index = tuple(input_index[ax] for ax in op.axes) + inputs_grid[reduce_index] = inp + ret = cls._concat_grid(inputs_grid, op.axes, op.output_types[0]) + for ax, seed in zip(op.axes, op.seeds): + ind = np.random.RandomState(seed).permutation(np.arange(ret.shape[ax])) + slc = (slice(None),) * ax + (ind,) + ret = _safe_slice(ret, slc, op.output_types[0]) + ctx[op.outputs[0].key] = ret + + @classmethod + def _concat_grid(cls, grid, axes, output_type): + if output_type == OutputType.tensor: + return cls._concat_tensor_grid(grid, axes) + elif output_type == OutputType.dataframe: + return cls._concat_dataframe_grid(grid, axes) + else: + assert output_type == OutputType.series + return cls._concat_series_grid(grid, axes) + + @classmethod + def _concat_dataframe_grid(cls, grid, axes): + xdf = pd if isinstance(grid.ravel()[0], pd.DataFrame) else cudf + # if 1 exists in axes, the shuffle would have been done in map phase + assert len(axes) == 1 + return xdf.concat(grid, axis=axes[0]) + + @classmethod + def _concat_series_grid(cls, grid, axes): + assert axes == (0,) and grid.ndim == 1 + + return reduce(lambda a, b: a.append(b), grid) + + @classmethod + def _concat_tensor_grid(cls, grid, axes): + cur = grid + xp = get_array_module(grid.ravel()[0]) + for ax, i in zip(axes[:0:-1], range(len(axes) - 1, 0, -1)): + new_shape = grid.shape[:i] + new_grid = np.empty(new_shape, dtype=object) + for idx in itertools.product(*(range(s) for s in new_shape)): + new_grid[idx] = xp.concatenate(cur[idx], axis=ax) + cur = new_grid + return xp.concatenate(cur, axis=axes[0]) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls.execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + cls.execute_reduce(ctx, op) + else: + cls.execute_single(ctx, op) + + +def shuffle(*arrays, **options): + arrays = [convert_to_tensor_or_dataframe(ar) for ar in arrays] + axes = options.pop("axes", (0,)) + if not isinstance(axes, Iterable): + axes = (axes,) + elif not isinstance(axes, tuple): + axes = tuple(axes) + random_state = check_random_state(options.pop("random_state", None)).to_numpy() + if options: 
+ raise TypeError( + f"shuffle() got an unexpected keyword argument {next(iter(options))}" + ) + + max_ndim = max(ar.ndim for ar in arrays) + axes = tuple(np.unique([validate_axis(max_ndim, ax) for ax in axes]).tolist()) + seeds = gen_random_seeds(len(axes), random_state) + + # verify shape + for ax in axes: + shapes = {ar.shape[ax] for ar in arrays if ax < ar.ndim} + if len(shapes) > 1: + raise ValueError(f"arrays do not have same shape on axis {ax}") + + op = LearnShuffle(axes=axes, seeds=seeds, output_types=get_output_types(*arrays)) + shuffled_arrays = op(arrays) + if len(arrays) == 1: + return shuffled_arrays[0] + else: + return ExecutableTuple(shuffled_arrays) diff --git a/python/xorbits/_mars/learn/utils/sparsefuncs.py b/python/xorbits/_mars/learn/utils/sparsefuncs.py new file mode 100644 index 000000000..1033edb51 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/sparsefuncs.py @@ -0,0 +1,183 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import numpy as np + +from ... import opcodes +from ... import tensor as mt +from ...core import OutputType, recursive_tile +from ...serialization.serializables import Int16Field, ReferenceField +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin + + +class LearnCountNonzero(LearnOperand, LearnOperandMixin): + _op_code_ = opcodes.COUNT_NONZERO + + axis = Int16Field("axis") + sample_weight = ReferenceField("sample_weight") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.sample_weight is not None: + self.sample_weight = inputs[-1] + + def __call__(self, x, sample_weight=None): + self.sample_weight = sample_weight + self._output_types = [ + OutputType.scalar if self.axis is None else OutputType.tensor + ] + dtype = np.dtype(int) + inputs = [x] + if sample_weight is not None: + dtype = sample_weight.dtype + inputs = [x, sample_weight] + + if self.axis is None: + shape = () + else: + shape = (x.shape[1 - self.axis],) + + return self.new_tileable(inputs, shape=shape, dtype=dtype) + + @classmethod + def tile(cls, op: "LearnCountNonzero"): + input_tensor = op.inputs[0] + out_tensor = op.outputs[0] + + if op.sample_weight is not None: + if has_unknown_shape(input_tensor): + yield + sample_weight = yield from recursive_tile( + op.sample_weight.rechunk({0: input_tensor.nsplits[0]}) + ) + else: + sample_weight = None + + chunks = [] + for input_chunk in input_tensor.chunks: + if sample_weight is None: + weight_chunk = None + else: + weight_chunk = sample_weight.cix[(input_chunk.index[0],)] + + new_op = op.copy().reset_key() + new_op.sample_weight = weight_chunk + + inputs = [input_chunk] if not weight_chunk else [input_chunk, weight_chunk] + if op.axis is None: + shape = (1, 1) + elif op.axis == 0: + shape = (1, input_chunk.shape[1]) + else: + shape = (input_chunk.shape[0], 1) + chunks.append( + new_op.new_chunk( + inputs, shape=shape, dtype=out_tensor.dtype, index=input_chunk.index + ) + ) + + new_op = 
op.copy().reset_key() + if op.axis is None: + nsplits = tuple((1,) * len(split) for split in input_tensor.nsplits) + shape = tuple(len(split) for split in input_tensor.nsplits) + elif op.axis == 0: + nsplits = ((1,) * len(input_tensor.nsplits[0]), input_tensor.nsplits[1]) + shape = (len(input_tensor.nsplits[0]), input_tensor.shape[1]) + else: + nsplits = (input_tensor.nsplits[0], (1,) * len(input_tensor.nsplits[1])) + shape = (input_tensor.shape[0], len(input_tensor.nsplits[1])) + + tileable = new_op.new_tileable( + out_tensor.inputs, + chunks=chunks, + nsplits=nsplits, + shape=shape, + dtype=out_tensor.dtype, + ) + return [(yield from recursive_tile(mt.sum(tileable, axis=op.axis)))] + + @classmethod + def execute(cls, ctx, op: "LearnCountNonzero"): + axis = op.axis + X = ctx[op.inputs[0].key] + sample_weight = ( + ctx[op.sample_weight.key] if op.sample_weight is not None else None + ) + + # We rely here on the fact that np.diff(Y.indptr) for a CSR + # will return the number of nonzero entries in each row. + # A bincount over Y.indices will return the number of nonzeros + # in each column. See ``csr_matrix.getnnz`` in scipy >= 0.14. + if axis is None: + if sample_weight is None: + res = X.nnz + else: + res = np.dot(np.diff(X.indptr), sample_weight) + elif axis == 1: + out = np.diff(X.indptr) + if sample_weight is None: + # astype here is for consistency with axis=0 dtype + res = out.astype("intp") + else: + res = out * sample_weight + else: + if sample_weight is None: + res = np.bincount(X.indices, minlength=X.shape[1]) + else: + weights = np.repeat(sample_weight, np.diff(X.indptr)) + res = np.bincount(X.indices, minlength=X.shape[1], weights=weights) + if np.isscalar(res): + res = np.array([res]) + out_shape = op.outputs[0].shape + if any(np.isnan(s) for s in out_shape): + new_shape = list(out_shape) + for i, s in enumerate(out_shape): + if np.isnan(s): + new_shape[i] = -1 + out_shape = tuple(new_shape) + ctx[op.outputs[0].key] = res.reshape(out_shape) + + +def count_nonzero(X, axis: Optional[int] = None, sample_weight=None): + """A variant of X.getnnz() with extension to weighting on axis 0 + + Useful in efficiently calculating multilabel metrics. + + Parameters + ---------- + X : CSR sparse matrix of shape (n_samples, n_labels) + Input data. + + axis : None, 0 or 1 + The axis on which the data is aggregated. + + sample_weight : array-like of shape (n_samples,), default=None + Weight for each row of X. + """ + if axis == -1: + axis = 1 + elif axis == -2: + axis = 0 + if axis is not None and axis not in (0, 1): + raise ValueError(f"Unsupported axis: {axis}") + + X = mt.asarray(X) + if sample_weight is not None: + sample_weight = mt.asarray(sample_weight) + + op = LearnCountNonzero(axis=axis) + return op(X, sample_weight=sample_weight) diff --git a/python/xorbits/_mars/learn/utils/tests/__init__.py b/python/xorbits/_mars/learn/utils/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/utils/tests/test_checks.py b/python/xorbits/_mars/learn/utils/tests/test_checks.py new file mode 100644 index 000000000..1bdd8273a --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_checks.py @@ -0,0 +1,130 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest +import scipy.sparse as sps + +from .... import dataframe as md +from .... import tensor as mt +from ....config import option_context +from ..checks import assert_all_finite, check_non_negative_then_return_value + + +def test_check_non_negative_then_return_value_execution(setup): + raw = np.random.randint(10, size=(10, 5)) + c = mt.tensor(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + result = r.execute().fetch() + np.testing.assert_array_equal(result, raw) + + raw = raw.copy() + raw[1, 3] = -1 + c = mt.tensor(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + with pytest.raises(ValueError): + _ = r.execute().fetch() + + raw = sps.random(10, 5, density=0.3, format="csr") + c = mt.tensor(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + result = r.execute().fetch() + np.testing.assert_array_equal(result.toarray(), raw.A) + + raw = raw.copy() + raw[1, 3] = -1 + c = mt.tensor(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + with pytest.raises(ValueError): + _ = r.execute().fetch() + + raw = pd.DataFrame(np.random.rand(10, 4)) + c = md.DataFrame(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + result = r.execute().fetch() + + pd.testing.assert_frame_equal(result, raw) + + raw = raw.copy() + raw.iloc[1, 3] = -1 + c = md.DataFrame(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + with pytest.raises(ValueError): + _ = r.execute().fetch() + + +def test_assert_all_finite(setup): + raw = np.array([2.3, np.inf], dtype=np.float64) + x = mt.tensor(raw) + + with pytest.raises(ValueError): + r = assert_all_finite(x) + r.execute() + + raw = np.array([2.3, np.nan], dtype=np.float64) + x = mt.tensor(raw) + + with pytest.raises(ValueError): + r = assert_all_finite(x, allow_nan=False) + r.execute() + + max_float32 = np.finfo(np.float32).max + raw = [max_float32] * 2 + assert not np.isfinite(np.sum(raw)) + x = mt.tensor(raw) + + r = assert_all_finite(x) + result = r.execute().fetch() + assert result is True + + raw = np.array([np.nan, "a"], dtype=object) + x = mt.tensor(raw) + + with pytest.raises(ValueError): + r = assert_all_finite(x) + r.execute() + + raw = np.random.rand(10) + x = mt.tensor(raw, chunk_size=2) + + r = assert_all_finite(x, check_only=False) + result = r.execute().fetch() + np.testing.assert_array_equal(result, raw) + + r = assert_all_finite(x) + result = r.execute().fetch() + assert result is 
True + + with option_context() as options: + options.learn.assume_finite = True + + assert assert_all_finite(x) is None + assert assert_all_finite(x, check_only=False) is x + + # test sparse + s = sps.random( + 10, 3, density=0.1, format="csr", random_state=np.random.RandomState(0) + ) + s[0, 2] = np.nan + + with pytest.raises(ValueError): + r = assert_all_finite(s) + r.execute() diff --git a/python/xorbits/_mars/learn/utils/tests/test_collect_ports.py b/python/xorbits/_mars/learn/utils/tests/test_collect_ports.py new file mode 100644 index 000000000..1642f158b --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_collect_ports.py @@ -0,0 +1,24 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..collect_ports import collect_ports + + +def test_collect_ports(setup_cluster): + session = setup_cluster + workers = [ + pool.external_address for pool in session._session.client._cluster._worker_pools + ] + # make sure assert works inside execution of collect ports + collect_ports(workers * 2).execute(session=session) diff --git a/python/xorbits/_mars/learn/utils/tests/test_core.py b/python/xorbits/_mars/learn/utils/tests/test_core.py new file mode 100644 index 000000000..a6dc71862 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_core.py @@ -0,0 +1,41 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .... import tensor as mt +from ..core import sort_by + + +def test_sort_by(setup): + rs = np.random.RandomState(0) + raw1 = rs.rand(10) + raw2 = rs.rand(10) + raw3 = rs.rand(10) + + a1 = mt.tensor(raw1, chunk_size=4) + a2 = mt.tensor(raw2, chunk_size=4) + a3 = mt.tensor(raw3, chunk_size=4) + + s1, s2 = sort_by([a1, a2], by=a3) + ind = np.argsort(raw3) + e1, e2 = raw1[ind], raw2[ind] + np.testing.assert_array_equal(s1, e1) + np.testing.assert_array_equal(s2, e2) + + s1, s2 = sort_by([a1, a2], by=a2, ascending=False) + ind = np.argsort(raw2)[::-1] + e1, e2 = raw1[ind], raw2[ind] + np.testing.assert_array_equal(s1, e1) + np.testing.assert_array_equal(s2, e2) diff --git a/python/xorbits/_mars/learn/utils/tests/test_extmath.py b/python/xorbits/_mars/learn/utils/tests/test_extmath.py new file mode 100644 index 000000000..48f8c24a2 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_extmath.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from numpy.testing import assert_array_almost_equal + +from ..extmath import softmax + + +@pytest.mark.parametrize("copy", [True, False]) +def test_softmax(setup, copy): + x = [[1, 2, 3], [2, 3, 4]] + ref = [[0.09003057, 0.24472847, 0.66524096], [0.09003057, 0.24472847, 0.66524096]] + x_ = softmax(x, copy=copy) + assert_array_almost_equal(ref, x_) diff --git a/python/xorbits/_mars/learn/utils/tests/test_multiclass.py b/python/xorbits/_mars/learn/utils/tests/test_multiclass.py new file mode 100644 index 000000000..d778ac339 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_multiclass.py @@ -0,0 +1,262 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import product + +import numpy as np +import pytest +import scipy.sparse as sps +from scipy.sparse import csr_matrix +from sklearn.utils._testing import assert_array_equal +from sklearn.utils.estimator_checks import _NotAnArray +from sklearn.utils.multiclass import is_multilabel as sklearn_is_multilabel +from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target + +from .... 
import tensor as mt +from ..multiclass import is_multilabel, type_of_target, unique_labels + +EXAMPLES = { + "multilabel-indicator": [ + # valid when the data is formatted as sparse or dense, identified + # by CSR format when the testing takes place + csr_matrix(np.random.RandomState(42).randint(2, size=(10, 10))), + [[0, 1], [1, 0]], + [[0, 1]], + csr_matrix(np.array([[0, 1], [1, 0]])), + csr_matrix(np.array([[0, 1], [1, 0]], dtype=bool)), + csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.int8)), + csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.uint8)), + csr_matrix(np.array([[0, 1], [1, 0]], dtype=float)), + csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.float32)), + csr_matrix(np.array([[0, 0], [0, 0]])), + csr_matrix(np.array([[0, 1]])), + # Only valid when data is dense + [[-1, 1], [1, -1]], + np.array([[-1, 1], [1, -1]]), + np.array([[-3, 3], [3, -3]]), + _NotAnArray(np.array([[-3, 3], [3, -3]])), + ], + "multiclass": [ + [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], + np.array([1, 0, 2]), + np.array([1, 0, 2], dtype=np.int8), + np.array([1, 0, 2], dtype=np.uint8), + np.array([1, 0, 2], dtype=float), + np.array([1, 0, 2], dtype=np.float32), + np.array([[1], [0], [2]]), + _NotAnArray(np.array([1, 0, 2])), + [0, 1, 2], + ["a", "b", "c"], + np.array(["a", "b", "c"]), + np.array(["a", "b", "c"], dtype=object), + np.array(["a", "b", "c"], dtype=object), + ], + "multiclass-multioutput": [ + [[1, 0, 2, 2], [1, 4, 2, 4]], + [["a", "b"], ["c", "d"]], + np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), + np.array([["a", "b"], ["c", "d"]]), + np.array([["a", "b"], ["c", "d"]]), + np.array([["a", "b"], ["c", "d"]], dtype=object), + np.array([[1, 0, 2]]), + _NotAnArray(np.array([[1, 0, 2]])), + ], + "binary": [ + [0, 1], + [1, 1], + [], + [0], + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32), + np.array([[0], [1]]), + _NotAnArray(np.array([[0], [1]])), + [1, -1], + [3, 5], + ["a"], + ["a", "b"], + ["abc", "def"], + np.array(["abc", "def"]), + ["a", "b"], + np.array(["abc", "def"], dtype=object), + ], + "continuous": [ + [1e-5], + [0, 0.5], + np.array([[0], [0.5]]), + np.array([[0], [0.5]], dtype=np.float32), + ], + "continuous-multioutput": [ + np.array([[0, 0.5], [0.5, 0]]), + np.array([[0, 0.5], [0.5, 0]], dtype=np.float32), + np.array([[0, 0.5]]), + ], + "unknown": [ + [[]], + [()], + # sequence of sequences that weren't supported even before deprecation + np.array([np.array([]), np.array([1, 2, 3])], dtype=object), + # [np.array([]), np.array([1, 2, 3])], # deprecated in numpy v1.24 + [{1, 2, 3}, {1, 2}], + [frozenset([1, 2, 3]), frozenset([1, 2])], + # and also confusable as sequences of sequences + [{0: "a", 1: "b"}, {0: "a"}], + # empty second dimension + np.array([[], []]), + # 3d + np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]), + ], +} + +NON_ARRAY_LIKE_EXAMPLES = [ + {1, 2, 3}, + {0: "a", 1: "b"}, + {0: [5], 1: [5]}, + "abc", + frozenset([1, 2, 3]), + None, +] + + +def test_unique_labels(setup): + # Empty iterable + with pytest.raises(ValueError): + unique_labels() + + # Multiclass problem + 
assert_array_equal(unique_labels(range(10)), np.arange(10)) + assert_array_equal(unique_labels(np.arange(10)), np.arange(10)) + assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4])) + + # Multilabel indicator + assert_array_equal( + unique_labels(np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])), np.arange(3) + ) + + assert_array_equal(unique_labels(np.array([[0, 0, 1], [0, 0, 0]])), np.arange(3)) + + # Several arrays passed + assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5)) + assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3)) + + # Border line case with binary indicator matrix + with pytest.raises(ValueError): + unique_labels([4, 0, 2], np.ones((5, 5))).execute() + with pytest.raises(ValueError): + unique_labels(np.ones((5, 4)), np.ones((5, 5))).execute() + + assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5)) + + +def test_unique_labels_non_specific(setup): + # Test unique_labels with a variety of collected examples + + # Smoke test for all supported format + for format in ["binary", "multiclass", "multilabel-indicator"]: + for y in EXAMPLES[format]: + unique_labels(y).execute() + + # We don't support those format at the moment + for example in NON_ARRAY_LIKE_EXAMPLES: + with pytest.raises(ValueError): + unique_labels(example).execute() + + for y_type in [ + "unknown", + "continuous", + "continuous-multioutput", + "multiclass-multioutput", + ]: + for example in EXAMPLES[y_type]: + with pytest.raises(ValueError): + unique_labels(example).execute() + + +def test_unique_labels_mixed_types(setup): + # Mix with binary or multiclass and multilabel + mix_clf_format = product( + EXAMPLES["multilabel-indicator"], EXAMPLES["multiclass"] + EXAMPLES["binary"] + ) + + for y_multilabel, y_multiclass in mix_clf_format: + with pytest.raises(ValueError): + unique_labels(y_multiclass, y_multilabel).execute() + with pytest.raises(ValueError): + unique_labels(y_multilabel, y_multiclass).execute() + + with pytest.raises(ValueError): + unique_labels([[1, 2]], [["a", "d"]]).execute() + + with pytest.raises(ValueError): + unique_labels(["1", 2]).execute() + + with pytest.raises(ValueError): + unique_labels([["1", 2], [1, 3]]).execute() + + with pytest.raises(ValueError): + unique_labels([["1", "2"], [2, 3]]).execute() + + +def test_is_multilabel(setup): + raws = [ + [[1, 2]], + [0, 1, 0, 1], + # [[1], [0, 2], []], # deprecated in numpy v1.24 + np.array([[1, 0], [0, 0]]), + np.array([[1], [0], [0]]), + np.array([[1, 0, 0]]), + np.array([[1.0, 0.0], [0.0, 0.0]]), + sps.csr_matrix([[1, 0], [0, 1]]), + ] + + for raw in raws: + assert is_multilabel(raw).to_numpy() == sklearn_is_multilabel(raw) + + t = mt.tensor(raws[3], chunk_size=1) + assert is_multilabel(t).to_numpy() == sklearn_is_multilabel(raws[3]) + + +def test_type_of_target(setup): + raws = [ + np.array([[0, 1], [0, 0]]), # multilabel + np.random.randint(2, size=(5, 3, 3)), # ndim > 2, unknown + np.array([[]]), # ndim == 2, shape[1] == 0, unknown + np.array([[1, 2], [1, 2]]), + np.array([1, 2, 3]), + np.array([0.1, 0.2, 3]), + np.array([[0.1, 0.2, 3]]), + np.array([[1.0, 0.2]]), + np.array([[1.0, 2.0, 3]]), + np.array([[1, 2]]), + np.array([1, 2]), + np.array([["a"], ["b"]], dtype=object), + [[1, 2]], + [], # empty list + ] + + for raw in raws: + assert type_of_target(raw).to_numpy() == sklearn_type_of_target(raw) + + t = mt.tensor(raws[0], chunk_size=1) + assert type_of_target(t).to_numpy() == sklearn_type_of_target(raws[0]) + + with pytest.raises(ValueError): + 
type_of_target("sth") diff --git a/python/xorbits/_mars/learn/utils/tests/test_shuffle.py b/python/xorbits/_mars/learn/utils/tests/test_shuffle.py new file mode 100644 index 000000000..139df2e25 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_shuffle.py @@ -0,0 +1,161 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... import tensor as mt +from ....core import tile +from .. import shuffle +from ..shuffle import LearnShuffle + + +def test_shuffle_expr(): + a = mt.random.rand(10, 3, chunk_size=2) + b = md.DataFrame(mt.random.rand(10, 5), chunk_size=2) + + new_a, new_b = shuffle(a, b, random_state=0) + + assert new_a.op is new_b.op + assert isinstance(new_a.op, LearnShuffle) + assert new_a.shape == a.shape + assert new_b.shape == b.shape + assert b.index_value.key != new_b.index_value.key + + new_a, new_b = tile(new_a, new_b) + + assert len(new_a.chunks) == 10 + assert np.isnan(new_a.chunks[0].shape[0]) + assert len(new_b.chunks) == 15 + assert np.isnan(new_b.chunks[0].shape[0]) + assert new_b.chunks[0].index_value.key != new_b.chunks[1].index_value.key + assert new_a.chunks[0].op.seeds == new_b.chunks[0].op.seeds + + c = mt.random.rand(10, 5, 3, chunk_size=2) + d = md.DataFrame(mt.random.rand(10, 5), chunk_size=(2, 5)) + + new_c, new_d = shuffle(c, d, axes=(0, 1), random_state=0) + + assert new_c.op is new_d.op + assert isinstance(new_c.op, LearnShuffle) + assert new_c.shape == c.shape + assert new_d.shape == d.shape + assert d.index_value.key != new_d.index_value.key + assert not np.all(new_d.dtypes.index[:-1] < new_d.dtypes.index[1:]) + pd.testing.assert_series_equal(d.dtypes, new_d.dtypes.sort_index()) + + new_c, new_d = tile(new_c, new_d) + + assert len(new_c.chunks) == 5 * 1 * 2 + assert np.isnan(new_c.chunks[0].shape[0]) + assert len(new_d.chunks) == 5 + assert np.isnan(new_d.chunks[0].shape[0]) + assert new_d.chunks[0].shape[1] == 5 + assert new_d.chunks[0].index_value.key != new_d.chunks[1].index_value.key + pd.testing.assert_series_equal(new_d.chunks[0].dtypes.sort_index(), d.dtypes) + assert new_c.chunks[0].op.seeds == new_d.chunks[0].op.seeds + assert len(new_c.chunks[0].op.seeds) == 1 + assert new_c.chunks[0].op.reduce_sizes == (5,) + + with pytest.raises(ValueError): + a = mt.random.rand(10, 5) + b = mt.random.rand(10, 4, 3) + shuffle(a, b, axes=1) + + with pytest.raises(TypeError): + shuffle(a, b, unknown_param=True) + + assert isinstance(shuffle(mt.random.rand(10, 5)), mt.Tensor) + + +def _sort(data, axes): + cur = data + for ax in axes: + if ax < data.ndim: + cur = np.sort(cur, axis=ax) + return cur + + +def test_shuffle_execution(setup): + # test consistency + s1 = np.arange(9).reshape(3, 3) + s2 = np.arange(1, 10).reshape(3, 3) + ts1 = mt.array(s1, chunk_size=2) + ts2 = mt.array(s2, chunk_size=2) + + ret = shuffle(ts1, ts2, axes=[0, 1], random_state=0) + res1, res2 = ret.execute().fetch() + + # calc row index + s1_col_0 = 
s1[:, 0].tolist() + rs1_col_0 = [res1[:, i] for i in range(3) if set(s1_col_0) == set(res1[:, i])][0] + row_index = [s1_col_0.index(j) for j in rs1_col_0] + # calc col index + s1_row_0 = s1[0].tolist() + rs1_row_0 = [res1[i] for i in range(3) if set(s1_row_0) == set(res1[i])][0] + col_index = [s1_row_0.index(j) for j in rs1_row_0] + np.testing.assert_array_equal(res2, s2[row_index][:, col_index]) + + # tensor + tensor + raw1 = np.random.rand(10, 15, 20) + t1 = mt.array(raw1, chunk_size=8) + raw2 = np.random.rand(10, 15, 20) + t2 = mt.array(raw2, chunk_size=5) + + for axes in [(0,), (0, 1), (0, 2), (1, 2), (0, 1, 2)]: + ret = shuffle(t1, t2, axes=axes, random_state=0) + res1, res2 = ret.execute().fetch() + + assert res1.shape == raw1.shape + assert res2.shape == raw2.shape + np.testing.assert_array_equal(_sort(raw1, axes), _sort(res1, axes)) + np.testing.assert_array_equal(_sort(raw2, axes), _sort(res2, axes)) + + # tensor + tensor(more dimension) + raw3 = np.random.rand(10, 15) + t3 = mt.array(raw3, chunk_size=(8, 15)) + raw4 = np.random.rand(10, 15, 20) + t4 = mt.array(raw4, chunk_size=(5, 15, 10)) + + for axes in [(1,), (0, 1), (1, 2)]: + ret = shuffle(t3, t4, axes=axes, random_state=0) + res3, res4 = ret.execute().fetch() + + assert res3.shape == raw3.shape + assert res4.shape == raw4.shape + np.testing.assert_array_equal(_sort(raw3, axes), _sort(res3, axes)) + np.testing.assert_array_equal(_sort(raw4, axes), _sort(res4, axes)) + + # tensor + dataframe + series + raw5 = np.random.rand(10, 15, 20) + t5 = mt.array(raw5, chunk_size=8) + t6 = mt.array(raw5[:, 0, 0], chunk_size=6) + raw6 = pd.DataFrame(np.random.rand(10, 15)) + df = md.DataFrame(raw6, chunk_size=(8, 15)) + raw7 = pd.Series(np.random.rand(10)) + series = md.Series(raw7, chunk_size=8) + + for axes in [(0,), (1,), (0, 1), (1, 2), [0, 1, 2]]: + ret = shuffle(t5, df, series, t6, axes=axes, random_state=0) + # skip check nsplits because it's updated + res5, res_df, res_series, res6 = ret.execute( + extra_config={"check_nsplits": False} + ).fetch(extra_config={"check_nsplits": False}) + + assert res5.shape == raw5.shape + assert res_df.shape == df.shape + assert res_series.shape == series.shape + assert res6.shape == (raw5.shape[0],) diff --git a/python/xorbits/_mars/learn/utils/tests/test_sparsefuncs.py b/python/xorbits/_mars/learn/utils/tests/test_sparsefuncs.py new file mode 100644 index 000000000..6952d3763 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_sparsefuncs.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sp + +from .... 
import tensor as mt +from ..sparsefuncs import count_nonzero + + +def test_count_nonzero(setup): + X = np.array( + [[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64 + ) + X_nonzero = X != 0 + + X_csr = sp.csr_matrix(X) + X_csr_t = mt.tensor(X_csr, chunk_size=3) + + sample_weight = [0.5, 0.2, 0.3, 0.1, 0.1] + X_nonzero_weighted = X_nonzero * np.array(sample_weight)[:, None] + + for axis in [0, 1, -1, -2, None]: + np.testing.assert_array_almost_equal( + count_nonzero(X_csr_t, axis=axis).execute().fetch(), + X_nonzero.sum(axis=axis), + ) + np.testing.assert_array_almost_equal( + count_nonzero(X_csr_t, axis=axis, sample_weight=sample_weight) + .execute() + .fetch(), + X_nonzero_weighted.sum(axis=axis), + ) + + with pytest.raises(ValueError): + count_nonzero(X_csr_t, axis=2).execute() + + assert count_nonzero(X_csr_t, axis=0).dtype == count_nonzero(X_csr_t, axis=1).dtype + assert ( + count_nonzero(X_csr_t, axis=0, sample_weight=sample_weight).dtype + == count_nonzero(X_csr_t, axis=1, sample_weight=sample_weight).dtype + ) + + # Check dtypes with large sparse matrices too + # XXX: test fails on 32bit (Windows/Linux) + try: + X_csr.indices = X_csr.indices.astype(np.int64) + X_csr.indptr = X_csr.indptr.astype(np.int64) + X_csr_t = mt.tensor(X_csr, chunk_size=3) + + assert ( + count_nonzero(X_csr_t, axis=0).dtype == count_nonzero(X_csr_t, axis=1).dtype + ) + assert ( + count_nonzero(X_csr_t, axis=0, sample_weight=sample_weight).dtype + == count_nonzero(X_csr_t, axis=1, sample_weight=sample_weight).dtype + ) + except TypeError as e: + assert "according to the rule 'safe'" in e.args[0] and np.intp().nbytes < 8, e diff --git a/python/xorbits/_mars/learn/utils/tests/test_validation.py b/python/xorbits/_mars/learn/utils/tests/test_validation.py new file mode 100644 index 000000000..f9283de69 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_validation.py @@ -0,0 +1,256 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import product + +import numpy as np +import pytest +import scipy.sparse as sp +from sklearn.utils._mocking import MockDataFrame +from sklearn.utils._testing import assert_raise_message, assert_raises_regex +from sklearn.utils.estimator_checks import _NotAnArray + +from .... import dataframe as md +from .... import tensor as mt +from ....tensor.core import Tensor +from ..validation import check_array, check_consistent_length + + +def test_ordering(): + # Check that ordering is enforced correctly by validation utilities. + # We need to check each validation utility, because a 'copy' without + # 'order=K' will kill the ordering. 
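+    # check_array(..., order="C") should hand back a C-contiguous tensor and
+    # order="F" an F-contiguous one, for both X and its transpose, whether or
+    # not a copy is requested.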
+ X = mt.ones((10, 5)) + for A in X, X.T: + for copy in (True, False): + B = check_array(A, order="C", copy=copy) + assert B.flags["C_CONTIGUOUS"] is True + B = check_array(A, order="F", copy=copy) + assert B.flags["F_CONTIGUOUS"] is True + if copy: + assert A is not B + + +def test_check_array(setup): + # accept_sparse == False + # raise error on sparse inputs + X = [[1, 2], [3, 4]] + X_csr = sp.csr_matrix(X) + with pytest.raises(TypeError): + check_array(X_csr) + X_csr = mt.tensor(sp.csr_matrix(X)) + with pytest.raises(TypeError): + check_array(X_csr) + # ensure_2d=False + X_array = check_array([0, 1, 2], ensure_2d=False) + assert X_array.ndim == 1 + # ensure_2d=True with 1d array + assert_raise_message( + ValueError, + "Expected 2D array, got 1D array instead", + check_array, + [0, 1, 2], + ensure_2d=True, + ) + assert_raise_message( + ValueError, + "Expected 2D array, got 1D array instead", + check_array, + mt.tensor([0, 1, 2]), + ensure_2d=True, + ) + # ensure_2d=True with scalar array + assert_raise_message( + ValueError, + "Expected 2D array, got scalar array instead", + check_array, + 10, + ensure_2d=True, + ) + # don't allow ndim > 3 + X_ndim = mt.arange(8).reshape(2, 2, 2) + with pytest.raises(ValueError): + check_array(X_ndim) + check_array(X_ndim, allow_nd=True) # doesn't raise + + # dtype and order enforcement. + X_C = mt.arange(4).reshape(2, 2).copy("C") + X_F = X_C.copy("F") + X_int = X_C.astype(mt.int) + X_float = X_C.astype(mt.float) + Xs = [X_C, X_F, X_int, X_float] + dtypes = [mt.int32, mt.int, mt.float, mt.float32, None, mt.bool, object] + orders = ["C", "F", None] + copy_flags = [True, False] + + for X, dtype, order, copy in product(Xs, dtypes, orders, copy_flags): + X_checked = check_array( + X, dtype=dtype, order=order, copy=copy, force_all_finite=False + ) + if dtype is not None: + assert X_checked.dtype == dtype + else: + assert X_checked.dtype == X.dtype + if order == "C": + assert X_checked.flags["C_CONTIGUOUS"] + assert not X_checked.flags["F_CONTIGUOUS"] + elif order == "F": + assert X_checked.flags["F_CONTIGUOUS"] + assert not X_checked.flags["C_CONTIGUOUS"] + if copy: + assert X is not X_checked + else: + # doesn't copy if it was already good + if ( + X.dtype == X_checked.dtype + and X_checked.flags["C_CONTIGUOUS"] == X.flags["C_CONTIGUOUS"] + and X_checked.flags["F_CONTIGUOUS"] == X.flags["F_CONTIGUOUS"] + ): + assert X is X_checked + + # other input formats + # convert lists to arrays + X_dense = check_array([[1, 2], [3, 4]]) + assert isinstance(X_dense, Tensor) + # raise on too deep lists + with pytest.raises(ValueError): + check_array(X_ndim.to_numpy().tolist()) + check_array(X_ndim.to_numpy().tolist(), allow_nd=True) # doesn't raise + # convert weird stuff to arrays + X_no_array = _NotAnArray(X_dense.to_numpy()) + result = check_array(X_no_array) + assert isinstance(result, Tensor) + + # deprecation warning if string-like array with dtype="numeric" + expected_warn_regex = r"converted to decimal numbers if dtype='numeric'" + X_str = [["11", "12"], ["13", "xx"]] + for X in [X_str, mt.array(X_str, dtype="U"), mt.array(X_str, dtype="S")]: + with pytest.warns(FutureWarning, match=expected_warn_regex): + check_array(X, dtype="numeric") + + # deprecation warning if byte-like array with dtype="numeric" + X_bytes = [[b"a", b"b"], [b"c", b"d"]] + for X in [X_bytes, mt.array(X_bytes, dtype="V1")]: + with pytest.warns(FutureWarning, match=expected_warn_regex): + check_array(X, dtype="numeric") + + # test finite + X = [[1.0, np.nan], [2.0, 3.0]] + with 
pytest.raises(ValueError): + _ = check_array(X).execute() + + +def test_check_array_pandas_dtype_object_conversion(): + # test that data-frame like objects with dtype object + # get converted + X = mt.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=mt.object) + X_df = MockDataFrame(X) + assert check_array(X_df).dtype.kind == "f" + assert check_array(X_df, ensure_2d=False).dtype.kind == "f" + # smoke-test against dataframes with column named "dtype" + X_df.dtype = "Hans" + assert check_array(X_df, ensure_2d=False).dtype.kind == "f" + + +def test_check_array_from_dataframe(): + X = md.DataFrame({"a": [1.0, 2.0, 3.0]}) + assert check_array(X).dtype.kind == "f" + + +def test_check_array_accept_sparse_type_exception(): + X = [[1, 2], [3, 4]] + X_csr = sp.csr_matrix(X) + + msg = ( + "A sparse tensor was passed, but dense data is required. " + "Use X.todense() to convert to a dense tensor." + ) + assert_raise_message(TypeError, msg, check_array, X_csr, accept_sparse=False) + + msg = ( + "When providing 'accept_sparse' as a tuple or list, " + "it must contain at least one string value." + ) + assert_raise_message( + ValueError, msg.format([]), check_array, X_csr, accept_sparse=[] + ) + assert_raise_message( + ValueError, msg.format(()), check_array, X_csr, accept_sparse=() + ) + + with pytest.raises(ValueError): + check_array(X_csr, accept_sparse=object) + + +def test_check_array_accept_sparse_no_exception(): + X = [[1, 2], [3, 4]] + X_csr = sp.csr_matrix(X) + + array = check_array(X_csr, accept_sparse=True) + assert isinstance(array, Tensor) + assert array.issparse() is True + + +def test_check_array_min_samples_and_features_messages(): + # empty list is considered 2D by default: + msg = "0 feature(s) (shape=(1, 0)) while a minimum of 1 is required." + assert_raise_message(ValueError, msg, check_array, [[]]) + + # If considered a 1D collection when ensure_2d=False, then the minimum + # number of samples will break: + msg = "0 sample(s) (shape=(0,)) while a minimum of 1 is required." + assert_raise_message(ValueError, msg, check_array, [], ensure_2d=False) + + # Invalid edge case when checking the default minimum sample of a scalar + msg = "Singleton array array(42) cannot be considered a valid collection." 
+ assert_raise_message(TypeError, msg, check_array, 42, ensure_2d=False) + + +def test_check_array_complex_data_error(): + X = mt.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]]) + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # list of lists + X = [[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]] + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # tuple of tuples + X = ((1 + 2j, 3 + 4j, 5 + 7j), (2 + 3j, 4 + 5j, 6 + 7j)) + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # list of np arrays + X = [mt.array([1 + 2j, 3 + 4j, 5 + 7j]), mt.array([2 + 3j, 4 + 5j, 6 + 7j])] + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # tuple of np arrays + X = (mt.array([1 + 2j, 3 + 4j, 5 + 7j]), mt.array([2 + 3j, 4 + 5j, 6 + 7j])) + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # dataframe + X = MockDataFrame(mt.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]])) + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # sparse matrix + X = sp.coo_matrix([[0, 1 + 2j], [0, 0]]) + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + +def test_check_consistent_length(setup): + t = mt.random.RandomState(0).rand(10, 5) + t2 = t[t[:, 0] < 0.5] + t3 = t[t[:, 1] < 0.1] + + check_consistent_length(t2, t2.copy()) + with pytest.raises(ValueError): + check_consistent_length(t2, t3) diff --git a/python/xorbits/_mars/learn/utils/validation.py b/python/xorbits/_mars/learn/utils/validation.py new file mode 100644 index 000000000..c2cd68792 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/validation.py @@ -0,0 +1,727 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +import warnings + +import numpy as np +from numpy.core.numeric import ComplexWarning + +try: + from sklearn.exceptions import DataConversionWarning + from sklearn.utils.validation import check_is_fitted +except ImportError: # pragma: no cover + check_is_fitted = None + DataConversionWarning = UserWarning + +from ... import dataframe as md +from ... import tensor as mt +from ...core import ExecutableTuple +from ...lib.sparse import issparse +from ...tensor import Tensor +from .checks import ( + AssertAllFinite, + assert_all_finite, + check_non_negative_then_return_value, +) + +FLOAT_DTYPES = (mt.float64, mt.float32, mt.float16) + +# --------------------------------------------------------- +# Original implementation is in `sklearn.utils.validation`. +# --------------------------------------------------------- + +assert_all_finite = _assert_all_finite = assert_all_finite + + +def _num_samples(x): + """Return number of samples in array-like x.""" + if hasattr(x, "fit") and callable(x.fit): + # Don't get num_samples from an ensembles length! 
+ raise TypeError(f"Expected sequence or array-like, got estimator {x}") + if not hasattr(x, "__len__") and not hasattr(x, "shape"): + if hasattr(x, "__array__"): + x = mt.asarray(x) + else: + raise TypeError(f"Expected sequence or array-like, got {type(x)}") + if hasattr(x, "shape"): + if len(x.shape) == 0: + if isinstance(x.op, AssertAllFinite): + x = x.op.x + if hasattr(x.op, "data") and x.op.data is not None: + x = np.asarray(x.op.data) + raise TypeError( + f"Singleton array {x!r} cannot be considered a valid collection." + ) + # Check that shape is returning an integer or default to len + if isinstance(x.shape[0], numbers.Integral): + return x.shape[0] + elif np.isnan(x.shape[0]): + return x.shape[0] + else: + return len(x) + else: + return len(x) + + +def check_consistent_length(*arrays, session=None, run_kwargs=None): + """Check that all arrays have consistent first dimensions. + + Checks whether all objects in arrays have the same shape or length. + + Parameters + ---------- + *arrays : list or tuple of input objects. + Objects that will be checked for consistent length. + """ + + new_arrays = [] + lengths = [] + to_execute = [] + for X in arrays: + if X is not None: + n = _num_samples(X) + if np.isnan(n): + to_execute.append(X) + new_arrays.append(X) + lengths.append(n) + # unknown length exists + if len(to_execute) > 0: + # update shape + ExecutableTuple(to_execute).execute(session=session, **(run_kwargs or dict())) + # get length again + lengths = [_num_samples(X) for X in new_arrays] + + uniques = np.unique(lengths) + if len(uniques) > 1: + raise ValueError( + "Found input variables with inconsistent numbers of" + f" samples: {[int(length) for length in lengths]}" + ) + + +def _make_indexable(iterable): + """Ensure iterable supports indexing or convert to an indexable variant. + + Convert sparse matrices to csr and other non-indexable iterable to arrays. + Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged. + + Parameters + ---------- + iterable : {list, dataframe, array, sparse} or None + Object to be converted to an indexable iterable. + """ + if issparse(iterable): + return mt.tensor(iterable) + elif hasattr(iterable, "iloc"): + if iterable.ndim == 1: + return md.Series(iterable) + else: + return md.DataFrame(iterable) + elif hasattr(iterable, "__getitem__"): + return mt.tensor(iterable) + elif iterable is None: + return iterable + return mt.tensor(iterable) + + +def indexable(*iterables, session=None, run_kwargs=None): + """Make arrays indexable for cross-validation. + + Checks consistent length, passes through None, and ensures that everything + can be indexed by converting sparse matrices to csr and converting + non-interable objects to arrays. + + Parameters + ---------- + *iterables : lists, dataframes, arrays, sparse matrices + List of objects to ensure sliceability. + """ + result = [_make_indexable(X) for X in iterables] + check_consistent_length(*result, session=session, run_kwargs=run_kwargs) + return result + + +def _ensure_no_complex_data(array): + if ( + hasattr(array, "dtype") + and array.dtype is not None + and hasattr(array.dtype, "kind") + and array.dtype.kind == "c" + ): + raise ValueError(f"Complex data not supported\n{array}\n") + + +def _ensure_sparse_format( + spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse +): + """Convert a sparse matrix to a given format. + + Checks the sparse format of spmatrix and converts if necessary. 
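The helpers defined above (`indexable`, `check_consistent_length`) are the entry points most callers use; a minimal usage sketch, assuming the vendored package is importable as `xorbits._mars` and a Mars session is available as in the tests earlier in this diff:

```python
import numpy as np
import scipy.sparse as sp

from xorbits._mars import tensor as mt
from xorbits._mars.learn.utils.validation import check_consistent_length, indexable

# Mixed inputs: a plain list, a scipy sparse matrix and a Mars tensor.
X_list = [[1, 2], [3, 4], [5, 6]]
X_sparse = sp.csr_matrix(np.eye(3))
y = mt.tensor([0, 1, 0], chunk_size=2)

# indexable() converts each input to a sliceable Mars object and verifies
# that the first dimensions agree (3 samples each here).
X_list, X_sparse, y = indexable(X_list, X_sparse, y)

# check_consistent_length() on its own raises ValueError on a mismatch.
check_consistent_length(y, mt.zeros(3))    # passes
# check_consistent_length(y, mt.zeros(4))  # would raise ValueError
```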
+ + Parameters + ---------- + spmatrix : scipy sparse matrix + Input to validate and convert. + + accept_sparse : string, boolean or list/tuple of strings + String[s] representing allowed sparse matrix formats ('csc', + 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but + not in the allowed format, it will be converted to the first listed + format. True allows the input to be any format. False means + that a sparse matrix input will raise an error. + + dtype : string, type or None + Data type of result. If None, the dtype of the input is preserved. + + copy : boolean + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean or 'allow-nan', (default=True) + Whether to raise an error on np.inf and np.nan in X. The possibilities + are: + + - True: Force all values of X to be finite. + - False: accept both np.inf and np.nan in X. + - 'allow-nan': accept only np.nan values in X. Values cannot be + infinite. + + Returns + ------- + spmatrix_converted : scipy sparse matrix. + Matrix that is ensured to have an allowed type. + """ + if dtype is None: + dtype = spmatrix.dtype + + changed_format = False + + if isinstance(accept_sparse, str): + accept_sparse = [accept_sparse] + + # Indices dtype validation + # _check_large_sparse(spmatrix, accept_large_sparse) + + if accept_sparse is False: + raise TypeError( + "A sparse tensor was passed, but dense " + "data is required. Use X.todense() to " + "convert to a dense tensor." + ) + elif isinstance(accept_sparse, (list, tuple)): + if len(accept_sparse) == 0: + raise ValueError( + "When providing 'accept_sparse' " + "as a tuple or list, it must contain at " + "least one string value." + ) + # # ensure correct sparse format + # if spmatrix.format not in accept_sparse: + # # create new with correct sparse + # spmatrix = spmatrix.asformat(accept_sparse[0]) + # changed_format = True + elif accept_sparse is not True: + # any other type + raise ValueError( + "Parameter 'accept_sparse' should be a string, " + "boolean or list of strings. You provided " + f"'accept_sparse={accept_sparse}'." + ) + + if dtype != spmatrix.dtype: + # convert dtype + spmatrix = spmatrix.astype(dtype) + elif copy and not changed_format: + # force copy + spmatrix = spmatrix.copy() + + if force_all_finite: + spmatrix = assert_all_finite( + spmatrix, allow_nan=force_all_finite == "allow-nan", check_only=False + ) + + return spmatrix + + +def check_array( + array, + accept_sparse=False, + accept_large_sparse=True, + dtype="numeric", + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + allow_nd=False, + ensure_min_samples=1, + ensure_min_features=1, + estimator=None, +) -> Tensor: + """Input validation on a tensor, list, sparse matrix or similar. + + By default, the input is checked to be a non-empty 2D array containing + only finite values. If the dtype of the tensor is object, attempt + converting to float, raising on failure. + + Parameters + ---------- + array : object + Input object to check / convert. + + accept_sparse : string, boolean or list/tuple of strings (default=False) + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format. False means that a sparse matrix input will + raise an error. 
+ + accept_large_sparse : bool (default=True) + If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by + accept_sparse, accept_large_sparse=False will cause it to be accepted + only if its indices are stored with a 32-bit dtype. + + dtype : string, type, list of types or None (default="numeric") + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. + + order : 'F', 'C' or None (default=None) + Whether a tenor will be forced to be fortran or c-style. + When order is None (default), then if copy=False, nothing is ensured + about the memory layout of the output tensor; otherwise (copy=True) + the memory layout of the returned tensor is kept as close as possible + to the original tensor. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean or 'allow-nan', (default=True) + Whether to raise an error on np.inf and np.nan in tensor. The + possibilities are: + + - True: Force all values of tensor to be finite. + - False: accept both np.inf and np.nan in tensor. + - 'allow-nan': accept only np.nan values in tensor. Values cannot + be infinite. + + For object dtyped data, only np.nan is checked and not np.inf. + + ensure_2d : boolean (default=True) + Whether to raise a value error if tensor is not 2D. + + allow_nd : boolean (default=False) + Whether to allow tensor.ndim > 2. + + ensure_min_samples : int (default=1) + Make sure that the tensor has a minimum number of samples in its first + axis (rows for a 2D tensor). Setting to 0 disables this check. + + ensure_min_features : int (default=1) + Make sure that the 2D tensor has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when the input data has effectively 2 + dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 + disables this check. + + estimator : str or estimator instance (default=None) + If passed, include the name of the estimator in warning messages. + + Returns + ------- + array_converted : object + The converted and validated tensor. + """ + + # store whether originally we wanted numeric dtype + dtype_numeric = isinstance(dtype, str) and dtype == "numeric" + + dtype_orig = getattr(array, "dtype", None) + if not hasattr(dtype_orig, "kind"): + # not a data type (e.g. a column named dtype in a pandas DataFrame) + dtype_orig = None + + if dtype_numeric: + if dtype_orig is not None and dtype_orig.kind == "O": + # if input is object, convert to float. + dtype = np.float64 + else: + dtype = None + + if isinstance(dtype, (list, tuple)): + if dtype_orig is not None and dtype_orig in dtype: + # no dtype conversion required + dtype = None + else: + # dtype conversion required. Let's select the first element of the + # list of accepted types. + dtype = dtype[0] + + if force_all_finite not in (True, False, "allow-nan"): + raise ValueError( + 'force_all_finite should be a bool or "allow-nan"' + f". 
Got {force_all_finite!r} instead" + ) + + if estimator is not None: + if isinstance(estimator, str): + estimator_name = estimator + else: + estimator_name = estimator.__class__.__name__ + else: + estimator_name = "Estimator" + context = f" by {estimator_name}" if estimator is not None else "" + + if (hasattr(array, "issparse") and array.issparse()) or issparse(array): + _ensure_no_complex_data(array) + array = mt.asarray(array) + array = _ensure_sparse_format( + array, + accept_sparse=accept_sparse, + dtype=dtype, + copy=copy, + force_all_finite=force_all_finite, + accept_large_sparse=accept_large_sparse, + ) + else: + # If np.array(..) gives ComplexWarning, then we convert the warning + # to an error. This is needed because specifying a non complex + # dtype to the function converts complex to real dtype, + # thereby passing the test made in the lines following the scope + # of warnings context manager. + with warnings.catch_warnings(): + try: + warnings.simplefilter("error", ComplexWarning) + array = mt.asarray(array, dtype=dtype, order=order) + except ComplexWarning: + raise ValueError(f"Complex data not supported\n{array}\n") + + # It is possible that the np.array(..) gave no warning. This happens + # when no dtype conversion happened, for example dtype = None. The + # result is that np.array(..) produces an array of complex dtype + # and we need to catch and raise exception for such cases. + _ensure_no_complex_data(array) + + if ensure_2d: + # If input is scalar raise error + if array.ndim == 0: + raise ValueError( + f"Expected 2D array, got scalar array instead:\narray={array}.\n" + "Reshape your data either using array.reshape(-1, 1) if " + "your data has a single feature or array.reshape(1, -1) " + "if it contains a single sample." + ) + # If input is 1D raise error + if array.ndim == 1: + raise ValueError( + f"Expected 2D array, got 1D array instead:\narray={array}.\n" + "Reshape your data either using array.reshape(-1, 1) if " + "your data has a single feature or array.reshape(1, -1) " + "if it contains a single sample." + ) + + # in the future np.flexible dtypes will be handled like object dtypes + if dtype_numeric and np.issubdtype(array.dtype, np.flexible): + warnings.warn( + "Beginning in version 0.22, arrays of bytes/strings will be " + "converted to decimal numbers if dtype='numeric'. " + "It is recommended that you convert the array to " + "a float dtype before using it in scikit-learn, " + "for example by using " + "your_array = your_array.astype(np.float64).", + FutureWarning, + ) + + # make sure we actually converted to numeric: + if dtype_numeric and array.dtype.kind == "O": + array = array.astype(np.float64) + if not allow_nd and array.ndim >= 3: + raise ValueError( + "Found array with dim %d. %s expected <= 2." + % (array.ndim, estimator_name) + ) + if force_all_finite: + array = _assert_all_finite( + array, allow_nan=force_all_finite == "allow-nan", check_only=False + ) + + if ensure_min_samples > 0: + n_samples = _num_samples(array) + if n_samples < ensure_min_samples: + raise ValueError( + "Found array with %d sample(s) (shape=%s) while a" + " minimum of %d is required%s." + % (n_samples, array.shape, ensure_min_samples, context) + ) + + if ensure_min_features > 0 and array.ndim == 2: + n_features = array.shape[1] + if n_features < ensure_min_features: + raise ValueError( + "Found array with %d feature(s) (shape=%s) while" + " a minimum of %d is required%s." 
+ % (n_features, array.shape, ensure_min_features, context) + ) + + if copy: + array = mt.array(array, dtype=dtype, order=order) + + return array + + +def check_X_y( + X, + y, + accept_sparse=False, + accept_large_sparse=True, + dtype="numeric", + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + allow_nd=False, + multi_output=False, + ensure_min_samples=1, + ensure_min_features=1, + y_numeric=False, + estimator=None, +): + """Input validation for standard estimators. + + Checks X and y for consistent length, enforces X to be 2D and y 1D. By + default, X is checked to be non-empty and containing only finite values. + Standard input checks are also applied to y, such as checking that y + does not have np.nan or np.inf targets. For multi-label y, set + multi_output=True to allow 2D and sparse y. If the dtype of X is + object, attempt converting to float, raising on failure. + + Parameters + ---------- + X : tensor, list or sparse tensor + Input data. + + y : tensor, list or sparse tensor + Labels. + + accept_sparse : string, boolean or list of string (default=False) + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format. False means that a sparse matrix input will + raise an error. + + accept_large_sparse : bool (default=True) + If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by + accept_sparse, accept_large_sparse will cause it to be accepted only + if its indices are stored with a 32-bit dtype. + + dtype : string, type, list of types or None (default="numeric") + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. + + order : 'F', 'C' or None (default=None) + Whether an array will be forced to be fortran or c-style. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean or 'allow-nan', (default=True) + Whether to raise an error on np.inf and np.nan in X. This parameter + does not influence whether y can have np.inf or np.nan values. + The possibilities are: + + - True: Force all values of X to be finite. + - False: accept both np.inf and np.nan in X. + - 'allow-nan': accept only np.nan values in X. Values cannot be + infinite. + + ensure_2d : boolean (default=True) + Whether to raise a value error if X is not 2D. + + allow_nd : boolean (default=False) + Whether to allow X.ndim > 2. + + multi_output : boolean (default=False) + Whether to allow 2D y (array or sparse matrix). If false, y will be + validated as a vector. y cannot have np.nan or np.inf values if + multi_output=True. + + ensure_min_samples : int (default=1) + Make sure that X has a minimum number of samples in its first + axis (rows for a 2D array). + + ensure_min_features : int (default=1) + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when X has effectively 2 dimensions or + is originally 1D and ``ensure_2d`` is True. Setting to 0 disables + this check. + + y_numeric : boolean (default=False) + Whether to ensure that y has a numeric type. If dtype of y is object, + it is converted to float64. 
Should only be used for regression + algorithms. + + estimator : str or estimator instance (default=None) + If passed, include the name of the estimator in warning messages. + + Returns + ------- + X_converted : object + The converted and validated X. + + y_converted : object + The converted and validated y. + """ + if y is None: + raise ValueError("y cannot be None") + + X = check_array( + X, + accept_sparse=accept_sparse, + accept_large_sparse=accept_large_sparse, + dtype=dtype, + order=order, + copy=copy, + force_all_finite=force_all_finite, + ensure_2d=ensure_2d, + allow_nd=allow_nd, + ensure_min_samples=ensure_min_samples, + ensure_min_features=ensure_min_features, + estimator=estimator, + ) + if multi_output: + y = check_array(y, True, force_all_finite=True, ensure_2d=False, dtype=None) + else: + y = column_or_1d(y, warn=True) + y = _assert_all_finite(y, check_only=False) + if y_numeric and y.dtype.kind == "O": + y = y.astype(np.float64) + + check_consistent_length(X, y) + + return X, y + + +def check_non_negative(X, whom): + """ + Check if there is any negative value in a tensor. + + Parameters + ---------- + X : array-like or sparse matrix + Input data. + + whom : string + Who passed X to this function. + """ + return check_non_negative_then_return_value(X, X, whom) + + +def column_or_1d(y, warn=False): + """Ravel column or 1d numpy array, else raises an error + + Parameters + ---------- + y : array-like + + warn : boolean, default False + To control display of warnings. + + Returns + ------- + y : array + + """ + y = mt.tensor(y) + shape = y.shape + if len(shape) == 1: + return mt.ravel(y) + if len(shape) == 2 and shape[1] == 1: + if warn: + warnings.warn( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples, ), for example using ravel().", + DataConversionWarning, + stacklevel=2, + ) + return mt.ravel(y) + + raise ValueError( + "y should be a 1d array, got an array of shape {} instead.".format(shape) + ) + + +check_is_fitted = check_is_fitted + + +def _check_sample_weight(sample_weight, X, dtype=None): + """Validate sample weights. + + Note that passing sample_weight=None will output an array of ones. + Therefore, in some cases, you may want to protect the call with: + if sample_weight is not None: + sample_weight = _check_sample_weight(...) + + Parameters + ---------- + sample_weight : {ndarray, Number or None}, shape (n_samples,) + Input sample weights. + + X : nd-array, list or sparse matrix + Input data. + + dtype: dtype + dtype of the validated `sample_weight`. + If None, and the input `sample_weight` is an array, the dtype of the + input is preserved; otherwise an array with the default numpy dtype + is be allocated. If `dtype` is not one of `float32`, `float64`, + `None`, the output will be of dtype `float64`. + + Returns + ------- + sample_weight : ndarray, shape (n_samples,) + Validated sample weight. It is guaranteed to be "C" contiguous. 
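A short sketch of the three input forms described above for `sample_weight` (import path assumed to be `xorbits._mars.learn.utils.validation`; illustrative only):

```python
import numpy as np

from xorbits._mars import tensor as mt
from xorbits._mars.learn.utils.validation import _check_sample_weight

X = mt.random.rand(5, 3)

# None -> one weight of 1.0 per sample.
w = _check_sample_weight(None, X, dtype=np.float64)   # shape (5,), all ones

# A scalar -> that value broadcast to every sample.
w = _check_sample_weight(0.5, X)                       # shape (5,), all 0.5

# An array-like is validated: it must be 1D and match X's first dimension.
w = _check_sample_weight([1, 2, 3, 4, 5], X)
# _check_sample_weight([1, 2, 3], X)  # would raise ValueError (shape mismatch)
```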
+ """ + n_samples = _num_samples(X) + + if dtype is not None and dtype not in [np.float32, np.float64]: + dtype = np.float64 + + if sample_weight is None or isinstance(sample_weight, numbers.Number): + if sample_weight is None: + sample_weight = mt.ones(n_samples, dtype=dtype) + else: + sample_weight = mt.full(n_samples, sample_weight, dtype=dtype) + else: + if dtype is None: + dtype = [np.float64, np.float32] + sample_weight = check_array( + sample_weight, accept_sparse=False, ensure_2d=False, dtype=dtype, order="C" + ) + if sample_weight.ndim != 1: + raise ValueError("Sample weights must be 1D array or scalar") + + if sample_weight.shape != (n_samples,): + raise ValueError( + f"sample_weight.shape == {sample_weight.shape}, " + f"expected {(n_samples,)}!" + ) + return sample_weight diff --git a/python/xorbits/_mars/learn/wrappers.py b/python/xorbits/_mars/learn/wrappers.py new file mode 100644 index 000000000..79ef22e8d --- /dev/null +++ b/python/xorbits/_mars/learn/wrappers.py @@ -0,0 +1,341 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Union + +import numpy as np +from sklearn.base import BaseEstimator as SklearnBaseEstimator +from sklearn.base import ClassifierMixin as SklearnClassifierMixin +from sklearn.base import MetaEstimatorMixin +from sklearn.base import RegressorMixin as SklearnRegressorMixin + +from .. import remote as mr +from .. import tensor as mt +from .base import BaseEstimator, ClassifierMixin, RegressorMixin +from .metrics import get_scorer +from .utils import check_array, copy_learned_attributes + + +def _wrap(estimator: SklearnBaseEstimator, method, X, y, **kwargs): + return getattr(estimator, method)(X, y, **kwargs) + + +class ParallelPostFit(BaseEstimator, MetaEstimatorMixin): + """ + Meta-estimator for parallel predict and transform. + + Parameters + ---------- + estimator : Estimator + The underlying estimator that is fit. + + scoring : string or callable, optional + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring`) to evaluate the predictions on the test set. + + For evaluating multiple metrics, either give a list of (unique) + strings or a dict with names as keys and callables as values. + + NOTE that when using custom scorers, each scorer should return a + single value. Metric functions returning a list/array of values + can be wrapped into multiple scorers that return one value each. + + See :ref:`multimetric_grid_search` for an example. + + .. warning:: + + If None, the estimator's default scorer (if available) is used. + Most scikit-learn estimators will convert large Mars tensors to + a single NumPy array, which may exhaust the memory of your worker. + You probably want to always specify `scoring`. + + Notes + ----- + + .. warning:: + + This class is not appropriate for parallel or distributed *training* + on large datasets. For that, see :class:`Incremental`, which provides + distributed (but sequential) training. 
If you're doing distributed + hyperparameter optimization on larger-than-memory datasets, see + :class:`mars.learn.model_selection.IncrementalSearch`. + + This estimator does not parallelize the training step. This simply calls + the underlying estimators's ``fit`` method called and copies over the + learned attributes to ``self`` afterwards. + + It is helpful for situations where your training dataset is relatively + small (fits on a single machine) but you need to predict or transform + a much larger dataset. ``predict``, ``predict_proba`` and ``transform`` + will be done in parallel (potentially distributed if you've connected + to a Mars cluster). + + Note that many scikit-learn estimators already predict and transform in + parallel. This meta-estimator may still be useful in those cases when your + dataset is larger than memory, as the distributed scheduler will ensure the + data isn't all read into memory at once. + + See Also + -------- + Incremental + mars.learn.model_selection.IncrementalSearch + + Examples + -------- + >>> from sklearn.ensemble import GradientBoostingClassifier + >>> from sklearn.datasets import make_classification + >>> import mars.tensor as mt + >>> from mars.learn.wrappers import ParallelPostFit + + Make a small 1,000 sample 2 training dataset and fit normally. + + >>> X, y = make_classification(n_samples=1000, random_state=0) + >>> clf = ParallelPostFit(estimator=GradientBoostingClassifier(), + ... scoring='accuracy') + >>> clf.fit(X, y) + ParallelPostFit(estimator=GradientBoostingClassifier(...)) + + >>> clf.classes_ + array([0, 1]) + + Transform and predict return Mars outputs for Mars inputs. + + >>> X_big, y_big = make_classification(n_samples=100000, + random_state=0) + >>> X_big, y_big = mt.tensor(X_big), mt.tensor(y_big) + >>> clf.predict(X_big) + array([1, 0, 0, ..., 1, 0, 0]) + + Which can be computed in parallel. + + >>> clf.predict_proba(X_big) + array([[0.01780031, 0.98219969], + [0.62199242, 0.37800758], + [0.89059934, 0.10940066], + ..., + [0.03249968, 0.96750032], + [0.951434 , 0.048566 ], + [0.99527114, 0.00472886]]) + """ + + def __init__( + self, + estimator: SklearnBaseEstimator = None, + scoring: Union[str, Callable] = None, + ): + self.estimator = estimator + self.scoring = scoring + + def _make_fit(self, method): + def _fit(X, y=None, **kwargs): + result = ( + mr.spawn( + _wrap, + args=(self.estimator, method, X, y), + kwargs=kwargs, + resolve_tileable_input=True, + ) + .execute() + .fetch() + ) + + copy_learned_attributes(result, self) + copy_learned_attributes(result, self.estimator) + return self + + return _fit + + def fit(self, X, y=None, **kwargs): + """ + Fit the underlying estimator. + + Parameters + ---------- + X, y : array-like + **kwargs + Additional fit-kwargs for the underlying estimator. + + Returns + ------- + self : object + """ + return self._make_fit("fit")(X, y=y, **kwargs) + + def partial_fit(self, X, y=None, **kwargs): # pragma: no cover + return self._make_fit("partial_fit")(X, y=y, **kwargs) + + def _check_method(self, method): + """ + Check if self.estimator has 'method'. + + Raises + ------ + AttributeError + """ + estimator = self.estimator + if not hasattr(estimator, method): + msg = "The wrapped estimator '{}' does not have a '{}' method.".format( + estimator, method + ) + raise AttributeError(msg) + return getattr(estimator, method) + + def transform(self, X): + """ + Transform block or partition-wise for Mars inputs. + + For Mars inputs, a Mars tensor is returned. 
For other + inputs (NumPy array, pandas dataframe, scipy sparse matrix), the + regular return value is returned. + + If the underlying estimator does not have a ``transform`` method, then + an ``AttributeError`` is raised. + + Parameters + ---------- + X : array-like + + Returns + ------- + transformed : array-like + """ + self._check_method("transform") + X = check_array(X) + dtype = self.estimator.transform(np.zeros((1, X.shape[1]), dtype=X.dtype)).dtype + return X.map_chunk(self.estimator.transform, dtype=dtype) + + def score(self, X, y): + """ + Returns the score on the given data. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Input data, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape = [n_samples] or [n_samples, n_output], optional + Target relative to X for classification or regression; + None for unsupervised learning. + + Returns + ------- + score : float + return self.estimator.score(X, y) + """ + + scoring = self.scoring + X = check_array(X) + y = check_array(y, ensure_2d=False) + + if not scoring: + if type(self.estimator).score in ( + RegressorMixin.score, + SklearnRegressorMixin.score, + ): # pragma: no cover + scoring = "r2" + elif type(self.estimator).score in ( + ClassifierMixin.score, + SklearnClassifierMixin.score, + ): + scoring = "accuracy" + else: # pragma: no cover + scoring = self.scoring + + if scoring: + scorer = get_scorer(scoring) + return scorer(self, X, y).execute() + else: # pragma: no cover + return mr.spawn(self.estimator.score, args=(X, y)).execute().fetch() + + def predict(self, X, execute=True): + """ + Predict for X. + + For Mars inputs, a Mars tensor is returned. For other + inputs (NumPy array, pandas dataframe, scipy sparse matrix), the + regular return value is returned. + + Parameters + ---------- + X : array-like + + Returns + ------- + y : array-like + """ + + self._check_method("predict") + X = check_array(X) + + result = X.map_chunk(self.estimator.predict, dtype="int", shape=X.shape[:1]) + if execute: + result.execute() + return result + + def predict_proba(self, X, execute=True): + """ + Probability estimates. + + For Mars inputs, a Mars tensor is returned. For other + inputs (NumPy array, pandas dataframe, scipy sparse matrix), the + regular return value is returned. + + If the underlying estimator does not have a ``predict_proba`` + method, then an ``AttributeError`` is raised. + + Parameters + ---------- + X : array or dataframe + + Returns + ------- + y : array-like + """ + self._check_method("predict_proba") + X = check_array(X) + result = X.map_chunk( + self.estimator.predict_proba, + dtype="float", + shape=(X.shape[0], len(self.estimator.classes_)), + ) + if execute: + result.execute() + return result + + def predict_log_proba(self, X, execute=True): + """ + Log of probability estimates. + + For Mars inputs, a Mars tensor is returned. For other + inputs (NumPy array, pandas dataframe, scipy sparse matrix), the + regular return value is returned. + + If the underlying estimator does not have a ``predict_proba`` + method, then an ``AttributeError`` is raised. 
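A condensed version of the workflow from the class docstring, for orientation (sketch only; assumes a running Mars session and that the module is importable as `xorbits._mars.learn.wrappers`):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from xorbits._mars import tensor as mt
from xorbits._mars.learn.wrappers import ParallelPostFit

# Training happens on a small in-memory dataset and is NOT parallelized.
X_small, y_small = make_classification(n_samples=1_000, random_state=0)
clf = ParallelPostFit(LogisticRegression(max_iter=1000), scoring="accuracy")
clf.fit(X_small, y_small)

# Inference runs chunk-wise on a much larger Mars tensor, potentially
# distributed across a cluster.
X_big = mt.tensor(np.random.RandomState(0).rand(100_000, 20), chunk_size=10_000)
labels = clf.predict(X_big)                      # Mars tensor, eagerly executed
proba = clf.predict_proba(X_big, execute=False)  # lazy; call .execute() later
```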
+ + Parameters + ---------- + X : array or dataframe + + Returns + ------- + y : array-like + """ + + self._check_method("predict_log_proba") + result = mt.log(self.predict_proba(X, execute=False)) + if execute: + result.execute() + return result diff --git a/python/xorbits/_mars/lib/__init__.py b/python/xorbits/_mars/lib/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/lib/aio/__init__.py b/python/xorbits/_mars/lib/aio/__init__.py new file mode 100644 index 000000000..43ff1f846 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import contextlib +import sys + +from .file import AioFileObject, AioFilesystem +from .isolation import Isolation, get_isolation, new_isolation, stop_isolation +from .lru import alru_cache +from .parallelism import AioEvent + +if sys.version_info[:2] < (3, 9): + from ._threads import to_thread + + asyncio.to_thread = to_thread diff --git a/python/xorbits/_mars/lib/aio/_runners.py b/python/xorbits/_mars/lib/aio/_runners.py new file mode 100644 index 000000000..b45828e45 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/_runners.py @@ -0,0 +1,162 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Backport of the asyncio.runners module from Python 3.7. +""" +# Source: +# https://github.com/python/cpython/blob/a4afcdfa55ddffa4b9ae3b0cf101628c7bff4102/Lib/asyncio/runners.py + +# Modifications: +# * removed relative imports of .coroutines, .events, .tasks +# * replaced `coroutines`, `events`, `tasks` with `asyncio`. +# * replaced `tasks.all_tasks` with `asyncio.Task.all_tasks` because it is +# backwards compatible. 
+# * Use private function `asyncio.events._get_running_loop` directly in +# Python 3.6 + +import asyncio +import weakref +from typing import Any, Awaitable, Coroutine, TypeVar, Union + +try: + from asyncio import get_running_loop # noqa Python >=3.7 +except ImportError: # pragma: no cover + from asyncio.events import _get_running_loop as get_running_loop # pragma: no cover + +__all__ = ("run", "get_running_loop") +_T = TypeVar("_T") + + +def _patch_loop(loop): + """ + This function is designed to work around https://bugs.python.org/issue36607 + + It's job is to keep a thread safe variable tasks up to date with any tasks that + are created for the given loop. This then lets you cancel them as _all_tasks + was intended for. + + We also need to patch the {get,set}_task_factory functions because we can't allow + Other users of it to overwrite our factory function. This function will pretend + like there is no factory set but in reality our factory is always set and we will + call the provided one set + """ + tasks = weakref.WeakSet() + + task_factory = [None] + + def _set_task_factory(factory): + task_factory[0] = factory + + def _get_task_factory(): + return task_factory[0] + + def _safe_task_factory(loop, coro): + if task_factory[0] is None: + # These lines are copied from the standard library because they don't have + # this inside a default factory function for me to call. + # https://github.com/python/cpython/blob/3.6/Lib/asyncio/base_events.py#L304 + task = asyncio.Task(coro, loop=loop) + if task._source_traceback: + del task._source_traceback[-1] # pragma: no cover + else: + task = task_factory[0](loop, coro) + tasks.add(task) + return task + + loop.set_task_factory(_safe_task_factory) + loop.set_task_factory = _set_task_factory + loop.get_task_factory = _get_task_factory + + return tasks + + +def run( + main: Union[Coroutine[Any, None, _T], Awaitable[_T]], *, debug: bool = False +) -> _T: + """Run a coroutine. + + This function runs the passed coroutine, taking care of + managing the asyncio event loop and finalizing asynchronous + generators. + + This function cannot be called when another asyncio event loop is + running in the same thread. + + If debug is True, the event loop will be run in debug mode. + + This function always creates a new event loop and closes it at the end. + It should be used as a main entry point for asyncio programs, and should + ideally only be called once. 
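Concretely, the point of `_patch_loop` is that tasks spawned inside the coroutine are tracked and cancelled when `run()` tears the loop down; a small sketch (relevant only on the old interpreters this backport targets, where `asyncio.run` is unavailable):

```python
import asyncio

from xorbits._mars.lib.aio._runners import run

async def background():
    await asyncio.sleep(3600)   # never finishes on its own

async def main():
    # Created through the patched task factory, so run() knows about this task
    # and cancels it after main() returns.
    asyncio.ensure_future(background())
    await asyncio.sleep(0.1)
    return "done"

print(run(main(), debug=True))  # -> "done"; the leftover task is cancelled cleanly
```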
+ + Example: + + async def main(): + await asyncio.sleep(1) + print('hello') + + asyncio.run(main()) + """ + # Python 3.7+ raises RuntimeError while <3.6 returns None + try: + loop = get_running_loop() + except RuntimeError: + loop = None + if loop is not None: + raise RuntimeError("asyncio.run() cannot be called from a running event loop") + + if not asyncio.iscoroutine(main): + raise ValueError("a coroutine was expected, got {!r}".format(main)) + + loop = asyncio.new_event_loop() + tasks = _patch_loop(loop) + + try: + asyncio.set_event_loop(loop) + loop.set_debug(debug) + return loop.run_until_complete(main) + finally: + try: + _cancel_all_tasks(loop, tasks) + loop.run_until_complete(loop.shutdown_asyncgens()) + finally: + asyncio.set_event_loop(None) # type: ignore + loop.close() + + +def _cancel_all_tasks(loop, tasks): + to_cancel = [task for task in tasks if not task.done()] + + if not to_cancel: + return + + for task in to_cancel: + task.cancel() + + loop.run_until_complete( + asyncio.gather(*to_cancel, loop=loop, return_exceptions=True) + ) + + for task in to_cancel: + if task.cancelled(): + continue + if task.exception() is not None: + loop.call_exception_handler( + { + "message": "unhandled exception during asyncio.run() shutdown", + "exception": task.exception(), + "task": task, + } + ) diff --git a/python/xorbits/_mars/lib/aio/_threads.py b/python/xorbits/_mars/lib/aio/_threads.py new file mode 100644 index 000000000..6324577ae --- /dev/null +++ b/python/xorbits/_mars/lib/aio/_threads.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextvars +import functools +from asyncio import events + +__all__ = ("to_thread",) + + +async def to_thread(func, *args, **kwargs): + """Asynchronously run function *func* in a separate thread. + + Any *args and **kwargs supplied for this function are directly passed + to *func*. Also, the current :class:`contextvars.Context` is propagated, + allowing context variables from the main thread to be accessed in the + separate thread. + + Return a coroutine that can be awaited to get the eventual result of *func*. + """ + loop = events.get_running_loop() + ctx = contextvars.copy_context() + func_call = functools.partial(ctx.run, func, *args, **kwargs) + return await loop.run_in_executor(None, func_call) diff --git a/python/xorbits/_mars/lib/aio/base.py b/python/xorbits/_mars/lib/aio/base.py new file mode 100644 index 000000000..db5557e09 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/base.py @@ -0,0 +1,82 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import asyncio +import functools +from concurrent.futures import Executor +from typing import Any, Type + + +def _make_delegate_method(attr): + async def method(self, *args, **kwargs): + func = functools.partial(getattr(self._file, attr), *args, **kwargs) + return await self._loop.run_in_executor(self._executor, func) + + return method + + +def _make_proxy_method(attr): + def method(self, *args, **kwargs): + return getattr(self._file, attr)(*args, **kwargs) + + return method + + +def _make_proxy_property(attr): + def proxy_property(self): + return getattr(self._file, attr) + + return property(proxy_property) + + +def delegate_to_executor(*attrs): + def wrap_cls(cls: Type): + for attr in attrs: + setattr(cls, attr, _make_delegate_method(attr)) + return cls + + return wrap_cls + + +def proxy_method_directly(*attrs): + def wrap_cls(cls: Type): + for attr in attrs: + setattr(cls, attr, _make_proxy_method(attr)) + return cls + + return wrap_cls + + +def proxy_property_directly(*attrs): + def wrap_cls(cls): + for attr in attrs: + setattr(cls, attr, _make_proxy_property(attr)) + return cls + + return wrap_cls + + +class AioBase: + def __init__( + self, file: Any, loop: asyncio.BaseEventLoop = None, executor: Executor = None + ): + if loop is None: + loop = asyncio.get_event_loop() + if isinstance(file, AioBase): + file = file._file + + self._file = file + self._loop = loop + self._executor = executor diff --git a/python/xorbits/_mars/lib/aio/file.py b/python/xorbits/_mars/lib/aio/file.py new file mode 100644 index 000000000..fd9d1b2d9 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/file.py @@ -0,0 +1,85 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools + +from .base import ( + AioBase, + delegate_to_executor, + proxy_method_directly, + proxy_property_directly, +) + + +@delegate_to_executor( + "close", + "flush", + "isatty", + "read", + "read1", + "readinto", + "readline", + "readlines", + "seek", + "seekable", + "tell", + "truncate", + "writable", + "write", + "writelines", +) +@proxy_method_directly("fileno", "readable") +@proxy_property_directly("closed", "name", "mode") +class AioFileObject(AioBase): + def __aiter__(self): + return self + + async def __anext__(self): + """Simulate normal file iteration.""" + line = await self.readline() + if line: + return line + else: + raise StopAsyncIteration + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + self._file = None + + +@delegate_to_executor( + "cat", + "ls", + "delete", + "disk_usage", + "stat", + "rm", + "mv", + "rename", + "mkdir", + "exists", + "isdir", + "isfile", + "read_parquet", + "walk", +) +@proxy_property_directly("pathsep") +class AioFilesystem(AioBase): + async def open(self, *args, **kwargs): + func = functools.partial(self._file.open, *args, **kwargs) + file = await self._loop.run_in_executor(self._executor, func) + return AioFileObject(file) diff --git a/python/xorbits/_mars/lib/aio/isolation.py b/python/xorbits/_mars/lib/aio/isolation.py new file mode 100644 index 000000000..7968588ad --- /dev/null +++ b/python/xorbits/_mars/lib/aio/isolation.py @@ -0,0 +1,95 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
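`AioFileObject` simply pushes each blocking file call onto an executor; a usage sketch mirroring `test_aio_file.py` further down in this diff (the file name is illustrative):

```python
import asyncio

from xorbits._mars.lib.aio import AioFileObject

async def main():
    # Wrap a regular synchronous file; delegated methods such as write(),
    # readline() and close() become awaitable and run in the default executor.
    async with AioFileObject(open("notes.txt", "w")) as f:
        await f.write("first line\nsecond line\n")

    async with AioFileObject(open("notes.txt")) as f:
        async for line in f:      # __anext__ awaits readline()
            print(line.rstrip())

asyncio.run(main())
```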
+ +import asyncio +import atexit +import threading +from typing import Dict, Optional + + +class Isolation: + loop: asyncio.AbstractEventLoop + _stopped: Optional[asyncio.Event] + _thread: Optional[threading.Thread] + + def __init__(self, loop: asyncio.AbstractEventLoop, threaded: bool = True): + self.loop = loop + self._threaded = threaded + + self._stopped = None + self._thread = None + self._thread_ident = None + + def _run(self): + asyncio.set_event_loop(self.loop) + self._stopped = asyncio.Event() + self.loop.run_until_complete(self._stopped.wait()) + + def start(self): + if self._threaded: + self._thread = thread = threading.Thread(target=self._run) + thread.daemon = True + thread.start() + self._thread_ident = thread.ident + + @property + def thread_ident(self): + return self._thread_ident + + async def _stop(self): + self._stopped.set() + + def stop(self): + if self._threaded: + asyncio.run_coroutine_threadsafe(self._stop(), self.loop).result() + self._thread.join() + + +_name_to_isolation: Dict[str, Isolation] = dict() + + +DEFAULT_ISOLATION = "oscar" + + +def new_isolation( + name: str = DEFAULT_ISOLATION, + loop: asyncio.AbstractEventLoop = None, + threaded: bool = True, +) -> Isolation: + if name in _name_to_isolation: + return _name_to_isolation[name] + + if loop is None: + loop = asyncio.new_event_loop() + + isolation = Isolation(loop, threaded=threaded) + isolation.start() + _name_to_isolation[name] = isolation + return isolation + + +def get_isolation(name: str = DEFAULT_ISOLATION): + isolation = _name_to_isolation[name] + if isolation.loop.is_closed(): # pragma: no cover + _name_to_isolation.pop(name) + raise KeyError(name) + return isolation + + +def stop_isolation(name: str = DEFAULT_ISOLATION): + if name in _name_to_isolation: + return _name_to_isolation.pop(name).stop() + + +atexit.register(stop_isolation) diff --git a/python/xorbits/_mars/lib/aio/lru.py b/python/xorbits/_mars/lib/aio/lru.py new file mode 100644 index 000000000..46f8ed232 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/lru.py @@ -0,0 +1,229 @@ +# The MIT License +# +# Copyright (c) 2018 aio-libs team https://github.com/aio-libs/ +# Copyright (c) 2017 Ocean S. A. https://ocean.io/ +# Copyright (c) 2016-2017 WikiBusiness Corporation http://wikibusiness.org/ +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
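The isolation registry defined above is how a dedicated event loop is kept alive on a background thread; a minimal sketch of the intended call pattern (import path assumed):

```python
import asyncio

from xorbits._mars.lib.aio.isolation import get_isolation, new_isolation, stop_isolation

# Create (or reuse) the default "oscar" isolation: an event loop running
# forever on a daemon thread.
isolation = new_isolation()

async def ping():
    await asyncio.sleep(0.1)
    return "pong"

# Schedule coroutines onto the isolated loop from the main thread.
future = asyncio.run_coroutine_threadsafe(ping(), isolation.loop)
print(future.result())               # -> "pong"

assert get_isolation() is isolation  # same registered instance
stop_isolation()                     # stops the loop and joins the thread
```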
+ +import asyncio +import os +import weakref +from collections import OrderedDict +from functools import _CacheInfo, _make_key, partial, wraps + + +__version__ = "1.0.2" + +__all__ = ("alru_cache", "clear_all_alru_caches") + +_is_ci = (os.environ.get("CI") or "0").lower() in ("1", "true") +_all_wrapped = weakref.WeakSet() + + +def clear_all_alru_caches(): + for wrapped in _all_wrapped: + wrapped.cache_clear() + + +def unpartial(fn): + while hasattr(fn, "func"): + fn = fn.func + + return fn + + +def _done_callback(fut, task): + if task.cancelled(): + fut.cancel() + return + + exc = task.exception() + if exc is not None: + fut.set_exception(exc) + return + + fut.set_result(task.result()) + + +def _cache_invalidate(wrapped, typed, *args, **kwargs): + key = _make_key(args, kwargs, typed) + + exists = key in wrapped._cache + + if exists: + wrapped._cache.pop(key) + + return exists + + +def _cache_clear(wrapped): + wrapped.hits = wrapped.misses = 0 + wrapped._cache = OrderedDict() + wrapped.tasks = set() + + +def _open(wrapped): + if not wrapped.closed: + raise RuntimeError("alru_cache is not closed") + + was_closed = ( + wrapped.hits == wrapped.misses == len(wrapped.tasks) == len(wrapped._cache) == 0 + ) + + if not was_closed: + raise RuntimeError("alru_cache was not closed correctly") + + wrapped.closed = False + + +def _close(wrapped, *, cancel=False, return_exceptions=True): + if wrapped.closed: + raise RuntimeError("alru_cache is closed") + + wrapped.closed = True + + if cancel: + for task in wrapped.tasks: + if not task.done(): # not sure is it possible + task.cancel() + + return _wait_closed(wrapped, return_exceptions=return_exceptions) + + +async def _wait_closed(wrapped, *, return_exceptions): + wait_closed = asyncio.gather(*wrapped.tasks, return_exceptions=return_exceptions) + + wait_closed.add_done_callback(partial(_close_waited, wrapped)) + + ret = await wait_closed + + # hack to get _close_waited callback to be executed + await asyncio.sleep(0) + + return ret + + +def _close_waited(wrapped, _): + wrapped.cache_clear() + + +def _cache_info(wrapped, maxsize): + return _CacheInfo( + wrapped.hits, + wrapped.misses, + maxsize, + len(wrapped._cache), + ) + + +def __cache_touch(wrapped, key): + try: + wrapped._cache.move_to_end(key) + except KeyError: # not sure is it possible + pass + + +def _cache_hit(wrapped, key): + wrapped.hits += 1 + __cache_touch(wrapped, key) + + +def _cache_miss(wrapped, key): + wrapped.misses += 1 + __cache_touch(wrapped, key) + + +def alru_cache( + fn=None, + maxsize=128, + typed=False, + *, + cache_exceptions=True, +): + def wrapper(fn): + _origin = unpartial(fn) + + if not asyncio.iscoroutinefunction(_origin): + raise RuntimeError("Coroutine function is required, got {}".format(fn)) + + # functools.partialmethod support + if hasattr(fn, "_make_unbound_method"): + fn = fn._make_unbound_method() + + @wraps(fn) + async def wrapped(*fn_args, **fn_kwargs): + if wrapped.closed: + raise RuntimeError("alru_cache is closed for {}".format(wrapped)) + + loop = asyncio.get_event_loop() + + key = _make_key(fn_args, fn_kwargs, typed) + + fut = wrapped._cache.get(key) + + if fut is not None: + if not fut.done(): + _cache_hit(wrapped, key) + return await asyncio.shield(fut) + + exc = fut._exception + + if exc is None or cache_exceptions: + _cache_hit(wrapped, key) + return fut.result() + + # exception here and cache_exceptions == False + wrapped._cache.pop(key) + + fut = loop.create_future() + task = loop.create_task(fn(*fn_args, **fn_kwargs)) + 
task.add_done_callback(partial(_done_callback, fut)) + + wrapped.tasks.add(task) + task.add_done_callback(wrapped.tasks.remove) + + wrapped._cache[key] = fut + + if maxsize is not None and len(wrapped._cache) > maxsize: + wrapped._cache.popitem(last=False) + + _cache_miss(wrapped, key) + return await asyncio.shield(fut) + + _cache_clear(wrapped) + wrapped._origin = _origin + wrapped.closed = False + wrapped.cache_info = partial(_cache_info, wrapped, maxsize) + wrapped.cache_clear = partial(_cache_clear, wrapped) + wrapped.invalidate = partial(_cache_invalidate, wrapped, typed) + wrapped.close = partial(_close, wrapped) + wrapped.open = partial(_open, wrapped) + + if _is_ci: + _all_wrapped.add(wrapped) + return wrapped + + if fn is None: + return wrapper + + if callable(fn) or hasattr(fn, "_make_unbound_method"): + return wrapper(fn) + + raise NotImplementedError("{} decorating is not supported".format(fn)) diff --git a/python/xorbits/_mars/lib/aio/parallelism.py b/python/xorbits/_mars/lib/aio/parallelism.py new file mode 100644 index 000000000..45c3e9308 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/parallelism.py @@ -0,0 +1,37 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import multiprocessing +import threading +from concurrent.futures import Executor +from typing import Union + +from .base import AioBase, delegate_to_executor, proxy_method_directly + +event_types = Union[threading.Event, multiprocessing.Event] + + +@delegate_to_executor("wait") +@proxy_method_directly("set", "is_set", "clear") +class AioEvent(AioBase): + def __init__( + self, + event: event_types = None, + loop: asyncio.BaseEventLoop = None, + executor: Executor = None, + ): + if event is None: + event = threading.Event() + super().__init__(event, loop=loop, executor=executor) diff --git a/python/xorbits/_mars/lib/aio/tests/__init__.py b/python/xorbits/_mars/lib/aio/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/lib/aio/tests/test_aio_file.py b/python/xorbits/_mars/lib/aio/tests/test_aio_file.py new file mode 100644 index 000000000..fe8538007 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/tests/test_aio_file.py @@ -0,0 +1,55 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import pytest + +from ...filesystem import LocalFileSystem +from .. import AioFileObject, AioFilesystem + + +@pytest.mark.asyncio +async def test_aio_filesystem(): + local_fs = LocalFileSystem.get_instance() + aio_fs = AioFilesystem(local_fs) + + assert aio_fs.pathsep == local_fs.pathsep + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test") + + with open(file_path, "wb") as f: + f.write(b"text for test") + + stat = await aio_fs.stat(tempdir) + assert stat["type"] == "directory" + + +@pytest.mark.asyncio +async def test_aio_file_object(): + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test") + + f = AioFileObject(open(file_path, "w")) + async with f: + assert f.readable() is False + assert f.mode == "w" + await f.write("text for test") + + f2 = AioFileObject(open(file_path)) + async with f2: + async for l in f2: + assert len(l) > 0 diff --git a/python/xorbits/_mars/lib/bloom_filter.py b/python/xorbits/_mars/lib/bloom_filter.py new file mode 100644 index 000000000..8faee0fa6 --- /dev/null +++ b/python/xorbits/_mars/lib/bloom_filter.py @@ -0,0 +1,572 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# pylint: disable=superfluous-parens,redefined-variable-type +# superfluous-parens: Sometimes extra parens are more clear + +"""Bloom Filter: Probabilistic set membership testing for large sets""" + +# Shamelessly borrowed (under MIT license) from +# https://code.activestate.com/recipes/577686-bloom-filter/ +# About Bloom Filters: https://en.wikipedia.org/wiki/Bloom_filter + +# Tweaked by Daniel Richard Stromberg, mostly to: +# 1) Give it a little nicer __init__ parameters. +# 2) Improve the hash functions to get a much lower rate of false positives. +# 3) Give it a selection of backends. +# 4) Make it pass pylint. 
+ +# In the literature: +# k is the number of probes - we call this num_probes_k +# m is the number of bits in the filter - we call this num_bits_m +# n is the ideal number of elements to eventually be stored in the filter - we +# call this ideal_num_elements_n +# p is the desired error rate when full - we call this error_rate_p + +import array +import math +import os +import random + +try: + import mmap as mmap_mod +except ImportError: + # Jython lacks mmap() + HAVE_MMAP = False +else: + HAVE_MMAP = True + + +class Mmap_backend(object): + """ + Backend storage for our "array of bits" using an mmap'd file. + Please note that this has only been tested on Linux so far. + """ + + effs = 2**8 - 1 + + def __init__(self, num_bits, filename): + if not HAVE_MMAP: + raise NotImplementedError("mmap is not available") + self.num_bits = num_bits + self.num_chars = (self.num_bits + 7) // 8 + flags = os.O_RDWR | os.O_CREAT + if hasattr(os, "O_BINARY"): + flags |= getattr(os, "O_BINARY") + self.file_ = os.open(filename, flags) + os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET) + os.write(self.file_, b"\x00") + self.mmap = mmap_mod.mmap(self.file_, self.num_chars) + + def is_set(self, bitno): + """Return true iff bit number bitno is set""" + byteno, bit_within_wordno = divmod(bitno, 8) + mask = 1 << bit_within_wordno + byte = self.mmap[byteno] + return byte & mask + + def set(self, bitno): + """set bit number bitno to true""" + + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + byte = self.mmap[byteno] + byte |= mask + self.mmap[byteno] = byte + + def clear(self, bitno): + """clear bit number bitno - set it to false""" + + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + byte = self.mmap[byteno] + byte &= Mmap_backend.effs - mask + self.mmap[byteno] = byte + + def __iand__(self, other): + assert self.num_bits == other.num_bits + + for byteno in range(self.num_chars): + self.mmap[byteno] = self.mmap[byteno] & other.mmap[byteno] + + return self + + def __ior__(self, other): + assert self.num_bits == other.num_bits + + for byteno in range(self.num_chars): + self.mmap[byteno] = self.mmap[byteno] | other.mmap[byteno] + + return self + + def close(self): + """Close the file""" + os.close(self.file_) + + +class File_seek_backend(object): + """Backend storage for our "array of bits" using a file in which we seek""" + + effs = 2**8 - 1 + + def __init__(self, num_bits, filename): + self.num_bits = num_bits + self.num_chars = (self.num_bits + 7) // 8 + flags = os.O_RDWR | os.O_CREAT + if hasattr(os, "O_BINARY"): + flags |= getattr(os, "O_BINARY") + self.file_ = os.open(filename, flags) + os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET) + os.write(self.file_, b"\x00") + + def is_set(self, bitno): + """Return true iff bit number bitno is set""" + byteno, bit_within_wordno = divmod(bitno, 8) + mask = 1 << bit_within_wordno + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = os.read(self.file_, 1)[0] + return byte & mask + + def set(self, bitno): + """set bit number bitno to true""" + + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = os.read(self.file_, 1)[0] + byte |= mask + os.lseek(self.file_, byteno, os.SEEK_SET) + os.write(self.file_, bytes([byte])) + + def clear(self, bitno): + """clear bit number bitno - set it to false""" + + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = 
os.read(self.file_, 1)[0] + byte &= File_seek_backend.effs - mask + os.lseek(self.file_, byteno, os.SEEK_SET) + os.write(self.file_, bytes([byte])) + + # These are quite slow ways to do iand and ior, but they should work, + # and a faster version is going to take more time + def __iand__(self, other): + assert self.num_bits == other.num_bits + + for bitno in range(self.num_bits): + if self.is_set(bitno) and other.is_set(bitno): + self.set(bitno) + else: + self.clear(bitno) + + return self + + def __ior__(self, other): + assert self.num_bits == other.num_bits + + for bitno in range(self.num_bits): + if self.is_set(bitno) or other.is_set(bitno): + self.set(bitno) + else: + self.clear(bitno) + + return self + + def close(self): + """Close the file""" + os.close(self.file_) + + +class Array_then_file_seek_backend(object): + # pylint: disable=R0902 + # R0902: We kinda need a bunch of instance attributes + """ + Backend storage for our "array of bits" using a python array of integers up + to some maximum number of bytes, then spilling over to a file. + This is -not- a cache; we instead save the leftmost bits in RAM, and the + rightmost bits (if necessary) in a file. On open, we read from the file to + RAM. On close, we write from RAM to the file. + """ + + effs = 2**8 - 1 + + def __init__(self, num_bits, filename, max_bytes_in_memory): + self.num_bits = num_bits + num_chars = (self.num_bits + 7) // 8 + self.filename = filename + self.max_bytes_in_memory = max_bytes_in_memory + self.bits_in_memory = min(num_bits, self.max_bytes_in_memory * 8) + self.bits_in_file = max(self.num_bits - self.bits_in_memory, 0) + self.bytes_in_memory = (self.bits_in_memory + 7) // 8 + self.bytes_in_file = (self.bits_in_file + 7) // 8 + + self.array_ = array.array("B", [0]) * self.bytes_in_memory + flags = os.O_RDWR | os.O_CREAT + if hasattr(os, "O_BINARY"): + flags |= getattr(os, "O_BINARY") + self.file_ = os.open(filename, flags) + os.lseek(self.file_, num_chars + 1, os.SEEK_SET) + os.write(self.file_, b"\x00") + + os.lseek(self.file_, 0, os.SEEK_SET) + offset = 0 + intended_block_len = 2**17 + while True: + if offset + intended_block_len < self.bytes_in_memory: + block = os.read(self.file_, intended_block_len) + elif offset < self.bytes_in_memory: + block = os.read(self.file_, self.bytes_in_memory - offset) + else: + break + for index_in_block, byte in enumerate(block): + self.array_[offset + index_in_block] = byte + offset += intended_block_len + + def is_set(self, bitno): + """Return true iff bit number bitno is set""" + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + if byteno < self.bytes_in_memory: + return self.array_[byteno] & mask + else: + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = os.read(self.file_, 1)[0] + return byte & mask + + def set(self, bitno): + """set bit number bitno to true""" + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + if byteno < self.bytes_in_memory: + self.array_[byteno] |= mask + else: + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = os.read(self.file_, 1)[0] + byte |= mask + os.lseek(self.file_, byteno, os.SEEK_SET) + os.write(self.file_, bytes([byte])) + + def clear(self, bitno): + """clear bit number bitno - set it to false""" + byteno, bit_within_byteno = divmod(bitno, 8) + mask = Array_backend.effs - (1 << bit_within_byteno) + if byteno < self.bytes_in_memory: + self.array_[byteno] &= mask + else: + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = os.read(self.file_, 1)[0] + byte &= 
File_seek_backend.effs - mask + os.lseek(self.file_, byteno, os.SEEK_SET) + os.write(self.file_, bytes([byte])) + + # These are quite slow ways to do iand and ior, but they should work, + # and a faster version is going to take more time + def __iand__(self, other): + assert self.num_bits == other.num_bits + + for bitno in range(self.num_bits): + if self.is_set(bitno) and other.is_set(bitno): + self.set(bitno) + else: + self.clear(bitno) + + return self + + def __ior__(self, other): + assert self.num_bits == other.num_bits + + for bitno in range(self.num_bits): + if self.is_set(bitno) or other.is_set(bitno): + self.set(bitno) + else: + self.clear(bitno) + + return self + + def close(self): + """ + Write the in-memory portion to disk, leave the already-on-disk portion + unchanged + """ + + os.lseek(self.file_, 0, os.SEEK_SET) + os.write(self.file_, bytes(self.array_[0 : self.bytes_in_memory])) + + os.close(self.file_) + + +class Array_backend(object): + """ + Backend storage for our "array of bits" using a python array of integers + """ + + # Note that this has now been split out into a bits_mod for the benefit of + # other projects. + effs = 2**32 - 1 + + def __init__(self, num_bits): + self.num_bits = num_bits + self.num_words = (self.num_bits + 31) // 32 + self.array_ = array.array("L", [0]) * self.num_words + + def is_set(self, bitno): + """Return true iff bit number bitno is set""" + wordno, bit_within_wordno = divmod(bitno, 32) + mask = 1 << bit_within_wordno + return self.array_[wordno] & mask + + def set(self, bitno): + """set bit number bitno to true""" + wordno, bit_within_wordno = divmod(bitno, 32) + mask = 1 << bit_within_wordno + self.array_[wordno] |= mask + + def clear(self, bitno): + """clear bit number bitno - set it to false""" + wordno, bit_within_wordno = divmod(bitno, 32) + mask = Array_backend.effs - (1 << bit_within_wordno) + self.array_[wordno] &= mask + + # It'd be nice to do __iand__ and __ior__ in a base class, but + # that'd be Much slower + + def __iand__(self, other): + assert self.num_bits == other.num_bits + + for wordno in range(self.num_words): + self.array_[wordno] &= other.array_[wordno] + + return self + + def __ior__(self, other): + assert self.num_bits == other.num_bits + + for wordno in range(self.num_words): + self.array_[wordno] |= other.array_[wordno] + + return self + + def close(self): + """Noop for compatibility with the file+seek backend""" + pass + + +def get_bitno_seed_rnd(bloom_filter, key): + """ + Apply num_probes_k hash functions to key. + Generate the array index and bitmask corresponding to each result. + """ + + # We're using key as a seed to a pseudorandom number generator + hasher = random.Random(key).randrange + for dummy in range(bloom_filter.num_probes_k): + bitno = hasher(bloom_filter.num_bits_m) + yield bitno % bloom_filter.num_bits_m + + +MERSENNES1 = [2**x - 1 for x in [17, 31, 127]] +MERSENNES2 = [2**x - 1 for x in [19, 67, 257]] + + +def simple_hash(int_list, prime1, prime2, prime3): + """Compute a hash value from a list of integers and 3 primes""" + result = 0 + for integer in int_list: + result += ((result + integer + prime1) * prime2) % prime3 + return result + + +def hash1(int_list): + """Basic hash function #1""" + return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2]) + + +def hash2(int_list): + """Basic hash function #2""" + return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2]) + + +def get_filter_bitno_probes(bloom_filter, key): + """ + Apply num_probes_k hash functions to key. 
+ Generate the array index and bitmask corresponding to each result + """ + + # This one assumes key is either bytes or str (or other list of integers) + + if hasattr(key, "__divmod__"): + int_list = [] + temp = key + while temp: + quotient, remainder = divmod(temp, 256) + int_list.append(remainder) + temp = quotient + elif isinstance(key, (list, tuple, str, bytes)) and not key: + int_list = [] + + elif isinstance(key, (list, tuple)): + int_list = [] + for v in key: + if isinstance(v, str): + int_list.extend([ord(char) for char in v]) + elif hasattr(v, "__divmod__"): + int_list.append(v) + else: + raise TypeError("Sorry, I do not know how to hash this type") + elif isinstance(key[0], str): + int_list = [ord(char) for char in key] + else: + raise TypeError("Sorry, I do not know how to hash this type") + + hash_value1 = hash1(int_list) + hash_value2 = hash2(int_list) + probe_value = hash_value1 + + for _ in range(1, bloom_filter.num_probes_k + 1): + probe_value *= hash_value1 + probe_value += hash_value2 + probe_value %= MERSENNES1[2] + yield probe_value % bloom_filter.num_bits_m + + +def try_unlink(filename): + """unlink a file. Don't complain if it's not there""" + try: + os.unlink(filename) + except OSError: + pass + return + + +class BloomFilter(object): + """Probabilistic set membership testing for large sets""" + + def __init__( + self, + max_elements=10000, + error_rate=0.1, + probe_bitnoer=get_filter_bitno_probes, + filename=None, + start_fresh=False, + ): + # pylint: disable=R0913 + # R0913: We want a few arguments + if max_elements <= 0: + raise ValueError("ideal_num_elements_n must be > 0") + if not (0 < error_rate < 1): + raise ValueError("error_rate_p must be between 0 and 1 exclusive") + + self.error_rate_p = error_rate + # With fewer elements, we should do very well. With more elements, our + # error rate "guarantee" drops rapidly. + self.ideal_num_elements_n = max_elements + + numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p) + denominator = math.log(2) ** 2 + real_num_bits_m = numerator / denominator + self.num_bits_m = int(math.ceil(real_num_bits_m)) + + if filename is None: + self.backend = Array_backend(self.num_bits_m) + elif isinstance(filename, tuple) and isinstance(filename[1], int): + if start_fresh: + try_unlink(filename[0]) + if filename[1] == -1: + self.backend = Mmap_backend(self.num_bits_m, filename[0]) + else: + self.backend = Array_then_file_seek_backend( + self.num_bits_m, + filename[0], + filename[1], + ) + else: + if start_fresh: + try_unlink(filename) + self.backend = File_seek_backend(self.num_bits_m, filename) + + # AKA num_offsetters + # Verified against + # https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives + real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2) + self.num_probes_k = int(math.ceil(real_num_probes_k)) + self.probe_bitnoer = probe_bitnoer + + def __repr__(self): + return ( + "BloomFilter(ideal_num_elements_n=%d, error_rate_p=%f, " + "num_bits_m=%d)" + ) % ( + self.ideal_num_elements_n, + self.error_rate_p, + self.num_bits_m, + ) + + def add(self, key): + """Add an element to the filter""" + for bitno in self.probe_bitnoer(self, key): + self.backend.set(bitno) + + def __iadd__(self, key): + self.add(key) + return self + + def _match_template(self, bloom_filter): + """ + Compare a sort of signature for two bloom filters. 
+ Used in preparation for binary operations + """ + return ( + self.num_bits_m == bloom_filter.num_bits_m + and self.num_probes_k == bloom_filter.num_probes_k + and self.probe_bitnoer == bloom_filter.probe_bitnoer + ) + + def union(self, bloom_filter): + """Compute the set union of two bloom filters""" + self.backend |= bloom_filter.backend + + def __ior__(self, bloom_filter): + self.union(bloom_filter) + return self + + def intersection(self, bloom_filter): + """Compute the set intersection of two bloom filters""" + self.backend &= bloom_filter.backend + + def __iand__(self, bloom_filter): + self.intersection(bloom_filter) + return self + + def __contains__(self, key): + for bitno in self.probe_bitnoer(self, key): + if not self.backend.is_set(bitno): + return False + return True + + def close(self): + self.backend.close() + self.backend = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + self.backend = None + + def __del__(self): + if self.backend is not None: + self.backend.close() + self.backend = None diff --git a/python/xorbits/_mars/lib/compression.py b/python/xorbits/_mars/lib/compression.py new file mode 100644 index 000000000..a24ff8c3a --- /dev/null +++ b/python/xorbits/_mars/lib/compression.py @@ -0,0 +1,55 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from gzip import GzipFile +from typing import BinaryIO + +try: + import lz4 + import lz4.frame +except ImportError: # pragma: no cover + lz4 = None + + +_compressions = {"gzip": lambda f: GzipFile(fileobj=f)} + +if lz4: + _compressions["lz4"] = lz4.frame.open + + +def compress(file: BinaryIO, compress_type: str) -> BinaryIO: + """ + Return a compressed file object. + + Parameters + ---------- + file: + file object. + compress_type: str + compression type. + + Returns + ------- + compressed_file: + compressed file object. + """ + try: + compress_ = _compressions[compress_type] + except KeyError: # pragma: no cover + raise ValueError( + f"Unknown compress type: {compress_type}, " + f'available include: {", ".join(_compressions)}' + ) + + return compress_(file) diff --git a/python/xorbits/_mars/lib/cython/__init__.py b/python/xorbits/_mars/lib/cython/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/cython/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/lib/cython/libcpp.pxd b/python/xorbits/_mars/lib/cython/libcpp.pxd new file mode 100644 index 000000000..f183fd433 --- /dev/null +++ b/python/xorbits/_mars/lib/cython/libcpp.pxd @@ -0,0 +1,30 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# complementary header for C++ STL libs not included in Cython + +from libc.stdint cimport uint_fast64_t + + +cdef extern from "" namespace "std" nogil: + cdef cppclass mt19937_64: + ctypedef uint_fast64_t result_type + + mt19937_64() except + + mt19937_64(result_type seed) except + + result_type operator()() except + + result_type min() except + + result_type max() except + + void discard(size_t z) except + + void seed(result_type seed) except + diff --git a/python/xorbits/_mars/lib/filesystem/__init__.py b/python/xorbits/_mars/lib/filesystem/__init__.py new file mode 100644 index 000000000..574aac3a7 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .azure import AzureBlobFileSystem +from .base import FileSystem +from .core import file_size, get_fs, glob, open_file, register_filesystem +from .fsmap import FSMap + +# noinspection PyUnresolvedReferences +from .hdfs import HadoopFileSystem +from .local import LocalFileSystem +from .s3 import S3FileSystem diff --git a/python/xorbits/_mars/lib/filesystem/_glob.py b/python/xorbits/_mars/lib/filesystem/_glob.py new file mode 100644 index 000000000..c859c473c --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/_glob.py @@ -0,0 +1,173 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import fnmatch +import os +import re + +from .core import FileSystem + +magic_check = re.compile("([*?[])") +magic_check_bytes = re.compile(b"([*?[])") + + +def has_magic(s): + if isinstance(s, bytes): # pragma: no cover + match = magic_check_bytes.search(s) + else: + match = magic_check.search(s) + return match is not None + + +def _ishidden(path): + return path[0] in (".", b"."[0]) + + +def _isrecursive(pattern): + if isinstance(pattern, bytes): # pragma: no cover + return pattern == b"**" + else: + return pattern == "**" + + +class FileSystemGlob: + def __init__(self, fs: FileSystem): + self._fs = fs + + def glob(self, pathname, recursive=False): + """Return a list of paths matching a pathname pattern. + + The pattern may contain simple shell-style wildcards a la + fnmatch. However, unlike fnmatch, filenames starting with a + dot are special cases that are not matched by '*' and '?' + patterns. + + If recursive is true, the pattern '**' will match any files and + zero or more directories and subdirectories. + """ + return list(self.iglob(pathname, recursive=recursive)) + + def iglob(self, pathname, recursive=False): + """Return an iterator which yields the paths matching a pathname pattern. + + The pattern may contain simple shell-style wildcards a la + fnmatch. However, unlike fnmatch, filenames starting with a + dot are special cases that are not matched by '*' and '?' + patterns. + + If recursive is true, the pattern '**' will match any files and + zero or more directories and subdirectories. + """ + it = self._iglob(pathname, recursive, False) + if recursive and _isrecursive(pathname): # pragma: no cover + s = next(it) # skip empty string + assert not s + return it + + def _iglob(self, pathname, recursive, dironly): + dirname, basename = self._fs.path_split(pathname.replace(os.path.sep, "/")) + if not has_magic(pathname): + assert not dironly + if basename: + if self._fs.exists(pathname): + yield pathname + else: # pragma: no cover + # Patterns ending with a slash should match only directories + if self._fs.isdir(dirname): + yield pathname + return + if not dirname: # pragma: no cover + if recursive and _isrecursive(basename): + yield from self._glob2(dirname, basename, dironly) + else: + yield from self._glob1(dirname, basename, dironly) + return + # `os.path.split()` returns the argument itself as a dirname if it is a + # drive or UNC path. Prevent an infinite recursion if a drive or UNC path + # contains magic characters (i.e. r'\\?\C:'). + if dirname != pathname and has_magic(dirname): + dirs = self._iglob(dirname, recursive, True) + else: + dirs = [dirname] + if has_magic(basename): + if recursive and _isrecursive(basename): + glob_in_dir = self._glob2 + else: + glob_in_dir = self._glob1 + else: + glob_in_dir = self._glob0 + for dirname in dirs: + for name in glob_in_dir(dirname, basename, dironly): + if dirname: + yield self._fs.path_join(dirname, name) + else: + yield name + + # These 2 helper functions non-recursively glob inside a literal directory. + # They return a list of basenames. _glob1 accepts a pattern while _glob0 + # takes a literal basename (so it only has to check for its existence). 
+ + def _glob1(self, dirname, pattern, dironly): + names = list(self._iterdir(dirname, dironly)) + if not _ishidden(pattern): + names = (x for x in names if not _ishidden(x)) + return fnmatch.filter(names, pattern) + + def _glob0(self, dirname, basename, dironly): # pragma: no cover + if not basename: + # `os.path.split()` returns an empty basename for paths ending with a + # directory separator. 'q*x/' should match only directories. + if self._fs.isdir(dirname): + return [basename] + else: + if self._fs.exists(self._fs.path_join(dirname, basename)): + return [basename] + return [] + + # Following functions are not public but can be used by third-party code. + + def glob0(self, dirname, pattern): # pragma: no cover + return self._glob0(dirname, pattern, False) + + def glob1(self, dirname, pattern): # pragma: no cover + return self._glob1(dirname, pattern, False) + + # This helper function recursively yields relative pathnames inside a literal + # directory. + + def _glob2(self, dirname, pattern, dironly): # pragma: no cover + assert _isrecursive(pattern) + yield pattern[:0] + yield from self._rlistdir(dirname, dironly) + + # If dironly is false, yields all file names inside a directory. + # If dironly is true, yields only directory names. + def _iterdir(self, dirname, dironly): + if not dirname: # pragma: no cover + dirname = "" + if not self._fs.isdir(dirname): + return iter(()) + for entry in self._fs.ls(dirname): + if not dironly or self._fs.isdir(entry): + yield self._fs.path_split(entry)[-1] + + # Recursively yields relative pathnames inside a literal directory. + def _rlistdir(self, dirname, dironly): # pragma: no cover + names = list(self._iterdir(dirname, dironly)) + for x in names: + if not _ishidden(x): + yield x + path = self._fs.path_join(dirname, x) if dirname else x + for y in self._rlistdir(path, dironly): + yield self._fs.path_join(x, y) diff --git a/python/xorbits/_mars/lib/filesystem/_oss_lib/__init__.py b/python/xorbits/_mars/lib/filesystem/_oss_lib/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/_oss_lib/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/lib/filesystem/_oss_lib/common.py b/python/xorbits/_mars/lib/filesystem/_oss_lib/common.py new file mode 100644 index 000000000..9fc5ec310 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/_oss_lib/common.py @@ -0,0 +1,198 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import json +import os + +from ....utils import lazy_import +from ..base import path_type, stringify_path + +oss2 = lazy_import("oss2", placeholder=True) + +# OSS api time out +_oss_time_out = 10 + + +class OSSFileEntry: + def __init__( + self, path, *, is_dir=None, is_file=None, stat=None, storage_options=None + ): + self._path = path + self._name = os.path.basename(path) + self._is_file = is_file + self._is_dir = is_dir + self._stat = stat + self._storage_options = storage_options + + def is_dir(self): + if self._is_dir is None: + self._is_dir = oss_isdir(self._path) + return self._is_dir + + def is_file(self): + if self._is_file is None: + if self.is_dir() or not oss_exists(self._path): + self._is_file = False + else: + self._is_file = True + return self._is_file + + def stat(self): + if self._stat is None: + self._stat = oss_stat(self._path) + return self._stat + + @property + def name(self): + return self._name + + @property + def path(self): + return self._path + + +def parse_osspath(path: path_type): + # Extract OSS configuration from the encoded URL. + str_path = stringify_path(path) + parse_result = oss2.urlparse(str_path) + if parse_result.scheme != "oss": + raise ValueError( + f"Except scheme oss, but got scheme: {parse_result.scheme}" + f" in path: {str_path}" + ) + bucket = parse_result.hostname + if not (parse_result.username and parse_result.password): + raise RuntimeError(r"Please use build_oss_path to add OSS info") + param_dict = url_to_dict(parse_result.username) + access_key_id = param_dict["access_key_id"] + access_key_secret = parse_result.password + end_point = param_dict["end_point"] + key = parse_result.path + key = key[1:] if key.startswith("/") else key + return bucket, key, access_key_id, access_key_secret, end_point + + +def _get_oss_bucket(bucket, access_key_id, access_key_secret, end_point): + oss_bucket = oss2.Bucket( + auth=oss2.Auth( + access_key_id=access_key_id, access_key_secret=access_key_secret + ), + endpoint=end_point, + bucket_name=bucket, + connect_timeout=_oss_time_out, + ) + return oss_bucket + + +def oss_exists(path: path_type): + bucket, key, access_key_id, access_key_secret, end_point = parse_osspath(path) + oss_bucket = _get_oss_bucket(bucket, access_key_id, access_key_secret, end_point) + return oss_bucket.object_exists(key) or oss_isdir(path) + + +def oss_isdir(path: path_type): + """ + OSS has no concept of directories, but we define + a ossurl is dir, When there is at least one object + at the ossurl that is the prefix(end with char "/"), + it is considered as a directory. 
+ """ + dirname = stringify_path(path) + if not dirname.endswith("/"): + dirname = dirname + "/" + bucket, key, access_key_id, access_key_secret, end_point = parse_osspath(dirname) + oss_bucket = _get_oss_bucket(bucket, access_key_id, access_key_secret, end_point) + isdir = False + for obj in oss2.ObjectIteratorV2(oss_bucket, prefix=key, max_keys=2): + if obj.key == key: + continue + isdir = True + break + return isdir + + +def oss_stat(path: path_type): + path = stringify_path(path) + bucket, key, access_key_id, access_key_secret, end_point = parse_osspath(path) + oss_bucket = _get_oss_bucket(bucket, access_key_id, access_key_secret, end_point) + if oss_isdir(path): + stat = dict(name=path, size=0, modified_time=-1) + stat["type"] = "directory" + else: + meta = oss_bucket.get_object_meta(key) + stat = dict( + name=path, + size=int(meta.headers["Content-Length"]), + modified_time=meta.headers["Last-Modified"], + ) + stat["type"] = "file" + return stat + + +def oss_scandir(dirname: path_type): + dirname = stringify_path(dirname) + if not dirname.endswith("/"): + dirname = dirname + "/" + bucket, key, access_key_id, access_key_secret, end_point = parse_osspath(dirname) + oss_bucket = _get_oss_bucket(bucket, access_key_id, access_key_secret, end_point) + dirname_set = set() + for obj in oss2.ObjectIteratorV2(oss_bucket, prefix=key): + rel_path = obj.key[len(key) :] + try: + inside_dirname, inside_filename = rel_path.split("/", 1) + except ValueError: + inside_dirname = None + inside_filename = rel_path + if inside_dirname is not None: + if inside_dirname in dirname_set: + continue + dirname_set.add(inside_dirname) + yield OSSFileEntry( + os.path.join(dirname, inside_dirname), + is_dir=True, + is_file=False, + stat={ + "name": os.path.join(dirname, inside_dirname), + "type": "directory", + "size": 0, + "modified_time": -1, + }, + ) + else: + yield OSSFileEntry( + os.path.join(dirname, inside_filename), + is_dir=False, + is_file=True, + stat={ + "name": os.path.join(dirname, inside_filename), + "type": "file", + "size": obj.size, + "modified_time": obj.last_modified, + }, + ) + + +def dict_to_url(param: dict): + # Encode the dictionary with url-safe-base64. + str_param = json.dumps(param) + url_param = base64.urlsafe_b64encode(bytes(str_param, encoding="utf8")) + return bytes.decode(url_param, encoding="utf8") + + +def url_to_dict(url_param: str): + # Decode url-safe-base64 encoded string. + bytes_param = bytes(url_param, encoding="utf8") + str_param = bytes.decode(base64.urlsafe_b64decode(bytes_param), encoding="utf8") + return json.loads(str_param) diff --git a/python/xorbits/_mars/lib/filesystem/_oss_lib/glob.py b/python/xorbits/_mars/lib/filesystem/_oss_lib/glob.py new file mode 100644 index 000000000..8b8d4f944 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/_oss_lib/glob.py @@ -0,0 +1,147 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Filename globbing utility, modified from python glob. 
+ +obviously,this implementation is not optimal, it will cause too many +oss requests. Lately, We can then convert the glob expression into +a regular expression, and then match the oss key list. +But before that, we need to figure out how to deal with magic char +in oss key, such like oss glob: oss://bucket/[key]/*, the key +oss://bucket/[key]/a exactly exists. + +Notes: + OSS need a bucket to specify the file or dir, the "**" pattern is + not supported. So _isrecursive(pattern) is removed. +""" + +import fnmatch +import os +import re + +from .common import oss_exists, oss_isdir, oss_scandir + +__all__ = ["glob", "iglob", "escape"] + + +def glob(pathname, *, recursive=False): + """Return a list of paths matching a pathname pattern. + The pattern may contain simple shell-style wildcards a la + fnmatch. However, unlike fnmatch, filenames starting with a + dot are special cases that are not matched by '*' and '?' + patterns. + """ + return list(iglob(pathname, recursive=recursive)) + + +def iglob(pathname, *, recursive=False): + """Return an iterator which yields the paths matching a pathname pattern. + The pattern may contain simple shell-style wildcards like + fnmatch. However, unlike fnmatch, filenames starting with a + dot are special cases that are not matched by '*' and '?' + patterns. + """ + it = _iglob(pathname, recursive, False) + return it + + +def _iglob(pathname, recursive, dironly): + dirname, basename = os.path.split(pathname) + if not has_magic(pathname): + assert not dironly + if basename: + if oss_exists(pathname): + yield pathname + else: + # Patterns ending with a slash should match only directories + if oss_isdir(dirname): + yield pathname + return + # dirname will not be None in oss path. + # Prevent an infinite recursion if a drive or UNC path + # contains magic characters (i.e. r'\\?\C:'). + if dirname != pathname and has_magic(dirname): + dirs = _iglob(dirname, recursive, True) + else: + dirs = [dirname] + if has_magic(basename): + glob_in_dir = _glob1 + else: + glob_in_dir = _glob0 + for dirname in dirs: + for name in glob_in_dir(dirname, basename, dironly): + yield os.path.join(dirname, name) + + +# These 2 helper functions non-recursively glob inside a literal directory. +# They return a list of basenames. _glob1 accepts a pattern while _glob0 +# takes a literal basename (so it only has to check for its existence). + + +def _glob1(dirname, pattern, dironly): + names = list(_iterdir(dirname, dironly)) + if not _ishidden(pattern): + names = (x for x in names if not _ishidden(x)) + return fnmatch.filter(names, pattern) + + +def _glob0(dirname, basename, dironly): + if not basename: + # `os.path.split()` returns an empty basename for paths ending with a + # directory separator. 'q*x/' should match only directories. + if oss_isdir(dirname): + return [basename] + else: + if oss_exists(os.path.join(dirname, basename)): + return [basename] + return [] + + +# If dironly is false, yields all file names inside a directory. +# If dironly is true, yields only directory names. +# An oss path must contain a dirname. 
+def _iterdir(dirname, dironly): + for entry in oss_scandir(dirname): + if not dironly or entry.is_dir(): + yield entry.name + return + + +magic_check = re.compile("([*?[])") +magic_check_bytes = re.compile(b"([*?[])") + + +def has_magic(s): + if isinstance(s, bytes): + match = magic_check_bytes.search(s) + else: + match = magic_check.search(s) + return match is not None + + +def _ishidden(path): + return False + + +def escape(pathname): + """Escape all special characters.""" + # Escaping is done by wrapping any of "*?[" between square brackets. + # Metacharacters do not work in the drive part and shouldn't be escaped. + drive, pathname = os.path.splitdrive(pathname) + if isinstance(pathname, bytes): + pathname = magic_check_bytes.sub(rb"[\1]", pathname) + else: + pathname = magic_check.sub(r"[\1]", pathname) + return drive + pathname diff --git a/python/xorbits/_mars/lib/filesystem/_oss_lib/handle.py b/python/xorbits/_mars/lib/filesystem/_oss_lib/handle.py new file mode 100644 index 000000000..1eb93999e --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/_oss_lib/handle.py @@ -0,0 +1,156 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from io import IOBase + +from ....utils import lazy_import +from .common import oss_stat, parse_osspath + +oss2 = lazy_import("oss2", placeholder=True) + + +class OSSIOBase(IOBase): + def __init__(self, path, mode): + self._path = path + ( + self._bucket_name, + self._key_name, + self._access_key_id, + self._access_key_secret, + self._end_point, + ) = parse_osspath(self._path) + self._bucket = self._get_bucket() + self._current_pos = 0 + self._size = None + self._buffer = b"" + self._buffer_size = 1 * 1024 + self._mode = mode + + @property + def mode(self): + return self._mode + + def fileno(self) -> int: + raise AttributeError + + def _get_bucket(self): + return oss2.Bucket( + auth=oss2.Auth( + access_key_id=self._access_key_id, + access_key_secret=self._access_key_secret, + ), + endpoint=self._end_point, + bucket_name=self._bucket_name, + ) + + def _get_size(self): + if self._size is None: + self._size = int(oss_stat(self._path)["size"]) + return self._size + + def seek(self, pos, whence=0): + if whence == 0: + if pos < 0: + raise OSError("Invalid argument") + self._current_pos = pos + elif whence == 2: + self._current_pos = self._get_size() + pos + elif whence == 1: + check_pos = self._current_pos + pos + if check_pos < 0: + raise OSError("Invalid argument") + else: + self._current_pos = self._current_pos + pos + else: + raise ValueError('Parameter "whence" should be 0 or 1 or 2') + if pos > 0 and self._current_pos > self._get_size() - 1: + self._current_pos = self._get_size() + return self._current_pos + + def seekable(self): + return True + + def read(self, size=-1): + """ + Read and return up to size bytes, where size is an int. + + If the argument is omitted, None, or negative, reads and + returns all data until EOF. 
+ + If the argument is positive, multiple raw reads may be issued to satisfy + the byte count (unless EOF is reached first). + + Returns an empty bytes array on EOF. + """ + if self._current_pos == self._get_size() or size == 0: + return b"" + elif size < 0: + obj = self._bucket.get_object( + self._key_name, byte_range=(self._current_pos, None) + ) + self._current_pos = self._get_size() + else: + obj = self._bucket.get_object( + self._key_name, + byte_range=(self._current_pos, self._current_pos + size - 1), + ) + self._current_pos = self._current_pos + size + content = obj.read() + return content + + def readline(self, size=-1): + # For backwards compatibility, a (slowish) readline(). + def nreadahead(): + # Read to the beginning of the next line + read_to = min( + self._get_size() - 1, self._current_pos + self._buffer_size - 1 + ) + buffer = self._bucket.get_object( + self._key_name, byte_range=(self._current_pos, read_to) + ).read() + if not buffer: + return 1 + n = (buffer.find(b"\n") + 1) or len(buffer) + if size >= 0: + n = min(n, size) + return n + + if size is None: + size = -1 + else: + try: + size_index = size.__index__ + except AttributeError: + raise TypeError(f"{size!r} is not an integer") + else: + size = size_index() + res = bytearray() + while size < 0 or len(res) < size: + b = self.read(nreadahead()) + if not b: + break + res += b + if res.endswith(b"\n"): + break + return bytes(res) + + def readable(self): + return True + + def writable(self): + return False + + def close(self): + # already closed by oss + pass diff --git a/python/xorbits/_mars/lib/filesystem/arrow.py b/python/xorbits/_mars/lib/filesystem/arrow.py new file mode 100644 index 000000000..2960cf3c3 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/arrow.py @@ -0,0 +1,236 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import subprocess +import weakref +from typing import BinaryIO, Dict, Iterator, List, TextIO, Tuple, Union +from urllib.parse import urlparse + +import pyarrow as pa +from pyarrow.fs import FileInfo, FileSelector +from pyarrow.fs import FileSystem as ArrowFileSystem +from pyarrow.fs import FileType +from pyarrow.fs import HadoopFileSystem as ArrowHadoopFileSystem +from pyarrow.fs import LocalFileSystem as ArrowLocalFileSystem + +from ...utils import implements, stringify_path +from .core import FileSystem, path_type + +__all__ = ("ArrowBasedLocalFileSystem", "HadoopFileSystem") + + +# When pyarrow.fs.FileSystem gc collected, +# the underlying connection will be closed, +# so we hold the reference to make sure +# FileSystem will not be gc collected before file object +_file_to_filesystems = weakref.WeakKeyDictionary() + + +class ArrowBasedFileSystem(FileSystem): + """ + FileSystem implemented with arrow fs API (>=2.0.0). 
+ """ + + def __init__(self, arrow_fs: ArrowFileSystem, sequential_read=False): + self._arrow_fs = arrow_fs + # for open('rb'), open a sequential reading only or not + self._sequential_read = sequential_read + + @staticmethod + def _process_path(path): + return stringify_path(path) + + @implements(FileSystem.cat) + def cat(self, path: path_type) -> bytes: + path = self._process_path(path) + file: pa.NativeFile = self._arrow_fs.open_input_stream(path) + return file.read() + + @implements(FileSystem.ls) + def ls(self, path: path_type) -> List[path_type]: + path = self._process_path(path) + file_selector: FileSelector = FileSelector(path) + paths = [] + for file_info in self._arrow_fs.get_file_info(file_selector): + paths.append(file_info.path) + return paths + + def _get_file_info(self, path: path_type) -> FileInfo: + path = self._process_path(path) + file_info: FileInfo = self._arrow_fs.get_file_info([path])[0] + return file_info + + @implements(FileSystem.delete) + def delete(self, path: path_type, recursive: bool = False): + path = self._process_path(path) + info = self._get_file_info(path) + if info.is_file: + self._arrow_fs.delete_file(path) + elif info.type == FileType.Directory: + if not recursive and len(self.ls(path)) > 0: + raise OSError(f"[Errno 66] Directory not empty: '{path}'") + self._arrow_fs.delete_dir(path) + else: # pragma: no cover + raise TypeError(f"path({path}) to delete must be a file or directory") + + @implements(FileSystem.rename) + def rename(self, path: path_type, new_path: path_type): + path = self._process_path(path) + new_path = self._process_path(new_path) + self._arrow_fs.move(path, new_path) + + @implements(FileSystem.stat) + def stat(self, path: path_type) -> Dict: + path = self._process_path(path) + info = self._get_file_info(path) + stat = dict(name=path, size=info.size, modified_time=info.mtime_ns / 1e9) + if info.type == FileType.File: + stat["type"] = "file" + elif info.type == FileType.Directory: + stat["type"] = "directory" + else: # pragma: no cover + stat["type"] = "other" + return stat + + @implements(FileSystem.mkdir) + def mkdir(self, path: path_type, create_parents: bool = True): + path = self._process_path(path) + self._arrow_fs.create_dir(path, recursive=create_parents) + + @implements(FileSystem.isdir) + def isdir(self, path: path_type) -> bool: + path = self._process_path(path) + info = self._get_file_info(path) + return info.type == FileType.Directory + + @implements(FileSystem.isfile) + def isfile(self, path: path_type) -> bool: + path = self._process_path(path) + info = self._get_file_info(path) + return info.is_file + + @implements(FileSystem._isfilestore) + def _isfilestore(self) -> bool: + return True + + @implements(FileSystem.exists) + def exists(self, path: path_type): + path = self._process_path(path) + info = self._get_file_info(path) + return info.type != FileType.NotFound + + @implements(FileSystem.open) + def open(self, path: path_type, mode: str = "rb") -> Union[BinaryIO, TextIO]: + path = self._process_path(path) + is_binary = mode.endswith("b") + if not is_binary: # pragma: no cover + raise ValueError( + f"mode can only be binary for arrow based filesystem, got {mode}" + ) + mode = mode.rstrip("b") + if mode == "w": + file = self._arrow_fs.open_output_stream(path) + elif mode == "r": + if self._sequential_read: # pragma: no cover + file = self._arrow_fs.open_input_stream(path) + else: + file = self._arrow_fs.open_input_file(path) + elif mode == "a": + file = self._arrow_fs.open_append_stream(path) + else: # pragma: no 
cover + raise ValueError( + f'mode can only be "wb", "rb" and "ab" for ' + f"arrow based filesystem, got {mode}" + ) + + _file_to_filesystems[file] = self._arrow_fs + return file + + @implements(FileSystem.walk) + def walk(self, path: path_type) -> Iterator[Tuple[str, List[str], List[str]]]: + path = self._process_path(path) + q = [path] + while q: + curr = q.pop(0) + file_selector: FileSelector = FileSelector(curr) + dirs, files = [], [] + for info in self._arrow_fs.get_file_info(file_selector): + if info.type == FileType.File: + files.append(info.base_name) + elif info.type == FileType.Directory: + dirs.append(info.base_name) + q.append(info.path) + else: # pragma: no cover + continue + yield curr, dirs, files + + @implements(FileSystem.glob) + def glob(self, path: path_type, recursive: bool = False) -> List[path_type]: + from ._glob import FileSystemGlob + + path = self._process_path(path) + return FileSystemGlob(self).glob(path, recursive=recursive) + + +class ArrowBasedLocalFileSystem(ArrowBasedFileSystem): + def __init__(self): + super().__init__(ArrowLocalFileSystem()) + + _instance = None + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = ArrowBasedLocalFileSystem() + return cls._instance + + +class HadoopFileSystem(ArrowBasedFileSystem): + def __init__( + self, + host="default", + port=0, + user=None, + kerb_ticket=None, + driver="libhdfs", + extra_conf=None, + ): + assert driver == "libhdfs" + if "HADOOP_HOME" in os.environ and "CLASSPATH" not in os.environ: + classpath_proc = subprocess.run( + [os.environ["HADOOP_HOME"] + "/bin/hdfs", "classpath", "--glob"], + stdout=subprocess.PIPE, + ) + os.environ["CLASSPATH"] = classpath_proc.stdout.decode().strip() + arrow_fs = ArrowHadoopFileSystem( + host=host, + port=port, + user=user, + kerb_ticket=kerb_ticket, + extra_conf=extra_conf, + ) + super().__init__(arrow_fs) + + @staticmethod + def _process_path(path): + path = ArrowBasedFileSystem._process_path(path) + # use urlparse to extract path from like: + # hdfs://localhost:8020/tmp/test/simple_test.csv, + # due to the reason that pa.fs.HadoopFileSystem cannot accept + # path with hdfs:// prefix + if path.startswith("hdfs://"): + return urlparse(path).path + else: + return path diff --git a/python/xorbits/_mars/lib/filesystem/azure.py b/python/xorbits/_mars/lib/filesystem/azure.py new file mode 100644 index 000000000..4c02155d0 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/azure.py @@ -0,0 +1,36 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +try: # pragma: no cover + # make sure adlfs is installed + from adlfs import AzureBlobFileSystem as _AzureBlobFileSystem + + # make sure fsspec is installed + from .fsspec_adapter import FsSpecAdapter + + del _AzureBlobFileSystem +except ImportError: + FsSpecAdapter = None + +if FsSpecAdapter is not None: # pragma: no cover + from .core import register_filesystem + + class AzureBlobFileSystem(FsSpecAdapter): + def __init__(self, **kwargs): + super().__init__("az", **kwargs) + + register_filesystem("az", AzureBlobFileSystem) + register_filesystem("abfs", AzureBlobFileSystem) +else: + AzureBlobFileSystem = None diff --git a/python/xorbits/_mars/lib/filesystem/base.py b/python/xorbits/_mars/lib/filesystem/base.py new file mode 100644 index 000000000..feb01657a --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/base.py @@ -0,0 +1,263 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from abc import ABC, abstractmethod +from typing import BinaryIO, Dict, Iterator, List, TextIO, Tuple, Union +from urllib.parse import urlparse + +from ...utils import stringify_path + +path_type = Union[str, os.PathLike] + + +class FileSystem(ABC): + """ + Abstract filesystem interface + """ + + @abstractmethod + def cat(self, path: path_type) -> bytes: + """ + Return contents of file as a bytes object + + Parameters + ---------- + path : str or path-like + File path to read content from. + + Returns + ------- + contents : bytes + """ + + @abstractmethod + def ls(self, path: path_type) -> List[path_type]: + """ + Return list of file paths + + Returns + ------- + paths : list + """ + + @abstractmethod + def delete(self, path: path_type, recursive: bool = False): + """ + Delete the indicated file or directory + + Parameters + ---------- + path : str + recursive : bool, default False + If True, also delete child paths for directories + """ + + def disk_usage(self, path: path_type) -> int: + """ + Compute bytes used by all contents under indicated path in file tree + + Parameters + ---------- + path : string + Can be a file path or directory + + Returns + ------- + usage : int + """ + path = stringify_path(path) + path_info = self.stat(path) + if path_info["type"] == "file": + return path_info["size"] + + total = 0 + for root, directories, files in self.walk(path): + for child_path in files: + abspath = self.path_join(root, child_path) + total += self.stat(abspath)["size"] + + return total + + def path_join(self, *args): + return self.pathsep.join(args) + + def path_split(self, path): + """ + Split a pathname. Returns tuple "(head, tail)" where "tail" is everything after the final slash. Either part + may be empty. + + Parameters + ---------- + path : string + Can be a file path or directory + + Returns + ------- + usage : int + """ + splits = path.rsplit(self.pathsep, 1) + if len(splits) == 1: + return "", splits[0] + else: + return splits + + @abstractmethod + def stat(self, path: path_type) -> Dict: + """ + Information about a filesystem entry. 
+ + Returns + ------- + stat : dict + """ + + def rm(self, path: path_type, recursive: bool = False): + """ + Alias for FileSystem.delete + """ + return self.delete(path, recursive=recursive) + + def mv(self, path, new_path): + """ + Alias for FileSystem.rename + """ + return self.rename(path, new_path) + + @abstractmethod + def rename(self, path: path_type, new_path: path_type): + """ + Rename file, like UNIX mv command + + Parameters + ---------- + path : string + Path to alter + new_path : string + Path to move to + """ + + @abstractmethod + def mkdir(self, path: path_type, create_parents: bool = True): + """ + Create a directory. + + Parameters + ---------- + path : str + Path to the directory. + create_parents : bool, default True + If the parent directories don't exists create them as well. + """ + + @abstractmethod + def exists(self, path: path_type): + """ + Return True if path exists. + + Parameters + ---------- + path : str + Path to check. + """ + + @abstractmethod + def isdir(self, path: path_type) -> bool: + """ + Return True if path is a directory. + + Parameters + ---------- + path : str + Path to check. + """ + + @abstractmethod + def isfile(self, path: path_type) -> bool: + """ + Return True if path is a file. + + Parameters + ---------- + path : str + Path to check. + """ + + @abstractmethod + def _isfilestore(self) -> bool: + """ + Returns True if this FileSystem is a unix-style file store with + directories. + """ + + @abstractmethod + def open(self, path: path_type, mode: str = "rb") -> Union[BinaryIO, TextIO]: + """ + Open file for reading or writing. + """ + + @abstractmethod + def walk(self, path: path_type) -> Iterator[Tuple[str, List[str], List[str]]]: + """ + Directory tree generator. + + Parameters + ---------- + path : str + + Returns + ------- + generator + """ + + @abstractmethod + def glob(self, path: path_type, recursive: bool = False) -> List[path_type]: + """ + Return a list of paths matching a pathname pattern. + + Parameters + ---------- + path : str + Pattern may contain simple shell-style wildcards + recursive : bool + If recursive is true, the pattern '**' will match any files and + zero or more directories and subdirectories. + + Returns + ------- + paths : List + """ + + @property + def pathsep(self) -> str: + return "/" + + @staticmethod + def parse_from_path(uri: str): + parsed_uri = urlparse(uri) + options = dict() + options["host"] = parsed_uri.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0] + if parsed_uri.port: + options["port"] = parsed_uri.port + if parsed_uri.username: + options["user"] = parsed_uri.username + if parsed_uri.password: + options["password"] = parsed_uri.password + return options + + @classmethod + def get_storage_options(cls, storage_options: Dict, uri: str) -> Dict: + options = cls.parse_from_path(uri) + storage_options.update(options) + return storage_options diff --git a/python/xorbits/_mars/lib/filesystem/core.py b/python/xorbits/_mars/lib/filesystem/core.py new file mode 100644 index 000000000..db122537c --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/core.py @@ -0,0 +1,95 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob as glob_ +import os +from typing import Dict, List +from urllib.parse import urlparse + +from ..compression import compress +from .base import FileSystem, path_type +from .local import LocalFileSystem +from .oss import OSSFileSystem + +_filesystems = {"file": LocalFileSystem, "oss": OSSFileSystem} +_scheme_to_dependencies = { + "hdfs": ["pyarrow"], + "az": ["fsspec", "adlfs"], + "abfs": ["fsspec", "adlfs"], + "s3": ["fsspec", "s3fs"], +} + + +def register_filesystem(name: str, fs): + _filesystems[name] = fs + + +def get_fs(path: path_type, storage_options: Dict = None) -> FileSystem: + if storage_options is None: + storage_options = dict() + + # detect scheme + if os.path.exists(path) or glob_.glob(path): + scheme = "file" + else: + scheme = urlparse(path).scheme + if scheme == "" or len(scheme) == 1: # len == 1 for windows + scheme = "file" + + if scheme in _filesystems: + file_system_type = _filesystems[scheme] + if scheme == "file" or scheme == "oss": + # local file systems are singletons. + return file_system_type.get_instance() + else: + storage_options = file_system_type.get_storage_options( + storage_options, path + ) + return file_system_type(**storage_options) + elif scheme in _scheme_to_dependencies: # pragma: no cover + dependencies = ", ".join(_scheme_to_dependencies[scheme]) + raise ImportError(f"Need to install {dependencies} to access {scheme}.") + else: + raise ValueError( + f"Unknown file system type: {scheme}, " + f'available include: {", ".join(_scheme_to_dependencies.keys())}' + ) + + +def glob(path: path_type, storage_options: Dict = None) -> List[path_type]: + if "*" in path: + fs = get_fs(path, storage_options) + return fs.glob(path) + else: + return [path] + + +def file_size(path: path_type, storage_options: Dict = None) -> int: + fs = get_fs(path, storage_options) + return fs.stat(path)["size"] + + +def open_file( + path: path_type, + mode: str = "rb", + compression: str = None, + storage_options: Dict = None, +): + fs = get_fs(path, storage_options) + file = fs.open(path, mode=mode) + + if compression is not None: + file = compress(file, compression) + + return file diff --git a/python/xorbits/_mars/lib/filesystem/fsmap.py b/python/xorbits/_mars/lib/filesystem/fsmap.py new file mode 100644 index 000000000..82380824e --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/fsmap.py @@ -0,0 +1,164 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
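+
+"""
+A minimal usage sketch (the root directory below is a placeholder and assumes
+a POSIX-style local path): ``FSMap`` exposes a filesystem root as a mutable
+mapping whose keys are relative file paths and whose values are the file
+contents as bytes.
+
+>>> from xorbits._mars.lib.filesystem import FSMap, LocalFileSystem
+>>> fs = LocalFileSystem.get_instance()
+>>> fs_map = FSMap("/tmp/fsmap_root", fs, create=True)
+>>> fs_map["to/path/test_file"] = b"text for test"
+>>> fs_map["to/path/test_file"]
+b'text for test'
+>>> len(fs_map)
+1
+"""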
+ +from collections.abc import MutableMapping +from urllib.parse import urlparse + +from .local import LocalFileSystem + + +class FSMap(MutableMapping): + """ + Wrap a FileSystem instance as a mutable wrapping. + The keys of the mapping become files under the given root, and the + values (which must be bytes) the contents of those files. + + Parameters + ---------- + root: string + prefix for all the files + fs: FileSystem instance + check: bool (=True) + performs a touch at the location, to check for write access. + """ + + def __init__(self, root, fs, check=False, create=False): + self.fs = fs + self.root = self._get_path(fs, root) + if create: + if not self.fs.exists(root): + self.fs.mkdir(root) + if check: + if not self.fs.exists(root): + raise ValueError( + f"Path {root} does not exist. Create with the ``create=True`` keyword" + ) + with self.fs.open(fs.pathsep.join([root, "a"]), "w"): + pass + self.fs.rm(fs.pathsep.join([root, "a"])) + + @staticmethod + def _get_path(fs, path): + return path if isinstance(fs, LocalFileSystem) else urlparse(path).path + + @staticmethod + def _normalize_path(fs, path, lstrip=False, rstrip=False): + if fs.pathsep != "/": # pragma: no cover + path = path.replace("/", fs.pathsep) + if lstrip: + path = path.lstrip(fs.pathsep) + if rstrip: + path = path.rstrip(fs.pathsep) + return path + + @staticmethod + def _join_path(fs, paths): + if fs.pathsep == "/": + return "/".join(paths) + + new_paths = [] + for i, path in enumerate(paths): + path = FSMap._normalize_path( + fs, path, lstrip=i > 0, rstrip=i < len(paths) - 1 + ) + new_paths.append(path) + return fs.pathsep.join(new_paths) + + def clear(self): + """Remove all keys below root - empties out mapping""" + try: + self.fs.rm(self.root, True) + self.fs.mkdir(self.root) + except: # noqa: E722 # pragma: no cover + pass + + def _key_to_str(self, key): + """Generate full path for the key""" + if isinstance(key, (tuple, list)): + key = str(tuple(key)) + else: + key = str(key) + return self._join_path(self.fs, [self.root, key]) if self.root else key + + def _str_to_key(self, s): + """Strip path of to leave key name""" + key = self._normalize_path(self.fs, s[len(self.root) :], lstrip=True) + if self.fs.pathsep != "/": # pragma: no cover + key = key.replace(self.fs.pathsep, "/") + return key + + def __getitem__(self, key, default=None): + """Retrieve data""" + key = self._key_to_str(key) + try: + result = self.fs.cat(key) + except: # noqa: E722 + if default is not None: + return default + raise KeyError(key) + return result + + def pop(self, key, default=None): + result = self.__getitem__(key, default) + try: + del self[key] + except KeyError: + pass + return result + + @staticmethod + def _parent(fs, path): + path = FSMap._get_path(fs, path.rstrip(fs.pathsep)) + if fs.pathsep in path: + return path.rsplit(fs.pathsep, 1)[0] + else: # pragma: no cover + return "" + + def __setitem__(self, key, value): + """Store value in key""" + key = self._key_to_str(key) + try: + self.fs.mkdir(self._parent(self.fs, key)) + except FileExistsError: + pass + with self.fs.open(key, "wb") as f: + f.write(value) + + @staticmethod + def _find(fs, path): + out = set() + for path, dirs, files in fs.walk(path): + out.update(fs.pathsep.join([path, f]) for f in files) + if fs.isfile(path) and path not in out: + # walk works on directories, but find should also return [path] + # when path happens to be a file + out.add(path) + return sorted(out) + + def __iter__(self): + return (self._str_to_key(x) for x in self._find(self.fs, self.root)) + + 
def __len__(self): + return len(self._find(self.fs, self.root)) + + def __delitem__(self, key): + """Remove key""" + try: + self.fs.rm(self._key_to_str(key)) + except: # noqa: E722 + raise KeyError + + def __contains__(self, key): + """Does key exist in mapping?""" + return self.fs.exists(self._key_to_str(key)) diff --git a/python/xorbits/_mars/lib/filesystem/fsspec_adapter.py b/python/xorbits/_mars/lib/filesystem/fsspec_adapter.py new file mode 100644 index 000000000..e8b913078 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/fsspec_adapter.py @@ -0,0 +1,132 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Iterator, Tuple, Union, BinaryIO, TextIO, Dict +from urllib.parse import urlparse, urlunparse, ParseResult + +from fsspec import filesystem +from fsspec.core import stringify_path + +from ...utils import implements +from .core import FileSystem +from .core import path_type + + +class FsSpecAdapter(FileSystem): + def __init__(self, scheme: str, **kwargs): + self._fs = filesystem(scheme, **kwargs) + self._scheme = scheme + + @implements(FileSystem.cat) + def cat(self, path: path_type) -> bytes: + return self._fs.cat_file(self._normalize_path(path)) + + @implements(FileSystem.ls) + def ls(self, path: path_type) -> List[path_type]: + entries = [] + for entry in self._fs.ls(self._normalize_path(path), detail=False): + if isinstance(entry, Dict): + entries.append(entry.get("name")) + elif isinstance(entry, str): + entries.append(entry) + else: # pragma: no cover + raise TypeError(f"Expect str or dict, but got {type(entry)}") + return self._append_scheme(entries) + + @implements(FileSystem.delete) + def delete(self, path: path_type, recursive: bool = False): + raise NotImplementedError + + @implements(FileSystem.stat) + def stat(self, path: path_type) -> Dict: + return self._fs.info(self._normalize_path(path)) + + @implements(FileSystem.rename) + def rename(self, path: path_type, new_path: path_type): + raise NotImplementedError + + @implements(FileSystem.mkdir) + def mkdir(self, path: path_type, create_parents: bool = True): + raise NotImplementedError + + @implements(FileSystem.exists) + def exists(self, path: path_type): + return self._fs.exists(self._normalize_path(path)) + + @implements(FileSystem.isdir) + def isdir(self, path: path_type) -> bool: + return self._fs.isdir(self._normalize_path(path)) + + @implements(FileSystem.isfile) + def isfile(self, path: path_type) -> bool: + return self._fs.isfile(self._normalize_path(path)) + + @implements(FileSystem._isfilestore) + def _isfilestore(self) -> bool: + raise NotImplementedError + + @implements(FileSystem.open) + def open(self, path: path_type, mode: str = "rb") -> Union[BinaryIO, TextIO]: + return self._fs.open(self._normalize_path(path), mode=mode) + + @implements(FileSystem.walk) + def walk(self, path: path_type) -> Iterator[Tuple[str, List[str], List[str]]]: + for root, dirs, files in self._fs.walk(path): + yield self._append_scheme([root])[0], self._append_scheme( + dirs 
+ ), self._append_scheme(files) + + @implements(FileSystem.glob) + def glob(self, path: path_type, recursive: bool = False) -> List[path_type]: + from ._glob import FileSystemGlob + + return self._append_scheme( + FileSystemGlob(self).glob(self._normalize_path(path), recursive=recursive) + ) + + @staticmethod + def _normalize_path(path: path_type) -> str: + """ + Stringify path and remove its scheme. + """ + path_str = stringify_path(path) + parsed = urlparse(path_str) + if parsed.scheme: + return urlunparse( + ParseResult( + scheme="", + netloc=parsed.netloc, + path=parsed.path, + params="", + query="", + fragment="", + ) + ) + else: + return path_str + + def _append_scheme(self, paths: List[path_type]) -> List[path_type]: + return [ + urlunparse( + ParseResult( + scheme=self._scheme, + netloc="", + path=path, + params="", + query="", + fragment="", + ) + ) + for path in paths + ] diff --git a/python/xorbits/_mars/lib/filesystem/hdfs.py b/python/xorbits/_mars/lib/filesystem/hdfs.py new file mode 100644 index 000000000..91ae22403 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/hdfs.py @@ -0,0 +1,31 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from pyarrow.fs import HadoopFileSystem as _ArrowHadoopFileSystem + + from .arrow import HadoopFileSystem + + del _ArrowHadoopFileSystem +except ImportError: # pragma: no cover + try: + # pyarrow < 2.0.0 + from pyarrow import HadoopFileSystem + except ImportError: + HadoopFileSystem = None + +from .core import register_filesystem + +if HadoopFileSystem is not None: # pragma: no branch + register_filesystem("hdfs", HadoopFileSystem) diff --git a/python/xorbits/_mars/lib/filesystem/local.py b/python/xorbits/_mars/lib/filesystem/local.py new file mode 100644 index 000000000..e2a8ee6ef --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/local.py @@ -0,0 +1,112 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
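+
+"""
+A small usage sketch (the file path below is a placeholder): ``LocalFileSystem``
+is a process-wide singleton that maps the ``FileSystem`` interface onto the
+standard library (``os``, ``shutil``, ``glob``).
+
+>>> from xorbits._mars.lib.filesystem import LocalFileSystem
+>>> fs = LocalFileSystem.get_instance()
+>>> with fs.open("/tmp/example.txt", "wb") as f:
+...     _ = f.write(b"text for test")
+>>> fs.stat("/tmp/example.txt")["type"]
+'file'
+>>> fs.cat("/tmp/example.txt")
+b'text for test'
+"""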
+ +import glob +import os +import shutil +from typing import BinaryIO, Dict, Iterator, List, TextIO, Tuple, Union + +from ...utils import implements, stringify_path +from .base import FileSystem, path_type + + +class LocalFileSystem(FileSystem): + _instance = None + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = LocalFileSystem() + return cls._instance + + @implements(FileSystem.cat) + def cat(self, path: path_type): + with self.open(path, "rb") as f: + return f.read() + + @implements(FileSystem.ls) + def ls(self, path: path_type) -> List[path_type]: + path = stringify_path(path) + return sorted(os.path.join(path, x) for x in os.listdir(path)) + + @implements(FileSystem.delete) + def delete(self, path: path_type, recursive: bool = False): + if os.path.isfile(path): + os.remove(path) + elif not recursive: + os.rmdir(path) + else: + shutil.rmtree(path) + + @implements(FileSystem.rename) + def rename(self, path: path_type, new_path: path_type): + os.rename(path, new_path) + + @implements(FileSystem.stat) + def stat(self, path: path_type) -> Dict: + os_stat = os.stat(path) + stat = dict(name=path, size=os_stat.st_size, modified_time=os_stat.st_mtime) + if os.path.isfile(path): + stat["type"] = "file" + elif os.path.isdir(path): + stat["type"] = "directory" + else: # pragma: no cover + stat["type"] = "other" + return stat + + @implements(FileSystem.mkdir) + def mkdir(self, path: path_type, create_parents: bool = True): + path = stringify_path(path) + if create_parents: + os.makedirs(path) + else: + os.mkdir(path) + + @implements(FileSystem.isdir) + def isdir(self, path: path_type) -> bool: + path = stringify_path(path) + return os.path.isdir(path) + + @implements(FileSystem.isfile) + def isfile(self, path: path_type) -> bool: + path = stringify_path(path) + return os.path.isfile(path) + + @implements(FileSystem._isfilestore) + def _isfilestore(self) -> bool: + return True + + @implements(FileSystem.exists) + def exists(self, path: path_type): + path = stringify_path(path) + return os.path.exists(path) + + @implements(FileSystem.open) + def open(self, path: path_type, mode: str = "rb") -> Union[BinaryIO, TextIO]: + path = stringify_path(path) + return open(path, mode=mode) + + @implements(FileSystem.walk) + def walk(self, path: path_type) -> Iterator[Tuple[str, List[str], List[str]]]: + path = stringify_path(path) + return os.walk(path) + + @implements(FileSystem.glob) + def glob(self, path: path_type, recursive: bool = False) -> List[path_type]: + path = stringify_path(path) + return glob.glob(path, recursive=recursive) + + @property + def pathsep(self) -> str: + return os.path.sep diff --git a/python/xorbits/_mars/lib/filesystem/oss.py b/python/xorbits/_mars/lib/filesystem/oss.py new file mode 100644 index 000000000..95cf29a0b --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/oss.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
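+
+"""
+A hypothetical usage sketch (bucket, endpoint and credentials are
+placeholders): OSS paths are first rewritten with ``build_oss_path`` so that
+the access key id, access key secret and endpoint travel inside the URL, and
+the resulting path can then be handed to the filesystem (or to ``read_csv``).
+
+>>> from xorbits._mars.lib.filesystem.oss import OSSFileSystem, build_oss_path
+>>> path = build_oss_path(
+...     "oss://bucket/example.csv",
+...     access_key_id="your_access_key_id",
+...     access_key_secret="your_access_key_secret",
+...     end_point="your_endpoint",
+... )
+>>> fs = OSSFileSystem.get_instance()
+>>> with fs.open(path) as f:
+...     content = f.read()
+"""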
+ +from typing import Dict, Iterator, List, Tuple +from urllib import parse + +from ...utils import implements, lazy_import +from ._oss_lib import common as oc +from ._oss_lib.glob import glob +from ._oss_lib.handle import OSSIOBase +from .base import FileSystem, path_type + +oss2 = lazy_import("oss2", placeholder=True) + +_oss_time_out = 10 + + +class OSSFileSystem(FileSystem): + _instance = None + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = OSSFileSystem() + return cls._instance + + @implements(FileSystem.cat) + def cat(self, path: path_type): + raise NotImplementedError + + @implements(FileSystem.ls) + def ls(self, path: path_type) -> List[path_type]: + file_list = [] + file_entry = oc.OSSFileEntry(path) + if not file_entry.is_dir(): + raise OSError("ls for file is not supported") + else: + bucket, key, access_key_id, access_key_secret, end_point = oc.parse_osspath( + path + ) + oss_bucket = oss2.Bucket( + auth=oss2.Auth( + access_key_id=access_key_id, access_key_secret=access_key_secret + ), + endpoint=end_point, + bucket_name=bucket, + connect_timeout=_oss_time_out, + ) + for obj in oss2.ObjectIteratorV2(oss_bucket, prefix=key): + if obj.key.endswith("/"): + continue + obj_path = rf"oss://{bucket}/{obj.key}" + file_list.append( + build_oss_path( + obj_path, access_key_id, access_key_secret, end_point + ) + ) + return file_list + + @implements(FileSystem.delete) + def delete(self, path: path_type, recursive: bool = False): + raise NotImplementedError + + @implements(FileSystem.rename) + def rename(self, path: path_type, new_path: path_type): + raise NotImplementedError + + @implements(FileSystem.stat) + def stat(self, path: path_type) -> Dict: + ofe = oc.OSSFileEntry(path) + return ofe.stat() + + @implements(FileSystem.mkdir) + def mkdir(self, path: path_type, create_parents: bool = True): + raise NotImplementedError + + @implements(FileSystem.isdir) + def isdir(self, path: path_type) -> bool: + file_entry = oc.OSSFileEntry(path) + return file_entry.is_dir() + + @implements(FileSystem.isfile) + def isfile(self, path: path_type) -> bool: + file_entry = oc.OSSFileEntry(path) + return file_entry.is_file() + + @implements(FileSystem._isfilestore) + def _isfilestore(self) -> bool: + raise NotImplementedError + + @implements(FileSystem.exists) + def exists(self, path: path_type): + return oc.oss_exists(path) + + @implements(FileSystem.open) + def open(self, path: path_type, mode: str = "rb") -> OSSIOBase: + file_handle = OSSIOBase(path, mode) + return file_handle + + @implements(FileSystem.walk) + def walk(self, path: path_type) -> Iterator[Tuple[str, List[str], List[str]]]: + raise NotImplementedError + + @implements(FileSystem.glob) + def glob(self, path: path_type, recursive: bool = False) -> List[path_type]: + return glob(path, recursive=recursive) + + +def build_oss_path(path: path_type, access_key_id, access_key_secret, end_point): + """ + Returns a path with oss info. + Used to register the access_key_id, access_key_secret and + endpoint of OSS. The access_key_id and endpoint are put + into the url with url-safe-base64 encoding. + + Parameters + ---------- + path : path_type + The original oss url. + + access_key_id : str + The access key id of oss. + + access_key_secret : str + The access key secret of oss. + + end_point : str + The endpoint of oss. + + Returns + ------- + path_type + Path include the encoded access key id, end point and + access key secret of oss. 
+ """ + if isinstance(path, (list, tuple)): + path = path[0] + param_dict = {"access_key_id": access_key_id, "end_point": end_point} + id_endpoint = oc.dict_to_url(param_dict) + password = access_key_secret + parse_result = parse.urlparse(path) + new_path = ( + f"{parse_result.scheme}://{id_endpoint}:{password}" + f"@{parse_result.netloc}{parse_result.path}" + ) + return new_path diff --git a/python/xorbits/_mars/lib/filesystem/s3.py b/python/xorbits/_mars/lib/filesystem/s3.py new file mode 100644 index 000000000..aac457241 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/s3.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Dict + +""" +An example to read csv from s3 +------------------------------ +>>> import mars +>>> import mars.dataframe as md +>>> +>>> mars.new_session() +>>> # Pass endpoint_url / aws_access_key_id / aws_secret_access_key to read_csv. +>>> mdf = md.read_csv("s3://bucket/example.csv", index_col=0, storage_options={ +>>> "client_kwargs": { +>>> "endpoint_url": "http://192.168.1.12:9000", +>>> "aws_access_key_id": "", +>>> "aws_secret_access_key": "", +>>> "aws_session_token": "", +>>> }}) +>>> # Export environment vars AWS_ENDPOINT_URL / AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_SESSION_TOKEN. +>>> mdf = md.read_csv("s3://bucket/example.csv", index_col=0) +>>> r = mdf.head(1000).execute() +>>> print(r) +""" + +try: # pragma: no cover + # make sure s3fs is installed + from s3fs import S3FileSystem as _S3FileSystem + + # make sure fsspec is installed + from .fsspec_adapter import FsSpecAdapter + + del _S3FileSystem +except ImportError: + FsSpecAdapter = None + +if FsSpecAdapter is not None: # pragma: no cover + from .core import register_filesystem + + class S3FileSystem(FsSpecAdapter): + def __init__(self, **kwargs): + super().__init__("s3", **kwargs) + + @staticmethod + def parse_from_path(uri: str): + client_kwargs = { + "endpoint_url": os.environ.get("AWS_ENDPOINT_URL"), + "aws_access_key_id": os.environ.get("AWS_ACCESS_KEY_ID"), + "aws_secret_access_key": os.environ.get("AWS_SECRET_ACCESS_KEY"), + "aws_session_token": os.environ.get("AWS_SESSION_TOKEN"), + } + client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None} + return {"client_kwargs": client_kwargs} + + @classmethod + def get_storage_options(cls, storage_options: Dict, uri: str) -> Dict: + options = cls.parse_from_path(uri) + for k, v in storage_options.items(): + if k == "client_kwargs": + options["client_kwargs"].update(v) + else: + options[k] = v + return options + + register_filesystem("s3", S3FileSystem) +else: + S3FileSystem = None diff --git a/python/xorbits/_mars/lib/filesystem/tests/__init__.py b/python/xorbits/_mars/lib/filesystem/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/lib/filesystem/tests/test_filesystem.py b/python/xorbits/_mars/lib/filesystem/tests/test_filesystem.py new file mode 100644 index 000000000..d03696e97 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/tests/test_filesystem.py @@ -0,0 +1,223 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob as _glob +import os +import tempfile + +import numpy as np +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from ....tests.core import require_hadoop +from ....utils import lazy_import +from .. import FileSystem, FSMap, LocalFileSystem, glob + +if pa is not None: + from ..arrow import ArrowBasedLocalFileSystem, HadoopFileSystem +else: # pragma: no cover + ArrowBasedLocalFileSystem = None + +fsspec_installed = lazy_import("fsspec") is not None + + +def test_path_parser(): + path = "hdfs://user:password@localhost:8080/test" + parsed_result = FileSystem.parse_from_path(path) + assert parsed_result["host"] == "localhost" + assert parsed_result["port"] == 8080 + assert parsed_result["user"] == "user" + assert parsed_result["password"] == "password" + + +def test_local_filesystem(): + local_fs1 = LocalFileSystem.get_instance() + local_fs2 = LocalFileSystem.get_instance() + assert local_fs1 is local_fs2 + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test") + + with open(file_path, "wb") as f: + f.write(b"text for test") + assert local_fs1.stat(tempdir)["type"] == "directory" + assert local_fs1.stat(file_path)["type"] == "file" + assert len(glob(tempdir + "*")) == 1 + + +@pytest.mark.parametrize( + "fs_type", + [LocalFileSystem, ArrowBasedLocalFileSystem] + if pa is not None + else [LocalFileSystem], +) +def test_filesystems(fs_type): + fs = fs_type.get_instance() + + with tempfile.TemporaryDirectory() as root: + test1_dir = os.path.join(root, "test1") + fs.mkdir(test1_dir, create_parents=False) + test2_dir = os.path.join(root, "test2") + sub_test2_dir = os.path.join(test2_dir, "sub_test2") + fs.mkdir(sub_test2_dir) + + sub_test2_dir_stat = fs.stat(sub_test2_dir) + assert sub_test2_dir_stat["type"] == "directory" + assert sub_test2_dir_stat["name"] == sub_test2_dir + assert fs.isdir(sub_test2_dir) + + test1_file = os.path.join(test1_dir, "test1") + with fs.open(test1_file, "wb") as f: + f.write(b"abc test") + with fs.open(test1_file, "ab") as f: + f.write(b"\nappend test") + with 
fs.open(test1_file, "rb") as f: + content = f.read() + with open(test1_file, "rb") as f2: + expected = f2.read() + assert content == expected + + assert fs.cat(test1_file) == expected + + assert fs.isfile(test1_file) + test1_file_stat = fs.stat(test1_file) + assert test1_file_stat["type"] == "file" + assert test1_file_stat["name"] == test1_file + assert test1_file_stat["size"] == os.stat(test1_file).st_size + np.testing.assert_almost_equal( + test1_file_stat["modified_time"], os.stat(test1_file).st_mtime, decimal=6 + ) + + walked = [ + (os.path.normpath(root), dirs, files) for root, dirs, files in fs.walk(root) + ] + expected = os.walk(root) + assert sorted(walked) == sorted(expected) + + test2_file = os.path.join(sub_test2_dir, "test2") + with fs.open(test2_file, "wb") as f: + f.write(b"def test") + + for recursive in [False, True]: + globs = [ + os.path.normpath(p) + for p in fs.glob(os.path.join(root, "*"), recursive=recursive) + ] + expected = [ + os.path.normpath(p) + for p in _glob.glob(os.path.join(root, "*"), recursive=recursive) + ] + assert sorted(globs) == sorted(expected) + + for path in [os.path.join(root, "*", "*"), test1_dir]: + globs = [os.path.normpath(p) for p in fs.glob(path)] + expected = [os.path.normpath(p) for p in _glob.glob(path)] + assert sorted(globs) == sorted(expected) + + test1_new_file = os.path.join(test1_dir, "test1_new") + fs.rename(test1_file, test1_new_file) + test1_new_file2 = os.path.join(test1_dir, "test1_new2") + fs.mv(test1_new_file, test1_new_file2) + assert fs.exists(test1_new_file2) + assert not fs.exists(test1_file) + + assert fs.disk_usage(test1_dir) > 0 + + fs.delete(test2_file) + assert not fs.exists(test2_file) + + assert fs._isfilestore() + + with pytest.raises(OSError): + fs.delete(test1_dir) + fs.delete(test1_dir, recursive=True) + assert not fs.exists(test1_dir) + + +@require_hadoop +def test_hadoop_filesystem(): + fs = HadoopFileSystem(host="localhost", port=8020) + + test_dir = "/tmp/test/test_hadoop_fs" + fs.mkdir(test_dir) + test_file = f"{test_dir}/my_file.txt" + test_file_content = b"text for text" + with fs.open(test_file, "wb") as f: + f.write(test_file_content) + with fs.open(test_file, "rb") as f: + assert test_file_content == f.read() + # test file with hdfs:// prefix + assert fs.exists(f"hdfs://{test_dir}") + + +def test_fsmap(): + fs = LocalFileSystem.get_instance() + with tempfile.TemporaryDirectory() as root: + fs_map = FSMap(root, fs, check=True) + + path = "/to/path/test_file" + test_content = b"text for test" + fs_map[path] = test_content + assert fs_map[path] == test_content + assert len(fs_map) == 1 + assert path in fs_map + + path2 = "/to/path2/test_file2" + fs_map[path2] = test_content + assert len(fs_map) == 2 + + del fs_map[path] + assert list(fs_map) == ["to/path2/test_file2"] + + path3 = "/to2/path3/test_file3" + fs_map[path3] = test_content + assert fs_map.pop(path3) == test_content + assert fs_map.pop(path3, "fake_content") == "fake_content" + with pytest.raises(KeyError): + fs_map.pop("not_exist") + + fs_map.clear() + assert len(fs_map) == 0 + + # test root not exist + with pytest.raises(ValueError): + _ = FSMap(root + "/path2", fs, check=True) + + # create root + fs_map = FSMap(root + "/path2", fs, create=True) + assert len(fs_map) == 0 + + +@pytest.mark.skipif(not fsspec_installed, reason="fsspec not installed") +def test_get_fs(): + from .. 
import get_fs, register_filesystem + from ..fsspec_adapter import FsSpecAdapter + + class InMemoryFileSystemAdapter(FsSpecAdapter): + def __init__(self, **kwargs): + super().__init__("memory", **kwargs) + + register_filesystem("memory", InMemoryFileSystemAdapter) + + assert isinstance(get_fs("file://"), LocalFileSystem) + assert isinstance(get_fs("memory://"), InMemoryFileSystemAdapter) + + try: + get_fs("unknown://") + except ValueError as e: + assert "Unknown file system type" in e.__str__() diff --git a/python/xorbits/_mars/lib/filesystem/tests/test_fsspec_adapter.py b/python/xorbits/_mars/lib/filesystem/tests/test_fsspec_adapter.py new file mode 100644 index 000000000..c33c4d642 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/tests/test_fsspec_adapter.py @@ -0,0 +1,164 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ....utils import lazy_import + +fsspec_installed = lazy_import("fsspec") is not None + + +@pytest.mark.skipif(not fsspec_installed, reason="fsspec not installed") +def test_fsspec_adapter(): + """ + Assuming the implementations follows fsspec strictly, we only need to test if the adapter + works correctly. + """ + from ..fsspec_adapter import FsSpecAdapter + + adapter = FsSpecAdapter(scheme="memory") + + fs = adapter._fs + # generate directories and files as follows: + # . 
+ # ├── dir + # │ ├── bar.txt + # │ └── subdir + # │ └── baz.txt + # └── foo.txt + with fs.open("foo.txt", mode="wb") as f: + f.write(str.encode("foo")) + fs.mkdir("dir") + fs.mkdirs("/dir/subdir") + with fs.open("/dir/bar.txt", mode="wb") as f: + f.write(str.encode("bar")) + with fs.open("/dir/subdir/baz.txt", mode="wb") as f: + f.write(str.encode("baz")) + + # open + f = adapter.open("test.txt", mode="wb") + f.write(str.encode("test")) + f.close() + + # cat + assert "test" == adapter.cat("test.txt").decode() + try: + adapter.cat("non-existent.txt") + pytest.fail() + except FileNotFoundError: + pass + + # ls + entries = adapter.ls("/") + assert 3 == len(entries) + assert "memory:/test.txt" in entries + assert "memory:/foo.txt" in entries + assert "memory:/dir" in entries + entries = adapter.ls("dir") + assert 2 == len(entries) + assert "memory:/dir/bar.txt" in entries + assert "memory:/dir/subdir" in entries + entries = adapter.ls("test.txt") + assert 1 == len(entries) + assert "memory:/test.txt" in entries + try: + adapter.ls("non-existent.txt") + pytest.fail() + except FileNotFoundError: + pass + + # stat + stat = adapter.stat("test.txt") + assert stat is not None + assert stat["name"] == "/test.txt" + assert stat["type"] == "file" + stat = adapter.stat("dir") + assert stat is not None + assert stat["name"] == "/dir" + assert stat["type"] == "directory" + try: + adapter.stat("non-existent.txt") + pytest.fail() + except FileNotFoundError: + pass + + # exists + assert adapter.exists("test.txt") + assert not adapter.exists("non-existent.txt") + + # isdir + assert adapter.isdir("dir") + assert not adapter.isdir("test.txt") + assert not adapter.isdir("non-existent.txt") + + # isfile + assert adapter.isfile("test.txt") + assert not adapter.isfile("dir") + assert not adapter.isfile("non-existent.txt") + + # walk + for root, dirs, files in adapter.walk("/"): + if root == "memory:": + assert dirs == ["memory:dir"] + assert files == ["memory:foo.txt", "memory:test.txt"] + elif root == "memory:/dir": + assert dirs == ["memory:subdir"] + assert files == ["memory:bar.txt"] + elif root == "memory:/dir/subdir": + assert len(dirs) == 0 + assert files == ["memory:baz.txt"] + else: + pytest.fail(f"unexpected dir: {root}") + + # glob + # the expected results come from built-in glob lib. 
+ expected = [ + "memory:foo.txt", + "memory:dir", + "memory:dir/subdir", + "memory:dir/subdir/baz.txt", + "memory:dir/bar.txt", + "memory:test.txt", + ] + expected.sort() + actual = adapter.glob("**", recursive=True) + actual.sort() + assert actual == expected + expected = ["memory:foo.txt"] + actual = adapter.glob("**/foo.txt", recursive=True) + assert actual == expected + expected = ["memory:dir/bar.txt"] + actual = adapter.glob("**/bar.txt", recursive=True) + assert actual == expected + expected = ["memory:dir/subdir/baz.txt"] + actual = adapter.glob("**/baz.txt", recursive=True) + assert actual == expected + expected = ["memory:dir/bar.txt", "memory:dir/subdir/baz.txt"] + actual = adapter.glob("**/ba[rz].txt", recursive=True) + assert actual == expected + actual = adapter.glob("**/ba?.txt", recursive=True) + assert actual == expected + expected = ["memory:foo.txt", "memory:test.txt", "memory:dir"] + expected.sort() + actual = adapter.glob("**", recursive=False) + actual.sort() + assert actual == expected + actual = adapter.glob("*", recursive=False) + actual.sort() + assert actual == expected + expected = ["memory:foo.txt", "memory:test.txt"] + expected.sort() + actual = adapter.glob("*.txt", recursive=False) + actual.sort() + assert actual == expected diff --git a/python/xorbits/_mars/lib/filesystem/tests/test_oss.py b/python/xorbits/_mars/lib/filesystem/tests/test_oss.py new file mode 100644 index 000000000..5ce3be7cd --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/tests/test_oss.py @@ -0,0 +1,186 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from io import BytesIO + +import pytest + +from .... import dataframe as md +from ....tests.core import mock +from .. import oss +from .._oss_lib import glob as og +from .._oss_lib.common import OSSFileEntry +from ..oss import build_oss_path + + +class OSSObjInfo: + def __init__(self, name, content): + self.key = name + # Use the current time as "Last-Modified" in the test. 
+ self.last_modified = int(time.time()) + self.size = len(content.encode("utf8")) + + +class ObjectMeta: + def __init__(self, key, obj_dict): + self.headers = {} + self.headers["Last-Modified"] = int(time.time()) + self.headers["Content-Length"] = len(obj_dict[key].encode("utf8")) + + +class MockObject: + def __init__(self, obj_dict, key, byte_range): + self._stream = BytesIO(obj_dict[key].encode("utf8")) + self._byte_range = byte_range + + def read(self): + self._stream.seek(self._byte_range[0]) + if self._byte_range[1] is None: + return self._stream.read() + else: + size = self._byte_range[1] - self._byte_range[0] + 1 + return self._stream.read(size) + + +class SideEffectBucket: + def __init__(self, *_, **__): + self.obj_dict = { + "file.csv": "id1,id2,id3\n1,2,3\n", + "dir/": "", + "dir/file1.csv": "2", + "dir/file2.csv": "3", + "dir/subdir/": "", + "dir/subdir/file3.csv": "s4", + "dir/subdir/file4.csv": "s5", + "dir2/": "", + "dir2/file6.csv": "6", + "dir2/file7.csv": "7", + } + + def get_object_meta(self, key): + return ObjectMeta(key, self.obj_dict) + + def object_exists(self, key): + return key in self.obj_dict.keys() + + def get_object(self, key, byte_range): + return MockObject(self.obj_dict, key, byte_range) + + +class SideEffectObjIter: + def __init__(self, *args, **kwargs): + self.bucket = args[0] + self.prefix = kwargs["prefix"] + + def __iter__(self): + for name, content in self.bucket.obj_dict.items(): + if name.startswith(self.prefix): + yield OSSObjInfo(name, content) + + +@mock.patch("oss2.Bucket", side_effect=SideEffectBucket) +@mock.patch("oss2.ObjectIteratorV2", side_effect=SideEffectObjIter) +def test_oss_filesystem(fake_obj_iter, fake_oss_bucket, setup): + access_key_id = "your_access_key_id" + access_key_secret = "your_access_key_secret" + end_point = "your_endpoint" + + file_path = f"oss://bucket/file.csv" + dir_path = f"oss://bucket/dir/" + dir_path_content_magic = f"oss://bucket/dir*/" + other_scheme_path = f"scheme://netloc/path" + not_exist_file_path = f"oss://bucket/not_exist.csv" + + fake_file_path = build_oss_path( + file_path, access_key_id, access_key_secret, end_point + ) + fake_dir_path = build_oss_path( + dir_path, access_key_id, access_key_secret, end_point + ) + fake_dir_path_contains_magic = build_oss_path( + dir_path_content_magic, access_key_id, access_key_secret, end_point + ) + fake_other_scheme_path = build_oss_path( + other_scheme_path, access_key_id, access_key_secret, end_point + ) + fake_not_exist_file_path = build_oss_path( + not_exist_file_path, access_key_id, access_key_secret, end_point + ) + fs = oss.OSSFileSystem.get_instance() + + # Test OSSFileSystem. 
+ assert len(fs.ls(fake_dir_path)) == 4 + assert not fs.isfile(fake_dir_path) + assert fs.isdir(fake_dir_path) + assert not fs.isdir(fake_file_path) + assert fs.isfile(fake_file_path) + assert fs.exists(fake_file_path) + assert not fs.exists(fake_not_exist_file_path) + assert fs.stat(fake_file_path)["type"] == "file" + assert fs.stat(fake_dir_path)["type"] == "directory" + assert fs.glob(fake_dir_path) == [fake_dir_path] + + with pytest.raises(ValueError) as e: + fs.exists(fake_other_scheme_path) + msg1 = e.value.args[0] + assert ( + msg1 == f"Except scheme oss, but got scheme: " + f"scheme in path: {fake_other_scheme_path}" + ) + + with pytest.raises(RuntimeError) as e: + fs.exists(file_path) + msg2 = e.value.args[0] + assert msg2 == "Please use build_oss_path to add OSS info" + + with pytest.raises(OSError): + print(fs.ls(fake_file_path)) + + assert len(fs.glob(fake_file_path)) == 1 + assert len(fs.glob(fake_dir_path + "*", recursive=True)) == 4 + assert len(fs.glob(fake_dir_path_contains_magic)) == 2 + + # Test the specific functions of glob. + assert og.has_magic(b"*") + assert og.escape(b"*") == b"[*]" + assert og.escape("*") == "[*]" + + # test OSSIOBase + with fs.open(fake_file_path) as f: + assert f.readline() == b"id1,id2,id3\n" + assert f.readline() == b"1,2,3\n" + f.seek(-1, 2) + assert f.readline() == b"\n" + with pytest.raises(AttributeError): + f.fileno() + with pytest.raises(OSError): + f.seek(-1) + with pytest.raises(OSError): + f.seek(-100, 1) + with pytest.raises(ValueError): + f.seek(1, 3) + f.seek(0) + assert f.read() == b"id1,id2,id3\n1,2,3\n" + f.seek(0) + assert f.readline(2) == b"id" + f.seek(0) + with pytest.raises(TypeError): + f.readline("2") + + fe = OSSFileEntry(fake_file_path) + assert fe.path == fake_file_path + + df = md.read_csv(fake_file_path).execute() + assert df.shape == (1, 3) diff --git a/python/xorbits/_mars/lib/filesystem/tests/test_s3.py b/python/xorbits/_mars/lib/filesystem/tests/test_s3.py new file mode 100644 index 000000000..d01a2be20 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/tests/test_s3.py @@ -0,0 +1,98 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
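+
+# A short summary of the merge rule exercised by these tests (values are
+# placeholders): S3FileSystem.get_storage_options first fills client_kwargs
+# from the AWS_ENDPOINT_URL / AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY /
+# AWS_SESSION_TOKEN environment variables, then lets an explicitly passed
+# storage_options["client_kwargs"] override individual entries, e.g.:
+#
+#     os.environ["AWS_ENDPOINT_URL"] = "a"
+#     S3FileSystem.get_storage_options(
+#         {"client_kwargs": {"endpoint_url": "b"}}, "s3://bucket/example.csv"
+#     )
+#     # -> {"client_kwargs": {"endpoint_url": "b"}}
+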
+import os + +import pytest + +from ....dataframe import read_csv +from ..core import register_filesystem +from ..s3 import S3FileSystem + + +class KwArgsException(Exception): + def __init__(self, kwargs): + self.kwargs = kwargs + + +if S3FileSystem is not None: + + class TestS3FileSystem(S3FileSystem): + def __init__(self, **kwargs): + super().__init__(**kwargs) + raise KwArgsException(kwargs) + +else: + TestS3FileSystem = None + + +@pytest.mark.skipif(S3FileSystem is None, reason="S3 is not supported") +def test_client_kwargs(): + register_filesystem("s3", TestS3FileSystem) + + test_kwargs = { + "endpoint_url": "http://192.168.1.12:9000", + "aws_access_key_id": "test_id", + "aws_secret_access_key": "test_key", + "aws_session_token": "test_session_token", + } + + def _assert_true(): + # Pass endpoint_url / aws_access_key_id / aws_secret_access_key / aws_session_token to read_csv. + with pytest.raises(KwArgsException) as e: + read_csv( + "s3://bucket/example.csv", + index_col=0, + storage_options={"client_kwargs": test_kwargs}, + ) + assert e.value.kwargs == { + "client_kwargs": { + "endpoint_url": "http://192.168.1.12:9000", + "aws_access_key_id": "test_id", + "aws_secret_access_key": "test_key", + "aws_session_token": "test_session_token", + } + } + + _assert_true() + + test_env = { + "AWS_ENDPOINT_URL": "a", + "AWS_ACCESS_KEY_ID": "b", + "AWS_SECRET_ACCESS_KEY": "c", + "AWS_SESSION_TOKEN": "d", + } + for k, v in test_env.items(): + os.environ[k] = v + + try: + _assert_true() + + for k, v in test_kwargs.items(): + with pytest.raises(KwArgsException) as e: + read_csv( + "s3://bucket/example.csv", + index_col=0, + storage_options={"client_kwargs": {k: v}}, + ) + expect = { + "endpoint_url": "a", + "aws_access_key_id": "b", + "aws_secret_access_key": "c", + "aws_session_token": "d", + } + expect[k] = v + assert e.value.kwargs == {"client_kwargs": expect} + finally: + for k, v in test_env.items(): + os.environ.pop(k, None) diff --git a/python/xorbits/_mars/lib/groupby_wrapper.py b/python/xorbits/_mars/lib/groupby_wrapper.py new file mode 100644 index 000000000..f1477a086 --- /dev/null +++ b/python/xorbits/_mars/lib/groupby_wrapper.py @@ -0,0 +1,279 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
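+
+"""
+A minimal usage sketch (the frame below is illustrative): ``wrapped_groupby``
+mirrors ``DataFrame.groupby`` but returns a ``GroupByWrapper``, which supports
+column selection and attribute forwarding like a regular groupby object while
+also being picklable, so it can be shipped between processes.
+
+>>> import pandas as pd
+>>> from xorbits._mars.lib.groupby_wrapper import wrapped_groupby
+>>> df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
+>>> wrapped = wrapped_groupby(df, by="a")
+>>> result = wrapped["b"].sum()        # behaves like df.groupby("a")["b"].sum()
+>>> import pickle
+>>> restored = pickle.loads(pickle.dumps(wrapped))  # survives serialization
+"""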
+ +import sys +from collections.abc import Iterable + +import cloudpickle +import numpy as np +from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy + +from ..utils import estimate_pandas_size, no_default, pd_release_version + +_HAS_SQUEEZE = pd_release_version < (1, 1, 0) +_HAS_DROPNA = pd_release_version >= (1, 1, 0) +_GROUP_KEYS_NO_DEFAULT = pd_release_version >= (1, 5, 0) + +_default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True + + +class GroupByWrapper: + def __init__( + self, + obj, + groupby_obj=None, + keys=None, + axis=0, + level=None, + grouper=None, + exclusions=None, + selection=None, + as_index=True, + sort=True, + group_keys=_default_group_keys, + squeeze=False, + observed=False, + dropna=True, + mutated=False, + grouper_cache=None, + ): + def fill_value(v, key): + return ( + v if v is not None or groupby_obj is None else getattr(groupby_obj, key) + ) + + self.obj = obj + self.keys = fill_value(keys, "keys") + self.axis = fill_value(axis, "axis") + self.level = fill_value(level, "level") + self.exclusions = fill_value(exclusions, "exclusions") + self.selection = selection + self.as_index = fill_value(as_index, "as_index") + self.sort = fill_value(sort, "sort") + self.group_keys = fill_value(group_keys, "group_keys") + self.squeeze = fill_value(squeeze, "squeeze") + self.observed = fill_value(observed, "observed") + self.mutated = fill_value(mutated, "mutated") + self.dropna = fill_value(dropna, "dropna") + + if groupby_obj is None: + groupby_kw = dict( + keys=keys, + axis=axis, + level=level, + grouper=grouper, + exclusions=exclusions, + as_index=as_index, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + mutated=mutated, + dropna=dropna, + ) + if not _HAS_SQUEEZE: # pragma: no branch + groupby_kw.pop("squeeze") + if not _HAS_DROPNA: # pragma: no branch + groupby_kw.pop("dropna") + + if obj.ndim == 2: + self.groupby_obj = DataFrameGroupBy(obj, **groupby_kw) + else: + self.groupby_obj = SeriesGroupBy(obj, **groupby_kw) + else: + self.groupby_obj = groupby_obj + + if grouper_cache: + self.groupby_obj.grouper._cache = grouper_cache + if selection: + self.groupby_obj = self.groupby_obj[selection] + + self.is_frame = isinstance(self.groupby_obj, DataFrameGroupBy) + + def __getitem__(self, item): + return GroupByWrapper( + self.obj, + keys=self.keys, + axis=self.axis, + level=self.level, + grouper=self.groupby_obj.grouper, + exclusions=self.exclusions, + selection=item, + as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + squeeze=self.squeeze, + observed=self.observed, + dropna=self.dropna, + mutated=self.mutated, + ) + + def __getattr__(self, item): + if item.startswith("_"): # pragma: no cover + return object.__getattribute__(self, item) + if item in getattr(self.obj, "columns", ()): + return self.__getitem__(item) + return getattr(self.groupby_obj, item) + + def __iter__(self): + return self.groupby_obj.__iter__() + + def __sizeof__(self): + return sys.getsizeof(self.obj) + sys.getsizeof( + getattr(self.groupby_obj.grouper, "_cache", None) + ) + + def estimate_size(self): + return estimate_pandas_size(self.obj) + estimate_pandas_size(self.obj.index) + + def __reduce__(self): + return ( + type(self).from_tuple, + (self.to_tuple(pickle_function=True, truncate=True),), + ) + + def __bool__(self): + return bool(np.prod(self.shape)) + + @property + def empty(self): + return self.obj.empty + + @property + def shape(self): + shape = list(self.groupby_obj.obj.shape) + if self.is_frame and self.selection: + shape[1] = 
len(self.selection) + return tuple(shape) + + @property + def _selected_obj(self): + return getattr(self.groupby_obj, "_selected_obj") + + def to_tuple(self, truncate=False, pickle_function=False): + if self.selection and truncate: + if isinstance(self.selection, Iterable) and not isinstance( + self.selection, str + ): + item_list = list(self.selection) + else: + item_list = [self.selection] + item_set = set(item_list) + + if isinstance(self.keys, list): + sel_keys = self.keys + elif self.keys in self.obj.columns: + sel_keys = [self.keys] + else: + sel_keys = [] + + all_items = item_list + [k for k in sel_keys or () if k not in item_set] + if set(all_items) == set(self.obj.columns): + obj = self.obj + else: + obj = self.obj[all_items] + else: + obj = self.obj + + if pickle_function and callable(self.keys): + keys = cloudpickle.dumps(self.keys) + else: + keys = self.keys + + return ( + obj, + keys, + self.axis, + self.level, + self.exclusions, + self.selection, + self.as_index, + self.sort, + self.group_keys, + self.squeeze, + self.observed, + self.dropna, + self.mutated, + getattr(self.groupby_obj.grouper, "_cache", dict()), + ) + + @classmethod + def from_tuple(cls, tp): + ( + obj, + keys, + axis, + level, + exclusions, + selection, + as_index, + sort, + group_keys, + squeeze, + observed, + dropna, + mutated, + grouper_cache, + ) = tp + + if isinstance(keys, (bytes, bytearray)): + keys = cloudpickle.loads(keys) + + return cls( + obj, + keys=keys, + axis=axis, + level=level, + exclusions=exclusions, + selection=selection, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + dropna=dropna, + mutated=mutated, + grouper_cache=grouper_cache, + ) + + +def wrapped_groupby( + obj, + by=None, + axis=0, + level=None, + as_index=True, + sort=True, + group_keys=_default_group_keys, + squeeze=False, + observed=False, + dropna=True, +): + groupby_kw = dict( + by=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + dropna=dropna, + ) + if not _HAS_SQUEEZE: # pragma: no branch + groupby_kw.pop("squeeze") + if not _HAS_DROPNA: # pragma: no branch + groupby_kw.pop("dropna") + + groupby_obj = obj.groupby(**groupby_kw) + return GroupByWrapper(obj, groupby_obj=groupby_obj, as_index=as_index) diff --git a/python/xorbits/_mars/lib/mkl_interface.py b/python/xorbits/_mars/lib/mkl_interface.py new file mode 100644 index 000000000..c864ecf5c --- /dev/null +++ b/python/xorbits/_mars/lib/mkl_interface.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
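+
+# A small usage sketch: each helper below stays ``None`` when the MKL runtime
+# (libmkl_rt) cannot be located under sys.prefix, so callers should check
+# before use, e.g.:
+#
+#     from xorbits._mars.lib import mkl_interface
+#
+#     if mkl_interface.mkl_get_version is not None:
+#         version = mkl_interface.mkl_get_version()
+#         print(version.major, version.minor, version.update)
+#     if mkl_interface.mkl_mem_stat is not None:
+#         allocated_bytes, n_buffers = mkl_interface.mkl_mem_stat()
+#     if mkl_interface.mkl_free_buffers is not None:
+#         mkl_interface.mkl_free_buffers()   # release MKL memory buffers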
+ +import ctypes +import os +import sys + + +def _load_mkl_rt(lib_name): + """ + Load certain MKL library + """ + if sys.platform.startswith("win"): + lib_path = os.path.join(sys.prefix, "Library", "bin", lib_name + ".dll") + elif sys.platform == "darwin": + lib_path = os.path.join(sys.prefix, "lib", "lib" + lib_name + ".dylib") + else: + lib_path = os.path.join(sys.prefix, "lib", "lib" + lib_name + ".so") + if not os.path.exists(lib_path): + lib_path = None + + if lib_path: + return ctypes.cdll.LoadLibrary(lib_path) + + +class MKLVersion(ctypes.Structure): + _fields_ = [ + ("major", ctypes.c_int), + ("minor", ctypes.c_int), + ("update", ctypes.c_int), + ("product_status", ctypes.c_char_p), + ("build", ctypes.c_char_p), + ("processor", ctypes.c_char_p), + ("platform", ctypes.c_char_p), + ] + + +mkl_free_buffers = None +mkl_get_version = None +mkl_mem_stat = None + +mkl_rt = _load_mkl_rt("mkl_rt") +if mkl_rt: + try: + mkl_free_buffers = mkl_rt.mkl_free_buffers + mkl_free_buffers.argtypes = [] + mkl_free_buffers.restype = None + except AttributeError: # pragma: no cover + pass + + try: + _mkl_mem_stat = mkl_rt.mkl_mem_stat + _mkl_mem_stat.argtypes = [ctypes.POINTER(ctypes.c_int32)] + _mkl_mem_stat.restype = ctypes.c_int64 + + def mkl_mem_stat(): + n_bufs = ctypes.c_int32(0) + size = _mkl_mem_stat(ctypes.pointer(n_bufs)) + return size, n_bufs.value + + except AttributeError: # pragma: no cover + pass + + try: + _mkl_get_version = mkl_rt.mkl_get_version + _mkl_get_version.argtypes = [ctypes.POINTER(MKLVersion)] + _mkl_get_version.restype = None + + def mkl_get_version(): + version = MKLVersion() + _mkl_get_version(version) + return version + + except AttributeError: # pragma: no cover + pass diff --git a/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.cpp b/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.cpp new file mode 100755 index 000000000..4c73436da --- /dev/null +++ b/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.cpp @@ -0,0 +1,339 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#if defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && GNUC_MINOR >= 4)) +/* gcc version >= 4.4 4.1 = RHEL 5, 4.4 = RHEL 6. 
Don't inline for RHEL 5 gcc which is 4.1*/ +#define FORCE_INLINE inline __attribute__((always_inline)) +#else +#define FORCE_INLINE +#endif + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, Py_ssize_t len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const Py_ssize_t nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(Py_ssize_t i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const Py_ssize_t len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const Py_ssize_t nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(Py_ssize_t i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i*4+0); + uint32_t k2 = getblock(blocks,i*4+1); + uint32_t k3 = getblock(blocks,i*4+2); + uint32_t k4 = getblock(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= 
k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + h3 = fmix(h3); + h4 = fmix(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const Py_ssize_t len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const Py_ssize_t nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(Py_ssize_t i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock(blocks,i*2+0); + uint64_t k2 = getblock(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= uint64_t(tail[14]) << 48; + case 14: k2 ^= uint64_t(tail[13]) << 40; + case 13: k2 ^= uint64_t(tail[12]) << 32; + case 12: k2 ^= uint64_t(tail[11]) << 24; + case 11: k2 ^= uint64_t(tail[10]) << 16; + case 10: k2 ^= uint64_t(tail[ 9]) << 8; + case 9: k2 ^= uint64_t(tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= uint64_t(tail[ 7]) << 56; + case 7: k1 ^= uint64_t(tail[ 6]) << 48; + case 6: k1 ^= uint64_t(tail[ 5]) << 40; + case 5: k1 ^= uint64_t(tail[ 4]) << 32; + case 4: k1 ^= uint64_t(tail[ 3]) << 24; + case 3: k1 ^= uint64_t(tail[ 2]) << 16; + case 2: k1 ^= uint64_t(tail[ 1]) << 8; + case 1: k1 ^= uint64_t(tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- diff --git a/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.h b/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.h new file mode 100755 
index 000000000..75e248c1b --- /dev/null +++ b/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.h @@ -0,0 +1,43 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + + +// To handle 64-bit data; see https://docs.python.org/2.7/c-api/arg.html +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif +#include + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) +typedef unsigned __int8 uint8_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, Py_ssize_t len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, Py_ssize_t len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, Py_ssize_t len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ diff --git a/python/xorbits/_mars/lib/mmh3_src/mmh3module.cpp b/python/xorbits/_mars/lib/mmh3_src/mmh3module.cpp new file mode 100755 index 000000000..d5b17104d --- /dev/null +++ b/python/xorbits/_mars/lib/mmh3_src/mmh3module.cpp @@ -0,0 +1,387 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. mmh3 Python module was written by Hajime Senuma, +// and is also placed in the public domain. +// The authors hereby disclaim copyright to these source codes. 
+ +// To handle 64-bit data; see https://docs.python.org/2.7/c-api/arg.html +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif + +#include +#include +#include +#include "MurmurHash3.h" + +#if defined(_MSC_VER) +typedef signed __int8 int8_t; +typedef signed __int32 int32_t; +typedef signed __int64 int64_t; +typedef unsigned __int8 uint8_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +// Other compilers +#else // defined(_MSC_VER) +#include +#endif // !defined(_MSC_VER) + +static int +_GetMemoryViewDataAndSize(PyObject *mview, const char **target_str, + Py_ssize_t *target_str_len) { + Py_buffer *mview_buffer = NULL; + + if (!PyMemoryView_Check(mview)) { + PyErr_Format(PyExc_TypeError, "key must be byte-like object " + "or memoryview, not '%.200s'", + mview->ob_type->tp_name); + return 0; + } + + mview_buffer = PyMemoryView_GET_BUFFER(mview); + *target_str = (const char *)mview_buffer->buf; + *target_str_len = mview_buffer->len; + return 1; +} + +static PyObject * +mmh3_hash(PyObject *self, PyObject *args, PyObject *keywds) +{ + const char *target_str; + Py_ssize_t target_str_len; + PyObject *target_mview = NULL; + uint32_t seed = 0; + int32_t result[1]; + long long_result = 0; + int is_signed = 1; + + static char *kwlist[] = {(char *)"key", (char *)"seed", + (char *)"signed", NULL}; + +#ifndef _MSC_VER + static uint64_t mask[] = {0x0ffffffff, 0xffffffffffffffff}; +#endif + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|IB", kwlist, + &target_str, &target_str_len, &seed, &is_signed)) { + if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|IB", kwlist, + &target_mview, &seed, &is_signed)) { + return NULL; + } + PyErr_Clear(); + Py_INCREF(target_mview); + + if (!_GetMemoryViewDataAndSize(target_mview, &target_str, &target_str_len)) { + Py_DECREF(target_mview); + return NULL; + } + } + + MurmurHash3_x86_32(target_str, target_str_len, seed, result); + + if (target_mview) { + Py_DECREF(target_mview); + } + +#if defined(_MSC_VER) + /* for Windows envs */ + long_result = result[0]; + if (is_signed == 1) { + return PyLong_FromLong(long_result); + } else { + return PyLong_FromUnsignedLong(long_result); + } +#else + /* for standard envs */ + long_result = result[0] & mask[is_signed]; + return PyLong_FromLong(long_result); +#endif +} + +static PyObject * +mmh3_hash_from_buffer(PyObject *self, PyObject *args, PyObject *keywds) +{ + Py_buffer target_buf; + Py_buffer *target_buf_ptr; + PyObject *target_mview = NULL; + uint32_t seed = 0; + int32_t result[1]; + long long_result = 0; + int is_signed = 1; + + static char *kwlist[] = {(char *)"key", (char *)"seed", + (char *)"signed", NULL}; + +#ifndef _MSC_VER + static uint64_t mask[] = {0x0ffffffff, 0xffffffffffffffff}; +#endif + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s*|IB", kwlist, + &target_buf, &seed, &is_signed)) { + if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|IB", kwlist, + &target_mview, &seed, &is_signed)) { + return NULL; + } + PyErr_Clear(); + Py_INCREF(target_mview); + + if (!PyMemoryView_Check(target_mview)) { + PyErr_Format(PyExc_TypeError, "key must be byte-like object " + "or memoryview, not '%.200s'", + target_mview->ob_type->tp_name); + Py_DECREF(target_mview); + return NULL; + } + + target_buf_ptr = PyMemoryView_GET_BUFFER(target_mview); + } else { + target_buf_ptr = &target_buf; + } + + MurmurHash3_x86_32(target_buf_ptr->buf, target_buf_ptr->len, seed, result); + + if (target_mview) { + Py_DECREF(target_mview); + } + +#if defined(_MSC_VER) + /* for Windows envs */ + long_result = 
result[0]; + if (is_signed == 1) { + return PyLong_FromLong(long_result); + } else { + return PyLong_FromUnsignedLong(long_result); + } +#else + /* for standard envs */ + long_result = result[0] & mask[is_signed]; + return PyLong_FromLong(long_result); +#endif +} + +static PyObject * +mmh3_hash64(PyObject *self, PyObject *args, PyObject *keywds) +{ + const char *target_str; + Py_ssize_t target_str_len; + PyObject *target_mview = NULL; + uint32_t seed = 0; + uint64_t result[2]; + char x64arch = 1; + int is_signed = 1; + + static char *kwlist[] = {(char *)"key", (char *)"seed", + (char *)"x64arch", (char *)"signed", NULL}; + + static char *valflag[] = {(char *) "KK", (char *) "LL"}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|IBB", kwlist, + &target_str, &target_str_len, &seed, &x64arch, &is_signed)) { + if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|IBB", kwlist, + &target_mview, &seed, &x64arch, &is_signed)) { + return NULL; + } + PyErr_Clear(); + Py_INCREF(target_mview); + + if (!_GetMemoryViewDataAndSize(target_mview, &target_str, &target_str_len)) { + Py_DECREF(target_mview); + return NULL; + } + } + + if (x64arch == 1) { + MurmurHash3_x64_128(target_str, target_str_len, seed, result); + } else { + MurmurHash3_x86_128(target_str, target_str_len, seed, result); + } + + if (target_mview) { + Py_DECREF(target_mview); + } + + PyObject *retval = Py_BuildValue(valflag[is_signed], result[0], result[1]); + return retval; +} + +static PyObject * +mmh3_hash128(PyObject *self, PyObject *args, PyObject *keywds) +{ + const char *target_str; + Py_ssize_t target_str_len; + PyObject *target_mview = NULL; + uint32_t seed = 0; + uint64_t result[2]; + char x64arch = 1; + char is_signed = 0; + + static char *kwlist[] = {(char *)"key", (char *)"seed", + (char *)"x64arch", (char *)"signed", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|IBB", kwlist, + &target_str, &target_str_len, &seed, &x64arch, &is_signed)) { + if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|IBB", kwlist, + &target_mview, &seed, &x64arch, &is_signed)) { + return NULL; + } + PyErr_Clear(); + Py_INCREF(target_mview); + + if (!_GetMemoryViewDataAndSize(target_mview, &target_str, &target_str_len)) { + Py_DECREF(target_mview); + return NULL; + } + } + + if (x64arch == 1) { + MurmurHash3_x64_128(target_str, target_str_len, seed, result); + } else { + MurmurHash3_x86_128(target_str, target_str_len, seed, result); + } + + if (target_mview) { + Py_DECREF(target_mview); + } + + /** + * _PyLong_FromByteArray is not a part of official Python/C API + * and can be displaced (although it is practically stable). cf. 
+ * https://mail.python.org/pipermail/python-list/2006-August/372368.html + */ + PyObject *retval = _PyLong_FromByteArray((unsigned char *)result, 16, 1, is_signed); + + return retval; +} + +static PyObject * +mmh3_hash_bytes(PyObject *self, PyObject *args, PyObject *keywds) +{ + const char *target_str = NULL; + Py_ssize_t target_str_len; + PyObject *target_mview = NULL; + uint32_t seed = 0; + uint32_t result[4]; + char x64arch = 1; + + static char *kwlist[] = {(char *)"key", (char *)"seed", + (char *)"x64arch", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|IB", kwlist, + &target_str, &target_str_len, &seed, &x64arch)) { + if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|IB", kwlist, + &target_mview, &seed, &x64arch)) { + return NULL; + } + PyErr_Clear(); + Py_INCREF(target_mview); + + if (!_GetMemoryViewDataAndSize(target_mview, &target_str, &target_str_len)) { + Py_DECREF(target_mview); + return NULL; + } + } + + if (x64arch == 1) { + MurmurHash3_x64_128(target_str, target_str_len, seed, result); + } else { + MurmurHash3_x86_128(target_str, target_str_len, seed, result); + } + + if (target_mview) { + Py_DECREF(target_mview); + } + + char bytes[16]; + memcpy(bytes, result, 16); + return PyBytes_FromStringAndSize(bytes, 16); +} + +struct module_state { + PyObject *error; +}; + +#if PY_MAJOR_VERSION >= 3 +#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) +#else +#define GETSTATE(m) (&_state) +static struct module_state _state; +#endif + +static PyMethodDef Mmh3Methods[] = { + {"hash", (PyCFunction)mmh3_hash, METH_VARARGS | METH_KEYWORDS, + "hash(key[, seed=0, signed=True]) -> hash value\n Return a 32 bit integer."}, + {"hash_from_buffer", (PyCFunction)mmh3_hash_from_buffer, METH_VARARGS | METH_KEYWORDS, + "hash_from_buffer(key[, seed=0, signed=True]) -> hash value from a memory buffer\n Return a 32 bit integer. Designed for large memory-views such as numpy arrays."}, + {"hash64", (PyCFunction)mmh3_hash64, METH_VARARGS | METH_KEYWORDS, + "hash64(key[, seed=0, x64arch=True, signed=True]) -> (hash value 1, hash value 2)\n Return a tuple of two 64 bit integers for a string. Optimized for the x64 bit architecture when x64arch=True, otherwise for x86."}, + {"hash128", (PyCFunction)mmh3_hash128, METH_VARARGS | METH_KEYWORDS, + "hash128(key[, seed=0, x64arch=True, signed=False]]) -> hash value\n Return a 128 bit long integer. Optimized for the x64 bit architecture when x64arch=True, otherwise for x86."}, + {"hash_bytes", (PyCFunction)mmh3_hash_bytes, + METH_VARARGS | METH_KEYWORDS, + "hash_bytes(key[, seed=0, x64arch=True]) -> bytes\n Return a 128 bit hash value as bytes for a string. 
Optimized for the x64 bit architecture when x64arch=True, otherwise for the x86."}, + {NULL, NULL, 0, NULL} +}; + +#if PY_MAJOR_VERSION >= 3 + +static int mmh3_traverse(PyObject *m, visitproc visit, void *arg) { + Py_VISIT(GETSTATE(m)->error); + return 0; +} + +static int mmh3_clear(PyObject *m) { + Py_CLEAR(GETSTATE(m)->error); + return 0; +} + +static struct PyModuleDef mmh3module = { + PyModuleDef_HEAD_INIT, + "mmh3", + "mmh3 is a Python front-end to MurmurHash3, a fast and robust hash library created by Austin Appleby (http://code.google.com/p/smhasher/).\n Ported by Hajime Senuma \n Try hash('foobar') or hash('foobar', 1984).\n If you find any bugs, please submit an issue via https://github.com/hajimes/mmh3", + sizeof(struct module_state), + Mmh3Methods, + NULL, + mmh3_traverse, + mmh3_clear, + NULL +}; + +#define INITERROR return NULL + +extern "C" { +PyMODINIT_FUNC +PyInit_mmh3(void) + +#else // PY_MAJOR_VERSION >= 3 +#define INITERROR return + +extern "C" { +void +initmmh3(void) +#endif // PY_MAJOR_VERSION >= 3 + +{ +#if PY_MAJOR_VERSION >= 3 + PyObject *module = PyModule_Create(&mmh3module); +#else + PyObject *module = Py_InitModule("mmh3", Mmh3Methods); +#endif + + if (module == NULL) + INITERROR; + + PyModule_AddStringConstant(module, "__version__", "2.5.1"); + + struct module_state *st = GETSTATE(module); + + st->error = PyErr_NewException((char *) "mmh3.Error", NULL, NULL); + if (st->error == NULL) { + Py_DECREF(module); + INITERROR; + } + +#if PY_MAJOR_VERSION >= 3 + return module; +#endif +} +} // extern "C" diff --git a/python/xorbits/_mars/lib/nvutils.py b/python/xorbits/_mars/lib/nvutils.py new file mode 100644 index 000000000..d3af99bfa --- /dev/null +++ b/python/xorbits/_mars/lib/nvutils.py @@ -0,0 +1,713 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
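Editor's note: the mmh3module.cpp bindings above expose hash, hash_from_buffer, hash64, hash128 and hash_bytes with the signatures given in their docstrings. A small usage sketch follows; it assumes the extension is built and importable as xorbits._mars.lib.mmh3, which is inferred from the vendored file layout rather than stated in this hunk.

# Illustrative only -- the import path is an assumption based on the vendored layout.
from xorbits._mars.lib import mmh3

h32 = mmh3.hash("foobar")                              # signed 32-bit integer
h32u = mmh3.hash("foobar", seed=1984, signed=False)    # unsigned variant
lo64, hi64 = mmh3.hash64("foobar")                     # two 64-bit integers (x64 path by default)
h128 = mmh3.hash128("foobar")                          # single 128-bit integer
digest = mmh3.hash_bytes("foobar")                     # 16 raw bytes
assert len(digest) == 16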
+ +import dataclasses +import logging +import os +import subprocess +import sys +import uuid +from collections import namedtuple +from ctypes import ( + CDLL, + POINTER, + Structure, + byref, + c_char, + c_char_p, + c_int, + c_uint, + c_ulonglong, + create_string_buffer, +) +from typing import List, Optional, Tuple, Union + +from ..utils import parse_readable_size + +logger = logging.getLogger(__name__) + +# Some constants taken from cuda.h +CUDA_SUCCESS = 0 +CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16 +CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39 +CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13 +CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33 +CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34 +CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36 + +CU_NO_CUDA_CAPABLE_DEVICE_DETECTED = 100 + +# nvml constants +NVML_SUCCESS = 0 +NVML_ERROR_UNINITIALIZED = 1 +NVML_ERROR_INVALID_ARGUMENT = 2 +NVML_ERROR_NOT_SUPPORTED = 3 +NVML_ERROR_NO_PERMISSION = 4 +NVML_ERROR_ALREADY_INITIALIZED = 5 +NVML_ERROR_NOT_FOUND = 6 +NVML_ERROR_INSUFFICIENT_SIZE = 7 +NVML_ERROR_INSUFFICIENT_POWER = 8 +NVML_ERROR_DRIVER_NOT_LOADED = 9 +NVML_ERROR_TIMEOUT = 10 +NVML_ERROR_IRQ_ISSUE = 11 +NVML_ERROR_LIBRARY_NOT_FOUND = 12 +NVML_ERROR_FUNCTION_NOT_FOUND = 13 +NVML_ERROR_CORRUPTED_INFOROM = 14 +NVML_ERROR_GPU_IS_LOST = 15 +NVML_ERROR_RESET_REQUIRED = 16 +NVML_ERROR_OPERATING_SYSTEM = 17 +NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18 +NVML_ERROR_IN_USE = 19 +NVML_ERROR_MEMORY = 20 +NVML_ERROR_NO_DATA = 21 +NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22 +NVML_ERROR_INSUFFICIENT_RESOURCES = 23 +NVML_ERROR_FREQ_NOT_SUPPORTED = 24 +NVML_ERROR_UNKNOWN = 999 +NVML_TEMPERATURE_GPU = 0 +NVML_DRIVER_NOT_LOADED = 9 +NVML_DEVICE_UUID_V2_BUFFER_SIZE = 96 +NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1) +NVML_DEVICE_MIG_DISABLE = 0x0 +NVML_DEVICE_MIG_ENABLE = 0x1 + + +class _CUuuid_t(Structure): + _fields_ = [("bytes", c_char * 16)] + + +class _nvmlUtilization_t(Structure): + _fields_ = [ + ("gpu", c_uint), + ("memory", c_uint), + ] + + +class _struct_nvmlDevice_t(Structure): + pass # opaque handle + + +_nvmlDevice_t = POINTER(_struct_nvmlDevice_t) + + +class _nvmlBAR1Memory_t(Structure): + _fields_ = [ + ("total", c_ulonglong), + ("free", c_ulonglong), + ("used", c_ulonglong), + ] + + +class _nvmlProcessInfo_t(Structure): + _fields_ = [ + ("pid", c_uint), + ("usedGpuMemory", c_ulonglong), + ("gpuInstanceId", c_uint), + ("computeInstanceId", c_uint), + ] + + +## Alternative object +# Allows the object to be printed +# Allows mismatched types to be assigned +# - like None when the Structure variant requires c_uint +class nvmlFriendlyObject: + def __init__(self, dictionary): + for x in dictionary: + setattr(self, x, dictionary[x]) + + def __str__(self): + return self.__dict__.__str__() + + +def nvmlStructToFriendlyObject(struct): + d = {} + for x in struct._fields_: + key = x[0] + value = getattr(struct, key) + # only need to convert from bytes if bytes, no need to check python version. 
+ d[key] = value.decode() if isinstance(value, bytes) else value + obj = nvmlFriendlyObject(d) + return obj + + +@dataclasses.dataclass +class CudaDeviceInfo: + uuid: bytes = None + device_index: int = None + mig_index: int = None + + +@dataclasses.dataclass +class CudaContext: + has_context: bool + device_info: CudaDeviceInfo = None + + +_is_windows: bool = sys.platform.startswith("win") +_is_wsl: bool = "WSL_DISTRO_NAME" in os.environ + + +def _load_nv_library(*libnames): + for lib in libnames: + try: + return CDLL(lib) + except OSError: + continue + + +_cuda_lib = _nvml_lib = None + +_cu_device_info = namedtuple( + "_cu_device_info", "index uuid name multiprocessors cuda_cores threads" +) +_nvml_driver_info = namedtuple("_nvml_driver_info", "driver_version cuda_version") +_nvml_device_status = namedtuple( + "_nvml_device_status", + "gpu_util mem_util temperature fb_total_mem fb_used_mem fb_free_mem", +) + +_init_pid = None +_gpu_count = None +_driver_info = None +_device_infos = dict() + +_no_device_warned = False + + +class NVError(Exception): + def __init__(self, msg, *args, errno=None): + self._errno = errno + super().__init__(msg or "Unknown error", *args) + + def __str__(self): + return f"({self._errno}) {super().__str__()}" + + @property + def errno(self): + return self._errno + + @property + def message(self): + return super().__str__() + + +class NVDeviceAPIError(NVError): + pass + + +class NVMLAPIError(NVError): + pass + + +def _cu_check_error(result): + if result != CUDA_SUCCESS: + _error_str = c_char_p() + _cuda_lib.cuGetErrorString(result, byref(_error_str)) + err_value = _error_str.value.decode() if _error_str.value is not None else None + raise NVDeviceAPIError(err_value, errno=result) + + +_nvmlErrorString = None + + +def _nvml_check_error(result): + global _nvmlErrorString + if _nvmlErrorString is None: + _nvmlErrorString = _nvml_lib.nvmlErrorString + _nvmlErrorString.restype = c_char_p + + if result != NVML_SUCCESS: + _error_str = _nvmlErrorString(result) + raise NVMLAPIError(_error_str.decode(), errno=result) + + +_cu_process_var_to_cores = { + (1, 0): 8, + (1, 1): 8, + (1, 2): 8, + (1, 3): 8, + (2, 0): 32, + (2, 1): 48, +} + + +def _cu_get_processor_cores(major, minor): + return _cu_process_var_to_cores.get((major, minor), 192) + + +def _init_cp(): + global _cuda_lib, _no_device_warned + if _init_pid == os.getpid(): + return + + libcuda_paths = ["libcuda.so", "libcuda.dylib", "cuda.dll", "nvcuda.dll"] + if _is_wsl: + libcuda_paths = ["/usr/lib/wsl/lib/libcuda.so"] + libcuda_paths + _cuda_lib = _load_nv_library(*libcuda_paths) + + if _cuda_lib is None: + return + try: + _cu_check_error(_cuda_lib.cuInit(0)) + except NVDeviceAPIError as ex: + if ex.errno == CU_NO_CUDA_CAPABLE_DEVICE_DETECTED: + _cuda_lib = None + if not _no_device_warned: + logger.warning("No CUDA device detected") + _no_device_warned = True + else: + logger.exception("Failed to initialize libcuda.") + return + + +def _init_nvml(): + global _nvml_lib, _no_device_warned + if _init_pid == os.getpid(): + return + + nvml_paths = [ + "libnvidia-ml.so", + "libnvidia-ml.so.1", + "libnvidia-ml.dylib", + "nvml.dll", + ] + if _is_windows: + nvml_paths.append( + os.path.join( + os.getenv("ProgramFiles", "C:/Program Files"), + "NVIDIA Corporation/NVSMI/nvml.dll", + ) + ) + if _is_wsl: + nvml_paths = ["/usr/lib/wsl/lib/libnvidia-ml.so.1"] + nvml_paths + _nvml_lib = _load_nv_library(*nvml_paths) + + if _nvml_lib is None: + return + try: + _nvml_check_error(_nvml_lib.nvmlInit_v2()) + except NVMLAPIError as ex: + if 
ex.errno == NVML_DRIVER_NOT_LOADED: + _nvml_lib = None + if not _no_device_warned: + logger.warning( + "Failed to load libnvidia-ml: %s, no CUDA device will be enabled", + ex.message, + ) + _no_device_warned = True + else: + logger.exception("Failed to initialize libnvidia-ml.") + return + + +def _init(): + global _init_pid + + _init_cp() + _init_nvml() + + if _nvml_lib is not None and _cuda_lib is not None: + _init_pid = os.getpid() + + +def get_device_count() -> int: + global _gpu_count + + if _gpu_count is not None: + return _gpu_count + + _init_nvml() + if _nvml_lib is None: + return None + + if "CUDA_VISIBLE_DEVICES" in os.environ: + devices = os.environ["CUDA_VISIBLE_DEVICES"].strip() + if not devices or devices == "-1": + _gpu_count = 0 + else: + _gpu_count = len(devices.split(",")) + else: + n_gpus = c_uint() + _cu_check_error(_nvml_lib.nvmlDeviceGetCount(byref(n_gpus))) + _gpu_count = n_gpus.value + return _gpu_count + + +def _get_all_device_count() -> int: + _init_nvml() + if _nvml_lib is None: + return None + + n_gpus = c_uint() + _cu_check_error(_nvml_lib.nvmlDeviceGetCount(byref(n_gpus))) + return n_gpus.value + + +def get_driver_info() -> _nvml_driver_info: + global _driver_info + + _init_nvml() + if _nvml_lib is None: + return None + if _driver_info is not None: + return _driver_info + + version_buf = create_string_buffer(100) + cuda_version = c_uint() + + _nvml_check_error( + _nvml_lib.nvmlSystemGetDriverVersion(version_buf, len(version_buf)) + ) + _nvml_check_error(_nvml_lib.nvmlSystemGetCudaDriverVersion(byref(cuda_version))) + + _driver_info = _nvml_driver_info( + driver_version=version_buf.value.decode(), + cuda_version=".".join(str(v) for v in divmod(cuda_version.value, 1000)), + ) + return _driver_info + + +def get_device_info(dev_index: int) -> _cu_device_info: + try: + return _device_infos[dev_index] + except KeyError: + pass + + _init() + if _init_pid is None: + return None + + device = c_int() + name_buf = create_string_buffer(100) + uuid_t = _CUuuid_t() + cc_major = c_int() + cc_minor = c_int() + cores = c_int() + threads_per_core = c_int() + + _cu_check_error(_cuda_lib.cuDeviceGet(byref(device), c_int(dev_index))) + _cu_check_error(_cuda_lib.cuDeviceGetName(name_buf, len(name_buf), device)) + _cu_check_error(_cuda_lib.cuDeviceGetUuid(byref(uuid_t), device)) + _cu_check_error( + _cuda_lib.cuDeviceComputeCapability(byref(cc_major), byref(cc_minor), device) + ) + _cu_check_error( + _cuda_lib.cuDeviceGetAttribute( + byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device + ) + ) + _cu_check_error( + _cuda_lib.cuDeviceGetAttribute( + byref(threads_per_core), + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, + device, + ) + ) + + if "CUDA_VISIBLE_DEVICES" in os.environ: + real_dev_index = [ + int(s) for s in os.environ["CUDA_VISIBLE_DEVICES"].split(",") + ][dev_index] + else: + real_dev_index = dev_index + + info = _device_infos[dev_index] = _cu_device_info( + index=real_dev_index, + uuid=uuid.UUID(bytes=uuid_t.bytes), + name=name_buf.value.decode(), + multiprocessors=cores.value, + cuda_cores=cores.value + * _cu_get_processor_cores(cc_major.value, cc_minor.value), + threads=cores.value * threads_per_core.value, + ) + return info + + +def get_device_status(dev_index: int) -> _nvml_device_status: + _init() + if _init_pid is None: + return None + + c_device = _nvmlDevice_t() + c_utils = _nvmlUtilization_t() + c_temperature = c_uint() + c_memory_info = _nvmlBAR1Memory_t() + + dev_uuid = get_device_info(dev_index).uuid + + uuid_str = ("GPU-" + 
str(dev_uuid)).encode() + + if not _is_wsl: + _nvml_check_error( + _nvml_lib.nvmlDeviceGetHandleByUUID(uuid_str, byref(c_device)) + ) + + _nvml_check_error( + _nvml_lib.nvmlDeviceGetUtilizationRates(c_device, byref(c_utils)) + ) + gpu_util = c_utils.gpu + mem_util = c_utils.memory + + _nvml_check_error( + _nvml_lib.nvmlDeviceGetTemperature( + c_device, NVML_TEMPERATURE_GPU, byref(c_temperature) + ) + ) + temperature = c_temperature.value + + _nvml_check_error( + _nvml_lib.nvmlDeviceGetMemoryInfo(c_device, byref(c_memory_info)) + ) + fb_total_mem = c_memory_info.total + fb_free_mem = c_memory_info.free + fb_used_mem = c_memory_info.used + else: + import defusedxml + + proc = subprocess.Popen( + ["/usr/lib/wsl/lib/nvidia-smi", "-q", f"--id={dev_index}", "-x"], + stdout=subprocess.PIPE, + ) + proc.wait() + xml_result = defusedxml.ElementTree.fromstring(proc.stdout.read()) + gpu_node = xml_result.find("gpu") + + fb_node = gpu_node.find("fb_memory_usage") + fb_total_mem = int(parse_readable_size(fb_node.find("total").text)[0]) + fb_free_mem = int(parse_readable_size(fb_node.find("free").text)[0]) + fb_used_mem = int(parse_readable_size(fb_node.find("used").text)[0]) + + util_node = gpu_node.find("utilization") + if util_node.find("gpu_util").text == "N/A": + gpu_util = 0 + else: + gpu_util = int(util_node.find("gpu_util")) + if util_node.find("memory_util").text == "N/A": + mem_util = 0 + else: + mem_util = int(util_node.find("memory_util")) + + temperature = int(gpu_node.find("temperature").find("gpu_temp").text[:-1]) + + return _nvml_device_status( + gpu_util=gpu_util, + mem_util=mem_util, + temperature=temperature, + fb_total_mem=fb_total_mem, + fb_free_mem=fb_free_mem, + fb_used_mem=fb_used_mem, + ) + + +def get_handle_by_index(index: int) -> _nvmlDevice_t: + _init_nvml() + if _nvml_lib is None: + return None + + c_index = c_int(index) + device = _nvmlDevice_t() + _nvml_check_error(_nvml_lib.nvmlDeviceGetHandleByIndex_v2(c_index, byref(device))) + return device + + +def get_handle_by_uuid(uuid: bytes) -> _nvmlDevice_t: + _init_nvml() + if _nvml_lib is None: + return None + + c_uuid = c_char_p(uuid) + device = _nvmlDevice_t() + _nvml_check_error(_nvml_lib.nvmlDeviceGetHandleByUUID(c_uuid, byref(device))) + return device + + +def get_mig_mode(device: _nvmlDevice_t) -> Tuple[int, int]: + _init_nvml() + if _nvml_lib is None: + return None + + c_current_mode, c_pending_mode = c_uint(), c_uint() + _nvml_check_error( + _nvml_lib.nvmlDeviceGetMigMode( + device, byref(c_current_mode), byref(c_pending_mode) + ) + ) + return c_current_mode.value, c_pending_mode.value + + +def get_max_mig_device_count(device: _nvmlDevice_t) -> int: + _init_nvml() + if _nvml_lib is None: + return None + + c_count = c_uint() + _nvml_check_error(_nvml_lib.nvmlDeviceGetMaxMigDeviceCount(device, byref(c_count))) + return c_count.value + + +def get_mig_device_handle_by_index(device: _nvmlDevice_t, index: int) -> _nvmlDevice_t: + _init_nvml() + if _nvml_lib is None: + return None + + c_index = c_uint(index) + mig_device = _nvmlDevice_t() + _nvml_check_error( + _nvml_lib.nvmlDeviceGetMigDeviceHandleByIndex( + device, c_index, byref(mig_device) + ) + ) + return mig_device + + +def get_index(handle: _nvmlDevice_t) -> int: + _init_nvml() + if _nvml_lib is None: + return None + + c_index = c_uint() + _nvml_check_error(_nvml_lib.nvmlDeviceGetIndex(handle, byref(c_index))) + return c_index.value + + +def get_uuid(handle: _nvmlDevice_t) -> bytes: + _init_nvml() + if _nvml_lib is None: + return None + + c_uuid = 
create_string_buffer(NVML_DEVICE_UUID_V2_BUFFER_SIZE) + _nvml_check_error( + _nvml_lib.nvmlDeviceGetUUID( + handle, c_uuid, c_uint(NVML_DEVICE_UUID_V2_BUFFER_SIZE) + ) + ) + return c_uuid.value + + +def get_index_and_uuid(device: Union[int, bytes, str]) -> CudaDeviceInfo: + _init_nvml() + if _nvml_lib is None: + return None + + try: + device_index = int(device) + device_handle = get_handle_by_index(device_index) + uuid = get_uuid(device_handle) + except ValueError: + uuid = device if isinstance(device, bytes) else device.encode() + uuid_handle = get_handle_by_uuid(uuid) + device_index = get_index(uuid_handle) + uuid = get_uuid(uuid_handle) + + return CudaDeviceInfo(uuid=uuid, device_index=device_index) + + +def get_compute_running_processes(handle: _nvmlDevice_t) -> List[nvmlFriendlyObject]: + _init_nvml() + if _nvml_lib is None: + return None + + c_count = c_uint(0) + func = getattr(_nvml_lib, "nvmlDeviceGetComputeRunningProcesses_v3", None) + if func is None: + func = getattr(_nvml_lib, "nvmlDeviceGetComputeRunningProcesses_v2") + ret = func(handle, byref(c_count), None) + + if ret == NVML_SUCCESS: + # special case, no running processes + return [] + elif ret == NVML_ERROR_INSUFFICIENT_SIZE: + # typical case + # oversize the array in case more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = _nvmlProcessInfo_t * c_count.value + c_procs = proc_array() + + _nvml_check_error(func(handle, byref(c_count), c_procs)) + + procs = [] + for i in range(c_count.value): + # use an alternative struct for this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value: + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + + return procs + else: + # error case + _nvml_check_error(ret) + + +def _running_process_matches(handle: _nvmlDevice_t) -> bool: + """Check whether the current process is same as that of handle + Parameters + ---------- + handle : _nvmlDevice_t + NVML handle to CUDA device + Returns + ------- + out : bool + Whether the device handle has a CUDA context on the running process. 
+ """ + return any(os.getpid() == o.pid for o in get_compute_running_processes(handle)) + + +def get_cuda_context() -> CudaContext: + """Check whether the current process already has a CUDA context created.""" + + _init() + if _init_pid is None: + return CudaContext(has_context=False) + + for index in range(_get_all_device_count()): + handle = get_handle_by_index(index) + try: + mig_current_mode, mig_pending_mode = get_mig_mode(handle) + except NVMLAPIError as e: + if e.errno == NVML_ERROR_NOT_SUPPORTED: + mig_current_mode = NVML_DEVICE_MIG_DISABLE + else: + raise + if mig_current_mode == NVML_DEVICE_MIG_ENABLE: + for mig_index in range(get_max_mig_device_count(handle)): + try: + mig_handle = get_mig_device_handle_by_index(handle, mig_index) + except NVMLAPIError as e: + if e.errno == NVML_ERROR_NOT_FOUND: + # No MIG device with that index + continue + else: + raise + if _running_process_matches(mig_handle): + return CudaContext( + has_context=True, + device_info=CudaDeviceInfo( + uuid=get_uuid(handle), + device_index=index, + mig_index=mig_index, + ), + ) + else: + if _running_process_matches(handle): + return CudaContext( + has_context=True, + device_info=CudaDeviceInfo( + uuid=get_uuid(handle), device_index=index + ), + ) + + return CudaContext(has_context=False) diff --git a/python/xorbits/_mars/lib/ordered_set.pyx b/python/xorbits/_mars/lib/ordered_set.pyx new file mode 100644 index 000000000..0f9055c74 --- /dev/null +++ b/python/xorbits/_mars/lib/ordered_set.pyx @@ -0,0 +1,517 @@ +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 +# cython: annotate = True +# Copy from https://github.com/simonpercivall/orderedset/blob/master/lib/orderedset/_orderedset.pyx +import sys + +if sys.version_info[0] == 2: + from itertools import izip + from collections import Set, MutableSet, Iterable +else: + izip = zip + from collections.abc import Set, MutableSet, Iterable + +from cpython cimport PyDict_Contains, PyIndex_Check + + +cdef extern from "Python.h": + int PySlice_GetIndicesEx(slice, ssize_t length, ssize_t *start, + ssize_t *stop, ssize_t *step, ssize_t *slicelength) except -1 + + +__all__ = ["OrderedSet"] + + +cdef class entry: + cdef object key + cdef entry prev + cdef entry next + + +cdef inline void _add(_OrderedSet oset, object key): + cdef entry end = oset.end + cdef dict map = oset.map + cdef entry next + + if not PyDict_Contains(map, key): + next = entry() + next.key, next.prev, next.next = key, end.prev, end + end.prev.next = end.prev = map[key] = next + oset.os_used += 1 + + +cdef void _discard(_OrderedSet oset, object key): + cdef dict map = oset.map + cdef entry _entry + + if PyDict_Contains(map, key): + _entry = map.pop(key) + _entry.prev.next = _entry.next + _entry.next.prev = _entry.prev + oset.os_used -= 1 + + +cdef inline object _isorderedsubset(seq1, seq2): + if not len(seq1) <= len(seq2): + return False + for self_elem, other_elem in izip(seq1, seq2): + if not self_elem == other_elem: + return False + return True + + +cdef class OrderedSetIterator(object): + cdef _OrderedSet oset + cdef entry curr + cdef ssize_t si_used + + def __cinit__(self, _OrderedSet oset): + self.oset = oset + self.curr = oset.end + self.si_used = oset.os_used + + def __iter__(self): + return self + + def __next__(self): + cdef entry item + + if self.si_used != self.oset.os_used: + # make this state sticky + self.si_used = -1 + raise RuntimeError('%s changed size during iteration' % type(self.oset).__name__) + + item = self.curr.next + if item == self.oset.end: 
+ raise StopIteration() + self.curr = item + return item.key + + +cdef class OrderedSetReverseIterator(object): + cdef _OrderedSet oset + cdef entry curr + cdef ssize_t si_used + + def __cinit__(self, _OrderedSet oset): + self.oset = oset + self.curr = oset.end + self.si_used = oset.os_used + + def __iter__(self): + return self + + def __next__(self): + cdef entry item + + if self.si_used != self.oset.os_used: + # make this state sticky + self.si_used = -1 + raise RuntimeError('%s changed size during iteration' % type(self.oset).__name__) + + item = self.curr.prev + if item is self.oset.end: + raise StopIteration() + self.curr = item + return item.key + + +cdef class _OrderedSet(object): + cdef dict map + cdef entry end + cdef ssize_t os_used + + def __cinit__(self): + self.map = {} + self.os_used = 0 + self.end = end = entry() + end.prev = end.next = end + + def __init__(self, object iterable=None): + cdef dict map = self.map + cdef entry end = self.end + cdef entry next + + if iterable is not None: + for elem in iterable: + if not PyDict_Contains(map, elem): + next = entry() + next.key, next.prev, next.next = elem, end.prev, end + end.prev.next = end.prev = map[elem] = next + self.os_used += 1 + + @classmethod + def _from_iterable(cls, it): + return cls(it) + + ## + # set methods + ## + cpdef add(self, elem): + """Add element `elem` to the set.""" + _add(self, elem) + + cpdef discard(self, elem): + """Remove element `elem` from the ``OrderedSet`` if it is present.""" + _discard(self, elem) + + cpdef pop(self, last=True): + """Remove last element. Raises ``KeyError`` if the ``OrderedSet`` is empty.""" + if not self: + raise KeyError('OrderedSet is empty') + key = self.end.prev.key if last else self.end.next.key + _discard(self, key) + return key + + def remove(self, elem): + """ + Remove element `elem` from the ``set``. + Raises :class:`KeyError` if `elem` is not contained in the set. + """ + if elem not in self: + raise KeyError(elem) + _discard(self, elem) + + def clear(self): + """Remove all elements from the `set`.""" + cdef entry end = self.end + end.next.prev = end.next = None + + # reinitialize + self.map = {} + self.os_used = 0 + self.end = end = entry() + end.prev = end.next = end + + def copy(self): + """ + :rtype: OrderedSet + :return: a new ``OrderedSet`` with a shallow copy of self. + """ + return self._from_iterable(self) + + def difference(self, other): + """``OrderedSet - other`` + + :rtype: OrderedSet + :return: a new ``OrderedSet`` with elements in the set that are not in the others. + """ + return self - other + + def difference_update(self, other): + """``OrderedSet -= other`` + + Update the ``OrderedSet``, removing elements found in others. + """ + self -= other + + def __sub__(self, other): + """ + :rtype: OrderedSet + """ + ostyp = type(self if isinstance(self, OrderedSet) else other) + + if not isinstance(self, Iterable): + return NotImplemented + if not isinstance(other, Set): + if not isinstance(other, Iterable): + return NotImplemented + other = ostyp._from_iterable(other) + + return ostyp._from_iterable(value for value in self if value not in other) + + def __isub__(self, other): + if other is self: + self.clear() + else: + for value in other: + self.discard(value) + return self + + def intersection(self, other): + """``OrderedSet & other`` + + :rtype: OrderedSet + :return: a new ``OrderedSet`` with elements common to the set and all others. 
+ """ + return self & other + + def intersection_update(self, other): + """``OrderedSet &= other`` + + Update the ``OrderedSet``, keeping only elements found in it and all others. + """ + self &= other + + def __and__(self, other): + """ + :rtype: OrderedSet + """ + ostyp = type(self if isinstance(self, OrderedSet) else other) + + if not isinstance(self, Iterable): + return NotImplemented + if not isinstance(other, Set): + if not isinstance(other, Iterable): + return NotImplemented + other = ostyp._from_iterable(other) + + return ostyp._from_iterable(value for value in self if value in other) + + def __iand__(self, it): + for value in (self - it): + self.discard(value) + return self + + def isdisjoint(self, other): + """ + Return True if the set has no elements in common with other. + Sets are disjoint if and only if their intersection is the empty set. + + :rtype: bool + """ + for value in other: + if value in self: + return False + return True + + def issubset(self, other): + """``OrderedSet <= other`` + + :rtype: bool + + Test whether the ``OrderedSet`` is a proper subset of other, that is, + ``OrderedSet <= other and OrderedSet != other``. + """ + return self <= other + + def issuperset(self, other): + """``OrderedSet >= other`` + + :rtype: bool + + Test whether every element in other is in the set. + """ + return other <= self + + def isorderedsubset(self, other): + return _isorderedsubset(self, other) + + def isorderedsuperset(self, other): + return _isorderedsubset(other, self) + + def symmetric_difference(self, other): + """``OrderedSet ^ other`` + + :rtype: OrderedSet + :return: a new ``OrderedSet`` with elements in either the set or other but not both. + """ + return self ^ other + + def symmetric_difference_update(self, other): + """``OrderedSet ^= other`` + + Update the ``OrderedSet``, keeping only elements found in either set, but not in both. + """ + self ^= other + + def __xor__(self, other): + """ + :rtype: OrderedSet + """ + if not isinstance(self, Iterable): + return NotImplemented + if not isinstance(other, Iterable): + return NotImplemented + + return (self - other) | (other - self) + + def __ixor__(self, other): + if other is self: + self.clear() + else: + if not isinstance(other, Set): + other = self._from_iterable(other) + for value in other: + if value in self: + self.discard(value) + else: + self.add(value) + return self + + def union(self, other): + """``OrderedSet | other`` + + :rtype: OrderedSet + :return: a new ``OrderedSet`` with elements from the set and all others. + """ + return self | other + + def update(self, other): + """``OrderedSet |= other`` + + Update the ``OrderedSet``, adding elements from all others. + """ + self |= other + + def __or__(self, other): + """ + :rtype: OrderedSet + """ + ostyp = type(self if isinstance(self, OrderedSet) else other) + + if not isinstance(self, Iterable): + return NotImplemented + if not isinstance(other, Iterable): + return NotImplemented + chain = (e for s in (self, other) for e in s) + return ostyp._from_iterable(chain) + + def __ior__(self, other): + for elem in other: + _add(self, elem) + return self + + ## + # list methods + ## + def index(self, elem): + """Return the index of `elem`. 
Rases :class:`ValueError` if not in the OrderedSet.""" + if elem not in self: + raise ValueError("%s is not in %s" % (elem, type(self).__name__)) + cdef entry curr = self.end.next + cdef ssize_t index = 0 + while curr.key != elem: + curr = curr.next + index += 1 + return index + + cdef _getslice(self, slice item): + cdef ssize_t start, stop, step, slicelength, place, i + cdef entry curr + cdef _OrderedSet result + PySlice_GetIndicesEx(item, len(self), &start, &stop, &step, &slicelength) + + result = type(self)() + place = start + curr = self.end + + if slicelength <= 0: + pass + elif step > 0: + # normal forward slice + i = 0 + while slicelength > 0: + while i <= place: + curr = curr.next + i += 1 + _add(result, curr.key) + place += step + slicelength -= 1 + else: + # we're going backwards + i = len(self) + while slicelength > 0: + while i > place: + curr = curr.prev + i -= 1 + _add(result, curr.key) + place += step + slicelength -= 1 + return result + + cdef _getindex(self, ssize_t index): + cdef ssize_t _len = len(self) + if index >= _len or (index < 0 and abs(index) > _len): + raise IndexError("list index out of range") + + cdef entry curr + if index >= 0: + curr = self.end.next + while index: + curr = curr.next + index -= 1 + else: + index = abs(index) - 1 + curr = self.end.prev + while index: + curr = curr.prev + index -= 1 + return curr.key + + def __getitem__(self, item): + """Return the `elem` at `index`. Raises :class:`IndexError` if `index` is out of range.""" + if isinstance(item, slice): + return self._getslice(item) + if not PyIndex_Check(item): + raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(item))) + return self._getindex(item) + + ## + # sequence methods + ## + def __len__(self): + return len(self.map) + + def __contains__(self, elem): + return elem in self.map + + def __iter__(self): + return OrderedSetIterator(self) + + def __reversed__(self): + return OrderedSetReverseIterator(self) + + def __reduce__(self): + items = list(self) + inst_dict = vars(self).copy() + return self.__class__, (items, ), inst_dict + + +class OrderedSet(_OrderedSet, MutableSet): + """ + An ``OrderedSet`` object is an ordered collection of distinct hashable objects. + + It works like the :class:`set` type, but remembers insertion order. + + It also supports :meth:`__getitem__` and :meth:`index`, like the + :class:`list` type. 
+ """ + def __repr__(self): + if not self: + return '%s()' % (self.__class__.__name__,) + return '%s(%r)' % (self.__class__.__name__, list(self)) + + def __eq__(self, other): + if isinstance(other, (_OrderedSet, list)): + return len(self) == len(other) and list(self) == list(other) + elif isinstance(other, Set): + return set(self) == set(other) + return NotImplemented + + def __le__(self, other): + if isinstance(other, Set): + return len(self) <= len(other) and set(self) <= set(other) + elif isinstance(other, list): + return len(self) <= len(other) and list(self) <= list(other) + return NotImplemented + + def __lt__(self, other): + if isinstance(other, Set): + return len(self) < len(other) and set(self) < set(other) + elif isinstance(other, list): + return len(self) < len(other) and list(self) < list(other) + return NotImplemented + + def __ge__(self, other): + ret = self < other + if ret is NotImplemented: + return ret + return not ret + + def __gt__(self, other): + ret = self <= other + if ret is NotImplemented: + return ret + return not ret diff --git a/python/xorbits/_mars/lib/sparse/__init__.py b/python/xorbits/_mars/lib/sparse/__init__.py new file mode 100644 index 000000000..a7fa00d1b --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/__init__.py @@ -0,0 +1,859 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
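Editor's note: the ordered_set.pyx hunk above provides an OrderedSet that behaves like a set but keeps insertion order and additionally supports list-style index() and __getitem__. A short sketch of that behaviour, assuming the Cython module builds as xorbits._mars.lib.ordered_set (the import path is an assumption for illustration):

# Illustrative only -- import path assumed from the vendored file location.
from xorbits._mars.lib.ordered_set import OrderedSet

s = OrderedSet([3, 1, 2, 1])                     # duplicates collapse, first-insertion order kept
assert list(s) == [3, 1, 2]
s.add(5)
assert s.index(5) == 3                           # list-like position lookup
assert s[1] == 1 and list(s[1:3]) == [1, 2]      # integer and slice indexing
assert (s & {1, 2, 9}) == OrderedSet([1, 2])     # set algebra returns an OrderedSet
assert s.pop() == 5                              # pop() removes the last element by default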
+ +import builtins +import operator +from collections.abc import Iterable +from functools import partial, reduce + +from .array import SparseNDArray, call_sparse +from .core import get_sparse_module, issparse +from .matrix import SparseMatrix +from .vector import SparseVector + + +def asarray(x, shape=None): + from .core import issparse + + if issparse(x): + return SparseNDArray(x, shape=shape) + + return x + + +def add(a, b, **_): + try: + return a + b + except TypeError: + if hasattr(b, "__radd__"): + return b.__radd__(a) + raise + + +def subtract(a, b, **_): + try: + return a - b + except TypeError: + if hasattr(b, "__rsub__"): + return b.__rsub__(a) + raise + + +def multiply(a, b, **_): + try: + return a * b + except TypeError: + if hasattr(b, "__rmul__"): + return b.__rmul__(a) + raise + + +def divide(a, b, **_): + try: + return a / b + except TypeError: + if hasattr(b, "__rdiv__"): + return b.__rdiv__(a) + raise + + +def true_divide(a, b, **_): + try: + return a / b + except TypeError: + if hasattr(b, "__rtruediv__"): + return b.__rtruediv__(a) + raise + + +def floor_divide(a, b, **_): + try: + return a // b + except TypeError: + if hasattr(b, "__rfloordiv__"): + return b.__rfloordiv__(a) + raise + + +def power(a, b, **_): + try: + return a**b + except TypeError: + if hasattr(b, "__rpow__"): + return b.__rpow__(a) + raise + + +def mod(a, b, **_): + try: + return a % b + except TypeError: + if hasattr(b, "__rmod__"): + return b.__rmod__(a) + raise + + +def _call_bin(method, a, b, **kwargs): + from .core import cp, get_array_module, issparse + + # order does not take effect for sparse + kwargs.pop("order", None) + if hasattr(a, method): + res = getattr(a, method)(b, **kwargs) + elif get_array_module(a).isscalar(a): + res = call_sparse(method, a, b, **kwargs) + else: + assert get_array_module(a) == get_array_module(b) + xp = get_array_module(a) + try: + res = getattr(xp, method)(a, b, **kwargs) + except TypeError: + if xp is cp and issparse(b): + res = getattr(xp, method)(a, b.toarray(), **kwargs) + else: + raise + + if res is NotImplemented: + raise NotImplementedError + + return res + + +def _call_unary(method, x, *args, **kwargs): + from .core import get_array_module + + # order does not take effect for sparse + kwargs.pop("order", None) + if hasattr(x, method): + res = getattr(x, method)(*args, **kwargs) + else: + xp = get_array_module(x) + res = getattr(xp, method)(x, *args, **kwargs) + + if res is NotImplemented: + raise NotImplementedError + + return res + + +def float_power(a, b, **kw): + return _call_bin("float_power", a, b, **kw) + + +def fmod(a, b, **kw): + return _call_bin("fmod", a, b, **kw) + + +def logaddexp(a, b, **kw): + return _call_bin("logaddexp", a, b, **kw) + + +def logaddexp2(a, b, **kw): + return _call_bin("logaddexp2", a, b, **kw) + + +def negative(x, **_): + return -x + + +def positive(x, **_): + return operator.pos(x) + + +def absolute(x, **_): + return builtins.abs(x) + + +abs = absolute + + +fabs = partial(_call_unary, "fabs") + + +def rint(x, **kw): + return _call_unary("rint", x, **kw) + + +def sign(x, **kw): + return _call_unary("sign", x, **kw) + + +def conj(x, **kw): + return _call_unary("conj", x, **kw) + + +def exp(x, **kw): + return _call_unary("exp", x, **kw) + + +def exp2(x, **kw): + return _call_unary("exp2", x, **kw) + + +def log(x, **kw): + return _call_unary("log", x, **kw) + + +def log2(x, **kw): + return _call_unary("log2", x, **kw) + + +def log10(x, **kw): + return _call_unary("log10", x, **kw) + + +def expm1(x, **kw): + return 
_call_unary("expm1", x, **kw) + + +def log1p(x, **kw): + return _call_unary("log1p", x, **kw) + + +def sqrt(x, **kw): + return _call_unary("sqrt", x, **kw) + + +def square(x, **kw): + return _call_unary("square", x, **kw) + + +def cbrt(x, **kw): + return _call_unary("cbrt", x, **kw) + + +def reciprocal(x, **kw): + return _call_unary("reciprocal", x, **kw) + + +gamma = partial(_call_unary, "gamma") +gammaln = partial(_call_unary, "gammaln") +loggamma = partial(_call_unary, "loggamma") +gammasgn = partial(_call_unary, "gammasgn") +gammainc = partial(_call_bin, "gammainc") +gammaincinv = partial(_call_bin, "gammaincinv") +gammaincc = partial(_call_bin, "gammaincc") +gammainccinv = partial(_call_bin, "gammainccinv") +beta = partial(_call_bin, "beta") +betaln = partial(_call_bin, "betaln") +betainc = partial(call_sparse, "betainc") +betaincinv = partial(call_sparse, "betaincinv") +psi = partial(_call_unary, "psi") +rgamma = partial(_call_unary, "rgamma") +polygamma = partial(_call_bin, "polygamma") +multigammaln = partial(_call_bin, "multigammaln") +digamma = partial(_call_unary, "digamma") +poch = partial(_call_bin, "poch") + +entr = partial(_call_unary, "entr") +rel_entr = partial(_call_bin, "rel_entr") +kl_div = partial(_call_bin, "kl_div") + +xlogy = partial(_call_bin, "xlogy") + +erf = partial(_call_unary, "erf") +erfc = partial(_call_unary, "erfc") +erfcx = partial(_call_unary, "erfcx") +erfi = partial(_call_unary, "erfi") +erfinv = partial(_call_unary, "erfinv") +erfcinv = partial(_call_unary, "erfcinv") +wofz = partial(_call_unary, "wofz") +dawsn = partial(_call_unary, "dawsn") +voigt_profile = partial(call_sparse, "voigt_profile") + +jv = partial(_call_bin, "jv") +jve = partial(_call_bin, "jve") +yn = partial(_call_bin, "yn") +yv = partial(_call_bin, "yv") +yve = partial(_call_bin, "yve") +kn = partial(_call_bin, "kn") +kv = partial(_call_bin, "kv") +kve = partial(_call_bin, "kve") +iv = partial(_call_bin, "iv") +ive = partial(_call_bin, "ive") +hankel1 = partial(_call_bin, "hankel1") +hankel1e = partial(_call_bin, "hankel1e") +hankel2 = partial(_call_bin, "hankel2") +hankel2e = partial(_call_bin, "hankel2e") + +hyp2f1 = partial(call_sparse, "hyp2f1") +hyp1f1 = partial(call_sparse, "hyp1f1") +hyperu = partial(call_sparse, "hyperu") +hyp0f1 = partial(_call_bin, "hyp0f1") + +ellip_harm = partial(call_sparse, "ellip_harm") +ellip_harm_2 = partial(call_sparse, "ellip_harm_2") +ellip_normal = partial(call_sparse, "ellip_normal") + +ellipk = partial(_call_unary, "ellipk") +ellipkm1 = partial(_call_unary, "ellipkm1") +ellipkinc = partial(_call_bin, "ellipkinc") +ellipe = partial(_call_unary, "ellipe") +ellipeinc = partial(_call_bin, "ellipeinc") +elliprc = partial(_call_bin, "elliprc") +elliprd = partial(call_sparse, "elliprd") +elliprf = partial(call_sparse, "elliprf") +elliprg = partial(call_sparse, "elliprg") +elliprj = partial(call_sparse, "elliprj") + +airy = partial(_call_unary, "airy") +airye = partial(_call_unary, "airye") +itairy = partial(_call_unary, "itairy") + + +def equal(a, b, **_): + try: + return a == b + except TypeError: + return b == a + + +def not_equal(a, b, **_): + try: + return a != b + except TypeError: + return b != a + + +def less(a, b, **_): + try: + return a < b + except TypeError: + return b > a + + +def less_equal(a, b, **_): + try: + return a <= b + except TypeError: + return b >= a + + +def greater(a, b, **_): + try: + return a > b + except TypeError: + return b < a + + +def greater_equal(a, b, **_): + try: + return a >= b + except TypeError: + return b <= a + 
+ +def logical_and(a, b, **kw): + return _call_bin("logical_and", a, b, **kw) + + +def logical_or(a, b, **kw): + return _call_bin("logical_or", a, b, **kw) + + +def logical_xor(a, b, **kw): + return _call_bin("logical_xor", a, b, **kw) + + +def logical_not(x, **kw): + return _call_unary("logical_not", x, **kw) + + +def isclose(a, b, **kw): + return _call_bin("isclose", a, b, **kw) + + +def bitwise_and(a, b, **_): + try: + return a & b + except TypeError: + return b & a + + +def bitwise_or(a, b, **_): + try: + return a | b + except TypeError: + return b | a + + +def bitwise_xor(a, b, **_): + try: + return operator.xor(a, b) + except TypeError: + return operator.xor(b, a) + + +def invert(x, **_): + return ~x + + +def left_shift(a, b, **_): + return a << b + + +def right_shift(a, b, **_): + return a >> b + + +def sin(x, **kw): + return _call_unary("sin", x, **kw) + + +def cos(x, **kw): + return _call_unary("cos", x, **kw) + + +def tan(x, **kw): + return _call_unary("tan", x, **kw) + + +def arcsin(x, **kw): + return _call_unary("arcsin", x, **kw) + + +def arccos(x, **kw): + return _call_unary("arccos", x, **kw) + + +def arctan(x, **kw): + return _call_unary("arctan", x, **kw) + + +def arctan2(a, b, **kw): + return _call_bin("arctan2", a, b, **kw) + + +def hypot(a, b, **kw): + return _call_bin("hypot", a, b, **kw) + + +def sinh(x, **kw): + return _call_unary("sinh", x, **kw) + + +def cosh(x, **kw): + return _call_unary("cosh", x, **kw) + + +def tanh(x, **kw): + return _call_unary("tanh", x, **kw) + + +def arcsinh(x, **kw): + return _call_unary("arcsinh", x, **kw) + + +def arccosh(x, **kw): + return _call_unary("arccosh", x, **kw) + + +def around(x, **kw): + return _call_unary("around", x, **kw) + + +def arctanh(x, **kw): + return _call_unary("arctanh", x, **kw) + + +def deg2rad(x, **kw): + return _call_unary("deg2rad", x, **kw) + + +def rad2deg(x, **kw): + return _call_unary("rad2deg", x, **kw) + + +def angle(x, **kw): + return _call_unary("angle", x, **kw) + + +def isinf(x, **kw): + return _call_unary("isinf", x, **kw) + + +def isnan(x, **kw): + return _call_unary("isnan", x, **kw) + + +def signbit(x, **kw): + return _call_unary("signbit", x, **kw) + + +def dot(a, b, sparse=True, **_): + from .core import issparse + + if not issparse(a): + ret = a.dot(b) + if not sparse: + return ret + else: + xps = get_sparse_module(ret) + return SparseNDArray(xps.csr_matrix(ret), shape=ret.shape) + + return a.dot(b, sparse=sparse) + + +def tensordot(a, b, axes=2, sparse=True): + if isinstance(axes, Iterable): + a_axes, b_axes = axes + else: + a_axes = tuple(range(a.ndim - 1, a.ndim - axes - 1, -1)) + b_axes = tuple(range(0, axes)) + + if isinstance(a_axes, Iterable): + a_axes = tuple(a_axes) + else: + a_axes = (a_axes,) + if isinstance(b_axes, Iterable): + b_axes = tuple(b_axes) + else: + b_axes = (b_axes,) + + if a_axes == (a.ndim - 1,) and b_axes == (b.ndim - 2,): + return dot(a, b, sparse=sparse) + + if a.ndim == b.ndim == 2: + if a_axes == (a.ndim - 1,) and b_axes == (b.ndim - 1,): + # inner product of multiple dims + return dot(a, b.T, sparse=sparse) + + if a.ndim == 1 or b.ndim == 1: + return dot(a, b, sparse=sparse) + + raise NotImplementedError + + +def matmul(a, b, sparse=True, **_): + return dot(a, b, sparse=sparse) + + +def concatenate(tensors, axis=0): + return reduce(lambda a, b: _call_bin("concatenate", a, b, axis=axis), tensors) + + +def transpose(tensor, axes=None): + return _call_unary("transpose", tensor, axes=axes) + + +def swapaxes(tensor, axis1, axis2): + return _call_unary("swapaxes", 
tensor, axis1, axis2) + + +def sum(tensor, axis=None, **kw): + return _call_unary("sum", tensor, axis=axis, **kw) + + +def prod(tensor, axis=None, **kw): + return _call_unary("prod", tensor, axis=axis, **kw) + + +def amax(tensor, axis=None, **kw): + return _call_unary("amax", tensor, axis=axis, **kw) + + +max = amax + + +def amin(tensor, axis=None, **kw): + return _call_unary("amin", tensor, axis=axis, **kw) + + +min = amin + + +def all(tensor, axis=None, **kw): + return _call_unary("all", tensor, axis=axis, **kw) + + +def any(tensor, axis=None, **kw): + return _call_unary("any", tensor, axis=axis, **kw) + + +def mean(tensor, axis=None, **kw): + return _call_unary("mean", tensor, axis=axis, **kw) + + +def nansum(tensor, axis=None, **kw): + return _call_unary("nansum", tensor, axis=axis, **kw) + + +def nanprod(tensor, axis=None, **kw): + return _call_unary("nanprod", tensor, axis=axis, **kw) + + +def nanmax(tensor, axis=None, **kw): + return _call_unary("nanmax", tensor, axis=axis, **kw) + + +def nanmin(tensor, axis=None, **kw): + return _call_unary("nanmin", tensor, axis=axis, **kw) + + +def argmax(tensor, axis=None, **kw): + return _call_unary("argmax", tensor, axis=axis, **kw) + + +def nanargmax(tensor, axis=None, **kw): + return _call_unary("nanargmax", tensor, axis=axis, **kw) + + +def argmin(tensor, axis=None, **kw): + return _call_unary("argmin", tensor, axis=axis, **kw) + + +def nanargmin(tensor, axis=None, **kw): + return _call_unary("nanargmin", tensor, axis=axis, **kw) + + +def var(tensor, axis=None, **kw): + return _call_unary("var", tensor, axis=axis, **kw) + + +def cumsum(tensor, axis=None, **kw): + return _call_unary("cumsum", tensor, axis=axis, **kw) + + +def cumprod(tensor, axis=None, **kw): + return _call_unary("cumprod", tensor, axis=axis, **kw) + + +def nancumsum(tensor, axis=None, **kw): + return _call_unary("nancumsum", tensor, axis=axis, **kw) + + +def nancumprod(tensor, axis=None, **kw): + return _call_unary("nancumprod", tensor, axis=axis, **kw) + + +def count_nonzero(tensor, axis=None, **kw): + return _call_unary("count_nonzero", tensor, axis=axis, **kw) + + +def maximum(a, b, **kw): + return _call_bin("maximum", a, b, **kw) + + +def minimum(a, b, **kw): + return _call_bin("minimum", a, b, **kw) + + +def fmax(a, b, **kw): + return _call_bin("fmax", a, b, **kw) + + +def fmin(a, b, **kw): + return _call_bin("fmin", a, b, **kw) + + +def floor(x, **kw): + return _call_unary("floor", x, **kw) + + +def ceil(x, **kw): + return _call_unary("ceil", x, **kw) + + +def trunc(x, **kw): + return _call_unary("trunc", x, **kw) + + +def degrees(x, **kw): + return _call_unary("degrees", x, **kw) + + +def radians(x, **kw): + return _call_unary("radians", x, **kw) + + +def clip(a, a_max, a_min, **kw): + from .core import get_array_module + + if hasattr(a, "clip"): + res = getattr(a, "clip")(a_max, a_min, **kw) + else: + xp = get_array_module(a) + res = getattr(xp, "clip")(a, a_max, a_min, **kw) + + if res is NotImplemented: + raise NotImplementedError + + return res + + +def iscomplex(x, **kw): + return _call_unary("iscomplex", x, **kw) + + +def real(x, **_): + return x.real + + +def imag(x, **_): + return x.imag + + +def fix(x, **kw): + return _call_unary("fix", x, **kw) + + +def i0(x, **kw): + return _call_unary("i0", x, **kw) + + +def nan_to_num(x, **kw): + return _call_unary("nan_to_num", x, **kw) + + +def copysign(a, b, **kw): + return _call_bin("copysign", a, b, **kw) + + +def nextafter(a, b, **kw): + return _call_bin("nextafter", a, b, **kw) + + +def spacing(x, **kw): + return 
_call_unary("spacing", x, **kw) + + +def ldexp(a, b, **kw): + return _call_bin("ldexp", a, b, **kw) + + +def frexp(x, **kw): + return _call_unary("frexp", x, **kw) + + +def modf(x, **kw): + return _call_unary("modf", x, **kw) + + +def sinc(x, **kw): + return _call_unary("sinc", x, **kw) + + +def isfinite(x, **kw): + return _call_unary("isfinite", x, **kw) + + +def isreal(x, **kw): + return _call_unary("isreal", x, **kw) + + +def isfortran(x, **kw): + return call_sparse("isfortran", x, **kw) + + +def where(cond, x, y): + if any([i.ndim not in (0, 2) for i in (cond, x, y)]): + raise NotImplementedError + + from .matrix import where as matrix_where + + return matrix_where(cond, x, y) + + +def digitize(x, bins, right=False): + return _call_unary("digitize", x, bins, right) + + +def repeat(a, repeats, axis=None): + return _call_unary("repeat", a, repeats, axis=axis) + + +def fill_diagonal(a, val, wrap=False): + return _call_unary("fill_diagonal", a, val, wrap=wrap) + + +def unique(a, return_index=False, return_inverse=False, return_counts=False, axis=None): + return _call_unary( + "unique", + a, + return_index=return_index, + return_inverse=return_inverse, + return_counts=return_counts, + axis=axis, + ) + + +def zeros(shape, dtype=float, gpu=False): + if len(shape) == 2: + from .matrix import zeros_sparse_matrix + + return zeros_sparse_matrix(shape, dtype=dtype, gpu=gpu) + + raise NotImplementedError + + +def ones_like(x): + from .core import get_array_module + + return get_array_module(x).ones(x.shape) + + +def diag(v, k=0, gpu=False): + assert v.ndim in {1, 2} + + from .matrix import diag_sparse_matrix + + return diag_sparse_matrix(v, k=k, gpu=gpu) + + +def eye(N, M=None, k=0, dtype=float, gpu=False): + from .matrix import eye_sparse_matrix + + return eye_sparse_matrix(N, M=M, k=k, dtype=dtype, gpu=gpu) + + +def triu(m, k=0, gpu=False): + if m.ndim == 2: + from .matrix import triu_sparse_matrix + + return triu_sparse_matrix(m, k=k, gpu=gpu) + + raise NotImplementedError + + +def tril(m, k=0, gpu=False): + if m.ndim == 2: + from .matrix import tril_sparse_matrix + + return tril_sparse_matrix(m, k=k, gpu=gpu) + + raise NotImplementedError + + +def lu(m): + from .matrix import lu_sparse_matrix + + return lu_sparse_matrix(m) + + +def solve_triangular(a, b, lower=False, sparse=True): + from .matrix import solve_triangular_sparse_matrix + + return solve_triangular_sparse_matrix(a, b, lower=lower, sparse=sparse) + + +def block(arrs): + arr = arrs[0] + while isinstance(arr, list): + arr = arr[0] + if arr.ndim != 2: # pragma: no cover + raise NotImplementedError + + from .matrix import block + + return block(arrs) diff --git a/python/xorbits/_mars/lib/sparse/array.py b/python/xorbits/_mars/lib/sparse/array.py new file mode 100644 index 000000000..5f986657a --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/array.py @@ -0,0 +1,1603 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import partialmethod + +from ...utils import ceildiv +from .core import ( + cp, + cps, + get_array_module, + get_sparse_module, + is_cupy, + issparse, + naked, + np, +) + + +class SparseNDArray: + __slots__ = ("__weakref__",) + __array_priority__ = 21 + + def __new__(cls, *args, **kwargs): + shape = kwargs.get("shape", None) + if shape is not None and len(shape) == 1: + from .vector import SparseVector + + return object.__new__(SparseVector) + if len(args) == 1 and issparse(args[0]) and args[0].ndim == 2: + from .matrix import SparseMatrix + + return object.__new__(SparseMatrix) + + else: + if cls is not SparseNDArray: + return object.__new__(cls) + else: + raise ValueError( + f"The construct params of {cls.__name__} are invalid: " + f"args={args}, kwargs={kwargs}" + ) + + @property + def raw(self): + raise NotImplementedError + + +def call_sparse(method, *args, **kwargs): + new_args = [] + make_dense = False + matrix = None + for arg in args: + if hasattr(arg, "spmatrix"): + # todo add support for multiple sparse arrays + if make_dense or matrix is not None: + make_dense = True + matrix = arg + new_args.append(matrix.spmatrix.data) + else: + if isinstance(arg, np.ndarray): + make_dense = True + new_args.append(arg) + + spmatrix = matrix.spmatrix + if make_dense: + new_args = [arg.toarray() if hasattr(arg, "spmatrix") else arg for arg in args] + + xp = get_array_module(spmatrix) + try: + new_data = getattr(xp, method)(*new_args, **kwargs) + except AttributeError: + if xp is np: + from scipy import special + else: + from cupyx.scipy import special + new_data = getattr(special, method)(*new_args, **kwargs) + + if not make_dense: + new_spmatrix = get_sparse_module(spmatrix).csr_matrix( + (new_data, spmatrix.indices, spmatrix.indptr), spmatrix.shape + ) + else: + new_spmatrix = get_sparse_module(spmatrix).csr_matrix(new_data) + return SparseNDArray(new_spmatrix, shape=matrix.shape) + + +class SparseArray(SparseNDArray): + __slots__ = ("spmatrix",) + + @property + def ndim(self): + return len(self.shape) + + def tocsr(self): + return self + + def toarray(self): + if self.shape != self.spmatrix.shape: + return self.spmatrix.toarray().reshape(self.shape) + else: + return self.spmatrix.toarray() + + def todense(self): + return self.toarray() + + def ascupy(self): + is_cp = get_array_module(self.spmatrix) is cp + if is_cp: + return self + mat_tuple = ( + cp.asarray(self.data), + cp.asarray(self.indices), + cp.asarray(self.indptr), + ) + return SparseNDArray( + cps.csr_matrix(mat_tuple, shape=self.spmatrix.shape), shape=self.shape + ) + + def asscipy(self): + is_cp = get_array_module(self.spmatrix) is cp + if not is_cp: + return self + return SparseNDArray(self.spmatrix.get(), shape=self.shape) + + def __array__(self, dtype=None): + x = self.toarray() + if dtype and x.dtype != dtype: + return x.astype(dtype) + return x + + @property + def nbytes(self): + return ( + self.spmatrix.data.nbytes + + self.spmatrix.indptr.nbytes + + self.spmatrix.indices.nbytes + ) + + @property + def raw(self): + return self.spmatrix + + @property + def data(self): + return self.spmatrix.data + + @property + def indptr(self): + return self.spmatrix.indptr + + @property + def indices(self): + return self.spmatrix.indices + + @property + def nnz(self): + return self.spmatrix.nnz + + @property + def shape(self): + raise self.spmatrix.shape + + @property + def dtype(self): + return self.spmatrix.dtype + + def copy(self): + return SparseNDArray(self.spmatrix.copy(), shape=self.shape) + + @property + def 
real(self): + xps = get_sparse_module(self.spmatrix) + return SparseNDArray( + xps.csr_matrix( + (self.spmatrix.data.real, self.spmatrix.indices, self.spmatrix.indptr), + self.spmatrix.shape, + ), + shape=self.shape, + ) + + @real.setter + def real(self, r): + xps = get_sparse_module(self.spmatrix) + x = self.spmatrix.toarray() + if issparse(r): + r = r.toarray() + x.real = r + self.spmatrix = xps.csr_matrix(x) + + @property + def imag(self): + xps = get_sparse_module(self.spmatrix) + return SparseNDArray( + xps.csr_matrix( + (self.spmatrix.data.imag, self.spmatrix.indices, self.spmatrix.indptr), + self.spmatrix.shape, + ), + shape=self.shape, + ) + + @imag.setter + def imag(self, imag): + xps = get_sparse_module(self.spmatrix) + x = self.spmatrix.toarray() + if issparse(imag): + imag = imag.toarray() + x.imag = imag + self.spmatrix = xps.csr_matrix(x) + + def __getattr__(self, attr): + is_cp = get_array_module(self.spmatrix) is cp + if attr == "device" and is_cp: + try: + return self.spmatrix.device + except NotImplementedError: + return cp.cuda.Device(0) + if attr == "get" and is_cp: + return lambda: SparseNDArray(self.spmatrix.get(), shape=self.shape) + + return super().__getattribute__(attr) + + def __getstate__(self): + return self.spmatrix + + def __setstate__(self, state): + self.spmatrix = state + + def astype(self, dtype, **_): + dtype = np.dtype(dtype) + if self.dtype == dtype: + return self + return SparseNDArray(self.spmatrix.astype(dtype), shape=self.shape) + + def transpose(self, axes=None): + raise NotImplementedError + + def swapaxes(self, axis1, axis2): + if axis1 == 0 and axis2 == 1: + return self + + assert axis1 == 1 and axis2 == 0 + return self.transpose() + + def reshape(self, shape, **_): + sp_shape = shape if len(shape) == 2 else (1, shape[0]) + spmatrix = self.spmatrix.tolil().reshape(sp_shape) + return SparseNDArray(spmatrix, shape=shape) + + def broadcast_to(self, shape): + # TODO(jisheng): implement broadcast_to + raise NotImplementedError + + def squeeze(self, axis=None): + # TODO(jisheng): implement squeeze + raise NotImplementedError + + @property + def T(self): + raise NotImplementedError + + # ---------------- arithmetic ---------------------- + + def __add__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("add", self, naked_other) + if issparse(naked_other): + x = self.spmatrix + naked_other + else: + x = self.toarray() + naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __radd__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("add", naked_other, self) + if issparse(naked_other): + x = self.spmatrix + naked_other + else: + x = self.toarray() + naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __sub__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("subtract", self, naked_other) + if issparse(naked_other): + x = self.spmatrix - naked_other + else: + x = self.toarray() - naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + 
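+    # The arithmetic dunders in this class share one pattern: unwrap the other
+    # operand with naked(), route scalars through call_sparse() so the ufunc
+    # only touches the stored (nonzero) entries, use the underlying spmatrix
+    # for sparse-sparse cases, fall back to toarray() for dense operands, and
+    # re-wrap the result as a SparseNDArray whenever it is still sparse.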
+ def __rsub__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("subtract", naked_other, self) + if issparse(naked_other): + x = naked_other - self.spmatrix + else: + x = naked_other - self.toarray() + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __mul__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + if is_cupy(self.spmatrix): + if not cp.isscalar(naked_other): + # TODO(jisheng): cupy does not implement multiply method + is_other_sparse = issparse(naked_other) + if ( + is_other_sparse + and self.spmatrix.nnz == naked_other.nnz + and cp.all(self.spmatrix.indptr == naked_other.indptr) + and cp.all(self.spmatrix.indices == naked_other.indices) + ): + x = cps.csr_matrix( + ( + self.spmatrix.data * naked_other.data, + self.spmatrix.indices, + self.spmatrix.indptr, + ), + self.spmatrix.shape, + ) + else: + if is_other_sparse: + naked_other = other.toarray() + dense = self.spmatrix.toarray() + res = cp.multiply(dense, naked_other, out=dense) + x = cps.csr_matrix(res) + else: + x = self.spmatrix * naked_other + else: + x = self.spmatrix.multiply(naked_other) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __rmul__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + if is_cupy(self.spmatrix): + if not cp.isscalar(naked_other): + # TODO(jisheng): cupy does not implement multiply method + is_other_sparse = issparse(naked_other) + if ( + is_other_sparse + and self.spmatrix.nnz == naked_other.nnz + and cp.all(self.spmatrix.indptr == naked_other.indptr) + and cp.all(self.spmatrix.indices == naked_other.indices) + ): + x = cps.csr_matrix( + ( + naked_other.data * self.spmatrix.data, + self.spmatrix.indices, + self.spmatrix.indptr, + ), + self.spmatrix.shape, + ) + else: + if is_other_sparse: + naked_other = other.toarray() + dense = self.spmatrix.toarray() + res = cp.multiply(naked_other, dense, out=dense) + x = cps.csr_matrix(res) + else: + x = naked_other * self.spmatrix + else: + x = self.spmatrix.multiply(naked_other) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __matmul__(self, other): + from . import matmul + + return matmul(self, other) + + def __rmatmul__(self, other): + from . 
import matmul + + return matmul(other, self) + + def __div__(self, other): + return self.__truediv__(other) + + def __rdiv__(self, other): + return self.__rtruediv__(other) + + def __truediv__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + x = self.spmatrix / naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __rtruediv__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + try: + x = naked_other / self.spmatrix + except TypeError: + x = naked_other / self.spmatrix.toarray() + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __floordiv__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("floor_divide", self, naked_other) + else: + if issparse(naked_other): + naked_other = other.toarray() + x = get_sparse_module(self.spmatrix).csr_matrix( + self.toarray() // naked_other + ) + else: + x = self.toarray() // naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __rfloordiv__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("floor_divide", naked_other, self) + else: + if issparse(naked_other): + naked_other = other.toarray() + x = get_sparse_module(self.spmatrix).csr_matrix( + naked_other // self.toarray() + ) + else: + x = naked_other // self.toarray() + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __pow__(self, other, modulo=None): + if modulo is not None: + return NotImplemented + + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + if get_array_module(naked_other).isscalar(naked_other): + try: + x = self.spmatrix.power(naked_other) + except ValueError as e: # pragma: no cover + # https://github.com/mars-project/mars/issues/3268 + # https://github.com/scipy/scipy/issues/8678 + assert "WRITEBACKIFCOPY" in e.args[0] + self.spmatrix = self.spmatrix.copy() + x = self.spmatrix.power(naked_other) + else: + if issparse(naked_other): + naked_other = other.toarray() + x = self.toarray() ** naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __rpow__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + if issparse(naked_other): + naked_other = other.toarray() + x = naked_other ** self.toarray() + return get_array_module(x).asarray(x) + + def float_power(self, other): + ret = self.__pow__(other) + ret = naked(ret).astype(float) + if issparse(ret): + return SparseNDArray(ret, shape=self.shape) + return ret + + def __mod__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + if get_array_module(naked_other).isscalar(naked_other): + data = self.spmatrix.data % naked_other + x = get_sparse_module(self.spmatrix).csr_matrix( + (data, self.spmatrix.indices, self.spmatrix.indptr), self.spmatrix.shape + ) + else: + if issparse(naked_other): + naked_other = other.toarray() + x = get_sparse_module(self.spmatrix).csr_matrix( + self.toarray() % naked_other + ) + if issparse(x): + return 
SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __rmod__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + is_sparse = issparse(naked_other) + if issparse(naked_other): + naked_other = other.toarray() + if get_array_module(naked_other).isscalar(naked_other): + data = naked_other % self.spmatrix.data + x = get_sparse_module(self.spmatrix).csr_matrix( + (data, self.spmatrix.indices, self.spmatrix.indptr), self.spmatrix.shape + ) + else: + x = naked_other % self.toarray() + if is_sparse: + x = get_sparse_module(self.spmatrix).csr_matrix(x) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def fmod(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(self.spmatrix) + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("fmod", self, naked_other) + else: + if issparse(naked_other): + naked_other = other.toarray() + x = get_sparse_module(self.spmatrix).csr_matrix( + xp.fmod(self.toarray(), naked_other) + ) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def logaddexp(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(self.spmatrix) + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("logaddexp", self, naked_other) + if issparse(naked_other): + naked_other = other.toarray() + return xp.logaddexp(self.toarray(), naked_other) + + def logaddexp2(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(self.spmatrix) + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("logaddexp2", self, naked_other) + if issparse(naked_other): + naked_other = other.toarray() + return xp.logaddexp2(self.toarray(), naked_other) + + def __neg__(self): + return SparseNDArray(-self.spmatrix, shape=self.shape) + + def __pos__(self): + return SparseNDArray(self.spmatrix.copy(), shape=self.shape) + + def __abs__(self): + return SparseNDArray(abs(self.spmatrix), shape=self.shape) + + def fabs(self): + xp = get_array_module(self.spmatrix) + return SparseNDArray( + get_sparse_module(self.spmatrix).csr_matrix( + xp.abs(self.spmatrix), dtype="f8" + ), + shape=self.shape, + ) + + def rint(self): + return SparseNDArray(self.spmatrix.rint(), shape=self.shape) + + def sign(self): + return SparseNDArray(self.spmatrix.sign(), shape=self.shape) + + def conj(self): + return SparseNDArray(self.spmatrix.conj(), shape=self.shape) + + def exp(self): + return call_sparse("exp", self) + + def exp2(self): + return call_sparse("exp2", self) + + def log(self): + return call_sparse("log", self) + + def log2(self): + return call_sparse("log2", self) + + def log10(self): + return call_sparse("log10", self) + + def expm1(self): + return SparseNDArray(self.spmatrix.expm1(), shape=self.shape) + + def log1p(self): + return SparseNDArray(self.spmatrix.log1p(), shape=self.shape) + + def sqrt(self): + return SparseNDArray(self.spmatrix.sqrt(), shape=self.shape) + + def square(self): + return call_sparse("square", self) + + def cbrt(self): + return call_sparse("cbrt", self) + + def reciprocal(self): + return call_sparse("reciprocal", self) + + def _scipy_unary(self, func_name): + spmatrix = self.spmatrix + xp = get_array_module(spmatrix) + if xp is np: + from scipy import special + else: + 
from cupyx.scipy import special + + new_data = getattr(special, func_name)(spmatrix.data) + new_spmatrix = get_sparse_module(spmatrix).csr_matrix( + (new_data, spmatrix.indices, spmatrix.indptr), spmatrix.shape + ) + return SparseNDArray(new_spmatrix, shape=self.shape) + + def _scipy_binary(self, func_name, other): + try: + naked_other = naked(other) + except TypeError: # pragma: no cover + return NotImplemented + + xp = get_array_module(self.spmatrix) + + if xp is np: + from scipy import special + else: # pragma: no cover + from cupyx.scipy import special + + func = getattr(special, func_name) + + if get_array_module(naked_other).isscalar(naked_other): # pragma: no cover + return call_sparse(func, self, naked_other) + else: + if issparse(naked_other): # pragma: no cover + naked_other = other.toarray() + x = get_sparse_module(self.spmatrix).csr_matrix( + func(self.toarray(), naked_other) + ) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + gamma = partialmethod(_scipy_unary, "gamma") + gammaln = partialmethod(_scipy_unary, "gammaln") + loggamma = partialmethod(_scipy_unary, "loggamma") + gammasgn = partialmethod(_scipy_unary, "gammasgn") + gammainc = partialmethod(_scipy_binary, "gammainc") + gammaincinv = partialmethod(_scipy_binary, "gammaincinv") + gammaincc = partialmethod(_scipy_binary, "gammaincc") + gammainccinv = partialmethod(_scipy_binary, "gammainccinv") + beta = partialmethod(_scipy_binary, "beta") + betaln = partialmethod(_scipy_binary, "betaln") + psi = partialmethod(_scipy_unary, "psi") + rgamma = partialmethod(_scipy_unary, "rgamma") + polygamma = partialmethod(_scipy_binary, "polygamma") + multigammaln = partialmethod(_scipy_binary, "multigammaln") + digamma = partialmethod(_scipy_unary, "digamma") + poch = partialmethod(_scipy_binary, "poch") + + erf = partialmethod(_scipy_unary, "erf") + erfc = partialmethod(_scipy_unary, "erfc") + erfcx = partialmethod(_scipy_unary, "erfcx") + erfi = partialmethod(_scipy_unary, "erfi") + erfinv = partialmethod(_scipy_unary, "erfinv") + erfcinv = partialmethod(_scipy_unary, "erfcinv") + wofz = partialmethod(_scipy_unary, "wofz") + dawsn = partialmethod(_scipy_unary, "dawsn") + entr = partialmethod(_scipy_unary, "entr") + + ellipk = partialmethod(_scipy_unary, "ellipk") + ellipkm1 = partialmethod(_scipy_unary, "ellipkm1") + ellipkinc = partialmethod(_scipy_binary, "ellipkinc") + ellipe = partialmethod(_scipy_unary, "ellipe") + ellipeinc = partialmethod(_scipy_binary, "ellipeinc") + elliprc = partialmethod(_scipy_binary, "elliprc") + + rel_entr = partialmethod(_scipy_binary, "rel_entr") + kl_div = partialmethod(_scipy_binary, "kl_div") + xlogy = partialmethod(_scipy_binary, "xlogy") + + jv = partialmethod(_scipy_binary, "jv") + jve = partialmethod(_scipy_binary, "jve") + yn = partialmethod(_scipy_binary, "yn") + yv = partialmethod(_scipy_binary, "yv") + yve = partialmethod(_scipy_binary, "yve") + kn = partialmethod(_scipy_binary, "kn") + kv = partialmethod(_scipy_binary, "kv") + kve = partialmethod(_scipy_binary, "kve") + iv = partialmethod(_scipy_binary, "iv") + ive = partialmethod(_scipy_binary, "ive") + hankel1 = partialmethod(_scipy_binary, "hankel1") + hankel1e = partialmethod(_scipy_binary, "hankel1e") + hankel2 = partialmethod(_scipy_binary, "hankel2") + hankel2e = partialmethod(_scipy_binary, "hankel2e") + + hyp0f1 = partialmethod(_scipy_binary, "hyp0f1") + + airy = partialmethod(_scipy_unary, "airy") + airye = partialmethod(_scipy_unary, "airye") + itairy = 
partialmethod(_scipy_unary, "itairy") + + def __eq__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("equal", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix == naked_other + else: + x = self.toarray() == other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __ne__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("not_equal", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix != naked_other + else: + x = self.toarray() != other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __lt__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("less", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix < naked_other + else: + x = self.toarray() < other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __le__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("less_equal", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix <= naked_other + else: + x = self.toarray() <= other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __gt__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("greater", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix > naked_other + else: + x = self.toarray() > other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __ge__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("greater_equal", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix >= naked_other + else: + x = self.toarray() >= other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def logical_and(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if is_cupy(self.spmatrix): + return NotImplemented + else: + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + naked_other = other_xp.array(naked_other).astype(bool) + else: + naked_other = naked_other.astype(bool) + x = self.spmatrix.astype(bool).multiply(naked_other) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def logical_or(self, other): + try: + naked_other = naked(other) + except TypeError: + 
return NotImplemented + + if is_cupy(self.spmatrix): + return NotImplemented + else: + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + if naked_other != 0: + x = np.logical_and(self.toarray(), naked_other) + else: + x = self.spmatrix.astype(bool) + else: + naked_other = naked_other.astype(bool) + x = (self.spmatrix.astype(bool) + naked_other).astype(bool) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def logical_xor(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if is_cupy(self.spmatrix): + return NotImplemented + else: + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + naked_other = other_xp.array(naked_other).astype(bool) + else: + naked_other = naked_other.astype(bool) + x = self.spmatrix.astype(bool) != naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def logical_not(self): + return call_sparse("logical_not", self) + + @staticmethod + def _bitwise(this, other, method_name): + try: + naked_this = naked(this) + except TypeError: + return NotImplemented + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if not issparse(naked_this): + return SparseArray._bitwise(naked_other, naked_this, method_name) + + if issparse(naked_other): + naked_other = other.toarray() + + xp = get_array_module(naked_this) + xps = get_sparse_module(naked_this) + return SparseNDArray( + xps.csr_matrix(getattr(xp, method_name)(this.toarray(), naked_other)), + shape=naked_this.shape, + ) + + def __and__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_and", self, other) + return self._bitwise(self.spmatrix, other, "bitwise_and") + + def __rand__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_and", other, self) + return self._bitwise(other, self.spmatrix, "bitwise_and") + + def __or__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_or", self, other) + return self._bitwise(self.spmatrix, other, "bitwise_or") + + def __ror__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_or", other, self) + return self._bitwise(other, self.spmatrix, "bitwise_or") + + def __xor__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_xor", self, other) + return self._bitwise(self.spmatrix, other, "bitwise_xor") + + def __rxor__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_xor", other, self) + return self._bitwise(other, self.spmatrix, "bitwise_xor") + + def isclose(self, other, **kw): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(naked_other) + if issparse(naked_other): + naked_other = other.toarray() + return xp.isclose(self.toarray(), naked_other, **kw) + + def __invert__(self): + return call_sparse("invert", self) + + @staticmethod + def _shift(this, other, method_name): + try: + naked_this = naked(this) + except TypeError: + return NotImplemented + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xps = get_sparse_module(naked_this) + xp = get_array_module(naked_this) + + if xp.isscalar(naked_this): + other_xp = get_array_module(naked_other) + data = getattr(other_xp, method_name)(naked_this, naked_other.data) + indices, indptr, 
shape = ( + naked_other.indices, + naked_other.indptr, + naked_other.shape, + ) + elif isinstance(naked_this, xp.ndarray): + # dense + return getattr(xp, method_name)(naked_this, other.toarray()) + else: + tp = ( + np.int32 if is_cupy(naked_this) else np.bool_ + ) # cupy.sparse does not support bool + mask = xps.csr_matrix( + ( + (naked_this.data > 0).astype(tp), + naked_this.indices, + naked_this.indptr, + ), + naked_this.shape, + ) + naked_other = mask.multiply(naked_other) + indices, indptr, shape = ( + naked_this.indices, + naked_this.indptr, + naked_this.shape, + ) + data = getattr(xp, method_name)(naked_this.data, naked_other.data) + + return SparseNDArray( + xps.csr_matrix((data, indices, indptr), shape), shape=shape + ) + + def __lshift__(self, other): + return self._shift(self.spmatrix, other, "left_shift") + + def __rlshift__(self, other): + return self._shift(other, self.spmatrix, "left_shift") + + def __rshift__(self, other): + return self._shift(self.spmatrix, other, "right_shift") + + def __rrshift__(self, other): + return self._shift(other, self.spmatrix, "right_shift") + + def sin(self): + return SparseNDArray(self.spmatrix.sin(), shape=self.shape) + + def cos(self): + return call_sparse("cos", self) + + def tan(self): + return SparseNDArray(self.spmatrix.tan(), shape=self.shape) + + def arcsin(self): + return SparseNDArray(self.spmatrix.arcsin(), shape=self.shape) + + def arccos(self): + return call_sparse("arccos", self) + + def arctan(self): + return SparseNDArray(self.spmatrix.arctan(), shape=self.shape) + + def arctan2(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(self.spmatrix) + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("arctan2", self, naked_other) + if issparse(naked_other): + naked_other = other.toarray() + x = xp.arctan2(self.toarray(), naked_other) + return SparseNDArray(get_sparse_module(x).csr_matrix(x), shape=self.shape) + + def hypot(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(self.spmatrix) + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("hypot", self, naked_other) + if issparse(naked_other): + naked_other = other.toarray() + x = xp.hypot(self.toarray(), naked_other) + return SparseNDArray(get_sparse_module(x).csr_matrix(x), shape=self.shape) + + def sinh(self): + return SparseNDArray(self.spmatrix.sinh(), shape=self.shape) + + def cosh(self): + xp = get_array_module(self.spmatrix) + return xp.cosh(self.toarray()) + + def tanh(self): + return SparseNDArray(self.spmatrix.tanh(), shape=self.shape) + + def arcsinh(self): + return SparseNDArray(self.spmatrix.arcsinh(), shape=self.shape) + + def arccosh(self): + return call_sparse("arccosh", self) + + def arctanh(self): + return SparseNDArray(self.spmatrix.arctanh(), shape=self.shape) + + def around(self, decimals=0): + return call_sparse("around", self, decimals=decimals) + + def deg2rad(self): + return SparseNDArray(self.spmatrix.deg2rad(), shape=self.shape) + + def rad2deg(self): + return SparseNDArray(self.spmatrix.rad2deg(), shape=self.shape) + + def angle(self, deg=0): + return call_sparse("angle", self, deg=deg) + + def dot(self, other, sparse=True): + raise NotImplementedError + + def concatenate(self, other, axis=0): + raise NotImplementedError + + def _reduction( + self, method_name, axis=None, dtype=None, keepdims=None, todense=False, **kw + ): + 
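+        # Layout-specific subclasses provide the implementation; for example
+        # SparseMatrix._reduction reduces the spmatrix (or the dense array when
+        # todense=True) and reshapes the result according to axis/keepdims.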
raise NotImplementedError + + def sum(self, axis=None, dtype=None, keepdims=None): + return self._reduction("sum", axis=axis, dtype=dtype, keepdims=keepdims) + + def prod(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "sum", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def amax(self, axis=None, dtype=None, keepdims=None): + return self._reduction("max", axis=axis, dtype=dtype, keepdims=keepdims) + + def amin(self, axis=None, dtype=None, keepdims=None): + return self._reduction("min", axis=axis, dtype=dtype, keepdims=keepdims) + + def all(self, axis=None, dtype=None, keepdims=None): + ret = self._reduction( + "all", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + if not issparse(ret): + if get_array_module(ret).isscalar(ret): + return ret + xps = get_sparse_module(self.spmatrix) + ret = SparseNDArray(xps.csr_matrix(ret)) + return ret + return ret + + def any(self, axis=None, dtype=None, keepdims=None): + ret = self._reduction( + "any", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + if not issparse(ret): + if get_array_module(ret).isscalar(ret): + return ret + xps = get_sparse_module(self.spmatrix) + ret = SparseNDArray(xps.csr_matrix(ret)) + return ret + return ret + + def mean(self, axis=None, dtype=None, keepdims=None): + return self._reduction("mean", axis=axis, dtype=dtype, keepdims=keepdims) + + def nansum(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "nansum", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def nanprod(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "nanprod", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def nanmax(self, axis=None, dtype=None, keepdims=None): + ret = self._reduction( + "nanmax", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + if not issparse(ret): + if get_array_module(ret).isscalar(ret): + return ret + xps = get_sparse_module(self.spmatrix) + ret = SparseNDArray(xps.csr_matrix(ret)) + return ret + return ret + + def nanmin(self, axis=None, dtype=None, keepdims=None): + ret = self._reduction( + "nanmin", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + if not issparse(ret): + if get_array_module(ret).isscalar(ret): + return ret + xps = get_sparse_module(self.spmatrix) + ret = SparseNDArray(xps.csr_matrix(ret)) + return ret + return ret + + def nanmean(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "nanmean", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def argmax(self, axis=None, dtype=None, keepdims=None): + return self._reduction("argmax", axis=axis, dtype=dtype, keepdims=keepdims) + + def nanargmax(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "nanargmax", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def argmin(self, axis=None, dtype=None, keepdims=None): + return self._reduction("argmin", axis=axis, dtype=dtype, keepdims=keepdims) + + def nanargmin(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "nanargmin", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def var(self, axis=None, dtype=None, ddof=0, keepdims=None): + return self._reduction( + "var", axis=axis, dtype=dtype, ddof=ddof, keepdims=keepdims, todense=True + ) + + def cumsum(self, axis=None, dtype=None): + return self.toarray().cumsum(axis=axis) + + def cumprod(self, axis=None, dtype=None): + return self.toarray().cumprod(axis=axis) + + def nancumsum(self, axis=None, 
dtype=None): + xp = get_array_module(self.spmatrix) + return xp.nancumsum(self.toarray(), axis=axis) + + def nancumprod(self, axis=None, dtype=None): + xp = get_array_module(self.spmatrix) + return xp.nancumprod(self.toarray(), axis=axis) + + def count_nonzero(self, axis=None, dtype=None, keepdims=None): + if axis is None: + return get_array_module(self.spmatrix).array( + [self.spmatrix.count_nonzero()] + )[0] + else: + return get_array_module(self.spmatrix).count_nonzero( + self.toarray(), axis=axis + ) + + def __getitem__(self, item): + if isinstance(item, SparseArray): + item = item.spmatrix + if isinstance(item, list): + item = tuple(item) + + x = self.spmatrix[item] + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __setitem__(self, key, value): + if is_cupy(self.spmatrix): + return NotImplemented + else: + x = self.spmatrix.tolil() + x[key] = value + x = x.tocsr() + self.spmatrix = x + + def _maximum_minimum(self, other, method_name): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if is_cupy(self.spmatrix): + # TODO(jisheng): cupy does not implement sparse maximum and minimum + return NotImplemented + + xps = get_sparse_module(self.spmatrix) + xp = get_array_module(self.spmatrix) + has_nan = xps.csr_matrix( + (xp.isnan(self.spmatrix.data), self.spmatrix.indices, self.spmatrix.indptr), + self.spmatrix.shape, + ) + if issparse(naked_other): + has_nan += xps.csr_matrix( + (xp.isnan(naked_other.data), naked_other.indices, naked_other.indptr), + naked_other.shape, + ) + + if issparse(naked_other): + x = getattr(self.spmatrix, method_name)(naked_other) + else: + x = getattr(xp, method_name)(self.toarray(), naked_other) + + if has_nan.sum() > 0: + x = x + (has_nan * np.nan) + + if issparse(x): + return SparseNDArray(x, shape=self.shape) + + return get_array_module(x).asarray(x) + + def maximum(self, other): + return self._maximum_minimum(other, "maximum") + + def minimum(self, other): + return self._maximum_minimum(other, "minimum") + + def fmax(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + x = self.spmatrix.maximum(naked_other) + if issparse(x): + return SparseArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def fmin(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + x = self.spmatrix.minimum(naked_other) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def isinf(self): + return call_sparse("isinf", self) + + def isnan(self): + return call_sparse("isnan", self) + + def signbit(self): + return call_sparse("signbit", self) + + def floor(self): + return SparseNDArray(self.spmatrix.floor(), shape=self.shape) + + def ceil(self): + return SparseNDArray(self.spmatrix.ceil(), shape=self.shape) + + def trunc(self): + return SparseNDArray(self.spmatrix.trunc(), shape=self.shape) + + def degrees(self): + return call_sparse("degrees", self) + + def radians(self): + return call_sparse("radians", self) + + def clip(self, a_min, a_max): + try: + a_min = naked(a_min) + except TypeError: + return NotImplemented + + try: + a_max = naked(a_max) + except TypeError: + return NotImplemented + + x = self.spmatrix.maximum(a_min) + if issparse(x): + x = x.minimum(a_max) + elif issparse(a_max): + x = a_max.minimum(x) + else: + xp = get_array_module(x) + x = xp.minimum(x, a_max) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + 
return get_array_module(x).asarray(x) + + def iscomplex(self): + return call_sparse("iscomplex", self) + + def fix(self): + return call_sparse("fix", self) + + def i0(self): + xp = get_array_module(self.spmatrix) + data = xp.i0(self.spmatrix.data).reshape(self.spmatrix.data.shape) + x = get_sparse_module(self.spmatrix).csr_matrix( + (data, self.spmatrix.indices, self.spmatrix.indptr), self.spmatrix.shape + ) + return SparseNDArray(x, shape=self.shape) + + def nan_to_num(self): + return call_sparse("nan_to_num", self) + + def copysign(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("copysign", self, naked_other) + + if issparse(naked_other): + naked_other = other.toarray() + + xp = get_array_module(self.spmatrix) + return xp.copysign(self.toarray(), naked_other) + + def nextafter(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + ret_sparse = False + if issparse(naked_other): + ret_sparse = True + naked_other = other.toarray() + + xp = get_array_module(self.spmatrix) + xps = get_sparse_module(self.spmatrix) + + x = xp.nextafter(self.toarray(), naked_other) + if ret_sparse: + return SparseNDArray(xps.csr_matrix(x), shape=self.shape) + return x + + def spacing(self): + if is_cupy(self.spmatrix): + raise NotImplementedError + return call_sparse("spacing", self) + + def ldexp(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("ldexp", self, naked_other) + + if issparse(naked_other): + naked_other = other.toarray() + + return SparseNDArray(self.spmatrix.multiply(2**naked_other)) + + def frexp(self, **kw): + xp = get_array_module(self.spmatrix) + xps = get_sparse_module(self.spmatrix) + x, y = xp.frexp(self.toarray(), **kw) + return ( + SparseNDArray(xps.csr_matrix(x), shape=self.shape), + SparseNDArray(xps.csr_matrix(y), shape=self.shape), + ) + + def modf(self, **kw): + xp = get_array_module(self.spmatrix) + xps = get_sparse_module(self.spmatrix) + x, y = xp.modf(self.toarray(), **kw) + return ( + SparseNDArray(xps.csr_matrix(x), shape=self.shape), + SparseNDArray(xps.csr_matrix(y), shape=self.shape), + ) + + def sinc(self): + return call_sparse("sinc", self) + + def isfinite(self): + return call_sparse("isfinite", self) + + def isreal(self): + return call_sparse("isreal", self) + + def digitize(self, bins, right=False): + return call_sparse("digitize", self, bins=bins, right=right) + + def repeat(self, repeats, axis=None): + if axis is None: + raise NotImplementedError + + xp = get_array_module(self.spmatrix) + xps = get_sparse_module(self.spmatrix) + r = xp.repeat(self.toarray(), repeats, axis=axis) + x = xps.csr_matrix(r) + return SparseNDArray(x, shape=r.shape) + + @staticmethod + def _expand_val(val, expect_val_size, xp): + if val.size > expect_val_size: + val = val[:expect_val_size] + elif val.size < expect_val_size: + n_repeat = ceildiv(expect_val_size, val.size) + val = xp.tile(val, n_repeat)[:expect_val_size] + return val + + def fill_diagonal(self, val, wrap=False): + lil_matrix = self.spmatrix.tolil() + + xp = get_array_module(self.spmatrix) + val = xp.asarray(val) + if val.ndim > 1: + val = val.ravel() + is_tall_matrix = lil_matrix.shape[0] > lil_matrix.shape[1] + 1 + n_rows, n_cols = lil_matrix.shape + + if not wrap or not is_tall_matrix: + if val.ndim > 0: + # check if val is long enough 
+ expect_val_size = min(n_rows, n_cols) + val = self._expand_val(val, expect_val_size, xp) + lil_matrix.setdiag(val) + matrix = lil_matrix + else: + block_size = n_cols + 1 + + n_block = n_rows // block_size + n_vals = n_cols * n_block + if n_rows % block_size > 0: + # 1 chunk left + n_block += 1 + n_vals += min(n_rows % block_size, n_cols) + + if val.ndim > 0: + val = self._expand_val(val, n_vals, xp) + + sub_matrices = [] + for i in range(n_block): + sub_lil_matrix = lil_matrix[i * block_size : (i + 1) * block_size] + if val.ndim > 0: + sub_val = val[i * n_cols : (i + 1) * n_cols] + else: + sub_val = val + sub_lil_matrix.setdiag(sub_val) + sub_matrices.append(sub_lil_matrix) + + xps = get_sparse_module(self.spmatrix) + matrix = SparseArray(xps.vstack(sub_matrices, format="csr")) + + self.spmatrix = matrix.tocsr() + + def unique( + self, return_index=False, return_inverse=False, return_counts=False, axis=None + ): + if return_inverse or return_index: # pragma: no cover + raise NotImplementedError + if self.ndim == 2 and axis is not None: # pragma: no cover + raise NotImplementedError + + xp = get_array_module(self.spmatrix) + return xp.unique(self.spmatrix.data, return_counts=return_counts) diff --git a/python/xorbits/_mars/lib/sparse/core.py b/python/xorbits/_mars/lib/sparse/core.py new file mode 100644 index 000000000..1e65b2323 --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/core.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
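+#
+# Shared helpers for the sparse package: issparse()/is_sparse_or_dense()
+# classify inputs, get_array_module() picks numpy or cupy for a given array,
+# get_sparse_module() returns the matching scipy.sparse or cupy.sparse
+# namespace, and naked() unwraps a SparseNDArray to its raw spmatrix, raising
+# TypeError for anything that is neither sparse nor dense.  For example
+# (illustrative only), get_sparse_module(np.ones(3)) is scipy.sparse.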
+ +import numpy as np + +try: + import scipy.sparse as sps + import scipy.sparse.linalg as splinalg +except ImportError: # pragma: no cover + sps = None + splinalg = None + +from ...utils import lazy_import + +splinalg = splinalg +cp = lazy_import("cupy", rename="cp") +cps = lazy_import("cupy.sparse", rename="cps") + + +def issparse(x): + if cps and cps.issparse(x): + # is cupy.sparse + return True + if sps and sps.issparse(x): + # is scipy.sparse + return True + if np and isinstance(x, np.ndarray): + return False + if cp and isinstance(x, cp.ndarray): + return False + + from .array import SparseNDArray + + return isinstance(x, SparseNDArray) + + +def is_sparse_or_dense(x): + if issparse(x): + return True + m = get_array_module(x) + if m.isscalar(x): + return True + return isinstance(x, m.ndarray) + + +def get_dense_module(x): + from .array import SparseNDArray + + if cp: + if isinstance(x, SparseNDArray): + return get_array_module(x.raw) + return get_array_module(x) + + return np + + +def get_array_module(x): + if cp: + return cp.get_array_module(x) + return np + + +def get_sparse_module(x): + m = get_array_module(x) + if m is np: + return sps + return cps + + +def is_cupy(x): + return get_array_module(x) is cp + + +def naked(x): + if hasattr(x, "spmatrix"): + return x.spmatrix + if not is_sparse_or_dense(x): + raise TypeError("only sparse matrix or ndarray accepted") + return x diff --git a/python/xorbits/_mars/lib/sparse/matrix.py b/python/xorbits/_mars/lib/sparse/matrix.py new file mode 100644 index 000000000..b4f401bc2 --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/matrix.py @@ -0,0 +1,239 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
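+#
+# Construction and linear-algebra helpers for 2-d sparse data: the
+# *_sparse_matrix constructors (zeros/diag/eye/triu/tril), LU factorization
+# and triangular solves backed by scipy.sparse.linalg, block assembly via
+# sps.bmat, and the SparseMatrix class, which adds shape, transpose, dot,
+# concatenate and the _reduction implementation on top of SparseArray.  For
+# instance (illustrative only), eye_sparse_matrix(3) wraps a 3x3 CSR identity
+# and lu_sparse_matrix(m) returns the (P, L, U) factors as SparseMatrix
+# instances.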
+ +from collections.abc import Iterable +from typing import List + +import numpy as np + +from .array import SparseArray, SparseNDArray +from .core import ( + cp, + cps, + get_array_module, + get_sparse_module, + issparse, + naked, + splinalg, + sps, +) + + +def zeros_sparse_matrix(shape, dtype=float, gpu=False): + m = sps if not gpu else cps + return SparseMatrix(m.csr_matrix(shape, dtype=np.dtype(dtype))) + + +def diag_sparse_matrix(v, k=0, gpu=False): + v = naked(v) + if gpu and get_array_module(v) is not cp: + v = cp.asarray(v) + if not gpu and get_array_module(v) is not np: + v = v.get() + + if v.ndim == 1: + sparse_m = sps if not gpu else cps + m = n = v.size + k + mat = sparse_m.spdiags(v[None], [k], m, n, format="csr") + return SparseMatrix(mat) + else: + assert v.ndim == 2 + sparse_m = sps if not gpu else cps + sparse_eye = sparse_m.eye(v.shape[0], v.shape[1], k=k) + mat = sparse_eye.multiply(v).tocoo() + size = sparse_eye.nnz + col = mat.col - max(k, 0) + row = get_array_module(col).zeros((len(col),)) + return SparseNDArray( + sparse_m.csr_matrix((mat.data, (row, col)), shape=(1, size)), shape=(size,) + ) + + +def eye_sparse_matrix(N, M=None, k=0, dtype=float, gpu=False): + m = sps if not gpu else cps + return SparseMatrix(m.eye(N, n=M, k=k, dtype=dtype, format="csr")) + + +def triu_sparse_matrix(m, k=0, gpu=False): + m = naked(m) + if gpu and get_array_module(m) is not cp: + m = cp.asarray(m) + if not gpu and get_array_module(m) is not np: + m = m.get() + + sparse_m = sps if not gpu else cps + mat = sparse_m.triu(m, k=k) + return SparseMatrix(mat) + + +def tril_sparse_matrix(m, k=0, gpu=False): + m = naked(m) + if gpu and get_array_module(m) is not cp: + m = cp.asarray(m) + if not gpu and get_array_module(m) is not np: + m = m.get() + + sparse_m = sps if not gpu else cps + mat = sparse_m.tril(m, k=k) + return SparseMatrix(mat) + + +def where(cond, x, y): + cond, x, y = [SparseMatrix(i) if issparse(i) else i for i in (cond, x, y)] + return cond * x + (cond * (-y) + y) + + +def lu_sparse_matrix(a): + a = naked(a) + a = a.tocsc() + super_lu = splinalg.splu( + a, permc_spec="NATURAL", diag_pivot_thresh=0, options={"SymmetricMode": True} + ) + l_ = super_lu.L + u = super_lu.U + p = sps.lil_matrix(a.shape) + p[super_lu.perm_r.copy(), np.arange(a.shape[1])] = 1 + return ( + SparseMatrix(p), + SparseMatrix(l_), + SparseMatrix(u), + ) + + +def solve_triangular_sparse_matrix(a, b, lower=False, sparse=True): + a = naked(a) + b = b.toarray() if issparse(b) else b + + x = splinalg.spsolve_triangular(a, b, lower=lower) + if sparse: + spx = ( + sps.csr_matrix(x).reshape(x.shape[0], 1) + if len(x.shape) == 1 + else sps.csr_matrix(x) + ) + return SparseNDArray(spx, shape=x.shape) + else: + return x + + +def block(arrs: List[List[SparseArray]]) -> SparseArray: + mats = [] + for dim_arrs in arrs: + mats.append([naked(a) for a in dim_arrs]) + return SparseNDArray(sps.bmat(mats, format="csr")) + + +class SparseMatrix(SparseArray): + __slots__ = ("spmatrix",) + + def __init__(self, spmatrix, shape=()): + if shape and len(shape) != 2: + raise ValueError("Only accept 2-d array") + if isinstance(spmatrix, SparseMatrix): + self.spmatrix = spmatrix.spmatrix + else: + self.spmatrix = spmatrix.tocsr() + + @property + def shape(self): + return self.spmatrix.shape + + @property + def size(self): + return int(np.prod(self.shape)) + + def transpose(self, axes=None): + assert axes is None or tuple(axes) == (1, 0) + return SparseMatrix(self.spmatrix.transpose()) + + @property + def T(self): + return 
SparseMatrix(self.spmatrix.T) + + def dot(self, other, sparse=True): + other_shape = other.shape + try: + other = naked(other) + except TypeError: + return NotImplemented + + if sparse: + if len(other_shape) == 1: + x = self.spmatrix.dot(other.T) + else: + x = self.spmatrix.dot(other) + else: + a = self.spmatrix.toarray() + if issparse(other): + other = other.toarray().reshape(other_shape) + x = a.dot(other) + if issparse(x): + shape = (x.shape[0],) if len(other_shape) == 1 else x.shape + return SparseNDArray(x, shape=shape) + return get_array_module(x).asarray(x) + + def concatenate(self, other, axis=0): + try: + other = naked(other) + except TypeError: + return NotImplemented + + if issparse(other): + xps = get_sparse_module(self.spmatrix) + if axis not in (0, 1): + raise ValueError("axis can only be 0 or 1") + method = xps.vstack if axis == 0 else xps.hstack + x = method((self.spmatrix, other)) + else: + xp = get_array_module(self.spmatrix) + x = xp.concatenate((self.spmatrix.toarray(), other), axis=axis) + + if issparse(x): + return SparseMatrix(x) + return get_array_module(x).asarray(x) + + def _reduction( + self, method_name, axis=None, dtype=None, keepdims=None, todense=False, **kw + ): + # TODO: support keepdims + if isinstance(axis, tuple): + if sorted(axis) != [0, 1]: + assert len(axis) == 1 + axis = axis[0] + else: + axis = None + + if todense: + x = self.spmatrix.toarray() + x = getattr(get_array_module(x), method_name)(x, axis=axis, **kw) + else: + x = getattr(self.spmatrix, method_name)(axis=axis, **kw) + if not isinstance(axis, Iterable): + axis = (axis,) + axis = list(range(len(self.shape))) if axis is None else axis + shape = tuple( + s if i not in axis else 1 + for i, s in enumerate(self.shape) + if keepdims or i not in axis + ) + m = get_array_module(x) + if issparse(x): + return SparseNDArray(x, shape=shape) + if m.isscalar(x): + if keepdims: + return m.array([x])[0].reshape((1,) * self.ndim) + else: + return m.array([x])[0] + else: + return m.asarray(x).reshape(shape) diff --git a/python/xorbits/_mars/lib/sparse/tests/__init__.py b/python/xorbits/_mars/lib/sparse/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/lib/sparse/tests/test_sparse.py b/python/xorbits/_mars/lib/sparse/tests/test_sparse.py new file mode 100644 index 000000000..2abf150f5 --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/tests/test_sparse.py @@ -0,0 +1,474 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle + +import numpy as np +import pytest +import scipy.sparse as sps + +from ... import sparse as mls +from .. import SparseMatrix, SparseNDArray, SparseVector +from ..core import issparse + +s1_data = sps.csr_matrix([[1, 0, 1], [0, 0, 1]]) +s2_data = sps.csr_matrix([[0, 1, 1], [1, 0, 1]]) +v1_data = np.random.rand(3) +v1 = sps.csr_matrix(v1_data) +v2_data = np.random.rand(2) +v2 = sps.csr_matrix(v2_data) +d1 = np.array([1, 2, 3]) + + +def assert_array_equal(a, b, almost=False): + if issparse(a): + a = a.toarray() + else: + a = np.asarray(a) + if issparse(b): + b = b.toarray() + else: + b = np.asarray(b) + if not almost: + np.testing.assert_array_equal(a, b) + else: + np.testing.assert_almost_equal(a, b) + + +def test_sparse_creation(): + with pytest.raises(ValueError): + SparseNDArray() + + s = SparseNDArray(s1_data) + assert s.ndim == 2 + assert isinstance(s, SparseMatrix) + assert_array_equal(s.toarray(), s1_data.A) + assert_array_equal(s.todense(), s1_data.A) + + ss = pickle.loads(pickle.dumps(s)) + assert s == ss + assert_array_equal(ss.toarray(), s1_data.A) + assert_array_equal(ss.todense(), s1_data.A) + + v = SparseNDArray(v1, shape=(3,)) + assert s.ndim + assert isinstance(v, SparseVector) + assert v.shape == (3,) + assert_array_equal(v.todense(), v1_data) + assert_array_equal(v.toarray(), v1_data) + assert_array_equal(v, v1_data) + + vv = pickle.loads(pickle.dumps(v)) + assert v == vv + assert_array_equal(vv.todense(), v1_data) + assert_array_equal(vv.toarray(), v1_data) + assert_array_equal(vv, v1_data) + + +def test_sparse_add(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 + s2, s1 + s2) + assert_array_equal(s1 + d1, s1 + d1) + assert_array_equal(d1 + s1, d1 + s1) + r = sps.csr_matrix(((s1.data + 1), s1.indices, s1.indptr), s1.shape) + assert_array_equal(s1 + 1, r) + r = sps.csr_matrix(((1 + s1.data), s1.indices, s1.indptr), s1.shape) + assert_array_equal(1 + s1, r) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v + v, v1_data + v1_data) + assert_array_equal(v + d1, v1_data + d1) + assert_array_equal(d1 + v, d1 + v1_data) + r = sps.csr_matrix(((v1.data + 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v + 1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1 + v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 + v, r.toarray().reshape(3)) + + +def test_sparse_subtract(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 - s2, s1 - s2) + assert_array_equal(s1 - d1, s1 - d1) + assert_array_equal(d1 - s1, d1 - s1) + r = sps.csr_matrix(((s1.data - 1), s1.indices, s1.indptr), s1.shape) + assert_array_equal(s1 - 1, r) + r = sps.csr_matrix(((1 - s1.data), s1.indices, s1.indptr), s1.shape) + assert_array_equal(1 - s1, r) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v - v, v1_data - v1_data) + assert_array_equal(v - d1, v1_data - d1) + assert_array_equal(d1 - v, d1 - v1_data) + r = sps.csr_matrix(((v1.data - 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v - 1, 
r.toarray().reshape(3)) + r = sps.csr_matrix(((1 - v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 - v, r.toarray().reshape(3)) + + +def test_sparse_multiply(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 * s2, s1_data.multiply(s2_data)) + assert_array_equal(s1 * d1, s1_data.multiply(d1)) + assert_array_equal(d1 * s1, s1_data.multiply(d1)) + assert_array_equal(s1 * 2, s1 * 2) + assert_array_equal(2 * s1, s1 * 2) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v * v, v1_data * v1_data) + assert_array_equal(v * d1, v1_data * d1) + assert_array_equal(d1 * v, d1 * v1_data) + r = sps.csr_matrix(((v1.data * 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v * 1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1 * v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 * v, r.toarray().reshape(3)) + + +def test_sparse_divide(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 / s2, s1 / s2) + assert_array_equal(s1 / d1, s1 / d1) + assert_array_equal(d1 / s1, d1 / s1.toarray()) + assert_array_equal(s1 / 2, s1 / 2) + assert_array_equal(2 / s1, 2 / s1.toarray()) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v / v, v1_data / v1_data) + assert_array_equal(v / d1, v1_data / d1) + assert_array_equal(d1 / v, d1 / v1_data) + r = sps.csr_matrix(((v1.data / 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v / 1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1 / v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 / v, r.toarray().reshape(3)) + + +def test_sparse_floor_divide(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 // s2, s1.toarray() // s2.toarray()) + assert_array_equal(s1 // d1, s1.toarray() // d1) + assert_array_equal(d1 // s1, d1 // s1.toarray()) + assert_array_equal(s1 // 2, s1.toarray() // 2) + assert_array_equal(2 // s1, 2 // s1.toarray()) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v // v, v1_data // v1_data) + assert_array_equal(v // d1, v1_data // d1) + assert_array_equal(d1 // v, d1 // v1_data) + r = sps.csr_matrix(((v1.data // 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v // 1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1 // v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 // v, r.toarray().reshape(3)) + + +def test_sparse_power(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1**s2, s1.toarray() ** s2.toarray()) + assert_array_equal(s1**d1, s1.toarray() ** d1) + assert_array_equal(d1**s1, d1 ** s1.toarray()) + assert_array_equal(s1**2, s1_data.power(2)) + assert_array_equal(2**s1, 2 ** s1.toarray()) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v**v, v1_data**v1_data) + assert_array_equal(v**d1, v1_data**d1) + assert_array_equal(d1**v, d1**v1_data) + r = sps.csr_matrix(((v1.data**1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v**1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1**v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1**v, r.toarray().reshape(3)) + + +def test_sparse_mod(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 % s2, s1.toarray() % s2.toarray()) + assert_array_equal(s1 % d1, s1.toarray() % d1) + assert_array_equal(d1 % s1, d1 % s1.toarray()) + assert_array_equal(s1 % 2, s1.toarray() % 2) + 
assert_array_equal(2 % s1, 2 % s1.toarray()) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v % v, v1_data % v1_data) + assert_array_equal(v % d1, v1_data % d1) + assert_array_equal(d1 % v, d1 % v1_data) + r = sps.csr_matrix(((v1.data % 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v % 1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1 % v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 % v, r.toarray().reshape(3)) + + +def test_sparse_bin(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + v = SparseNDArray(v1, shape=(3,)) + + for method in ( + "fmod", + "logaddexp", + "logaddexp2", + "equal", + "not_equal", + "less", + "less_equal", + "greater", + "greater_equal", + "hypot", + "arctan2", + ): + lm, rm = getattr(mls, method), getattr(np, method) + assert_array_equal(lm(s1, s2), rm(s1.toarray(), s2.toarray())) + assert_array_equal(lm(s1, d1), rm(s1.toarray(), d1)) + assert_array_equal(lm(d1, s1), rm(d1, s1.toarray())) + r1 = sps.csr_matrix((rm(s1.data, 2), s1.indices, s1.indptr), s1.shape) + assert_array_equal(lm(s1, 2), r1) + r2 = sps.csr_matrix((rm(2, s1.data), s1.indices, s1.indptr), s1.shape) + assert_array_equal(lm(2, s1), r2) + + # test sparse + assert_array_equal(lm(v, v), rm(v1_data, v1_data)) + assert_array_equal(lm(v, d1), rm(v1_data, d1)) + assert_array_equal(lm(d1, v), rm(d1, v1_data)) + assert_array_equal(lm(v, 2), rm(v1_data, 2)) + assert_array_equal(lm(2, v), rm(2, v1_data)) + + +def test_sparse_unary(): + s1 = SparseNDArray(s1_data) + v = SparseNDArray(v1, shape=(3,)) + + for method in ( + "negative", + "positive", + "absolute", + "abs", + "fabs", + "rint", + "sign", + "conj", + "exp", + "exp2", + "log", + "log2", + "log10", + "expm1", + "log1p", + "sqrt", + "square", + "cbrt", + "reciprocal", + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "arcsinh", + "arccosh", + "arctanh", + "deg2rad", + "rad2deg", + "angle", + "isnan", + "isinf", + "signbit", + "sinc", + "isreal", + "isfinite", + ): + lm, rm = getattr(mls, method), getattr(np, method) + r = sps.csr_matrix((rm(s1.data), s1.indices, s1.indptr), s1.shape) + assert_array_equal(lm(s1), r) + assert_array_equal(lm(v), rm(v1_data)) + + +def test_sparse_dot(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + v1_s = SparseNDArray(v1, shape=(3,)) + v2_s = SparseNDArray(v2, shape=(2,)) + + assert_array_equal(mls.dot(s1, s2.T), s1.dot(s2.T)) + assert_array_equal(s1.dot(d1), s1.dot(d1)) + assert_array_equal(d1.dot(s1.T), d1.dot(s1.T.toarray())) + + assert_array_equal(s1 @ s2.T, s1_data @ s2_data.T) + + assert_array_equal(mls.tensordot(s1, s2.T, axes=(1, 0)), s1.dot(s2.T)) + assert_array_equal(mls.tensordot(s1, d1, axes=(1, -1)), s1.dot(d1)) + assert_array_equal(mls.tensordot(d1, s1.T, axes=(0, 0)), d1.dot(s1.T.toarray())) + + assert_array_equal(mls.dot(s1, v1_s), s1.dot(v1_data)) + assert_array_equal(mls.dot(s2, v1_s), s2.dot(v1_data)) + assert_array_equal(mls.dot(v2_s, s1), v2_data.dot(s1_data.A)) + assert_array_equal(mls.dot(v2_s, s2), v2_data.dot(s2_data.A)) + assert_array_equal(mls.dot(v1_s, v1_s), v1_data.dot(v1_data), almost=True) + assert_array_equal(mls.dot(v2_s, v2_s), v2_data.dot(v2_data), almost=True) + + assert_array_equal(mls.dot(v2_s, s1, sparse=False), v2_data.dot(s1_data.A)) + assert_array_equal(mls.dot(v1_s, v1_s, sparse=False), v1_data.dot(v1_data)) + + +def test_sparse_sum(): + s1 = SparseNDArray(s1_data) + v = SparseNDArray(v1, shape=(3,)) + assert s1.sum() == s1.sum() + 
np.testing.assert_array_equal(s1.sum(axis=1), np.asarray(s1.sum(axis=1)).reshape(2)) + np.testing.assert_array_equal(s1.sum(axis=0), np.asarray(s1.sum(axis=0)).reshape(3)) + np.testing.assert_array_equal(v.sum(), np.asarray(v1_data.sum())) + + +def test_sparse_setitem(): + s1 = SparseNDArray(s1_data.copy()) + s1[1:2, 1] = [2] + ss1 = s1_data.tolil() + ss1[1:2, 1] = [2] + np.testing.assert_array_equal(s1.toarray(), ss1.toarray()) + + v = SparseVector(v1, shape=(3,)) + v[1:2] = [2] + vv1 = v1_data.copy() + vv1[1:2] = [2] + np.testing.assert_array_equal(v.toarray(), vv1) + + +def test_sparse_maximum(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + np.testing.assert_array_equal(s1.maximum(s2).toarray(), s1.maximum(s2).toarray()) + + v = SparseVector(v1, shape=(3,)) + np.testing.assert_array_equal(v.maximum(d1), np.maximum(v1_data, d1)) + + +def test_sparse_minimum(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + np.testing.assert_array_equal(s1.minimum(s2).toarray(), s1.minimum(s2).toarray()) + + v = SparseVector(v1, shape=(3,)) + np.testing.assert_array_equal(v.minimum(d1), np.minimum(v1_data, d1)) + + +def test_sparse_fill_diagonal(): + s1 = sps.random(100, 11, density=0.3, format="csr", random_state=0) + + # fill scalar + arr = SparseNDArray(s1) + arr.fill_diagonal(3) + + expected = s1.copy().A + np.fill_diagonal(expected, 3) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill scalar, wrap=True + arr = SparseNDArray(s1) + arr.fill_diagonal(3, wrap=True) + + expected = s1.copy().A + np.fill_diagonal(expected, 3, wrap=True) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill list + arr = SparseNDArray(s1) + arr.fill_diagonal([1, 2, 3]) + + expected = s1.copy().A + np.fill_diagonal(expected, [1, 2, 3]) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill list, wrap=True + arr = SparseNDArray(s1) + arr.fill_diagonal([1, 2, 3], wrap=True) + + expected = s1.copy().A + np.fill_diagonal(expected, [1, 2, 3], wrap=True) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill long list + val = np.random.RandomState(0).rand(101) + arr = SparseNDArray(s1) + arr.fill_diagonal(val) + + expected = s1.copy().A + np.fill_diagonal(expected, val) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill long list, wrap=True + val = np.random.RandomState(0).rand(101) + arr = SparseNDArray(s1) + arr.fill_diagonal(val, wrap=True) + + expected = s1.copy().A + np.fill_diagonal(expected, val, wrap=True) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill ndarray + val = np.random.RandomState(0).rand(3, 4) + arr = SparseNDArray(s1) + arr.fill_diagonal(val) + + expected = s1.copy().A + np.fill_diagonal(expected, val) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill ndarray, wrap=True + val = np.random.RandomState(0).rand(3, 4) + arr = SparseNDArray(s1) + arr.fill_diagonal(val, wrap=True) + + expected = s1.copy().A + np.fill_diagonal(expected, val, wrap=True) + + np.testing.assert_array_equal(arr.toarray(), expected) + + +def test_sparse_block(): + r1 = sps.rand(10, 5) + r2 = sps.rand(10, 3) + r3 = sps.rand(3, 5) + r4 = sps.rand(3, 3) + + result = mls.block( + [[SparseNDArray(r1), SparseNDArray(r2)], [SparseNDArray(r3), SparseNDArray(r4)]] + ) + expected = sps.bmat([[r1, r2], [r3, r4]]) + assert_array_equal(result, expected) diff --git a/python/xorbits/_mars/lib/sparse/vector.py b/python/xorbits/_mars/lib/sparse/vector.py new file mode 100644 index 
000000000..86ad51e9f --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/vector.py @@ -0,0 +1,148 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .array import SparseArray, SparseNDArray +from .core import get_array_module, get_sparse_module, is_cupy, issparse, naked, np + + +class SparseVector(SparseArray): + __slots__ = ("spmatrix",) + + def __init__(self, spvector, shape=()): + if shape and len(shape) != 1: + raise ValueError("Only accept 1-d array") + if isinstance(spvector, SparseVector): + self.spmatrix = spvector.spmatrix + else: + spvector = spvector.reshape(1, shape[0]) + self.spmatrix = spvector.tocsr() + + @property + def shape(self): + return (self.spmatrix.shape[1],) + + def transpose(self, axes=None): + assert axes is None or tuple(axes) == (0,) + return self + + @property + def T(self): + return self + + def __truediv__(self, other): + try: + other = naked(other) + except TypeError: + return NotImplemented + x = self.spmatrix / other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + if x.shape != self.shape: + x = np.asarray(x).reshape(self.shape) + return get_array_module(x).asarray(x) + + def __rtruediv__(self, other): + try: + other = naked(other) + except TypeError: + return NotImplemented + try: + x = other / self.spmatrix + except TypeError: + x = other / self.spmatrix.toarray() + if issparse(x): + return SparseNDArray(x, shape=self.shape) + if x.shape != self.shape: + x = np.asarray(x).reshape(self.shape) + return get_array_module(x).asarray(x) + + def dot(self, other, sparse=True): + other_shape = other.shape + try: + other = naked(other) + except TypeError: + return NotImplemented + + if not sparse: + a = self.toarray() + if issparse(other): + other = other.toarray().reshape(other_shape) + + x = a.dot(other) + else: + if len(other_shape) == 1: + x = self.spmatrix.dot(other.T) + else: + x = self.spmatrix.dot(other) + if issparse(x): + if x.shape == (1, 1): + # return scalar + return x.toarray()[0, 0] + shape = (x.shape[1],) + return SparseNDArray(x, shape=shape) + return get_array_module(x).asarray(x) + + def concatenate(self, other, axis=0): + if other.ndim != 1: + raise ValueError("all the input arrays must have same number of dimensions") + + try: + other = naked(other) + except TypeError: + return NotImplemented + + if issparse(other): + xps = get_sparse_module(self.spmatrix) + if axis != 0: + raise ValueError("axis can only be 0") + other = other.reshape(1, other.shape[0]) if other.shape[0] != 1 else other + x = xps.hstack((self.spmatrix.reshape(1, self.shape[0]), other)) + else: + xp = get_array_module(self.spmatrix) + x = xp.concatenate( + (self.spmatrix.toarray().reshape(self.shape), other), axis=axis + ) + + if issparse(x): + return SparseNDArray(x, shape=(x.shape[1],)) + return get_array_module(x).asarray(x) + + def _reduction( + self, method_name, axis=None, dtype=None, keepdims=None, todense=False, **kw + ): + if not todense: + assert keepdims is None or keepdims is False + + if isinstance(axis, 
tuple): + assert axis == (0,) + axis = None + + if todense: + x = self.spmatrix.toarray() + x = getattr(get_array_module(x), method_name)(x, axis=axis, **kw) + else: + x = getattr(self.spmatrix, method_name)(axis=axis, **kw) + + m = get_array_module(x) + return m.array([x])[0] + + def __setitem__(self, key, value): + if is_cupy(self.spmatrix): + return NotImplemented + else: + x = self.spmatrix.tolil() + key = (0,) + (key,) + x[key] = value + x = x.tocsr() + self.spmatrix = x diff --git a/python/xorbits/_mars/lib/tbcode.py b/python/xorbits/_mars/lib/tbcode.py new file mode 100644 index 000000000..34c637d0c --- /dev/null +++ b/python/xorbits/_mars/lib/tbcode.py @@ -0,0 +1,96 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This utility module dumps code of remote traceback and loads them +into local linecache. This enables displaying codes of remote +tracebacks correctly. +""" + +import linecache +import os +import types +from collections import defaultdict + + +def dump_traceback_code(tb: types.TracebackType, number_of_lines_of_context: int = 5): + """ + Dump codes before and after lines of tracebacks. + + Parameters + ---------- + tb: types.TracebackType + Traceback object + number_of_lines_of_context: int + Total number of lines around the code + Returns + ------- + result: dict + Dumped code lines of traceback + """ + results = defaultdict(lambda: dict(fragments=[])) + + while tb: + file_name = tb.tb_frame.f_code.co_filename + if linecache.getline(file_name, tb.tb_lineno): # pragma: no branch + code_lines = linecache.cache[file_name][2] + left_range = max(tb.tb_lineno - number_of_lines_of_context // 2 - 1, 0) + right_range = min(left_range + number_of_lines_of_context, len(code_lines)) + + cache_data = linecache.cache[file_name] + fragment = cache_data[2][left_range:right_range] + results[file_name]["fragments"].append( + dict(left=left_range, right=right_range, code=fragment) + ) + results[file_name].update( + dict(size=cache_data[0], lines=len(cache_data[2])) + ) + tb = tb.tb_next + return dict(results) + + +def load_traceback_code(code_frags: dict, cache: dict = None): + """ + Load dumped codes for remote tracebacks. + + Parameters + ---------- + code_frags: dict + Dumped codes for remote traceback. + cache: dict + Target for codes to be dumped, for test purpose only. + Production code should keep this field as None. 
+ """ + if cache is not None: + real_cache = False + else: + real_cache = True + cache = linecache.cache + + for file_name, profile in code_frags.items(): + if real_cache and os.path.exists(file_name): + # skip rewriting caches of existing files + continue + + if file_name not in cache: + # keep field 1 (mtime) as None to ensure lazy cache + cache[file_name] = ( + profile["size"], + None, + [""] * profile["lines"], + file_name, + ) + for fragment in profile["fragments"]: + left_range, right_range = fragment["left"], fragment["right"] + cache[file_name][2][left_range:right_range] = fragment["code"] diff --git a/python/xorbits/_mars/lib/tblib/__init__.py b/python/xorbits/_mars/lib/tblib/__init__.py new file mode 100644 index 000000000..69a5e28e0 --- /dev/null +++ b/python/xorbits/_mars/lib/tblib/__init__.py @@ -0,0 +1,329 @@ +import re +import sys +from types import CodeType +from types import FrameType +from types import TracebackType + +try: + from __pypy__ import tproxy +except ImportError: + tproxy = None +try: + from .cpython import tb_set_next +except ImportError: + tb_set_next = None + +if not tb_set_next and not tproxy: + raise ImportError("Cannot use tblib. Runtime not supported.") + +__version__ = "1.7.0" +__all__ = "Traceback", "TracebackParseError", "Frame", "Code" + +PY3 = sys.version_info[0] == 3 +FRAME_RE = re.compile( + r'^\s*File "(?P.+)", line (?P\d+)(, in (?P.+))?$' +) + + +class _AttrDict(dict): + __slots__ = () + + def __getattr__(self, name): + try: + return self[name] + except KeyError: + raise AttributeError(name) + + +# noinspection PyPep8Naming +class __traceback_maker(Exception): + pass + + +class TracebackParseError(Exception): + pass + + +class Code(object): + """ + Class that replicates just enough of the builtin Code object to enable serialization and traceback rendering. + """ + + co_code = None + + def __init__(self, code): + self.co_filename = code.co_filename + self.co_name = code.co_name + self.co_argcount = 0 + self.co_kwonlyargcount = 0 + self.co_varnames = () + self.co_nlocals = 0 + self.co_stacksize = 0 + self.co_flags = 64 + self.co_firstlineno = 0 + + # noinspection SpellCheckingInspection + def __tproxy__(self, operation, *args, **kwargs): + """ + Necessary for PyPy's tproxy. + """ + if operation in ("__getattribute__", "__getattr__"): + return getattr(self, args[0]) + else: + return getattr(self, operation)(*args, **kwargs) + + +class Frame(object): + """ + Class that replicates just enough of the builtin Frame object to enable serialization and traceback rendering. + """ + + def __init__(self, frame): + self.f_locals = {} + self.f_globals = { + k: v for k, v in frame.f_globals.items() if k in ("__file__", "__name__") + } + self.f_code = Code(frame.f_code) + self.f_lineno = frame.f_lineno + + def clear(self): + """ + For compatibility with PyPy 3.5; + clear() was added to frame in Python 3.4 + and is called by traceback.clear_frames(), which + in turn is called by unittest.TestCase.assertRaises + """ + + # noinspection SpellCheckingInspection + def __tproxy__(self, operation, *args, **kwargs): + """ + Necessary for PyPy's tproxy. + """ + if operation in ("__getattribute__", "__getattr__"): + if args[0] == "f_code": + return tproxy(CodeType, self.f_code.__tproxy__) + else: + return getattr(self, args[0]) + else: + return getattr(self, operation)(*args, **kwargs) + + +class Traceback(object): + """ + Class that wraps builtin Traceback objects. 
+ """ + + tb_next = None + + def __init__(self, tb): + self.tb_frame = Frame(tb.tb_frame) + # noinspection SpellCheckingInspection + self.tb_lineno = int(tb.tb_lineno) + + # Build in place to avoid exceeding the recursion limit + tb = tb.tb_next + prev_traceback = self + cls = type(self) + while tb is not None: + traceback = object.__new__(cls) + traceback.tb_frame = Frame(tb.tb_frame) + traceback.tb_lineno = int(tb.tb_lineno) + prev_traceback.tb_next = traceback + prev_traceback = traceback + tb = tb.tb_next + + def as_traceback(self): + """ + Convert to a builtin Traceback object that is usable for raising or rendering a stacktrace. + """ + if tproxy: + return tproxy(TracebackType, self.__tproxy__) + if not tb_set_next: + raise RuntimeError("Unsupported Python interpreter!") + + current = self + top_tb = None + tb = None + while current: + f_code = current.tb_frame.f_code + code = compile( + "\n" * (current.tb_lineno - 1) + "raise __traceback_maker", + current.tb_frame.f_code.co_filename, + "exec", + ) + if hasattr(code, "replace"): + # Python 3.8 and newer + code = code.replace( + co_argcount=0, + co_filename=f_code.co_filename, + co_name=f_code.co_name, + co_freevars=(), + co_cellvars=(), + ) + elif PY3: + code = CodeType( + 0, + code.co_kwonlyargcount, + code.co_nlocals, + code.co_stacksize, + code.co_flags, + code.co_code, + code.co_consts, + code.co_names, + code.co_varnames, + f_code.co_filename, + f_code.co_name, + code.co_firstlineno, + code.co_lnotab, + (), + (), + ) + else: + code = CodeType( + 0, + code.co_nlocals, + code.co_stacksize, + code.co_flags, + code.co_code, + code.co_consts, + code.co_names, + code.co_varnames, + f_code.co_filename.encode(), + f_code.co_name.encode(), + code.co_firstlineno, + code.co_lnotab, + (), + (), + ) + + # noinspection PyBroadException + try: + exec(code, dict(current.tb_frame.f_globals), {}) + except Exception: + next_tb = sys.exc_info()[2].tb_next + if top_tb is None: + top_tb = next_tb + if tb is not None: + tb_set_next(tb, next_tb) + tb = next_tb + del next_tb + + current = current.tb_next + try: + return top_tb + finally: + del top_tb + del tb + + to_traceback = as_traceback + + # noinspection SpellCheckingInspection + def __tproxy__(self, operation, *args, **kwargs): + """ + Necessary for PyPy's tproxy. + """ + if operation in ("__getattribute__", "__getattr__"): + if args[0] == "tb_next": + return self.tb_next and self.tb_next.as_traceback() + elif args[0] == "tb_frame": + return tproxy(FrameType, self.tb_frame.__tproxy__) + else: + return getattr(self, args[0]) + else: + return getattr(self, operation)(*args, **kwargs) + + def as_dict(self): + """ + Converts to a dictionary representation. You can serialize the result to JSON as it only has + builtin objects like dicts, lists, ints or strings. + """ + if self.tb_next is None: + tb_next = None + else: + tb_next = self.tb_next.to_dict() + + code = { + "co_filename": self.tb_frame.f_code.co_filename, + "co_name": self.tb_frame.f_code.co_name, + } + frame = { + "f_globals": self.tb_frame.f_globals, + "f_code": code, + "f_lineno": self.tb_frame.f_lineno, + } + return { + "tb_frame": frame, + "tb_lineno": self.tb_lineno, + "tb_next": tb_next, + } + + to_dict = as_dict + + @classmethod + def from_dict(cls, dct): + """ + Creates an instance from a dictionary with the same structure as ``.as_dict()`` returns. 
+ """ + if dct["tb_next"]: + tb_next = cls.from_dict(dct["tb_next"]) + else: + tb_next = None + + code = _AttrDict( + co_filename=dct["tb_frame"]["f_code"]["co_filename"], + co_name=dct["tb_frame"]["f_code"]["co_name"], + ) + frame = _AttrDict( + f_globals=dct["tb_frame"]["f_globals"], + f_code=code, + f_lineno=dct["tb_frame"]["f_lineno"], + ) + tb = _AttrDict( + tb_frame=frame, + tb_lineno=dct["tb_lineno"], + tb_next=tb_next, + ) + return cls(tb) + + @classmethod + def from_string(cls, string, strict=True): + """ + Creates an instance by parsing a stacktrace. Strict means that parsing stops when lines are not indented by at least two spaces + anymore. + """ + frames = [] + header = strict + + for line in string.splitlines(): + line = line.rstrip() + if header: + if line == "Traceback (most recent call last):": + header = False + continue + frame_match = FRAME_RE.match(line) + if frame_match: + frames.append(frame_match.groupdict()) + elif line.startswith(" "): + pass + elif strict: + break # traceback ended + + if frames: + previous = None + for frame in reversed(frames): + previous = _AttrDict( + frame, + tb_frame=_AttrDict( + frame, + f_globals=_AttrDict( + __file__=frame["co_filename"], + __name__="?", + ), + f_code=_AttrDict(frame), + f_lineno=int(frame["tb_lineno"]), + ), + tb_next=previous, + ) + return cls(previous) + else: + raise TracebackParseError("Could not find any frames in %r." % string) diff --git a/python/xorbits/_mars/lib/tblib/cpython.py b/python/xorbits/_mars/lib/tblib/cpython.py new file mode 100644 index 000000000..06d898364 --- /dev/null +++ b/python/xorbits/_mars/lib/tblib/cpython.py @@ -0,0 +1,83 @@ +""" +Taken verbatim from Jinja2. +https://github.com/mitsuhiko/jinja2/blob/master/jinja2/debug.py#L267 +""" +import platform +import sys + + +def _init_ugly_crap(): + """This function implements a few ugly things so that we can patch the + traceback objects. The function returned allows resetting `tb_next` on + any python traceback object. 
Do not attempt to use this on non cpython + interpreters + """ + import ctypes + from types import TracebackType + + # figure out side of _Py_ssize_t + if hasattr(ctypes.pythonapi, "Py_InitModule4_64"): + _Py_ssize_t = ctypes.c_int64 + else: + _Py_ssize_t = ctypes.c_int + + # regular python + class _PyObject(ctypes.Structure): + pass + + _PyObject._fields_ = [ + ("ob_refcnt", _Py_ssize_t), + ("ob_type", ctypes.POINTER(_PyObject)), + ] + + # python with trace + if hasattr(sys, "getobjects"): + + class _PyObject(ctypes.Structure): + pass + + _PyObject._fields_ = [ + ("_ob_next", ctypes.POINTER(_PyObject)), + ("_ob_prev", ctypes.POINTER(_PyObject)), + ("ob_refcnt", _Py_ssize_t), + ("ob_type", ctypes.POINTER(_PyObject)), + ] + + class _Traceback(_PyObject): + pass + + _Traceback._fields_ = [ + ("tb_next", ctypes.POINTER(_Traceback)), + ("tb_frame", ctypes.POINTER(_PyObject)), + ("tb_lasti", ctypes.c_int), + ("tb_lineno", ctypes.c_int), + ] + + def tb_set_next(tb, next): + """Set the tb_next attribute of a traceback object.""" + if not ( + isinstance(tb, TracebackType) + and (next is None or isinstance(next, TracebackType)) + ): + raise TypeError("tb_set_next arguments must be traceback objects") + obj = _Traceback.from_address(id(tb)) + if tb.tb_next is not None: + old = _Traceback.from_address(id(tb.tb_next)) + old.ob_refcnt -= 1 + if next is None: + obj.tb_next = ctypes.POINTER(_Traceback)() + else: + next = _Traceback.from_address(id(next)) + next.ob_refcnt += 1 + obj.tb_next = ctypes.pointer(next) + + return tb_set_next + + +tb_set_next = None +try: + if platform.python_implementation() == "CPython": + tb_set_next = _init_ugly_crap() +except Exception as exc: + sys.stderr.write("Failed to initialize cpython support: {!r}".format(exc)) +del _init_ugly_crap diff --git a/python/xorbits/_mars/lib/tblib/decorators.py b/python/xorbits/_mars/lib/tblib/decorators.py new file mode 100644 index 000000000..77778bc97 --- /dev/null +++ b/python/xorbits/_mars/lib/tblib/decorators.py @@ -0,0 +1,44 @@ +import sys +from functools import wraps + +from . import Traceback + + +class Error(object): + def __init__(self, exc_type, exc_value, traceback): + self.exc_type = exc_type + self.exc_value = exc_value + self.__traceback = Traceback(traceback) + + @property + def traceback(self): + return self.__traceback.as_traceback() + + def reraise(self): + raise self.exc_value.with_traceback(self.traceback) from None + + +def return_error(func, exc_type=Exception): + @wraps(func) + def return_exceptions_wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except exc_type: + return Error(*sys.exc_info()) + + return return_exceptions_wrapper + + +returns_error = ( + return_errors +) = returns_errors = return_error # cause I make too many typos + + +@return_error +def apply_with_return_error(args): + """ + args is a tuple where the first argument is a callable. + eg:: + apply_with_return_error((func, 1, 2, 3)) - this will call func(1, 2, 3) + """ + return args[0](*args[1:]) diff --git a/python/xorbits/_mars/lib/tblib/pickling_support.py b/python/xorbits/_mars/lib/tblib/pickling_support.py new file mode 100644 index 000000000..995f4c620 --- /dev/null +++ b/python/xorbits/_mars/lib/tblib/pickling_support.py @@ -0,0 +1,91 @@ +import sys +from types import TracebackType + +from . import Frame +from . 
import Traceback + +if sys.version_info.major >= 3: + import copyreg +else: + import copy_reg as copyreg + + +def unpickle_traceback(tb_frame, tb_lineno, tb_next): + ret = object.__new__(Traceback) + ret.tb_frame = tb_frame + ret.tb_lineno = tb_lineno + ret.tb_next = tb_next + return ret.as_traceback() + + +def pickle_traceback(tb): + return unpickle_traceback, ( + Frame(tb.tb_frame), + tb.tb_lineno, + tb.tb_next and Traceback(tb.tb_next), + ) + + +def unpickle_exception(func, args, cause, tb): + inst = func(*args) + inst.__cause__ = cause + inst.__traceback__ = tb + return inst + + +def pickle_exception(obj): + # All exceptions, unlike generic Python objects, define __reduce_ex__ + # __reduce_ex__(4) should be no different from __reduce_ex__(3). + # __reduce_ex__(5) could bring benefits in the unlikely case the exception + # directly contains buffers, but PickleBuffer objects will cause a crash when + # running on protocol=4, and there's no clean way to figure out the current + # protocol from here. Note that any object returned by __reduce_ex__(3) will + # still be pickled with protocol 5 if pickle.dump() is running with it. + rv = obj.__reduce_ex__(3) + if isinstance(rv, str): + raise TypeError("str __reduce__ output is not supported") + assert isinstance(rv, tuple) and len(rv) >= 2 + + return (unpickle_exception, rv[:2] + (obj.__cause__, obj.__traceback__)) + rv[2:] + + +def _get_subclasses(cls): + # Depth-first traversal of all direct and indirect subclasses of cls + to_visit = [cls] + while to_visit: + this = to_visit.pop() + yield this + to_visit += list(this.__subclasses__()) + + +def install(*exc_classes_or_instances): + copyreg.pickle(TracebackType, pickle_traceback) + + if sys.version_info.major < 3: + # Dummy decorator? + if len(exc_classes_or_instances) == 1: + exc = exc_classes_or_instances[0] + if isinstance(exc, type) and issubclass(exc, BaseException): + return exc + return + + if not exc_classes_or_instances: + for exception_cls in _get_subclasses(BaseException): + copyreg.pickle(exception_cls, pickle_exception) + return + + for exc in exc_classes_or_instances: + if isinstance(exc, BaseException): + while exc is not None: + copyreg.pickle(type(exc), pickle_exception) + exc = exc.__cause__ + elif isinstance(exc, type) and issubclass(exc, BaseException): + copyreg.pickle(exc, pickle_exception) + # Allow using @install as a decorator for Exception classes + if len(exc_classes_or_instances) == 1: + return exc + else: + raise TypeError( + "Expected subclasses or instances of BaseException, got %s" + % (type(exc)) + ) diff --git a/python/xorbits/_mars/lib/tests/__init__.py b/python/xorbits/_mars/lib/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
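A minimal usage sketch for the vendored tblib pickling support added above, kept outside the file hunks. It only exercises functions defined in pickling_support.py; the import path assumes the package introduced by this change is importable as xorbits._mars.

import pickle
import sys

from xorbits._mars.lib.tblib import pickling_support  # assumed import path

# Register picklers for TracebackType and all BaseException subclasses.
pickling_support.install()

try:
    1 / 0
except ZeroDivisionError:
    exc = sys.exc_info()[1]

# The traceback is serialized alongside the exception and rebuilt on load,
# so remote tracebacks can be rendered or re-raised locally.
restored = pickle.loads(pickle.dumps(exc))
assert restored.__traceback__ is not None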
diff --git a/python/xorbits/_mars/lib/tests/test_lib.py b/python/xorbits/_mars/lib/tests/test_lib.py new file mode 100644 index 000000000..56ab750c1 --- /dev/null +++ b/python/xorbits/_mars/lib/tests/test_lib.py @@ -0,0 +1,133 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle +import sys + +import numpy as np +import pandas as pd + +from ...tests.core import assert_groupby_equal +from ...utils import calc_data_size, estimate_pandas_size +from ..groupby_wrapper import wrapped_groupby +from ..tbcode import dump_traceback_code, load_traceback_code + + +def test_groupby_wrapper(): + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + }, + index=pd.MultiIndex.from_tuples([(i // 4, i) for i in range(8)]), + ) + + conv_func = lambda x: pickle.loads(pickle.dumps(x)) + + grouped = conv_func(wrapped_groupby(df, level=0)) + assert_groupby_equal(grouped, df.groupby(level=0)) + assert grouped.shape == (8, 4) + assert grouped.is_frame is True + assert sys.getsizeof(grouped) > sys.getsizeof(grouped.groupby_obj) + assert calc_data_size(grouped) > sys.getsizeof(grouped.groupby_obj) + assert grouped.estimate_size() > estimate_pandas_size(grouped.groupby_obj) + + grouped = conv_func(wrapped_groupby(df, level=0).C) + assert_groupby_equal(grouped, df.groupby(level=0).C) + assert grouped.shape == (8,) + assert grouped.is_frame is False + + grouped = conv_func(wrapped_groupby(df, "B")) + assert_groupby_equal(grouped, df.groupby("B")) + assert grouped.shape == (8, 4) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, "B").C) + assert_groupby_equal(grouped, df.groupby("B").C, with_selection=True) + assert grouped.shape == (8,) + assert grouped.is_frame is False + + grouped = conv_func(wrapped_groupby(df, "B")[["C", "D"]]) + assert_groupby_equal(grouped, df.groupby("B")[["C", "D"]], with_selection=True) + assert grouped.shape == (8, 2) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, ["B", "C"])) + assert_groupby_equal(grouped, df.groupby(["B", "C"])) + assert grouped.shape == (8, 4) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, ["B", "C"]).C) + assert_groupby_equal(grouped, df.groupby(["B", "C"]).C, with_selection=True) + assert grouped.shape == (8,) + assert grouped.is_frame is False + + grouped = conv_func(wrapped_groupby(df, ["B", "C"])[["A", "D"]]) + assert_groupby_equal( + grouped, df.groupby(["B", "C"])[["A", "D"]], with_selection=True + ) + assert grouped.shape == (8, 2) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, ["B", "C"])[["C", "D"]]) + assert_groupby_equal( + grouped, df.groupby(["B", "C"])[["C", "D"]], with_selection=True + ) + assert grouped.shape == (8, 2) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, lambda x: x[-1] % 2)) + 
assert_groupby_equal(grouped, df.groupby(lambda x: x[-1] % 2), with_selection=True) + assert grouped.shape == (8, 4) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, lambda x: x[-1] % 2).C) + assert_groupby_equal( + grouped, df.groupby(lambda x: x[-1] % 2).C, with_selection=True + ) + assert grouped.shape == (8,) + assert grouped.is_frame is False + + grouped = conv_func(wrapped_groupby(df, lambda x: x[-1] % 2)[["C", "D"]]) + assert_groupby_equal( + grouped, df.groupby(lambda x: x[-1] % 2)[["C", "D"]], with_selection=True + ) + assert grouped.shape == (8, 2) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df.B, lambda x: x[-1] % 2)) + assert_groupby_equal( + grouped, df.B.groupby(lambda x: x[-1] % 2), with_selection=True + ) + assert grouped.shape == (8,) + assert grouped.is_frame is False + + +def test_traceback_code(): + def get_tb(): + try: + raise ValueError + except ValueError: + return sys.exc_info()[-1] + + tb = get_tb() + frags = dump_traceback_code(tb) + + target_dict = dict() + load_traceback_code(frags, target_dict) + code_lines = target_dict[__file__][2] + assert "raise" in code_lines[tb.tb_lineno - 1] + assert len([line for line in code_lines if line]) == 5 diff --git a/python/xorbits/_mars/lib/tests/test_nvutils.py b/python/xorbits/_mars/lib/tests/test_nvutils.py new file mode 100644 index 000000000..a514a645d --- /dev/null +++ b/python/xorbits/_mars/lib/tests/test_nvutils.py @@ -0,0 +1,38 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...tests.core import require_cupy +from ...utils import lazy_import +from .. import nvutils + + +cupy = lazy_import("cupy") + + +@require_cupy +def test_nvutil(): + device_info = nvutils.get_device_info(0) + assert device_info.uuid is not None + + # run something + _ = cupy.ones(10) + + handle = nvutils.get_handle_by_index(0) + assert nvutils._running_process_matches(handle) + assert nvutils.get_cuda_context().has_context + + info = nvutils.get_index_and_uuid(0) + info2 = nvutils.get_index_and_uuid(info.uuid) + assert info.device_index == info2.device_index + assert info.uuid == info2.uuid diff --git a/python/xorbits/_mars/lib/uhashring/__init__.py b/python/xorbits/_mars/lib/uhashring/__init__.py new file mode 100644 index 000000000..8cc5d9ba1 --- /dev/null +++ b/python/xorbits/_mars/lib/uhashring/__init__.py @@ -0,0 +1,3 @@ +from .ring import HashRing + +__all__ = ["HashRing", "monkey"] diff --git a/python/xorbits/_mars/lib/uhashring/monkey.py b/python/xorbits/_mars/lib/uhashring/monkey.py new file mode 100644 index 000000000..bbe6edf11 --- /dev/null +++ b/python/xorbits/_mars/lib/uhashring/monkey.py @@ -0,0 +1,40 @@ +from . import HashRing + +__all__ = ["patch_memcache"] + + +def patch_memcache(): + """Monkey patch python-memcached to implement our consistent hashring + in its node selection and operations. 
+ """ + + def _init(self, servers, *k, **kw): + self._old_init(servers, *k, **kw) + + nodes = {} + for server in self.servers: + conf = { + "hostname": server.ip, + "instance": server, + "port": server.port, + "weight": server.weight, + } + nodes[server.ip] = conf + self.uhashring = HashRing(nodes) + + def _get_server(self, key): + if isinstance(key, tuple): + return self._old_get_server(key) + + for i in range(self._SERVER_RETRIES): + for node in self.uhashring.range(key): + if node["instance"].connect(): + return node["instance"], key + + return None, None + + memcache = __import__("memcache") + memcache.Client._old_get_server = memcache.Client._get_server + memcache.Client._old_init = memcache.Client.__init__ + memcache.Client.__init__ = _init + memcache.Client._get_server = _get_server diff --git a/python/xorbits/_mars/lib/uhashring/ring.py b/python/xorbits/_mars/lib/uhashring/ring.py new file mode 100644 index 000000000..8fc19c03e --- /dev/null +++ b/python/xorbits/_mars/lib/uhashring/ring.py @@ -0,0 +1,341 @@ +from bisect import bisect + +from .ring_ketama import KetamaRing +from .ring_meta import MetaRing + + +class HashRing(object): + """Implement a consistent hashing ring.""" + + def __init__(self, nodes=[], **kwargs): + """Create a new HashRing given the implementation. + + :param nodes: nodes used to create the continuum (see doc for format). + :param hash_fn: use this callable function to hash keys, can be set to + 'ketama' to use the ketama compatible implementation. + :param vnodes: default number of vnodes per node. + :param weight_fn: use this function to calculate the node's weight. + """ + hash_fn = kwargs.get("hash_fn", None) + vnodes = kwargs.get("vnodes", None) + weight_fn = kwargs.get("weight_fn", None) + + if hash_fn == "ketama": + if vnodes is None: + vnodes = 40 + self.runtime = KetamaRing() + else: + if vnodes is None: + vnodes = 160 + self.runtime = MetaRing(hash_fn) + + self._default_vnodes = vnodes + self.hashi = self.runtime.hashi + + if weight_fn and not hasattr(weight_fn, "__call__"): + raise TypeError("weight_fn should be a callable function") + self._weight_fn = weight_fn + + if self._configure_nodes(nodes): + self.runtime._create_ring(self.runtime._nodes.items()) + + def _configure_nodes(self, nodes): + """Parse and set up the given nodes. + + :param nodes: nodes used to create the continuum (see doc for format). 
+ """ + if isinstance(nodes, str): + nodes = [nodes] + elif not isinstance(nodes, (dict, list)): + raise ValueError( + f"nodes configuration should be a list or a dict, got {type(nodes)}" + ) + + conf_changed = False + for node in nodes: + conf = { + "hostname": node, + "instance": None, + "nodename": node, + "port": None, + "vnodes": self._default_vnodes, + "weight": 1, + } + current_conf = self.runtime._nodes.get(node, {}) + nodename = node + # new node, trigger a ring update + if not current_conf: + conf_changed = True + # complex config + if isinstance(nodes, dict): + node_conf = nodes[node] + if isinstance(node_conf, int): + conf["weight"] = node_conf + elif isinstance(node_conf, dict): + for k, v in node_conf.items(): + if k in conf: + conf[k] = v + # changing those config trigger a ring update + if k in ["nodename", "vnodes", "weight"]: + if current_conf.get(k) != v: + conf_changed = True + else: + raise ValueError( + "node configuration should be a dict or an int," + f" got {type(node_conf)}" + ) + if self._weight_fn: + conf["weight"] = self._weight_fn(**conf) + # changing the weight of a node trigger a ring update + if current_conf.get("weight") != conf["weight"]: + conf_changed = True + self.runtime._nodes[nodename] = conf + return conf_changed + + def __delitem__(self, nodename): + """Remove the given node. + + :param nodename: the node name. + """ + self.runtime._remove_node(nodename) + + remove_node = __delitem__ + + def __getitem__(self, key): + """Returns the instance of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "instance") + + get_node_instance = __getitem__ + + def __setitem__(self, nodename, conf={"weight": 1}): + """Add the given node with its associated configuration. + + :param nodename: the node name. + :param conf: the node configuration. + """ + if self._configure_nodes({nodename: conf}): + self.runtime._create_ring([(nodename, self._nodes[nodename])]) + + add_node = __setitem__ + + def _get_pos(self, key): + """Get the index of the given key in the sorted key list. + + We return the position with the nearest hash based on + the provided key unless we reach the end of the continuum/ring + in which case we return the 0 (beginning) index position. + + :param key: the key to hash and look for. + """ + p = bisect(self.runtime._keys, self.hashi(key)) + if p == len(self.runtime._keys): + return 0 + else: + return p + + def _get(self, key, what): + """Generic getter magic method. + + The node with the nearest but not less hash value is returned. + + :param key: the key to look for. + :param what: the information to look for in, allowed values: + - instance (default): associated node instance + - nodename: node name + - pos: index of the given key in the ring + - tuple: ketama compatible (pos, name) tuple + - weight: node weight + """ + if not self.runtime._ring: + return None + + pos = self._get_pos(key) + if what == "pos": + return pos + + nodename = self.runtime._ring[self.runtime._keys[pos]] + if what in ["hostname", "instance", "port", "weight"]: + return self.runtime._nodes[nodename][what] + elif what == "dict": + return self.runtime._nodes[nodename] + elif what == "nodename": + return nodename + elif what == "tuple": + return (self.runtime._keys[pos], nodename) + + def get(self, key): + """Returns the node object dict matching the hashed key. + + :param key: the key to look for. 
+ """ + return self._get(key, "dict") + + def get_instances(self): + """Returns a list of the instances of all the configured nodes.""" + return [ + c.get("instance") for c in self.runtime._nodes.values() if c.get("instance") + ] + + def get_key(self, key): + """Alias of ketama hashi method, returns the hash of the given key. + + This method is present for hash_ring compatibility. + + :param key: the key to look for. + """ + return self.hashi(key) + + def get_node(self, key): + """Returns the node name of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "nodename") + + def get_node_hostname(self, key): + """Returns the hostname of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "hostname") + + def get_node_port(self, key): + """Returns the port of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "port") + + def get_node_pos(self, key): + """Returns the index position of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "pos") + + def get_node_weight(self, key): + """Returns the weight of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "weight") + + def get_nodes(self): + """Returns a list of the names of all the configured nodes.""" + return self.runtime._nodes.keys() + + def get_points(self): + """Returns a ketama compatible list of (position, nodename) tuples.""" + return [(k, self.runtime._ring[k]) for k in self.runtime._keys] + + def get_server(self, key): + """Returns a ketama compatible (position, nodename) tuple. + + :param key: the key to look for. + """ + return self._get(key, "tuple") + + def iterate_nodes(self, key, distinct=True): + """hash_ring compatibility implementation. + + Given a string key it returns the nodes as a generator that + can hold the key. + The generator iterates one time through the ring + starting at the correct position. + if `distinct` is set, then the nodes returned will be unique, + i.e. no virtual copies will be returned. + """ + if not self.runtime._ring: + yield None + else: + for node in self.range(key, unique=distinct): + yield node["nodename"] + + def print_continuum(self): + """Prints a ketama compatible continuum report.""" + numpoints = len(self.runtime._keys) + if numpoints: + print(f"Numpoints in continuum: {numpoints}") + else: + print("Continuum empty") + for p in self.get_points(): + point, node = p + print(f"{node} ({point})") + + def range(self, key, size=None, unique=True): + """Returns a generator of nodes' configuration available + in the continuum/ring. + + :param key: the key to look for. + :param size: limit the list to at most this number of nodes. + :param unique: a node may only appear once in the list (default True). 
+ """ + all_nodes = set() + if unique: + size = size or len(self.runtime._nodes) + else: + all_nodes = [] + + pos = self._get_pos(key) + for key in self.runtime._keys[pos:]: + nodename = self.runtime._ring[key] + if unique: + if nodename in all_nodes: + continue + all_nodes.add(nodename) + else: + all_nodes.append(nodename) + yield self.runtime._nodes[nodename] + if len(all_nodes) == size: + break + else: + for i, key in enumerate(self.runtime._keys): + if i < pos: + nodename = self.runtime._ring[key] + if unique: + if nodename in all_nodes: + continue + all_nodes.add(nodename) + else: + all_nodes.append(nodename) + yield self.runtime._nodes[nodename] + if len(all_nodes) == size: + break + + def regenerate(self): + self.runtime._create_ring(self.runtime._nodes.items()) + + @property + def conf(self): + return self.runtime._nodes + + nodes = conf + + @property + def distribution(self): + return self.runtime._distribution + + @property + def ring(self): + return self.runtime._ring + + continuum = ring + + @property + def size(self): + return len(self.runtime._ring) + + @property + def _ring(self): + return self.runtime._ring + + @property + def _nodes(self): + return self.runtime._nodes + + @property + def _keys(self): + return self.runtime._keys diff --git a/python/xorbits/_mars/lib/uhashring/ring_ketama.py b/python/xorbits/_mars/lib/uhashring/ring_ketama.py new file mode 100644 index 000000000..03e61e9c1 --- /dev/null +++ b/python/xorbits/_mars/lib/uhashring/ring_ketama.py @@ -0,0 +1,81 @@ +from bisect import insort +from collections import Counter +from hashlib import md5 +from sys import version_info + + +class KetamaRing(object): + """Implement a ketama compatible consistent hashing ring.""" + + def __init__(self): + """Create a new HashRing.""" + self._distribution = Counter() + self._keys = [] + self._nodes = {} + self._replicas = 4 + self._ring = {} + + if version_info >= (3,): + self._listbytes = lambda x: x + + def hashi(self, key, replica=0): + """Returns a ketama compatible hash from the given key.""" + dh = self._listbytes(md5(str(key).encode("utf-8")).digest()) + rd = replica * 4 + return (dh[3 + rd] << 24) | (dh[2 + rd] << 16) | (dh[1 + rd] << 8) | dh[0 + rd] + + def _hashi_weight_generator(self, node_name, node_conf): + """Calculate the weight factor of the given node and + yield its hash key for every configured replica. + + :param node_name: the node name. + """ + ks = ( + node_conf["vnodes"] * len(self._nodes) * node_conf["weight"] + ) // self._weight_sum + for w in range(0, ks): + w_node_name = f"{node_name}-{w}" + for i in range(0, self._replicas): + yield self.hashi(w_node_name, replica=i) + + @staticmethod + def _listbytes(data): + """Python 2 compatible int iterator from str. + + :param data: the string to int iterate upon. + """ + return map(ord, data) + + def _create_ring(self, nodes): + """Generate a ketama compatible continuum/ring.""" + _weight_sum = 0 + for node_conf in self._nodes.values(): + _weight_sum += node_conf["weight"] + self._weight_sum = _weight_sum + + _distribution = Counter() + _keys = [] + _ring = {} + for node_name, node_conf in self._nodes.items(): + for h in self._hashi_weight_generator(node_name, node_conf): + _ring[h] = node_name + insort(_keys, h) + _distribution[node_name] += 1 + self._distribution = _distribution + self._keys = _keys + self._ring = _ring + + def _remove_node(self, node_name): + """Remove the given node from the continuum/ring. + + :param node_name: the node name. 
+ """ + try: + self._nodes.pop(node_name) + except Exception: + raise KeyError( + f"node '{node_name}' not found, " + f"available nodes: {list(self._nodes.keys())}" + ) + else: + self._create_ring(self._nodes) diff --git a/python/xorbits/_mars/lib/uhashring/ring_meta.py b/python/xorbits/_mars/lib/uhashring/ring_meta.py new file mode 100644 index 000000000..33bc3d5b2 --- /dev/null +++ b/python/xorbits/_mars/lib/uhashring/ring_meta.py @@ -0,0 +1,52 @@ +from collections import Counter +from hashlib import md5 + + +class MetaRing(object): + """Implement a tunable consistent hashing ring.""" + + def __init__(self, hash_fn): + """Create a new HashRing. + + :param hash_fn: use this callable function to hash keys. + """ + self._distribution = Counter() + self._keys = [] + self._nodes = {} + self._ring = {} + + if hash_fn and not hasattr(hash_fn, "__call__"): + raise TypeError("hash_fn should be a callable function") + self._hash_fn = hash_fn or ( + lambda key: int(md5(str(key).encode("utf-8")).hexdigest(), 16) + ) + + def hashi(self, key): + """Returns an integer derived from the md5 hash of the given key.""" + return self._hash_fn(key) + + def _create_ring(self, nodes): + """Generate a ketama compatible continuum/ring.""" + for node_name, node_conf in nodes: + for w in range(0, node_conf["vnodes"] * node_conf["weight"]): + self._distribution[node_name] += 1 + self._ring[self.hashi(f"{node_name}-{w}")] = node_name + self._keys = sorted(self._ring.keys()) + + def _remove_node(self, node_name): + """Remove the given node from the continuum/ring. + + :param node_name: the node name. + """ + try: + node_conf = self._nodes.pop(node_name) + except Exception: + raise KeyError( + f"node '{node_name}' not found, " + f"available nodes: {list(self._nodes.keys())}" + ) + else: + self._distribution.pop(node_name) + for w in range(0, node_conf["vnodes"] * node_conf["weight"]): + del self._ring[self.hashi(f"{node_name}-{w}")] + self._keys = sorted(self._ring.keys()) diff --git a/python/xorbits/_mars/lib/version.py b/python/xorbits/_mars/lib/version.py new file mode 100644 index 000000000..32457773a --- /dev/null +++ b/python/xorbits/_mars/lib/version.py @@ -0,0 +1,606 @@ +# File merged from these files: +# setuptools/pkg_resources/_vendor/packaging/_structures.py +# setuptools/pkg_resources/_vendor/packaging/_typing.py +# setuptools/pkg_resources/_vendor/packaging/version.py +# Originally released under Apache License, Version 2.0, and the BSD License. + +# This file is dual licensed under the terms of the Apache License, Version +# 2.0, and the BSD License. See the LICENSE file in the root of this repository +# for complete details. + +# Copyright Jason R. Coombs +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +import collections +import itertools +import re +import warnings +from typing import Callable, Iterator, List, Optional, SupportsInt, Tuple, Union + +__all__ = ["parse", "Version", "LegacyVersion", "InvalidVersion", "VERSION_PATTERN"] + + +class InfinityType(object): + def __repr__(self): + # type: () -> str + return "Infinity" + + def __hash__(self): + # type: () -> int + return hash(repr(self)) + + def __lt__(self, other): + # type: (object) -> bool + return False + + def __le__(self, other): + # type: (object) -> bool + return False + + def __eq__(self, other): + # type: (object) -> bool + return isinstance(other, self.__class__) + + def __ne__(self, other): + # type: (object) -> bool + return not isinstance(other, self.__class__) + + def __gt__(self, other): + # type: (object) -> bool + return True + + def __ge__(self, other): + # type: (object) -> bool + return True + + def __neg__(self): + # type: (object) -> NegativeInfinityType + return NegativeInfinity + + +Infinity = InfinityType() + + +class NegativeInfinityType(object): + def __repr__(self): + # type: () -> str + return "-Infinity" + + def __hash__(self): + # type: () -> int + return hash(repr(self)) + + def __lt__(self, other): + # type: (object) -> bool + return True + + def __le__(self, other): + # type: (object) -> bool + return True + + def __eq__(self, other): + # type: (object) -> bool + return isinstance(other, self.__class__) + + def __ne__(self, other): + # type: (object) -> bool + return not isinstance(other, self.__class__) + + def __gt__(self, other): + # type: (object) -> bool + return False + + def __ge__(self, other): + # type: (object) -> bool + return False + + def __neg__(self): + # type: (object) -> InfinityType + return Infinity + + +NegativeInfinity = NegativeInfinityType() + + +InfiniteTypes = Union[InfinityType, NegativeInfinityType] +PrePostDevType = Union[InfiniteTypes, Tuple[str, int]] +SubLocalType = Union[InfiniteTypes, int, str] +LocalType = Union[ + NegativeInfinityType, + Tuple[ + Union[ + SubLocalType, + Tuple[SubLocalType, str], + Tuple[NegativeInfinityType, SubLocalType], + ], + ..., + ], +] +CmpKey = Tuple[ + int, Tuple[int, ...], PrePostDevType, PrePostDevType, PrePostDevType, LocalType +] +LegacyCmpKey = Tuple[int, Tuple[str, ...]] +VersionComparisonMethod = Callable[ + [Union[CmpKey, LegacyCmpKey], Union[CmpKey, LegacyCmpKey]], bool +] + +_Version = collections.namedtuple( + "_Version", ["epoch", "release", "dev", "pre", "post", "local"] +) + + +def parse(version: str) -> Union["LegacyVersion", "Version"]: + """ + Parse the given version string and return either a :class:`Version` object + or a :class:`LegacyVersion` object depending on if the given version is + a valid PEP 440 version or a legacy version. + """ + try: + return Version(version) + except InvalidVersion: + return LegacyVersion(version) + + +class InvalidVersion(ValueError): + """ + An invalid version was found, users should refer to PEP 440. + """ + + +class _BaseVersion: + _key: Union[CmpKey, LegacyCmpKey] + + def __hash__(self) -> int: + return hash(self._key) + + # Please keep the duplicated `isinstance` check + # in the six comparisons hereunder + # unless you find a way to avoid adding overhead function calls. 
+ def __lt__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key < other._key + + def __le__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key <= other._key + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key == other._key + + def __ge__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key >= other._key + + def __gt__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key > other._key + + def __ne__(self, other: object) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key != other._key + + +class LegacyVersion(_BaseVersion): + def __init__(self, version: str) -> None: + self._version = str(version) + self._key = _legacy_cmpkey(self._version) + + warnings.warn( + "Creating a LegacyVersion has been deprecated and will be " + "removed in the next major release", + DeprecationWarning, + ) + + def __str__(self) -> str: + return self._version + + def __repr__(self) -> str: + return f"" + + @property + def public(self) -> str: + return self._version + + @property + def base_version(self) -> str: + return self._version + + @property + def epoch(self) -> int: + return -1 + + @property + def release(self) -> None: + return None + + @property + def pre(self) -> None: + return None + + @property + def post(self) -> None: + return None + + @property + def dev(self) -> None: + return None + + @property + def local(self) -> None: + return None + + @property + def is_prerelease(self) -> bool: + return False + + @property + def is_postrelease(self) -> bool: + return False + + @property + def is_devrelease(self) -> bool: + return False + + +_legacy_version_component_re = re.compile(r"(\d+ | [a-z]+ | \.| -)", re.VERBOSE) + +_legacy_version_replacement_map = { + "pre": "c", + "preview": "c", + "-": "final-", + "rc": "c", + "dev": "@", +} + + +def _parse_version_parts(s: str) -> Iterator[str]: + for part in _legacy_version_component_re.split(s): + part = _legacy_version_replacement_map.get(part, part) + + if not part or part == ".": + continue + + if part[:1] in "0123456789": + # pad for numeric comparison + yield part.zfill(8) + else: + yield "*" + part + + # ensure that alpha/beta/candidate are before final + yield "*final" + + +def _legacy_cmpkey(version: str) -> LegacyCmpKey: + # We hardcode an epoch of -1 here. A PEP 440 version can only have a epoch + # greater than or equal to 0. This will effectively put the LegacyVersion, + # which uses the defacto standard originally implemented by setuptools, + # as before all PEP 440 versions. + epoch = -1 + + # This scheme is taken from pkg_resources.parse_version setuptools prior to + # it's adoption of the packaging library. + parts: List[str] = [] + for part in _parse_version_parts(version.lower()): + if part.startswith("*"): + # remove "-" before a prerelease tag + if part < "*final": + while parts and parts[-1] == "*final-": + parts.pop() + + # remove trailing zeros from each series of numeric parts + while parts and parts[-1] == "00000000": + parts.pop() + + parts.append(part) + + return epoch, tuple(parts) + + +# Deliberately not anchored to the start and end of the string, to make it +# easier for 3rd party code to reuse +VERSION_PATTERN = r""" + v? 
+    (?:
+        (?:(?P<epoch>[0-9]+)!)?                           # epoch
+        (?P<release>[0-9]+(?:\.[0-9]+)*)                  # release segment
+        (?P<pre>                                          # pre-release
+            [-_\.]?
+            (?P<pre_l>(a|b|c|rc|alpha|beta|pre|preview))
+            [-_\.]?
+            (?P<pre_n>[0-9]+)?
+        )?
+        (?P<post>                                         # post release
+            (?:-(?P<post_n1>[0-9]+))
+            |
+            (?:
+                [-_\.]?
+                (?P<post_l>post|rev|r)
+                [-_\.]?
+                (?P<post_n2>[0-9]+)?
+            )
+        )?
+        (?P<dev>                                          # dev release
+            [-_\.]?
+            (?P<dev_l>dev)
+            [-_\.]?
+            (?P<dev_n>[0-9]+)?
+        )?
+    )
+    (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
+"""
+
+
+class Version(_BaseVersion):
+    _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
+
+    def __init__(self, version: str) -> None:
+        # Validate the version and parse it into pieces
+        match = self._regex.search(version)
+        if not match:
+            raise InvalidVersion(f"Invalid version: '{version}'")
+
+        # Store the parsed out pieces of the version
+        self._version = _Version(
+            epoch=int(match.group("epoch")) if match.group("epoch") else 0,
+            release=tuple(int(i) for i in match.group("release").split(".")),
+            pre=_parse_letter_version(match.group("pre_l"), match.group("pre_n")),
+            post=_parse_letter_version(
+                match.group("post_l"), match.group("post_n1") or match.group("post_n2")
+            ),
+            dev=_parse_letter_version(match.group("dev_l"), match.group("dev_n")),
+            local=_parse_local_version(match.group("local")),
+        )
+
+        # Generate a key which will be used for sorting
+        self._key = _cmpkey(
+            self._version.epoch,
+            self._version.release,
+            self._version.pre,
+            self._version.post,
+            self._version.dev,
+            self._version.local,
+        )
+
+    def __repr__(self) -> str:
+        return f""
+
+    def __str__(self) -> str:
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        # Pre-release
+        if self.pre is not None:
+            parts.append("".join(str(x) for x in self.pre))
+
+        # Post-release
+        if self.post is not None:
+            parts.append(f".post{self.post}")
+
+        # Development release
+        if self.dev is not None:
+            parts.append(f".dev{self.dev}")
+
+        # Local version segment
+        if self.local is not None:
+            parts.append(f"+{self.local}")
+
+        return "".join(parts)
+
+    @property
+    def epoch(self) -> int:
+        _epoch: int = self._version.epoch
+        return _epoch
+
+    @property
+    def release(self) -> Tuple[int, ...]:
+        _release: Tuple[int, ...] = self._version.release
+        return _release
+
+    @property
+    def pre(self) -> Optional[Tuple[str, int]]:
+        _pre: Optional[Tuple[str, int]] = self._version.pre
+        return _pre
+
+    @property
+    def post(self) -> Optional[int]:
+        return self._version.post[1] if self._version.post else None
+
+    @property
+    def dev(self) -> Optional[int]:
+        return self._version.dev[1] if self._version.dev else None
+
+    @property
+    def local(self) -> Optional[str]:
+        if self._version.local:
+            return ".".join(str(x) for x in self._version.local)
+        else:
+            return None
+
+    @property
+    def public(self) -> str:
+        return str(self).split("+", 1)[0]
+
+    @property
+    def base_version(self) -> str:
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        return "".join(parts)
+
+    @property
+    def is_prerelease(self) -> bool:
+        return self.dev is not None or self.pre is not None
+
+    @property
+    def is_postrelease(self) -> bool:
+        return self.post is not None
+
+    @property
+    def is_devrelease(self) -> bool:
+        return self.dev is not None
+
+    @property
+    def major(self) -> int:
+        return self.release[0] if len(self.release) >= 1 else 0
+
+    @property
+    def minor(self) -> int:
+        return self.release[1] if len(self.release) >= 2 else 0
+
+    @property
+    def micro(self) -> int:
+        return self.release[2] if len(self.release) >= 3 else 0
+
+
+def _parse_letter_version(
+    letter: str, number: Union[str, bytes, SupportsInt]
+) -> Optional[Tuple[str, int]]:
+    if letter:
+        # We consider there to be an implicit 0 in a pre-release if there is
+        # not a numeral associated with it.
+        if number is None:
+            number = 0
+
+        # We normalize any letters to their lower case form
+        letter = letter.lower()
+
+        # We consider some words to be alternate spellings of other words and
+        # in those cases we want to normalize the spellings to our preferred
+        # spelling.
+        if letter == "alpha":
+            letter = "a"
+        elif letter == "beta":
+            letter = "b"
+        elif letter in ["c", "pre", "preview"]:
+            letter = "rc"
+        elif letter in ["rev", "r"]:
+            letter = "post"
+
+        return letter, int(number)
+    if not letter and number:
+        # We assume if we are given a number, but we are not given a letter
+        # then this is using the implicit post release syntax (e.g. 1.0-1)
+        letter = "post"
+
+        return letter, int(number)
+
+    return None
+
+
+_local_version_separators = re.compile(r"[\._-]")
+
+
+def _parse_local_version(local: str) -> Optional[LocalType]:
+    """
+    Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve").
+    """
+    if local is not None:
+        return tuple(
+            part.lower() if not part.isdigit() else int(part)
+            for part in _local_version_separators.split(local)
+        )
+    return None
+
+
+def _cmpkey(
+    epoch: int,
+    release: Tuple[int, ...],
+    pre: Optional[Tuple[str, int]],
+    post: Optional[Tuple[str, int]],
+    dev: Optional[Tuple[str, int]],
+    local: Optional[Tuple[SubLocalType]],
+) -> CmpKey:
+    # When we compare a release version, we want to compare it with all of the
+    # trailing zeros removed. So we'll reverse the list, drop all the now-leading
+    # zeros until we come to something non-zero, then re-reverse it back into the
+    # correct order and make it a tuple to use as our sorting key.
+    _release = tuple(
+        reversed(list(itertools.dropwhile(lambda x: x == 0, reversed(release))))
+    )
+
+    # We need to "trick" the sorting algorithm to put 1.0.dev0 before 1.0a0.
+    # We'll do this by abusing the pre segment, but we _only_ want to do this
+    # if there is not a pre or a post segment. If we have one of those then
+    # the normal sorting rules will handle this case correctly.
+    if pre is None and post is None and dev is not None:
+        _pre: PrePostDevType = NegativeInfinity
+    # Versions without a pre-release (except as noted above) should sort after
+    # those with one.
+    elif pre is None:
+        _pre = Infinity
+    else:
+        _pre = pre
+
+    # Versions without a post segment should sort before those with one.
+    if post is None:
+        _post: PrePostDevType = NegativeInfinity
+
+    else:
+        _post = post
+
+    # Versions without a development segment should sort after those with one.
+    if dev is None:
+        _dev: PrePostDevType = Infinity
+
+    else:
+        _dev = dev
+
+    if local is None:
+        # Versions without a local segment should sort before those with one.
+        _local: LocalType = NegativeInfinity
+    else:
+        # Versions with a local segment need that segment parsed to implement
+        # the sorting rules in PEP440.
+        # - Alpha numeric segments sort before numeric segments
+        # - Alpha numeric segments sort lexicographically
+        # - Numeric segments sort numerically
+        # - Shorter versions sort before longer versions when the prefixes
+        #   match exactly
+        _local = tuple(
+            (i, "") if isinstance(i, int) else (NegativeInfinity, i) for i in local
+        )
+
+    return epoch, _release, _pre, _post, _dev, _local
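For context, a minimal usage sketch of the vendored version module above; the import path
xorbits._mars.lib.version and the sample version strings are assumptions for illustration only.

    from xorbits._mars.lib.version import Version, parse

    v = Version("1.0.0rc1+local.1")           # parsed into epoch/release/pre/post/dev/local
    assert v.is_prerelease and v.local == "local.1"
    assert v.public == "1.0.0rc1" and v.base_version == "1.0.0"

    # PEP 440 ordering: dev releases sort before pre-releases, which sort before
    # final releases, which sort before post releases of the same release segment.
    assert parse("1.0.dev0") < parse("1.0a0") < parse("1.0") < parse("1.0.post1")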
diff --git a/python/xorbits/_mars/metrics/__init__.py b/python/xorbits/_mars/metrics/__init__.py
new file mode 100644
index 000000000..e53d36497
--- /dev/null
+++ b/python/xorbits/_mars/metrics/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .api import (
+    Metrics,
+    Percentile,
+    init_metrics,
+    record_time_cost_percentile,
+    shutdown_metrics,
+)
diff --git a/python/xorbits/_mars/metrics/api.py b/python/xorbits/_mars/metrics/api.py
new file mode 100644
index 000000000..b258997a1
--- /dev/null
+++ b/python/xorbits/_mars/metrics/api.py
@@ -0,0 +1,292 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import time
+import weakref
+from contextlib import contextmanager
+from enum import Enum
+from queue import PriorityQueue
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
+
+from .backends.console import console_metric
+from .backends.metric import AbstractMetric
+from .backends.prometheus import prometheus_metric
+from .backends.ray import ray_metric
+
+logger = logging.getLogger(__name__)
+
+_init = False
+_metric_backend = "console"
+_backends_cls = {
+    "console": console_metric,
+    "prometheus": prometheus_metric,
+    "ray": ray_metric,
+}
+
+
+_metrics_to_be_initialized = weakref.WeakSet()
+
+
+def init_metrics(backend="console", config: Dict[str, Any] = None):
+    global _init
+    if _init is True:
+        return
+
+    backend = backend or "console"
+    if backend not in _backends_cls:
+        raise NotImplementedError(f"Do not support metric backend {backend}")
+    global _metric_backend
+    _metric_backend = backend
+    if _metric_backend == "prometheus":
+        try:
+            from prometheus_client import start_http_server
+
+            from ..utils import get_next_port
+
+            port = config.get("port", 0) if config else 0
+            port = port or get_next_port()
+            start_http_server(port)
+            logger.warning(
+                "Started prometheus http server on port %d", port
+            )
+        except ImportError:
+            logger.warning(
+                "Failed to start prometheus http server because there is no prometheus_client"
+            )
+    _init = True
+    for m in _metrics_to_be_initialized:
+        cls = getattr(_backends_cls[_metric_backend], m.type)
+        metric = cls(m.name, m.description, m.tag_keys)
+        m.set_metric(metric)
+    logger.info("Finished initialize the metrics of backend: %s.", _metric_backend)
+
+
+def shutdown_metrics():
+    global _metric_backend
+    logger.info("Shutting down metrics of backend: %s.", _metric_backend)
+    _metric_backend = "console"
+    global _init
+    _init = False
+
+
+class _MetricWrapper(AbstractMetric):
+    _metric: AbstractMetric
+    _log_not_init_error: bool
+
+    def __init__(
+        self,
+        name: str,
+        description: str = "",
+        tag_keys: Optional[Tuple[str, ...]] = None,
+        metric_type: str = "Counter",
+    ):
+        self._name = name
+        self._description = description
+        self._tag_keys = tag_keys or tuple()
+        self._type = metric_type
+        self._metric = None
+        self._log_not_init_error = False
+
+    @property
+    def type(self):
+        return self._type
+
+    @property
+    def value(self):
+        assert (
+            self._metric is not None
+        ), "Metric is not initialized, please call `init_metrics()` before using metrics."
+        return self._metric.value
+
+    def set_metric(self, metric):
+        assert metric is not None, "Argument metric is None, please check it."
+        self._metric = metric
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        if self._metric is not None:
+            self._metric.record(value, tags)
+        elif not self._log_not_init_error:
+            self._log_not_init_error = True
+            logger.warning(
+                "Metric is not initialized, please call `init_metrics()` before using metrics."
+            )
+
+
+def gen_metric(func):
+    def wrapper(
+        name, descriptions: str = "", tag_keys: Optional[Tuple[str, ...]] = None
+    ):
+        if _init is True:
+            return func(name, descriptions, tag_keys)
+        else:
+            logger.info(
+                "Metric %s will be initialized when invoking `init_metrics()`.", name
+            )
+            metric = _MetricWrapper(
+                name, descriptions, tag_keys, func.__name__.capitalize()
+            )
+            _metrics_to_be_initialized.add(metric)
+            return metric
+
+    return wrapper
+
+
+class Metrics:
+    """
+    A factory to generate different types of metrics.
+
+    Note:
+        Counter, Meter and Histogram are not thread safe.
+
+    Examples
+    --------
+    >>> c1 = Metrics.counter('counter1', 'A counter')
+    >>> c1.record(1)
+
+    >>> c2 = Metrics.counter('counter2', 'A counter', ('service', 'tenant'))
+    >>> c2.record(1, {'service': 'mars', 'tenant': 'test'})
+
+    >>> g1 = Metrics.gauge('gauge1')
+    >>> g1.record(1)
+
+    >>> m1 = Metrics.meter('meter1')
+    >>> m1.record(1)
+
+    >>> h1 = Metrics.histogram('histogram1')
+    >>> h1.record(1)
+    """
+
+    @staticmethod
+    @gen_metric
+    def counter(
+        name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None
+    ):
+        logger.debug(
+            "Initializing a counter with name: %s, tag keys: %s, backend: %s",
+            name,
+            tag_keys,
+            _metric_backend,
+        )
+        return _backends_cls[_metric_backend].Counter(name, description, tag_keys)
+
+    @staticmethod
+    @gen_metric
+    def gauge(name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None):
+        logger.debug(
+            "Initializing a gauge whose name: %s, tag keys: %s, backend: %s",
+            name,
+            tag_keys,
+            _metric_backend,
+        )
+        return _backends_cls[_metric_backend].Gauge(name, description, tag_keys)
+
+    @staticmethod
+    @gen_metric
+    def meter(name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None):
+        logger.debug(
+            "Initializing a meter whose name: %s, tag keys: %s, backend: %s",
+            name,
+            tag_keys,
+            _metric_backend,
+        )
+        return _backends_cls[_metric_backend].Meter(name, description, tag_keys)
+
+    @staticmethod
+    @gen_metric
+    def histogram(
+        name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None
+    ):
+        logger.debug(
+            "Initializing a histogram whose name: %s, tag keys: %s, backend: %s",
+            name,
+            tag_keys,
+            _metric_backend,
+        )
+        return _backends_cls[_metric_backend].Histogram(name, description, tag_keys)
+
+
+class Percentile:
+    class PercentileType(Enum):
+        P99 = 1
+        P95 = 2
+        P90 = 3
+
+    def __init__(self, capacity: int, window: int, callback: Callable[[float], None]):
+        self._capacity = capacity
+        self._window = window
+        self._callback = callback
+        self._min_heap = PriorityQueue()
+        self._cur_num = 0
+
+        if capacity <= 0 or window <= 0:
+            raise ValueError(
+                f"capacity or window expect to get a positive integer,"
+                f"but capacity got: {capacity} and window got: {window}"
+            )
+
+    def record_data(self, value):
+        store_value = -1 * value
+        if self._min_heap.qsize() < self._capacity:
+            self._min_heap.put(store_value)
+        else:
+            top_value = self._min_heap.get_nowait()
+            store_value = store_value if top_value < store_value else top_value
+            self._min_heap.put(store_value)
+
+        self._cur_num += 1
+        if self._cur_num % self._window == 0:
+            self._callback(-1 * self._min_heap.get_nowait())
+            self._cur_num = 0
+            self._min_heap = PriorityQueue()
+
+    @classmethod
+    def build_p99(cls, callback: Callable[[float], None], window: int):
+        return cls(int(window * 0.01), window, callback)
+
+    @classmethod
+    def build_p95(cls, callback: Callable[[float], None], window: int):
+        return cls(int(window * 0.05), window, callback)
+
+    @classmethod
+    def build_p90(cls, callback: Callable[[float], None], window: int):
+        return cls(int(window * 0.1), window, callback)
+
+
+_percentile_builder = {
+    Percentile.PercentileType.P99: Percentile.build_p99,
+    Percentile.PercentileType.P95: Percentile.build_p95,
+    Percentile.PercentileType.P90: Percentile.build_p90,
+}
+
+
+class PercentileArg(NamedTuple):
+    percentile_type: Percentile.PercentileType
+    callback: Callable[[float], None]
+    window: int
+
+
+@contextmanager
+def record_time_cost_percentile(percentile_args: List[PercentileArg]):
+    percentile_list = [
+        _percentile_builder[percentile_type](callback, window)
+        for percentile_type, callback, window in percentile_args
+    ]
+    st_time = time.time()
+
+    yield
+
+    cost_time = time.time() - st_time
+    for percentile in percentile_list:
+        percentile.record_data(cost_time)
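A minimal sketch of how the API above is meant to be used; the metric names and the
assumption that the package is importable as xorbits._mars.metrics are illustrative.

    from xorbits._mars.metrics import (
        Metrics,
        Percentile,
        init_metrics,
        record_time_cost_percentile,
    )
    from xorbits._mars.metrics.api import PercentileArg

    # Metrics created before init_metrics() are wrapped and only bound to the
    # configured backend once init_metrics() runs.
    requests = Metrics.counter("sketch_requests_total", "Requests seen", ("service",))
    requests.record(1, {"service": "web"})    # warns: metrics not initialized yet

    init_metrics("console")                   # binds all pending metrics to the console backend
    requests.record(1, {"service": "web"})    # now recorded through the console backend

    # Time a block and report the P90 of every 10 recorded costs via a callback.
    p90_costs = []
    with record_time_cost_percentile(
        [PercentileArg(Percentile.PercentileType.P90, p90_costs.append, 10)]
    ):
        sum(range(1000))                      # the timed work; callback fires once 10 costs accumulate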
diff --git a/python/xorbits/_mars/metrics/backends/__init__.py b/python/xorbits/_mars/metrics/backends/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/console/__init__.py b/python/xorbits/_mars/metrics/backends/console/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/console/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/console/console_metric.py b/python/xorbits/_mars/metrics/backends/console/console_metric.py
new file mode 100644
index 000000000..c76ecbbc7
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/console/console_metric.py
@@ -0,0 +1,78 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Dict, Optional, Tuple
+
+from ..metric import (
+    AbstractCounter,
+    AbstractGauge,
+    AbstractHistogram,
+    AbstractMeter,
+    AbstractMetric,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class SimpleMetric:
+    def __init__(
+        self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
+    ):
+        self._name = name
+        self._description = description
+        self._tag_keys = tag_keys
+        self._value = 0
+
+    def update(self, value: float = 1.0, tags: Optional[Dict[str, str]] = None):
+        self._value = value
+        logger.debug(
+            "Reporting metric with name: %s, description: %s, value: %s, tags: %s",
+            self._name,
+            self._description,
+            value,
+            tags,
+        )
+
+    @property
+    def value(self):
+        return self._value
+
+
+class ConsoleMetricMixin(AbstractMetric):
+    @property
+    def value(self):
+        return self._metric.value
+
+    def _init(self):
+        self._metric = SimpleMetric(self._name, self._description, self._tag_keys)
+
+    def _record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        self._metric.update(value, tags)
+
+
+class Counter(ConsoleMetricMixin, AbstractCounter):
+    pass
+
+
+class Gauge(ConsoleMetricMixin, AbstractGauge):
+    pass
+
+
+class Meter(ConsoleMetricMixin, AbstractMeter):
+    pass
+
+
+class Histogram(ConsoleMetricMixin, AbstractHistogram):
+    pass
diff --git a/python/xorbits/_mars/metrics/backends/console/tests/__init__.py b/python/xorbits/_mars/metrics/backends/console/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/console/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/console/tests/test_console_metric.py b/python/xorbits/_mars/metrics/backends/console/tests/test_console_metric.py
new file mode 100644
index 000000000..3e41cc459
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/console/tests/test_console_metric.py
@@ -0,0 +1,63 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..console_metric import Counter, Gauge, Histogram, Meter
+
+
+def test_counter():
+    c = Counter("test_counter", "A test counter", ("service", "tenant"))
+    assert c.name == "test_counter"
+    assert c.description == "A test counter"
+    assert c.tag_keys == ("service", "tenant")
+    assert c.type == "Counter"
+    c.record(1, {"service": "mars", "tenant": "test"})
+    c.record(2, {"service": "mars", "tenant": "test"})
+    assert c.value == 3
+
+
+def test_gauge():
+    g = Gauge("test_gauge", "A test gauge")
+    assert g.name == "test_gauge"
+    assert g.description == "A test gauge"
+    assert g.tag_keys == ()
+    assert g.type == "Gauge"
+    g.record(1)
+    assert g.value == 1
+    g.record(2)
+    assert g.value == 2
+
+
+def test_meter():
+    m = Meter("test_meter")
+    assert m.name == "test_meter"
+    assert m.description == ""
+    assert m.tag_keys == ()
+    assert m.type == "Meter"
+    m.record(1)
+    assert m.value == 0
+    m.record(2001)
+    assert m.value > 0
+
+
+def test_histogram():
+    h = Histogram("test_histogram")
+    assert h.name == "test_histogram"
+    assert h.description == ""
+    assert h.tag_keys == ()
+    assert h.type == "Histogram"
+    h.record(1)
+    assert h.value == 0
+    for i in range(2002):
+        h.record(1)
+    assert h.value > 0
diff --git a/python/xorbits/_mars/metrics/backends/metric.py b/python/xorbits/_mars/metrics/backends/metric.py
new file mode 100644
index 000000000..78f00b63a
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/metric.py
@@ -0,0 +1,145 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from abc import ABC
+from typing import Dict, Optional, Tuple
+
+_THRESHOLD = 2000
+_RECORDED_INTERVAL_SECS = 1
+
+
+class AbstractMetric(ABC):
+    """Base class of metrics."""
+
+    _type = None
+
+    def __init__(
+        self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
+    ):
+        assert isinstance(name, str), "Argument name should be a str"
+        assert isinstance(description, str), "Argument description should be a str"
+        if tag_keys is not None:
+            assert isinstance(tag_keys, tuple) and all(
+                isinstance(tag, str) for tag in tag_keys
+            ), "Argument tag_keys should be a tuple and its elements should be str"
+        self._name = name
+        self._description = description
+        self._tag_keys = tag_keys or tuple()
+        self._init()
+
+    @property
+    def type(self):
+        return self._type
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def description(self):
+        return self._description
+
+    @property
+    def tag_keys(self):
+        return self._tag_keys
+
+    def _init(self):
+        """Some initialization in subclass."""
+        pass
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        """A public method called by users."""
+        pass
+
+    def _record(self, value: float = 1.0, tags: Optional[Dict[str, str]] = None):
+        """An internal method called by record() and should be
+        implemented by different metric backends.
+        """
+        pass
+
+
+class AbstractCounter(AbstractMetric):
+    """A counter records the counts of events."""
+
+    _type = "Counter"
+
+    def __init__(
+        self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
+    ):
+        super().__init__(name, description, tag_keys)
+        self._count = 0
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        self._count += value
+        self._record(self._count, tags)
+
+
+class AbstractGauge(AbstractMetric):
+    """A gauge represents a single numerical value that can be
+    arbitrarily set.
+    """
+
+    _type = "Gauge"
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        self._record(value, tags)
+
+
+class AbstractMeter(AbstractMetric):
+    """A meter measures the rate at which a set of events occur."""
+
+    _type = "Meter"
+
+    def __init__(
+        self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
+    ):
+        super().__init__(name, description, tag_keys)
+        self._count = 0
+        self._last_time = time.time()
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        self._count += value
+        now = time.time()
+        past = now - self._last_time
+        if self._count >= _THRESHOLD or past >= _RECORDED_INTERVAL_SECS:
+            qps = self._count / past
+            self._record(qps, tags)
+            self._last_time = now
+            self._count = 0
+
+
+class AbstractHistogram(AbstractMetric):
+    """A histogram measures the distribution of values in a stream of data."""
+
+    _type = "Histogram"
+
+    def __init__(
+        self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
+    ):
+        super().__init__(name, description, tag_keys)
+        self._data = list()
+        self._last_time = time.time()
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        self._data.append(value)
+        now = time.time()
+        if (
+            len(self._data) >= _THRESHOLD
+            or now - self._last_time >= _RECORDED_INTERVAL_SECS
+        ):
+            avg = sum(self._data) / len(self._data)
+            self._record(avg, tags)
+            self._data.clear()
+            self._last_time = now
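To illustrate the contract above: record() does the aggregation (cumulative count, rate,
or running average) and _record() is the hook a backend implements. A hypothetical backend
might look like the following; the PrintingCounter class is an assumption for illustration.

    from typing import Dict, Optional

    from xorbits._mars.metrics.backends.metric import AbstractCounter


    class PrintingCounter(AbstractCounter):
        """A toy backend that just prints whatever record() hands to _record()."""

        def _record(self, value: float = 1.0, tags: Optional[Dict[str, str]] = None):
            print(f"{self.name} = {value} tags={tags or {}}")


    c = PrintingCounter("sketch_counter", "A counter that prints", ("service",))
    c.record(2, {"service": "web"})   # prints sketch_counter = 2 ...
    c.record(3, {"service": "web"})   # prints sketch_counter = 5 ... (counters are cumulative)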
diff --git a/python/xorbits/_mars/metrics/backends/prometheus/__init__.py b/python/xorbits/_mars/metrics/backends/prometheus/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/prometheus/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/prometheus/prometheus_metric.py b/python/xorbits/_mars/metrics/backends/prometheus/prometheus_metric.py
new file mode 100644
index 000000000..e5eaa38da
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/prometheus/prometheus_metric.py
@@ -0,0 +1,70 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import socket
+from typing import Dict, Optional
+
+from ....utils import lazy_import
+from ..metric import (
+    AbstractCounter,
+    AbstractGauge,
+    AbstractHistogram,
+    AbstractMeter,
+    AbstractMetric,
+)
+
+pc = lazy_import("prometheus_client", rename="pc")
+
+
+class PrometheusMetricMixin(AbstractMetric):
+    def _init(self):
+        # Prometheus metric name must match the regex `[a-zA-Z_:][a-zA-Z0-9_:]*`
+        # `.` is a common character in metrics, so here replace it with `:`
+        self._name = self._name.replace(".", ":")
+        self._tag_keys = self._tag_keys + (
+            "host",
+            "pid",
+        )
+        self._tags = {"host": socket.gethostname(), "pid": os.getpid()}
+        try:
+            self._metric = (
+                pc.Gauge(self._name, self._description, self._tag_keys) if pc else None
+            )
+        except ValueError:  # pragma: no cover
+            self._metric = None
+
+    def _record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        if self._metric:
+            if tags is not None:
+                tags.update(self._tags)
+            else:
+                tags = self._tags
+            self._metric.labels(**tags).set(value)
+
+
+class Counter(PrometheusMetricMixin, AbstractCounter):
+    pass
+
+
+class Gauge(PrometheusMetricMixin, AbstractGauge):
+    pass
+
+
+class Meter(PrometheusMetricMixin, AbstractMeter):
+    pass
+
+
+class Histogram(PrometheusMetricMixin, AbstractHistogram):
+    pass
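A sketch of enabling the backend above (requires prometheus_client; the port 9090 and the
metric name are illustrative assumptions).

    from xorbits._mars.metrics import Metrics, init_metrics

    init_metrics("prometheus", {"port": 9090})   # starts the prometheus exposition server
    latency = Metrics.gauge("sketch.latency.seconds", "Request latency", ("service",))
    # Exported as sketch:latency:seconds ('.' replaced by ':') with extra host/pid labels.
    latency.record(0.42, {"service": "web"})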
diff --git a/python/xorbits/_mars/metrics/backends/prometheus/tests/__init__.py b/python/xorbits/_mars/metrics/backends/prometheus/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/prometheus/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/prometheus/tests/test_prometheus_metric.py b/python/xorbits/_mars/metrics/backends/prometheus/tests/test_prometheus_metric.py
new file mode 100644
index 000000000..9f903a6d6
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/prometheus/tests/test_prometheus_metric.py
@@ -0,0 +1,111 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+import pytest
+
+try:
+    import requests
+except ImportError:
+    requests = None
+
+try:
+    from prometheus_client import start_http_server
+except ImportError:
+    start_http_server = None
+
+from .....utils import get_next_port
+from ..prometheus_metric import Counter, Gauge, Histogram, Meter
+
+_PROMETHEUS_CLIENT_PORT = get_next_port()
+
+
+@pytest.fixture(scope="module")
+def start_prometheus_http_server():
+    if start_http_server:
+        start_http_server(_PROMETHEUS_CLIENT_PORT)
+
+
+def verify_metric(name, value, delta=1e-6):
+    if start_http_server is None or requests is None:
+        return
+    resp = requests.get("http://127.0.0.1:{}".format(_PROMETHEUS_CLIENT_PORT)).text
+    assert name in resp
+    lines = resp.splitlines()
+    for line in lines:
+        if line.startswith(name):
+            items = line.split(" ")
+            assert len(items) == 2
+            assert pytest.approx(float(items[1]), abs=delta) == value
+
+
+def test_counter(start_prometheus_http_server):
+    c = Counter("test_counter", "A test counter", ("service", "tenant"))
+    assert c.name == "test_counter"
+    assert c.description == "A test counter"
+    assert set(["host", "pid"]).issubset(set(c.tag_keys))
+    assert set(["service", "tenant"]).issubset(set(c.tag_keys))
+    assert c.type == "Counter"
+    c.record(1, {"service": "mars", "tenant": "test"})
+    verify_metric("test_counter", 1.0)
+    c.record(2, {"service": "mars", "tenant": "test"})
+    verify_metric("test_counter", 3.0)
+
+
+def test_gauge(start_prometheus_http_server):
+    g = Gauge("test_gauge", "A test gauge")
+    assert g.name == "test_gauge"
+    assert g.description == "A test gauge"
+    assert set(["host", "pid"]).issubset(set(g.tag_keys))
+    assert g.type == "Gauge"
+    g.record(0.1)
+    verify_metric("test_gauge", 0.1)
+    g.record(1.1)
+    verify_metric("test_gauge", 1.1)
+
+
+def test_meter(start_prometheus_http_server):
+    m = Meter("test_meter")
+    assert m.name == "test_meter"
+    assert m.description == ""
+    assert set(["host", "pid"]).issubset(set(m.tag_keys))
+    assert m.type == "Meter"
+    num = 3
+    while num > 0:
+        m.record(1)
+        time.sleep(1)
+        num -= 1
+    verify_metric("test_meter", 1, 0.05)
+
+
+def test_histogram(start_prometheus_http_server):
+    h = Histogram("test_histogram")
+    assert h.name == "test_histogram"
+    assert h.description == ""
+    assert set(["host", "pid"]).issubset(set(h.tag_keys))
+    assert h.type == "Histogram"
+    num = 3
+    while num > 0:
+        h.record(1)
+        h.record(2)
+        time.sleep(1)
+        num -= 1
+    verify_metric("test_histogram", 1.5, 0.15)
+    num = 3
+    while num > 0:
+        h.record(3)
+        time.sleep(1)
+        num -= 1
+    verify_metric("test_histogram", 3, 0.1)
diff --git a/python/xorbits/_mars/metrics/backends/ray/__init__.py b/python/xorbits/_mars/metrics/backends/ray/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/ray/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/ray/ray_metric.py b/python/xorbits/_mars/metrics/backends/ray/ray_metric.py
new file mode 100644
index 000000000..51ee5f775
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/ray/ray_metric.py
@@ -0,0 +1,76 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Optional
+
+from ....utils import lazy_import, lazy_import_on_load
+from ..metric import (
+    AbstractCounter,
+    AbstractGauge,
+    AbstractHistogram,
+    AbstractMeter,
+    AbstractMetric,
+)
+
+ray_metrics = lazy_import("ray.util.metrics", rename="ray_metrics")
+
+_ray_gauge_set_available = None
+
+
+@lazy_import_on_load(ray_metrics)
+def _reload_ray_gauge_set_available():
+    """
+    Note: the Gauge `record` method is deprecated as of ray 1.3.0, so keep
+    this compatible with both old and new ray versions.
+    """
+    global _ray_gauge_set_available
+
+    if _ray_gauge_set_available is not None:
+        return _ray_gauge_set_available
+    _ray_gauge_set_available = (
+        True if ray_metrics and hasattr(ray_metrics.Gauge, "set") else False
+    )
+    return _ray_gauge_set_available
+
+
+class RayMetricMixin(AbstractMetric):
+    def _init(self):
+        _reload_ray_gauge_set_available()
+
+        if ray_metrics is not None:  # pragma: no branch
+            self._metric = ray_metrics.Gauge(
+                self._name, self._description, self._tag_keys
+            )
+
+    def _record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        if _ray_gauge_set_available:
+            self._metric.set(value, tags)
+        elif ray_metrics is not None:  # pragma: no branch
+            self._metric.record(value, tags)
+
+
+class Counter(RayMetricMixin, AbstractCounter):
+    pass
+
+
+class Gauge(RayMetricMixin, AbstractGauge):
+    pass
+
+
+class Meter(RayMetricMixin, AbstractMeter):
+    pass
+
+
+class Histogram(RayMetricMixin, AbstractHistogram):
+    pass
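A sketch of the ray backend above (requires ray; the metric name is an illustrative
assumption). Every metric type is reported through ray.util.metrics.Gauge, using
Gauge.set() when available and falling back to the deprecated record().

    import ray

    from xorbits._mars.metrics import Metrics, init_metrics

    ray.init()
    init_metrics("ray")
    qps = Metrics.meter("sketch_qps", "Events per second", ("service",))
    qps.record(1, {"service": "web"})   # emitted once the meter's count/interval threshold is reached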
diff --git a/python/xorbits/_mars/metrics/backends/ray/tests/__init__.py b/python/xorbits/_mars/metrics/backends/ray/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/ray/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/ray/tests/test_ray_metric.py b/python/xorbits/_mars/metrics/backends/ray/tests/test_ray_metric.py
new file mode 100644
index 000000000..75b4747bb
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/ray/tests/test_ray_metric.py
@@ -0,0 +1,62 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .....tests.core import require_ray
+from ..ray_metric import Counter, Gauge, Histogram, Meter
+
+
+@require_ray
+def test_record():
+    c = Counter("test_counter")
+    assert c.record(1) is None
+
+
+@require_ray
+def test_counter():
+    c = Counter("test_counter", "A test counter", ("service", "tenant"))
+    assert c.name == "test_counter"
+    assert c.description == "A test counter"
+    assert c.tag_keys == ("service", "tenant")
+    assert c.type == "Counter"
+    assert c.record(1, {"service": "mars", "tenant": "test"}) is None
+
+
+@require_ray
+def test_gauge():
+    g = Gauge("test_gauge", "A test gauge")
+    assert g.name == "test_gauge"
+    assert g.description == "A test gauge"
+    assert g.tag_keys == ()
+    assert g.type == "Gauge"
+    assert g.record(1) is None
+
+
+@require_ray
+def test_meter():
+    m = Meter("test_meter")
+    assert m.name == "test_meter"
+    assert m.description == ""
+    assert m.tag_keys == ()
+    assert m.type == "Meter"
+    assert m.record(1) is None
+
+
+@require_ray
+def test_histogram():
+    h = Histogram("test_histogram")
+    assert h.name == "test_histogram"
+    assert h.description == ""
+    assert h.tag_keys == ()
+    assert h.type == "Histogram"
+    assert h.record(1) is None
diff --git a/python/xorbits/_mars/metrics/backends/tests/__init__.py b/python/xorbits/_mars/metrics/backends/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/tests/test_metric.py b/python/xorbits/_mars/metrics/backends/tests/test_metric.py
new file mode 100644
index 000000000..96ab4c2e8
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/tests/test_metric.py
@@ -0,0 +1,109 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from ..metric import (
+    AbstractCounter,
+    AbstractGauge,
+    AbstractHistogram,
+    AbstractMeter,
+    AbstractMetric,
+)
+
+
+def test_illegal_arguments():
+    class DummyMetric(AbstractMetric):
+        pass
+
+    DummyMetric.__abstractmethods__ = set()
+    with pytest.raises(AssertionError):
+        DummyMetric(1)
+
+    with pytest.raises(AssertionError):
+        DummyMetric("dummy_metric", 1)
+
+    with pytest.raises(AssertionError):
+        DummyMetric("dummy_metric", "A test metric", "service")
+
+    with pytest.raises(AssertionError):
+        DummyMetric("dummy_metric", "A test metric", ("service", 1))
+
+
+def test_dummy_metric():
+    class DummyMetric(AbstractMetric):
+        pass
+
+    DummyMetric.__abstractmethods__ = set()
+    m = DummyMetric("dummy_metric", "A test metric", ("service", "tenant"))
+    assert isinstance(m, AbstractMetric)
+    assert m.name == "dummy_metric"
+    assert m.description == "A test metric"
+    assert m.tag_keys == ("service", "tenant")
+    assert m.type is None
+    assert m._init() is None
+    assert m.record() is None
+    assert m._record() is None
+
+
+def test_counter():
+    class DummyCounter(AbstractCounter):
+        pass
+
+    DummyCounter.__abstractmethods__ = set()
+    c = DummyCounter("test_counter", "A test counter", ("service", "tenant"))
+    assert c.name == "test_counter"
+    assert c.description == "A test counter"
+    assert c.tag_keys == ("service", "tenant")
+    assert c.type == "Counter"
+    assert c.record(1, {"service": "mars", "tenant": "test"}) is None
+
+
+def test_gauge():
+    class DummyGauge(AbstractGauge):
+        pass
+
+    DummyGauge.__abstractmethods__ = set()
+    g = DummyGauge("test_gauge", "A test gauge")
+    assert g.name == "test_gauge"
+    assert g.description == "A test gauge"
+    assert g.tag_keys == ()
+    assert g.type == "Gauge"
+    assert g.record(1) is None
+
+
+def test_meter():
+    class DummyMeter(AbstractMeter):
+        pass
+
+    DummyMeter.__abstractmethods__ = set()
+    m = DummyMeter("test_meter")
+    assert m.name == "test_meter"
+    assert m.description == ""
+    assert m.tag_keys == ()
+    assert m.type == "Meter"
+    assert m.record(1) is None
+
+
+def test_histogram():
+    class DummyHistogram(AbstractHistogram):
+        pass
+
+    DummyHistogram.__abstractmethods__ = set()
+    h = DummyHistogram("test_histogram")
+    assert h.name == "test_histogram"
+    assert h.description == ""
+    assert h.tag_keys == ()
+    assert h.type == "Histogram"
+    assert h.record(1) is None
diff --git a/python/xorbits/_mars/metrics/tests/__init__.py b/python/xorbits/_mars/metrics/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/tests/test_metric_api.py b/python/xorbits/_mars/metrics/tests/test_metric_api.py
new file mode 100644
index 000000000..138c6c01e
--- /dev/null
+++ b/python/xorbits/_mars/metrics/tests/test_metric_api.py
@@ -0,0 +1,168 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from random import random
+
+import pytest
+
+from .. import api
+from ..api import (
+    Metrics,
+    Percentile,
+    _percentile_builder,
+    init_metrics,
+    record_time_cost_percentile,
+    shutdown_metrics,
+)
+
+
+@pytest.fixture
+def init():
+    init_metrics()
+
+
+def test_init_metrics():
+    init_metrics()
+    assert api._metric_backend == "console"
+    shutdown_metrics()
+    init_metrics("console")
+    assert api._metric_backend == "console"
+    shutdown_metrics()
+    init_metrics(backend="console")
+    assert api._metric_backend == "console"
+    shutdown_metrics()
+    init_metrics("prometheus")
+    assert api._metric_backend == "prometheus"
+    shutdown_metrics()
+    init_metrics(backend="prometheus", config={"port": 0})
+    assert api._metric_backend == "prometheus"
+    shutdown_metrics()
+    init_metrics("ray")
+    assert api._metric_backend == "ray"
+    shutdown_metrics()
+    with pytest.raises(NotImplementedError):
+        init_metrics("not_exist")
+
+
+@pytest.mark.parametrize("init_firstly", [True, False])
+def test_counter(init_firstly):
+    if init_firstly:
+        init_metrics()
+    c = Metrics.counter("test_counter", "A test counter", ("service", "tenant"))
+    assert c.name == "test_counter"
+    assert c.description == "A test counter"
+    assert c.tag_keys == ("service", "tenant")
+    assert c.type == "Counter"
+    if not init_firstly:
+        init_metrics()
+    c.record(1, {"service": "mars", "tenant": "test"})
+    c.record(2, {"service": "mars", "tenant": "test"})
+    assert c.value == 3
+
+
+@pytest.mark.parametrize("init_firstly", [True, False])
+def test_gauge(init_firstly):
+    if init_firstly:
+        init_metrics()
+    g = Metrics.gauge("test_gauge", "A test gauge")
+    assert g.name == "test_gauge"
+    assert g.description == "A test gauge"
+    assert g.tag_keys == ()
+    assert g.type == "Gauge"
+    if not init_firstly:
+        init_metrics()
+    g.record(1)
+    assert g.value == 1
+    g.record(2)
+    assert g.value == 2
+
+
+@pytest.mark.parametrize("init_firstly", [True, False])
+def test_meter(init_firstly):
+    if init_firstly:
+        init_metrics()
+    m = Metrics.meter("test_meter")
+    assert m.name == "test_meter"
+    assert m.description == ""
+    assert m.tag_keys == ()
+    assert m.type == "Meter"
+    if not init_firstly:
+        init_metrics()
+    m.record(1)
+    assert m.value == 0
+    m.record(2001)
+    assert m.value > 0
+
+
+@pytest.mark.parametrize("init_firstly", [True, False])
+def test_histogram(init_firstly):
+    if init_firstly:
+        init_metrics()
+    h = Metrics.histogram("test_histogram")
+    assert h.name == "test_histogram"
+    assert h.description == ""
+    assert h.tag_keys == ()
+    assert h.type == "Histogram"
+    if not init_firstly:
+        init_metrics()
+    h.record(1)
+    assert h.value == 0
+    for i in range(2002):
+        h.record(1)
+    assert h.value > 0
+
+
+def test_percentile_report():
+    def gen_callback(data):
+        def callback(value):
+            data.append(value)
+
+        return callback
+
+    data90 = []
+    data95 = []
+    data99 = []
+
+    all_data = []
+    percentile_args = [
+        (Percentile.PercentileType.P90, gen_callback(data90), 100),
+        (Percentile.PercentileType.P95, gen_callback(data95), 100),
+        (Percentile.PercentileType.P99, gen_callback(data99), 100),
+    ]
+    percentile_list = [
+        _percentile_builder[percentile_type](callback, window)
+        for percentile_type, callback, window in percentile_args
+    ]
+    for _ in range(199):
+        data = random()
+        all_data.append(data)
+        for percentile in percentile_list:
+            percentile.record_data(data)
+    sub_data = sorted(all_data[:100])
+    print(sub_data[:10])
+    assert len(data90) == 1 and sub_data[10 - 1] == data90[0]
+    assert len(data95) == 1 and sub_data[5 - 1] == data95[0]
+    assert len(data99) == 1 and sub_data[1 - 1] == data99[0]
+
+
+def test_invalid_percentile_report():
+    with pytest.raises(ValueError):
+        Percentile(-1, 10, lambda x: ...)
+
+    with pytest.raises(ValueError):
+        Percentile(1, -1, lambda x: ...)
+
+    with pytest.raises(ValueError):
+        with record_time_cost_percentile([]):
+            raise ValueError
diff --git a/python/xorbits/_mars/opcodes.py b/python/xorbits/_mars/opcodes.py
new file mode 100644
index 000000000..5d42caaf6
--- /dev/null
+++ b/python/xorbits/_mars/opcodes.py
@@ -0,0 +1,584 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+NULL = 0
+
+# creation
+# tensor
+SCALAR = 1
+TENSOR_DATA_SOURCE = 2
+TENSOR_ONES = 3
+TENSOR_ONES_LIKE = 4
+TENSOR_ZEROS = 5
+TENSOR_ZEROS_LIKE = 6
+TENSOR_EMPTY = 7
+TENSOR_EMPTY_LIKE = 8
+TENSOR_FULL = 9
+TENSOR_FULL_LIKE = 25
+TENSOR_ARANGE = 10
+TENSOR_INDICES = 11
+TENSOR_DIAG = 12
+TENSOR_EYE = 13
+TENSOR_LINSPACE = 14
+TENSOR_TRIU = 15
+TENSOR_TRIL = 16
+# external storage
+TENSOR_FROM_TILEDB = 18
+TENSOR_STORE_TILEDB = 19
+TENSOR_STORE_TILEDB_CONSOLIDATE = 20
+TENSOR_FROM_DATAFRAME = 22
+TENSOR_FROM_HDF5 = 27
+TENSOR_STORE_HDF5 = 28
+TENSOR_FROM_ZARR = 29
+TENSOR_STORE_ZARR = 32
+
+# dataframe
+DATAFRAME_DATA_SOURCE = 17
+DATAFRAME_FROM_TENSOR = 21
+DATAFRAME_FROM_RECORDS = 24
+# series
+SERIES_DATA_SOURCE = 23
+SERIES_FROM_TENSOR = 26
+SERIES_FROM_INDEX = 39
+# index
+INDEX_DATA_SOURCE = 33
+DATE_RANGE = 34
+TIMEDELTA_RANGE = 35
+CHECK_MONOTONIC = 38
+# misc
+MEMORY_USAGE = 36
+REBALANCE = 37
+
+# GPU
+TO_GPU = 30
+TO_CPU = 31
+
+# random
+RAND_RAND = 41
+RAND_RANDN = 42
+RAND_RANDINT = 43
+RAND_RANDOM_INTEGERS = 44
+RAND_RANDOM_SAMPLE = 45
+RAND_RANDOM = 46
+RAND_RANF = 47
+RAND_SAMPLE = 48
+RAND_BYTES = 49
+
+# random distribution
+RAND_BETA = 50
+RAND_BINOMIAL = 51
+RAND_CHISQUARE = 52
+RAND_CHOICE = 53
+RAND_DIRICHLET = 54
+RAND_EXPONENTIAL = 55
+RAND_F = 56
+RAND_GAMMA = 57
+RAND_GEOMETRIC = 58
+RAND_GUMBEL = 59
+RAND_HYPERGEOMETRIC = 60
+RAND_LAPLACE = 61
+RAND_LOGISTIC = 62
+RAND_LOGNORMAL = 63
+RAND_LOGSERIES = 64
+RAND_MULTINOMIAL = 65
+RAND_MULTIVARIATE_NORMAL = 66
+RAND_NEGATIVE_BINOMIAL = 67
+RAND_NONCENTRAL_CHISQURE = 68
+RAND_NONCENTRAL_F = 69
+RAND_NORMAL = 70
+RAND_PARETO = 71
+RAND_PERMUTATION = 72
+RAND_POSSION = 73
+RAND_POWER = 74
+RAND_RAYLEIGH = 75
+RAND_SHUFFLE = 76
+RAND_STANDARD_CAUCHY = 77
+RAND_STANDARD_EXPONENTIAL = 78
+RAND_STANDARD_GAMMMA = 79
+RAND_STANDARD_NORMAL = 80
+RAND_STANDARD_T = 81
+RAND_TOMAXINT = 82
+RAND_TRIANGULAR = 83
+RAND_UNIFORM = 84
+RAND_VONMISES = 85
+RAND_WALD = 86
+RAND_WEIBULL = 87
+RAND_ZIPF = 88
+PERMUTATION = 89
+UNIQUE = 90
+
+# ufunc
+ADD = 101
+SUB = 102
+MUL = 103
+DIV = 104
+TRUEDIV = 105
+FLOORDIV = 106
+POW = 107
+MOD = 108
+FMOD = 109
+LOGADDEXP = 110
+LOGADDEXP2 = 111
+NEGATIVE = 112
+POSITIVE = 113
+ABSOLUTE = 114
+FABS = 115
+ABS = 116
+RINT = 117
+SIGN = 118
+CONJ = 119
+EXP = 120
+EXP2 = 121
+LOG = 122
+LOG2 = 123
+LOG10 = 124
+EXPM1 = 125
+LOG1P = 126
+SQRT = 127
+SQUARE = 128
+CBRT = 129
+RECIPROCAL = 130
+EQ = 131
+NE = 132
+LT = 133
+LE = 134
+GT = 135
+GE = 136
+SIN = 137
+COS = 138
+TAN = 139
+ARCSIN = 140
+ARCCOS = 141
+ARCTAN = 142
+ARCTAN2 = 143
+HYPOT = 144
+SINH = 145
+COSH = 146
+TANH = 147
+ARCSINH = 148
+ARCCOSH = 149
+ARCTANH = 150
+DEG2RAD = 151
+RAD2DEG = 152
+BITAND = 153
+BITOR = 154
+BITXOR = 155
+INVERT = 156
+LSHIFT = 157
+RSHIFT = 158
+AND = 159
+OR = 160
+XOR = 161
+NOT = 162
+MAXIMUM = 163
+MINIMUM = 164
+AROUND = 165
+FLOAT_POWER = 166
+FMAX = 167
+FMIN = 168
+ISFINITE = 169
+ISINF = 170
+ISNAN = 171
+SIGNBIT = 172
+COPYSIGN = 173
+NEXTAFTER = 174
+SPACING = 175
+LDEXP = 176
+FREXP = 177
+MODF = 178
+FLOOR = 179
+CEIL = 180
+TRUNC = 181
+DEGREES = 182
+RADIANS = 183
+CLIP = 184
+ISREAL = 185
+ISCOMPLEX = 186
+REAL = 187
+IMAG = 188
+FIX = 189
+I0 = 190
+SINC = 191
+NAN_TO_NUM = 192
+ISCLOSE = 193
+DIVMOD = 194
+ANGLE = 195
+SET_REAL = 196
+SET_IMAG = 197
+
+# special
+SPECIAL = 200
+
+# spatial
+PDIST = 231
+CDIST = 232
+SQUAREFORM = 233
+
+# tree operand
+TREE_ADD = 251
+TREE_MULTIPLY = 252
+TREE_OR = 253
+
+# reduction
+CUMSUM = 301
+CUMPROD = 302
+PROD = 303
+SUM = 304
+MAX = 305
+MIN = 306
+ALL = 307
+ANY = 308
+MEAN = 309
+ARGMAX = 310
+ARGMIN = 311
+NANSUM = 312
+NANMAX = 313
+NANMIN = 314
+NANPROD = 315
+NANMEAN = 316
+NANARGMAX = 317
+NANARGMIN = 318
+COUNT_NONZERO = 319
+MOMENT = 320
+NANMOMENT = 321
+VAR = 322
+STD = 323
+NANVAR = 324
+NANSTD = 325
+NANCUMSUM = 326
+NANCUMPROD = 327
+COUNT = 343
+CUMMAX = 344
+CUMMIN = 345
+CUMCOUNT = 346
+CORR = 347
+REDUCTION_SIZE = 348
+CUSTOM_REDUCTION = 349
+SKEW = 350
+KURTOSIS = 351
+SEM = 352
+STR_CONCAT = 353
+MAD = 354
+
+# tensor operand
+RESHAPE = 401
+SLICE = 402
+INDEX = 403
+INDEXSETVALUE = 404
+CONCATENATE = 405
+RECHUNK = 406
+ASTYPE = 407
+TRANSPOSE = 408
+SWAPAXES = 409
+BROADCAST_TO = 410
+STACK = 411
+WHERE = 412
+CHOOSE = 413
+NONZERO = 414
+ARGWHERE = 415
+UNRAVEL_INDEX = 416
+RAVEL_MULTI_INDEX = 417
+ARRAY_SPLIT = 418
+SQUEEZE = 419
+DIGITIZE = 420
+REPEAT = 421
+COPYTO = 422
+ISIN = 423
+SEARCHSORTED = 428
+SORT = 429
+HISTOGRAM = 430
+HISTOGRAM_BIN_EDGES = 431
+PARTITION = 432
+QUANTILE = 440
+FILL_DIAGONAL = 441
+NORMALIZE = 442
+TOPK = 443
+TRAPZ = 444
+GET_SHAPE = 445
+BINCOUNT = 446
+# fancy indexing: the distribute phase is a shuffle operation that
+# distributes the fancy indexes to the chunks being indexed;
+# the concat phase then concatenates the indexed chunks back together
+# according to the original order of the fancy index
+FANCY_INDEX_DISTRIBUTE = 424
+FANCY_INDEX_CONCAT = 425
+
+# linear algebra
+TENSORDOT = 501
+DOT = 502
+MATMUL = 503
+CHOLESKY = 510
+QR = 511
+SVD = 512
+LU = 513
+SOLVE_TRIANGULAR = 520
+INV = 521
+NORM = 530
+
+# fft
+FFT = 601
+IFFT = 602
+FFT2 = 603
+IFFT2 = 604
+FFTN = 605
+IFFTN = 606
+RFFT = 607
+IRFFT = 608
+RFFT2 = 609
+IRFFT2 = 610
+RFFTN = 611
+IRFFTN = 612
+HFFT = 613
+IHFFT = 614
+FFTFREQ = 615
+FFTFREQ_CHUNK = 616
+RFFTFREQ = 617
+FFTSHIFT = 618
+IFFTSHIFT = 619
+
+# einsum
+EINSUM = 630
+
+# sparse creation
+SPARSE_MATRIX_DATA_SOURCE = 701
+DENSE_TO_SPARSE = 702
+SPARSE_TO_DENSE = 703
+
+# DataFrame
+MAP = 710
+DESCRIBE = 712
+FILL_NA = 713
+AGGREGATE = 714
+STRING_METHOD = 715
+DATETIME_METHOD = 716
+APPLY = 717
+TRANSFORM = 718
+CHECK_NA = 719
+DROP_NA = 720
+NUNIQUE = 721
+CUT = 722
+SHIFT = 723
+DIFF = 724
+VALUE_COUNTS = 725
+TO_DATETIME = 726
+DATAFRAME_DROP = 727
+DROP_DUPLICATES = 728
+MELT = 729
+RENAME = 731
+INSERT = 732
+MAP_CHUNK = 733
+CARTESIAN_CHUNK = 734
+EXPLODE = 735
+REPLACE = 736
+RENAME_AXIS = 737
+DATAFRAME_EVAL = 738
+DUPLICATED = 739
+DELETE = 740
+ALIGN = 741
+
+FUSE = 801
+
+# table like input for tensor
+TABLE_COO = 1003
+# store tensor as coo format
+STORE_COO = 1004
+
+# shuffle
+SHUFFLE_PROXY = 2001
+DATAFRAME_INDEX_ALIGN = 2004
+
+# indexing
+DATAFRAME_SET_INDEX = 2020
+DATAFRAME_SET_AXIS = 730
+DATAFRAME_ILOC_GETITEM = 2021
+DATAFRAME_ILOC_SETITEM = 2022
+DATAFRAME_LOC_GETITEM = 2023
+DATAFRAME_LOC_SETITEM = 2024
+
+# merge
+DATAFRAME_MERGE = 2010
+DATAFRAME_SHUFFLE_MERGE_ALIGN = 2011
+
+# bloom filter
+DATAFRAME_BLOOM_FILTER = 2014
+
+# append
+APPEND = 2015
+
+# reset index
+RESET_INDEX = 2028
+# reindex
+REINDEX = 2029
+
+# groupby
+GROUPBY = 2030
+GROUPBY_AGG = 2033
+GROUPBY_CONCAT = 2034
+GROUPBY_HEAD = 2035
+GROUPBY_SAMPLE_ILOC = 2036
+GROUPBY_SORT_REGULAR_SAMPLE = 2037
+GROUPBY_SORT_PIVOT = 2038
+GROUPBY_SORT_SHUFFLE = 2039
+
+# parallel sorting by regular sampling
+PSRS_SORT_REGULAR_SMAPLE = 2040
+PSRS_CONCAT_PIVOT = 2041
+PSRS_SHUFFLE = 2042
+PSRS_ALIGN = 2043
+# partition
+CALC_PARTITIONS_INFO = 2046
+PARTITION_MERGED = 2047
+
+# dataframe sort
+SORT_VALUES = 2050
+SORT_INDEX = 2051
+
+# window
+ROLLING_AGG = 2060
+EXPANDING_AGG = 2061
+EWM_AGG = 2062
+
+# store
+READ_CSV = 2100
+TO_CSV = 2101
+READ_PARQUET = 2103
+TO_PARQUET = 2104
+READ_SQL = 2105
+TO_SQL = 2108
+READ_RAYDATASET = 2109
+READ_MLDATASET = 2106
+
+TO_CSV_STAT = 2102
+
+# standardize range index
+STANDARDIZE_RANGE_INDEX = 2107
+
+# successors exclusive
+SUCCESSORS_EXCLUSIVE = 2002
+
+# read images
+IMREAD = 2110
+
+# machine learning
+
+# pairwise distances
+PAIRWISE_EUCLIDEAN_DISTANCES = 2200
+PAIRWISE_MANHATTAN_DISTANCES = 2201
+PAIRWISE_COSINE_DISTANCES = 2202
+PAIRWISE_HAVERSINE_DISTANCES = 2203
+PAIRWISE_DISTANCES_TOPK = 2204
+
+# nearest neighbors
+KD_TREE_TRAIN = 2230
+KD_TREE_QUERY = 2231
+BALL_TREE_TRAIN = 2232
+BALL_TREE_QUERY = 2233
+FAISS_BUILD_INDEX = 2234
+FAISS_TRAIN_SAMPLED_INDEX = 2235
+FAISS_QUERY = 2236
+PROXIMA_SIMPLE_BUILDER = 2238
+PROXIMA_SIMPLE_SEARCHER = 2239
+KNEIGHBORS_GRAPH = 2237
+
+# cluster
+KMEANS_PLUS_PLUS_INIT = 2250
+KMEANS_SCALABLE_PLUS_PLUS_INIT = 2251
+KMEANS_ELKAN_INIT_BOUNDS = 2252
+KMEANS_ELKAN_UPDATE = 2253
+KMEANS_ELKAN_POSTPROCESS = 2254
+KMEANS_LLOYD_UPDATE = 2255
+KMEANS_LLOYD_POSTPROCESS = 2256
+KMEANS_INERTIA = 2257
+KMEANS_RELOCASTE_EMPTY_CLUSTERS = 2258
+
+# XGBoost
+XGBOOST_TRAIN = 3001
+XGBOOST_PREDICT = 3002
+TO_DMATRIX = 3003
+START_TRACKER = 3004
+
+# LightGBM
+LGBM_TRAIN = 3020
+LGBM_PREDICT = 3021
+LGBM_ALIGN = 3022
+
+# TensorFlow
+RUN_TENSORFLOW = 3010
+
+# PyTorch
+RUN_PYTORCH = 3011
+
+# statsmodels
+STATSMODELS_TRAIN = 3012
+STATSMODELS_PREDICT = 3013
+
+# learn
+# checks
+CHECK_NON_NEGATIVE = 3300
+# classifier check targets
+CHECK_TARGETS = 3301
+ASSERT_ALL_FINITE = 3302
+# multilabel
+IS_MULTILABEL = 3303
+# get type
+TYPE_OF_TARGET = 3304
+# classification
+ACCURACY_SCORE = 3305
+# port detection
+COLLECT_PORTS = 3306
+# unique labels
+UNIQUE_LABELS = 3307
+# preprocessing
+LABEL_BINARIZE = 3308
+# ensemble: blockwise
+BLOCKWISE_ENSEMBLE_FIT = 3309
+BLOCKWISE_ENSEMBLE_PREDICT = 3310
+# ensemble: bagging
+BAGGING_SHUFFLE_SAMPLE = 3400
+BAGGING_SHUFFLE_REINDEX = 3401
+BAGGING_FIT = 3402
+BAGGING_PREDICTION = 3403
+
+# Remote Functions and class
+REMOTE_FUNCATION = 5001
+RUN_SCRIPT = 5002
+
+# vineyard
+TENSOR_FROM_VINEYARD_CHUNK = 4000
+TENSOR_FROM_VINEYARD_META = 4001
+TENSOR_STORE_VINEYARD_CHUNK = 4002
+TENSOR_STORE_VINEYARD_META = 4003
+DATAFRAME_FROM_VINEYARD_CHUNK = 4004
+DATAFRAME_FROM_VINEYARD_META = 4005
+DATAFRAME_STORE_VINEYARD_CHUNK = 4006
+DATAFRAME_STORE_VINEYARD_META = 4007
+
+CHOLESKY_FUSE = 999988
+
+# fetches
+FETCH_SHUFFLE = 999998
+FETCH = 999999
+
+
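+# build a reverse mapping at import time to make sure that no two opcode
+# names share the same value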
+_val_to_dict = dict()
+for _var_name, _var_val in globals().copy().items():
+    if not isinstance(_var_val, int):
+        continue
+    if _var_val in _val_to_dict:  # pragma: no cover
+        raise ImportError(
+            f"Cannot import opcode: {_var_name} and "
+            f"{_val_to_dict[_var_val]} collides with value {_var_val}"
+        )
+    _val_to_dict[_var_val] = _var_name
+del _val_to_dict, _var_name, _var_val
diff --git a/python/xorbits/_mars/optimization/__init__.py b/python/xorbits/_mars/optimization/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/logical/__init__.py b/python/xorbits/_mars/optimization/logical/__init__.py
new file mode 100644
index 000000000..852b4fe10
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import OptimizationRecords
diff --git a/python/xorbits/_mars/optimization/logical/chunk/__init__.py b/python/xorbits/_mars/optimization/logical/chunk/__init__.py
new file mode 100644
index 000000000..30b40aa92
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .column_pruning import ChunkGetitemPruneDataSource
+from .core import optimize
+from .head import ChunkHeadPushDown
diff --git a/python/xorbits/_mars/optimization/logical/chunk/column_pruning.py b/python/xorbits/_mars/optimization/logical/chunk/column_pruning.py
new file mode 100644
index 000000000..bb6f7b6c9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/column_pruning.py
@@ -0,0 +1,24 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....dataframe.indexing.getitem import DataFrameIndex
+from ..common.column_pruning import GetitemPruneDataSource
+from .core import register_operand_based_optimization_rule
+
+
+@register_operand_based_optimization_rule([DataFrameIndex])
+class ChunkGetitemPruneDataSource(GetitemPruneDataSource):
+    """
+    Prune data source via getitem.
+    """
diff --git a/python/xorbits/_mars/optimization/logical/chunk/core.py b/python/xorbits/_mars/optimization/logical/chunk/core.py
new file mode 100644
index 000000000..702ce7905
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/core.py
@@ -0,0 +1,39 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Type
+
+from ....core import ChunkGraph
+from ....typing import OperandType
+from ..core import OperandBasedOptimizationRule, OptimizationRecords, Optimizer
+
+
+class ChunkOptimizer(Optimizer):
+    """
+    Chunk Optimizer
+    """
+
+
+def register_operand_based_optimization_rule(op_types: List[Type[OperandType]]):
+    def wrap(rule_type: Type[OperandBasedOptimizationRule]):
+        for op_type in op_types:
+            rule_type.register_operand(op_type)
+        ChunkOptimizer.register_rule(rule_type)
+        # return the decorated rule class so its name stays bound to the class
+        return rule_type
+
+    return wrap
+
+
+def optimize(chunk_graph: ChunkGraph) -> OptimizationRecords:
+    return ChunkOptimizer.optimize(chunk_graph)
diff --git a/python/xorbits/_mars/optimization/logical/chunk/head.py b/python/xorbits/_mars/optimization/logical/chunk/head.py
new file mode 100644
index 000000000..234aebf30
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/head.py
@@ -0,0 +1,24 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....dataframe.indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem
+from ..common.head import HeadPushDown
+from .core import register_operand_based_optimization_rule
+
+
+@register_operand_based_optimization_rule([DataFrameIlocGetItem, SeriesIlocGetItem])
+class ChunkHeadPushDown(HeadPushDown):
+    """
+    Head push down.
+    """
diff --git a/python/xorbits/_mars/optimization/logical/chunk/tests/__init__.py b/python/xorbits/_mars/optimization/logical/chunk/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/logical/chunk/tests/test_column_pruning.py b/python/xorbits/_mars/optimization/logical/chunk/tests/test_column_pruning.py
new file mode 100644
index 000000000..03432910a
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/tests/test_column_pruning.py
@@ -0,0 +1,70 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+
+import pandas as pd
+import pytest
+
+from ..... import dataframe as md
+from .....core import (
+    ChunkGraphBuilder,
+    TileableGraph,
+    TileableGraphBuilder,
+    TileContext,
+    enter_mode,
+)
+from .. import optimize
+
+
+@pytest.fixture(scope="module")
+def gen_data1():
+    with tempfile.TemporaryDirectory() as tempdir:
+        df = pd.DataFrame(
+            {
+                "a": [3, 4, 5, 3, 5, 4, 1, 2, 3],
+                "b": [1, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c": list("aabaaddce"),
+                "d": list("abaaaddce"),
+            }
+        )
+        yield df, tempdir
+
+
+@enter_mode(build=True)
+def test_groupby_read_csv(gen_data1):
+    pdf, tempdir = gen_data1
+    file_path = os.path.join(tempdir, "test.csv")
+    pdf.to_csv(file_path)
+
+    df1 = md.read_csv(file_path)
+    df2 = df1[["a", "b"]]
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    context = TileContext()
+    chunk_graph_builder = ChunkGraphBuilder(
+        graph, fuse_enabled=False, tile_context=context
+    )
+    chunk_graph = next(chunk_graph_builder.build())
+    chunk1 = context[df1.data].chunks[0].data
+    chunk2 = context[df2.data].chunks[0].data
+    records = optimize(chunk_graph)
+    opt_chunk1 = records.get_optimization_result(chunk1)
+    assert opt_chunk1 is None
+    opt_chunk2 = records.get_optimization_result(chunk2)
+    assert opt_chunk2 is not None
+    assert opt_chunk2.op.usecols == ["a", "b"]
+    # the original chunk should not be modified
+    assert chunk2.inputs[0] is chunk1
diff --git a/python/xorbits/_mars/optimization/logical/chunk/tests/test_head.py b/python/xorbits/_mars/optimization/logical/chunk/tests/test_head.py
new file mode 100644
index 000000000..945644c70
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/tests/test_head.py
@@ -0,0 +1,68 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+
+import pandas as pd
+import pytest
+
+from ..... import dataframe as md
+from .....core import (
+    ChunkGraphBuilder,
+    TileableGraph,
+    TileableGraphBuilder,
+    TileContext,
+    enter_mode,
+)
+from .. import optimize
+
+
+@pytest.fixture(scope="module")
+def gen_data1():
+    with tempfile.TemporaryDirectory() as tempdir:
+        df = pd.DataFrame(
+            {
+                "a": [3, 4, 5, 3, 5, 4, 1, 2, 3],
+                "b": [1, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c": list("aabaaddce"),
+                "d": list("abaaaddce"),
+            }
+        )
+        yield df, tempdir
+
+
+@enter_mode(build=True)
+def test_read_csv_head(gen_data1):
+    pdf, tempdir = gen_data1
+    file_path = os.path.join(tempdir, "test.csv")
+    pdf.to_csv(file_path)
+
+    df1 = md.read_csv(file_path)
+    df2 = df1.head(5)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    context = TileContext()
+    chunk_graph_builder = ChunkGraphBuilder(
+        graph, fuse_enabled=False, tile_context=context
+    )
+    chunk_graph = next(chunk_graph_builder.build())
+    chunk1 = context[df1.data].chunks[0].data
+    chunk2 = context[df2.data].chunks[0].data
+    records = optimize(chunk_graph)
+    assert records.get_optimization_result(chunk1) is None
+    opt_chunk2 = records.get_optimization_result(chunk2)
+    assert opt_chunk2.op.nrows == 5
+    assert len(chunk_graph) == 1
+    assert opt_chunk2 in chunk_graph.results
diff --git a/python/xorbits/_mars/optimization/logical/common/__init__.py b/python/xorbits/_mars/optimization/logical/common/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/common/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/logical/common/column_pruning.py b/python/xorbits/_mars/optimization/logical/common/column_pruning.py
new file mode 100644
index 000000000..a05709a21
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/common/column_pruning.py
@@ -0,0 +1,239 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABCMeta, abstractmethod
+from typing import Any, List
+
+from ....core import CHUNK_TYPE, OperandType, TileableType
+from ....dataframe.datasource.core import ColumnPruneSupportedDataSourceMixin
+from ....dataframe.utils import parse_index
+from ....utils import implements
+from ..core import (
+    OperandBasedOptimizationRule,
+    OptimizationRecord,
+    OptimizationRecordType,
+)
+
+
+class PruneDataSource(OperandBasedOptimizationRule, metaclass=ABCMeta):
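+    """
+    Base rule that prunes the columns read by a data source when every
+    successor of that source only needs a subset of its columns.
+    """
+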
+    def _all_successor_prune_pushdown(self, successors: List[TileableType]):
+        for succ in successors:
+            prune_rule_types = [
+                rule_type
+                for rule_type in self._rule_type_to_op_types
+                if issubclass(rule_type, PruneDataSource)
+                and isinstance(succ.op, tuple(self._rule_type_to_op_types[rule_type]))
+            ]
+            if not prune_rule_types:
+                return False
+
+            for rule_type in prune_rule_types:
+                rule = self._cached_rule(rule_type)
+                if not rule._need_prune(succ.op):
+                    return False
+        return True
+
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: OperandType) -> bool:
+        node = op.outputs[0]
+        input_node = self._graph.predecessors(node)[0]
+        successors = self._graph.successors(input_node)
+        return self._all_successor_prune_pushdown(successors)
+
+    @abstractmethod
+    def _need_prune(self, op: OperandType) -> bool:
+        """
+        Check whether the data source can be pruned for this operand.
+
+        Returns
+        -------
+        need_prune : bool
+        """
+
+    @abstractmethod
+    def _get_selected_columns(self, op: OperandType) -> List[Any]:
+        """
+        Get selected columns to prune data source.
+
+        Parameters
+        ----------
+        op : OperandType
+            Operand.
+
+        Returns
+        -------
+        columns : list
+            Columns selected.
+        """
+
+    def _merge_selected_columns(self, selected_columns: List[Any], op: OperandType):
+        input_node = self._graph.predecessors(op.outputs[0])[0]
+        original_node = self._records.get_original_entity(input_node)
+        if original_node is None:
+            # not pruned before
+            original_all_columns = input_node.dtypes.index.tolist()
+            if set(selected_columns) != set(original_all_columns):
+                # only a subset of the columns is selected, so pruning applies
+                return [c for c in original_all_columns if c in selected_columns]
+            else:
+                return []
+        else:
+            # pruned before
+            original_all_columns = original_node.dtypes.index.tolist()
+            original_pruned_columns = input_node.op.get_columns()
+            pruned_columns_set = set(selected_columns) | set(original_pruned_columns)
+            # the data source has already been pruned and that cannot be
+            # reverted, so return the merged pruned columns even if the new
+            # selection would not prune anything further
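+            # e.g. if the source originally had columns ["a", "b", "c", "d"],
+            # was previously pruned to ["a", "b", "c"], and the new selection
+            # is ["a", "b"], the merged result keeps ["a", "b", "c"]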
+            return [c for c in original_all_columns if c in pruned_columns_set]
+
+    @implements(OperandBasedOptimizationRule.apply_to_operand)
+    def apply_to_operand(self, op: OperandType):
+        node = op.outputs[0]
+        data_source_node = self._graph.predecessors(node)[0]
+
+        if (
+            isinstance(node, CHUNK_TYPE)
+            and self._graph.count_successors(data_source_node) == 1
+        ):
+            # merge into data source only for chunk
+            data_source_params = node.params.copy()
+            data_source_params.update(data_source_node.extra_params)
+            data_source_op = data_source_node.op.copy()
+            data_source_op._key = data_source_node.op.key
+            data_source_op._output_types = op.output_types
+            if node.ndim == 1:
+                data_source_op.set_pruned_columns(node.name, keep_order=True)
+            else:
+                data_source_op.set_pruned_columns(
+                    node.dtypes.index.tolist(), keep_order=True
+                )
+            new_entity = (
+                data_source_op.new_tileable
+                if not isinstance(node, CHUNK_TYPE)
+                else data_source_op.new_chunk
+            )
+            new_data_source_node = new_entity(
+                data_source_node.inputs, kws=[data_source_params]
+            ).data
+            new_data_source_node._key = node.key
+            new_data_source_node._id = node.id
+            # just remove the input data
+            self._graph.add_node(new_data_source_node)
+            for succ in self._graph.successors(node):
+                self._graph.add_edge(new_data_source_node, succ)
+            self._graph.remove_node(data_source_node)
+            self._graph.remove_node(node)
+
+            # mark optimization record
+            # the input node is removed
+            self._records.append_record(
+                OptimizationRecord(
+                    data_source_node, None, OptimizationRecordType.delete
+                )
+            )
+            self._records.append_record(
+                OptimizationRecord(
+                    node, new_data_source_node, OptimizationRecordType.replace
+                )
+            )
+            new_outputs = [new_data_source_node]
+        else:
+            selected_columns: List[Any] = self._get_selected_columns(op)
+            original_node = self._records.get_original_entity(data_source_node)
+            if original_node is not None:
+                # pruned before
+                dtypes = original_node.dtypes
+            else:
+                dtypes = data_source_node.dtypes
+            data_source_params = data_source_node.params.copy()
+            data_source_params["shape"] = (
+                data_source_node.shape[0],
+                len(selected_columns),
+            )
+            data_source_params["dtypes"] = dtypes = dtypes[selected_columns]
+            data_source_params["columns_value"] = parse_index(
+                dtypes.index, store_data=True
+            )
+            data_source_params.update(data_source_node.extra_params)
+            data_source_node_op = data_source_node.op.copy()
+            data_source_node_op._key = data_source_node.op.key
+            data_source_node_op.set_pruned_columns(selected_columns, keep_order=True)
+            new_data_source_node = data_source_node_op.new_tileable(
+                data_source_node_op.inputs, kws=[data_source_params]
+            ).data
+
+            self._replace_node(data_source_node, new_data_source_node)
+            # mark optimization record
+            self._records.append_record(
+                OptimizationRecord(
+                    data_source_node,
+                    new_data_source_node,
+                    OptimizationRecordType.replace,
+                )
+            )
+
+            new_op = op.copy()
+            new_op._key = op.key
+            kws = []
+            for out in op.outputs:
+                params = out.params.copy()
+                params.update(out.extra_params)
+                kws.append(params)
+            new_entity = (
+                new_op.new_tileables
+                if not isinstance(node, CHUNK_TYPE)
+                else new_op.new_chunks
+            )
+            new_outputs = [t.data for t in new_entity([new_data_source_node], kws=kws)]
+
+            for out, new_out in zip(op.outputs, new_outputs):
+                new_out._id = out.id
+                new_out._key = out.key
+                self._replace_node(out, new_out)
+                # mark optimization record
+                self._records.append_record(
+                    OptimizationRecord(out, new_out, OptimizationRecordType.replace)
+                )
+
+        for out, new_out in zip(op.outputs, new_outputs):
+            # replace the output in the graph results if it is one of them
+            try:
+                i = self._graph.results.index(out)
+                self._graph.results[i] = new_out
+            except ValueError:
+                pass
+
+
+class GetitemPruneDataSource(PruneDataSource):
+    def _need_prune(self, op: OperandType) -> bool:
+        data_source_node = self._graph.predecessors(op.outputs[0])[0]
+        input_can_be_pruned = isinstance(
+            data_source_node.op, ColumnPruneSupportedDataSourceMixin
+        )
+        if (
+            input_can_be_pruned
+            and data_source_node not in self._graph.results
+            and op.col_names is not None
+        ):
+            selected_columns = self._get_selected_columns(op)
+            if not isinstance(op.outputs[0], CHUNK_TYPE) and not selected_columns:
+                # no columns selected, skip
+                return False
+            return True
+        return False
+
+    def _get_selected_columns(self, op: OperandType) -> List[str]:
+        columns = op.col_names if isinstance(op.col_names, list) else [op.col_names]
+        return self._merge_selected_columns(columns, op)
diff --git a/python/xorbits/_mars/optimization/logical/common/head.py b/python/xorbits/_mars/optimization/logical/common/head.py
new file mode 100644
index 000000000..7e020294a
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/common/head.py
@@ -0,0 +1,145 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List
+
+from ....core import CHUNK_TYPE, OperandType, TileableType
+from ....dataframe.base.value_counts import DataFrameValueCounts
+from ....dataframe.datasource.core import HeadOptimizedDataSource
+from ....dataframe.sort.core import DataFrameSortOperand
+from ....dataframe.utils import parse_index
+from ....utils import implements
+from ..core import (
+    OperandBasedOptimizationRule,
+    OptimizationRecord,
+    OptimizationRecordType,
+)
+
+
+class HeadPushDown(OperandBasedOptimizationRule):
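+    """
+    Push an iloc-based head down into its input (a head-optimized data
+    source, a sort, or a value_counts) so that only the first ``nrows``
+    rows need to be produced.
+    """
+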
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: OperandType) -> bool:
+        node = op.outputs[0]
+        input_node = self._graph.predecessors(node)[0]
+        successors = self._graph.successors(input_node)
+        return self._all_successor_head_pushdown(successors)
+
+    def _all_successor_head_pushdown(self, successors: List[TileableType]):
+        for succ in successors:
+            push_down_rule_types = [
+                rule_type
+                for rule_type in self._rule_type_to_op_types
+                if issubclass(rule_type, HeadPushDown)
+                and isinstance(succ.op, tuple(self._rule_type_to_op_types[rule_type]))
+            ]
+            if not push_down_rule_types:
+                return False
+
+            for rule_type in push_down_rule_types:
+                rule = self._cached_rule(rule_type)
+                if not rule._can_push_down(succ.op):
+                    return False
+        return True
+
+    def _can_push_down(self, op: OperandType) -> bool:
+        input_nodes = self._graph.predecessors(op.outputs[0])
+        accept_types = (
+            HeadOptimizedDataSource,
+            DataFrameSortOperand,
+            DataFrameValueCounts,
+        )
+        if (
+            len(input_nodes) == 1
+            and op.can_be_optimized()
+            and isinstance(input_nodes[0].op, accept_types)
+            and input_nodes[0] not in self._graph.results
+        ):
+            return True
+        return False
+
+    def apply_to_operand(self, op: OperandType):
+        node = op.outputs[0]
+        input_node = self._graph.predecessors(node)[0]
+        nrows = input_node.op.nrows or 0
+        head = op.indexes[0].stop
+
+        new_input_op = input_node.op.copy()
+        new_input_op._key = input_node.op.key
+        new_input_op.nrows = nrows = max(nrows, head)
+        new_input_params = input_node.params.copy()
+        new_input_params["shape"] = (nrows,) + input_node.shape[1:]
+        pandas_index = node.index_value.to_pandas()[:nrows]
+        new_input_params["index_value"] = parse_index(pandas_index, node)
+        new_input_params.update(input_node.extra_params)
+        new_entity = (
+            new_input_op.new_tileable
+            if not isinstance(node, CHUNK_TYPE)
+            else new_input_op.new_chunk
+        )
+        new_input_node = new_entity(input_node.inputs, kws=[new_input_params]).data
+
+        if (
+            new_input_node.op.nrows == head
+            and self._graph.count_successors(input_node) == 1
+        ):
+            new_input_node._key = node.key
+            new_input_node._id = node.id
+            # just remove the input data
+            self._graph.add_node(new_input_node)
+            for succ in self._graph.successors(node):
+                self._graph.add_edge(new_input_node, succ)
+            for pred in self._graph.predecessors(input_node):
+                self._graph.add_edge(pred, new_input_node)
+            self._graph.remove_node(input_node)
+            self._graph.remove_node(node)
+
+            # mark optimization record
+            # the input node is removed
+            self._records.append_record(
+                OptimizationRecord(input_node, None, OptimizationRecordType.delete)
+            )
+            self._records.append_record(
+                OptimizationRecord(node, new_input_node, OptimizationRecordType.replace)
+            )
+            new_node = new_input_node
+        else:
+            self._replace_node(input_node, new_input_node)
+            new_op = op.copy()
+            new_op._key = op.key
+            params = node.params.copy()
+            params.update(node.extra_params)
+            new_entity = (
+                new_op.new_tileable
+                if not isinstance(node, CHUNK_TYPE)
+                else new_op.new_chunk
+            )
+            new_node = new_entity([new_input_node], kws=[params]).data
+            self._replace_node(node, new_node)
+
+            # mark optimization record
+            self._records.append_record(
+                OptimizationRecord(
+                    input_node, new_input_node, OptimizationRecordType.replace
+                )
+            )
+            self._records.append_record(
+                OptimizationRecord(node, new_node, OptimizationRecordType.replace)
+            )
+
+        # replace the node in the graph results if it is one of them
+        try:
+            i = self._graph.results.index(node)
+            self._graph.results[i] = new_node
+        except ValueError:
+            pass
diff --git a/python/xorbits/_mars/optimization/logical/core.py b/python/xorbits/_mars/optimization/logical/core.py
new file mode 100644
index 000000000..6e5cffeb2
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/core.py
@@ -0,0 +1,285 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import weakref
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, List, Optional, Set, Type
+
+from ...core import EntityType, OperandType, enter_mode
+from ...core.graph import EntityGraph
+from ...utils import implements
+
+
+class OptimizationRecordType(Enum):
+    replace = 0
+    new = 1
+    delete = 2
+
+
+@dataclass
+class OptimizationRecord:
+    original_entity: EntityType = None
+    new_entity: EntityType = None
+    record_type: OptimizationRecordType = None
+
+
+class OptimizationRecords:
+    _records: List[OptimizationRecord]
+    _original_entity_to_records: Dict[EntityType, OptimizationRecord]
+
+    def __init__(self):
+        self._records = list()
+        self._original_entity_to_records = dict()
+        self._optimized_entity_to_records = dict()
+
+    def append_record(self, record: OptimizationRecord):
+        self._records.append(record)
+        if record.record_type in (
+            OptimizationRecordType.replace,
+            OptimizationRecordType.delete,
+        ):
+            self._original_entity_to_records[record.original_entity] = record
+        if record.record_type in (
+            OptimizationRecordType.new,
+            OptimizationRecordType.replace,
+        ):
+            self._optimized_entity_to_records[record.new_entity] = record
+
+    def get_optimization_result(
+        self, original_entity: EntityType, default: Optional[EntityType] = None
+    ) -> EntityType:
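+        # follow the chain of replace records to the newest entity; return
+        # ``default`` if the entity was never optimized, or None if it was deleted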
+        entity = original_entity
+        if entity not in self._original_entity_to_records:
+            return default
+        while entity in self._original_entity_to_records:
+            record = self._original_entity_to_records[entity]
+            if record.record_type == OptimizationRecordType.replace:
+                entity = record.new_entity
+            else:
+                assert record.record_type == OptimizationRecordType.delete
+                return None
+        return entity
+
+    def get_original_entity(
+        self, optimized_entity: EntityType, default: Optional[EntityType] = None
+    ) -> EntityType:
+        entity = optimized_entity
+        if entity not in self._optimized_entity_to_records:
+            return default
+        while entity in self._optimized_entity_to_records:
+            record = self._optimized_entity_to_records[entity]
+            if record.record_type == OptimizationRecordType.replace:
+                entity = record.original_entity
+            else:
+                assert record.record_type == OptimizationRecordType.new
+                return None
+        return entity
+
+
+class OptimizationRule(ABC):
+    _preds_to_remove = weakref.WeakKeyDictionary()
+
+    def __init__(
+        self,
+        graph: EntityGraph,
+        records: OptimizationRecords,
+        optimizer_cls: Type["Optimizer"],
+    ):
+        self._graph = graph
+        self._records = records
+        self._optimizer_cls = optimizer_cls
+        self._cached_rule = functools.lru_cache(maxsize=None)(
+            lambda _rule_type: _rule_type(
+                self._graph, self._records, self._optimizer_cls
+            )
+        )
+
+    @abstractmethod
+    def apply(self) -> bool:
+        """
+        Apply the rule to the graph.
+
+        Returns
+        -------
+        bool
+            True if the graph was optimized by this rule.
+        """
+        pass
+
+    def _replace_node(self, original_node: EntityType, new_node: EntityType):
+        predecessors = self._graph.predecessors(original_node)
+        successors = self._graph.successors(original_node)
+        self._graph.remove_node(original_node)
+        self._graph.add_node(new_node)
+        for pred in predecessors:
+            self._graph.add_edge(pred, new_node)
+        for succ in successors:
+            self._graph.add_edge(new_node, succ)
+
+    def _add_collapsable_predecessor(self, node: EntityType, predecessor: EntityType):
+        pred_original = self._records.get_original_entity(predecessor, predecessor)
+        if pred_original not in self._preds_to_remove:
+            self._preds_to_remove[pred_original] = {node}
+        else:
+            self._preds_to_remove[pred_original].add(node)
+
+    def _remove_collapsable_predecessors(self, node: EntityType):
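+        # drop predecessors registered via ``_add_collapsable_predecessor`` once
+        # all of their successors have been collapsed and they are not results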
+        node = self._records.get_optimization_result(node) or node
+        preds_opt_to_remove = []
+        for pred in self._graph.predecessors(node):
+            pred_original = self._records.get_original_entity(pred, pred)
+            pred_opt = self._records.get_optimization_result(pred, pred)
+
+            if pred_opt in self._graph.results or pred_original in self._graph.results:
+                continue
+            affect_succ = self._preds_to_remove.get(pred_original) or []
+            affect_succ_opt = [
+                self._records.get_optimization_result(s, s) for s in affect_succ
+            ]
+            if all(s in affect_succ_opt for s in self._graph.successors(pred)):
+                preds_opt_to_remove.append((pred_original, pred_opt))
+
+        for pred_original, pred_opt in preds_opt_to_remove:
+            self._graph.remove_node(pred_opt)
+            self._records.append_record(
+                OptimizationRecord(pred_original, None, OptimizationRecordType.delete)
+            )
+
+
+class OperandBasedOptimizationRule(OptimizationRule):
+    """
+    Optimization rule that optimizes matched operands of the graph in topological order.
+    """
+
+    _rule_type_to_op_types: Dict[
+        Type[OptimizationRule], Set[Type[OperandType]]
+    ] = defaultdict(set)
+
+    @implements(OptimizationRule.apply)
+    def apply(self) -> bool:
+        visited = set()
+        optimized = False
+        for entity in list(self._graph.topological_iter()):
+            op = entity.op
+            if op in visited:
+                continue
+            visited.add(op)
+
+            if entity not in self._graph:  # pragma: no cover
+                # maybe removed during optimization
+                continue
+            op_types = self._rule_type_to_op_types[type(self)]
+            if isinstance(op, tuple(op_types)) and self.match_operand(op):
+                optimized = True
+                self.apply_to_operand(op)
+
+        return optimized
+
+    @abstractmethod
+    def apply_to_operand(self, op: OperandType) -> None:
+        """
+        Apply this rule to the given operand.
+
+        Parameters
+        ----------
+        op : OperandType
+            Operand.
+        """
+        pass
+
+    @abstractmethod
+    def match_operand(self, op: OperandType) -> bool:
+        """
+        Whether this operand matches this rule.
+
+        Parameters
+        ----------
+        op : OperandType
+            Operand.
+
+        Returns
+        -------
+        bool
+            Whether this operand matches this rule.
+        """
+        pass
+
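+    # Register an operand type (and its direct subclasses) as targets of this rule.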
+    @classmethod
+    def register_operand(cls, op_type: Type[OperandType]):
+        cls._rule_type_to_op_types[cls].add(op_type)
+        for derived in op_type.__subclasses__():
+            cls._rule_type_to_op_types[cls].add(derived)
+
+
+class Optimizer(ABC):
+    _rule_types: List[Type[OptimizationRule]]
+
+    @classmethod
+    def register_rule(cls, rule_type: Type[OptimizationRule]):
+        if not hasattr(cls, "_rule_types"):
+            cls._rule_types = []
+        cls._rule_types.append(rule_type)
+
+    @classmethod
+    def _replace_inputs(cls, graph: EntityGraph, records: OptimizationRecords):
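+        # Point the inputs of every successor at the optimized versions of its
+        # input tileables recorded during optimization.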
+        for node in graph:
+            for succ in graph.successors(node):
+                input_optimized = False
+                new_inputs = []
+                for inp in succ.inputs:
+                    optimized = records.get_optimization_result(inp)
+                    if optimized is None:
+                        optimized = inp
+                    if optimized is not inp:
+                        input_optimized = True
+                    new_inputs.append(optimized)
+                if input_optimized:
+                    succ.inputs = new_inputs
+
+    @classmethod
+    @enter_mode(build=True)
+    def optimize(cls, graph: EntityGraph) -> OptimizationRecords:
+        """
+        Optimize a graph.
+
+        Parameters
+        ----------
+        graph : EntityGraph
+            Tileable or chunk graph.
+
+        Returns
+        -------
+        optimization_records : OptimizationRecords
+            Optimization records.
+        """
+        records = OptimizationRecords()
+        cached_rule = functools.lru_cache(maxsize=None)(
+            lambda _rule_type: _rule_type(graph, records, cls)
+        )
+
+        for rule_type in cls._rule_types:
+            rule = cached_rule(rule_type)
+            if rule.apply():
+                cls._replace_inputs(graph, records)
+                new_results = []
+                for result in graph.results:
+                    new_results.append(
+                        records.get_optimization_result(result, default=result)
+                    )
+                graph.results = new_results
+
+        return records
diff --git a/python/xorbits/_mars/optimization/logical/tileable/__init__.py b/python/xorbits/_mars/optimization/logical/tileable/__init__.py
new file mode 100644
index 000000000..f28460357
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .arithmetic_query import SeriesArithmeticToEval
+
+# TODO: the order in which optimization rules are applied depends on the import
+# order; column pruning must be applied first for now.
+from .column_pruning import ColumnPruningRule
+from .core import optimize
+from .head import HeadPushDown
diff --git a/python/xorbits/_mars/optimization/logical/tileable/arithmetic_query.py b/python/xorbits/_mars/optimization/logical/tileable/arithmetic_query.py
new file mode 100644
index 000000000..56a580aef
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/arithmetic_query.py
@@ -0,0 +1,366 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import weakref
+from typing import NamedTuple, Optional
+
+import numpy as np
+from pandas.api.types import is_scalar
+
+from .... import dataframe as md
+from ....core import ENTITY_TYPE, Tileable, get_output_types
+from ....dataframe.arithmetic.core import DataFrameBinopUfunc, DataFrameUnaryUfunc
+from ....dataframe.base.eval import DataFrameEval
+from ....dataframe.indexing.getitem import DataFrameIndex
+from ....dataframe.indexing.setitem import DataFrameSetitem
+from ....typing import OperandType
+from ....utils import implements
+from ..core import OptimizationRecord, OptimizationRecordType
+from ..tileable.core import register_operand_based_optimization_rule
+from .core import OperandBasedOptimizationRule
+
+
+class EvalExtractRecord(NamedTuple):
+    tileable: Optional[Tileable] = None
+    expr: Optional[str] = None
+    variables: Optional[dict] = None
+
+
+def _get_binop_builder(op_str: str):
+    def builder(lhs: str, rhs: str):
+        return f"({lhs}) {op_str} ({rhs})"
+
+    return builder
+
+
+_func_name_to_builder = {
+    "add": _get_binop_builder("+"),
+    "sub": _get_binop_builder("-"),
+    "mul": _get_binop_builder("*"),
+    "floordiv": _get_binop_builder("//"),
+    "truediv": _get_binop_builder("/"),
+    "pow": _get_binop_builder("**"),
+    "eq": _get_binop_builder("=="),
+    "ne": _get_binop_builder("!="),
+    "lt": _get_binop_builder("<"),
+    "le": _get_binop_builder("<="),
+    "gt": _get_binop_builder(">"),
+    "ge": _get_binop_builder(">="),
+    "__and__": _get_binop_builder("&"),
+    "__or__": _get_binop_builder("|"),
+    "__xor__": _get_binop_builder("^"),
+    "negative": lambda expr: f"-({expr})",
+    "__invert__": lambda expr: f"~({expr})",
+}
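+# For example, _func_name_to_builder["add"]("`a`", "1") yields "(`a`) + (1)";
+# the resulting expression is later evaluated by a DataFrameEval operand.
+
+# Cache of extraction results keyed weakly by tileable, so cached entries are
+# released together with their tileables.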
+_extract_result_cache = weakref.WeakKeyDictionary()
+
+
+@register_operand_based_optimization_rule([DataFrameUnaryUfunc, DataFrameBinopUfunc])
+class SeriesArithmeticToEval(OperandBasedOptimizationRule):
+    _var_counter = 0
+
+    @classmethod
+    def _next_var_id(cls):
+        cls._var_counter += 1
+        return cls._var_counter
+
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: OperandType) -> bool:
+        if op.gpu:
+            return False
+        _, expr, _ = self._extract_eval_expression(op.outputs[0])
+        return expr is not None
+
+    @staticmethod
+    def _is_select_dataframe_column(tileable) -> bool:
+        if not isinstance(tileable, md.Series) or not isinstance(
+            tileable.op, DataFrameIndex
+        ):
+            return False
+
+        input_df = tileable.inputs[0]
+        index_op: DataFrameIndex = tileable.op
+        if (
+            not isinstance(input_df, md.DataFrame)
+            or input_df.dtypes is None
+            or not input_df.dtypes.index.is_unique
+            or any(not isinstance(v, str) for v in input_df.dtypes.keys())
+        ):
+            return False
+
+        return (
+            isinstance(input_df, md.DataFrame)
+            and input_df.dtypes is not None
+            and index_op.col_names is not None
+            and index_op.col_names in input_df.dtypes
+            and index_op.mask is None
+        )
+
+    def _extract_eval_expression(self, tileable) -> EvalExtractRecord:
+        if is_scalar(tileable):
+            if isinstance(tileable, (int, bool, str, bytes, np.integer, np.bool_)):
+                return EvalExtractRecord(expr=repr(tileable))
+            else:
+                var_name = f"__eval_scalar_var{self._next_var_id()}"
+                var_dict = {var_name: tileable}
+                return EvalExtractRecord(expr=f"@{var_name}", variables=var_dict)
+
+        if not isinstance(tileable, ENTITY_TYPE):  # pragma: no cover
+            return EvalExtractRecord()
+
+        if tileable in _extract_result_cache:
+            return _extract_result_cache[tileable]
+
+        if self._is_select_dataframe_column(tileable):
+            result = self._extract_column_select(tileable)
+        elif isinstance(tileable.op, DataFrameUnaryUfunc):
+            result = self._extract_unary(tileable)
+        elif isinstance(tileable.op, DataFrameBinopUfunc):
+            if tileable.op.fill_value is not None or tileable.op.level is not None:
+                result = EvalExtractRecord()
+            else:
+                result = self._extract_binary(tileable)
+        else:
+            result = EvalExtractRecord()
+
+        _extract_result_cache[tileable] = result
+        return result
+
+    @classmethod
+    def _extract_column_select(cls, tileable) -> EvalExtractRecord:
+        return EvalExtractRecord(tileable.inputs[0], f"`{tileable.op.col_names}`")
+
+    def _extract_unary(self, tileable) -> EvalExtractRecord:
+        op = tileable.op
+        func_name = getattr(op, "_func_name", None) or getattr(
+            op, "_bit_func_name", None
+        )
+        if func_name not in _func_name_to_builder:  # pragma: no cover
+            return EvalExtractRecord()
+
+        in_tileable, expr, variables = self._extract_eval_expression(op.inputs[0])
+        if in_tileable is None:
+            return EvalExtractRecord()
+
+        self._add_collapsable_predecessor(tileable, op.inputs[0])
+        return EvalExtractRecord(
+            in_tileable, _func_name_to_builder[func_name](expr), variables
+        )
+
+    def _extract_binary(self, tileable) -> EvalExtractRecord:
+        op = tileable.op
+        func_name = getattr(op, "_func_name", None) or getattr(op, "_bit_func_name")
+        if func_name not in _func_name_to_builder:  # pragma: no cover
+            return EvalExtractRecord()
+
+        lhs_tileable, lhs_expr, lhs_vars = self._extract_eval_expression(op.lhs)
+        if lhs_tileable is not None:
+            self._add_collapsable_predecessor(tileable, op.lhs)
+        rhs_tileable, rhs_expr, rhs_vars = self._extract_eval_expression(op.rhs)
+        if rhs_tileable is not None:
+            self._add_collapsable_predecessor(tileable, op.rhs)
+
+        if lhs_expr is None or rhs_expr is None:
+            return EvalExtractRecord()
+        if (
+            lhs_tileable is not None
+            and rhs_tileable is not None
+            and lhs_tileable.key != rhs_tileable.key
+        ):
+            return EvalExtractRecord()
+
+        variables = (lhs_vars or dict()).copy()
+        variables.update(rhs_vars or dict())
+        in_tileable = next(t for t in [lhs_tileable, rhs_tileable] if t is not None)
+        return EvalExtractRecord(
+            in_tileable, _func_name_to_builder[func_name](lhs_expr, rhs_expr), variables
+        )
+
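+    # Collapse the matched arithmetic subgraph into a single DataFrameEval node
+    # that evaluates the extracted expression on the source dataframe.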
+    @implements(OperandBasedOptimizationRule.apply_to_operand)
+    def apply_to_operand(self, op: OperandType):
+        node = op.outputs[0]
+        in_tileable, expr, variables = self._extract_eval_expression(node)
+        opt_in_tileable = self._records.get_optimization_result(
+            in_tileable, in_tileable
+        )
+
+        new_op = DataFrameEval(
+            _key=node.op.key,
+            _output_types=get_output_types(node),
+            expr=expr,
+            variables=variables or dict(),
+            parser="pandas",
+            is_query=False,
+        )
+        new_node = new_op.new_tileable(
+            [opt_in_tileable], _key=node.key, _id=node.id, **node.params
+        ).data
+
+        self._remove_collapsable_predecessors(node)
+        self._replace_node(node, new_node)
+        self._graph.add_edge(opt_in_tileable, new_node)
+
+        self._records.append_record(
+            OptimizationRecord(node, new_node, OptimizationRecordType.replace)
+        )
+
+        # if the node is one of the graph results, replace it with the new node
+        try:
+            i = self._graph.results.index(node)
+            self._graph.results[i] = new_node
+        except ValueError:
+            pass
+
+
+class _DataFrameEvalRewriteRule(OperandBasedOptimizationRule):
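+    """
+    Base class for rules that rewrite an operand into a ``DataFrameEval`` node
+    when its columnar input has already been optimized into an eval expression.
+    """
+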
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: OperandType) -> bool:
+        optimized_eval_op = self._get_optimized_eval_op(op)
+        if (
+            op.gpu
+            or not isinstance(optimized_eval_op, DataFrameEval)
+            or optimized_eval_op.is_query
+            or optimized_eval_op.inputs[0].key != op.inputs[0].key
+        ):
+            return False
+        return True
+
+    def _build_new_eval_op(self, op: OperandType):
+        raise NotImplementedError
+
+    def _get_optimized_eval_op(self, op: OperandType) -> OperandType:
+        in_columnar_node = self._get_input_columnar_node(op)
+        optimized = self._records.get_optimization_result(in_columnar_node)
+        return optimized.op if optimized is not None else in_columnar_node.op
+
+    def _get_input_columnar_node(self, op: OperandType) -> ENTITY_TYPE:
+        raise NotImplementedError
+
+    def _update_op_node(self, old_node: ENTITY_TYPE, new_node: ENTITY_TYPE):
+        self._replace_node(old_node, new_node)
+        for in_tileable in new_node.inputs:
+            self._graph.add_edge(in_tileable, new_node)
+
+        original_node = self._records.get_original_entity(old_node, old_node)
+        self._records.append_record(
+            OptimizationRecord(original_node, new_node, OptimizationRecordType.replace)
+        )
+
+    @implements(OperandBasedOptimizationRule.apply_to_operand)
+    def apply_to_operand(self, op: OperandType):
+        node = op.outputs[0]
+        in_tileable = op.inputs[0]
+        in_columnar_node = self._get_input_columnar_node(op)
+        opt_in_tileable = self._records.get_optimization_result(
+            in_tileable, in_tileable
+        )
+
+        new_op = self._build_new_eval_op(op)
+        new_node = new_op.new_tileable(
+            [opt_in_tileable], _key=node.key, _id=node.id, **node.params
+        ).data
+
+        self._add_collapsable_predecessor(node, in_columnar_node)
+        self._remove_collapsable_predecessors(node)
+        self._update_op_node(node, new_node)
+
+
+@register_operand_based_optimization_rule([DataFrameIndex])
+class DataFrameBoolEvalToQuery(_DataFrameEvalRewriteRule):
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: DataFrameIndex) -> bool:
+        if (
+            op.col_names is not None
+            or not isinstance(op.mask, md.Series)
+            or op.mask.dtype != bool
+        ):
+            return False
+        return super().match_operand(op)
+
+    def _get_input_columnar_node(self, op: OperandType) -> ENTITY_TYPE:
+        return op.mask
+
+    def _build_new_eval_op(self, op: OperandType):
+        in_eval_op = self._get_optimized_eval_op(op)
+        return DataFrameEval(
+            _key=op.key,
+            _output_types=get_output_types(op.outputs[0]),
+            expr=in_eval_op.expr,
+            variables=in_eval_op.variables,
+            parser="pandas",
+            is_query=True,
+        )
+
+
+@register_operand_based_optimization_rule([DataFrameSetitem])
+class DataFrameEvalSetItemToEval(_DataFrameEvalRewriteRule):
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: DataFrameSetitem):
+        if not isinstance(op.indexes, str) or not isinstance(op.value, md.Series):
+            return False
+        return super().match_operand(op)
+
+    def _get_input_columnar_node(self, op: DataFrameSetitem) -> ENTITY_TYPE:
+        return op.value
+
+    def _build_new_eval_op(self, op: DataFrameSetitem):
+        in_eval_op = self._get_optimized_eval_op(op)
+        return DataFrameEval(
+            _key=op.key,
+            _output_types=get_output_types(op.outputs[0]),
+            expr=f"`{op.indexes}` = {in_eval_op.expr}",
+            variables=in_eval_op.variables,
+            parser="pandas",
+            is_query=False,
+            self_target=True,
+        )
+
+    @implements(OperandBasedOptimizationRule.apply_to_operand)
+    def apply_to_operand(self, op: DataFrameSetitem):
+        super().apply_to_operand(op)
+
+        node = op.outputs[0]
+        opt_node = self._records.get_optimization_result(node, node)
+        if not isinstance(opt_node.op, DataFrameEval):  # pragma: no cover
+            return
+
+        # when encountering consecutive SetItems, their expressions can be merged
+        # into a single multiline expression
+        pred_opt_node = opt_node.inputs[0]
+        if (
+            isinstance(pred_opt_node.op, DataFrameEval)
+            and opt_node.op.parser == pred_opt_node.op.parser == "pandas"
+            and not opt_node.op.is_query
+            and not pred_opt_node.op.is_query
+            and opt_node.op.self_target
+            and pred_opt_node.op.self_target
+        ):
+            new_expr = pred_opt_node.op.expr + "\n" + opt_node.op.expr
+            new_variables = (pred_opt_node.op.variables or dict()).copy()
+            new_variables.update(opt_node.op.variables or dict())
+
+            new_op = DataFrameEval(
+                _key=op.key,
+                _output_types=get_output_types(op.outputs[0]),
+                expr=new_expr,
+                variables=new_variables,
+                parser="pandas",
+                is_query=False,
+                self_target=True,
+            )
+            new_node = new_op.new_tileable(
+                pred_opt_node.inputs, _key=node.key, _id=node.id, **node.params
+            ).data
+
+            self._add_collapsable_predecessor(opt_node, pred_opt_node)
+            self._remove_collapsable_predecessors(opt_node)
+            self._update_op_node(opt_node, new_node)
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/__init__.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/__init__.py
new file mode 100644
index 000000000..0ef066397
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .column_pruning_rule import ColumnPruningRule
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/column_pruning_rule.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/column_pruning_rule.py
new file mode 100644
index 000000000..41927a78a
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/column_pruning_rule.py
@@ -0,0 +1,241 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Dict, Set, Any, Type, Union, Optional
+
+import pandas as pd
+
+from .input_column_selector import InputColumnSelector
+from .self_column_selector import SelfColumnSelector
+from ..core import register_optimization_rule
+from ...core import (
+    OptimizationRecord,
+    OptimizationRecordType,
+    OptimizationRule,
+    OptimizationRecords,
+    Optimizer,
+)
+from .....core import TileableData
+from .....core.graph import EntityGraph
+from .....dataframe.core import (
+    parse_index,
+    BaseSeriesData,
+    BaseDataFrameData,
+)
+from .....dataframe.datasource.core import ColumnPruneSupportedDataSourceMixin
+from .....dataframe.groupby.aggregation import DataFrameGroupByAgg
+from .....dataframe.indexing.getitem import DataFrameIndex
+from .....dataframe.merge import DataFrameMerge
+from .....utils import implements
+
+OPTIMIZABLE_OP_TYPES = (DataFrameMerge, DataFrameGroupByAgg)
+
+
+@register_optimization_rule()
+class ColumnPruningRule(OptimizationRule):
+    def __init__(
+        self,
+        graph: EntityGraph,
+        records: OptimizationRecords,
+        optimizer_cls: Type["Optimizer"],
+    ):
+        super().__init__(graph, records, optimizer_cls)
+        self._context: Dict[TileableData, Dict[TileableData, Set[Any]]] = {}
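+        # Maps each tileable to the columns it requires from each of its
+        # predecessors; filled in reverse topological order by _build_context.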
+
+    def _get_successor_required_columns(self, data: TileableData) -> Set[Any]:
+        """
+        Get columns required by the successors of the given tileable data.
+        """
+        successors = self._get_successors(data)
+        if successors:
+            return set().union(
+                *[self._context[successor][data] for successor in successors]
+            )
+        else:
+            return self._get_all_columns(data)
+
+    @staticmethod
+    def _get_self_required_columns(data: TileableData) -> Set[Any]:
+        return SelfColumnSelector.select(data)
+
+    def _get_required_columns(self, data: TileableData) -> Optional[Set[Any]]:
+        required_columns = set()
+        successor_required_columns = self._get_successor_required_columns(data)
+        if successor_required_columns is None:
+            return None
+        required_columns.update(successor_required_columns)
+        self_required_columns = self._get_self_required_columns(data)
+        required_columns.update(self_required_columns)
+        return required_columns
+
+    @staticmethod
+    def _get_all_columns(data: TileableData) -> Union[Set[Any], None]:
+        """
+        Return all the columns of the given tileable data. If it is neither
+        BaseDataFrameData nor BaseSeriesData, None is returned, indicating that
+        column pruning is not available for it.
+        """
+        if isinstance(data, BaseDataFrameData) and data.dtypes is not None:
+            return set(data.dtypes.index)
+        elif isinstance(data, BaseSeriesData):
+            return {data.name}
+        else:
+            return None
+
+    def _get_successors(self, data: TileableData) -> List[TileableData]:
+        """
+        Get successors of the given tileable data.
+
+        Column pruning is available only when every successor is available for column pruning
+        (i.e. appears in the context).
+        """
+        successors = list(self._graph.successors(data))
+        if all(successor in self._context for successor in successors):
+            return successors
+        else:
+            return []
+
+    def _build_context(self) -> None:
+        """
+        Select required columns for each tileable data in the graph.
+        """
+        for data in self._graph.topological_iter(reverse=True):
+            if self._is_skipped_type(data):
+                continue
+            self._context[data] = InputColumnSelector.select(
+                data, self._get_successor_required_columns(data)
+            )
+
+    def _prune_columns(self) -> List[TileableData]:
+        pruned_nodes: List[TileableData] = []
+        datasource_nodes: List[TileableData] = []
+
+        node_list = list(self._graph.topological_iter())
+        for data in node_list:
+            if self._is_skipped_type(data):
+                continue
+
+            op = data.op
+
+            successor_required_columns = self._get_successor_required_columns(data)
+            if (
+                isinstance(op, ColumnPruneSupportedDataSourceMixin)
+                and successor_required_columns is not None
+                and set(successor_required_columns) != self._get_all_columns(data)
+            ):
+                op.set_pruned_columns(list(successor_required_columns))
+                self.effective = True
+                pruned_nodes.append(data)
+                datasource_nodes.append(data)
+                continue
+
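+            # For merge and groupby-agg operands, insert a getitem (DataFrameIndex)
+            # node in front of each prunable predecessor so that only the required
+            # columns are passed on.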
+            if isinstance(op, OPTIMIZABLE_OP_TYPES):
+                predecessors = list(self._graph.predecessors(data))
+                for predecessor in predecessors:
+                    if (
+                        self._is_skipped_type(predecessor)
+                        or predecessor in datasource_nodes
+                        # if the group by key is a series, no need to do column pruning
+                        or isinstance(predecessor, BaseSeriesData)
+                    ):
+                        continue
+
+                    pruned_columns = list(self._context[data][predecessor])
+                    if set(pruned_columns) == self._get_all_columns(predecessor):
+                        continue
+
+                    # new node init
+                    new_node_op = DataFrameIndex(
+                        col_names=pruned_columns,
+                    )
+                    new_params = predecessor.params.copy()
+                    new_params["shape"] = (
+                        new_params["shape"][0],
+                        len(pruned_columns),
+                    )
+                    new_params["dtypes"] = new_params["dtypes"][pruned_columns]
+                    new_params["columns_value"] = parse_index(
+                        new_params["dtypes"].index, store_data=True
+                    )
+                    new_node = new_node_op.new_dataframe(
+                        [predecessor], **new_params
+                    ).data
+
+                    # update context
+                    del self._context[data][predecessor]
+                    self._context[new_node] = {predecessor: set(pruned_columns)}
+                    self._context[data][new_node] = set(pruned_columns)
+
+                    # change edges and nodes
+                    self._graph.remove_edge(predecessor, data)
+                    self._graph.add_node(new_node)
+                    self._graph.add_edge(predecessor, new_node)
+                    self._graph.add_edge(new_node, data)
+
+                    self._records.append_record(
+                        OptimizationRecord(
+                            predecessor, new_node, OptimizationRecordType.new
+                        )
+                    )
+                    # update inputs
+                    data.inputs[data.inputs.index(predecessor)] = new_node
+                    self.effective = True
+                    pruned_nodes.extend([predecessor])
+        return pruned_nodes
+
+    def _update_tileable_params(self, pruned_nodes: List[TileableData]) -> None:
+        # change dtypes and columns_value
+        queue = list(pruned_nodes)
+        affected_nodes = set()
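+        # Breadth-first walk from the pruned nodes to collect every downstream
+        # tileable whose metadata (dtypes, columns, shape) may need updating.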
+        while len(queue) > 0:
+            node = queue.pop(0)
+            if isinstance(node.op, ColumnPruneSupportedDataSourceMixin):
+                affected_nodes.add(node)
+            for successor in self._graph.successors(node):
+                if successor not in affected_nodes:
+                    queue.append(successor)
+                    if not self._is_skipped_type(successor):
+                        affected_nodes.add(successor)
+
+        for node in affected_nodes:
+            required_columns = self._get_required_columns(node)
+            if (
+                isinstance(node, BaseDataFrameData)
+                and required_columns is not None
+                and set(required_columns) != set(node.dtypes.index)
+            ):
+                new_dtypes = pd.Series(
+                    dict(
+                        (col, dtype)
+                        for col, dtype in node.dtypes.items()
+                        if col in required_columns
+                    )
+                )
+                new_columns_value = parse_index(new_dtypes.index, store_data=True)
+                node._dtypes = new_dtypes
+                node._columns_value = new_columns_value
+                node._shape = (node.shape[0], len(new_dtypes))
+
+    @implements(OptimizationRule.apply)
+    def apply(self) -> bool:
+        self._build_context()
+        pruned_nodes = self._prune_columns()
+        self._update_tileable_params(pruned_nodes)
+        return len(pruned_nodes) > 0
+
+    @staticmethod
+    def _is_skipped_type(data: TileableData) -> bool:
+        """
+        If column pruning should be applied to the given tileable data.
+        """
+        return not isinstance(data, (BaseSeriesData, BaseDataFrameData))
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/input_column_selector.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/input_column_selector.py
new file mode 100644
index 000000000..4481dca91
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/input_column_selector.py
@@ -0,0 +1,255 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import Callable, Dict, Any, Set
+
+from .....core import TileableData
+from .....dataframe import NamedAgg
+from .....dataframe.arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
+from .....dataframe.core import (
+    BaseDataFrameData,
+    BaseSeriesData,
+)
+from .....dataframe.groupby.aggregation import DataFrameGroupByAgg
+from .....dataframe.indexing.getitem import DataFrameIndex
+from .....dataframe.indexing.setitem import DataFrameSetitem
+from .....dataframe.merge import DataFrameMerge
+from .....typing import OperandType
+from .utils import get_cols_exclude_index
+
+
+class InputColumnSelector:
+    _OP_TO_SELECT_FUNCTION = {}
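+    # Maps operand classes to functions that compute, for a tileable, the columns
+    # it requires from each of its inputs.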
+
+    @staticmethod
+    def select_all_input_columns(
+        tileable_data: TileableData, _required_cols: Set[Any]
+    ) -> Dict[TileableData, Set[Any]]:
+        ret = {}
+        for inp in tileable_data.op.inputs:
+            if isinstance(inp, BaseDataFrameData):
+                ret[inp] = set(inp.dtypes.index)
+            elif isinstance(inp, BaseSeriesData):
+                ret[inp] = {inp.name}
+        return ret
+
+    @staticmethod
+    def select_required_input_columns(
+        tileable_data: TileableData, required_cols: Set[Any]
+    ) -> Dict[TileableData, Set[Any]]:
+        ret = {}
+        for inp in tileable_data.op.inputs:
+            if isinstance(inp, BaseDataFrameData):
+                ret[inp] = required_cols.intersection(set(inp.dtypes.index))
+            elif isinstance(inp, BaseSeriesData):
+                ret[inp] = {inp.name}
+        return ret
+
+    @classmethod
+    def register(
+        cls,
+        op_cls: OperandType,
+        func: Callable[[TileableData, Set[Any]], Dict[TileableData, Set[Any]]],
+        replace: bool = False,
+    ) -> None:
+        if op_cls not in cls._OP_TO_SELECT_FUNCTION or replace:
+            cls._OP_TO_SELECT_FUNCTION[op_cls] = func
+        else:
+            raise ValueError(f"key {op_cls} exists.")
+
+    @classmethod
+    def unregister(cls, op_cls: OperandType) -> None:
+        if op_cls in cls._OP_TO_SELECT_FUNCTION:
+            del cls._OP_TO_SELECT_FUNCTION[op_cls]
+
+    @classmethod
+    def select(
+        cls, tileable_data: TileableData, required_cols: Set[Any]
+    ) -> Dict[TileableData, Set[Any]]:
+        """
+        Get the column pruning results of given tileable data.
+
+        Parameters
+        ----------
+        tileable_data : TileableData
+            The tileable data to be processed.
+        required_cols: List[Any]
+            Names of columns required by the successors of the given tileable data. If required_cols is None, all the
+            input columns will be selected.
+        Returns
+        -------
+        Dict[TileableData: List[Any]]
+            A dictionary that represents the column pruning results. For every key-value pairs in the dictionary, the
+            key is a predecessor of the given tileable data, and the value is a list of column names that the given
+            tileable data depends on.
+        """
+        if required_cols is None:
+            return cls.select_all_input_columns(tileable_data, set())
+
+        op_type = type(tileable_data.op)
+        if op_type in cls._OP_TO_SELECT_FUNCTION:
+            return cls._OP_TO_SELECT_FUNCTION[op_type](tileable_data, required_cols)
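+        # Fall back to a selector registered for a base class and cache it under
+        # the concrete operand type to speed up subsequent lookups.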
+        for op_cls in op_type.__mro__:
+            if op_cls in cls._OP_TO_SELECT_FUNCTION:
+                cls._OP_TO_SELECT_FUNCTION[op_type] = cls._OP_TO_SELECT_FUNCTION[op_cls]
+                return cls._OP_TO_SELECT_FUNCTION[op_cls](tileable_data, required_cols)
+        return cls.select_all_input_columns(tileable_data, required_cols)
+
+
+def register_selector(op_type: OperandType) -> Callable:
+    def wrap(selector_func: Callable):
+        InputColumnSelector.register(op_type, selector_func)
+        return selector_func
+
+    return wrap
+
+
+@register_selector(DataFrameMerge)
+def df_merge_select_function(
+    tileable_data: TileableData, required_cols: Set[Any]
+) -> Dict[TileableData, Set[Any]]:
+    op: DataFrameMerge = tileable_data.op
+    assert len(op.inputs) == 2
+    assert isinstance(op.inputs[0], BaseDataFrameData)
+    assert isinstance(op.inputs[1], BaseDataFrameData)
+    left_data: BaseDataFrameData = op.inputs[0]
+    right_data: BaseDataFrameData = op.inputs[1]
+
+    ret = defaultdict(set)
+    for df, suffix in zip([left_data, right_data], op.suffixes):
+        for col in df.dtypes.index:
+            if col in required_cols:
+                ret[df].add(col)
+            else:
+                # TODO: this does not work when col is a tuple.
+                suffix_col = str(col) + suffix
+                if suffix_col in required_cols:
+                    ret[df].add(col)
+                    # The column in the other dataframe has to be selected as well;
+                    # otherwise there will be no column with the suffix at runtime.
+                    other_data = right_data if df is left_data else left_data
+                    ret[other_data].add(col)
+
+    if op.on is not None:
+        ret[left_data].update(get_cols_exclude_index(left_data, op.on))
+        ret[right_data].update(get_cols_exclude_index(right_data, op.on))
+    if op.left_on is not None:
+        ret[left_data].update(get_cols_exclude_index(left_data, op.left_on))
+    if op.right_on is not None:
+        ret[right_data].update(get_cols_exclude_index(right_data, op.right_on))
+
+    return ret
+
+
+@register_selector(DataFrameGroupByAgg)
+def df_groupby_agg_select_function(
+    tileable_data: TileableData, required_cols: Set[Any]
+) -> Dict[TileableData, Set[Any]]:
+    op: DataFrameGroupByAgg = tileable_data.op
+    assert isinstance(op.inputs[0], (BaseDataFrameData, BaseSeriesData))
+    inp: BaseDataFrameData = op.inputs[0]
+    by = op.groupby_params["by"]
+    selection = op.groupby_params.get("selection", None)
+    raw_func = op.raw_func
+
+    ret = {}
+    # group by a series
+    groupby_series = False
+    if isinstance(by, list) and len(by) == 1 and isinstance(by[0], BaseSeriesData):
+        groupby_series = True
+        ret[by[0]] = {by[0].name}
+
+    if isinstance(inp, BaseSeriesData):
+        ret[inp] = {inp.name}
+    else:
+        selected_cols = set()
+        # group by keys should be included
+        if not groupby_series:
+            selected_cols.update(get_cols_exclude_index(inp, by))
+        # add agg columns
+        if op.raw_func is not None:
+            if op.raw_func == "size":
+                # special for size, its return value is always series
+                pass
+            elif isinstance(raw_func, dict):
+                selected_cols.update(set(raw_func.keys()))
+            else:
+                # no specified agg columns
+                # required_cols should always be a subset of selection
+                for col in required_cols:
+                    # col is a tuple when required col is a MultiIndex
+                    if isinstance(col, tuple):
+                        for c in col:
+                            selected_cols.add(c)
+                    selected_cols.add(col)
+                if selection is not None:
+                    if isinstance(selection, (list, tuple)):
+                        selected_cols.update(set(selection))
+                    else:
+                        selected_cols.add(selection)
+        elif op.raw_func_kw:
+            # add renamed columns
+            for _, origin in op.raw_func_kw.items():
+                if isinstance(origin, NamedAgg):
+                    selected_cols.add(origin.column)
+                else:
+                    assert isinstance(origin, tuple)
+                    selected_cols.add(origin[0])
+
+        ret[inp] = selected_cols.intersection(inp.dtypes.index)
+    return ret
+
+
+@register_selector(DataFrameSetitem)
+def df_setitem_select_function(
+    tileable_data: TileableData, required_cols: Set[Any]
+) -> Dict[TileableData, Set[Any]]:
+    if len(tileable_data.inputs) == 1:
+        # if value is not a Mars object, return required input columns
+        return InputColumnSelector.select_required_input_columns(
+            tileable_data, required_cols
+        )
+    else:
+        df, value = tileable_data.inputs
+        ret = {df: required_cols.intersection(set(df.dtypes.index))}
+        # if value is a Mars object, return all its columns so that setitem can be executed
+        if isinstance(value, BaseDataFrameData):
+            value_cols = set(value.dtypes.index)
+            ret[value] = value_cols
+        elif isinstance(value, BaseSeriesData):
+            value_cols = {value.name}
+            ret[value] = value_cols
+        return ret
+
+
+@register_selector(DataFrameIndex)
+def df_getitem_select_function(
+    tileable_data: TileableData, required_cols: Set[Any]
+) -> Dict[TileableData, Set[Any]]:
+    if tileable_data.op.col_names:
+        return InputColumnSelector.select_required_input_columns(
+            tileable_data, required_cols
+        )
+    else:
+        return InputColumnSelector.select_all_input_columns(
+            tileable_data, required_cols
+        )
+
+
+SELECT_REQUIRED_OP_TYPES = [DataFrameBinOp, DataFrameUnaryOp]
+for op_type in SELECT_REQUIRED_OP_TYPES:
+    InputColumnSelector.register(
+        op_type, InputColumnSelector.select_required_input_columns
+    )
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/self_column_selector.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/self_column_selector.py
new file mode 100644
index 000000000..6c2ec83e2
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/self_column_selector.py
@@ -0,0 +1,156 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Set, Any, Callable
+
+from .utils import get_cols_exclude_index
+from .....core import TileableData
+from .....dataframe.core import BaseDataFrameData, BaseSeriesData
+from .....dataframe.groupby.aggregation import DataFrameGroupByAgg
+from .....dataframe.indexing.getitem import DataFrameIndex
+from .....dataframe.indexing.setitem import DataFrameSetitem
+from .....dataframe.merge import DataFrameMerge
+from .....typing import OperandType
+
+
+class SelfColumnSelector:
+    _OP_TO_SELECT_FUNCTION = {}
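+    # Maps operand classes to functions that compute the columns a tileable itself
+    # must keep (e.g. group-by or merge keys).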
+
+    @classmethod
+    def register(
+        cls,
+        op_cls: OperandType,
+        func: Callable[[TileableData], Set[Any]],
+        replace: bool = False,
+    ) -> None:
+        if op_cls not in cls._OP_TO_SELECT_FUNCTION or replace:
+            cls._OP_TO_SELECT_FUNCTION[op_cls] = func
+        else:
+            raise ValueError(f"key {op_cls} exists.")
+
+    @classmethod
+    def select(cls, tileable_data: TileableData) -> Set[Any]:
+        """
+        TODO: docstring
+        """
+        op_type = type(tileable_data.op)
+        if op_type in cls._OP_TO_SELECT_FUNCTION:
+            return cls._OP_TO_SELECT_FUNCTION[op_type](tileable_data)
+        for op_cls in op_type.__mro__:
+            if op_cls in cls._OP_TO_SELECT_FUNCTION:
+                cls._OP_TO_SELECT_FUNCTION[op_type] = cls._OP_TO_SELECT_FUNCTION[op_cls]
+                return cls._OP_TO_SELECT_FUNCTION[op_cls](tileable_data)
+        return set()
+
+
+def register_selector(op_type: OperandType) -> Callable:
+    def wrap(selector_func: Callable):
+        SelfColumnSelector.register(op_type, selector_func)
+        return selector_func
+
+    return wrap
+
+
+@register_selector(DataFrameSetitem)
+def df_setitem_select_function(tileable_data: TileableData) -> Set[Any]:
+    if isinstance(tileable_data.op.indexes, list):
+        return set(tileable_data.op.indexes)
+    else:
+        return {tileable_data.op.indexes}
+
+
+@register_selector(DataFrameIndex)
+def df_getitem_select_function(tileable_data: TileableData) -> Set[Any]:
+    if tileable_data.op.col_names is not None:
+        col_names = tileable_data.op.col_names
+        if isinstance(col_names, list):
+            return set(tileable_data.op.col_names)
+        else:
+            return {tileable_data.op.col_names}
+    else:
+        if isinstance(tileable_data, BaseDataFrameData):
+            return set(tileable_data.dtypes.index)
+        elif isinstance(tileable_data, BaseSeriesData):
+            return {tileable_data.name}
+        else:
+            return set()
+
+
+@register_selector(DataFrameGroupByAgg)
+def df_groupby_agg_select_function(tileable_data: TileableData) -> Set[Any]:
+    """
+    Make sure the "group by columns" are preserved.
+    """
+
+    op: DataFrameGroupByAgg = tileable_data.op
+    by = op.groupby_params["by"]
+
+    if isinstance(tileable_data, BaseDataFrameData):
+        return get_cols_exclude_index(tileable_data, by)
+    elif isinstance(tileable_data, BaseSeriesData):
+        return {tileable_data.name}
+    else:
+        return set()
+
+
+@register_selector(DataFrameMerge)
+def df_merge_select_function(tileable_data: TileableData) -> Set[Any]:
+    """
+    Make sure the merge keys are preserved.
+    """
+
+    op: DataFrameMerge = tileable_data.op
+    on = op.on
+    if on is not None:
+        return get_cols_exclude_index(tileable_data, on)
+
+    ret = set()
+    left_data: BaseDataFrameData = op.inputs[0]
+    right_data: BaseDataFrameData = op.inputs[1]
+    left_index = op.left_index
+    right_index = op.right_index
+    left_on = op.left_on if isinstance(op.left_on, list) else [op.left_on]
+    right_on = op.right_on if isinstance(op.right_on, list) else [op.right_on]
+
+    if left_index and right_index:
+        return ret
+
+    if left_index:
+        for col in right_data.dtypes.index:
+            if col in right_on:
+                ret.add(col)
+        return ret
+    if right_index:
+        for col in left_data.dtypes.index:
+            if col in left_on:
+                ret.add(col)
+        return ret
+
+    for data, merge_keys, suffix in zip(
+        [left_data, right_data], [left_on, right_on], op.suffixes
+    ):
+        if merge_keys is None:
+            continue
+        for col in data.dtypes.index:
+            if col in merge_keys:
+                other_data = right_data if data is left_data else left_data
+                other_merge_keys = right_on if merge_keys is left_on else left_on
+
+                if col in other_data.dtypes.index and col not in other_merge_keys:
+                    # if the merge key exists in the other dataframe but not in the other
+                    # dataframe's merge keys, suffixes will be added.
+                    # TODO: this does not work when col is a tuple.
+                    suffix_col = str(col) + suffix
+                    ret.add(suffix_col)
+                else:
+                    ret.add(col)
+    return ret
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/__init__.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/__init__.py
new file mode 100644
index 000000000..313d6ba7a
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_column_pruning.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_column_pruning.py
new file mode 100644
index 000000000..7190d9287
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_column_pruning.py
@@ -0,0 +1,592 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+
+import pandas as pd
+import pytest
+
+from ...... import dataframe as md
+from ...... import tensor as mt
+from ......dataframe.arithmetic import DataFrameMul
+from ......dataframe.base.eval import DataFrameEval
+from ......dataframe.base.isin import DataFrameIsin
+from ......dataframe.core import DataFrameData, SeriesData, DataFrameGroupByData
+from ......dataframe.datasource.dataframe import DataFrameDataSource
+from ......dataframe.datasource.read_csv import DataFrameReadCSV
+from ......dataframe.datasource.read_parquet import DataFrameReadParquet
+from ......dataframe.groupby.aggregation import DataFrameGroupByAgg
+from ......dataframe.groupby.core import DataFrameGroupByOperand
+from ......dataframe.indexing.getitem import DataFrameIndex
+from ......dataframe.indexing.setitem import DataFrameSetitem
+from ......dataframe.merge import DataFrameMerge
+from ......optimization.logical.tileable import optimize
+from ......tensor.core import TensorData
+from ......tensor.datasource import ArrayDataSource
+
+
+@pytest.fixture()
+def gen_data1():
+    with tempfile.TemporaryDirectory() as tempdir:
+        df = pd.DataFrame(
+            {
+                "c1": [3, 4, 5, 3, 5, 4, 1, 2, 3],
+                "c2": [1, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c3": list("aabaaddce"),
+                "c4": list("abaaaddce"),
+            }
+        )
+
+        df2 = pd.DataFrame(
+            {
+                "c1": [3, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c2": [1, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c3": list("aabaaddce"),
+                "c4": list("abaaaddce"),
+            }
+        )
+        file_path = os.path.join(tempdir, "test.csv")
+        file_path2 = os.path.join(tempdir, "test2.csv")
+
+        df.to_csv(file_path, index=False)
+        df2.to_csv(file_path2, index=False)
+        yield file_path, file_path2
+
+
+@pytest.fixture()
+def gen_data2():
+    with tempfile.TemporaryDirectory() as tempdir:
+        df = pd.DataFrame(
+            {
+                "c1": [3, 4, 5, 3, 5, 4, 1, 2, 3],
+                "c2": [1, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c3": [1, 3, 4, 1, 1, 9, 4, 4, 4],
+                "c4": [3, 0, 5, 3, 5, 4, 1, 2, 10],
+            }
+        )
+
+        df2 = pd.DataFrame(
+            {
+                "cc1": [3, 4, 5, 3, 5, 4, 1, 2, 3],
+                "cc2": [1, 6, 4, 5, 6, 5, 4, 4, 4],
+                "cc3": [1, 3, 4, 1, 1, 9, 4, 8, 4],
+                "cc4": [3, 0, 5, 3, 5, 4, 1, 2, 10],
+            }
+        )
+
+        file_path = os.path.join(tempdir, "test.pq")
+        file_path2 = os.path.join(tempdir, "test2.pq")
+        df.to_parquet(file_path)
+        df2.to_parquet(file_path2)
+        yield file_path, file_path2
+
+
+def test_groupby(setup, gen_data2):
+    # no column pruning
+    file_path, file_path2 = gen_data2
+    df1 = md.read_parquet(file_path)
+    df2 = md.read_parquet(file_path2)
+    m = df1.merge(df2, left_on="c1", right_on="cc1")
+    g = m.groupby(["c1"])
+
+    graph = g.build_graph()
+    optimize(graph)
+
+    assert len(graph.result_tileables) == 1
+    groupby_data = graph.result_tileables[0]
+    assert isinstance(groupby_data, DataFrameGroupByData)
+    assert isinstance(groupby_data.op, DataFrameGroupByOperand)
+    assert len(groupby_data.dtypes) == 8
+
+    assert len(groupby_data.inputs) == 1
+    merge_data = groupby_data.inputs[0]
+    assert isinstance(merge_data, DataFrameData)
+    assert isinstance(merge_data.op, DataFrameMerge)
+    assert len(merge_data.dtypes) == 8
+
+    assert len(merge_data.inputs) == 2
+    left_data = merge_data.inputs[0]
+    right_data = merge_data.inputs[1]
+    assert isinstance(left_data, DataFrameData)
+    assert isinstance(left_data.op, DataFrameReadParquet)
+    assert len(left_data.dtypes) == 4
+    assert isinstance(right_data, DataFrameData)
+    assert isinstance(right_data.op, DataFrameReadParquet)
+    assert len(right_data.dtypes) == 4
+
+
+def test_tensor(setup):
+    t = mt.tensor((1, 2, 3))
+    s = md.DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6)}).isin(t)
+
+    graph = s.build_graph()
+    optimize(graph)
+
+    assert len(graph.result_tileables) == 1
+    isin_data = graph.result_tileables[0]
+    assert isinstance(isin_data, DataFrameData)
+    assert isinstance(isin_data.op, DataFrameIsin)
+    assert len(isin_data.dtypes) == 2
+
+    assert len(isin_data.inputs) == 2
+    df_data = isin_data.inputs[0]
+    assert isinstance(df_data, DataFrameData)
+    assert isinstance(df_data.op, DataFrameDataSource)
+    assert len(df_data.dtypes) == 2
+
+    tensor_data = isin_data.inputs[1]
+    assert isinstance(tensor_data, TensorData)
+    assert isinstance(tensor_data.op, ArrayDataSource)
+
+
+def test_groupby_agg(setup, gen_data1):
+    file_path, _ = gen_data1
+
+    df1 = md.read_csv(file_path)
+    c = df1.groupby("c1")["c2"].sum()
+
+    graph = c.build_graph()
+    optimize(graph)
+    groupby_agg_node = graph.result_tileables[0]
+    assert isinstance(groupby_agg_node, SeriesData)
+    assert isinstance(groupby_agg_node.op, DataFrameGroupByAgg)
+    assert type(groupby_agg_node.op) is DataFrameGroupByAgg
+    assert groupby_agg_node.name == "c2"
+
+    groupby_agg_node_preds = graph.predecessors(groupby_agg_node)
+    assert len(groupby_agg_node_preds) == 1
+    read_csv_node = groupby_agg_node_preds[0]
+    assert isinstance(read_csv_node, DataFrameData)
+    assert isinstance(read_csv_node.op, DataFrameReadCSV)
+    assert len(read_csv_node.op.usecols) == 2
+    assert set(read_csv_node.op.usecols) == {"c1", "c2"}
+
+    raw = pd.read_csv(file_path)
+    pd_res = raw.groupby("c1")["c2"].sum()
+    r = c.execute().fetch()
+    pd.testing.assert_series_equal(r, pd_res)
+
+
+def test_merge_and_getitem(setup, gen_data1):
+    file_path, file_path2 = gen_data1
+    df1 = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2, names=["c1", "c2", "cc3", "cc4"], header=0)
+    r = df1.merge(df2)["c1"]
+
+    graph = r.build_graph()
+    optimize(graph)
+
+    index_node = graph.result_tileables[0]
+    assert isinstance(index_node.op, DataFrameIndex)
+    assert index_node.name == "c1"
+
+    assert len(graph.predecessors(index_node)) == 1
+    merge_node = graph.predecessors(index_node)[0]
+    assert type(merge_node.op) is DataFrameMerge
+
+    read_csv_node_left, read_csv_node_right = graph.predecessors(merge_node)
+    assert type(read_csv_node_left.op) is DataFrameReadCSV
+    assert type(read_csv_node_right.op) is DataFrameReadCSV
+    assert len(read_csv_node_left.op.usecols) == 2
+    assert len(read_csv_node_right.op.usecols) == 2
+    assert set(read_csv_node_left.op.usecols) == {"c1", "c2"}
+    assert set(read_csv_node_right.op.usecols) == {"c1", "c2"}
+
+    r = r.execute().fetch()
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2, names=["c1", "c2", "cc3", "cc4"], header=0)
+    expected = raw1.merge(raw2)["c1"]
+    pd.testing.assert_series_equal(r, expected)
+
+
+def test_merge_on_one_column(setup, gen_data1):
+    file_path, file_path2 = gen_data1
+    df1 = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2)
+    c = df1.merge(df2, left_on="c1", right_on="c1")["c1"]
+
+    graph = c.build_graph()
+    optimize(graph)
+
+    index_node = graph.result_tileables[0]
+    assert type(index_node.op) is DataFrameIndex
+
+    index_node_preds = graph.predecessors(index_node)
+    assert len(index_node_preds) == 1
+
+    merge_node = index_node_preds[0]
+    assert type(merge_node.op) is DataFrameMerge
+
+    merge_node_preds = graph.predecessors(merge_node)
+    assert len(merge_node_preds) == 2
+
+    read_csv_node = merge_node_preds[0]
+    read_csv_op = read_csv_node.op
+    assert type(read_csv_op) is DataFrameReadCSV
+    assert len(read_csv_op.usecols) == 1
+    assert read_csv_op.usecols == ["c1"]
+
+    r = c.execute().fetch()
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    expected = raw1.merge(raw2, left_on="c1", right_on="c1")["c1"]
+    pd.testing.assert_series_equal(r, expected)
+
+
+def test_merge_on_two_columns(setup, gen_data1):
+    file_path, file_path2 = gen_data1
+    df1 = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2)
+    c = df1.merge(df2, left_on=["c1", "c2"], right_on=["c1", "c2"])[["c1", "c2"]]
+
+    graph = c.build_graph()
+    optimize(graph)
+
+    index_node = graph.result_tileables[0]
+    assert type(index_node.op) is DataFrameIndex
+    assert len(index_node.op.col_names) == 2
+
+    merge_node = graph.predecessors(index_node)[0]
+    read_csv_node = graph.predecessors(merge_node)[0]
+    assert type(read_csv_node.op) is DataFrameReadCSV
+
+    use_cols = read_csv_node.op.usecols
+    assert len(use_cols) == 2
+    assert set(use_cols) == {"c1", "c2"}
+
+    r = c.execute().fetch()
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    expected = raw1.merge(raw2, left_on=["c1", "c2"], right_on=["c1", "c2"])[
+        ["c1", "c2"]
+    ]
+    pd.testing.assert_frame_equal(r, expected)
+
+
+def test_groupby_agg_then_merge(setup, gen_data1):
+    file_path, file_path2 = gen_data1
+    df1 = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2)
+    r_group_res = df1.groupby(["c1"])[["c2"]].sum()
+    c = df2.merge(r_group_res, left_on=["c2"], right_on=["c2"])[["c1", "c3"]]
+    graph = c.build_graph()
+    optimize(graph)
+    r = c.execute().fetch()
+
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    group_res = raw1.groupby(["c1"])[["c2"]].sum()
+    expected = raw2.merge(group_res, left_on=["c2"], right_on=["c2"])[["c1", "c3"]]
+    pd.testing.assert_frame_equal(r, expected)
+
+    index_node = graph.result_tileables[0]
+    assert type(index_node.op) is DataFrameIndex
+
+    merge_node = graph.predecessors(index_node)[0]
+    merge_node_preds = graph.predecessors(merge_node)
+
+    df2_node = [n for n in merge_node_preds if type(n.op) is DataFrameReadCSV][0]
+    assert set(df2_node.op.usecols) == {"c1", "c2", "c3"}
+
+    df1_node = [
+        n
+        for n in graph._nodes
+        if type(n.op) is DataFrameReadCSV and n.op.path == file_path
+    ][0]
+    assert type(df1_node.op) is DataFrameReadCSV
+    assert set(df1_node.op.usecols) == {"c1", "c2"}
+
+
+def test_merge_then_groupby_apply(setup, gen_data2):
+    file_path, file_path2 = gen_data2
+    df1 = md.read_parquet(file_path)
+    df2 = md.read_parquet(file_path2)
+
+    c = (
+        (
+            ((df1 + 1) * 2).merge(df2, left_on=["c1", "c3"], right_on=["cc2", "cc4"])[
+                ["c1", "cc4"]
+            ]
+            * 2
+        )
+        .groupby(["cc4"])
+        .apply(lambda x: x / x.sum())
+    )
+    graph = c.build_graph()
+    optimize(graph)
+    r = c.execute().fetch()
+
+    raw1 = pd.read_parquet(file_path)
+    raw2 = pd.read_parquet(file_path2)
+    expected = (
+        (
+            ((raw1 + 1) * 2).merge(raw2, left_on=["c1", "c3"], right_on=["cc2", "cc4"])[
+                ["c1", "cc4"]
+            ]
+            * 2
+        )
+        .groupby(["cc4"])
+        .apply(lambda x: x / x.sum())
+    )
+    pd.testing.assert_frame_equal(r, expected)
+
+    read_parquet_nodes = [n for n in graph._nodes if type(n.op) is DataFrameReadParquet]
+    assert len(read_parquet_nodes) == 2
+
+    for n in read_parquet_nodes:
+        assert len(n.op.get_columns()) == 2
+
+    merge_node = [n for n in graph._nodes if type(n.op) is DataFrameMerge][0]
+    merge_node_preds = graph.predecessors(merge_node)
+    assert len(merge_node_preds) == 2
+
+    inserted_node = [n for n in merge_node_preds if type(n.op) is DataFrameIndex][0]
+    assert len(inserted_node.op.col_names) == 2
+    assert set(inserted_node.op.col_names) == {"c1", "c3"}
+
+    mul_node = graph.predecessors(inserted_node)[0]
+    assert type(mul_node.op) is DataFrameMul
+    assert set(mul_node.dtypes.index.tolist()) == {"c1", "c3"}
+
+
+def test_two_merges(setup, gen_data2):
+    file_path, file_path2 = gen_data2
+    df1 = md.read_parquet(file_path)
+    df2 = md.read_parquet(file_path2)
+    c = (
+        (df1 + 1)
+        .merge((df2 + 2), left_on=["c2", "c3"], right_on=["cc1", "cc4"])[
+            ["c2", "c4", "cc1", "cc2"]
+        ]
+        .merge(df2, left_on=["cc1"], right_on=["cc3"])
+    )
+    graph = c.build_graph()
+    optimize(graph)
+    r = c.execute().fetch()
+
+    raw1 = pd.read_parquet(file_path)
+    raw2 = pd.read_parquet(file_path2)
+
+    expected = (
+        (raw1 + 1)
+        .merge((raw2 + 2), left_on=["c2", "c3"], right_on=["cc1", "cc4"])[
+            ["c2", "c4", "cc1", "cc2"]
+        ]
+        .merge(raw2, left_on=["cc1"], right_on=["cc3"])
+    )
+    pd.testing.assert_frame_equal(r, expected)
+
+    parquet_nodes = [n for n in graph._nodes if type(n.op) is DataFrameReadParquet]
+    assert len(parquet_nodes) == 2
+
+    # the read_parquet for df1 has its columns pushed down
+    df1_node = [n for n in parquet_nodes if n.op.path == file_path][0]
+    assert set(df1_node.op.get_columns()) == {"c2", "c3", "c4"}
+
+    # the read_parquet for df2 is not pushed down since all of its columns are needed
+    df2_node = [n for n in parquet_nodes if n.op.path == file_path2][0]
+    assert df2_node.op.columns is None
+
+    # verify that the inserted DataFrameIndex nodes take effect
+    inserted_nodes = [n for n in graph._nodes if type(n.op) is DataFrameIndex]
+    assert len(inserted_nodes) == 3
+
+    index_after_merge_node = [
+        n for n in inserted_nodes if type(graph.predecessors(n)[0].op) is DataFrameMerge
+    ][0]
+    assert set(index_after_merge_node.op.col_names) == {"c2", "c4", "cc1", "cc2"}
+
+
+def test_two_groupby_aggs_with_multi_index(setup, gen_data2):
+    file_path, _ = gen_data2
+    df = md.read_parquet(file_path)
+    c = (
+        (df * 2)
+        .groupby(["c2", "c3"])
+        .apply(lambda x: x["c1"].sum() / x["c2"].mean())
+        .reset_index()
+        .groupby("c3")
+        .agg(["min", "max"])
+    )
+    graph = c.build_graph()
+    optimize(graph)
+    r = c.execute().fetch()
+
+    raw = pd.read_parquet(file_path)
+    expected = (
+        (raw * 2)
+        .groupby(["c2", "c3"])
+        .apply(lambda x: x["c1"].sum() / x["c2"].mean())
+        .reset_index()
+        .groupby("c3")
+        .agg(["min", "max"])
+    )
+    pd.testing.assert_frame_equal(r, expected)
+
+    apply_node = [n for n in graph._nodes if type(n.op) is DataFrameGroupByAgg][0]
+    assert set(apply_node.columns.index_value._index_value._data) == {
+        (0, "min"),
+        (0, "max"),
+        ("c2", "max"),
+        ("c2", "min"),
+    }
+
+    # apply cannot be pushed down, so read_parquet keeps all columns
+    read_parquet_node = [
+        n
+        for n in graph._nodes
+        if type(n.op) is DataFrameReadParquet and n.op.path == file_path
+    ][0]
+    assert read_parquet_node.op.get_columns() is None
+
+
+def test_merge_and_get_col_with_suffix(setup, gen_data1):
+    file_path, file_path2 = gen_data1
+    left = md.read_csv(file_path)
+    right = md.read_csv(file_path2)
+    r = left.merge(right, on="c1")[["c3_x"]]
+
+    graph = r.build_graph()
+    optimize(graph)
+
+    index_node = graph.result_tileables[0]
+    assert isinstance(index_node.op, DataFrameIndex)
+    assert index_node.op.col_names == ["c3_x"]
+
+    assert len(graph.predecessors(index_node)) == 1
+    merge_node = graph.predecessors(index_node)[0]
+    assert type(merge_node.op) is DataFrameMerge
+
+    read_csv_node_left, read_csv_node_right = graph.predecessors(merge_node)
+    assert type(read_csv_node_left.op) is DataFrameReadCSV
+    assert type(read_csv_node_right.op) is DataFrameReadCSV
+    assert len(read_csv_node_left.op.usecols) == 2
+    assert len(read_csv_node_right.op.usecols) == 2
+    assert set(read_csv_node_left.op.usecols) == {"c1", "c3"}
+    assert set(read_csv_node_right.op.usecols) == {"c1", "c3"}
+
+    r = r.execute().fetch()
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    expected = raw1.merge(raw2, on="c1")[["c3_x"]]
+    pd.testing.assert_frame_equal(r, expected)
+
+
+def test_getitem_with_mask(setup, gen_data1):
+    """
+    Getitem with mask shouldn't prune any column.
+    """
+    file_path, file_path2 = gen_data1
+    df = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2)
+
+    df = df[df2["c1"] > 3]
+    r = df.groupby(by="c1", as_index=False).sum()["c2"]
+
+    graph = r.build_graph()
+    optimize(graph)
+
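+    # Expected optimized graph, checked bottom-up below: one read_csv keeps all
+    # columns (the mask prevents pruning) while a second read_csv keeps only "c1"
+    # to feed the eval mask; the masked getitem is followed by a ["c1", "c2"]
+    # selection, the groupby-sum and the final "c2" getitem.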
+    index_node = graph.result_tileables[0]
+    assert isinstance(index_node.op, DataFrameIndex)
+    assert index_node.name == "c2"
+
+    assert len(graph.predecessors(index_node)) == 1
+    gb_node = graph.predecessors(index_node)[0]
+    assert isinstance(gb_node.op, DataFrameGroupByAgg)
+    assert set(gb_node.dtypes.index) == {"c1", "c2"}
+
+    assert len(graph.predecessors(gb_node)) == 1
+    index_node_2 = graph.predecessors(gb_node)[0]
+    assert isinstance(index_node_2.op, DataFrameIndex)
+    assert set(index_node_2.dtypes.index) == {"c1", "c2"}
+
+    assert len(graph.predecessors(index_node_2)) == 1
+    index_node_3 = graph.predecessors(index_node_2)[0]
+    assert isinstance(index_node_3.op, DataFrameIndex)
+    assert set(index_node_3.dtypes.index) == {"c1", "c2", "c3", "c4"}
+
+    assert len(graph.predecessors(index_node_3)) == 2
+    read_csv_node, eval_node = graph.predecessors(index_node_3)
+    assert isinstance(read_csv_node.op, DataFrameReadCSV)
+    assert isinstance(eval_node.op, DataFrameEval)
+    assert read_csv_node.op.usecols is None  # all the columns.
+    assert eval_node.name == "c1"
+
+    assert len(graph.predecessors(eval_node)) == 1
+    read_csv_node_2 = graph.predecessors(eval_node)[0]
+    assert isinstance(read_csv_node_2.op, DataFrameReadCSV)
+    assert read_csv_node_2.op.usecols == ["c1"]
+
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    raw1 = raw1[raw2["c1"] > 3]
+    expected = raw1.groupby(by="c1", as_index=False).sum()["c2"]
+    pd.testing.assert_series_equal(
+        r.execute(extra_config={"check_series_name": False}).fetch(), expected
+    )
+
+
+def test_setitem(setup, gen_data1):
+    """
+    The output of DataFrameSetitem should preserve the column being set so that tile can work
+    correctly.
+    """
+    file_path, file_path2 = gen_data1
+    df = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2)
+
+    df["c5"] = df2["c1"]
+    r = df.groupby(by="c1", as_index=False).sum()["c2"]
+
+    graph = r.build_graph()
+    optimize(graph)
+
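+    # Expected optimized graph, checked bottom-up below: the read_csv feeding the
+    # setitem keeps {"c1", "c2"} while a second read_csv keeps only "c1" for the
+    # assigned column; the setitem output still carries "c5" so that tiling works,
+    # followed by the ["c1", "c2"] selection, the groupby-sum and the "c2" getitem.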
+    index_node = graph.result_tileables[0]
+    assert isinstance(index_node.op, DataFrameIndex)
+    assert index_node.name == "c2"
+
+    assert len(graph.predecessors(index_node)) == 1
+    gb_node = graph.predecessors(index_node)[0]
+    assert isinstance(gb_node.op, DataFrameGroupByAgg)
+    assert set(gb_node.dtypes.index) == {"c1", "c2"}
+
+    assert len(graph.predecessors(gb_node)) == 1
+    index_node_2 = graph.predecessors(gb_node)[0]
+    assert isinstance(index_node_2.op, DataFrameIndex)
+    assert set(index_node_2.dtypes.index) == {"c1", "c2"}
+
+    assert len(graph.predecessors(index_node_2)) == 1
+    setitem_node = graph.predecessors(index_node_2)[0]
+    assert isinstance(setitem_node.op, DataFrameSetitem)
+    assert set(setitem_node.dtypes.index) == {"c1", "c2", "c5"}
+
+    assert len(graph.predecessors(setitem_node)) == 2
+    read_csv_node, index_node_3 = graph.predecessors(setitem_node)
+    assert isinstance(read_csv_node.op, DataFrameReadCSV)
+    assert isinstance(index_node_3.op, DataFrameIndex)
+    assert set(read_csv_node.op.usecols) == {"c1", "c2"}
+    assert index_node_3.name == "c1"
+
+    assert len(graph.predecessors(index_node_3)) == 1
+    read_csv_node_2 = graph.predecessors(index_node_3)[0]
+    assert isinstance(read_csv_node_2.op, DataFrameReadCSV)
+    assert read_csv_node_2.op.usecols == ["c1"]
+
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    raw1["c5"] = raw2["c1"]
+    expected = raw1.groupby(by="c1", as_index=False).sum()["c2"]
+    pd.testing.assert_series_equal(r.execute().fetch(), expected)
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_input_column_selector.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_input_column_selector.py
new file mode 100644
index 000000000..5938f358a
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_input_column_selector.py
@@ -0,0 +1,301 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Any, Set, List, Union
+
+import pytest
+
+from ..input_column_selector import InputColumnSelector
+from ......core import TileableData, ENTITY_TYPE
+from ......core.operand import Operand
+from ......dataframe import DataFrame, Series
+from ......tensor import tensor
+
+
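+# A stand-in operand/entity pair used to exercise InputColumnSelector.register and
+# InputColumnSelector.unregister: the operand's only input is a two-column
+# ("foo", "bar") DataFrame, so the default selector falls back to requiring both.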
+class MockOperand(Operand):
+    _mock_input: TileableData = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6)}).data
+
+    @property
+    def inputs(self) -> List[Union[ENTITY_TYPE]]:
+        return [self._mock_input]
+
+    @classmethod
+    def get_mock_input(cls) -> TileableData:
+        return cls._mock_input
+
+
+class MockEntityData(TileableData):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._op = MockOperand()
+
+
+def test_register():
+    def _select_input_columns(
+        tileable_data: TileableData, required_cols: Set[Any]
+    ) -> Dict[TileableData, Set[Any]]:
+        return {}
+
+    InputColumnSelector.register(MockOperand, _select_input_columns)
+    mock_data = MockEntityData()
+    assert InputColumnSelector.select(mock_data, {"foo"}) == {}
+
+    # unregister
+    InputColumnSelector.unregister(MockOperand)
+    assert InputColumnSelector.select(mock_data, {"foo"}) == {
+        MockOperand.get_mock_input(): {"foo", "bar"}
+    }
+
+
+def test_df_groupby_agg():
+    df: DataFrame = DataFrame(
+        {
+            "foo": (1, 1, 2, 2),
+            "bar": (3, 4, 3, 4),
+            "baz": (5, 6, 7, 8),
+            "qux": (9, 10, 11, 12),
+        }
+    )
+
+    s = df.groupby(by="foo")["baz"].sum()
+    input_columns = InputColumnSelector.select(s.data, {"baz"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"foo", "baz"}
+
+    s = df.groupby(by=["foo", "bar"]).sum()
+    input_columns = InputColumnSelector.select(s.data, {"baz"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"foo", "bar", "baz"}
+
+    s = df.groupby(by="foo").agg(["sum", "max"])
+    input_columns = InputColumnSelector.select(s.data, {"baz"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"foo", "baz"}
+
+    s = df.groupby(by="foo")["bar", "baz"].agg(["sum", "max"])
+    input_columns = InputColumnSelector.select(s.data, {"baz"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"foo", "bar", "baz"}
+
+    s = df.groupby(by="foo").agg(new_bar=("bar", "sum"), new_baz=("baz", "sum"))
+    input_columns = InputColumnSelector.select(s.data, {"new_bar"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"foo", "bar", "baz"}
+
+
+@pytest.mark.skip(reason="group by index is not supported yet")
+def test_df_groupby_index_agg():
+    df: DataFrame = DataFrame({"foo": (1, 1, 3), "bar": (4, 5, 6)})
+    df = df.set_index("foo")
+    s = df.groupby(by="foo").sum()
+    input_columns = InputColumnSelector.select(s.data, {"bar"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"bar"}
+
+
+def test_df_merge():
+    left: DataFrame = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6), 1: (7, 8, 9)})
+    right = DataFrame({"foo": (1, 2), "bar": (4, 5), "baz": (5, 8), 1: (7, 8)})
+
+    joined = left.merge(right, on=["foo"])
+
+    input_columns = InputColumnSelector.select(joined.data, {"foo"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"foo"}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"foo"}
+
+    input_columns = InputColumnSelector.select(joined.data, {"foo", "baz"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"foo"}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"foo", "baz"}
+
+    input_columns = InputColumnSelector.select(joined.data, {"foo", "1_x"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"foo", 1}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"foo", 1}
+
+    joined = left.merge(right, on=["foo", "bar"])
+    input_columns = InputColumnSelector.select(joined.data, {"baz"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"foo", "bar"}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"foo", "bar", "baz"}
+
+    joined = left.merge(right, on=["foo", "bar"])
+    input_columns = InputColumnSelector.select(joined.data, {"1_x", "1_y"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"foo", "bar", 1}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"foo", "bar", 1}
+
+
+def test_df_merge_on_index():
+    left: DataFrame = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6), 1: (7, 8, 9)})
+    left = left.set_index("foo")
+    right = DataFrame({"foo": (1, 2), "bar": (4, 5), "baz": (5, 8), 1: (7, 8)})
+    right = right.set_index("foo")
+
+    # join on index
+    joined = left.merge(right, on="foo")
+    input_columns = InputColumnSelector.select(joined.data, {"baz"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == set()
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"baz"}
+
+    # left_on is an index and right_on is a column
+    joined = left.merge(right, left_on="foo", right_on="bar")
+    input_columns = InputColumnSelector.select(joined.data, {"baz"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == set()
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"bar", "baz"}
+
+    # left_on is a column and right_on is an index
+    joined = left.merge(right, left_on="bar", right_on="foo")
+    input_columns = InputColumnSelector.select(joined.data, {"baz"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"bar"}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"baz"}
+
+
+def test_df_arithmetic_ops():
+    def add(x, y):
+        return x + y
+
+    def sub(x, y):
+        return x - y
+
+    def mul(x, y):
+        return x * y
+
+    def div(x, y):
+        return x / y
+
+    ops = (add, sub, mul, div)
+    df1: DataFrame = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6)})
+    df2: DataFrame = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6)})
+
+    for op in ops:
+        res: DataFrame = op(df1, 1)
+        input_columns = InputColumnSelector.select(res.data, {"foo"})
+        assert len(input_columns) == 1
+        assert res.data.inputs[0] in input_columns
+        assert input_columns[res.data.inputs[0]] == {"foo"}
+
+    for op in ops:
+        res: DataFrame = op(df1, df2)
+        input_columns = InputColumnSelector.select(res.data, {"foo"})
+        assert len(input_columns) == 2
+        assert res.data.inputs[0] in input_columns
+        assert input_columns[res.data.inputs[0]] == {"foo"}
+        assert res.data.inputs[1] in input_columns
+        assert input_columns[res.data.inputs[1]] == {"foo"}
+
+
+def test_df_setitem():
+    df: DataFrame = DataFrame(
+        {
+            "foo": (1, 1, 2, 2),
+            "bar": (3, 4, 3, 4),
+            "baz": (5, 6, 7, 8),
+            "qux": (9, 10, 11, 12),
+        }
+    )
+
+    # scalar
+    df[4] = 13
+    input_columns = InputColumnSelector.select(df.data, {"foo"})
+    assert len(input_columns) == 1
+    assert df.data.inputs[0] in input_columns
+    assert input_columns[df.data.inputs[0]] == {"foo"}
+
+    # scalar tensor
+    df[5] = tensor()
+    input_columns = InputColumnSelector.select(df.data, {"foo"})
+    assert len(input_columns) == 1
+    assert df.data.inputs[0] in input_columns
+    assert input_columns[df.data.inputs[0]] == {"foo"}
+
+    # tensor
+    df[6] = tensor([13, 14, 15, 16])
+    input_columns = InputColumnSelector.select(df.data, {"foo"})
+    assert len(input_columns) == 2
+    assert df.data.inputs[0] in input_columns
+    assert input_columns[df.data.inputs[0]] == {"foo"}
+    assert df.data.inputs[1] in input_columns
+    assert input_columns[df.data.inputs[1]] == {None}
+
+    # series
+    df[7] = Series([13, 14, 15, 16])
+    input_columns = InputColumnSelector.select(df.data, {"foo"})
+    assert len(input_columns) == 2
+    assert df.data.inputs[0] in input_columns
+    assert input_columns[df.data.inputs[0]] == {"foo"}
+    assert df.data.inputs[1] in input_columns
+    assert input_columns[df.data.inputs[1]] == {None}
+
+    # dataframe
+    df[[8, 9]] = df[["foo", "bar"]]
+    input_columns = InputColumnSelector.select(df.data, {8})
+    assert len(input_columns) == 2
+    assert df.data.inputs[0] in input_columns
+    assert input_columns[df.data.inputs[0]] == set()
+    assert df.data.inputs[1] in input_columns
+    assert input_columns[df.data.inputs[1]] == {"foo", "bar"}
+
+
+def test_select_all():
+    df: DataFrame = DataFrame(
+        {
+            "foo": (1, 1, 2, 2),
+            "bar": (3, 4, 3, 4),
+            "baz": (5, 6, 7, 8),
+            "qux": (9, 10, 11, 12),
+        }
+    )
+    head = df.head()
+    input_columns = InputColumnSelector.select(head.data, {"foo"})
+    assert len(input_columns) == 1
+    assert head.data.inputs[0] in input_columns
+    assert input_columns[head.data.inputs[0]] == {"foo", "bar", "baz", "qux"}
+
+
+def test_getitem():
+    df: DataFrame = DataFrame(
+        {
+            "foo": (1, 1, 2, 2),
+            "bar": (3, 4, 3, 4),
+            "baz": (5, 6, 7, 8),
+            "qux": (9, 10, 11, 12),
+        }
+    )
+
+    getitem = df[df["foo"] == 1]
+    input_columns = InputColumnSelector.select(getitem.data, {"foo"})
+    assert input_columns[getitem.data.inputs[0]] == {"foo", "bar", "baz", "qux"}
+
+    getitem = df["foo"]
+    input_columns = InputColumnSelector.select(getitem.data, {"foo"})
+    assert input_columns[getitem.data.inputs[0]] == {"foo"}
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_self_column_selector.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_self_column_selector.py
new file mode 100644
index 000000000..7655e8aa0
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_self_column_selector.py
@@ -0,0 +1,97 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ......dataframe import DataFrame
+from ..self_column_selector import SelfColumnSelector
+
+
+def test_df_setitem():
+    df = DataFrame({"foo": (1, 1, 3)})
+
+    df["bar"] = [1, 2, 3]
+    required_columns = SelfColumnSelector.select(df.data)
+    assert required_columns == {"bar"}
+
+    df[["baz", "qux"]] = 1, 2
+    required_columns = SelfColumnSelector.select(df.data)
+    assert required_columns == {"baz", "qux"}
+
+
+def test_df_getitem():
+    df = DataFrame({"foo": (1, 1, 3), "bar": (4, 5, 6)})
+
+    getitem = df["foo"]
+    required_columns = SelfColumnSelector.select(getitem.data)
+    assert required_columns == {"foo"}
+
+    getitem = df[["foo", "bar"]]
+    required_columns = SelfColumnSelector.select(getitem.data)
+    assert required_columns == {"foo", "bar"}
+
+
+def test_df_groupby_agg():
+    df = DataFrame({"foo": (1, 1, 3), "bar": (4, 5, 6)})
+
+    a = df.groupby(by="foo", as_index=False).sum()
+    required_columns = SelfColumnSelector.select(a.data)
+    assert required_columns == {"foo"}
+
+    a = df.groupby(by="foo").sum()
+    required_columns = SelfColumnSelector.select(a.data)
+    assert required_columns == set()
+
+
+def test_df_merge():
+    left = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6), 1: (7, 8, 9)})
+    right = DataFrame({"foo": (1, 2), "bar": (4, 5), "baz": (5, 8), 1: (7, 8)})
+
+    joined = left.merge(right, on="foo")
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo"}
+
+    joined = left.merge(right, on=["foo", "bar"])
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo", "bar"}
+
+    joined = left.merge(right, left_on=["foo", "bar"], right_on=["foo", "bar"])
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo", "bar"}
+
+    joined = left.merge(right)
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo", "bar", 1}
+
+    joined = left.merge(right, left_on=["foo"], right_on=["bar"])
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo_x", "bar_y"}
+
+    joined = left.merge(right, left_index=True, right_index=True)
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == set()
+
+    joined = left.merge(right, left_index=True, right_on="foo")
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo"}
+
+    joined = left.merge(right, left_index=True, right_on=["foo"])
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo"}
+
+    joined = left.merge(right, left_on="foo", right_index=True)
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo"}
+
+    joined = left.merge(right, left_on=["foo"], right_index=True)
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo"}
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/utils.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/utils.py
new file mode 100644
index 000000000..fdb188fc2
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/utils.py
@@ -0,0 +1,31 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Set
+
+from .....dataframe.core import BaseDataFrameData
+
+
+def get_cols_exclude_index(inp: BaseDataFrameData, cols: Any) -> Set[Any]:
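+    """
+    Keep only the labels in ``cols`` that are real columns of ``inp`` (i.e. appear
+    in ``inp.dtypes.index``); labels that refer to index levels are dropped.
+    """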
+    ret = set()
+    if isinstance(cols, (list, tuple)):
+        for col in cols:
+            if col in inp.dtypes.index:
+                # exclude index
+                ret.add(col)
+    else:
+        if cols in inp.dtypes.index:
+            # exclude index
+            ret.add(cols)
+    return ret
diff --git a/python/xorbits/_mars/optimization/logical/tileable/core.py b/python/xorbits/_mars/optimization/logical/tileable/core.py
new file mode 100644
index 000000000..b1ca31165
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/core.py
@@ -0,0 +1,49 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Type
+
+from ....core import OperandType, TileableGraph
+from ..core import (
+    OperandBasedOptimizationRule,
+    OptimizationRecords,
+    OptimizationRule,
+    Optimizer,
+)
+
+
+class TileableOptimizer(Optimizer):
+    """
+    Optimizer that applies registered logical optimization rules to a tileable graph.
+    """
+
+
+def register_optimization_rule():
+    def wrap(rule_type: Type[OptimizationRule]):
+        TileableOptimizer.register_rule(rule_type)
+        return rule_type
+
+    return wrap
+
+
+def register_operand_based_optimization_rule(op_types: List[Type[OperandType]]):
+    def wrap(rule_type: Type[OperandBasedOptimizationRule]):
+        for op_type in op_types:
+            rule_type.register_operand(op_type)
+        TileableOptimizer.register_rule(rule_type)
+        return rule_type
+
+    return wrap
+
+
+def optimize(tileable_graph: TileableGraph) -> OptimizationRecords:
+    return TileableOptimizer.optimize(tileable_graph)
diff --git a/python/xorbits/_mars/optimization/logical/tileable/head.py b/python/xorbits/_mars/optimization/logical/tileable/head.py
new file mode 100644
index 000000000..3183a2249
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/head.py
@@ -0,0 +1,24 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....dataframe.indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem
+from ..common.head import HeadPushDown
+from .core import register_operand_based_optimization_rule
+
+
+@register_operand_based_optimization_rule([DataFrameIlocGetItem, SeriesIlocGetItem])
+class TileableHeadPushDown(HeadPushDown):
+    """
+    Head push down.
+    """
diff --git a/python/xorbits/_mars/optimization/logical/tileable/tests/__init__.py b/python/xorbits/_mars/optimization/logical/tileable/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/logical/tileable/tests/test_arithmetic_query.py b/python/xorbits/_mars/optimization/logical/tileable/tests/test_arithmetic_query.py
new file mode 100644
index 000000000..87c73d9c9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/tests/test_arithmetic_query.py
@@ -0,0 +1,184 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+import numpy as np
+import pandas as pd
+
+from ..... import dataframe as md
+from ..... import execute, fetch
+from .....core import TileableGraph, TileableGraphBuilder, enter_mode
+from .....dataframe.base.eval import DataFrameEval
+from .. import optimize
+
+_var_pattern = re.compile(r"@__eval_scalar_var\d+")
+
+
+def _norm_vars(var_str):
+    return _var_pattern.sub("@scalar", var_str)
+
+
+@enter_mode(build=True)
+def test_arithmetic_query(setup):
+    raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ"))
+    raw2 = pd.DataFrame(np.random.rand(100, 5), columns=list("ABCDE"))
+
+    # does not support heterogeneous sources
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = md.DataFrame(raw2, chunk_size=10)
+    df3 = -(df1["A"] + df2["B"])
+    graph = TileableGraph([df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df3.data) is None
+
+    # does not support customized args in arithmetic
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df3 = (-df1["A"]).add(df1["B"], fill_value=0.0)
+    graph = TileableGraph([df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df3.data) is None
+
+    # does not support GPU
+    df1 = md.DataFrame(raw, chunk_size=10, gpu=True)
+    df4 = (-df1["A"]).add(df1["B"])
+    graph = TileableGraph([df4.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df4.data) is None
+
+    # does not support non-string headers
+    df1 = md.DataFrame(np.random.rand(100, 5))
+    df2 = df1[0] + df1[1]
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df2.data) is None
+
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = -df1["A"] + df1["B"] * 5
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.expr == "(-(`A`)) + ((`B`) * (5))"
+
+    pd.testing.assert_series_equal(df2.execute().fetch(), -raw["A"] + raw["B"] * 5)
+
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = -df1["A"] + df1["B"] * 5 + 3 * df1["C"]
+    graph = TileableGraph([df1["A"].data, df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.expr == "((-(`A`)) + ((`B`) * (5))) + ((3) * (`C`))"
+
+    r_df2, _r_col_a = fetch(execute(df2, df1["A"]))
+    pd.testing.assert_series_equal(r_df2, -raw["A"] + raw["B"] * 5 + 3 * raw["C"])
+
+
+@enter_mode(build=True)
+def test_bool_eval_to_query(setup):
+    raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ"))
+
+    # does not support non-eval inputs
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = df1[(df1["A"] * 5).astype(bool)]
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df2.data) is None
+
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = df1[(df1["A"] > 0.5) & (df1["C"] < 0.5)]
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert isinstance(opt_df2.op, DataFrameEval)
+    assert opt_df2.op.is_query
+    assert _norm_vars(opt_df2.op.expr) == "((`A`) > (@scalar)) & ((`C`) < (@scalar))"
+
+    pd.testing.assert_frame_equal(
+        df2.execute().fetch(), raw[(raw["A"] > 0.5) & (raw["C"] < 0.5)]
+    )
+
+    raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ"))
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = df1[(df1["A"] > 0.5) & (df1["C"] < 0.5)] + 1
+    assert isinstance(opt_df2.op, DataFrameEval)
+    assert opt_df2.op.is_query
+
+    r_df2, _r_col_a = fetch(execute(df2, df1["A"]))
+    pd.testing.assert_frame_equal(r_df2, raw[(raw["A"] > 0.5) & (raw["C"] < 0.5)] + 1)
+
+    raw = pd.DataFrame(
+        {
+            "a": np.arange(100),
+            "b": [pd.Timestamp("2022-1-1") + pd.Timedelta(days=i) for i in range(100)],
+        }
+    )
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = df1[df1.b < pd.Timestamp("2022-3-20")]
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert _norm_vars(opt_df2.op.expr) == "(`b`) < (@scalar)"
+
+    r_df2 = fetch(execute(df2))
+    pd.testing.assert_frame_equal(r_df2, raw[raw.b < pd.Timestamp("2022-3-20")])
+
+
+@enter_mode(build=True)
+def test_eval_setitem_to_eval(setup):
+    raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ"))
+    raw2 = pd.DataFrame(np.random.rand(100, 5), columns=list("ABCDE"))
+
+    # does not support non-eval value setting
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df1["K"] = 345
+    graph = TileableGraph([df1.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = md.DataFrame(raw2, chunk_size=10)
+    df3 = df1.merge(df2, on="A", suffixes=("", "_"))
+    df3["K"] = df3["A"] * (1 - df3["B"])
+    df3["L"] = df3["K"] - df3["A"]
+    df3["M"] = df3["K"] + df3["L"]
+
+    graph = TileableGraph([df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df3 = records.get_optimization_result(df3.data)
+    assert opt_df3.op.expr == "\n".join(
+        [
+            "`K` = (`A`) * ((1) - (`B`))",
+            "`L` = (`K`) - (`A`)",
+            "`M` = (`K`) + (`L`)",
+        ]
+    )
+    assert len(graph) == 4
+    assert len([n for n in graph if isinstance(n.op, DataFrameEval)]) == 1
+
+    r_df3 = raw.merge(raw2, on="A", suffixes=("", "_"))
+    r_df3["K"] = r_df3["A"] * (1 - r_df3["B"])
+    r_df3["L"] = r_df3["K"] - r_df3["A"]
+    r_df3["M"] = r_df3["K"] + r_df3["L"]
+    pd.testing.assert_frame_equal(df3.execute().fetch(), r_df3)
diff --git a/python/xorbits/_mars/optimization/logical/tileable/tests/test_head.py b/python/xorbits/_mars/optimization/logical/tileable/tests/test_head.py
new file mode 100644
index 000000000..b95b8adcd
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/tests/test_head.py
@@ -0,0 +1,225 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from ..... import dataframe as md
+from .....core import TileableGraph, TileableGraphBuilder, enter_mode
+from .....dataframe.indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem
+from .. import optimize
+
+
+@pytest.fixture(scope="module")
+def prepare_data():
+    rs = np.random.RandomState(0)
+    df = pd.DataFrame(
+        {
+            "a": rs.randint(10, size=100),
+            "b": rs.rand(100),
+            "c": rs.choice(list("abc"), size=100),
+        }
+    )
+
+    with tempfile.TemporaryDirectory() as tempdir:
+        yield tempdir, df
+
+
+def _execute_iloc(*_):  # pragma: no cover
+    raise ValueError("cannot run iloc")
+
+
+_iloc_operand_executors = {
+    DataFrameIlocGetItem: _execute_iloc,
+    SeriesIlocGetItem: _execute_iloc,
+}
+
+
+@enter_mode(build=True)
+def test_read_csv_head(prepare_data, setup):
+    tempdir, pdf = prepare_data
+    file_path = os.path.join(tempdir, "test.csv")
+    pdf.to_csv(file_path, index=False)
+
+    size = os.stat(file_path).st_size / 2
+    df1 = md.read_csv(file_path, chunk_bytes=size)
+    df2 = df1.head(5)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 5
+    assert len(graph) == 1
+    assert opt_df2 in graph.results
+
+    result = df2.execute(
+        extra_config={"operand_executors": _iloc_operand_executors}
+    ).fetch()
+    expected = pdf.head(5)
+    pd.testing.assert_frame_equal(result, expected)
+
+    # test multiple head
+    df3 = df1.head(10)
+    graph = TileableGraph([df2.data, df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df1 = records.get_optimization_result(df1.data)
+    assert opt_df1 is not None
+    assert opt_df1.op.nrows == 10
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2 is not None
+    assert graph.predecessors(opt_df2)[0] is opt_df1
+    assert opt_df2.inputs[0] is opt_df1
+    opt_df3 = records.get_optimization_result(df3.data)
+    assert opt_df3 is not None
+    assert graph.predecessors(opt_df3)[0] is opt_df1
+    assert opt_df3.inputs[0] is opt_df1
+
+    # test head with successor
+    df1 = md.read_csv(file_path, chunk_bytes=size)
+    df2 = df1.head(5)
+    df3 = df2 + 1
+    graph = TileableGraph([df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 5
+    assert len(graph) == 2
+
+
+@enter_mode(build=True)
+def test_read_parquet_head(prepare_data, setup):
+    tempdir, pdf = prepare_data
+    dirname = os.path.join(tempdir, "test_parquet")
+    os.makedirs(dirname)
+    for i in range(3):
+        file_path = os.path.join(dirname, f"test{i}.parquet")
+        pdf[i * 40 : (i + 1) * 40].to_parquet(file_path, index=False)
+
+    df1 = md.read_parquet(dirname)
+    df2 = df1.head(5)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 5
+    assert len(graph) == 1
+    assert opt_df2 in graph.results
+
+    result = df2.execute(
+        extra_config={"operand_executors": _iloc_operand_executors}
+    ).fetch()
+    expected = pdf.head(5)
+    pd.testing.assert_frame_equal(result, expected)
+
+
+@enter_mode(build=True)
+def test_sort_head(prepare_data, setup):
+    _, pdf = prepare_data
+
+    df1 = md.DataFrame(pdf, chunk_size=20)
+    df1 = df1.sort_values(by="b")
+    df2 = df1.head(10)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 10
+    assert len(graph) == 2
+    assert opt_df2 in graph.results
+
+    result = df2.execute(
+        extra_config={"operand_executors": _iloc_operand_executors}
+    ).fetch()
+    expected = pdf.sort_values(by="b").head(10)
+    pd.testing.assert_frame_equal(result, expected)
+
+    pdf2 = pdf.copy()
+    pdf2.set_index("b", inplace=True)
+    df1 = md.DataFrame(pdf2, chunk_size=20)
+    df1 = df1.sort_index()
+    df2 = df1.head(10)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 10
+    assert len(graph) == 2
+    assert opt_df2 in graph.results
+
+    result = df2.execute(
+        extra_config={"operand_executors": _iloc_operand_executors}
+    ).fetch()
+    expected = pdf2.sort_index().head(10)
+    pd.testing.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("chunk_size", [5, 10])
+@enter_mode(build=True)
+def test_value_counts_head(prepare_data, setup, chunk_size):
+    _, pdf = prepare_data
+    df = md.DataFrame(pdf, chunk_size=chunk_size)
+
+    df1 = df["a"].value_counts(method="tree")
+    df2 = df1.head(3)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 3
+    assert len(graph) == 3
+    assert opt_df2 in graph.results
+
+    result = df2.execute(
+        extra_config={"operand_executors": _iloc_operand_executors}
+    ).fetch()
+    expected = pdf["a"].value_counts().head(3)
+    pd.testing.assert_series_equal(result, expected)
+
+
+@enter_mode(build=True)
+def test_no_head(prepare_data):
+    tempdir, pdf = prepare_data
+    file_path = os.path.join(tempdir, "test.csv")
+    pdf.to_csv(file_path, index=False)
+
+    size = os.stat(file_path).st_size / 2
+    df1 = md.read_csv(file_path, chunk_bytes=size)
+    df2 = df1.iloc[1:10]
+
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    assert records.get_optimization_result(df2.data) is None
+
+    df2 = df1.head(3)
+    df3 = df1 + 1
+
+    graph = TileableGraph([df2.data, df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    assert records.get_optimization_result(df2.data) is None
+    assert records.get_optimization_result(df3.data) is None
diff --git a/python/xorbits/_mars/optimization/physical/__init__.py b/python/xorbits/_mars/optimization/physical/__init__.py
new file mode 100644
index 000000000..8f9a8f9ad
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import optimize
+from .cupy import CupyRuntimeOptimizer
+from .numexpr import NumexprRuntimeOptimizer
diff --git a/python/xorbits/_mars/optimization/physical/core.py b/python/xorbits/_mars/optimization/physical/core.py
new file mode 100644
index 000000000..a3b66afcd
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/core.py
@@ -0,0 +1,97 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Tuple, Type
+
+from ...core import ChunkGraph, ChunkType, OperandType
+from ...utils import build_fuse_chunk
+
+
+class RuntimeOptimizer(ABC):
+    engine = None
+
+    def __init__(self, graph: ChunkGraph):
+        self._graph = graph
+
+    @classmethod
+    @abstractmethod
+    def is_available(cls) -> bool:
+        """
+        Check whether this optimizer is available.
+
+        Returns
+        -------
+        is_available : bool
+            Available.
+        """
+
+    @abstractmethod
+    def optimize(self):
+        """
+        Optimize chunk graph.
+        """
+
+    def _fuse_nodes(
+        self, fuses: List[List[ChunkType]], fuse_cls: OperandType
+    ) -> Tuple[List[List[ChunkType]], List[ChunkType]]:
+        graph = self._graph
+        fused_nodes = []
+
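+        # For each chain of chunks, build a single fused chunk, wire it to the
+        # predecessors of the chain head and the successors of the chain tail,
+        # then drop the original chunks from the graph.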
+        for fuse in fuses:
+            head_node = fuse[0]
+            tail_node = fuse[-1]
+
+            fused_chunk = build_fuse_chunk(
+                fuse, fuse_cls, op_kw={"dtype": tail_node.dtype}
+            ).data
+            graph.add_node(fused_chunk)
+            for node in graph.iter_successors(tail_node):
+                graph.add_edge(fused_chunk, node)
+            for node in graph.iter_predecessors(head_node):
+                graph.add_edge(node, fused_chunk)
+            for node in fuse:
+                graph.remove_node(node)
+            fused_nodes.append(fused_chunk)
+
+            try:
+                # if the tail node is a graph result, replace it with the fused chunk
+                i = graph.results.index(tail_node)
+                graph.results[i] = fused_chunk
+            except ValueError:
+                pass
+
+        return fuses, fused_nodes
+
+
+_engine_to_optimizers: Dict[str, Type[RuntimeOptimizer]] = dict()
+
+
+def register_optimizer(optimizer_cls: Type[RuntimeOptimizer]):
+    _engine_to_optimizers[optimizer_cls.engine] = optimizer_cls
+    return optimizer_cls
+
+
+def optimize(graph: ChunkGraph, engines: List[str] = None) -> ChunkGraph:
+    if engines is None:
+        engines = ["numexpr", "cupy"]
+
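+    # every available optimizer rewrites the chunk graph in place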
+    for engine in engines:
+        optimizer_cls = _engine_to_optimizers[engine]
+        optimizer = optimizer_cls(graph)
+        if not optimizer.is_available():
+            continue
+        optimizer.optimize()
+
+    return graph
diff --git a/python/xorbits/_mars/optimization/physical/cupy.py b/python/xorbits/_mars/optimization/physical/cupy.py
new file mode 100644
index 000000000..16dd9278e
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/cupy.py
@@ -0,0 +1,70 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...tensor import arithmetic
+from ...tensor.fuse import TensorCpFuseChunk
+from ...utils import lazy_import
+from .core import RuntimeOptimizer, register_optimizer
+
+cp = lazy_import("cupy", rename="cp")
+CP_INSTALLED = cp is not None
+
+CP_ELEMENTWISE_OP = {
+    arithmetic.TensorSubtract,
+    arithmetic.TensorMultiply,
+    arithmetic.TensorTrueDiv,
+    arithmetic.TensorSqrt,
+}
+CP_OP = CP_ELEMENTWISE_OP
+
+
+@register_optimizer
+class CupyRuntimeOptimizer(RuntimeOptimizer):
+    engine = "cupy"
+
+    @classmethod
+    def is_available(cls) -> bool:
+        return CP_INSTALLED
+
+    def optimize(self):
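+        # Greedily collect chains of element-wise cupy-supported operands and fuse
+        # each chain into a single TensorCpFuseChunk.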
+        fuses = []
+        explored = set()
+
+        graph = self._graph
+        for node in graph.topological_iter():
+            if type(node.op) not in CP_OP:
+                continue
+            if node in explored:
+                continue
+            if graph.count_predecessors(node) != 1:
+                continue
+            if node in graph.results:
+                continue
+
+            selected = [node]
+            # add successors
+            cur_node = graph.successors(node)[0]
+            while (
+                graph.count_predecessors(cur_node) == 1 and type(cur_node.op) in CP_OP
+            ):
+                selected.append(cur_node)
+                if graph.count_successors(cur_node) != 1 or cur_node in graph.results:
+                    break
+                else:
+                    cur_node = graph.successors(cur_node)[0]
+            if len(selected) > 1:
+                explored.update(selected)
+                fuses.append(list(selected))
+
+        return self._fuse_nodes(fuses, TensorCpFuseChunk)
diff --git a/python/xorbits/_mars/optimization/physical/numexpr.py b/python/xorbits/_mars/optimization/physical/numexpr.py
new file mode 100644
index 000000000..7f5d8c475
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/numexpr.py
@@ -0,0 +1,252 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dataclasses
+import functools
+import logging
+from typing import List, Set
+
+import numpy as np
+
+from ...core import ChunkGraph, ChunkType
+from ...tensor import arithmetic, reduction
+from ...tensor.fuse import TensorNeFuseChunk
+from ...tensor.fuse.numexpr import NUMEXPR_INSTALLED
+from .core import RuntimeOptimizer, register_optimizer
+
+logger = logging.getLogger(__name__)
+
+
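+# Sentinel returned by _can_fuse for reduction operands: a reduction may only
+# appear as the tail of a fused sub-graph.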
+REDUCTION = object()
+REDUCTION_OP = {
+    reduction.TensorSum,
+    reduction.TensorProd,
+    reduction.TensorMax,
+    reduction.TensorMin,
+}
+SUPPORT_OP = {
+    arithmetic.TensorAdd,
+    arithmetic.TensorSubtract,
+    arithmetic.TensorMultiply,
+    arithmetic.TensorDivide,
+    arithmetic.TensorPower,
+    arithmetic.TensorMod,
+    arithmetic.TensorNegative,
+    arithmetic.TensorAbs,
+    arithmetic.TensorConj,
+    arithmetic.TensorExp,
+    arithmetic.TensorLog,
+    arithmetic.TensorLog10,
+    arithmetic.TensorExpm1,
+    arithmetic.TensorLog1p,
+    arithmetic.TensorSqrt,
+    arithmetic.TensorEqual,
+    arithmetic.TensorNotEqual,
+    arithmetic.TensorLessThan,
+    arithmetic.TensorLessEqual,
+    arithmetic.TensorGreaterThan,
+    arithmetic.TensorGreaterEqual,
+    arithmetic.TensorSin,
+    arithmetic.TensorCos,
+    arithmetic.TensorTan,
+    arithmetic.TensorArcsin,
+    arithmetic.TensorArccos,
+    arithmetic.TensorArctan,
+    arithmetic.TensorSinh,
+    arithmetic.TensorCosh,
+    arithmetic.TensorTanh,
+    arithmetic.TensorArcsinh,
+    arithmetic.TensorArccosh,
+    arithmetic.TensorArctanh,
+    arithmetic.TensorLshift,
+    arithmetic.TensorRshift,
+    arithmetic.TensorTreeAdd,
+    arithmetic.TensorTreeMultiply,
+    arithmetic.TensorFloor,
+    arithmetic.TensorCeil,
+    arithmetic.TensorAnd,
+    arithmetic.TensorOr,
+    arithmetic.TensorNot,
+    reduction.TensorSum,
+    reduction.TensorProd,
+    reduction.TensorMax,
+    reduction.TensorMin,
+}
+
+
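+# A fusible sub-graph together with its entry chunks (heads) and exit chunks (tails).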
+@dataclasses.dataclass
+class _Fuse:
+    graph: ChunkGraph
+    heads: List[ChunkType]
+    tails: List[ChunkType]
+
+
+def _can_fuse(node: ChunkType):
+    op = node.op
+    op_type = type(op)
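+    # a reduction is fusible only when it collapses a single axis or all axes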
+    if op_type in REDUCTION_OP:
+        if len(op.axis) == 1 or len(op.axis) == node.ndim:
+            return REDUCTION
+        else:
+            return False
+    if op_type not in SUPPORT_OP:
+        return False
+    if op_type in (arithmetic.TensorOr, arithmetic.TensorAnd):
+        # numexpr only support logical and or:
+        # https://numexpr.readthedocs.io/projects/NumExpr3/en/latest/user_guide.html#supported-operators
+        if np.isscalar(op.lhs) or np.isscalar(op.rhs):
+            return False
+    return True
+
+
+def _collect_fuse(
+    graph: ChunkGraph,
+    node: ChunkType,
+    graph_results: Set[ChunkType],
+    cached_can_fuse,
+):
+    fuse_graph = ChunkGraph()
+    fuse_graph.add_node(node)
+    fuse_heads = []
+    fuse_tails = []
+    tail_reduction_node = None
+
+    stack = [node]
+    # Do a full search of the subgraph even if there is more than one fuse tail
+    while len(stack) != 0:
+        node = stack.pop()
+        is_head = graph.count_predecessors(node) == 0
+        for n in graph.iter_predecessors(node):
+            can_fuse = cached_can_fuse(n)
+            if can_fuse is False or can_fuse is REDUCTION:
+                is_head = True
+            elif not fuse_graph.contains(n):
+                stack.append(n)
+                fuse_graph.add_node(n)
+            else:
+                fuse_graph.add_edge(n, node)
+        if is_head:
+            fuse_heads.append(node)
+        # Skip the successors of tail reduction node.
+        if node is tail_reduction_node:
+            continue
+        is_tail = graph.count_successors(node) == 0 or node in graph_results
+        for n in graph.iter_successors(node):
+            can_fuse = cached_can_fuse(n)
+            if can_fuse is False:
+                is_tail = True
+            elif can_fuse is REDUCTION:
+                if tail_reduction_node is None:
+                    tail_reduction_node = n
+                    fuse_tails.append(n)
+                    stack.append(n)
+                    fuse_graph.add_node(n)
+                elif n is tail_reduction_node:
+                    fuse_graph.add_edge(node, n)
+                else:
+                    is_tail = True
+            elif not fuse_graph.contains(n):
+                stack.append(n)
+                fuse_graph.add_node(n)
+            else:
+                fuse_graph.add_edge(node, n)
+        if is_tail:
+            fuse_tails.append(node)
+
+    return _Fuse(fuse_graph, fuse_heads, fuse_tails)
+
+
+@register_optimizer
+class NumexprRuntimeOptimizer(RuntimeOptimizer):
+    engine = "numexpr"
+
+    @classmethod
+    def is_available(cls) -> bool:
+        return NUMEXPR_INSTALLED
+
+    def optimize(self):
+        fuses = []
+        explored = set()
+        cached_can_fuse = functools.lru_cache(maxsize=None)(_can_fuse)
+
+        graph = self._graph
+        graph_results = set(graph.results)
+        for node in graph.topological_iter():
+            if node.op.gpu or node.op.sparse:
+                # numexpr cannot handle GPU or sparse chunks; skip optimization entirely
+                return [], []
+            if node in explored or node in graph_results:
+                continue
+            can_fuse = cached_can_fuse(node)
+            if can_fuse is True:
+                fuse = _collect_fuse(graph, node, graph_results, cached_can_fuse)
+                if len(fuse.graph) > 1:
+                    explored.update(fuse.graph)
+                    if len(fuse.tails) == 1:
+                        fuses.append(fuse)
+                    else:
+                        logger.info(
+                            "Refused fusing for numexpr because the tail node count > 1."
+                        )
+
+        return self._fuse_nodes(fuses, TensorNeFuseChunk)
+
+    def _fuse_nodes(self, fuses: List[_Fuse], fuse_cls):
+        graph = self._graph
+        fused_nodes = []
+
+        for fuse in fuses:
+            fuse_graph = fuse.graph
+            tail_nodes = fuse.tails
+            head_nodes = fuse.heads
+            inputs = [
+                inp for n in head_nodes for inp in n.inputs if inp not in fuse_graph
+            ]
+
+            tail_chunk = tail_nodes[0]
+            tail_chunk_op = tail_chunk.op
+            fuse_op = fuse_cls(
+                sparse=tail_chunk_op.sparse,
+                gpu=tail_chunk_op.gpu,
+                _key=tail_chunk_op.key,
+                fuse_graph=fuse_graph,
+                dtype=tail_chunk.dtype,
+            )
+            fused_chunk = fuse_op.new_chunk(
+                inputs,
+                kws=[tail_chunk.params],
+                _key=tail_chunk.key,
+                _chunk=tail_chunk,
+            ).data
+
+            graph.add_node(fused_chunk)
+            for node in graph.iter_successors(tail_chunk):
+                graph.add_edge(fused_chunk, node)
+            for head_chunk in head_nodes:
+                for node in graph.iter_predecessors(head_chunk):
+                    if not fuse_graph.contains(node):
+                        graph.add_edge(node, fused_chunk)
+            for node in fuse_graph:
+                graph.remove_node(node)
+            fused_nodes.append(fused_chunk)
+
+            try:
+                # if the tail chunk is one of the graph results, replace it with the fused chunk
+                i = graph.results.index(tail_chunk)
+                graph.results[i] = fused_chunk
+            except ValueError:
+                pass
+
+        return fuses, fused_nodes
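For intuition, each fused subgraph collected above maps to a single numexpr expression: element-wise operators chain freely, while a reduction may appear only once, as the outermost call. A minimal sketch of the corresponding numexpr behaviour, independent of Mars and assuming the numexpr package is installed:

import numexpr as ne
import numpy as np

a = np.random.rand(100, 50)
b = np.random.rand(100, 50)

# an element-wise chain can be evaluated as one fused expression
c = ne.evaluate("(a - b) / sqrt(b * (1 - b))")

# a reduction is only allowed as the outermost operation, which is why
# _collect_fuse keeps at most one tail reduction node per fused subgraph
s = ne.evaluate("sum(a * b, axis=1)")

print(c.shape, s.shape)  # (100, 50) (100,)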
diff --git a/python/xorbits/_mars/optimization/physical/tests/__init__.py b/python/xorbits/_mars/optimization/physical/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/physical/tests/test_cupy.py b/python/xorbits/_mars/optimization/physical/tests/test_cupy.py
new file mode 100644
index 000000000..0a3194d0e
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/tests/test_cupy.py
@@ -0,0 +1,41 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .... import tensor as mt
+from ....core import (
+    ChunkGraphBuilder,
+    TileableGraph,
+    TileableGraphBuilder,
+    TileContext,
+    enter_mode,
+)
+from ..cupy import CupyRuntimeOptimizer
+
+
+@enter_mode(build=True)
+def test_cupy():
+    t1 = mt.ones((100, 50), chunk_size=50, gpu=True)
+    t2 = mt.ones(50, chunk_size=50, gpu=True)
+    t = (t1 - t2) / mt.sqrt(t2 * (1 - t2) * len(t2))
+
+    graph = TileableGraph([t.data])
+    next(TileableGraphBuilder(graph).build())
+    context = TileContext()
+    chunk_graph_builder = ChunkGraphBuilder(
+        graph, fuse_enabled=False, tile_context=context
+    )
+    chunk_graph = next(chunk_graph_builder.build())
+
+    CupyRuntimeOptimizer(chunk_graph).optimize()
+    assert any(n.op.__class__.__name__ == "TensorCpFuseChunk" for n in chunk_graph)
diff --git a/python/xorbits/_mars/optimization/physical/tests/test_numexpr.py b/python/xorbits/_mars/optimization/physical/tests/test_numexpr.py
new file mode 100644
index 000000000..74d3a4b6e
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/tests/test_numexpr.py
@@ -0,0 +1,383 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import operator
+
+from ....core import ChunkGraph
+from ....tensor.arithmetic import TensorTreeAdd
+from ....tensor.indexing import TensorSlice
+from ....tensor.reduction import TensorSum
+from ..numexpr import NumexprRuntimeOptimizer
+
+
+def test_numexpr():
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @                   @                          @
+          \               /                          /
+            @ --> @ --> S      ========>     # --> S
+          /               \                          \
+        @                   @                          @
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(6)
+    ]
+    chunk_slice = TensorSlice().new_chunk([None], None).data
+    chunk_reduction = TensorSum(axis=(1,)).new_chunk([None], None).data
+    graph = ChunkGraph([chunks[4], chunks[5]])
+    list(map(graph.add_node, chunks[:6]))
+    graph.add_node(chunk_slice)
+    graph.add_edge(chunks[0], chunks[2])
+    graph.add_edge(chunks[1], chunks[2])
+    graph.add_edge(chunks[2], chunks[3])
+    graph.add_edge(chunks[3], chunk_slice)
+    graph.add_edge(chunk_slice, chunks[4])
+    graph.add_edge(chunk_slice, chunks[5])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert fused_nodes[0].composed == chunks[:4]
+    assert len(graph) == 4
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @                   @
+          \               /
+            @ --> @ --> @      ========>   Tail node count > 1, can't be fused.
+          /               \
+        @                   @
+
+        no fusion happens, because a fused subgraph must end in exactly one tail node
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(7)
+    ]
+    graph = ChunkGraph([chunks[5], chunks[6]])
+    list(map(graph.add_node, chunks[:7]))
+    graph.add_edge(chunks[0], chunks[2])
+    graph.add_edge(chunks[1], chunks[2])
+    graph.add_edge(chunks[2], chunks[3])
+    graph.add_edge(chunks[3], chunks[4])
+    graph.add_edge(chunks[4], chunks[5])
+    graph.add_edge(chunks[4], chunks[6])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 0
+    assert len(graph) == 7
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @           S       S
+          \        /       /
+            @ --> @ --> @      ========>   Tail node count > 1, can't be fused.
+          /               \
+        @                   @
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(6)
+    ]
+    chunk_slices = [
+        TensorSlice(_key=str(n)).new_chunk([None], None).data for n in range(2)
+    ]
+    graph = ChunkGraph([chunks[5], chunk_slices[0], chunk_slices[1]])
+    list(map(graph.add_node, chunks[:6]))
+    list(map(graph.add_node, chunk_slices[:2]))
+    graph.add_edge(chunks[0], chunks[2])
+    graph.add_edge(chunks[1], chunks[2])
+    graph.add_edge(chunks[2], chunks[3])
+    graph.add_edge(chunks[3], chunk_slices[0])
+    graph.add_edge(chunks[3], chunks[4])
+    graph.add_edge(chunks[4], chunks[5])
+    graph.add_edge(chunks[4], chunk_slices[1])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 0
+    assert len(graph) == 8
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @
+          \
+            @
+          /   \
+        @      \
+                 @   ========>   #
+        @      /
+          \   /
+            @
+          /
+        @
+
+        all nodes can be fused into a single node
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(7)
+    ]
+    graph = ChunkGraph([chunks[6]])
+    list(map(graph.add_node, chunks[:7]))
+    graph.add_edge(chunks[0], chunks[2])
+    graph.add_edge(chunks[1], chunks[2])
+    graph.add_edge(chunks[3], chunks[5])
+    graph.add_edge(chunks[4], chunks[5])
+    graph.add_edge(chunks[2], chunks[6])
+    graph.add_edge(chunks[5], chunks[6])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    sorted_composed = sorted(fused_nodes[0].composed, key=operator.attrgetter("key"))
+    assert sorted_composed == chunks
+    assert len(graph) == 1
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @
+          \
+            @
+          /   \                            #
+        @      \                              \
+                 S --> @ --> @  ========>       S --> #
+        @      /                              /
+          \   /                            #
+            @
+          /
+        @
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(8)
+    ]
+    graph = ChunkGraph([chunks[7]])
+    list(map(graph.add_node, chunks[:8]))
+    graph.add_node(chunk_slice)
+    graph.add_edge(chunks[0], chunks[2])
+    graph.add_edge(chunks[1], chunks[2])
+    graph.add_edge(chunks[3], chunks[5])
+    graph.add_edge(chunks[4], chunks[5])
+    graph.add_edge(chunks[2], chunk_slice)
+    graph.add_edge(chunks[5], chunk_slice)
+    graph.add_edge(chunk_slice, chunks[6])
+    graph.add_edge(chunks[6], chunks[7])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 3
+    assert sorted(len(n.composed) for n in fused_nodes) == [2, 3, 3]
+    assert len(graph) == 4
+    assert graph.contains(chunk_slice)
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        S
+          \
+            @
+          /   \                         S
+        @      \                           \
+                 @ --- @   ========>    S --  #
+        @      /     /                     /
+          \   /     S                   S
+            @
+          /
+        S
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(6)
+    ]
+    chunk_slices = [
+        TensorSlice(_key=str(n)).new_chunk([None], None).data for n in range(3)
+    ]
+    graph = ChunkGraph([chunks[5]])
+    list(map(graph.add_node, chunks[:6]))
+    list(map(graph.add_node, chunk_slices[:3]))
+    graph.add_edge(chunk_slices[0], chunks[1])
+    graph.add_edge(chunks[0], chunks[1])
+    graph.add_edge(chunks[2], chunks[3])
+    graph.add_edge(chunk_slices[1], chunks[3])
+    graph.add_edge(chunks[1], chunks[4])
+    graph.add_edge(chunks[3], chunks[4])
+    graph.add_edge(chunks[4], chunks[5])
+    graph.add_edge(chunk_slices[2], chunks[5])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 1
+    sorted_composed = sorted(fused_nodes[0].composed, key=operator.attrgetter("key"))
+    assert sorted_composed == chunks
+    assert len(graph) == 4
+    assert graph.count_predecessors(fused_nodes[0]) == 3
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @ --> @ --> S --> @  ========>  # --> S --> @
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(4)
+    ]
+    graph = ChunkGraph([chunks[2]])
+    list(map(graph.add_node, chunks[:3]))
+    graph.add_node(chunk_slice)
+    graph.add_edge(chunks[0], chunks[1])
+    graph.add_edge(chunks[1], chunk_slice)
+    graph.add_edge(chunk_slice, chunks[2])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert fused_nodes[0].composed == chunks[:2]
+    assert len(fused_nodes) == 1
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @ --> @ --> S --> @ --> @   ========>  # --> S --> #
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(4)
+    ]
+    graph = ChunkGraph([chunks[3]])
+    list(map(graph.add_node, chunks[:4]))
+    graph.add_node(chunk_slice)
+    graph.add_edge(chunks[0], chunks[1])
+    graph.add_edge(chunks[1], chunk_slice)
+    graph.add_edge(chunk_slice, chunks[2])
+    graph.add_edge(chunks[2], chunks[3])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert fused_nodes[0].composed == chunks[:2]
+    assert fused_nodes[1].composed == chunks[2:4]
+
+    r"""
+        graph(@: node, R: Reduction Chunk, #: fused_node):
+
+        @ --> @ --> R --> @ --> @   ========>  # --> #
+
+        fusion stops at R, because a reduction must be the last op in a numexpr fusion.
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(4)
+    ]
+    graph = ChunkGraph([chunks[3]])
+    list(map(graph.add_node, chunks[:4]))
+    graph.add_node(chunk_reduction)
+    graph.add_edge(chunks[0], chunks[1])
+    graph.add_edge(chunks[1], chunk_reduction)
+    graph.add_edge(chunk_reduction, chunks[2])
+    graph.add_edge(chunks[2], chunks[3])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 2
+    assert fused_nodes[0].composed == chunks[:2] + [chunk_reduction]
+    assert fused_nodes[1].composed == chunks[2:4]
+    assert len(graph) == 2
+
+    r"""
+        graph(@: node, R: Reduction Chunk, #: fused_node):
+
+        R --> @ --> @   ========>  R --> #
+
+        fusion stops at R, because a reduction must be the last op in a numexpr fusion.
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(2)
+    ]
+    graph = ChunkGraph([chunks[1]])
+    list(map(graph.add_node, chunks[:2]))
+    graph.add_node(chunk_reduction)
+    graph.add_edge(chunk_reduction, chunks[0])
+    graph.add_edge(chunks[0], chunks[1])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 1
+    assert fused_nodes[0].composed == chunks[:2]
+    assert len(graph) == 2
+
+    r"""
+        graph(@: node, R: Reduction Chunk, #: fused_node):
+
+        @ --> @ --> R   ========>  #
+
+        fusion stops at R, because a reduction must be the last op in a numexpr fusion.
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(2)
+    ]
+    graph = ChunkGraph([chunk_reduction])
+    list(map(graph.add_node, chunks[:2]))
+    graph.add_node(chunk_reduction)
+    graph.add_edge(chunks[0], chunks[1])
+    graph.add_edge(chunks[1], chunk_reduction)
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 1
+    assert fused_nodes[0].composed == chunks[:2] + [chunk_reduction]
+    assert len(graph) == 1
+
+    r"""
+        graph(@: node, R: Reduction Chunk, #: fused_node):
+
+        @
+          \                                        R
+            R     R                               /
+          /  \   /         =============>  # --> #     R
+        @      R     R                            \   /
+             /   \  /                               @ --> R
+            @     @ --> R
+
+        fusion stops at R, because a reduction must be the last op in a numexpr fusion.
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(4)
+    ]
+    chunk_reductions = [
+        TensorSum(axis=(1,), _key=str(n)).new_chunk([None], None).data for n in range(5)
+    ]
+    graph = ChunkGraph([chunk_reductions[2], chunk_reductions[3], chunk_reductions[4]])
+    list(map(graph.add_node, chunks[:4]))
+    list(map(graph.add_node, chunk_reductions[:5]))
+    graph.add_edge(chunks[0], chunk_reductions[0])
+    graph.add_edge(chunks[1], chunk_reductions[0])
+    graph.add_edge(chunks[2], chunk_reductions[1])
+    graph.add_edge(chunk_reductions[0], chunk_reductions[1])
+    graph.add_edge(chunk_reductions[1], chunk_reductions[2])
+    graph.add_edge(chunk_reductions[1], chunks[3])
+    graph.add_edge(chunks[3], chunk_reductions[3])
+    graph.add_edge(chunks[3], chunk_reductions[4])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 2
+    assert fused_nodes[0].composed == [chunks[2], chunk_reductions[1]]
+    assert set(fused_nodes[1].composed) == {chunks[0], chunks[1], chunk_reductions[0]}
+    assert len(graph) == 6
diff --git a/python/xorbits/_mars/oscar/__init__.py b/python/xorbits/_mars/oscar/__init__.py
new file mode 100644
index 000000000..48bafea48
--- /dev/null
+++ b/python/xorbits/_mars/oscar/__init__.py
@@ -0,0 +1,57 @@
+# isort: skip_file
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TypeVar, Union
+
+# import aio to ensure the patch is enabled for Python 3.6
+from ..lib import aio
+
+del aio
+
+from . import debug
+from .api import (
+    actor_ref,
+    create_actor,
+    has_actor,
+    destroy_actor,
+    kill_actor,
+    Actor,
+    StatelessActor,
+    create_actor_pool,
+    setup_cluster,
+    wait_actor_pool_recovered,
+    get_pool_config,
+)
+from .backends import allocate_strategy
+from .backends.pool import MainActorPoolType
+from .batch import extensible
+from .core import ActorRef
+from .debug import set_debug_options, get_debug_options, DebugOptions
+from .errors import (
+    ActorNotExist,
+    ActorAlreadyExist,
+    ServerClosed,
+    SendMessageFailed,
+    Return,
+)
+from .utils import create_actor_ref
+
+# make sure methods are registered
+from .backends import mars, ray, test
+
+del mars, ray, test
+
+_T = TypeVar("_T")
+ActorRefType = Union[ActorRef, _T]
diff --git a/python/xorbits/_mars/oscar/api.py b/python/xorbits/_mars/oscar/api.py
new file mode 100644
index 000000000..4382d4839
--- /dev/null
+++ b/python/xorbits/_mars/oscar/api.py
@@ -0,0 +1,142 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from numbers import Number
+from typing import Any, Dict, Tuple, Type
+from urllib.parse import urlparse
+
+from .backend import get_backend
+from .context import get_context
+from .core import ActorRef, _Actor, _StatelessActor
+
+
+async def create_actor(actor_cls, *args, uid=None, address=None, **kwargs) -> ActorRef:
+    ctx = get_context()
+    return await ctx.create_actor(actor_cls, *args, uid=uid, address=address, **kwargs)
+
+
+async def has_actor(actor_ref: ActorRef) -> bool:
+    ctx = get_context()
+    return await ctx.has_actor(actor_ref)
+
+
+async def destroy_actor(actor_ref: ActorRef):
+    ctx = get_context()
+    return await ctx.destroy_actor(actor_ref)
+
+
+async def actor_ref(*args, **kwargs) -> ActorRef:
+    ctx = get_context()
+    return await ctx.actor_ref(*args, **kwargs)
+
+
+async def kill_actor(actor_ref):
+    ctx = get_context()
+    return await ctx.kill_actor(actor_ref)
+
+
+async def create_actor_pool(address: str, n_process: int = None, **kwargs):
+    if address is None:
+        raise ValueError("address has to be provided")
+    if "://" not in address:
+        scheme = None
+    else:
+        scheme = urlparse(address).scheme or None
+
+    return await get_backend(scheme).create_actor_pool(
+        address, n_process=n_process, **kwargs
+    )
+
+
+async def wait_actor_pool_recovered(address: str, main_pool_address: str = None):
+    ctx = get_context()
+    return await ctx.wait_actor_pool_recovered(address, main_pool_address)
+
+
+async def get_pool_config(address: str):
+    ctx = get_context()
+    return await ctx.get_pool_config(address)
+
+
+def setup_cluster(address_to_resources: Dict[str, Dict[str, Number]]):
+    scheme_to_address_resources = defaultdict(dict)
+    for address, resources in address_to_resources.items():
+        if address is None:
+            raise ValueError("address has to be provided")
+        if "://" not in address:
+            scheme = None
+        else:
+            scheme = urlparse(address).scheme or None
+
+        scheme_to_address_resources[scheme][address] = resources
+    for scheme, address_resources in scheme_to_address_resources.items():
+        get_backend(scheme).get_driver_cls().setup_cluster(address_resources)
+
+
+class AsyncActorMixin:
+    @classmethod
+    def default_uid(cls):
+        return cls.__name__
+
+    def __new__(cls, *args, **kwargs):
+        try:
+            return _actor_implementation[cls](*args, **kwargs)
+        except KeyError:
+            return super().__new__(cls, *args, **kwargs)
+
+    async def __post_create__(self):
+        """
+        Method called after actor creation
+        """
+        return await super().__post_create__()
+
+    async def __pre_destroy__(self):
+        """
+        Method called before actor destruction
+        """
+        return await super().__pre_destroy__()
+
+    async def __on_receive__(self, message: Tuple[Any]):
+        """
+        Handle message from other actors and dispatch them to user methods
+
+        Parameters
+        ----------
+        message : tuple
+            Message shall be (method_name,) + args + (kwargs,)
+        """
+        return await super().__on_receive__(message)
+
+
+class Actor(AsyncActorMixin, _Actor):
+    pass
+
+
+class StatelessActor(AsyncActorMixin, _StatelessActor):
+    pass
+
+
+_actor_implementation: Dict[Type[Actor], Type[Actor]] = dict()
+
+
+def register_actor_implementation(actor_cls: Type[Actor], impl_cls: Type[Actor]):
+    _actor_implementation[actor_cls] = impl_cls
+
+
+def unregister_actor_implementation(actor_cls: Type[Actor]):
+    try:
+        del _actor_implementation[actor_cls]
+    except KeyError:
+        pass
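As a rough usage sketch (not part of this diff): an actor class subclasses Actor, and its methods are invoked through the ActorRef returned by create_actor. The address is assumed here to be the external address of an actor pool that is already running.

from xorbits._mars import oscar as mo


class Counter(mo.Actor):
    def __init__(self):
        self._value = 0

    def inc(self, n: int = 1):
        self._value += n
        return self._value


async def demo(address: str):
    # `address` is assumed to belong to a running actor pool
    ref = await mo.create_actor(Counter, uid=Counter.default_uid(), address=address)
    assert await mo.has_actor(ref)
    print(await ref.inc(3))  # method calls go through the actor ref
    await mo.destroy_actor(ref)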
diff --git a/python/xorbits/_mars/oscar/backend.py b/python/xorbits/_mars/oscar/backend.py
new file mode 100644
index 000000000..23c0a1716
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backend.py
@@ -0,0 +1,62 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Dict, Type
+
+from .context import register_backend_context
+from .driver import register_backend_driver
+
+__all__ = ["BaseActorBackend", "register_backend", "get_backend"]
+
+
+class BaseActorBackend(ABC):
+    @staticmethod
+    @abstractmethod
+    def name():
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_context_cls():
+        pass
+
+    @classmethod
+    async def create_actor_pool(cls, address: str, n_process: int = None, **kwargs):
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_driver_cls():
+        pass
+
+
+_scheme_to_backend_cls: Dict[str, Type[BaseActorBackend]] = dict()
+
+
+def register_backend(backend_cls: Type[BaseActorBackend]):
+    name = backend_cls.name()
+    if isinstance(name, (list, tuple)):
+        names = name
+    else:
+        names = [name]
+    for name in names:
+        _scheme_to_backend_cls[name] = backend_cls
+        register_backend_context(name, backend_cls.get_context_cls())
+        register_backend_driver(name, backend_cls.get_driver_cls())
+    return backend_cls
+
+
+def get_backend(name):
+    return _scheme_to_backend_cls[name]
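A hypothetical registration, sketched only to show how the scheme-to-backend mapping above gets populated; InProcBackend and its placeholder context/driver classes are illustrative and not part of this diff.

from xorbits._mars.oscar.backend import BaseActorBackend, get_backend, register_backend


class _PlaceholderContext:  # real backends return a concrete context class
    pass


class _PlaceholderDriver:  # real backends return a concrete driver class
    pass


@register_backend
class InProcBackend(BaseActorBackend):
    @staticmethod
    def name():
        # a backend may answer for one scheme or for a list of schemes
        return "inproc"

    @staticmethod
    def get_context_cls():
        return _PlaceholderContext

    @staticmethod
    def get_driver_cls():
        return _PlaceholderDriver


assert get_backend("inproc") is InProcBackend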
diff --git a/python/xorbits/_mars/oscar/backends/__init__.py b/python/xorbits/_mars/oscar/backends/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/backends/allocate_strategy.py b/python/xorbits/_mars/oscar/backends/allocate_strategy.py
new file mode 100644
index 000000000..f9fcc3ffe
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/allocate_strategy.py
@@ -0,0 +1,159 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from random import choice
+from typing import Dict, Optional, Tuple
+
+from ...utils import implements
+from ..core import ActorRef
+from ..errors import NoIdleSlot
+from .config import ActorPoolConfig
+from .message import _MessageBase
+
+allocated_value = Tuple["AllocateStrategy", Optional[_MessageBase]]
+allocated_values = Dict[Optional[ActorRef], allocated_value]
+allocated_type = Dict[str, allocated_values]
+
+
+class AllocateStrategy(ABC):
+    __slots__ = ()
+
+    @abstractmethod
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        """
+        Get the external address that the actor is allocated to.
+
+        Parameters
+        ----------
+        config: ActorPoolConfig
+            Actor pool config.
+        allocated:
+            Actors already allocated, together with their allocation strategies.
+
+        Returns
+        -------
+        allocated_address: str
+            External address to allocate.
+        """
+
+
+class AddressSpecified(AllocateStrategy):
+    __slots__ = ("address",)
+
+    def __init__(self, address):
+        self.address = address
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        return self.address
+
+
+class MainPool(AllocateStrategy):
+    __slots__ = ()
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        # allocate to main process
+        main_process_index = config.get_process_indexes()[0]
+        return config.get_external_address(main_process_index)
+
+
+class Random(AllocateStrategy):
+    __slots__ = ()
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        return choice(config.get_external_addresses())
+
+
+class RandomSubPool(AllocateStrategy):
+    __slots__ = ()
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        return choice(config.get_external_addresses()[1:])
+
+
+class ProcessIndex(AllocateStrategy):
+    __slots__ = ("process_index",)
+
+    def __init__(self, process_index: int):
+        self.process_index = process_index
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        actual_process_index = config.get_process_indexes()[self.process_index]
+        return config.get_pool_config(actual_process_index)["external_address"][0]
+
+
+class RandomLabel(AllocateStrategy):
+    __slots__ = ("label",)
+
+    def __init__(self, label):
+        self.label = label
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        return choice(config.get_external_addresses(label=self.label))
+
+
+class IdleLabel(AllocateStrategy):
+    __slots__ = "label", "mark"
+
+    def __init__(self, label, mark):
+        self.label = label
+        self.mark = mark
+
+    def __hash__(self):
+        return hash((type(self), self.label, self.mark))
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, IdleLabel)
+            and self.label == other.label
+            and self.mark == other.mark
+        )
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        addresses = config.get_external_addresses(label=self.label)
+        for addr in addresses:
+            occupied = False
+            for strategy, _ in allocated.get(addr, dict()).values():
+                if strategy == self:
+                    occupied = True
+                    break
+            if not occupied:
+                return addr
+        raise NoIdleSlot(
+            f"No idle slot for creating actor "
+            f"with label {self.label}, mark {self.mark}"
+        )
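A brief sketch of how these strategies are typically consumed: they are passed along when creating an actor so the pool can decide which process hosts it. The allocate_strategy keyword forwarded through create_actor is an assumption here, not something defined in this file.

from xorbits._mars import oscar as mo
from xorbits._mars.oscar.backends.allocate_strategy import (
    IdleLabel,
    ProcessIndex,
    RandomSubPool,
)


async def place_actors(address: str, actor_cls):
    # pin to the second worker process of the pool
    ref1 = await mo.create_actor(
        actor_cls, address=address, allocate_strategy=ProcessIndex(1)
    )
    # any sub pool, i.e. anything except the main process
    ref2 = await mo.create_actor(
        actor_cls, address=address, allocate_strategy=RandomSubPool()
    )
    # one actor per idle slot labelled "io"; raises NoIdleSlot when exhausted
    ref3 = await mo.create_actor(
        actor_cls, address=address, allocate_strategy=IdleLabel("io", "reader-1")
    )
    return ref1, ref2, ref3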
diff --git a/python/xorbits/_mars/oscar/backends/communication/__init__.py b/python/xorbits/_mars/oscar/backends/communication/__init__.py
new file mode 100644
index 000000000..02b720e06
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import Channel, Client, Server
+from .core import gen_local_address, get_client_type, get_server_type
+from .dummy import DummyChannel, DummyClient, DummyServer
+from .socket import (
+    SocketChannel,
+    SocketClient,
+    SocketServer,
+    UnixSocketClient,
+    UnixSocketServer,
+)
+from .ucx import (  # noqa: F401 # pylint: disable=unused-import
+    UCXChannel,
+    UCXClient,
+    UCXServer,
+)
diff --git a/python/xorbits/_mars/oscar/backends/communication/base.py b/python/xorbits/_mars/oscar/backends/communication/base.py
new file mode 100644
index 000000000..513136609
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/base.py
@@ -0,0 +1,305 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Any, Callable, Coroutine, Dict, Type
+
+from ....utils import classproperty, implements
+
+
+class ChannelType(Enum):
+    local = 0  # for local communication
+    ipc = 1  # inproc
+    remote = 2  # remote
+    ray = 3  # for ray actors communication
+
+
+class Channel(ABC):
+    """
+    Channel is used to exchange data between server and client.
+    """
+
+    __slots__ = "local_address", "dest_address", "compression"
+
+    name = None
+
+    def __init__(
+        self, local_address: str = None, dest_address: str = None, compression=None
+    ):
+        self.local_address = local_address
+        self.dest_address = dest_address
+        self.compression = compression
+
+    @abstractmethod
+    async def send(self, message: Any):
+        """
+        Send data to the destination. Each send should be matched by exactly one recv,
+        otherwise received messages may overlap.
+
+        Parameters
+        ----------
+        message:
+            data to send to the destination.
+        """
+
+    @abstractmethod
+    async def recv(self):
+        """
+        Receive data sent from the destination.
+        """
+
+    @abstractmethod
+    async def close(self):
+        """
+        Close channel.
+        """
+
+    @property
+    @abstractmethod
+    def closed(self) -> bool:
+        """
+        Whether this channel is closed.
+
+        Returns
+        -------
+        closed:
+            If the channel is closed.
+        """
+
+    @property
+    @abstractmethod
+    def type(self) -> ChannelType:
+        """
+        What this channel is used for; can be dummy, ipc or remote.
+
+        Returns
+        -------
+        channel_type: ChannelType
+            type that can be dummy, ipc or remote.
+        """
+
+    @property
+    def info(self) -> Dict:
+        return {
+            "name": self.name,
+            "compression": self.compression,
+            "type": self.type,
+            "local_address": self.local_address,
+            "dest_address": self.dest_address,
+        }
+
+
+class Server(ABC):
+    __slots__ = "address", "channel_handler"
+
+    scheme = None
+
+    def __init__(
+        self, address: str, channel_handler: Callable[[Channel], Coroutine] = None
+    ):
+        self.address = address
+        self.channel_handler = channel_handler
+
+    @classproperty
+    @abstractmethod
+    def client_type(self) -> Type["Client"]:
+        """
+        Return the corresponding client type.
+
+        Returns
+        -------
+        client_type: type
+            client type.
+        """
+
+    @property
+    @abstractmethod
+    def channel_type(self) -> ChannelType:
+        """
+        Channel type, can be dummy, ipc or remote.
+
+        Returns
+        -------
+        channel_type: ChannelType
+            type that can be dummy, ipc or remote.
+        """
+
+    @staticmethod
+    @abstractmethod
+    async def create(config: Dict) -> "Server":
+        """
+        Create a server instance according to configuration.
+
+        Parameters
+        ----------
+        config: dict
+            configuration for creating the server.
+
+        Returns
+        -------
+        server: Server
+            a server waiting for connections from clients.
+        """
+
+    @abstractmethod
+    async def start(self):
+        """
+        Start listening on a port or perform similar setup.
+        """
+
+    @abstractmethod
+    async def join(self, timeout=None):
+        """
+        Wait until the server stops, or until the timeout elapses.
+        """
+
+    @abstractmethod
+    async def on_connected(self, *args, **kwargs):
+        """
+        Return a channel when a new client connects.
+
+        Returns
+        -------
+        channel: Channel
+            channel for communication
+        """
+
+    @abstractmethod
+    async def stop(self):
+        """
+        Stop the server.
+        """
+
+    @property
+    @abstractmethod
+    def stopped(self) -> bool:
+        """
+        Whether this server is stopped.
+
+        Returns
+        -------
+        if_stopped: bool
+            Whether this server is stopped.
+        """
+
+    @property
+    def info(self) -> Dict:
+        return {
+            "name": self.scheme,
+            "address": self.address,
+            "channel_type": self.channel_type,
+        }
+
+    @classmethod
+    def parse_config(cls, config: dict) -> dict:
+        # skip parsing config by default
+        return dict()
+
+    async def __aenter__(self):
+        await self.start()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.stop()
+
+
+class Client(ABC):
+    __slots__ = "local_address", "dest_address", "channel"
+
+    scheme = None
+
+    def __init__(self, local_address: str, dest_address: str, channel: Channel):
+        self.local_address = local_address
+        self.dest_address = dest_address
+        self.channel = channel
+
+    @property
+    def channel_type(self) -> ChannelType:
+        """
+        Channel type, can be dummy, ipc or remote.
+
+        Returns
+        -------
+        channel_type: ChannelType
+            type that can be dummy, ipc or remote.
+        """
+        return self.channel.type
+
+    @staticmethod
+    @abstractmethod
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        """
+        Create a client that is able to connect to some server.
+
+        Parameters
+        ----------
+        dest_address: str
+            Destination server address to connect to.
+        local_address: str
+            local address.
+
+        Returns
+        -------
+        client: Client
+            Client that holds a channel to communicate.
+        """
+
+    @classmethod
+    def parse_config(cls, config: dict) -> dict:
+        # skip parsing config by default
+        return dict()
+
+    @implements(Channel.send)
+    async def send(self, message):
+        return await self.channel.send(message)
+
+    @implements(Channel.recv)
+    async def recv(self):
+        return await self.channel.recv()
+
+    async def close(self):
+        """
+        Close connection.
+        """
+        await self.channel.close()
+
+    @property
+    def closed(self) -> bool:
+        """
+        Whether this client is closed.
+
+        Returns
+        -------
+        closed: bool
+            If the client is closed.
+        """
+        return self.channel.closed
+
+    @property
+    def info(self) -> Dict:
+        return {
+            "local_address": self.local_address,
+            "dest_address": self.dest_address,
+            "channel_name": self.channel.name,
+            "channel_type": self.channel_type,
+        }
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.close()
diff --git a/python/xorbits/_mars/oscar/backends/communication/core.py b/python/xorbits/_mars/oscar/backends/communication/core.py
new file mode 100644
index 000000000..d716fb372
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/core.py
@@ -0,0 +1,66 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Type
+from urllib.parse import urlparse
+
+from .base import Client, Server
+
+_scheme_to_client_types: Dict[str, Type[Client]] = dict()
+_scheme_to_server_types: Dict[str, Type[Server]] = dict()
+
+
+def register_client(client_type: Type[Client]):
+    _scheme_to_client_types[client_type.scheme] = client_type
+    return client_type
+
+
+def register_server(server_type: Type[Server]):
+    _scheme_to_server_types[server_type.scheme] = server_type
+    return server_type
+
+
+def _check_scheme(scheme: str, types: Dict):
+    if scheme == "":
+        scheme = None
+    if scheme not in types:  # pragma: no cover
+        raise ValueError(
+            f"address illegal, address scheme "
+            f"should be one of "
+            f'{", ".join(types)}, '
+            f"got {scheme}"
+        )
+    return scheme
+
+
+def get_scheme(address: str) -> str:
+    if "://" not in address:
+        scheme = None
+    else:
+        scheme = urlparse(address).scheme
+    return scheme
+
+
+def get_client_type(address: str) -> Type[Client]:
+    scheme = _check_scheme(get_scheme(address), _scheme_to_client_types)
+    return _scheme_to_client_types[scheme]
+
+
+def get_server_type(address: str) -> Type[Server]:
+    scheme = _check_scheme(get_scheme(address), _scheme_to_server_types)
+    return _scheme_to_server_types[scheme]
+
+
+def gen_local_address(process_index: int) -> str:
+    return f"dummy://{process_index}"
diff --git a/python/xorbits/_mars/oscar/backends/communication/dummy.py b/python/xorbits/_mars/oscar/backends/communication/dummy.py
new file mode 100644
index 000000000..08c0ba8e5
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/dummy.py
@@ -0,0 +1,230 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import concurrent.futures as futures
+import weakref
+from typing import Any, Callable, Coroutine, Dict, Type
+from urllib.parse import urlparse
+
+from ....utils import abc_type_require_weakref_slot, classproperty, implements
+from ...errors import ServerClosed
+from .base import Channel, ChannelType, Client, Server
+from .core import register_client, register_server
+from .errors import ChannelClosed
+
+DEFAULT_DUMMY_ADDRESS = "dummy://0"
+
+
+class DummyChannel(Channel):
+    """
+    Channel for communication within the same process.
+    """
+
+    __slots__ = "_in_queue", "_out_queue", "_closed"
+
+    name = "dummy"
+
+    def __init__(
+        self,
+        in_queue: asyncio.Queue,
+        out_queue: asyncio.Queue,
+        closed: asyncio.Event,
+        local_address: str = None,
+        dest_address: str = None,
+        compression=None,
+    ):
+        super().__init__(
+            local_address=local_address,
+            dest_address=dest_address,
+            compression=compression,
+        )
+        self._in_queue = in_queue
+        self._out_queue = out_queue
+        self._closed = closed
+
+    @property
+    @implements(Channel.type)
+    def type(self) -> ChannelType:
+        return ChannelType.local
+
+    @implements(Channel.send)
+    async def send(self, message: Any):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot send message")
+        # put message directly into queue
+        self._out_queue.put_nowait(message)
+
+    @implements(Channel.recv)
+    async def recv(self):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot write message")
+        try:
+            return await self._in_queue.get()
+        except RuntimeError:
+            if self._closed.is_set():
+                pass
+
+    @implements(Channel.close)
+    async def close(self):
+        self._closed.set()
+
+    @property
+    @implements(Channel.closed)
+    def closed(self) -> bool:
+        return self._closed.is_set()
+
+
+@register_server
+class DummyServer(Server):
+    __slots__ = ("_closed", "_channels", "_tasks") + (
+        ("__weakref__",) if abc_type_require_weakref_slot else tuple()
+    )
+
+    _address_to_instances: Dict[str, "DummyServer"] = weakref.WeakValueDictionary()
+    scheme = "dummy"
+
+    def __init__(
+        self, address: str, channel_handler: Callable[[Channel], Coroutine] = None
+    ):
+        super().__init__(address, channel_handler)
+        self._closed = asyncio.Event()
+        self._channels = []
+        self._tasks = []
+
+    @classmethod
+    def get_instance(cls, address: str):
+        return cls._address_to_instances[address]
+
+    @classproperty
+    @implements(Server.client_type)
+    def client_type(self) -> Type["Client"]:
+        return DummyClient
+
+    @property
+    @implements(Server.channel_type)
+    def channel_type(self) -> ChannelType:
+        return ChannelType.local
+
+    @staticmethod
+    @implements(Server.create)
+    async def create(config: Dict) -> "DummyServer":
+        config = config.copy()
+        address = config.pop("address", DEFAULT_DUMMY_ADDRESS)
+        handle_channel = config.pop("handle_channel")
+        if urlparse(address).scheme != DummyServer.scheme:  # pragma: no cover
+            raise ValueError(
+                f"Address for DummyServer "
+                f'should start with "dummy://", '
+                f"got {address}"
+            )
+        if config:  # pragma: no cover
+            raise TypeError(
+                f"Creating DummyServer got unexpected " f'arguments: {",".join(config)}'
+            )
+        try:
+            server = DummyServer.get_instance(address)
+            if server.stopped:
+                raise KeyError("server closed")
+        except KeyError:
+            server = DummyServer(address, handle_channel)
+            DummyServer._address_to_instances[address] = server
+        return server
+
+    @implements(Server.start)
+    async def start(self):
+        # nothing needs to do for dummy server
+        pass
+
+    @implements(Server.join)
+    async def join(self, timeout=None):
+        wait_coro = self._closed.wait()
+        try:
+            await asyncio.wait_for(wait_coro, timeout=timeout)
+        except (futures.TimeoutError, asyncio.TimeoutError):
+            pass
+
+    @implements(Server.on_connected)
+    async def on_connected(self, *args, **kwargs):
+        if self._closed.is_set():  # pragma: no cover
+            raise ServerClosed("Dummy server already closed")
+
+        channel = args[0]
+        assert isinstance(channel, DummyChannel)
+        if kwargs:  # pragma: no cover
+            raise TypeError(
+                f"{type(self).__name__} got unexpected "
+                f'arguments: {",".join(kwargs)}'
+            )
+        self._channels.append(channel)
+        await self.channel_handler(channel)
+
+    @implements(Server.stop)
+    async def stop(self):
+        self._closed.set()
+        _ = [t.cancel() for t in self._tasks]
+        await asyncio.gather(*(channel.close() for channel in self._channels))
+
+    @property
+    @implements(Server.stopped)
+    def stopped(self) -> bool:
+        return self._closed.is_set()
+
+
+@register_client
+class DummyClient(Client):
+    __slots__ = ("_task",)
+
+    scheme = DummyServer.scheme
+
+    def __init__(self, local_address: str, dest_address: str, channel: Channel):
+        super().__init__(local_address, dest_address, channel)
+
+    @staticmethod
+    @implements(Client.connect)
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        if urlparse(dest_address).scheme != DummyServer.scheme:  # pragma: no cover
+            raise ValueError(
+                f'Destination address should start with "dummy://" '
+                f"for DummyClient, got {dest_address}"
+            )
+        server = DummyServer.get_instance(dest_address)
+        if server is None:  # pragma: no cover
+            raise RuntimeError(
+                f"DummyServer {dest_address} needs to be created first before DummyClient"
+            )
+        if server.stopped:  # pragma: no cover
+            raise ConnectionError(f"Dummy server {dest_address} closed")
+
+        q1, q2 = asyncio.Queue(), asyncio.Queue()
+        closed = asyncio.Event()
+        client_channel = DummyChannel(q1, q2, closed, local_address=local_address)
+        server_channel = DummyChannel(q2, q1, closed, dest_address=local_address)
+
+        conn_coro = server.on_connected(server_channel)
+        task = asyncio.create_task(conn_coro)
+        client = DummyClient(local_address, dest_address, client_channel)
+        client._task = task
+        server._tasks.append(task)
+        return client
+
+    @implements(Client.close)
+    async def close(self):
+        await super().close()
+        self._task.cancel()
+        self._task = None
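A compact sketch of the dummy transport in action within one process, using only the classes defined above; the handler simply echoes one message back.

import asyncio

from xorbits._mars.oscar.backends.communication.dummy import DummyClient, DummyServer


async def main():
    async def echo_once(channel):
        # server-side handler: receive one message and send it back
        msg = await channel.recv()
        await channel.send(msg)

    server = await DummyServer.create(
        {"address": "dummy://0", "handle_channel": echo_once}
    )
    await server.start()

    client = await DummyClient.connect("dummy://0")
    await client.send("ping")
    print(await client.recv())  # -> "ping"

    await client.close()
    await server.stop()


asyncio.run(main())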
diff --git a/python/xorbits/_mars/oscar/backends/communication/errors.py b/python/xorbits/_mars/oscar/backends/communication/errors.py
new file mode 100644
index 000000000..c8cc79b6c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/errors.py
@@ -0,0 +1,19 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....core.base import MarsError
+
+
+class ChannelClosed(MarsError):
+    pass
diff --git a/python/xorbits/_mars/oscar/backends/communication/socket.py b/python/xorbits/_mars/oscar/backends/communication/socket.py
new file mode 100644
index 000000000..d2adffe53
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/socket.py
@@ -0,0 +1,363 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import concurrent.futures as futures
+import os
+import socket
+import sys
+import tempfile
+from abc import ABCMeta
+from asyncio import AbstractServer, StreamReader, StreamWriter
+from functools import lru_cache
+from hashlib import md5
+from typing import Any, Callable, Coroutine, Dict, Type
+from urllib.parse import urlparse
+
+from ....serialization import AioDeserializer, AioSerializer, deserialize
+from ....utils import classproperty, implements, to_binary
+from .base import Channel, ChannelType, Client, Server
+from .core import register_client, register_server
+from .utils import read_buffers, write_buffers
+
+_is_windows: bool = sys.platform.startswith("win")
+
+
+class SocketChannel(Channel):
+    __slots__ = "reader", "writer", "_channel_type", "_send_lock", "_recv_lock"
+
+    name = "socket"
+
+    def __init__(
+        self,
+        reader: StreamReader,
+        writer: StreamWriter,
+        local_address: str = None,
+        dest_address: str = None,
+        compression: int = None,
+        channel_type: ChannelType = None,
+    ):
+        super().__init__(
+            local_address=local_address,
+            dest_address=dest_address,
+            compression=compression,
+        )
+        self.reader = reader
+        self.writer = writer
+        self._channel_type = channel_type
+
+        self._send_lock = asyncio.Lock()
+        self._recv_lock = asyncio.Lock()
+
+    @property
+    @implements(Channel.type)
+    def type(self) -> ChannelType:
+        return self._channel_type
+
+    @implements(Channel.send)
+    async def send(self, message: Any):
+        # get buffers
+        compress = self.compression or 0
+        serializer = AioSerializer(message, compress=compress)
+        buffers = await serializer.run()
+
+        # write buffers
+        write_buffers(self.writer, buffers)
+        async with self._send_lock:
+            # hold the lock while draining; otherwise concurrent sends
+            # may trigger assertion errors inside the stream writer
+            await self.writer.drain()
+
+    @implements(Channel.recv)
+    async def recv(self):
+        deserializer = AioDeserializer(self.reader)
+        async with self._recv_lock:
+            header = await deserializer.get_header()
+            buffers = await read_buffers(header, self.reader)
+        return deserialize(header, buffers)
+
+    @implements(Channel.close)
+    async def close(self):
+        self.writer.close()
+        try:
+            await self.writer.wait_closed()
+        except ConnectionResetError:  # pragma: no cover
+            pass
+
+    @property
+    @implements(Channel.closed)
+    def closed(self):
+        return self.writer.is_closing()
+
+
+class _BaseSocketServer(Server, metaclass=ABCMeta):
+    __slots__ = "_aio_server", "_channels"
+
+    def __init__(
+        self,
+        address: str,
+        aio_server: AbstractServer,
+        channel_handler: Callable[[Channel], Coroutine] = None,
+    ):
+        super().__init__(address, channel_handler)
+        # asyncio.Server
+        self._aio_server = aio_server
+        self._channels = []
+
+    @implements(Server.start)
+    async def start(self):
+        await self._aio_server.start_serving()
+
+    @implements(Server.join)
+    async def join(self, timeout=None):
+        if timeout is None:
+            await self._aio_server.serve_forever()
+        else:
+            future = asyncio.create_task(self._aio_server.serve_forever())
+            try:
+                await asyncio.wait_for(future, timeout=timeout)
+            except (futures.TimeoutError, asyncio.TimeoutError):
+                future.cancel()
+
+    @implements(Server.on_connected)
+    async def on_connected(self, *args, **kwargs):
+        reader, writer = args
+        local_address = kwargs.pop("local_address", None)
+        dest_address = kwargs.pop("dest_address", None)
+        if kwargs:  # pragma: no cover
+            raise TypeError(
+                f"{type(self).__name__} got unexpected "
+                f'arguments: {",".join(kwargs)}'
+            )
+        channel = SocketChannel(
+            reader,
+            writer,
+            local_address=local_address,
+            dest_address=dest_address,
+            channel_type=self.channel_type,
+        )
+        self._channels.append(channel)
+        # hand the channel over to the registered channel handler
+        await self.channel_handler(channel)
+
+    @implements(Server.stop)
+    async def stop(self):
+        self._aio_server.close()
+        await self._aio_server.wait_closed()
+        # close all channels
+        await asyncio.gather(
+            *(channel.close() for channel in self._channels if not channel.closed)
+        )
+
+    @property
+    @implements(Server.stopped)
+    def stopped(self) -> bool:
+        return not self._aio_server.is_serving()
+
+
+@register_server
+class SocketServer(_BaseSocketServer):
+    __slots__ = "host", "port"
+
+    scheme = None
+
+    def __init__(
+        self,
+        host: str,
+        port: int,
+        aio_server: AbstractServer,
+        channel_handler: Callable[[Channel], Coroutine] = None,
+    ):
+        address = f"{host}:{port}"
+        super().__init__(address, aio_server, channel_handler=channel_handler)
+        self.host = host
+        self.port = port
+
+    @classproperty
+    @implements(Server.client_type)
+    def client_type(self) -> Type["Client"]:
+        return SocketClient
+
+    @property
+    @implements(Server.channel_type)
+    def channel_type(self) -> ChannelType:
+        return ChannelType.remote
+
+    @staticmethod
+    @implements(Server.create)
+    async def create(config: Dict) -> "Server":
+        config = config.copy()
+        if "address" in config:
+            address = config.pop("address")
+            host, port = address.split(":", 1)
+            port = int(port)
+        else:
+            host = config.pop("host")
+            port = int(config.pop("port"))
+        handle_channel = config.pop("handle_channel")
+        if "start_serving" not in config:
+            config["start_serving"] = False
+
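+        # ``handle_connection`` closes over ``server``, which is assigned after
+        # ``asyncio.start_server`` returns; since ``start_serving`` defaults to
+        # False here, no connection is handled before that assignment happens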
+        async def handle_connection(reader: StreamReader, writer: StreamWriter):
+            # create a channel when client connected
+            return await server.on_connected(
+                reader, writer, local_address=server.address
+            )
+
+        port = port if port != 0 else None
+        aio_server = await asyncio.start_server(
+            handle_connection, host=host, port=port, **config
+        )
+
+        # get port of the socket if not specified
+        if not port:
+            port = aio_server.sockets[0].getsockname()[1]
+
+        if _is_windows:
+            for sock in aio_server.sockets:
+                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, True)
+
+        server = SocketServer(host, port, aio_server, channel_handler=handle_channel)
+        return server
+
+
+@register_client
+class SocketClient(Client):
+    __slots__ = ()
+
+    scheme = SocketServer.scheme
+
+    @staticmethod
+    @implements(Client.connect)
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        host, port = dest_address.split(":", 1)
+        port = int(port)
+        (reader, writer) = await asyncio.open_connection(host=host, port=port, **kwargs)
+        channel = SocketChannel(
+            reader, writer, local_address=local_address, dest_address=dest_address
+        )
+        return SocketClient(local_address, dest_address, channel)
+
+
+TEMPDIR = tempfile.gettempdir()
+
+
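+# default path for a pool's unix socket, e.g. /tmp/mars/<md5 of the process index>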
+@lru_cache(100)
+def _gen_unix_socket_default_path(process_index):
+    return f"{TEMPDIR}/mars/{md5(to_binary(str(process_index))).hexdigest()}"  # nosec
+
+
+@register_server
+class UnixSocketServer(_BaseSocketServer):
+    __slots__ = "process_index", "path"
+
+    scheme = "unixsocket"
+
+    def __init__(
+        self,
+        process_index: int,
+        aio_server: AbstractServer,
+        path: str,
+        channel_handler: Callable[[Channel], Coroutine] = None,
+    ):
+        address = f"{self.scheme}:///{process_index}"
+        super().__init__(address, aio_server, channel_handler=channel_handler)
+        self.process_index = process_index
+        self.path = path
+
+    @classproperty
+    @implements(Server.client_type)
+    def client_type(self) -> Type["Client"]:
+        return UnixSocketClient
+
+    @property
+    @implements(Server.channel_type)
+    def channel_type(self) -> ChannelType:
+        return ChannelType.ipc
+
+    @staticmethod
+    @implements(Server.create)
+    async def create(config: Dict) -> "Server":
+        config = config.copy()
+        if "address" in config:
+            process_index = int(urlparse(config.pop("address")).path.lstrip("/"))
+        else:
+            process_index = config.pop("process_index")
+        handle_channel = config.pop("handle_channel")
+        path = config.pop("path", _gen_unix_socket_default_path(process_index))
+
+        dirname = os.path.dirname(path)
+        if not os.path.exists(dirname):
+            os.makedirs(dirname, exist_ok=True)
+
+        if "start_serving" not in config:
+            config["start_serving"] = False
+
+        async def handle_connection(reader, writer):
+            # create a channel when client connected
+            return await server.on_connected(
+                reader, writer, local_address=server.address
+            )
+
+        aio_server = await asyncio.start_unix_server(
+            handle_connection, path=path, **config
+        )
+
+        for sock in aio_server.sockets:
+            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, True)
+
+        server = UnixSocketServer(
+            process_index, aio_server, path, channel_handler=handle_channel
+        )
+        return server
+
+    @implements(Server.stop)
+    async def stop(self):
+        await super().stop()
+        try:
+            os.remove(self.path)
+        except OSError:  # pragma: no cover
+            pass
+
+
+@register_client
+class UnixSocketClient(Client):
+    __slots__ = ()
+
+    scheme = UnixSocketServer.scheme
+
+    @staticmethod
+    @lru_cache(100)
+    def _get_process_index(addr):
+        return int(urlparse(addr).path.lstrip("/"))
+
+    @staticmethod
+    @implements(Client.connect)
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        process_index = UnixSocketClient._get_process_index(dest_address)
+        path = kwargs.pop("path", _gen_unix_socket_default_path(process_index))
+        try:
+            (reader, writer) = await asyncio.open_unix_connection(path, **kwargs)
+        except FileNotFoundError:
+            raise ConnectionRefusedError(
+                "Cannot connect unix socket due to file not exists"
+            )
+        channel = SocketChannel(
+            reader, writer, local_address=local_address, dest_address=dest_address
+        )
+        return UnixSocketClient(local_address, dest_address, channel)
diff --git a/python/xorbits/_mars/oscar/backends/communication/tests/__init__.py b/python/xorbits/_mars/oscar/backends/communication/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/backends/communication/tests/test_comm.py b/python/xorbits/_mars/oscar/backends/communication/tests/test_comm.py
new file mode 100644
index 000000000..4600b662f
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/tests/test_comm.py
@@ -0,0 +1,228 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import multiprocessing
+import sys
+from typing import Dict, List, Tuple, Type, Union
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from .....lib.aio import AioEvent
+from .....tests.core import require_cudf, require_cupy
+from .....utils import get_next_port, lazy_import
+from .. import (
+    Channel,
+    DummyChannel,
+    DummyClient,
+    DummyServer,
+    Server,
+    SocketChannel,
+    SocketClient,
+    SocketServer,
+    UCXServer,
+    UnixSocketClient,
+    UnixSocketServer,
+    get_client_type,
+)
+from ..ucx import UCXInitializer
+
+test_data = np.random.RandomState(0).rand(10, 10)
+port = get_next_port()
+cupy = lazy_import("cupy")
+cudf = lazy_import("cudf")
+ucp = lazy_import("ucp")
+
+
+def gen_params():
+    # server_type, config, con
+    params: List[Tuple[Type[Server], Dict, str]] = [
+        (SocketServer, dict(host="127.0.0.1", port=port), f"127.0.0.1:{port}"),
+    ]
+    if sys.platform != "win32":
+        params.append((UnixSocketServer, dict(process_index="0"), "unixsocket:///0"))
+    if ucp is not None:
+        ucp_port = get_next_port()
+        # test ucx
+        params.append(
+            (UCXServer, dict(host="127.0.0.1", port=ucp_port), f"127.0.0.1:{ucp_port}")
+        )
+    return params
+
+
+params = gen_params()
+local_params = gen_params().copy()
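+# dummy:// channels communicate through in-process queues, so DummyServer is
+# only included in the parameters for the in-process test below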
+local_params.append((DummyServer, dict(), "dummy://0"))
+
+
+@pytest.mark.parametrize("server_type, config, con", local_params)
+@pytest.mark.asyncio
+async def test_comm(server_type, config, con):
+    async def check_data(chan: Union[SocketChannel, DummyChannel]):
+        np.testing.assert_array_equal(test_data, await chan.recv())
+        await chan.send("success")
+
+    config = config.copy()
+    config["handle_channel"] = check_data
+
+    # create server
+    server = await server_type.create(config)
+    await server.start()
+    assert isinstance(server.info, dict)
+
+    # create client
+    client = await server_type.client_type.connect(con)
+    assert isinstance(client.info, dict)
+    assert isinstance(client.channel.info, dict)
+    await client.send(test_data)
+
+    assert "success" == await client.recv()
+
+    await client.close()
+    assert client.closed
+
+    # create client2
+    async with await server_type.client_type.connect(con) as client2:
+        assert not client2.closed
+    assert client2.closed
+
+    await server.join(0.001)
+    await server.stop()
+
+    assert server.stopped
+
+    if server_type is UCXServer:
+        UCXInitializer.reset()
+        # skip create server on same port for ucx
+        return
+
+    async with await server_type.create(config) as server2:
+        assert not server2.stopped
+    assert server2.stopped
+
+
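+# helper run in a child process: start a server of the given type and signal
+# readiness through the shared multiprocessing event so the parent can connect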
+def _wrap_test(server_started_event, conf, tp):
+    async def _test():
+        async def check_data(chan: SocketChannel):
+            np.testing.assert_array_equal(test_data, await chan.recv())
+            await chan.send("success")
+
+        nonlocal conf
+        conf = conf.copy()
+        conf["handle_channel"] = check_data
+
+        # create server
+        server = await tp.create(conf)
+        await server.start()
+        server_started_event.set()
+        await server.join()
+
+    asyncio.run(_test())
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("server_type, config, con", params)
+async def test_multiprocess_comm(server_type, config, con):
+    if server_type is UCXServer:
+        UCXInitializer.reset()
+
+    server_started = multiprocessing.Event()
+
+    p = multiprocessing.Process(
+        target=_wrap_test, args=(server_started, config, server_type)
+    )
+    p.daemon = True
+    p.start()
+
+    try:
+        await AioEvent(server_started).wait()
+
+        # create client
+        client = await server_type.client_type.connect(con)
+        await client.channel.send(test_data)
+
+        assert "success" == await client.recv()
+
+        await client.close()
+        assert client.closed
+    finally:
+        p.kill()
+
+
+cupy_data = np.arange(100).reshape((10, 10))
+cudf_data = pd.DataFrame({"col1": np.arange(10), "col2": [f"s{i}" for i in range(10)]})
+
+
+def _wrap_cuda_test(server_started_event, conf, tp):
+    async def _test():
+        async def check_data(chan: Channel):
+            import cupy
+
+            r = await chan.recv()
+
+            if isinstance(r, cupy.ndarray):
+                np.testing.assert_array_equal(cupy.asnumpy(r), cupy_data)
+            else:
+                pd.testing.assert_frame_equal(r.to_pandas(), cudf_data)
+            await chan.send("success")
+
+        conf["handle_channel"] = check_data
+
+        # create server
+        server = await tp.create(conf)
+        await server.start()
+        server_started_event.set()
+        await server.join()
+
+    asyncio.run(_test())
+
+
+@require_cupy
+@require_cudf
+@pytest.mark.parametrize("server_type", [SocketServer, UCXServer])
+@pytest.mark.asyncio
+async def test_multiprocess_cuda_comm(server_type):
+    mp_ctx = multiprocessing.get_context("spawn")
+
+    server_started = mp_ctx.Event()
+    port = get_next_port()
+    p = mp_ctx.Process(
+        target=_wrap_cuda_test,
+        args=(server_started, dict(host="127.0.0.1", port=port), server_type),
+    )
+    p.daemon = True
+    p.start()
+
+    await AioEvent(server_started).wait()
+
+    # create client
+    client = await server_type.client_type.connect(f"127.0.0.1:{port}")
+
+    await client.channel.send(cupy.asarray(cupy_data))
+    assert "success" == await client.recv()
+
+    client = await server_type.client_type.connect(f"127.0.0.1:{port}")
+
+    await client.channel.send(cudf.DataFrame(cudf_data))
+    assert "success" == await client.recv()
+
+    await client.close()
+
+
+def test_get_client_type():
+    assert issubclass(get_client_type("127.0.0.1"), SocketClient)
+    assert issubclass(get_client_type("unixsocket:///1"), UnixSocketClient)
+    assert issubclass(get_client_type("dummy://"), DummyClient)
diff --git a/python/xorbits/_mars/oscar/backends/communication/ucx.py b/python/xorbits/_mars/oscar/backends/communication/ucx.py
new file mode 100644
index 000000000..5026464fd
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/ucx.py
@@ -0,0 +1,481 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import concurrent.futures as futures
+import functools
+import logging
+import os
+import weakref
+from typing import Any, Callable, Coroutine, Dict, List, Tuple, Type
+
+import cloudpickle
+import numpy as np
+
+from ....lib.nvutils import get_cuda_context, get_index_and_uuid
+from ....serialization import deserialize
+from ....serialization.aio import AioSerializer, BUFFER_SIZES_NAME, get_header_length
+from ....utils import classproperty, implements, lazy_import
+from .base import Channel, ChannelType, Client, Server
+from .core import register_client, register_server
+from .errors import ChannelClosed
+
+ucp = lazy_import("ucp")
+numba_cuda = lazy_import("numba.cuda")
+rmm = lazy_import("rmm")
+
+_warning_suffix = (
+    "This is often the result of a CUDA-enabled library calling a CUDA runtime function before "
+    "spawning worker processes. Please make sure any such function calls don't happen "
+    "at import time or in the global scope of a program."
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+def synchronize_stream(stream: int = 0):
+    ctx = numba_cuda.current_context()
+    cu_stream = numba_cuda.driver.drvapi.cu_stream(stream)
+    stream = numba_cuda.driver.Stream(ctx, cu_stream, None)
+    stream.synchronize()
+
+
+class UCXInitializer:
+    _inited = False
+
+    @staticmethod
+    def _get_options(ucx_config: dict) -> Tuple[dict, dict]:
+        """
+        Get options and envs from ucx options in oscar config
+        """
+        options = dict()
+        envs = dict()
+
+        # if any of these flags are set (and not None), configure basic TLS
+        # settings for UCX; otherwise leave UCX with its default configuration
+        if any(ucx_config.get(name) for name in ["tcp", "nvlink", "infiniband"]):
+            if ucx_config.get("rdmacm"):  # pragma: no cover
+                tls = "tcp"
+                tls_priority = "rdmacm"
+            else:
+                tls = "tcp"
+                tls_priority = "tcp"
+
+            # CUDA COPY can optionally be used with ucx -- we rely on the user
+            # to define when messages will include CUDA objects.  Note:
+            # defining only the Infiniband flag will not enable cuda_copy
+            if any(
+                ucx_config.get(name) for name in ["nvlink", "cuda-copy"]
+            ):  # pragma: no cover
+                tls += ",cuda_copy"
+
+            if ucx_config.get("infiniband"):  # pragma: no cover
+                tls = "rc," + tls
+            if ucx_config.get("nvlink"):  # pragma: no cover
+                tls += ",cuda_ipc"
+
+            options["TLS"] = tls
+            options["SOCKADDR_TLS_PRIORITY"] = tls_priority
+        elif "UCX_TLS" in os.environ:  # pragma: no cover
+            options["TLS"] = os.environ["UCX_TLS"]
+
+        for k, v in ucx_config.get("environment", dict()).items():  # pragma: no cover
+            # {"some-name": value} is translated to {"UCX_SOME_NAME": value}
+            key = f'UCX_{"_".join(s.upper() for s in k.split("-"))}'
+            opt_key = key[4:]
+            if opt_key in options:
+                logger.warning(
+                    f"Ignoring {k}={v} (key={key}) in ucx.environment, "
+                    f"preferring {opt_key}={options[opt_key]} "
+                    "from high level options"
+                )
+            elif key in os.environ:
+                # This is only info because setting UCX configuration via
+                # environment variables is a reasonably common approach
+                logger.info(
+                    f"Ignoring {k}={v} (key={key}) in ucx.environment, "
+                    f"preferring {key}={os.environ[key]} from external environment"
+                )
+            else:
+                envs[key] = v
+
+        return options, envs
+
+    @staticmethod
+    def init(ucx_config: dict):
+        if UCXInitializer._inited:
+            return
+
+        options, envs = UCXInitializer._get_options(ucx_config)
+
+        # We ensure the CUDA context is created before initializing UCX. This can't
+        # be safely handled externally because communications start before
+        # preload scripts run.
+        # Precedence:
+        # 1. external environment
+        # 2. ucx_config (high level settings passed to ucp.init)
+        # 3. ucx_environment (low level settings equivalent to environment variables)
+        ucx_tls = os.environ.get("UCX_TLS", options.get("TLS", envs.get("UCX_TLS", "")))
+        if (
+            ucx_config.get("create-cuda-contex") is True
+            # This is not foolproof, if UCX_TLS=all we might require CUDA
+            # depending on configuration of UCX, but this is better than
+            # nothing
+            or ("cuda" in ucx_tls and "^cuda" not in ucx_tls)
+        ):
+            if numba_cuda is None:  # pragma: no cover
+                raise ImportError(
+                    "CUDA support with UCX requires Numba for context management"
+                )
+
+            pre_existing_cuda_context = get_cuda_context()
+            if pre_existing_cuda_context.has_context:
+                dev = pre_existing_cuda_context.device_info
+                logger.warning(
+                    f"A CUDA context for device {dev.device_index} ({str(dev.uuid)}) "
+                    f"already exists on process ID {os.getpid()}. {_warning_suffix}"
+                )
+
+            numba_cuda.current_context()
+
+            cuda_context_created = get_cuda_context()
+            cuda_visible_device = get_index_and_uuid(
+                os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
+            )
+            if (
+                cuda_context_created.has_context
+                and cuda_context_created.device_info.uuid != cuda_visible_device.uuid
+            ):  # pragma: no cover
+                cuda_context_created_dev = cuda_context_created.device_info
+                logger.warning(
+                    f"Worker with process ID {os.getpid()} should have a CUDA context assigned to device "
+                    f"{cuda_visible_device.device_index} ({str(cuda_visible_device.uuid)}), "
+                    f"but instead the CUDA context is on device {cuda_context_created_dev.device_index} "
+                    f"({str(cuda_context_created_dev.uuid)}). {_warning_suffix}"
+                )
+
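+        # temporarily extend the environment with the UCX_* variables computed
+        # above so that ``ucp.init`` sees them, then restore the original
+        # environment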
+        original_environ = os.environ
+        new_environ = os.environ.copy()
+        new_environ.update(envs)
+        os.environ = new_environ
+        try:
+            ucp.init(
+                options=options, env_takes_precedence=True, blocking_progress_mode=False
+            )
+        finally:
+            os.environ = original_environ
+
+        UCXInitializer._inited = True
+
+    @staticmethod
+    def reset():
+        ucp.reset()
+        UCXInitializer._inited = False
+
+
+class UCXChannel(Channel):
+    __slots__ = (
+        "ucp_endpoint",
+        "_closed",
+        "_has_close_callback",
+        "_send_lock",
+        "_recv_lock",
+        "__weakref__",
+    )
+
+    name = "ucx"
+
+    def __init__(
+        self,
+        ucp_endpoint: "ucp.Endpoint",
+        local_address: str = None,
+        dest_address: str = None,
+        compression: int = None,
+    ):
+        super().__init__(
+            local_address=local_address,
+            dest_address=dest_address,
+            compression=compression,
+        )
+        self.ucp_endpoint = ucp_endpoint
+
+        self._send_lock = asyncio.Lock()
+        self._recv_lock = asyncio.Lock()
+
+        # When the UCX endpoint closes or errors the registered callback
+        # is called.
+        if hasattr(self.ucp_endpoint, "set_close_callback"):
+            ref = weakref.ref(self)
+            self.ucp_endpoint.set_close_callback(
+                functools.partial(UCXChannel._close_channel, ref)
+            )
+            self._closed = False
+            self._has_close_callback = True
+        else:  # pragma: no cover
+            # still initialize the flag so that ``recv`` and ``closed`` can
+            # safely read it even without a close callback
+            self._closed = False
+            self._has_close_callback = False
+
+    @staticmethod
+    def _close_channel(channel_ref: weakref.ReferenceType):
+        channel = channel_ref()
+        if channel is not None:
+            channel._closed = True
+
+    @property
+    @implements(Channel.type)
+    def type(self) -> ChannelType:
+        return ChannelType.remote
+
+    @implements(Channel.send)
+    async def send(self, message: Any):
+        if self.closed:
+            raise ChannelClosed("UCX Endpoint is closed, unable to send message")
+
+        compress = self.compression or 0
+        serializer = AioSerializer(message, compress=compress)
+        buffers = await serializer.run()
+        try:
+            # It is necessary to synchronize the default stream before we start
+            # sending. We synchronize the default stream because UCX is not
+            # stream-ordered, and syncing the default stream waits for other
+            # non-blocking CUDA streams. Note this is only sufficient if the
+            # memory being sent is not currently in use on non-blocking CUDA
+            # streams.
+            if any(hasattr(buf, "__cuda_array_interface__") for buf in buffers):
+                # has GPU buffer
+                synchronize_stream(0)
+
+            async with self._send_lock:
+                for buffer in buffers:
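+                    # zero-length buffers are skipped rather than pushed
+                    # through the endpoint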
+                    if (buffer.nbytes if hasattr(buffer, "nbytes") else len(buffer)) > 0:
+                        await self.ucp_endpoint.send(buffer)
+        except ucp.exceptions.UCXBaseException:  # pragma: no cover
+            self.abort()
+            raise ChannelClosed("While writing, the connection was closed")
+
+    @implements(Channel.recv)
+    async def recv(self):
+        async with self._recv_lock:
+            try:
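+                # wire format produced by ``AioSerializer``: a small fixed-size
+                # preamble (11 bytes) carrying the header length, then the
+                # pickled header, then the raw payload buffers one by one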
+                info_buffer = np.empty(11, dtype="u1").data
+                await self.ucp_endpoint.recv(info_buffer)
+                head_length = get_header_length(info_buffer)
+                header_buffer = np.empty(head_length, dtype="u1").data
+                await self.ucp_endpoint.recv(header_buffer)
+                header = cloudpickle.loads(header_buffer)
+
+                is_cuda_buffers = header[0].get("is_cuda_buffers")
+                buffer_sizes = header[0].pop(BUFFER_SIZES_NAME)
+
+                buffers = []
+                for is_cuda_buffer, buf_size in zip(is_cuda_buffers, buffer_sizes):
+                    if buf_size == 0:  # pragma: no cover
+                        buffers.append(bytes())
+                    elif is_cuda_buffer:
+                        cuda_buffer = rmm.DeviceBuffer(size=buf_size)
+                        await self.ucp_endpoint.recv(cuda_buffer)
+                        buffers.append(cuda_buffer)
+                    else:
+                        buffer = np.empty(buf_size, dtype="u1").data
+                        await self.ucp_endpoint.recv(buffer)
+                        buffers.append(buffer)
+            except BaseException as e:
+                if not self._closed:
+                    # In addition to UCX exceptions, this may be a CancelledError
+                    # or another "low-level" exception. The only safe thing to do
+                    # is to abort.
+                    self.abort()
+                    raise ChannelClosed(
+                        f"Connection closed by writer.\nInner exception: {e!r}"
+                    ) from e
+                else:
+                    raise EOFError("Server closed already")
+        return deserialize(header, buffers)
+
+    def abort(self):
+        self._closed = True
+        if self.ucp_endpoint is not None:
+            self.ucp_endpoint.abort()
+            self.ucp_endpoint = None
+
+    @implements(Channel.close)
+    async def close(self):
+        self._closed = True
+        if self.ucp_endpoint is not None:
+            await self.ucp_endpoint.close()
+            # abort
+            self.ucp_endpoint.abort()
+            self.ucp_endpoint = None
+
+    @property
+    @implements(Channel.closed)
+    def closed(self):
+        if self._has_close_callback:
+            # The self._closed flag is separate from the endpoint's lifetime: even
+            # when the endpoint has closed or errored, there may be messages in its
+            # buffer still to be received, even though sending is no longer possible.
+            return self._closed
+        else:  # pragma: no cover
+            return self.ucp_endpoint is None
+
+
+@register_server
+class UCXServer(Server):
+    __slots__ = "host", "port", "_ucp_listener", "_channels", "_closed"
+
+    scheme = "ucx"
+
+    _ucp_listener: "ucp.Listener"
+    _channels: List[UCXChannel]
+
+    def __init__(
+        self,
+        host: str,
+        port: int,
+        ucp_listener: "ucp.Listener",
+        channel_handler: Callable[[Channel], Coroutine] = None,
+    ):
+        super().__init__(f"{UCXServer.scheme}://{host}:{port}", channel_handler)
+        self.host = host
+        self.port = port
+        self._ucp_listener = ucp_listener
+        self._channels = []
+        self._closed = asyncio.Event()
+
+    @classproperty
+    @implements(Server.client_type)
+    def client_type(self) -> Type["Client"]:
+        return UCXClient
+
+    @property
+    @implements(Server.channel_type)
+    def channel_type(self) -> ChannelType:
+        return ChannelType.remote
+
+    @staticmethod
+    async def create(config: Dict) -> "Server":
+        config = config.copy()
+        if "address" in config:
+            address = config.pop("address")
+            prefix = f"{UCXServer.scheme}://"
+            if address.startswith(prefix):
+                address = address[len(prefix) :]
+            host, port = address.split(":", 1)
+            port = int(port)
+        else:
+            host = config.pop("host")
+            port = int(config.pop("port"))
+        handle_channel = config.pop("handle_channel")
+
+        # init
+        UCXInitializer.init(config.get("ucx", dict()))
+
+        async def serve_forever(client_ucp_endpoint: "ucp.Endpoint"):
+            try:
+                await server.on_connected(
+                    client_ucp_endpoint, local_address=server.address
+                )
+            except ChannelClosed:  # pragma: no cover
+                logger.debug("Connection closed before handshake completed")
+                return
+
+        ucp_listener = ucp.create_listener(serve_forever, port=port)
+
+        # get port of the ucp listener if not specified
+        if not port:
+            port = ucp_listener.port
+
+        server = UCXServer(host, port, ucp_listener, channel_handler=handle_channel)
+        return server
+
+    @classmethod
+    def parse_config(cls, config: dict) -> dict:
+        return config
+
+    @implements(Server.start)
+    async def start(self):
+        pass
+
+    @implements(Server.join)
+    async def join(self, timeout=None):
+        wait_coro = self._closed.wait()
+        try:
+            await asyncio.wait_for(wait_coro, timeout=timeout)
+        except (futures.TimeoutError, asyncio.TimeoutError):
+            pass
+
+    @implements(Server.on_connected)
+    async def on_connected(self, *args, **kwargs):
+        (ucp_endpoint,) = args
+        local_address = kwargs.pop("local_address", None)
+        dest_address = kwargs.pop("dest_address", None)
+        if kwargs:  # pragma: no cover
+            raise TypeError(
+                f"{type(self).__name__} got unexpected "
+                f'arguments: {",".join(kwargs)}'
+            )
+        channel = UCXChannel(
+            ucp_endpoint, local_address=local_address, dest_address=dest_address
+        )
+        self._channels.append(channel)
+        # hand the channel over to the registered channel handler
+        await self.channel_handler(channel)
+
+    @implements(Server.stop)
+    async def stop(self):
+        self._ucp_listener.close()
+        # close all channels
+        await asyncio.gather(
+            *(channel.close() for channel in self._channels if not channel.closed)
+        )
+        self._ucp_listener = None
+        self._closed.set()
+
+    @property
+    @implements(Server.stopped)
+    def stopped(self) -> bool:
+        return self._ucp_listener is None
+
+
+@register_client
+class UCXClient(Client):
+    __slots__ = ()
+
+    scheme = UCXServer.scheme
+
+    @classmethod
+    def parse_config(cls, config: dict) -> dict:
+        return config
+
+    @staticmethod
+    @implements(Client.connect)
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        prefix = f"{UCXClient.scheme}://"
+        if dest_address.startswith(prefix):
+            dest_address = dest_address[len(prefix) :]
+        host, port = dest_address.split(":", 1)
+        port = int(port)
+        kwargs = kwargs.copy()
+        ucx_config = kwargs.pop("config", dict()).get("ucx", dict())
+        UCXInitializer.init(ucx_config)
+
+        try:
+            ucp_endpoint = await ucp.create_endpoint(host, port)
+        except ucp.exceptions.UCXBaseException:  # pragma: no cover
+            raise ChannelClosed("Connection closed before handshake completed")
+        channel = UCXChannel(
+            ucp_endpoint, local_address=local_address, dest_address=dest_address
+        )
+        return UCXClient(local_address, dest_address, channel)
diff --git a/python/xorbits/_mars/oscar/backends/communication/utils.py b/python/xorbits/_mars/oscar/backends/communication/utils.py
new file mode 100644
index 000000000..7fa11a659
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/utils.py
@@ -0,0 +1,96 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from asyncio import StreamReader, StreamWriter
+from typing import Dict, List, Union
+
+import numpy as np
+
+from ....serialization.aio import BUFFER_SIZES_NAME
+from ....utils import lazy_import
+
+cupy = lazy_import("cupy")
+cudf = lazy_import("cudf")
+rmm = lazy_import("rmm")
+
+CUDA_CHUNK_SIZE = 16 * 1024**2
+
+
+def _convert_to_cupy_ndarray(
+    cuda_buffer: Union["cupy.ndarray", "rmm.DeviceBuffer"]
+) -> "cupy.ndarray":
+    if isinstance(cuda_buffer, cupy.ndarray):
+        return cuda_buffer
+
+    size = cuda_buffer.nbytes
+    data = cuda_buffer.__cuda_array_interface__["data"][0]
+    memory = cupy.cuda.UnownedMemory(data, size, cuda_buffer)
+    ptr = cupy.cuda.MemoryPointer(memory, 0)
+    return cupy.ndarray(shape=size, dtype="u1", memptr=ptr)
+
+
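+# CUDA buffers are staged through host memory in CUDA_CHUNK_SIZE (16 MiB)
+# chunks so that large device buffers can be written to the stream without a
+# single huge device-to-host copy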
+def write_buffers(writer: StreamWriter, buffers: List):
+    def _write_cuda_buffer(cuda_buffer: Union["cupy.ndarray", "rmm.DeviceBuffer"]):
+        # convert cuda buffer to cupy ndarray
+        cuda_buffer = _convert_to_cupy_ndarray(cuda_buffer)
+
+        chunk_size = CUDA_CHUNK_SIZE
+        offset = 0
+        nbytes = cuda_buffer.nbytes
+        while offset < nbytes:
+            size = chunk_size if (offset + chunk_size) < nbytes else nbytes - offset
+            # slice on cupy ndarray
+            chunk_buffer = cuda_buffer[offset : offset + size]
+            # `get` will return numpy ndarray,
+            # write its data which is a memoryview into writer
+            writer.write(chunk_buffer.get().data)
+            offset += size
+
+    for buffer in buffers:
+        if hasattr(buffer, "__cuda_array_interface__"):
+            # GPU buffer
+            _write_cuda_buffer(buffer)
+        else:
+            writer.write(buffer)
+
+
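+# mirror of ``write_buffers``: CUDA buffers are rebuilt by reading host-side
+# chunks and copying them into an ``rmm.DeviceBuffer`` through a cupy view;
+# plain buffers are read back with a single ``readexactly`` call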
+async def read_buffers(header: Dict, reader: StreamReader):
+    is_cuda_buffers = header[0].get("is_cuda_buffers")
+    buffer_sizes = header[0].pop(BUFFER_SIZES_NAME)
+
+    buffers = []
+    for is_cuda_buffer, buf_size in zip(is_cuda_buffers, buffer_sizes):
+        if is_cuda_buffer:  # pragma: no cover
+            if buf_size == 0:
+                content = await reader.readexactly(buf_size)
+                buffers.append(content)
+            else:
+                buffer = rmm.DeviceBuffer(size=buf_size)
+                arr = _convert_to_cupy_ndarray(buffer)
+                offset = 0
+                chunk_size = CUDA_CHUNK_SIZE
+                while offset < buf_size:
+                    read_size = (
+                        chunk_size
+                        if (offset + chunk_size) < buf_size
+                        else buf_size - offset
+                    )
+                    content = await reader.readexactly(read_size)
+                    chunk_arr = np.frombuffer(content, dtype="u1")
+                    arr[offset : offset + len(content)].set(chunk_arr)
+                    offset += read_size
+                buffers.append(buffer)
+        else:
+            buffers.append(await reader.readexactly(buf_size))
+    return buffers
diff --git a/python/xorbits/_mars/oscar/backends/config.py b/python/xorbits/_mars/oscar/backends/config.py
new file mode 100644
index 000000000..a05055f52
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/config.py
@@ -0,0 +1,137 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Union
+
+
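+# ``ActorPoolConfig`` wraps a plain nested dict whose layout is roughly:
+#
+#     {
+#         "pools": {<process index>: {"label": ..., "internal_address": ...,
+#                                     "external_address": [...], ...}},
+#         "mapping": {<external address>: <internal address>},
+#         "metrics": {...},
+#         "comm": {...},
+#     }
+#
+# (the per-pool keys are the ones filled in by ``add_pool_conf`` below)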
+class ActorPoolConfig:
+    __slots__ = ("_conf",)
+
+    def __init__(self, conf: Dict = None):
+        if conf is None:
+            conf = dict()
+        self._conf = conf
+        if "pools" not in self._conf:
+            self._conf["pools"] = dict()
+        if "mapping" not in self._conf:
+            self._conf["mapping"] = dict()
+        if "metrics" not in self._conf:
+            self._conf["metrics"] = dict()
+        if "comm" not in self._conf:
+            self._conf["comm"] = dict()
+
+    @property
+    def n_pool(self):
+        return len(self._conf["pools"])
+
+    def add_pool_conf(
+        self,
+        process_index: int,
+        label: str,
+        internal_address: str,
+        external_address: Union[str, List[str]],
+        env: Dict = None,
+        modules: List[str] = None,
+        suspend_sigint: bool = False,
+        use_uvloop: bool = False,
+        logging_conf: Dict = None,
+        kwargs: Dict = None,
+    ):
+        pools: Dict = self._conf["pools"]
+        if not isinstance(external_address, list):
+            external_address = [external_address]
+        pools[process_index] = {
+            "label": label,
+            "internal_address": internal_address,
+            "external_address": external_address,
+            "env": env,
+            "modules": modules,
+            "suspend_sigint": suspend_sigint,
+            "use_uvloop": use_uvloop,
+            "logging_conf": logging_conf,
+            "kwargs": kwargs or {},
+        }
+
+        mapping: Dict = self._conf["mapping"]
+        for addr in external_address:
+            mapping[addr] = internal_address
+
+    def get_pool_config(self, process_index: int):
+        return self._conf["pools"][process_index]
+
+    def get_external_address(self, process_index: int) -> str:
+        return self._conf["pools"][process_index]["external_address"][0]
+
+    def get_process_indexes(self):
+        return list(self._conf["pools"])
+
+    def get_process_index(self, external_address: str):
+        for process_index, conf in self._conf["pools"].items():
+            if external_address in conf["external_address"]:
+                return process_index
+        raise ValueError(
+            f"Cannot get process_index for {external_address}"
+        )  # pragma: no cover
+
+    def reset_pool_external_address(
+        self,
+        process_index: int,
+        external_address: Union[str, List[str]],
+    ):
+        if not isinstance(external_address, list):
+            external_address = [external_address]
+        cur_pool_config = self._conf["pools"][process_index]
+        internal_address = cur_pool_config["internal_address"]
+
+        mapping: Dict = self._conf["mapping"]
+        for addr in cur_pool_config["external_address"]:
+            if internal_address == addr:
+                # internal address may be the same as external address in Windows
+                internal_address = external_address[0]
+            mapping.pop(addr, None)
+
+        cur_pool_config["external_address"] = external_address
+        for addr in external_address:
+            mapping[addr] = internal_address
+
+    def get_external_addresses(self, label=None) -> List[str]:
+        result = []
+        for c in self._conf["pools"].values():
+            if label is not None:
+                if label == c["label"]:
+                    result.append(c["external_address"][0])
+            else:
+                result.append(c["external_address"][0])
+        return result
+
+    @property
+    def external_to_internal_address_map(self) -> Dict[str, str]:
+        return self._conf["mapping"]
+
+    def as_dict(self):
+        return self._conf
+
+    def add_metric_configs(self, metrics: Dict[str, Any]):
+        if metrics:
+            self._conf["metrics"].update(metrics)
+
+    def get_metric_configs(self):
+        return self._conf["metrics"]
+
+    def add_comm_config(self, comm_config: Dict[str, Any]):
+        if comm_config:
+            self._conf["comm"].update(comm_config)
+
+    def get_comm_config(self) -> dict:
+        return self._conf["comm"]
diff --git a/python/xorbits/_mars/oscar/backends/context.py b/python/xorbits/_mars/oscar/backends/context.py
new file mode 100644
index 000000000..68027af67
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/context.py
@@ -0,0 +1,242 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+from dataclasses import dataclass
+from typing import Tuple, Type, Union
+
+from ...utils import dataslots, to_binary
+from ..api import Actor
+from ..context import BaseActorContext
+from ..core import ActorRef, create_local_actor_ref
+from ..debug import debug_async_timeout, detect_cycle_send
+from ..errors import CannotCancelTask
+from ..utils import create_actor_ref
+from .allocate_strategy import AddressSpecified, AllocateStrategy
+from .core import ActorCaller
+from .message import (
+    DEFAULT_PROTOCOL,
+    ActorRefMessage,
+    CancelMessage,
+    ControlMessage,
+    ControlMessageType,
+    CreateActorMessage,
+    DestroyActorMessage,
+    ErrorMessage,
+    HasActorMessage,
+    ResultMessage,
+    SendMessage,
+    _MessageBase,
+    new_message_id,
+)
+from .router import Router
+
+
+@dataslots
+@dataclass
+class ProfilingContext:
+    task_id: str
+
+
+class MarsActorContext(BaseActorContext):
+    __slots__ = ("_caller",)
+
+    support_allocate_strategy = True
+
+    def __init__(self, address: str = None):
+        BaseActorContext.__init__(self, address)
+        self._caller = ActorCaller()
+
+    def __del__(self):
+        self._caller.cancel_tasks()
+
+    async def _call(
+        self, address: str, message: _MessageBase, wait: bool = True
+    ) -> Union[ResultMessage, ErrorMessage, asyncio.Future]:
+        return await self._caller.call(
+            Router.get_instance_or_empty(), address, message, wait=wait
+        )
+
+    @staticmethod
+    def _process_result_message(message: Union[ResultMessage, ErrorMessage]):
+        if isinstance(message, ResultMessage):
+            return message.result
+        else:
+            raise message.as_instanceof_cause()
+
+    async def _wait(self, future: asyncio.Future, address: str, message: _MessageBase):
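+        # shield the pending reply from local cancellation; on cancellation we
+        # ask the remote side to cancel the message instead, and only surface
+        # CancelledError if the remote task can no longer be cancelled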
+        try:
+            await asyncio.shield(future)
+        except asyncio.CancelledError:
+            try:
+                await self.cancel(address, message.message_id)
+            except CannotCancelTask:
+                # cancel failed, already finished
+                raise asyncio.CancelledError
+        except:  # noqa: E722  # nosec  # pylint: disable=bare-except
+            pass
+        return await future
+
+    async def create_actor(
+        self, actor_cls: Type[Actor], *args, uid=None, address: str = None, **kwargs
+    ) -> ActorRef:
+        router = Router.get_instance_or_empty()
+        address = address or self._address or router.external_address
+        allocate_strategy = kwargs.get("allocate_strategy", None)
+        if isinstance(allocate_strategy, AllocateStrategy):
+            allocate_strategy = kwargs.pop("allocate_strategy")
+        else:
+            allocate_strategy = AddressSpecified(address)
+        create_actor_message = CreateActorMessage(
+            new_message_id(),
+            actor_cls,
+            to_binary(uid),
+            args,
+            kwargs,
+            allocate_strategy,
+            protocol=DEFAULT_PROTOCOL,
+        )
+        future = await self._call(address, create_actor_message, wait=False)
+        result = await self._wait(future, address, create_actor_message)
+        return self._process_result_message(result)
+
+    async def has_actor(self, actor_ref: ActorRef) -> bool:
+        message = HasActorMessage(
+            new_message_id(), actor_ref, protocol=DEFAULT_PROTOCOL
+        )
+        future = await self._call(actor_ref.address, message, wait=False)
+        result = await self._wait(future, actor_ref.address, message)
+        return self._process_result_message(result)
+
+    async def destroy_actor(self, actor_ref: ActorRef):
+        message = DestroyActorMessage(
+            new_message_id(), actor_ref, protocol=DEFAULT_PROTOCOL
+        )
+        future = await self._call(actor_ref.address, message, wait=False)
+        result = await self._wait(future, actor_ref.address, message)
+        return self._process_result_message(result)
+
+    async def kill_actor(self, actor_ref: ActorRef, force: bool = True):
+        # get main_pool_address
+        control_message = ControlMessage(
+            new_message_id(),
+            actor_ref.address,
+            ControlMessageType.get_config,
+            "main_pool_address",
+            protocol=DEFAULT_PROTOCOL,
+        )
+        main_address = self._process_result_message(
+            await self._call(actor_ref.address, control_message)
+        )
+        real_actor_ref = await self.actor_ref(actor_ref)
+        if real_actor_ref.address == main_address:
+            raise ValueError("Cannot kill actor on main pool")
+        stop_message = ControlMessage(
+            new_message_id(),
+            real_actor_ref.address,
+            ControlMessageType.stop,
+            # default timeout (3 secs) and force
+            (3.0, force),
+            protocol=DEFAULT_PROTOCOL,
+        )
+        # stop server
+        result = await self._call(main_address, stop_message)
+        return self._process_result_message(result)
+
+    async def actor_ref(self, *args, **kwargs):
+        actor_ref = create_actor_ref(*args, **kwargs)
+        local_actor_ref = create_local_actor_ref(actor_ref.address, actor_ref.uid)
+        if local_actor_ref is not None:
+            return local_actor_ref
+        message = ActorRefMessage(
+            new_message_id(), actor_ref, protocol=DEFAULT_PROTOCOL
+        )
+        future = await self._call(actor_ref.address, message, wait=False)
+        result = await self._wait(future, actor_ref.address, message)
+        return self._process_result_message(result)
+
+    async def send(
+        self,
+        actor_ref: ActorRef,
+        message: Tuple,
+        wait_response: bool = True,
+        profiling_context: ProfilingContext = None,
+    ):
+        message = SendMessage(
+            new_message_id(),
+            actor_ref,
+            message,
+            protocol=DEFAULT_PROTOCOL,
+            profiling_context=profiling_context,
+        )
+
+        # use `%.500r` to avoid printing overly long messages
+        with debug_async_timeout(
+            "actor_call_timeout",
+            "Calling %.500r on %s at %s timed out",
+            message.content,
+            actor_ref.uid,
+            actor_ref.address,
+        ):
+            detect_cycle_send(message, wait_response)
+            future = await self._call(actor_ref.address, message, wait=False)
+            if wait_response:
+                result = await self._wait(future, actor_ref.address, message)
+                return self._process_result_message(result)
+            else:
+                return future
+
+    async def cancel(self, address: str, cancel_message_id: bytes):
+        message = CancelMessage(
+            new_message_id(), address, cancel_message_id, protocol=DEFAULT_PROTOCOL
+        )
+        result = await self._call(address, message)
+        return self._process_result_message(result)
+
+    async def wait_actor_pool_recovered(self, address: str, main_address: str = None):
+        if main_address is None:
+            # get main_pool_address
+            control_message = ControlMessage(
+                new_message_id(),
+                address,
+                ControlMessageType.get_config,
+                "main_pool_address",
+                protocol=DEFAULT_PROTOCOL,
+            )
+            main_address = self._process_result_message(
+                await self._call(address, control_message)
+            )
+
+        # if address is main pool, it is never recovered
+        if address == main_address:
+            return
+
+        control_message = ControlMessage(
+            new_message_id(),
+            address,
+            ControlMessageType.wait_pool_recovered,
+            None,
+            protocol=DEFAULT_PROTOCOL,
+        )
+        self._process_result_message(await self._call(main_address, control_message))
+
+    async def get_pool_config(self, address: str):
+        control_message = ControlMessage(
+            new_message_id(),
+            address,
+            ControlMessageType.get_config,
+            None,
+            protocol=DEFAULT_PROTOCOL,
+        )
+        return self._process_result_message(await self._call(address, control_message))
diff --git a/python/xorbits/_mars/oscar/backends/core.py b/python/xorbits/_mars/oscar/backends/core.py
new file mode 100644
index 000000000..157f9abd0
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/core.py
@@ -0,0 +1,140 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import copy
+import logging
+from typing import Dict, Union
+
+from ...oscar.profiling import ProfilingData
+from ...utils import Timer
+from ..errors import ServerClosed
+from .communication import Client
+from .message import DeserializeMessageFailed, ErrorMessage, ResultMessage, _MessageBase
+from .router import Router
+
+ResultMessageType = Union[ResultMessage, ErrorMessage]
+logger = logging.getLogger(__name__)
+
+
+class ActorCaller:
+    __slots__ = "_client_to_message_futures", "_clients"
+
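+    # ``ActorCaller`` multiplexes requests over a single client per destination:
+    # ``call`` registers a per-message future keyed by message id, and the
+    # background ``_listen`` task created in ``get_client`` resolves that future
+    # when the matching reply arrives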
+    def __init__(self):
+        self._client_to_message_futures: Dict[
+            Client, Dict[bytes, asyncio.Future]
+        ] = dict()
+        self._clients: Dict[Client, asyncio.Task] = dict()
+
+    async def get_client(self, router: Router, dest_address: str) -> Client:
+        client = await router.get_client(dest_address, from_who=self)
+        if client not in self._clients:
+            self._clients[client] = asyncio.create_task(self._listen(client))
+            self._client_to_message_futures[client] = dict()
+            client_count = len(self._clients)
+            if client_count >= 100:  # pragma: no cover
+                if (client_count - 100) % 10 == 0:  # pragma: no cover
+                    logger.warning(
+                        "Actor caller has created too many clients (%s >= 100), "
+                        "the global router may not be set.",
+                        client_count,
+                    )
+        return client
+
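+    # Per-client background task: receive responses in a loop, resolve the
+    # pending futures registered in ``call``, and fail all remaining futures
+    # once the remote server closes the connection.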
+    async def _listen(self, client: Client):
+        while not client.closed:
+            try:
+                try:
+                    message: _MessageBase = await client.recv()
+                except (EOFError, ConnectionError, BrokenPipeError):
+                    # remote server closed, close client and raise ServerClosed
+                    try:
+                        await client.close()
+                    except (ConnectionError, BrokenPipeError):
+                        # close failed, ignore it
+                        pass
+                    raise ServerClosed(
+                        f"Remote server {client.dest_address} closed"
+                    ) from None
+                future = self._client_to_message_futures[client].pop(message.message_id)
+                future.set_result(message)
+            except DeserializeMessageFailed as e:
+                message_id = e.message_id
+                future = self._client_to_message_futures[client].pop(message_id)
+                future.set_exception(e.__cause__)
+            except Exception as e:  # noqa: E722  # pylint: disable=bare-except
+                message_futures = self._client_to_message_futures.get(client)
+                self._client_to_message_futures[client] = dict()
+                for future in message_futures.values():
+                    future.set_exception(copy.copy(e))
+            finally:
+                # the message may hold a Ray ObjectRef; delete it early in case
+                # the next loop iteration doesn't run as soon as expected.
+                try:
+                    del message
+                except NameError:
+                    pass
+                try:
+                    del future
+                except NameError:
+                    pass
+                await asyncio.sleep(0)
+
+        message_futures = self._client_to_message_futures.get(client)
+        self._client_to_message_futures[client] = dict()
+        error = ServerClosed(f"Remote server {client.dest_address} closed")
+        for future in message_futures.values():
+            future.set_exception(copy.copy(error))
+
+    async def call(
+        self,
+        router: Router,
+        dest_address: str,
+        message: _MessageBase,
+        wait: bool = True,
+    ) -> Union[ResultMessage, ErrorMessage, asyncio.Future]:
+        client = await self.get_client(router, dest_address)
+        loop = asyncio.get_running_loop()
+        wait_response = loop.create_future()
+        self._client_to_message_futures[client][message.message_id] = wait_response
+
+        with Timer() as timer:
+            try:
+                await client.send(message)
+            except ConnectionError:
+                try:
+                    await client.close()
+                except ConnectionError:
+                    # close failed, ignore it
+                    pass
+                raise ServerClosed(f"Remote server {client.dest_address} closed")
+
+            if not wait:
+                r = wait_response
+            else:
+                r = await wait_response
+
+        ProfilingData.collect_actor_call(message, timer.duration)
+        return r
+
+    async def stop(self):
+        try:
+            await asyncio.gather(*[client.close() for client in self._clients])
+        except (ConnectionError, ServerClosed):
+            pass
+        self.cancel_tasks()
+
+    def cancel_tasks(self):
+        # cancel listening for all clients
+        _ = [task.cancel() for task in self._clients.values()]
diff --git a/python/xorbits/_mars/oscar/backends/mars/__init__.py b/python/xorbits/_mars/oscar/backends/mars/__init__.py
new file mode 100644
index 000000000..006392541
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .backend import MarsActorBackend
diff --git a/python/xorbits/_mars/oscar/backends/mars/backend.py b/python/xorbits/_mars/oscar/backends/mars/backend.py
new file mode 100644
index 000000000..e050380f5
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/backend.py
@@ -0,0 +1,72 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+from ...backend import BaseActorBackend, register_backend
+from ..context import MarsActorContext
+from .driver import MarsActorDriver
+from .pool import MainActorPool
+
+__all__ = ["MarsActorBackend"]
+
+
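+# Extend the pool arguments with extra sub-processes dedicated to IO, reusing
+# the main process' address scheme and internal-address setting for each.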
+def build_pool_kwargs(n_process: int, kwargs: Dict):
+    n_io_process = kwargs.pop("n_io_process", 0)
+    if n_io_process:
+        n_process += n_io_process
+
+        labels = kwargs["labels"]
+        envs = kwargs["envs"]
+        external_address_schemes = kwargs["external_address_schemes"]
+        enable_internal_addresses = kwargs["enable_internal_addresses"]
+        # sub-pools for IO (transfer and spill)
+        for _ in range(n_io_process):
+            if envs:  # pragma: no cover
+                envs.append(dict())
+            labels.append("io")
+            if external_address_schemes:
+                # just use the main process' scheme for the IO processes
+                external_address_schemes.append(external_address_schemes[0])
+            if enable_internal_addresses:
+                # just use the main process' setting for the IO processes
+                enable_internal_addresses.append(enable_internal_addresses[0])
+
+    return n_process, kwargs
+
+
+@register_backend
+class MarsActorBackend(BaseActorBackend):
+    @staticmethod
+    def name():
+        # None means Mars is the default scheme;
+        # ucx is recognized as a Mars backend as well
+        return [None, "ucx"]
+
+    @staticmethod
+    def get_context_cls():
+        return MarsActorContext
+
+    @staticmethod
+    def get_driver_cls():
+        return MarsActorDriver
+
+    @classmethod
+    async def create_actor_pool(cls, address: str, n_process: int = None, **kwargs):
+        from ..pool import create_actor_pool
+
+        n_process, kwargs = build_pool_kwargs(n_process, kwargs)
+        return await create_actor_pool(
+            address, pool_cls=MainActorPool, n_process=n_process, **kwargs
+        )
diff --git a/python/xorbits/_mars/oscar/backends/mars/driver.py b/python/xorbits/_mars/oscar/backends/mars/driver.py
new file mode 100644
index 000000000..67171171c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/driver.py
@@ -0,0 +1,25 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from numbers import Number
+from typing import Dict
+
+from ...driver import BaseActorDriver
+
+
+class MarsActorDriver(BaseActorDriver):
+    @classmethod
+    def setup_cluster(cls, address_to_resources: Dict[str, Dict[str, Number]]):
+        # nothing needs to be done in the driver of the Mars backend
+        pass
diff --git a/python/xorbits/_mars/oscar/backends/mars/pool.py b/python/xorbits/_mars/oscar/backends/mars/pool.py
new file mode 100644
index 000000000..04762f088
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/pool.py
@@ -0,0 +1,343 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import atexit
+import concurrent.futures as futures
+import configparser
+import contextlib
+import itertools
+import logging.config
+import multiprocessing
+import os
+import random
+import signal
+import sys
+import threading
+import uuid
+from dataclasses import dataclass
+from types import TracebackType
+from typing import List
+
+from ....utils import (
+    clean_mars_tmp_dir,
+    dataslots,
+    ensure_coverage,
+    reset_id_random_seed,
+)
+from ..config import ActorPoolConfig
+from ..message import CreateActorMessage
+from ..pool import MainActorPoolBase, SubActorPoolBase, _register_message_handler
+
+atexit.register(clean_mars_tmp_dir)
+
+_is_windows: bool = sys.platform.startswith("win")
+
+if sys.version_info[:2] == (3, 9):
+    # fix for Python 3.9, see https://bugs.python.org/issue43517
+    if sys.platform == "win32":
+        from multiprocessing import popen_spawn_win32 as popen_spawn
+
+        popen_forkserver = popen_fork = synchronize = None
+    else:
+        from multiprocessing import popen_fork, popen_forkserver
+        from multiprocessing import popen_spawn_posix as popen_spawn
+        from multiprocessing import synchronize
+    _ = popen_spawn, popen_forkserver, popen_fork, synchronize
+    del _
+elif sys.version_info[:2] == (3, 6):  # pragma: no cover
+    from multiprocessing.process import BaseProcess
+
+    # define kill method for multiprocessing
+    def _mp_kill(self):
+        if not _is_windows:
+            try:
+                os.kill(self.pid, signal.SIGKILL)
+            except ProcessLookupError:
+                pass
+            except OSError:
+                if self.wait(timeout=0.1) is None:
+                    raise
+        else:
+            self.terminate()
+
+    BaseProcess.kill = _mp_kill
+
+logger = logging.getLogger(__name__)
+_init_main_suspended_local = threading.local()
+
+
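+# Patch multiprocessing's spawn preparation data so that, while Mars is
+# starting sub pools, child processes do not try to re-import the user's
+# __main__ module.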
+def _patch_spawn_get_preparation_data():
+    try:
+        from multiprocessing import spawn as mp_spawn
+
+        _raw_get_preparation_data = mp_spawn.get_preparation_data
+
+        def _patched_get_preparation_data(*args, **kw):
+            ret = _raw_get_preparation_data(*args, **kw)
+            if getattr(_init_main_suspended_local, "value", False):
+                # make sure the user's __main__ module is not imported when starting the Mars cluster
+                ret.pop("init_main_from_name", None)
+                ret.pop("init_main_from_path", None)
+            return ret
+
+        _patched_get_preparation_data._mars_patched = True
+        if not getattr(mp_spawn.get_preparation_data, "_mars_patched", False):
+            mp_spawn.get_preparation_data = _patched_get_preparation_data
+    except (ImportError, AttributeError):  # pragma: no cover
+        pass
+
+
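+# While this context is active, the patched get_preparation_data above drops
+# the __main__ information from the data sent to spawned children.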
+@contextlib.contextmanager
+def _suspend_init_main():
+    try:
+        _init_main_suspended_local.value = True
+        yield
+    finally:
+        _init_main_suspended_local.value = False
+
+
+@dataslots
+@dataclass
+class SubpoolStatus:
+    # status: 0 means succeeded, 1 means failed
+    status: int = None
+    external_addresses: List[str] = None
+    error: BaseException = None
+    traceback: TracebackType = None
+
+
+@_register_message_handler
+class MainActorPool(MainActorPoolBase):
+    @classmethod
+    def get_external_addresses(
+        cls,
+        address: str,
+        n_process: int = None,
+        ports: List[int] = None,
+        schemes: List[str] = None,
+    ):
+        """Get external address for every process"""
+        if ":" in address:
+            host, port = address.split(":", 1)
+            port = int(port)
+            if ports:
+                if len(ports) != n_process:
+                    raise ValueError(
+                        f"`ports` specified, but its count "
+                        f"is not equal to `n_process`, "
+                        f"number of ports: {len(ports)}, "
+                        f"n_process: {n_process}"
+                    )
+                sub_ports = ports
+            else:
+                sub_ports = [0] * n_process
+        else:
+            host = address
+            if ports and len(ports) != n_process + 1:
+                # ports specified, the first of which should be the main port
+                raise ValueError(
+                    f"`ports` specified, but its count "
+                    f"is not equal to `n_process` + 1, "
+                    f"number of ports: {len(ports)}, "
+                    f"n_process + 1: {n_process + 1}"
+                )
+            elif not ports:
+                ports = [0] * (n_process + 1)
+            port = ports[0]
+            sub_ports = ports[1:]
+        if not schemes:
+            prefix_iter = itertools.repeat("")
+        else:
+            prefix_iter = [f"{scheme}://" if scheme else "" for scheme in schemes]
+        return [
+            f"{prefix}{host}:{port}"
+            for port, prefix in zip([port] + sub_ports, prefix_iter)
+        ]
+
+    @classmethod
+    def gen_internal_address(
+        cls, process_index: int, external_address: str = None
+    ) -> str:
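+        # prefer unix sockets for communication within the node when the
+        # platform supports them; otherwise fall back to the external address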
+        if hasattr(asyncio, "start_unix_server"):
+            return f"unixsocket:///{process_index}"
+        else:
+            return external_address
+
+    @classmethod
+    async def start_sub_pool(
+        cls,
+        actor_pool_config: ActorPoolConfig,
+        process_index: int,
+        start_method: str = None,
+    ):
+        def start_pool_in_process():
+            ctx = multiprocessing.get_context(method=start_method)
+            status_queue = ctx.Queue()
+
+            with _suspend_init_main():
+                process = ctx.Process(
+                    target=cls._start_sub_pool,
+                    args=(actor_pool_config, process_index, status_queue),
+                    name=f"MarsActorPool{process_index}",
+                )
+                process.daemon = True
+                process.start()
+
+            # wait for sub actor pool to finish starting
+            process_status = status_queue.get()
+            return process, process_status
+
+        _patch_spawn_get_preparation_data()
+        loop = asyncio.get_running_loop()
+        with futures.ThreadPoolExecutor(1) as executor:
+            create_pool_task = loop.run_in_executor(executor, start_pool_in_process)
+            return await create_pool_task
+
+    @classmethod
+    async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
+        processes = []
+        ext_addresses = []
+        for task in create_pool_tasks:
+            process, status = await task
+            if status.status == 1:
+                # starting the sub pool failed
+                raise status.error.with_traceback(status.traceback)
+            processes.append(process)
+            ext_addresses.append(status.external_addresses)
+        return processes, ext_addresses
+
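+    # Entry point of the child process: seed randomness, configure logging and
+    # the event loop, then run the sub actor pool until it exits.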
+    @classmethod
+    def _start_sub_pool(
+        cls,
+        actor_config: ActorPoolConfig,
+        process_index: int,
+        status_queue: multiprocessing.Queue,
+    ):
+        ensure_coverage()
+
+        # make sure there is enough randomness for every sub pool
+        random.seed(uuid.uuid1().bytes)
+        reset_id_random_seed()
+
+        conf = actor_config.get_pool_config(process_index)
+        suspend_sigint = conf["suspend_sigint"]
+        if suspend_sigint:
+            signal.signal(signal.SIGINT, lambda *_: None)
+
+        logging_conf = conf["logging_conf"] or {}
+        if isinstance(logging_conf, configparser.RawConfigParser):
+            logging.config.fileConfig(logging_conf)
+        elif logging_conf.get("file"):
+            logging.config.fileConfig(logging_conf["file"])
+        elif logging_conf.get("level"):
+            logging.getLogger("__main__").setLevel(logging_conf["level"])
+            logging.getLogger("mars").setLevel(logging_conf["level"])
+            if logging_conf.get("format"):
+                logging.basicConfig(format=logging_conf["format"])
+
+        use_uvloop = conf["use_uvloop"]
+        if use_uvloop:
+            import uvloop
+
+            asyncio.set_event_loop(uvloop.new_event_loop())
+        else:
+            asyncio.set_event_loop(asyncio.new_event_loop())
+
+        coro = cls._create_sub_pool(actor_config, process_index, status_queue)
+        asyncio.run(coro)
+
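+    # Create the sub pool and report its startup status (success or the raised
+    # error plus traceback) back to the parent process via ``status_queue``.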
+    @classmethod
+    async def _create_sub_pool(
+        cls,
+        actor_config: ActorPoolConfig,
+        process_index: int,
+        status_queue: multiprocessing.Queue,
+    ):
+        process_status = None
+        try:
+            cur_pool_config = actor_config.get_pool_config(process_index)
+            env = cur_pool_config["env"]
+            if env:
+                os.environ.update(env)
+            pool = await SubActorPool.create(
+                {"actor_pool_config": actor_config, "process_index": process_index}
+            )
+            external_addresses = cur_pool_config["external_address"]
+            process_status = SubpoolStatus(
+                status=0, external_addresses=external_addresses
+            )
+            await pool.start()
+        except:  # noqa: E722  # nosec  # pylint: disable=bare-except
+            _, error, tb = sys.exc_info()
+            process_status = SubpoolStatus(status=1, error=error, traceback=tb)
+            raise
+        finally:
+            status_queue.put(process_status)
+        await pool.join()
+
+    async def kill_sub_pool(
+        self, process: multiprocessing.Process, force: bool = False
+    ):
+        if (
+            "COV_CORE_SOURCE" in os.environ and not force and not _is_windows
+        ):  # pragma: no cover
+            # must shut down gracefully, or coverage info will be lost
+            try:
+                os.kill(process.pid, signal.SIGINT)
+            except OSError:  # pragma: no cover
+                pass
+            process.terminate()
+            wait_pool = futures.ThreadPoolExecutor(1)
+            try:
+                loop = asyncio.get_running_loop()
+                await loop.run_in_executor(wait_pool, process.join, 3)
+            finally:
+                wait_pool.shutdown(False)
+        process.kill()
+        await asyncio.to_thread(process.join, 5)
+
+    async def is_sub_pool_alive(self, process: multiprocessing.Process):
+        try:
+            return await asyncio.to_thread(process.is_alive)
+        except RuntimeError as ex:  # pragma: no cover
+            if "cannot schedule new futures after interpreter shutdown" not in str(ex):
+                # when atexit is triggered, the default thread pool might be
+                # shut down and to_thread will fail
+                raise
+            return process.is_alive()
+
+    async def recover_sub_pool(self, address: str):
+        process_index = self._config.get_process_index(address)
+        # the process is dead, restart it;
+        # remember to always use spawn to recover the sub pool
+        task = asyncio.create_task(
+            self.start_sub_pool(self._config, process_index, "spawn")
+        )
+        self.sub_processes[address] = (await self.wait_sub_pools_ready([task]))[0][0]
+
+        if self._auto_recover == "actor":
+            # need to recover all created actors
+            for _, message in self._allocated_actors[address].values():
+                create_actor_message: CreateActorMessage = message
+                await self.call(address, create_actor_message)
+
+    async def start(self):
+        await super().start()
+        await self.start_monitor()
+
+
+@_register_message_handler
+class SubActorPool(SubActorPoolBase):
+    pass
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/__init__.py b/python/xorbits/_mars/oscar/backends/mars/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/test-logging.conf b/python/xorbits/_mars/oscar/backends/mars/tests/test-logging.conf
new file mode 100644
index 000000000..bb545c6b3
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/test-logging.conf
@@ -0,0 +1,26 @@
+[loggers]
+keys=root,test_mars_pool
+
+[handlers]
+keys=stream_handler
+
+[formatters]
+keys=formatter
+
+[logger_root]
+level=WARN
+handlers=stream_handler
+
+[logger_test_mars_pool]
+level=DEBUG
+handlers=stream_handler
+qualname=mars.oscar.backends.mars.tests
+propagate=0
+
+[handler_stream_handler]
+class=StreamHandler
+formatter=formatter
+args=(sys.stderr,)
+
+[formatter_formatter]
+format=%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/test_allocate_strategy.py b/python/xorbits/_mars/oscar/backends/mars/tests/test_allocate_strategy.py
new file mode 100644
index 000000000..df79c5ffb
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/test_allocate_strategy.py
@@ -0,0 +1,84 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from .... import create_actor_ref
+from ....errors import NoIdleSlot
+from ...allocate_strategy import (
+    AddressSpecified,
+    IdleLabel,
+    MainPool,
+    Random,
+    RandomLabel,
+    RandomSubPool,
+)
+from ...config import ActorPoolConfig
+
+config = ActorPoolConfig()
+config.add_pool_conf(0, "main", "unixsocket:///0", "127.0.0.1:1111")
+config.add_pool_conf(1, "test", "unixsocket:///1", "127.0.0.1:1112")
+config.add_pool_conf(2, "test2", "unixsocket:///2", "127.0.0.1:1113")
+config.add_pool_conf(3, "test", "unixsocket:///3", "127.0.0.1:1114")
+
+
+def test_address_specified():
+    addr = "127.0.0.1:1112"
+    strategy = AddressSpecified(addr)
+    assert strategy.get_allocated_address(config, dict()) == addr
+
+
+def test_main_pool():
+    strategy = MainPool()
+    assert strategy.get_allocated_address(config, dict()) == "127.0.0.1:1111"
+
+
+def test_random():
+    strategy = Random()
+    addresses = config.get_external_addresses()
+    assert strategy.get_allocated_address(config, dict()) in addresses
+
+
+def test_random_sub_pool():
+    strategy = RandomSubPool()
+    addresses = config.get_external_addresses()[1:]
+    assert strategy.get_allocated_address(config, dict()) in addresses
+
+
+def test_random_label():
+    strategy = RandomLabel("test")
+    addresses = config.get_external_addresses(label="test")
+    assert len(addresses) == 2
+    assert strategy.get_allocated_address(config, dict()) in addresses
+
+
+def test_idle_label():
+    strategy = IdleLabel("test", "my_mark")
+    addresses = config.get_external_addresses(label="test")
+    assert len(addresses) == 2
+    allocated = {
+        addresses[0]: {create_actor_ref(addresses[0], b"id1"): (strategy, None)}
+    }
+    assert strategy.get_allocated_address(config, allocated) == addresses[1]
+
+    strategy2 = IdleLabel("test", "my_mark")
+    allocated = {
+        addresses[0]: {
+            create_actor_ref(addresses[0], b"id1"): (strategy, None),
+            create_actor_ref(addresses[0], b"id2"): (RandomLabel("test"), None),
+        },
+        addresses[1]: {create_actor_ref(addresses[1], b"id3"): (strategy2, None)},
+    }
+    with pytest.raises(NoIdleSlot):
+        strategy2.get_allocated_address(config, allocated)
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/test_debug.py b/python/xorbits/_mars/oscar/backends/mars/tests/test_debug.py
new file mode 100644
index 000000000..05fb10ee0
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/test_debug.py
@@ -0,0 +1,182 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+import os
+import sys
+from contextlib import contextmanager
+from io import StringIO
+from typing import List
+
+import pytest
+
+from ..... import oscar as mo
+from ....debug import get_debug_options, reload_debug_opts_from_env
+
+
+class DebugActor(mo.Actor):
+    def __init__(self):
+        self._log_file = None
+        self._pos = 0
+
+    @classmethod
+    async def wait(cls, delay: float):
+        await asyncio.sleep(delay)
+
+    @classmethod
+    async def raise_error(cls, exc):
+        raise exc
+
+    @classmethod
+    async def call_chain(
+        cls, chain: List, use_yield: bool = False, use_tell: bool = False
+    ):
+        if not chain:
+            return
+        ref_uid, ref_address = chain[0]
+        new_ref = await mo.actor_ref(ref_uid, address=ref_address)
+
+        if use_tell:
+            call_coro = new_ref.call_chain.tell(chain[1:])
+        else:
+            call_coro = new_ref.call_chain(chain[1:])
+
+        if use_yield:
+            yield call_coro
+        else:
+            await call_coro
+
+    async def call_self_ref(self):
+        await self.ref().wait(1)
+
+
+@pytest.fixture
+async def actor_pool():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await mo.create_actor_pool(
+        "127.0.0.1", n_process=0, subprocess_start_method=start_method
+    )
+    await pool.start()
+    yield pool
+    await pool.stop()
+
+
+@pytest.fixture
+async def debug_logger():
+    log_file = StringIO()
+    logger = logging.getLogger("mars.oscar.debug")
+
+    log_handler = logging.StreamHandler(log_file)
+    log_handler.setLevel(logging.DEBUG)
+    logger.addHandler(log_handler)
+
+    try:
+        mo.set_debug_options(
+            mo.DebugOptions(
+                actor_call_timeout=1,
+                log_unhandled_errors=True,
+                log_cycle_send=True,
+            )
+        )
+        yield log_file
+    finally:
+        mo.set_debug_options(None)
+        logger.removeHandler(log_handler)
+        assert mo.get_debug_options() is None
+
+
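+# Capture only the log output written inside the ``with`` block by remembering
+# the current position in the shared log buffer and copying what follows.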
+@contextmanager
+def cut_file_log(log_file) -> StringIO:
+    dest = StringIO()
+    pos = log_file.tell()
+    try:
+        yield dest
+    finally:
+        log_file.seek(pos, os.SEEK_SET)
+        dest.write(log_file.read())
+
+
+@pytest.mark.asyncio
+async def test_error_logs(actor_pool, debug_logger):
+    debug_ref = await mo.create_actor(
+        DebugActor, uid=DebugActor.default_uid(), address=actor_pool.external_address
+    )
+
+    with cut_file_log(debug_logger) as log_file:
+        await debug_ref.wait(0.2)
+    assert log_file.getvalue() == ""
+
+    with cut_file_log(debug_logger) as log_file:
+        await debug_ref.wait(1.2)
+    assert DebugActor.default_uid() in log_file.getvalue()
+
+    with pytest.raises(ValueError), cut_file_log(debug_logger) as log_file:
+        await debug_ref.raise_error(ValueError)
+    assert "ValueError" in log_file.getvalue()
+
+
+@pytest.mark.asyncio
+async def test_cycle_logs(actor_pool, debug_logger):
+    address = actor_pool.external_address
+    ref1 = await mo.create_actor(DebugActor, uid="debug_ref1", address=address)
+    ref2 = await mo.create_actor(DebugActor, uid="debug_ref2", address=address)
+
+    chain = [(ref2.uid, ref2.address)]
+
+    with cut_file_log(debug_logger) as log_file:
+        task = asyncio.create_task(ref1.call_chain(chain))
+        await asyncio.wait_for(task, 1)
+    assert log_file.getvalue() == ""
+
+    chain = [(ref2.uid, ref2.address), (ref1.uid, ref1.address)]
+
+    # test cycle detection with chain
+    with pytest.raises(asyncio.TimeoutError), cut_file_log(debug_logger) as log_file:
+        task = asyncio.create_task(ref1.call_chain(chain))
+        await asyncio.wait_for(task, 1)
+    assert "cycle" in log_file.getvalue()
+
+    # test yield call (should not produce loops)
+    with cut_file_log(debug_logger) as log_file:
+        task = asyncio.create_task(ref1.call_chain(chain, use_yield=True))
+        await asyncio.wait_for(task, 1)
+    assert log_file.getvalue() == ""
+
+    # test tell (should not produce loops)
+    with cut_file_log(debug_logger) as log_file:
+        task = asyncio.create_task(ref1.call_chain(chain, use_tell=True))
+        await asyncio.wait_for(task, 1)
+    assert log_file.getvalue() == ""
+
+    # test calling actor inside itself
+    with pytest.raises(asyncio.TimeoutError), cut_file_log(debug_logger) as log_file:
+        task = asyncio.create_task(ref1.call_self_ref())
+        await asyncio.wait_for(task, 1)
+    assert "cycle" in log_file.getvalue()
+
+
+def test_environ():
+    os.environ["DEBUG_OSCAR"] = "1"
+    try:
+        reload_debug_opts_from_env()
+        assert get_debug_options() is not None
+    finally:
+        os.environ.pop("DEBUG_OSCAR")
+        reload_debug_opts_from_env()
+        assert get_debug_options() is None
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/test_mars_actor_context.py b/python/xorbits/_mars/oscar/backends/mars/tests/test_mars_actor_context.py
new file mode 100644
index 000000000..88947a39e
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/test_mars_actor_context.py
@@ -0,0 +1,625 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+import os
+import sys
+import time
+import traceback
+from collections import deque
+
+import pandas as pd
+import pytest
+
+from ..... import oscar as mo
+from .....oscar.core import ActorRef, LocalActorRef
+from ....backends.allocate_strategy import RandomSubPool
+from ....debug import DebugOptions, get_debug_options, set_debug_options
+from ...router import Router
+
+logger = logging.getLogger(__name__)
+
+
+class DummyActor(mo.Actor):
+    def __init__(self, value):
+        super().__init__()
+
+        if value < 0:
+            raise ValueError("value < 0")
+        self.value = value
+
+    @mo.extensible
+    async def add(self, value):
+        if not isinstance(value, int):
+            raise TypeError("add number must be int")
+        self.value += value
+        return self.value
+
+    @add.batch
+    async def add(self, args_list, _kwargs_list):
+        self.value += sum(v[0] for v in args_list)
+        return self.value
+
+    @mo.extensible
+    async def add_ret(self, value):
+        return self.value + value
+
+    @add_ret.batch
+    async def add_ret(self, args_list, _kwargs_list):
+        sum_val = sum(v[0] for v in args_list)
+        return [self.value + sum_val for _ in args_list]
+
+    async def create(self, actor_cls, *args, **kw):
+        kw["address"] = self.address
+        return await mo.create_actor(actor_cls, *args, **kw)
+
+    async def create_ignore(self, actor_cls, *args, **kw):
+        try:
+            return await mo.create_actor(actor_cls, *args, **kw)
+        except ValueError:
+            pass
+
+    async def create_send(self, actor_cls, *args, **kw):
+        method = kw.pop("method")
+        method_args = kw.pop("method_args")
+        ref = await mo.create_actor(actor_cls, *args, **kw)
+        return await getattr(ref, method)(*method_args)
+
+    async def delete(self, value):
+        return await mo.destroy_actor(value)
+
+    async def has(self, value):
+        return await mo.has_actor(value)
+
+    async def send(self, uid, method, *args):
+        actor_ref = await mo.actor_ref(uid, address=self.address)
+        tp = (
+            LocalActorRef
+            if actor_ref.address == self.address and get_debug_options() is None
+            else ActorRef
+        )
+        assert (
+            type(actor_ref) is tp
+        ), f"Expect type of actor ref is {tp}, but got {actor_ref} instead."
+        return await getattr(actor_ref, method)(*args)
+
+    async def tell(self, uid, method, *args):
+        actor_ref = await mo.actor_ref(uid, address=self.address)
+        await getattr(actor_ref, method).tell(*args)
+
+    async def tell_delay(self, uid, method, *args, delay=None):
+        actor_ref = await mo.actor_ref(uid)
+        getattr(actor_ref, method).tell_delay(*args, delay=delay)
+
+    async def send_unpickled(self, value):
+        actor_ref = await mo.actor_ref(value)
+        return await actor_ref.send(lambda x: x)
+
+    async def create_unpickled(self):
+        return await mo.create_actor(DummyActor, lambda x: x, uid="admin-5")
+
+    async def destroy(self):
+        await self.ref().destroy()
+
+    def get_value(self):
+        return self.value
+
+    def get_ref(self):
+        ref = self.ref()
+        tp = LocalActorRef if get_debug_options() is None else ActorRef
+        assert (
+            type(ref) is tp
+        ), f"Expect type of actor ref is {tp}, but got {ref} instead."
+        return ref
+
+
+class RecordActor(mo.Actor):
+    def __init__(self):
+        self._records = []
+
+    def add_record(self, rec):
+        self._records.append(rec)
+
+    def get_records(self):
+        return self._records
+
+
+class CreateDestroyActor(mo.Actor):
+    def __init__(self):
+        self._record_ref = None
+
+    async def __post_create__(self):
+        self._record_ref = await mo.actor_ref(
+            RecordActor.default_uid(), address=self.address
+        )
+        await self._record_ref.add_record(f"create {self.uid}")
+        assert "sth" == await self.ref().echo("sth")
+
+    async def __pre_destroy__(self):
+        await self._record_ref.add_record(f"destroy {self.uid}")
+        assert "sth2" == await self.ref().echo("sth2")
+
+    def echo(self, message):
+        return message
+
+
+class ResourceLockActor(mo.StatelessActor):
+    def __init__(self, count=1):
+        self._sem = asyncio.Semaphore(count)
+        self._requests = deque()
+
+    async def apply(self, val=None):
+        await self._sem.acquire()
+        return val + 1 if val is not None else None
+
+    def release(self):
+        self._sem.release()
+
+
+class PromiseTestActor(mo.Actor):
+    def __init__(self, res_lock_ref):
+        self.res_lock_ref = res_lock_ref
+        self.call_log = []
+
+    async def _apply_step(self, idx, delay):
+        res = None
+        try:
+            self.call_log.append(("A", idx, time.time()))
+            res = yield self.res_lock_ref.apply(idx)
+            assert res == idx + 1
+
+            self.call_log.append(("B", idx, time.time()))
+            yield asyncio.sleep(delay)
+            self.call_log.append(("C", idx, time.time()))
+        finally:
+            yield self.res_lock_ref.release()
+            raise mo.Return(res)
+
+    async def test_promise_call(self, idx, delay=0.1):
+        return self._apply_step(idx, delay)
+
+    async def test_yield_tuple(self, delay=0.1):
+        tp = yield tuple(self._apply_step(idx, delay) for idx in range(4)) + (
+            asyncio.sleep(delay),
+            "PlainString",
+        )
+        raise mo.Return(tp)
+
+    async def async_raiser_func(self):
+        yield asyncio.sleep(0.1)
+        raise ValueError
+
+    async def test_yield_exceptions(self):
+        task = asyncio.create_task(self.ref().async_raiser_func())
+        return task
+
+    async def test_exceptions(self):
+        async def async_raiser():
+            yield asyncio.sleep(0.1)
+            raise SystemError
+
+        try:
+            yield async_raiser(),
+        except SystemError:
+            raise ValueError
+        raise KeyError
+
+    async def test_cancel(self, delay):
+        async def intermediate_error():
+            raise ValueError
+
+        async def task_fun():
+            try:
+                yield intermediate_error()
+            except ValueError:
+                pass
+            try:
+                yield asyncio.sleep(delay)
+            except asyncio.CancelledError:
+                self.call_log.append((time.time(), "CANCELLED"))
+                raise
+
+        self.call_log.append((time.time(), "START"))
+        return task_fun()
+
+    def get_call_log(self):
+        log = self.call_log
+        self.call_log = []
+        return log
+
+
+@pytest.fixture(params=[False, True])
+async def actor_pool(request):
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await mo.create_actor_pool(
+        "127.0.0.1", n_process=2, subprocess_start_method=start_method
+    )
+
+    try:
+        if request.param:
+            set_debug_options(DebugOptions())
+        else:
+            set_debug_options(None)
+
+        await pool.start()
+        yield pool
+        await pool.stop()
+    finally:
+        set_debug_options(None)
+
+
+@pytest.mark.asyncio
+async def test_simple_local_actor_pool(actor_pool):
+    actor_ref = await mo.create_actor(
+        DummyActor, 100, address=actor_pool.external_address
+    )
+    assert await actor_ref.add(1) == 101
+    await actor_ref.add(1)
+
+    res = await actor_ref.get_value()
+    assert res == 102
+
+    ref2 = await actor_ref.get_ref()
+    assert actor_ref.address == ref2.address
+    assert actor_ref.uid == ref2.uid
+
+    ref = await mo.actor_ref(uid=actor_ref.uid, address=actor_pool.external_address)
+    assert await ref.add(2) == 104
+
+
+@pytest.mark.asyncio
+async def test_mars_post_create_pre_destroy(actor_pool):
+    rec_ref = await mo.create_actor(
+        RecordActor, uid=RecordActor.default_uid(), address=actor_pool.external_address
+    )
+    actor_ref = await mo.create_actor(
+        CreateDestroyActor, address=actor_pool.external_address
+    )
+    await actor_ref.destroy()
+
+    records = await rec_ref.get_records()
+    assert len(records) == 2
+    assert records[0].startswith("create")
+    assert records[1].startswith("destroy")
+
+
+@pytest.mark.asyncio
+async def test_mars_create_actor(actor_pool):
+    actor_ref = await mo.create_actor(
+        DummyActor, 1, address=actor_pool.external_address
+    )
+    # create actor inside on_receive
+    r = await actor_ref.create(DummyActor, 5, address=actor_pool.external_address)
+    ref = await mo.actor_ref(r, address=actor_pool.external_address)
+    assert await ref.add(10) == 15
+    # create actor inside on_receive and send message
+    r = await actor_ref.create_send(
+        DummyActor,
+        5,
+        method="add",
+        method_args=(1,),
+        address=actor_pool.external_address,
+    )
+    assert r == 6
+
+
+@pytest.mark.asyncio
+async def test_mars_create_actor_error(actor_pool):
+    ref1 = await mo.create_actor(
+        DummyActor, 1, uid="dummy1", address=actor_pool.external_address
+    )
+    with pytest.raises(mo.ActorAlreadyExist):
+        await mo.create_actor(
+            DummyActor, 1, uid="dummy1", address=actor_pool.external_address
+        )
+    await mo.destroy_actor(ref1)
+
+    with pytest.raises(ValueError):
+        await mo.create_actor(DummyActor, -1, address=actor_pool.external_address)
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    with pytest.raises(ValueError):
+        await ref1.create(DummyActor, -2, address=actor_pool.external_address)
+
+
+@pytest.mark.asyncio
+async def test_mars_send(actor_pool):
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    ref2 = await mo.actor_ref(
+        await ref1.create(DummyActor, 2, address=actor_pool.external_address)
+    )
+    assert await ref1.send(ref2, "add", 3) == 5
+
+    ref3 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    ref4 = await mo.create_actor(
+        DummyActor,
+        2,
+        address=actor_pool.external_address,
+        allocate_strategy=RandomSubPool(),
+    )
+    assert await ref4.send(ref3, "add", 3) == 4
+
+
+@pytest.mark.asyncio
+async def test_mars_send_error(actor_pool):
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    with pytest.raises(TypeError):
+        await ref1.add(1.0)
+    ref2 = await mo.create_actor(DummyActor, 2, address=actor_pool.external_address)
+    with pytest.raises(TypeError):
+        await ref1.send(ref2, "add", 1.0)
+    with pytest.raises(mo.ActorNotExist):
+        await (await mo.actor_ref("fake_uid", address=actor_pool.external_address)).add(
+            1
+        )
+
+
+@pytest.mark.asyncio
+async def test_mars_tell(actor_pool):
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    ref2 = await mo.actor_ref(await ref1.create(DummyActor, 2))
+    await ref1.tell(ref2, "add", 3)
+    assert await ref2.get_value() == 5
+
+    await ref1.tell_delay(ref2, "add", 4, delay=0.5)  # delay 0.5 secs
+    assert await ref2.get_value() == 5
+    await asyncio.sleep(0.45)
+    assert await ref2.get_value() == 5
+    await asyncio.sleep(0.2)
+    assert await ref2.get_value() == 9
+
+    # an error is expected when illegal uids are passed
+    with pytest.raises(ValueError):
+        await ref1.tell(await mo.actor_ref(set()), "add", 3)
+
+
+@pytest.mark.asyncio
+async def test_mars_batch_method(actor_pool):
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    batch_result = await ref1.add_ret.batch(
+        ref1.add_ret.delay(1), ref1.add_ret.delay(2), ref1.add_ret.delay(3)
+    )
+    assert len(batch_result) == 3
+    assert all(r == 7 for r in batch_result)
+
+    await ref1.add.batch(
+        ref1.add.delay(1), ref1.add.delay(2), ref1.add.delay(3), send=False
+    )
+    assert await ref1.get_value() == 7
+
+    with pytest.raises(ValueError):
+        await ref1.add_ret.batch(ref1.add_ret.delay(1), ref1.add.delay(2))
+
+
+@pytest.mark.asyncio
+async def test_gather_exception(actor_pool):
+    try:
+        Router.get_instance_or_empty()._cache.clear()
+        ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+        router = Router.get_instance_or_empty()
+        client = next(iter(router._cache.values()))
+
+        future = asyncio.Future()
+        client_channel = client.channel
+
+        class FakeChannel(type(client_channel)):
+            def __init__(self):
+                pass
+
+            def __getattr__(self, item):
+                return getattr(client_channel, item)
+
+            async def recv(self):
+                return await future
+
+        client.channel = FakeChannel()
+
+        class MyException(Exception):
+            pass
+
+        await ref1.add(1)
+        tasks = [ref1.add(i) for i in range(200)]
+        future.set_exception(MyException("Test recv exception!!"))
+        with pytest.raises(MyException) as ex:
+            await asyncio.gather(*tasks)
+        s = traceback.format_tb(ex.tb)
+        assert 10 > "\n".join(s).count("send") > 0
+    finally:
+        Router.get_instance_or_empty()._cache.clear()
+
+
+@pytest.mark.asyncio
+async def test_mars_destroy_has_actor(actor_pool):
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    ref2 = await mo.actor_ref(ref1)
+    ref2_add_method = ref2.add
+    assert isinstance(ref1, ActorRef)
+    assert await mo.has_actor(ref2)
+    await mo.destroy_actor(ref2)
+    assert not await mo.has_actor(ref1)
+    assert not await mo.has_actor(ref2)
+
+    if isinstance(ref2, LocalActorRef):
+        assert "weakref" in str(ref2)
+        assert "dead" in str(ref2)
+
+    # an error is expected when illegal uids are passed
+    with pytest.raises(ValueError):
+        await mo.has_actor(await mo.actor_ref(set()))
+
+    with pytest.raises(mo.ActorNotExist):
+        await ref2.add(1)
+
+    with pytest.raises(mo.ActorNotExist):
+        await ref2_add_method(1)
+
+    ref1 = await mo.create_actor(
+        DummyActor, 1, uid=ref1.uid, address=actor_pool.external_address
+    )
+
+    # ref2 should still work after the actor is recreated.
+    assert await ref2.add(1) == 2
+    # the ref2 method should still work after the actor is recreated.
+    assert await ref2_add_method(1) == 3
+
+    assert isinstance(ref2, ActorRef)
+    assert await mo.has_actor(ref1)
+    await mo.destroy_actor(ref1)
+    assert not await mo.has_actor(ref1)
+    assert not await mo.has_actor(ref2)
+
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    ref2 = await ref1.create(DummyActor, 2, address=actor_pool.external_address)
+
+    assert await mo.has_actor(ref2)
+
+    await ref1.delete(ref2)
+    assert not await ref1.has(ref2)
+
+    with pytest.raises(mo.ActorNotExist):
+        await mo.destroy_actor(
+            await mo.actor_ref("fake_uid", address=actor_pool.external_address)
+        )
+
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    with pytest.raises(mo.ActorNotExist):
+        await ref1.delete(
+            await mo.actor_ref("fake_uid", address=actor_pool.external_address)
+        )
+
+    # test self destroy
+    ref1 = await mo.create_actor(DummyActor, 2, address=actor_pool.external_address)
+    await ref1.destroy()
+    assert not await mo.has_actor(ref1)
+
+
+@pytest.mark.asyncio
+async def test_mars_resource_lock(actor_pool):
+    ref = await mo.create_actor(ResourceLockActor, address=actor_pool.external_address)
+    event_list = []
+
+    async def test_task(idx):
+        await ref.apply()
+        event_list.append(("A", idx, time.time()))
+        await asyncio.sleep(0.1)
+        event_list.append(("B", idx, time.time()))
+        await ref.release()
+
+    tasks = [asyncio.create_task(test_task(idx)) for idx in range(4)]
+    await asyncio.wait(tasks)
+
+    for idx in range(0, len(event_list), 2):
+        event_pair = event_list[idx : idx + 2]
+        assert (event_pair[0][0], event_pair[1][0]) == ("A", "B")
+        assert event_pair[0][1] == event_pair[1][1]
+
+
+@pytest.mark.asyncio
+async def test_promise_chain(actor_pool):
+    lock_ref = await mo.create_actor(
+        ResourceLockActor, 2, address=actor_pool.external_address
+    )
+    promise_test_ref = await mo.create_actor(
+        PromiseTestActor, lock_ref, address=actor_pool.external_address
+    )
+
+    delay_val = 1.0
+
+    start_time = time.time()
+    tasks = [
+        asyncio.create_task(promise_test_ref.test_promise_call(idx, delay=delay_val))
+        for idx in range(4)
+    ]
+    await asyncio.gather(*tasks)
+
+    logs = pd.DataFrame(
+        await promise_test_ref.get_call_log(), columns=["group", "idx", "time"]
+    )
+    logs.time -= start_time
+    assert logs.query('group == "A"').time.max() < delay_val / 2
+    max_apply_time = (
+        logs.query('group == "A" | group == "B"')
+        .groupby("idx")
+        .apply(lambda s: s.time.max() - s.time.min())
+        .max()
+    )
+    assert max_apply_time > delay_val / 2
+    max_delay_time = (
+        logs.query('group == "B" | group == "C"')
+        .groupby("idx")
+        .apply(lambda s: s.time.max() - s.time.min())
+        .max()
+    )
+    assert max_delay_time > delay_val / 2
+
+    start_time = time.time()
+    ret = await promise_test_ref.test_yield_tuple(delay=delay_val)
+    assert set(ret) == {1, 2, 3, 4, None, "PlainString"}
+
+    logs = pd.DataFrame(
+        await promise_test_ref.get_call_log(), columns=["group", "idx", "time"]
+    )
+    logs.time -= start_time
+    assert logs.query('group == "A"').time.max() < delay_val / 2
+    max_apply_time = (
+        logs.query('group == "A" | group == "B"')
+        .groupby("idx")
+        .apply(lambda s: s.time.max() - s.time.min())
+        .max()
+    )
+    assert max_apply_time > delay_val / 2
+    max_delay_time = (
+        logs.query('group == "B" | group == "C"')
+        .groupby("idx")
+        .apply(lambda s: s.time.max() - s.time.min())
+        .max()
+    )
+    assert max_delay_time > delay_val / 2
+
+    with pytest.raises(ValueError):
+        await promise_test_ref.test_exceptions()
+    with pytest.raises(ValueError):
+        await promise_test_ref.test_yield_exceptions()
+
+    with pytest.raises(asyncio.CancelledError):
+        task = asyncio.create_task(promise_test_ref.test_cancel(5))
+        await asyncio.sleep(0.1)
+        task.cancel()
+        await task
+    call_log = await promise_test_ref.get_call_log()
+    assert len(call_log) == 2
+    assert call_log[1][0] - call_log[0][0] < 1
+
+
+class ActorCannotDestroy(mo.Actor):
+    async def __pre_destroy__(self):
+        raise ValueError("Cannot destroy")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("in_sub_pool", [True, False])
+async def test_error_in_pre_destroy(actor_pool, in_sub_pool):
+    pool = actor_pool
+
+    strategy = None if not in_sub_pool else RandomSubPool()
+    a = await mo.create_actor(
+        ActorCannotDestroy, address=pool.external_address, strategy=strategy
+    )
+    with pytest.raises(ValueError, match="Cannot destroy"):
+        await mo.destroy_actor(a)
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/test_pool.py b/python/xorbits/_mars/oscar/backends/mars/tests/test_pool.py
new file mode 100644
index 000000000..0e0281ce3
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/test_pool.py
@@ -0,0 +1,972 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+import os
+import re
+import sys
+import time
+
+import pytest
+
+from .....tests.core import mock, require_ucx
+from .....utils import get_next_port
+from .... import Actor, create_actor_ref, kill_actor
+from ....context import get_context
+from ....errors import ActorNotExist, NoIdleSlot, SendMessageFailed, ServerClosed
+from ...allocate_strategy import (
+    AddressSpecified,
+    IdleLabel,
+    MainPool,
+    ProcessIndex,
+    RandomSubPool,
+)
+from ...config import ActorPoolConfig
+from ...message import (
+    ActorRefMessage,
+    CancelMessage,
+    ControlMessage,
+    ControlMessageType,
+    CreateActorMessage,
+    DestroyActorMessage,
+    ErrorMessage,
+    HasActorMessage,
+    MessageType,
+    SendMessage,
+    TellMessage,
+    new_message_id,
+)
+from ...pool import create_actor_pool
+from ...router import Router
+from ..pool import MainActorPool, SubActorPool
+
+
+class _CannotBePickled:
+    def __getstate__(self):
+        raise RuntimeError("cannot pickle")
+
+
+class _CannotBeUnpickled:
+    def __getstate__(self):
+        return ()
+
+    def __setstate__(self, state):
+        raise RuntimeError("cannot unpickle")
+
+
+class TestActor(Actor):
+    __test__ = False
+
+    def __init__(self):
+        self.value = 0
+
+    def add(self, val):
+        self.value += val
+        return self.value
+
+    async def add_other(self, ref, val):
+        self.value += await ref.add(val)
+        return self.value
+
+    async def sleep(self, second):
+        try:
+            await asyncio.sleep(second)
+            return self.value
+        except asyncio.CancelledError:
+            return self.value + 1
+
+    def return_cannot_unpickle(self):
+        return _CannotBeUnpickled()
+
+    def raise_cannot_pickle(self):
+        raise ValueError(_CannotBePickled())
+
+
+def _add_pool_conf(
+    config: ActorPoolConfig,
+    process_index: int,
+    label: str,
+    internal_address: str,
+    external_address: str,
+    env: dict = None,
+):
+    if sys.platform.startswith("win"):
+        config.add_pool_conf(
+            process_index, label, external_address, external_address, env=env
+        )
+    else:
+        config.add_pool_conf(
+            process_index, label, internal_address, external_address, env=env
+        )
+
+
+def _raise_if_error(message):
+    if message.message_type == MessageType.error:
+        raise message.error.with_traceback(message.traceback)
+
+
+@pytest.fixture(autouse=True)
+def clear_routers():
+    yield
+    Router.set_instance(None)
+
+
+@pytest.mark.asyncio
+@mock.patch("mars.oscar.backends.mars.pool.SubActorPool.notify_main_pool_to_create")
+@mock.patch("mars.oscar.backends.mars.pool.SubActorPool.notify_main_pool_to_destroy")
+async def test_sub_actor_pool(notify_main_pool_to_create, notify_main_pool_to_destroy):
+    notify_main_pool_to_create.return_value = None
+    notify_main_pool_to_destroy.return_value = None
+    config = ActorPoolConfig()
+
+    ext_address0 = f"127.0.0.1:{get_next_port()}"
+    ext_address1 = f"127.0.0.1:{get_next_port()}"
+    _add_pool_conf(config, 0, "main", "unixsocket:///0", ext_address0)
+    _add_pool_conf(config, 1, "sub", "unixsocket:///1", ext_address1)
+
+    pool = await SubActorPool.create({"actor_pool_config": config, "process_index": 1})
+    await pool.start()
+
+    try:
+        create_actor_message = CreateActorMessage(
+            new_message_id(),
+            TestActor,
+            b"test",
+            tuple(),
+            dict(),
+            AddressSpecified(pool.external_address),
+        )
+        message = await pool.create_actor(create_actor_message)
+        assert message.message_type == MessageType.result
+        actor_ref = message.result
+        assert actor_ref.address == pool.external_address
+        assert actor_ref.uid == b"test"
+
+        has_actor_message = HasActorMessage(new_message_id(), actor_ref)
+        assert (await pool.has_actor(has_actor_message)).result is True
+
+        actor_ref_message = ActorRefMessage(new_message_id(), actor_ref)
+        assert (await pool.actor_ref(actor_ref_message)).result == actor_ref
+
+        tell_message = TellMessage(
+            new_message_id(), actor_ref, ("add", 0, (1,), dict())
+        )
+        message = await pool.tell(tell_message)
+        assert message.result is None
+
+        send_message = SendMessage(
+            new_message_id(), actor_ref, ("add", 0, (3,), dict())
+        )
+        message = await pool.send(send_message)
+        assert message.result == 4
+
+        # test error message
+        # type mismatch
+        send_message = SendMessage(
+            new_message_id(), actor_ref, ("add", 0, ("3",), dict())
+        )
+        result = await pool.send(send_message)
+        assert result.message_type == MessageType.error
+        assert isinstance(result.error, TypeError)
+
+        send_message = SendMessage(
+            new_message_id(),
+            create_actor_ref(actor_ref.address, "non_exist"),
+            ("add", 0, (3,), dict()),
+        )
+        result = await pool.send(send_message)
+        assert isinstance(result.error, ActorNotExist)
+
+        # test send message and cancel it
+        send_message = SendMessage(
+            new_message_id(), actor_ref, ("sleep", 0, (20,), dict())
+        )
+        result_task = asyncio.create_task(pool.send(send_message))
+        await asyncio.sleep(0)
+        start = time.time()
+        cancel_message = CancelMessage(
+            new_message_id(), actor_ref.address, send_message.message_id
+        )
+        cancel_task = asyncio.create_task(pool.cancel(cancel_message))
+        result = await asyncio.wait_for(cancel_task, 3)
+        assert result.message_type == MessageType.result
+        assert result.result is True
+        result = await result_task
+        # test time
+        assert time.time() - start < 3
+        assert result.message_type == MessageType.result
+        assert result.result == 5
+
+        # test processing messages in the background
+        async with await pool.router.get_client(pool.external_address) as client:
+            send_message = SendMessage(
+                new_message_id(), actor_ref, ("add", 0, (5,), dict())
+            )
+            await client.send(send_message)
+            result = await client.recv()
+            _raise_if_error(result)
+            assert result.result == 9
+
+            send_message = SendMessage(
+                new_message_id(), actor_ref, ("add", 0, ("5",), dict())
+            )
+            await client.send(send_message)
+            result = await client.recv()
+            assert isinstance(result.error, TypeError)
+
+        destroy_actor_message = DestroyActorMessage(new_message_id(), actor_ref)
+        message = await pool.destroy_actor(destroy_actor_message)
+        assert message.result == actor_ref.uid
+
+        # destroying the same actor again should fail
+        message = await pool.destroy_actor(destroy_actor_message)
+        assert isinstance(message.error, ActorNotExist)
+
+        message = await pool.has_actor(has_actor_message)
+        assert not message.result
+
+        # test sync config
+        _add_pool_conf(
+            config, 1, "sub", "unixsocket:///1", f"127.0.0.1:{get_next_port()}"
+        )
+        sync_config_message = ControlMessage(
+            new_message_id(), "", ControlMessageType.sync_config, config
+        )
+        message = await pool.handle_control_command(sync_config_message)
+        assert message.result is True
+
+        # test get config
+        get_config_message = ControlMessage(
+            new_message_id(), "", ControlMessageType.get_config, None
+        )
+        message = await pool.handle_control_command(get_config_message)
+        config2 = message.result
+        assert config.as_dict() == config2.as_dict()
+
+        assert pool.router._mapping == Router.get_instance()._mapping
+        assert (
+            pool.router._curr_external_addresses
+            == Router.get_instance()._curr_external_addresses
+        )
+
+        stop_message = ControlMessage(
+            new_message_id(), "", ControlMessageType.stop, None
+        )
+        message = await pool.handle_control_command(stop_message)
+        assert message.result is True
+
+        await pool.join(0.05)
+        assert pool.stopped
+    finally:
+        await pool.stop()
+
+
+@pytest.mark.asyncio
+async def test_fail_when_create_subpool():
+    config = ActorPoolConfig()
+    my_label = "computation"
+    main_address = f"127.0.0.1:{get_next_port()}"
+    port = get_next_port()
+    _add_pool_conf(config, 0, "main", "unixsocket:///0", main_address)
+
+    # use the same port for both sub pools, which will raise `OSError` with "address already in use"
+    _add_pool_conf(
+        config, 1, my_label, "unixsocket:///1", f"127.0.0.1:{port}", env={"my_env": "1"}
+    )
+    _add_pool_conf(config, 2, my_label, "unixsocket:///2", f"127.0.0.1:{port}")
+
+    with pytest.raises(OSError):
+        await MainActorPool.create({"actor_pool_config": config})
+
+
+@pytest.mark.asyncio
+async def test_main_actor_pool():
+    config = ActorPoolConfig()
+    my_label = "computation"
+    main_address = f"127.0.0.1:{get_next_port()}"
+    _add_pool_conf(config, 0, "main", "unixsocket:///0", main_address)
+    _add_pool_conf(
+        config,
+        1,
+        my_label,
+        "unixsocket:///1",
+        f"127.0.0.1:{get_next_port()}",
+        env={"my_env": "1"},
+    )
+    _add_pool_conf(
+        config, 2, my_label, "unixsocket:///2", f"127.0.0.1:{get_next_port()}"
+    )
+
+    strategy = IdleLabel(my_label, "my_test")
+
+    async with await MainActorPool.create({"actor_pool_config": config}) as pool:
+        create_actor_message = CreateActorMessage(
+            new_message_id(), TestActor, b"test", tuple(), dict(), MainPool()
+        )
+        message = await pool.create_actor(create_actor_message)
+        actor_ref = message.result
+        assert actor_ref.address == main_address
+
+        create_actor_message1 = CreateActorMessage(
+            new_message_id(), TestActor, b"test1", tuple(), dict(), strategy
+        )
+        message1 = await pool.create_actor(create_actor_message1)
+        actor_ref1 = message1.result
+        assert actor_ref1.address in config.get_external_addresses(my_label)
+
+        create_actor_message2 = CreateActorMessage(
+            new_message_id(), TestActor, b"test2", tuple(), dict(), strategy
+        )
+        message2 = await pool.create_actor(create_actor_message2)
+        actor_ref2 = message2.result
+        assert actor_ref2.address in config.get_external_addresses(my_label)
+        assert actor_ref2.address != actor_ref1.address
+
+        create_actor_message3 = CreateActorMessage(
+            new_message_id(), TestActor, b"test3", tuple(), dict(), strategy
+        )
+        message3 = await pool.create_actor(create_actor_message3)
+        # no idle slot left to allocate for the same label
+        assert isinstance(message3.error, NoIdleSlot)
+
+        has_actor_message = HasActorMessage(
+            new_message_id(), create_actor_ref(main_address, b"test2")
+        )
+        assert (await pool.has_actor(has_actor_message)).result is True
+
+        actor_ref_message = ActorRefMessage(
+            new_message_id(), create_actor_ref(main_address, b"test2")
+        )
+        assert (await pool.actor_ref(actor_ref_message)).result == actor_ref2
+
+        # tell
+        tell_message = TellMessage(
+            new_message_id(), actor_ref1, ("add", 0, (2,), dict())
+        )
+        message = await pool.tell(tell_message)
+        assert message.result is None
+
+        # send
+        send_message = SendMessage(
+            new_message_id(), actor_ref1, ("add", 0, (4,), dict())
+        )
+        assert (await pool.send(send_message)).result == 6
+
+        # test error message
+        # type mismatch
+        send_message = SendMessage(
+            new_message_id(), actor_ref1, ("add", 0, ("3",), dict())
+        )
+        result = await pool.send(send_message)
+        assert isinstance(result.error, TypeError)
+
+        # send and tell to main process
+        tell_message = TellMessage(
+            new_message_id(), actor_ref, ("add", 0, (2,), dict())
+        )
+        message = await pool.tell(tell_message)
+        assert message.result is None
+        send_message = SendMessage(
+            new_message_id(), actor_ref, ("add", 0, (4,), dict())
+        )
+        assert (await pool.send(send_message)).result == 6
+
+        # send and cancel
+        send_message = SendMessage(
+            new_message_id(), actor_ref1, ("sleep", 0, (20,), dict())
+        )
+        result_task = asyncio.create_task(pool.send(send_message))
+        start = time.time()
+        cancel_message = CancelMessage(
+            new_message_id(), actor_ref1.address, send_message.message_id
+        )
+        cancel_task = asyncio.create_task(pool.cancel(cancel_message))
+        result = await asyncio.wait_for(cancel_task, 3)
+        assert result.message_type == MessageType.result
+        assert result.result is True
+        result = await result_task
+        assert time.time() - start < 3
+        assert result.message_type == MessageType.result
+        assert result.result == 7
+
+        # destroy
+        destroy_actor_message = DestroyActorMessage(new_message_id(), actor_ref1)
+        message = await pool.destroy_actor(destroy_actor_message)
+        assert message.result == actor_ref1.uid
+
+        tell_message = TellMessage(
+            new_message_id(), actor_ref1, ("add", 0, (2,), dict())
+        )
+        message = await pool.tell(tell_message)
+        assert isinstance(message, ErrorMessage)
+
+        # destroy by connecting to the sub pool directly
+        async with await pool.router.get_client(
+            config.get_external_addresses()[-1]
+        ) as client:
+            destroy_actor_message = DestroyActorMessage(new_message_id(), actor_ref2)
+            await client.send(destroy_actor_message)
+            result = await client.recv()
+            _raise_if_error(result)
+            assert result.result == actor_ref2.uid
+
+        # test sync config
+        config.add_pool_conf(
+            3, "sub", "unixsocket:///3", f"127.0.0.1:{get_next_port()}"
+        )
+        sync_config_message = ControlMessage(
+            new_message_id(),
+            pool.external_address,
+            ControlMessageType.sync_config,
+            config,
+        )
+        message = await pool.handle_control_command(sync_config_message)
+        assert message.result is True
+
+        # test get config
+        get_config_message = ControlMessage(
+            new_message_id(),
+            config.get_external_addresses()[1],
+            ControlMessageType.get_config,
+            None,
+        )
+        message = await pool.handle_control_command(get_config_message)
+        config2 = message.result
+        assert config.as_dict() == config2.as_dict()
+
+    assert pool.stopped
+
+
+@pytest.mark.asyncio
+async def test_create_actor_pool():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+    )
+
+    async with pool:
+        # test global router
+        global_router = Router.get_instance()
+        # the global router should not be identical to the pool's router
+        assert global_router is not pool.router
+        assert pool.external_address in global_router._curr_external_addresses
+        assert pool.external_address in global_router._mapping
+
+        ctx = get_context()
+
+        # actor on main pool
+        actor_ref = await ctx.create_actor(
+            TestActor, uid="test-1", address=pool.external_address
+        )
+        assert await actor_ref.add(3) == 3
+        assert await actor_ref.add(1) == 4
+        assert (await ctx.has_actor(actor_ref)) is True
+        assert (await ctx.actor_ref(actor_ref)) == actor_ref
+        # test cancel
+        task = asyncio.create_task(actor_ref.sleep(20))
+        await asyncio.sleep(0)
+        task.cancel()
+        assert await task == 5
+        await ctx.destroy_actor(actor_ref)
+        assert (await ctx.has_actor(actor_ref)) is False
+        for f in actor_ref.add, ctx.actor_ref, ctx.destroy_actor:
+            with pytest.raises(ActorNotExist):
+                await f(actor_ref)
+
+        # actor on sub pool
+        actor_ref1 = await ctx.create_actor(
+            TestActor, uid="test-main", address=pool.external_address
+        )
+        actor_ref2 = await ctx.create_actor(
+            TestActor,
+            uid="test-2",
+            address=pool.external_address,
+            allocate_strategy=RandomSubPool(),
+        )
+        assert (
+            await ctx.actor_ref(uid="test-2", address=actor_ref2.address)
+        ) == actor_ref2
+        main_ref = await ctx.actor_ref(uid="test-main", address=actor_ref2.address)
+        assert main_ref.address == pool.external_address
+        main_ref = await ctx.actor_ref(actor_ref1)
+        assert main_ref.address == pool.external_address
+        assert actor_ref2.address != actor_ref.address
+        assert await actor_ref2.add(3) == 3
+        assert await actor_ref2.add(1) == 4
+        with pytest.raises(RuntimeError):
+            await actor_ref2.return_cannot_unpickle()
+        with pytest.raises(SendMessageFailed):
+            await actor_ref2.raise_cannot_pickle()
+        assert (await ctx.has_actor(actor_ref2)) is True
+        assert (await ctx.actor_ref(actor_ref2)) == actor_ref2
+        # test cancel
+        task = asyncio.create_task(actor_ref2.sleep(20))
+        start = time.time()
+        await asyncio.sleep(0)
+        task.cancel()
+        assert await task == 5
+        assert time.time() - start < 3
+        await ctx.destroy_actor(actor_ref2)
+        assert (await ctx.has_actor(actor_ref2)) is False
+
+    assert pool.stopped
+    # after pool shutdown, the global router must have been cleaned up
+    global_router = Router.get_instance()
+    assert len(global_router._curr_external_addresses) == 0
+    assert len(global_router._mapping) == 0
+
+
+@pytest.mark.asyncio
+async def test_errors():
+    with pytest.raises(ValueError):
+        _ = await create_actor_pool(
+            "127.0.0.1", pool_cls=MainActorPool, n_process=1, labels=["a"]
+        )
+
+    with pytest.raises(ValueError):
+        _ = await create_actor_pool(
+            f"127.0.0.1:{get_next_port()}",
+            pool_cls=MainActorPool,
+            n_process=1,
+            ports=[get_next_port(), get_next_port()],
+        )
+
+    with pytest.raises(ValueError):
+        _ = await create_actor_pool(
+            "127.0.0.1", pool_cls=MainActorPool, n_process=1, ports=[get_next_port()]
+        )
+
+    with pytest.raises(ValueError):
+        _ = await create_actor_pool(
+            "127.0.0.1", pool_cls=MainActorPool, n_process=1, auto_recover="illegal"
+        )
+
+    with pytest.raises(ValueError, match="external_address_schemes"):
+        _ = await create_actor_pool(
+            "127.0.0.1",
+            pool_cls=MainActorPool,
+            n_process=1,
+            external_address_schemes=["ucx"],
+        )
+
+    with pytest.raises(ValueError, match="enable_internal_addresses"):
+        _ = await create_actor_pool(
+            "127.0.0.1",
+            pool_cls=MainActorPool,
+            n_process=1,
+            enable_internal_addresses=[True],
+        )
+
+
+@pytest.mark.asyncio
+async def test_server_closed():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+        auto_recover=False,
+    )
+
+    ctx = get_context()
+
+    async with pool:
+        actor_ref = await ctx.create_actor(
+            TestActor, address=pool.external_address, allocate_strategy=ProcessIndex(1)
+        )
+
+        # check that the error is raised properly when the subprocess is killed
+        task = asyncio.create_task(actor_ref.sleep(10))
+        await asyncio.sleep(0)
+
+        # kill subprocess 1
+        process = list(pool._sub_processes.values())[0]
+        process.kill()
+        process.join()
+
+        with pytest.raises(ServerClosed):
+            # the process has already been killed,
+            # so ServerClosed will be raised
+            await task
+
+        assert not process.is_alive()
+
+    with pytest.raises(RuntimeError):
+        await pool.start()
+
+    # test server unreachable
+    with pytest.raises(ConnectionError):
+        await ctx.has_actor(actor_ref)
+
+
+@pytest.mark.asyncio
+@pytest.mark.skipif(sys.platform.startswith("win"), reason="skip under Windows")
+@pytest.mark.parametrize("auto_recover", [False, True, "actor", "process"])
+async def test_auto_recover(auto_recover):
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    recovered = asyncio.Event()
+
+    def on_process_recover(*_):
+        recovered.set()
+
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+        auto_recover=auto_recover,
+        on_process_recover=on_process_recover,
+    )
+
+    async with pool:
+        ctx = get_context()
+
+        # waiting for recovery of the main pool always returns immediately
+        await ctx.wait_actor_pool_recovered(
+            pool.external_address, pool.external_address
+        )
+
+        # create actor on main
+        actor_ref = await ctx.create_actor(
+            TestActor, address=pool.external_address, allocate_strategy=MainPool()
+        )
+
+        with pytest.raises(ValueError):
+            # cannot kill actors on main pool
+            await kill_actor(actor_ref)
+
+        # create actor
+        actor_ref = await ctx.create_actor(
+            TestActor, address=pool.external_address, allocate_strategy=ProcessIndex(1)
+        )
+        # kill_actor will kill the corresponding process
+        await ctx.kill_actor(actor_ref)
+
+        if auto_recover:
+            # process must have been killed
+            await ctx.wait_actor_pool_recovered(
+                actor_ref.address, pool.external_address
+            )
+            assert recovered.is_set()
+
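+            # "actor" and True also recover the actors themselves; "process"
+            # only restarts the process, so the actor no longer exists afterwards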
+            expect_has_actor = auto_recover in ["actor", True]
+            assert await ctx.has_actor(actor_ref) is expect_has_actor
+        else:
+            with pytest.raises((ServerClosed, ConnectionError)):
+                await ctx.has_actor(actor_ref)
+
+
+@pytest.mark.parametrize(
+    "exception_config",
+    [
+        (Exception("recover exception"), False),
+        (asyncio.CancelledError("cancel monitor"), True),
+    ],
+)
+@pytest.mark.asyncio
+async def test_monitor_sub_pool_exception(exception_config):
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    recovered = asyncio.Event()
+    exception, done = exception_config
+
+    def on_process_recover(*_):
+        recovered.set()
+        raise exception
+
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+        on_process_recover=on_process_recover,
+    )
+
+    async with pool:
+        ctx = get_context()
+        task = await pool.start_monitor()
+
+        # create actor
+        actor_ref = await ctx.create_actor(
+            TestActor, address=pool.external_address, allocate_strategy=ProcessIndex(1)
+        )
+        # kill_actor will kill the corresponding process
+        await ctx.kill_actor(actor_ref)
+
+        await recovered.wait()
+        assert task.done() is done
+
+
+@pytest.mark.asyncio
+async def test_two_pools():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+
+    ctx = get_context()
+
+    pool1 = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+    )
+    pool2 = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+    )
+
+    def is_interprocess_address(addr):
+        if sys.platform.startswith("win"):
+            return re.match(r"127\.0\.0\.1:\d+", addr)
+        else:
+            return addr.startswith("unixsocket://")
+
+    try:
+        actor_ref1 = await ctx.create_actor(
+            TestActor, address=pool1.external_address, allocate_strategy=MainPool()
+        )
+        assert actor_ref1.address == pool1.external_address
+        assert await actor_ref1.add(1) == 1
+        assert (
+            Router.get_instance()
+            .get_internal_address(actor_ref1.address)
+            .startswith("dummy://")
+        )
+
+        actor_ref2 = await ctx.create_actor(
+            TestActor, address=pool1.external_address, allocate_strategy=RandomSubPool()
+        )
+        assert actor_ref2.address in pool1._config.get_external_addresses()[1:]
+        assert await actor_ref2.add(3) == 3
+        assert is_interprocess_address(
+            Router.get_instance().get_internal_address(actor_ref2.address)
+        )
+
+        actor_ref3 = await ctx.create_actor(
+            TestActor, address=pool2.external_address, allocate_strategy=MainPool()
+        )
+        assert actor_ref3.address == pool2.external_address
+        assert await actor_ref3.add(5) == 5
+        assert (
+            Router.get_instance()
+            .get_internal_address(actor_ref3.address)
+            .startswith("dummy://")
+        )
+
+        actor_ref4 = await ctx.create_actor(
+            TestActor, address=pool2.external_address, allocate_strategy=RandomSubPool()
+        )
+        assert actor_ref4.address in pool2._config.get_external_addresses()[1:]
+        assert await actor_ref4.add(7) == 7
+        assert is_interprocess_address(
+            Router.get_instance().get_internal_address(actor_ref4.address)
+        )
+
+        assert await actor_ref2.add_other(actor_ref4, 3) == 13
+    finally:
+        await pool1.stop()
+        await pool2.stop()
+
+
+@pytest.mark.asyncio
+async def test_parallel_allocate_idle_label():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+        labels=[None, "my_label", "my_label"],
+    )
+
+    class _Actor(Actor):
+        def get_pid(self):
+            return os.getpid()
+
+    async with pool:
+        ctx = get_context()
+        strategy = IdleLabel("my_label", "tests")
+        tasks = [
+            ctx.create_actor(
+                _Actor, allocate_strategy=strategy, address=pool.external_address
+            ),
+            ctx.create_actor(
+                _Actor, allocate_strategy=strategy, address=pool.external_address
+            ),
+        ]
+        refs = await asyncio.gather(*tasks)
+        # the two actors must be allocated to different processes, so their pids should differ
+        assert len({await ref.get_pid() for ref in refs}) == 2
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "logging_conf",
+    [
+        {
+            "file": os.path.join(
+                os.path.dirname(os.path.abspath(__file__)), "test-logging.conf"
+            )
+        },
+        {"level": logging.DEBUG},
+        {"level": logging.DEBUG, "format": "%(asctime)s %(message)s"},
+    ],
+)
+async def test_logging_config(logging_conf):
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=1,
+        subprocess_start_method=start_method,
+        labels=[None, "my_label"],
+        logging_conf=logging_conf,
+    )
+
+    class _Actor(Actor):
+        def get_logger_level(self):
+            logger = logging.getLogger(__name__)
+            return logger.getEffectiveLevel()
+
+    async with pool:
+        ctx = get_context()
+        strategy = IdleLabel("my_label", "tests")
+        ref = await ctx.create_actor(
+            _Actor, allocate_strategy=strategy, address=pool.external_address
+        )
+        assert await ref.get_logger_level() == logging.DEBUG
+
+
+@pytest.mark.asyncio
+async def test_ref_sub_pool_actor():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=1,
+        subprocess_start_method=start_method,
+    )
+
+    async with pool:
+        ctx = get_context()
+        ref1 = await ctx.create_actor(
+            TestActor, address=pool.external_address, allocate_strategy=RandomSubPool()
+        )
+        sub_address = ref1.address
+        ref2 = await ctx.create_actor(TestActor, address=sub_address)
+        ref2_main = await ctx.actor_ref(ref2.uid, address=pool.external_address)
+        assert ref2_main.address == sub_address
+
+        await ctx.destroy_actor(create_actor_ref(pool.external_address, ref2.uid))
+        assert not await ctx.has_actor(
+            create_actor_ref(pool.external_address, ref2.uid)
+        )
+        assert not await ctx.has_actor(create_actor_ref(sub_address, ref2.uid))
+
+
+class TestUCXActor(Actor):
+    __test__ = False
+
+    def __init__(self, init_val: int):
+        self._init_val = init_val
+
+    def verify(self, enabled_internal_addr: bool):
+        router = Router.get_instance()
+        assert router.external_address.startswith("ucx")
+        assert len(router._mapping) > 0
+        if not enabled_internal_addr:
+            # no internal address
+            assert all(v is None for v in router._mapping.values())
+        else:
+            assert all(v is not None for v in router._mapping.values())
+
+    def add(self, n: int):
+        return self._init_val + n
+
+    async def foo(self, ref, n: int):
+        assert self.address != ref.address
+        return self._init_val + await ref.add(n)
+
+
+@require_ucx
+@pytest.mark.asyncio
+@pytest.mark.parametrize("enable_internal_addr", [False, True])
+async def test_ucx(enable_internal_addr: bool):
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+        external_address_schemes=["ucx"] * 3,
+        enable_internal_addresses=[enable_internal_addr] * 3,
+    )
+
+    async with pool:
+        ctx = get_context()
+        ref1 = await ctx.create_actor(
+            TestUCXActor,
+            1,
+            address=pool.external_address,
+            allocate_strategy=ProcessIndex(0),
+        )
+        await ref1.verify(enable_internal_addr)
+        ref2 = await ctx.create_actor(
+            TestUCXActor,
+            2,
+            address=pool.external_address,
+            allocate_strategy=ProcessIndex(1),
+        )
+        assert await ref1.foo(ref2, 3) == 6
diff --git a/python/xorbits/_mars/oscar/backends/message.pyi b/python/xorbits/_mars/oscar/backends/message.pyi
new file mode 100644
index 000000000..dac3aff7b
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/message.pyi
@@ -0,0 +1,214 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from types import TracebackType
+from typing import Any, Type
+
+from ..core import ActorRef
+
+DEFAULT_PROTOCOL: int = 0
+
+class MessageType(Enum):
+    control = 0
+    result = 1
+    error = 2
+    create_actor = 3
+    destroy_actor = 4
+    has_actor = 5
+    actor_ref = 6
+    send = 7
+    tell = 8
+    cancel = 9
+
+class ControlMessageType(Enum):
+    stop = 0
+    restart = 1
+    sync_config = 2
+    get_config = 3
+    wait_pool_recovered = 4
+    add_sub_pool_actor = 5
+
+class _MessageBase:
+    message_type: MessageType
+    protocol: int
+    message_id: bytes
+    message_trace: list
+    profiling_context: Any
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+        profiling_context: Any = None,
+    ): ...
+    def __repr__(self): ...
+
+class ControlMessage(_MessageBase):
+    message_type = MessageType.control
+
+    address: str
+    control_message_type: ControlMessageType
+    content: Any
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        address: str = None,
+        control_message_type: ControlMessageType = None,
+        content: Any = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class ResultMessage(_MessageBase):
+    message_type = MessageType.result
+
+    result: Any
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        result: Any = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+        profiling_context: Any = None,
+    ): ...
+
+class ErrorMessage(_MessageBase):
+    message_type = MessageType.error
+
+    address: str
+    pid: int
+    error_type: Type
+    error: BaseException
+    traceback: TracebackType
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        address: str = None,
+        pid: int = -1,
+        error_type: Type[BaseException] = None,
+        error: BaseException = None,
+        traceback: TracebackType = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+    def as_instanceof_cause(self) -> BaseException: ...
+
+class CreateActorMessage(_MessageBase):
+    message_type = MessageType.create_actor
+
+    actor_cls: Type
+    actor_id: bytes
+    args: tuple
+    kwargs: dict
+    allocate_strategy: Any
+    from_main: bool
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        actor_cls: Type = None,
+        actor_id: bytes = None,
+        args: tuple = None,
+        kwargs: dict = None,
+        allocate_strategy: Any = None,
+        from_main: bool = False,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class DestroyActorMessage(_MessageBase):
+    message_type = MessageType.destroy_actor
+
+    actor_ref: ActorRef
+    from_main: bool
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        actor_ref: ActorRef = None,
+        from_main: bool = False,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class HasActorMessage(_MessageBase):
+    message_type = MessageType.has_actor
+
+    actor_ref: ActorRef
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        actor_ref: ActorRef = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class ActorRefMessage(_MessageBase):
+    message_type = MessageType.actor_ref
+
+    actor_ref: ActorRef
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        actor_ref: ActorRef = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class SendMessage(_MessageBase):
+    message_type = MessageType.send
+
+    actor_ref: ActorRef
+    content: Any
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        actor_ref: ActorRef = None,
+        content: object = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+        profiling_context: Any = None,
+    ): ...
+
+class TellMessage(SendMessage):
+    message_type = MessageType.tell
+
+class CancelMessage(_MessageBase):
+    message_type = MessageType.cancel
+
+    address: str
+    cancel_message_id: bytes
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        address: str = None,
+        cancel_message_id: bytes = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class DeserializeMessageFailed(RuntimeError):
+    def __init__(self, message_id: bytes): ...
+    def __str__(self): ...
+
+def new_message_id() -> bytes: ...
diff --git a/python/xorbits/_mars/oscar/backends/message.pyx b/python/xorbits/_mars/oscar/backends/message.pyx
new file mode 100644
index 000000000..a6a5bd987
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/message.pyx
@@ -0,0 +1,551 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from types import TracebackType
+from typing import Any, Type
+
+from ...lib.tblib import pickling_support
+
+from ..._utils cimport new_random_id
+from ...serialization.core cimport Serializer
+
+from ...utils import wrap_exception
+
+from ..core cimport ActorRef
+
+# make sure traceback can be pickled
+pickling_support.install()
+
+cdef int _DEFAULT_PROTOCOL = 0
+DEFAULT_PROTOCOL = _DEFAULT_PROTOCOL
+
+
+class MessageType(Enum):
+    control = 0
+    result = 1
+    error = 2
+    create_actor = 3
+    destroy_actor = 4
+    has_actor = 5
+    actor_ref = 6
+    send = 7
+    tell = 8
+    cancel = 9
+
+
+class ControlMessageType(Enum):
+    stop = 0
+    restart = 1
+    sync_config = 2
+    get_config = 3
+    wait_pool_recovered = 4
+    add_sub_pool_actor = 5
+
+
+cdef class _MessageSerialItem:
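+    # `serialized` holds the message's primitive header fields as a tuple,
+    # while `subs` holds member objects that are serialized separately.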
+    cdef:
+        tuple serialized
+        list subs
+
+    def __cinit__(self, tuple serialized, list subs):
+        self.serialized = serialized
+        self.subs = subs
+
+
+cdef class _MessageBase:
+    message_type: MessageType = None
+
+    cdef:
+        public int protocol
+        public bytes message_id
+        public list message_trace
+        public object profiling_context
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+        object profiling_context = None,
+    ):
+        self.message_id = message_id
+        self.protocol = protocol
+        # A message can be sent within the scope of other messages;
+        # this is mainly used for detecting deadlocks.
+        # E.g. actor `A` sends a message (id: 1) to actor `B`; while
+        # processing it, `B` sends a message (id: 2) back to `A`.
+        # A deadlock occurs because `A` is still waiting for the reply from `B`.
+        # In this case, `message_trace` will be [1, 2], and `A` will find
+        # that id 1 already exists in its inbox, thus detecting the deadlock.
+        self.message_trace = message_trace
+        self.profiling_context = profiling_context
+
+    cdef _MessageSerialItem serial(self):
+        return _MessageSerialItem(
+            (
+                self.message_type.value,
+                self.message_id,
+                self.protocol,
+                self.message_trace,
+                self.profiling_context,
+            ),
+            [],
+        )
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        self.message_id = serialized[1]
+        self.protocol = serialized[2]
+        self.message_trace = serialized[3]
+        self.profiling_context = serialized[4]
+
+    def __repr__(self):
+        cdef list attr_reprs = []
+        for attr in dir(self):
+            if attr.startswith("_") or attr == "message_type":
+                continue
+            val = getattr(self, attr)
+            if callable(val):
+                continue
+            attr_reprs.append(f"{attr}={val!r}")
+        values = ", ".join(attr_reprs)
+        return f"{type(self).__name__}({values})"
+
+
+cdef class ControlMessage(_MessageBase):
+    message_type = MessageType.control
+
+    cdef:
+        public str address
+        public object control_message_type
+        public object content
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        str address = None,
+        object control_message_type: ControlMessageType = None,
+        object content: Any = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.address = address
+        self.control_message_type = control_message_type
+        self.content = content
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.address,
+            self.control_message_type,
+        )
+        item.subs = [self.content]
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.address = serialized[-2]
+        self.control_message_type = serialized[-1]
+        self.content = subs[0]
+
+
+cdef class ResultMessage(_MessageBase):
+    message_type = MessageType.result
+
+    cdef:
+        public object result
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        object result: Any = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+        object profiling_context = None,
+    ):
+        _MessageBase.__init__(
+            self,
+            message_id,
+            protocol=protocol,
+            message_trace=message_trace,
+            profiling_context=profiling_context,
+        )
+        self.result = result
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.subs = [self.result]
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.result = subs[0]
+
+
+class _AsCauseBase:
+    def __str__(self):
+        return f"[address={self.address}, pid={self.pid}] {str(self.__wrapped__)}"
+
+
+cdef class ErrorMessage(_MessageBase):
+    message_type = MessageType.error
+
+    cdef:
+        public str address
+        public long pid
+        public type error_type
+        public object error
+        public object traceback
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        str address: str = None,
+        long pid = -1,
+        type error_type: Type[BaseException] = None,
+        object error: BaseException = None,
+        object traceback: TracebackType = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.address = address
+        self.pid = pid
+        self.error_type = error_type
+        self.error = error
+        self.traceback = traceback
+
+    def as_instanceof_cause(self):
+        # Check that as_instanceof_cause is not applied recursively.
+        #
+        # e.g. SubtaskRunnerActor.run_subtask will reraise the exception raised
+        # from SubtaskProcessorActor.run. But these two actors are in the same
+        # process, so we don't want to append a duplicate address and pid to the
+        # error message.
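+        #
+        # The wrapped exception renders as, for example (illustrative values):
+        #     "[address=127.0.0.1:12345, pid=4321] ValueError('...')"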
+        if issubclass(self.error_type, _AsCauseBase):
+            return self.error.with_traceback(self.traceback)
+
+        return wrap_exception(
+            self.error,
+            (_AsCauseBase,),
+            traceback=self.traceback,
+            attr_dict=dict(address=self.address, pid=self.pid),
+        )
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (self.address, self.pid)
+        item.subs = [self.error_type, self.error, self.traceback]
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.address = serialized[-2]
+        self.pid = serialized[-1]
+        self.error_type = subs[0]
+        self.error = subs[1]
+        self.traceback = subs[2]
+
+
+cdef class CreateActorMessage(_MessageBase):
+    message_type = MessageType.create_actor
+
+    cdef:
+        public type actor_cls
+        public bytes actor_id
+        public tuple args
+        public dict kwargs
+        public object allocate_strategy
+        public object from_main
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        type actor_cls = None,
+        bytes actor_id = None,
+        tuple args = None,
+        dict kwargs = None,
+        object allocate_strategy = None,
+        object from_main: bool = False,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.actor_cls = actor_cls
+        self.actor_id = actor_id
+        self.args = args
+        self.kwargs = kwargs
+        self.allocate_strategy = allocate_strategy
+        self.from_main = from_main
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.actor_id, self.allocate_strategy, self.from_main
+        )
+        item.subs = [self.actor_cls, self.args, self.kwargs]
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.actor_id = serialized[-3]
+        self.allocate_strategy = serialized[-2]
+        self.from_main = serialized[-1]
+        self.actor_cls = subs[0]
+        self.args = subs[1]
+        self.kwargs = subs[2]
+
+
+cdef class DestroyActorMessage(_MessageBase):
+    message_type = MessageType.destroy_actor
+
+    cdef:
+        public ActorRef actor_ref
+        public object from_main
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        ActorRef actor_ref = None,
+        object from_main: bool = False,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.actor_ref = actor_ref
+        self.from_main = from_main
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.actor_ref.address, self.actor_ref.uid, self.from_main
+        )
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.actor_ref = ActorRef(serialized[-3], serialized[-2])
+        self.from_main = serialized[-1]
+
+
+cdef class HasActorMessage(_MessageBase):
+    message_type = MessageType.has_actor
+
+    cdef:
+        public ActorRef actor_ref
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        ActorRef actor_ref = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.actor_ref = actor_ref
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.actor_ref.address, self.actor_ref.uid
+        )
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.actor_ref = ActorRef(serialized[-2], serialized[-1])
+
+
+cdef class ActorRefMessage(_MessageBase):
+    message_type = MessageType.actor_ref
+
+    cdef:
+        public ActorRef actor_ref
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        ActorRef actor_ref = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.actor_ref = actor_ref
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.actor_ref.address, self.actor_ref.uid
+        )
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.actor_ref = ActorRef(serialized[-2], serialized[-1])
+
+
+cdef class SendMessage(_MessageBase):
+    message_type = MessageType.send
+
+    cdef:
+        public ActorRef actor_ref
+        public object content
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        ActorRef actor_ref = None,
+        object content = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+        object profiling_context = None,
+    ):
+        _MessageBase.__init__(
+            self,
+            message_id,
+            protocol=protocol,
+            message_trace=message_trace,
+            profiling_context=profiling_context,
+        )
+        self.actor_ref = actor_ref
+        self.content = content
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.actor_ref.address, self.actor_ref.uid
+        )
+        item.subs = [self.content]
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.actor_ref = ActorRef(serialized[-2], serialized[-1])
+        self.content = subs[0]
+
+
+cdef class TellMessage(SendMessage):
+    message_type = MessageType.tell
+
+
+cdef class CancelMessage(_MessageBase):
+    message_type = MessageType.cancel
+
+    cdef:
+        public str address
+        public bytes cancel_message_id
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        str address = None,
+        bytes cancel_message_id = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.address = address
+        self.cancel_message_id = cancel_message_id
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.address, self.cancel_message_id
+        )
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.address = serialized[-2]
+        self.cancel_message_id = serialized[-1]
+
+
+cdef dict _message_type_to_message_cls = {
+    MessageType.control.value: ControlMessage,
+    MessageType.result.value: ResultMessage,
+    MessageType.error.value: ErrorMessage,
+    MessageType.create_actor.value: CreateActorMessage,
+    MessageType.destroy_actor.value: DestroyActorMessage,
+    MessageType.has_actor.value: HasActorMessage,
+    MessageType.actor_ref.value: ActorRefMessage,
+    MessageType.send.value: SendMessage,
+    MessageType.tell.value: TellMessage,
+    MessageType.cancel.value: CancelMessage,
+}
+
+
+class DeserializeMessageFailed(RuntimeError):
+    def __init__(self, message_id):
+        self.message_id = message_id
+
+    def __str__(self):
+        return f"Deserialize {self.message_id} failed"
+
+
+cdef class MessageSerializer(Serializer):
+    serializer_id = 32105
+
+    cpdef serial(self, object obj, dict context):
+        cdef _MessageBase msg = <_MessageBase>obj
+        cdef _MessageSerialItem ser_item
+
+        assert msg.protocol == _DEFAULT_PROTOCOL, "only support protocol 0 for now"
+        ser_item = msg.serial()
+        return ser_item.serialized, ser_item.subs, False
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        cdef _MessageBase msg
+
+        msg_type = serialized[0]
+        msg = _message_type_to_message_cls[msg_type]()
+        msg.deserial_members(serialized, subs)
+        return msg
+
+    cpdef on_deserial_error(
+        self,
+        tuple serialized,
+        dict context,
+        list subs_serialized,
+        int error_index,
+        object exc,
+    ):
+        message_id = serialized[1]  # pos of message_id field
+        try:
+            raise DeserializeMessageFailed(message_id) from exc
+        except BaseException as new_ex:
+            return new_ex
+
+
+# register message serializer
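+# (registering the base class is expected to cover every concrete message
+# type as well, since they all subclass _MessageBase)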
+MessageSerializer.register(_MessageBase)
+
+
+cpdef bytes new_message_id():
+    return new_random_id(32)
diff --git a/python/xorbits/_mars/oscar/backends/pool.py b/python/xorbits/_mars/oscar/backends/pool.py
new file mode 100644
index 000000000..cd3b1d183
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/pool.py
@@ -0,0 +1,1519 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import concurrent.futures as futures
+import contextlib
+import itertools
+import logging
+import multiprocessing
+import os
+import threading
+import traceback
+from abc import ABC, ABCMeta, abstractmethod
+from typing import Callable, Coroutine, Dict, List, Optional, Type, TypeVar, Union
+
+from ...core.entrypoints import init_extension_entrypoints
+from ...metrics import init_metrics
+from ...utils import (
+    TypeDispatcher,
+    implements,
+    lazy_import,
+    register_asyncio_task_timeout_detector,
+    to_binary,
+)
+from ..api import Actor
+from ..core import ActorRef, register_local_pool
+from ..debug import debug_async_timeout, record_message_trace
+from ..errors import (
+    ActorAlreadyExist,
+    ActorNotExist,
+    CannotCancelTask,
+    SendMessageFailed,
+    ServerClosed,
+)
+from ..utils import create_actor_ref
+from .allocate_strategy import AddressSpecified, allocated_type
+from .communication import Channel, Server, gen_local_address, get_server_type
+from .communication.errors import ChannelClosed
+from .config import ActorPoolConfig
+from .core import ActorCaller, ResultMessageType
+from .message import (
+    DEFAULT_PROTOCOL,
+    ActorRefMessage,
+    CancelMessage,
+    ControlMessage,
+    ControlMessageType,
+    CreateActorMessage,
+    DestroyActorMessage,
+    ErrorMessage,
+    HasActorMessage,
+    MessageType,
+    ResultMessage,
+    SendMessage,
+    TellMessage,
+    _MessageBase,
+    new_message_id,
+)
+from .router import Router
+
+logger = logging.getLogger(__name__)
+ray = lazy_import("ray")
+
+
+@contextlib.contextmanager
+def _disable_log_temporally():
+    if os.getenv("CUDA_VISIBLE_DEVICES") == "-1":
+        # disable logging when CUDA_VISIBLE_DEVICES == -1,
+        # as the many log messages from ptxcompiler may distract users
+        try:
+            logging.disable(level=logging.ERROR)
+            yield
+        finally:
+            logging.disable(level=logging.NOTSET)
+    else:
+        yield
+
+
+class _ErrorProcessor:
+    def __init__(self, address: str, message_id: bytes, protocol):
+        self._address = address
+        self._message_id = message_id
+        self._protocol = protocol
+        self.result = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.result is None:
+            self.result = ErrorMessage(
+                self._message_id,
+                self._address,
+                os.getpid(),
+                exc_type,
+                exc_val,
+                exc_tb,
+                protocol=self._protocol,
+            )
+            return True
+
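+# A minimal usage sketch (an assumption for illustration, not code from this
+# module): a message handler can wrap its body with _ErrorProcessor so that
+# any exception is turned into an ErrorMessage instead of propagating, e.g.
+#
+#     with _ErrorProcessor(address, message.message_id, message.protocol) as p:
+#         p.result = await handler(message)
+#     return p.result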
+
+def _register_message_handler(pool_type: Type["AbstractActorPool"]):
+    pool_type._message_handler = dict()
+    for message_type, handler in [
+        (MessageType.create_actor, pool_type.create_actor),
+        (MessageType.destroy_actor, pool_type.destroy_actor),
+        (MessageType.has_actor, pool_type.has_actor),
+        (MessageType.actor_ref, pool_type.actor_ref),
+        (MessageType.send, pool_type.send),
+        (MessageType.tell, pool_type.tell),
+        (MessageType.cancel, pool_type.cancel),
+        (MessageType.control, pool_type.handle_control_command),
+    ]:
+        pool_type._message_handler[message_type] = handler
+    return pool_type
+
+
+class AbstractActorPool(ABC):
+    __slots__ = (
+        "process_index",
+        "label",
+        "external_address",
+        "internal_address",
+        "env",
+        "_servers",
+        "_router",
+        "_config",
+        "_stopped",
+        "_actors",
+        "_caller",
+        "_process_messages",
+        "_asyncio_task_timeout_detector_task",
+    )
+
+    def __init__(
+        self,
+        process_index: int,
+        label: str,
+        external_address: str,
+        internal_address: str,
+        env: Dict,
+        router: Router,
+        config: ActorPoolConfig,
+        servers: List[Server],
+    ):
+        # register local pool for local actor lookup.
+        # The pool is weakrefed, so we don't need to unregister it.
+        register_local_pool(external_address, self)
+        self.process_index = process_index
+        self.label = label
+        self.external_address = external_address
+        self.internal_address = internal_address
+        self.env = env
+        self._router = router
+        self._config = config
+        self._servers = servers
+
+        self._stopped = asyncio.Event()
+
+        # states
+        # actor id -> actor
+        self._actors: Dict[bytes, Actor] = dict()
+        # message id -> future
+        self._process_messages: Dict[bytes, asyncio.Future] = dict()
+
+        # manage async actor callers
+        self._caller = ActorCaller()
+        self._asyncio_task_timeout_detector_task = (
+            register_asyncio_task_timeout_detector()
+        )
+        # load third party extensions.
+        init_extension_entrypoints()
+        # init metrics
+        metric_configs = self._config.get_metric_configs()
+        metric_backend = metric_configs.get("backend")
+        init_metrics(metric_backend, config=metric_configs.get(metric_backend))
+
+    @property
+    def router(self):
+        return self._router
+
+    @abstractmethod
+    async def create_actor(self, message: CreateActorMessage) -> ResultMessageType:
+        """
+        Create an actor.
+
+        Parameters
+        ----------
+        message: CreateActorMessage
+            message to create an actor.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+
+    @abstractmethod
+    async def has_actor(self, message: HasActorMessage) -> ResultMessage:
+        """
+        Check if an actor exists or not.
+
+        Parameters
+        ----------
+        message: HasActorMessage
+            message
+
+        Returns
+        -------
+        result_message
+            result message indicating whether the actor exists.
+        """
+
+    @abstractmethod
+    async def destroy_actor(self, message: DestroyActorMessage) -> ResultMessageType:
+        """
+        Destroy an actor.
+
+        Parameters
+        ----------
+        message: DestroyActorMessage
+            message to destroy an actor.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+
+    @abstractmethod
+    async def actor_ref(self, message: ActorRefMessage) -> ResultMessageType:
+        """
+        Get an actor's ref.
+
+        Parameters
+        ----------
+        message: ActorRefMessage
+            message to get an actor's ref.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+
+    @abstractmethod
+    async def send(self, message: SendMessage) -> ResultMessageType:
+        """
+        Send a message to some actor.
+
+        Parameters
+        ----------
+        message: SendMessage
+            Message to send.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+
+    @abstractmethod
+    async def tell(self, message: TellMessage) -> ResultMessageType:
+        """
+        Tell a message to some actor (fire-and-forget).
+
+        Parameters
+        ----------
+        message: TellMessage
+            Message to tell.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+
+    @abstractmethod
+    async def cancel(self, message: CancelMessage) -> ResultMessageType:
+        """
+        Cancel a message that was sent previously.
+
+        Parameters
+        ----------
+        message: CancelMessage
+            Cancel message.
+
+        Returns
+        -------
+        result_message
+            result or error message
+        """
+
+    def _sync_pool_config(self, actor_pool_config: ActorPoolConfig):
+        self._config = actor_pool_config
+        # remove router from global one
+        global_router = Router.get_instance()
+        global_router.remove_router(self._router)
+        # update router
+        self._router.set_mapping(actor_pool_config.external_to_internal_address_map)
+        # update global router
+        global_router.add_router(self._router)
+
+    async def handle_control_command(
+        self, message: ControlMessage
+    ) -> ResultMessageType:
+        """
+        Handle control command.
+
+        Parameters
+        ----------
+        message: ControlMessage
+            Control message.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+        with _ErrorProcessor(
+            self.external_address, message.message_id, protocol=message.protocol
+        ) as processor:
+            content = True
+            if message.control_message_type == ControlMessageType.stop:
+                await self.stop()
+            elif message.control_message_type == ControlMessageType.sync_config:
+                self._sync_pool_config(message.content)
+            elif message.control_message_type == ControlMessageType.get_config:
+                if message.content == "main_pool_address":
+                    main_process_index = self._config.get_process_indexes()[0]
+                    content = self._config.get_pool_config(main_process_index)[
+                        "external_address"
+                    ][0]
+                else:
+                    content = self._config
+            else:  # pragma: no cover
+                raise TypeError(
+                    f"Unable to handle control message "
+                    f"with type {message.control_message_type}"
+                )
+            processor.result = ResultMessage(
+                message.message_id, content, protocol=message.protocol
+            )
+
+        return processor.result
+
+    async def _run_coro(self, message_id: bytes, coro: Coroutine):
+        self._process_messages[message_id] = asyncio.tasks.current_task()
+        try:
+            return await coro
+        finally:
+            self._process_messages.pop(message_id, None)
+
+    async def _send_channel(
+        self, result: _MessageBase, channel: Channel, resend_failure: bool = True
+    ):
+        try:
+            await channel.send(result)
+        except (ChannelClosed, ConnectionResetError):
+            if not self._stopped.is_set():
+                raise
+        except Exception as ex:
+            logger.exception(
+                "Error when sending message %s from %s to %s",
+                result.message_id.hex(),
+                channel.local_address,
+                channel.dest_address,
+            )
+            if not resend_failure:  # pragma: no cover
+                raise
+
+            with _ErrorProcessor(
+                self.external_address, result.message_id, result.protocol
+            ) as processor:
+                error_msg = (
+                    f"Error when sending message {result.message_id.hex()}. "
+                    f"Caused by {ex!r}. "
+                )
+                if isinstance(result, ErrorMessage):
+                    format_tb = "\n".join(traceback.format_tb(result.traceback))
+                    error_msg += (
+                        f"\nOriginal error: {result.error!r}"
+                        f"Traceback: \n{format_tb}"
+                    )
+                else:
+                    error_msg += "See server logs for more details"
+                raise SendMessageFailed(error_msg) from None
+            await self._send_channel(processor.result, channel, resend_failure=False)
+
+    async def process_message(self, message: _MessageBase, channel: Channel):
+        handler = self._message_handler[message.message_type]
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            # use `%.500s` to avoid printing overly long messages
+            with debug_async_timeout(
+                "process_message_timeout",
+                "Process message %.500s of channel %s timeout.",
+                message,
+                channel,
+            ):
+                processor.result = await self._run_coro(
+                    message.message_id, handler(self, message)
+                )
+
+        await self._send_channel(processor.result, channel)
+
+    async def call(self, dest_address: str, message: _MessageBase) -> ResultMessageType:
+        return await self._caller.call(self._router, dest_address, message)
+
+    @staticmethod
+    def _parse_config(config: Dict, kw: Dict) -> Dict:
+        actor_pool_config: ActorPoolConfig = config.pop("actor_pool_config")
+        kw["config"] = actor_pool_config
+        kw["process_index"] = process_index = config.pop("process_index")
+        curr_pool_config = actor_pool_config.get_pool_config(process_index)
+        kw["label"] = curr_pool_config["label"]
+        external_addresses = curr_pool_config["external_address"]
+        kw["external_address"] = external_addresses[0]
+        kw["internal_address"] = curr_pool_config["internal_address"]
+        kw["router"] = Router(
+            external_addresses,
+            gen_local_address(process_index),
+            actor_pool_config.external_to_internal_address_map,
+            comm_config=actor_pool_config.get_comm_config(),
+        )
+        kw["env"] = curr_pool_config["env"]
+
+        if config:  # pragma: no cover
+            raise TypeError(
+                f"Creating pool got unexpected " f'arguments: {",".join(config)}'
+            )
+
+        return kw
+
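+    # A rough sketch of the ``config`` dict consumed by ``_parse_config`` and
+    # ``create`` (keys taken from this module, values are illustrative):
+    #
+    #     {
+    #         "actor_pool_config": actor_pool_config,  # an ActorPoolConfig instance
+    #         "process_index": 0,                      # process index of this pool
+    #     }
+    #
+    # ``MainActorPoolBase._parse_config`` additionally pops ``start_method``,
+    # ``auto_recover``, ``on_process_down`` and ``on_process_recover``.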
+    @classmethod
+    @abstractmethod
+    async def create(cls, config: Dict) -> "AbstractActorPool":
+        """
+        Create an actor pool.
+
+        Parameters
+        ----------
+        config: Dict
+            configurations.
+
+        Returns
+        -------
+        actor_pool:
+            Actor pool.
+        """
+
+    async def start(self):
+        if self._stopped.is_set():
+            raise RuntimeError("pool has been stopped, cannot start again")
+        start_servers = [server.start() for server in self._servers]
+        await asyncio.gather(*start_servers)
+
+    async def join(self, timeout: float = None):
+        wait_stopped = asyncio.create_task(self._stopped.wait())
+
+        try:
+            await asyncio.wait_for(wait_stopped, timeout=timeout)
+        except (futures.TimeoutError, asyncio.TimeoutError):  # pragma: no cover
+            wait_stopped.cancel()
+
+    async def stop(self):
+        try:
+            # clean global router
+            router = Router.get_instance()
+            if router is not None:
+                router.remove_router(self._router)
+            stop_tasks = []
+            # stop all servers
+            stop_tasks.extend([server.stop() for server in self._servers])
+            # stop all clients
+            stop_tasks.append(self._caller.stop())
+            await asyncio.gather(*stop_tasks)
+
+            self._servers = []
+            if self._asyncio_task_timeout_detector_task:  # pragma: no cover
+                self._asyncio_task_timeout_detector_task.cancel()
+        finally:
+            self._stopped.set()
+
+    @property
+    def stopped(self) -> bool:
+        return self._stopped.is_set()
+
+    async def on_new_channel(self, channel: Channel):
+        while not self._stopped.is_set():
+            try:
+                message = await channel.recv()
+            except EOFError:
+                # no data to read, check channel
+                try:
+                    await channel.close()
+                except (ConnectionError, EOFError):
+                    # close failed, ignore
+                    pass
+                return
+            asyncio.create_task(self.process_message(message, channel))
+            # delete to release the reference to the message
+            del message
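+            # yield control so the newly created processing task can start running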
+            await asyncio.sleep(0)
+
+    async def __aenter__(self):
+        await self.start()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.stop()
+
+
+class ActorPoolBase(AbstractActorPool, metaclass=ABCMeta):
+    __slots__ = ()
+
+    @implements(AbstractActorPool.create_actor)
+    async def create_actor(self, message: CreateActorMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            actor_id = message.actor_id
+            if actor_id in self._actors:
+                raise ActorAlreadyExist(
+                    f"Actor {actor_id} already exist, cannot create"
+                )
+
+            actor = message.actor_cls(*message.args, **message.kwargs)
+            actor.uid = actor_id
+            actor.address = address = self.external_address
+            self._actors[actor_id] = actor
+            await self._run_coro(message.message_id, actor.__post_create__())
+
+            result = ActorRef(address, actor_id)
+            # assemble the result message
+            processor.result = ResultMessage(
+                message.message_id, result, protocol=message.protocol
+            )
+        return processor.result
+
+    @implements(AbstractActorPool.has_actor)
+    async def has_actor(self, message: HasActorMessage) -> ResultMessage:
+        result = ResultMessage(
+            message.message_id,
+            message.actor_ref.uid in self._actors,
+            protocol=message.protocol,
+        )
+        return result
+
+    @implements(AbstractActorPool.destroy_actor)
+    async def destroy_actor(self, message: DestroyActorMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            actor_id = message.actor_ref.uid
+            try:
+                actor = self._actors[actor_id]
+            except KeyError:
+                raise ActorNotExist(f"Actor {actor_id} does not exist")
+            await self._run_coro(message.message_id, actor.__pre_destroy__())
+            del self._actors[actor_id]
+
+            processor.result = ResultMessage(
+                message.message_id, actor_id, protocol=message.protocol
+            )
+        return processor.result
+
+    @implements(AbstractActorPool.actor_ref)
+    async def actor_ref(self, message: ActorRefMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            actor_id = message.actor_ref.uid
+            if actor_id not in self._actors:
+                raise ActorNotExist(f"Actor {actor_id} does not exist")
+            result = ResultMessage(
+                message.message_id,
+                ActorRef(self.external_address, actor_id),
+                protocol=message.protocol,
+            )
+            processor.result = result
+        return processor.result
+
+    @implements(AbstractActorPool.send)
+    async def send(self, message: SendMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor, record_message_trace(message):
+            actor_id = message.actor_ref.uid
+            if actor_id not in self._actors:
+                raise ActorNotExist(f"Actor {actor_id} does not exist")
+            coro = self._actors[actor_id].__on_receive__(message.content)
+            result = await self._run_coro(message.message_id, coro)
+            processor.result = ResultMessage(
+                message.message_id,
+                result,
+                protocol=message.protocol,
+                profiling_context=message.profiling_context,
+            )
+        return processor.result
+
+    @implements(AbstractActorPool.tell)
+    async def tell(self, message: TellMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            actor_id = message.actor_ref.uid
+            if actor_id not in self._actors:  # pragma: no cover
+                raise ActorNotExist(f"Actor {actor_id} does not exist")
+            call = self._actors[actor_id].__on_receive__(message.content)
+            # run asynchronously; tell does not care about the result
+            asyncio.create_task(call)
+            await asyncio.sleep(0)
+            processor.result = ResultMessage(
+                message.message_id,
+                None,
+                protocol=message.protocol,
+                profiling_context=message.profiling_context,
+            )
+        return processor.result
+
+    @implements(AbstractActorPool.cancel)
+    async def cancel(self, message: CancelMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            future = self._process_messages.get(message.cancel_message_id)
+            if future is None or future.done():  # pragma: no cover
+                raise CannotCancelTask(
+                    "Task not exists, maybe it is done or cancelled already"
+                )
+            future.cancel()
+            processor.result = ResultMessage(
+                message.message_id, True, protocol=message.protocol
+            )
+        return processor.result
+
+    @staticmethod
+    def _set_global_router(router: Router):
+        # be cautious about setting the global router:
+        # for instance, multiple main pools may be created in the same process
+
+        # get default router or create an empty one
+        default_router = Router.get_instance_or_empty()
+        Router.set_instance(default_router)
+        # append this router to global
+        default_router.add_router(router)
+
+    @staticmethod
+    def _update_stored_addresses(
+        servers: List[Server],
+        raw_addresses: List[str],
+        actor_pool_config: ActorPoolConfig,
+        kw: Dict,
+    ):
+        process_index = kw["process_index"]
+        curr_pool_config = actor_pool_config.get_pool_config(process_index)
+        external_addresses = curr_pool_config["external_address"]
+        external_address_set = set(external_addresses)
+
+        kw["servers"] = servers
+
+        new_external_addresses = [
+            server.address
+            for server, raw_address in zip(servers, raw_addresses)
+            if raw_address in external_address_set
+        ]
+
+        if external_address_set != set(new_external_addresses):
+            external_addresses = new_external_addresses
+            actor_pool_config.reset_pool_external_address(
+                process_index, external_addresses
+            )
+            external_addresses = curr_pool_config["external_address"]
+
+            logger.debug(
+                "External address of process index %s updated to %s",
+                process_index,
+                external_addresses[0],
+            )
+            if kw["internal_address"] == kw["external_address"]:
+                # the internal address may be the same as the external address on Windows
+                kw["internal_address"] = external_addresses[0]
+            kw["external_address"] = external_addresses[0]
+
+            kw["router"] = Router(
+                external_addresses,
+                gen_local_address(process_index),
+                actor_pool_config.external_to_internal_address_map,
+                comm_config=actor_pool_config.get_comm_config(),
+            )
+
+    @classmethod
+    async def _create_servers(
+        cls, addresses: List[str], channel_handler: Callable, config: dict
+    ):
+        assert len(set(addresses)) == len(addresses)
+        # create servers
+        create_server_tasks = []
+        for addr in addresses:
+            server_type = get_server_type(addr)
+            extra_config = server_type.parse_config(config)
+            server_config = dict(address=addr, handle_channel=channel_handler)
+            server_config.update(extra_config)
+            task = asyncio.create_task(server_type.create(server_config))
+            create_server_tasks.append(task)
+
+        await asyncio.gather(*create_server_tasks)
+        return [f.result() for f in create_server_tasks]
+
+    @classmethod
+    @implements(AbstractActorPool.create)
+    async def create(cls, config: Dict) -> "ActorPoolType":
+        config = config.copy()
+        kw = dict()
+        cls._parse_config(config, kw)
+        process_index: int = kw["process_index"]
+        actor_pool_config = kw["config"]  # type: ActorPoolConfig
+        cur_pool_config = actor_pool_config.get_pool_config(process_index)
+        external_addresses = cur_pool_config["external_address"]
+        internal_address = kw["internal_address"]
+
+        # import predefined modules
+        modules = cur_pool_config["modules"] or []
+        for mod in modules:
+            __import__(mod, globals(), locals(), [])
+        # make sure all lazy imports loaded
+        with _disable_log_temporally():
+            TypeDispatcher.reload_all_lazy_handlers()
+
+        def handle_channel(channel):
+            return pool.on_new_channel(channel)
+
+        # create servers
+        server_addresses = list(external_addresses)
+        if internal_address:
+            server_addresses.append(internal_address)
+        server_addresses.append(gen_local_address(process_index))
+        server_addresses = sorted(set(server_addresses))
+        servers = await cls._create_servers(
+            server_addresses, handle_channel, actor_pool_config.get_comm_config()
+        )
+        cls._update_stored_addresses(servers, server_addresses, actor_pool_config, kw)
+
+        # set default router
+        # actor context would be able to use exact client
+        cls._set_global_router(kw["router"])
+
+        # create pool
+        pool = cls(**kw)
+        return pool
+
+
+ActorPoolType = TypeVar("ActorPoolType", bound=AbstractActorPool)
+MainActorPoolType = TypeVar("MainActorPoolType", bound="MainActorPoolBase")
+SubProcessHandle = Union[multiprocessing.Process, "ray.actor.ActorHandle"]
+
+
+class SubActorPoolBase(ActorPoolBase):
+    __slots__ = ("_main_address",)
+
+    def __init__(
+        self,
+        process_index: int,
+        label: str,
+        external_address: str,
+        internal_address: str,
+        env: Dict,
+        router: Router,
+        config: ActorPoolConfig,
+        servers: List[Server],
+        main_address: str,
+    ):
+        super().__init__(
+            process_index,
+            label,
+            external_address,
+            internal_address,
+            env,
+            router,
+            config,
+            servers,
+        )
+        self._main_address = main_address
+
+    async def notify_main_pool_to_destroy(
+        self, message: DestroyActorMessage
+    ):  # pragma: no cover
+        await self.call(self._main_address, message)
+
+    async def notify_main_pool_to_create(self, message: CreateActorMessage):
+        reg_message = ControlMessage(
+            new_message_id(),
+            self.external_address,
+            ControlMessageType.add_sub_pool_actor,
+            (self.external_address, message.allocate_strategy, message),
+        )
+        await self.call(self._main_address, reg_message)
+
+    @implements(AbstractActorPool.create_actor)
+    async def create_actor(self, message: CreateActorMessage) -> ResultMessageType:
+        result = await super().create_actor(message)
+        if not message.from_main:
+            await self.notify_main_pool_to_create(message)
+        return result
+
+    @implements(AbstractActorPool.actor_ref)
+    async def actor_ref(self, message: ActorRefMessage) -> ResultMessageType:
+        result = await super().actor_ref(message)
+        if isinstance(result, ErrorMessage):
+            # need a new message id to call main actor
+            main_message = ActorRefMessage(
+                new_message_id(),
+                create_actor_ref(self._main_address, message.actor_ref.uid),
+            )
+            result = await self.call(self._main_address, main_message)
+            # rewrite the message_id back to that of the original request
+            result.message_id = message.message_id
+        return result
+
+    @implements(AbstractActorPool.destroy_actor)
+    async def destroy_actor(self, message: DestroyActorMessage) -> ResultMessageType:
+        result = await super().destroy_actor(message)
+        if isinstance(result, ResultMessage) and not message.from_main:
+            # sync back to main actor pool
+            await self.notify_main_pool_to_destroy(message)
+        return result
+
+    @implements(AbstractActorPool.handle_control_command)
+    async def handle_control_command(
+        self, message: ControlMessage
+    ) -> ResultMessageType:
+        if message.control_message_type == ControlMessageType.sync_config:
+            self._main_address = message.address
+        return await super().handle_control_command(message)
+
+    @staticmethod
+    def _parse_config(config: Dict, kw: Dict) -> Dict:
+        kw = AbstractActorPool._parse_config(config, kw)
+        config: ActorPoolConfig = kw["config"]
+        main_process_index = config.get_process_indexes()[0]
+        kw["main_address"] = config.get_pool_config(main_process_index)[
+            "external_address"
+        ][0]
+        return kw
+
+
+class MainActorPoolBase(ActorPoolBase):
+    __slots__ = (
+        "_allocated_actors",
+        "sub_actor_pool_manager",
+        "_auto_recover",
+        "_monitor_task",
+        "_on_process_down",
+        "_on_process_recover",
+        "_recover_events",
+    )
+
+    def __init__(
+        self,
+        process_index: int,
+        label: str,
+        external_address: str,
+        internal_address: str,
+        env: Dict,
+        router: Router,
+        config: ActorPoolConfig,
+        servers: List[Server],
+        subprocess_start_method: str = None,
+        auto_recover: Union[str, bool] = "actor",
+        on_process_down: Callable[[MainActorPoolType, str], None] = None,
+        on_process_recover: Callable[[MainActorPoolType, str], None] = None,
+    ):
+        super().__init__(
+            process_index,
+            label,
+            external_address,
+            internal_address,
+            env,
+            router,
+            config,
+            servers,
+        )
+        self._subprocess_start_method = subprocess_start_method
+
+        # auto recovering
+        self._auto_recover = auto_recover
+        self._monitor_task: Optional[asyncio.Task] = None
+        self._on_process_down = on_process_down
+        self._on_process_recover = on_process_recover
+        self._recover_events: Dict[str, asyncio.Event] = dict()
+
+        # states
+        self._allocated_actors: allocated_type = {
+            addr: dict() for addr in self._config.get_external_addresses()
+        }
+        self._allocation_lock = threading.Lock()
+
+        self.sub_processes: Dict[str, SubProcessHandle] = dict()
+
+    _process_index_gen = itertools.count()
+
+    @classmethod
+    def process_index_gen(cls, address):
+        # make sure different processes do not share process indexes
+        pid = os.getpid()
+        for idx in cls._process_index_gen:
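+            # combine the pid with a per-process counter so that indexes
+            # generated by different processes do not collide,
+            # e.g. pid=1234, idx=3 -> 1234 * 2**16 + 3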
+            yield (pid << 16) + idx
+
+    @property
+    def _sub_processes(self):
+        return self.sub_processes
+
+    @implements(AbstractActorPool.create_actor)
+    async def create_actor(self, message: CreateActorMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            address=self.external_address,
+            message_id=message.message_id,
+            protocol=message.protocol,
+        ) as processor:
+            allocate_strategy = message.allocate_strategy
+            with self._allocation_lock:
+                # get allocated address according to corresponding strategy
+                address = allocate_strategy.get_allocated_address(
+                    self._config, self._allocated_actors
+                )
+                # set placeholder to make sure this label is occupied
+                self._allocated_actors[address][None] = (allocate_strategy, message)
+            if address == self.external_address:
+                # creating actor on main actor pool
+                result = await super().create_actor(message)
+                if isinstance(result, ResultMessage):
+                    self._allocated_actors[self.external_address][result.result] = (
+                        allocate_strategy,
+                        message,
+                    )
+                processor.result = result
+            else:
+                # creating actor on sub actor pool
+                # rewrite allocate strategy to AddressSpecified
+                new_allocate_strategy = AddressSpecified(address)
+                new_create_actor_message = CreateActorMessage(
+                    message.message_id,
+                    message.actor_cls,
+                    message.actor_id,
+                    message.args,
+                    message.kwargs,
+                    allocate_strategy=new_allocate_strategy,
+                    from_main=True,
+                    protocol=message.protocol,
+                    message_trace=message.message_trace,
+                )
+                result = await self.call(address, new_create_actor_message)
+                if isinstance(result, ResultMessage):
+                    self._allocated_actors[address][result.result] = (
+                        allocate_strategy,
+                        new_create_actor_message,
+                    )
+                processor.result = result
+
+            # revert placeholder
+            self._allocated_actors[address].pop(None, None)
+
+        return processor.result
+
+    @implements(AbstractActorPool.has_actor)
+    async def has_actor(self, message: HasActorMessage) -> ResultMessage:
+        actor_ref = message.actor_ref
+        # lookup allocated
+        for address, item in self._allocated_actors.items():
+            ref = create_actor_ref(address, to_binary(actor_ref.uid))
+            if ref in item:
+                return ResultMessage(
+                    message.message_id, True, protocol=message.protocol
+                )
+
+        return ResultMessage(message.message_id, False, protocol=message.protocol)
+
+    @implements(AbstractActorPool.destroy_actor)
+    async def destroy_actor(self, message: DestroyActorMessage) -> ResultMessageType:
+        actor_ref_message = ActorRefMessage(
+            message.message_id, message.actor_ref, protocol=message.protocol
+        )
+        result = await self.actor_ref(actor_ref_message)
+        if not isinstance(result, ResultMessage):
+            return result
+        real_actor_ref = result.result
+        if real_actor_ref.address == self.external_address:
+            result = await super().destroy_actor(message)
+            if result.message_type == MessageType.error:
+                return result
+            del self._allocated_actors[self.external_address][real_actor_ref]
+            return ResultMessage(
+                message.message_id, real_actor_ref.uid, protocol=message.protocol
+            )
+        # remove allocated actor ref
+        self._allocated_actors[real_actor_ref.address].pop(real_actor_ref, None)
+        new_destroy_message = DestroyActorMessage(
+            message.message_id,
+            real_actor_ref,
+            from_main=True,
+            protocol=message.protocol,
+        )
+        return await self.call(real_actor_ref.address, new_destroy_message)
+
+    @implements(AbstractActorPool.send)
+    async def send(self, message: SendMessage) -> ResultMessageType:
+        if message.actor_ref.uid in self._actors:
+            return await super().send(message)
+        actor_ref_message = ActorRefMessage(
+            message.message_id, message.actor_ref, protocol=message.protocol
+        )
+        result = await self.actor_ref(actor_ref_message)
+        if not isinstance(result, ResultMessage):
+            return result
+        actor_ref = result.result
+        new_send_message = SendMessage(
+            message.message_id,
+            actor_ref,
+            message.content,
+            protocol=message.protocol,
+            message_trace=message.message_trace,
+        )
+        return await self.call(actor_ref.address, new_send_message)
+
+    @implements(AbstractActorPool.tell)
+    async def tell(self, message: TellMessage) -> ResultMessageType:
+        if message.actor_ref.uid in self._actors:
+            return await super().tell(message)
+        actor_ref_message = ActorRefMessage(
+            message.message_id, message.actor_ref, protocol=message.protocol
+        )
+        result = await self.actor_ref(actor_ref_message)
+        if not isinstance(result, ResultMessage):
+            return result
+        actor_ref = result.result
+        new_tell_message = TellMessage(
+            message.message_id,
+            actor_ref,
+            message.content,
+            protocol=message.protocol,
+            message_trace=message.message_trace,
+        )
+        return await self.call(actor_ref.address, new_tell_message)
+
+    @implements(AbstractActorPool.actor_ref)
+    async def actor_ref(self, message: ActorRefMessage) -> ResultMessageType:
+        actor_ref = message.actor_ref
+        actor_ref.uid = to_binary(actor_ref.uid)
+        if actor_ref.address == self.external_address and actor_ref.uid in self._actors:
+            return ResultMessage(
+                message.message_id, actor_ref, protocol=message.protocol
+            )
+
+        # lookup allocated
+        for address, item in self._allocated_actors.items():
+            ref = create_actor_ref(address, actor_ref.uid)
+            if ref in item:
+                return ResultMessage(message.message_id, ref, protocol=message.protocol)
+
+        with _ErrorProcessor(
+            self.external_address, message.message_id, protocol=message.protocol
+        ) as processor:
+            raise ActorNotExist(
+                f"Actor {actor_ref.uid} does not exist in {actor_ref.address}"
+            )
+
+        return processor.result
+
+    @implements(AbstractActorPool.cancel)
+    async def cancel(self, message: CancelMessage) -> ResultMessageType:
+        if message.address == self.external_address:
+            # local message
+            return await super().cancel(message)
+        # redirect to sub pool
+        return await self.call(message.address, message)
+
+    @implements(AbstractActorPool.handle_control_command)
+    async def handle_control_command(
+        self, message: ControlMessage
+    ) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            if message.address == self.external_address:
+                if message.control_message_type == ControlMessageType.sync_config:
+                    # sync config, need to notify all sub pools
+                    tasks = []
+                    for addr in self.sub_processes:
+                        control_message = ControlMessage(
+                            new_message_id(),
+                            message.address,
+                            message.control_message_type,
+                            message.content,
+                            protocol=message.protocol,
+                            message_trace=message.message_trace,
+                        )
+                        tasks.append(
+                            asyncio.create_task(self.call(addr, control_message))
+                        )
+                    # call super
+                    task = asyncio.create_task(super().handle_control_command(message))
+                    tasks.append(task)
+                    await asyncio.gather(*tasks)
+                    processor.result = await task
+                else:
+                    processor.result = await super().handle_control_command(message)
+            elif message.control_message_type == ControlMessageType.stop:
+                timeout, force = (
+                    message.content if message.content is not None else (None, False)
+                )
+                await self.stop_sub_pool(
+                    message.address,
+                    self.sub_processes[message.address],
+                    timeout=timeout,
+                    force=force,
+                )
+                processor.result = ResultMessage(
+                    message.message_id, True, protocol=message.protocol
+                )
+            elif message.control_message_type == ControlMessageType.wait_pool_recovered:
+                if self._auto_recover and message.address not in self._recover_events:
+                    self._recover_events[message.address] = asyncio.Event()
+
+                event = self._recover_events.get(message.address, None)
+                if event is not None:
+                    await event.wait()
+                processor.result = ResultMessage(
+                    message.message_id, True, protocol=message.protocol
+                )
+            elif message.control_message_type == ControlMessageType.add_sub_pool_actor:
+                address, allocate_strategy, create_message = message.content
+                create_message.from_main = True
+                ref = create_actor_ref(address, to_binary(create_message.actor_id))
+                self._allocated_actors[address][ref] = (
+                    allocate_strategy,
+                    create_message,
+                )
+                processor.result = ResultMessage(
+                    message.message_id, True, protocol=message.protocol
+                )
+            else:
+                processor.result = await self.call(message.address, message)
+        return processor.result
+
+    @staticmethod
+    def _parse_config(config: Dict, kw: Dict) -> Dict:
+        kw["subprocess_start_method"] = config.pop("start_method", None)
+        kw["auto_recover"] = config.pop("auto_recover", "actor")
+        kw["on_process_down"] = config.pop("on_process_down", None)
+        kw["on_process_recover"] = config.pop("on_process_recover", None)
+        kw = AbstractActorPool._parse_config(config, kw)
+        return kw
+
+    @classmethod
+    @implements(AbstractActorPool.create)
+    async def create(cls, config: Dict) -> MainActorPoolType:
+        config = config.copy()
+        actor_pool_config: ActorPoolConfig = config.get("actor_pool_config")
+        start_method = config.get("start_method", None)
+        if "process_index" not in config:
+            config["process_index"] = actor_pool_config.get_process_indexes()[0]
+        curr_process_index = config.get("process_index")
+        old_config_addresses = set(actor_pool_config.get_external_addresses())
+
+        tasks = []
+        subpool_process_idxes = []
+        # create sub actor pools
+        n_sub_pool = actor_pool_config.n_pool - 1
+        if n_sub_pool > 0:
+            process_indexes = actor_pool_config.get_process_indexes()
+            for process_index in process_indexes:
+                if process_index == curr_process_index:
+                    continue
+                create_pool_task = asyncio.create_task(
+                    cls.start_sub_pool(actor_pool_config, process_index, start_method)
+                )
+                await asyncio.sleep(0)
+                # await create_pool_task
+                tasks.append(create_pool_task)
+                subpool_process_idxes.append(process_index)
+
+        processes, ext_addresses = await cls.wait_sub_pools_ready(tasks)
+        if ext_addresses:
+            for process_index, ext_address in zip(subpool_process_idxes, ext_addresses):
+                actor_pool_config.reset_pool_external_address(
+                    process_index, ext_address
+                )
+
+        # create main actor pool
+        pool: MainActorPoolType = await super().create(config)
+        addresses = actor_pool_config.get_external_addresses()[1:]
+
+        assert len(addresses) == len(
+            processes
+        ), f"addresses {addresses}, processes {processes}"
+        for addr, proc in zip(addresses, processes):
+            pool.attach_sub_process(addr, proc)
+
+        new_config_addresses = set(actor_pool_config.get_external_addresses())
+        if old_config_addresses != new_config_addresses:
+            control_message = ControlMessage(
+                message_id=new_message_id(),
+                address=pool.external_address,
+                control_message_type=ControlMessageType.sync_config,
+                content=actor_pool_config,
+            )
+            await pool.handle_control_command(control_message)
+
+        return pool
+
+    async def start_monitor(self):
+        if self._monitor_task is None:
+            self._monitor_task = asyncio.create_task(self.monitor_sub_pools())
+        return self._monitor_task
+
+    @implements(AbstractActorPool.stop)
+    async def stop(self):
+        global_router = Router.get_instance()
+        if global_router is not None:
+            global_router.remove_router(self._router)
+
+        # turn off auto recover to avoid errors
+        self._auto_recover = False
+        self._stopped.set()
+        if self._monitor_task and not self._monitor_task.done():
+            await self._monitor_task
+            self._monitor_task = None
+        await self.stop_sub_pools()
+        await super().stop()
+
+    @classmethod
+    @abstractmethod
+    async def start_sub_pool(
+        cls,
+        actor_pool_config: ActorPoolConfig,
+        process_index: int,
+        start_method: str = None,
+    ):
+        """Start a sub actor pool"""
+
+    @classmethod
+    @abstractmethod
+    async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
+        """Wait all sub pools ready"""
+
+    def attach_sub_process(self, external_address: str, process: SubProcessHandle):
+        self.sub_processes[external_address] = process
+
+    async def stop_sub_pools(self):
+        to_stop_processes: Dict[str, SubProcessHandle] = dict()
+        for address, process in self.sub_processes.items():
+            if not await self.is_sub_pool_alive(process):
+                continue
+            to_stop_processes[address] = process
+
+        tasks = []
+        for address, process in to_stop_processes.items():
+            tasks.append(self.stop_sub_pool(address, process))
+        await asyncio.gather(*tasks)
+
+    async def stop_sub_pool(
+        self,
+        address: str,
+        process: SubProcessHandle,
+        timeout: float = None,
+        force: bool = False,
+    ):
+        if force:
+            await self.kill_sub_pool(process, force=True)
+            return
+
+        stop_message = ControlMessage(
+            new_message_id(),
+            address,
+            ControlMessageType.stop,
+            None,
+            protocol=DEFAULT_PROTOCOL,
+        )
+        try:
+            if timeout is None:
+                message = await self.call(address, stop_message)
+                if isinstance(message, ErrorMessage):  # pragma: no cover
+                    raise message.as_instanceof_cause()
+            else:
+                call = asyncio.create_task(self.call(address, stop_message))
+                try:
+                    await asyncio.wait_for(call, timeout)
+                except (futures.TimeoutError, asyncio.TimeoutError):  # pragma: no cover
+                    # timed out, just let the kill below finish it
+                    force = True
+        except (ConnectionError, ServerClosed):  # pragma: no cover
+            # the process may already be dead, ignore it
+            pass
+        # kill process
+        await self.kill_sub_pool(process, force=force)
+
+    @abstractmethod
+    async def kill_sub_pool(self, process: SubProcessHandle, force: bool = False):
+        """Kill a sub actor pool"""
+
+    @abstractmethod
+    async def is_sub_pool_alive(self, process: SubProcessHandle):
+        """
+        Check whether sub pool process is alive
+        Parameters
+        ----------
+        process : SubProcessHandle
+            sub pool process handle
+        Returns
+        -------
+        bool
+        """
+
+    @abstractmethod
+    def recover_sub_pool(self, address):
+        """Recover a sub actor pool"""
+
+    def process_sub_pool_lost(self, address: str):
+        if self._auto_recover in (False, "process"):
+            # the process is down; when auto_recover is disabled or only
+            # recovers the process, remove all previously created actors
+            self._allocated_actors[address] = dict()
+
+    async def monitor_sub_pools(self):
+        try:
+            while not self._stopped.is_set():
+                for address, process in self.sub_processes.items():
+                    try:
+                        recover_events_discovered = address in self._recover_events
+                        if not await self.is_sub_pool_alive(
+                            process
+                        ):  # pragma: no cover
+                            if self._on_process_down is not None:
+                                self._on_process_down(self, address)
+                            self.process_sub_pool_lost(address)
+                            if self._auto_recover:
+                                await self.recover_sub_pool(address)
+                                if self._on_process_recover is not None:
+                                    self._on_process_recover(self, address)
+                        if recover_events_discovered:
+                            event = self._recover_events.pop(address)
+                            event.set()
+                    except asyncio.CancelledError:
+                        raise
+                    except RuntimeError as ex:  # pragma: no cover
+                        if (
+                            "cannot schedule new futures after interpreter shutdown"
+                            not in str(ex)
+                        ):
+                            # only log errors other than the interpreter-shutdown
+                            # RuntimeError; this keeps the log quiet when the
+                            # process exits.
+                            logger.exception("Monitor sub pool %s failed", address)
+                    except Exception:
+                        # log the exception instead of silently stopping
+                        # the monitoring of the sub pool.
+                        logger.exception("Monitor sub pool %s failed", address)
+
+                # check every half second
+                await asyncio.sleep(0.5)
+        except asyncio.CancelledError:  # pragma: no cover
+            # cancelled
+            return
+
+    @classmethod
+    @abstractmethod
+    def get_external_addresses(
+        cls,
+        address: str,
+        n_process: int = None,
+        ports: List[int] = None,
+        schemes: List[str] = None,
+    ):
+        """Returns external addresses for n pool processes"""
+
+    @classmethod
+    @abstractmethod
+    def gen_internal_address(
+        cls, process_index: int, external_address: str = None
+    ) -> str:
+        """Returns internal address for pool of specified process index"""
+
+
+async def create_actor_pool(
+    address: str,
+    *,
+    pool_cls: Type[MainActorPoolType] = None,
+    n_process: int = None,
+    labels: List[str] = None,
+    ports: List[int] = None,
+    envs: List[Dict] = None,
+    external_address_schemes: List[str] = None,
+    enable_internal_addresses: List[bool] = None,
+    subprocess_start_method: str = None,
+    auto_recover: Union[str, bool] = "actor",
+    modules: List[str] = None,
+    suspend_sigint: bool = None,
+    use_uvloop: Union[str, bool] = "auto",
+    logging_conf: Union[Dict, None] = None,
+    on_process_down: Callable[[MainActorPoolType, str], None] = None,
+    on_process_recover: Callable[[MainActorPoolType, str], None] = None,
+    extra_conf: dict = None,
+    **kwargs,
+) -> MainActorPoolType:
+    from ... import dataframe, learn, remote, tensor
+
+    if n_process is None:
+        n_process = multiprocessing.cpu_count()
+    if labels and len(labels) != n_process + 1:
+        raise ValueError(
+            f"`labels` should be of size {n_process + 1}, got {len(labels)}"
+        )
+    if envs and len(envs) != n_process:
+        raise ValueError(f"`envs` should be of size {n_process}, got {len(envs)}")
+    if external_address_schemes and len(external_address_schemes) != n_process + 1:
+        raise ValueError(
+            f"`external_address_schemes` should be of size {n_process + 1}, "
+            f"got {len(external_address_schemes)}"
+        )
+    if enable_internal_addresses and len(enable_internal_addresses) != n_process + 1:
+        raise ValueError(
+            f"`enable_internal_addresses` should be of size {n_process + 1}, "
+            f"got {len(enable_internal_addresses)}"
+        )
+    elif not enable_internal_addresses:
+        enable_internal_addresses = [True] * (n_process + 1)
+    if auto_recover is True:
+        auto_recover = "actor"
+    if auto_recover not in ("actor", "process", False):
+        raise ValueError(
+            f'`auto_recover` should be one of "actor", "process", '
+            f"True or False, got {auto_recover}"
+        )
+    if use_uvloop == "auto":
+        try:
+            import uvloop  # noqa: F401 # pylint: disable=unused-variable
+
+            use_uvloop = True
+        except ImportError:
+            use_uvloop = False
+
+    modules = list(modules or []) + [
+        tensor.__name__,
+        dataframe.__name__,
+        learn.__name__,
+        remote.__name__,
+    ]
+
+    external_addresses = pool_cls.get_external_addresses(
+        address, n_process=n_process, ports=ports, schemes=external_address_schemes
+    )
+    actor_pool_config = ActorPoolConfig()
+    actor_pool_config.add_metric_configs(kwargs.get("metrics", {}))
+    # add main config
+    process_index_gen = pool_cls.process_index_gen(address)
+    main_process_index = next(process_index_gen)
+    main_internal_address = (
+        pool_cls.gen_internal_address(main_process_index, external_addresses[0])
+        if enable_internal_addresses[0]
+        else None
+    )
+    actor_pool_config.add_pool_conf(
+        main_process_index,
+        labels[0] if labels else None,
+        main_internal_address,
+        external_addresses[0],
+        modules=modules,
+        suspend_sigint=suspend_sigint,
+        use_uvloop=use_uvloop,
+        logging_conf=logging_conf,
+        kwargs=kwargs,
+    )
+    # add sub configs
+    for i in range(n_process):
+        sub_process_index = next(process_index_gen)
+        internal_address = (
+            pool_cls.gen_internal_address(sub_process_index, external_addresses[i + 1])
+            if enable_internal_addresses[i + 1]
+            else None
+        )
+        actor_pool_config.add_pool_conf(
+            sub_process_index,
+            labels[i + 1] if labels else None,
+            internal_address,
+            external_addresses[i + 1],
+            env=envs[i] if envs else None,
+            modules=modules,
+            suspend_sigint=suspend_sigint,
+            use_uvloop=use_uvloop,
+            logging_conf=logging_conf,
+            kwargs=kwargs,
+        )
+    actor_pool_config.add_comm_config(extra_conf)
+
+    pool: MainActorPoolType = await pool_cls.create(
+        {
+            "actor_pool_config": actor_pool_config,
+            "process_index": main_process_index,
+            "start_method": subprocess_start_method,
+            "auto_recover": auto_recover,
+            "on_process_down": on_process_down,
+            "on_process_recover": on_process_recover,
+        }
+    )
+    await pool.start()
+    return pool
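+
+
+# A minimal usage sketch, assuming a concrete ``MainActorPoolBase`` subclass
+# (called ``MainActorPool`` here purely for illustration; a backend implementation
+# is expected to provide one):
+#
+#     async def _example():
+#         pool = await create_actor_pool(
+#             "127.0.0.1:11111", pool_cls=MainActorPool, n_process=2
+#         )
+#         try:
+#             ...  # interact with the pool, e.g. create actors via an actor context
+#         finally:
+#             await pool.stop()
+#
+# ``create_actor_pool`` already calls ``pool.start()`` before returning, so the
+# returned pool is ready to serve requests.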
diff --git a/python/xorbits/_mars/oscar/backends/ray/__init__.py b/python/xorbits/_mars/oscar/backends/ray/__init__.py
new file mode 100644
index 000000000..c566a786b
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .backend import RayActorBackend
diff --git a/python/xorbits/_mars/oscar/backends/ray/backend.py b/python/xorbits/_mars/oscar/backends/ray/backend.py
new file mode 100644
index 000000000..2873eb202
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/backend.py
@@ -0,0 +1,110 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import Dict
+
+from ....utils import Timer, lazy_import
+from ...backend import BaseActorBackend, register_backend
+from ..context import MarsActorContext
+from .driver import RayActorDriver
+from .pool import RayMainPool
+from .utils import get_placement_group, process_address_to_placement
+
+ray = lazy_import("ray")
+
+__all__ = ["RayActorBackend"]
+
+logger = logging.getLogger(__name__)
+
+
+@register_backend
+class RayActorBackend(BaseActorBackend):
+    @staticmethod
+    def name():
+        return "ray"
+
+    @staticmethod
+    def get_context_cls():
+        return MarsActorContext
+
+    @staticmethod
+    def get_driver_cls():
+        return RayActorDriver
+
+    @classmethod
+    async def _create_ray_pools(cls, address: str, n_process: int = None, **kwargs):
+        # pop `n_io_process` from kwargs as ray doesn't need this
+        kwargs.pop("n_io_process", 0)
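+        # the address is assumed to follow the ray backend convention
+        # "ray://<placement_group_name>/<bundle_index>/<process_index>"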
+        pg_name, bundle_index, _ = process_address_to_placement(address)
+        from .pool import RayMainActorPool
+
+        pool_addresses = RayMainActorPool.get_external_addresses(address, n_process)
+        assert pool_addresses[0] == address
+        pg = get_placement_group(pg_name) if pg_name else None
+        num_cpus = kwargs.get("main_pool_cpus", 0)
+        sub_pools = {
+            sub_pool_address: RayMainActorPool.create_sub_pool(
+                address, sub_pool_address
+            )
+            for sub_pool_address in pool_addresses[1:]
+        }
+        actor_handle = (
+            ray.remote(RayMainPool)
+            .options(
+                num_cpus=num_cpus,
+                name=address,
+                max_concurrency=10000000,  # By default, 1000 tasks can be running concurrently.
+                max_restarts=-1,  # Auto restarts by ray
+                placement_group=pg,
+                placement_group_bundle_index=bundle_index,
+                placement_group_capture_child_tasks=False,
+            )
+            .remote(address, n_process, sub_pools, **kwargs)
+        )
+        pool_handle = RayPoolHandle(actor_handle, sub_pools)
+        return pool_handle
+
+    @classmethod
+    async def create_actor_pool(cls, address: str, n_process: int = None, **kwargs):
+        with Timer() as timer:
+            pool_handle = await cls._create_ray_pools(address, n_process, **kwargs)
+        logger.info(
+            "Submit create actor pool %s took %s seconds.",
+            pool_handle.main_pool,
+            timer.duration,
+        )
+        with Timer() as timer:
+            await pool_handle.main_pool.start.remote()
+        logger.info(
+            "Start actor pool %s took %s seconds.",
+            pool_handle.main_pool,
+            timer.duration,
+        )
+        return pool_handle
+
+
+class RayPoolHandle:
+    def __init__(
+        self,
+        main_pool: "ray.actor.ActorHandle",
+        sub_pools: Dict[str, "ray.actor.ActorHandle"],
+    ):
+        self.main_pool = main_pool
+        # Hold sub_pool actor handles to avoid gc.
+        self.sub_pools = sub_pools
+
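+    # Any other attribute access is forwarded to the main pool actor handle, so callers
+    # can write e.g. pool_handle.start.remote() instead of pool_handle.main_pool.start.remote().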
+    def __getattr__(self, item):
+        if item in ("main_pool", "sub_pools"):  # pragma: no cover
+            return object.__getattribute__(self, item)
+        return getattr(self.main_pool, item)
diff --git a/python/xorbits/_mars/oscar/backends/ray/communication.py b/python/xorbits/_mars/oscar/backends/ray/communication.py
new file mode 100644
index 000000000..dc8e56a9c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/communication.py
@@ -0,0 +1,552 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import concurrent.futures as futures
+import itertools
+import logging
+import time
+from abc import ABC
+from collections import namedtuple
+from dataclasses import dataclass
+from typing import Any, Callable, Coroutine, Dict, List, Set, Tuple, Type
+from urllib.parse import urlparse
+
+from ....metrics import Metrics
+from ....oscar.profiling import ProfilingData
+from ....serialization import deserialize, serialize
+from ....utils import Timer, classproperty, implements, lazy_import, lazy_import_on_load
+from ...debug import debug_async_timeout
+from ...errors import ServerClosed
+from ..communication.base import Channel, ChannelType, Client, Server
+from ..communication.core import register_client, register_server
+from ..communication.errors import ChannelClosed
+from .utils import report_event
+
+ray = lazy_import("ray")
+logger = logging.getLogger(__name__)
+
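+# A ChannelID uniquely identifies a logical channel: the client's local address, a
+# per-client id, the channel index, and the destination address. It accompanies every
+# __on_ray_recv__ call so the server can route the message to the right channel.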
+ChannelID = namedtuple(
+    "ChannelID", ["local_address", "client_id", "channel_index", "dest_address"]
+)
+
+SERIALIZATION_TIMEOUT_MILLS = 1000
+DESERIALIZATION_TIMEOUT_MILLS = 1000
+
+
+def msg_to_simple_str(msg):  # pragma: no cover
+    """An helper that prints message structure without generate a big str."""
+    from ..message import SendMessage, _MessageBase
+
+    if type(msg) == _ArgWrapper:
+        msg = msg.message
+    if isinstance(msg, SendMessage):
+        return f"{str(type(msg).__name__)}(actor_ref={msg.actor_ref}, content={msg_to_simple_str(msg.content)})"
+    if isinstance(msg, _MessageBase):
+        return str(msg)
+    if isinstance(msg, List):
+        part_str = ", ".join([msg_to_simple_str(item) for item in msg[:5]])
+        return f"List<{part_str}...{len(msg)}>"
+    if isinstance(msg, Set):
+        part_str = ", ".join([msg_to_simple_str(item) for item in list(msg)[:5]])
+        return f"Set<{part_str}...{len(msg)}>"
+    if isinstance(msg, Tuple):
+        part_str = ", ".join([msg_to_simple_str(item) for item in msg[:5]])
+        return f"Tuple<{part_str}...{len(msg)}>"
+    if isinstance(msg, Dict):
+        part_str = []
+        it = iter(msg.items())
+        try:
+            while len(part_str) < 5:
+                entry = next(it)
+                part_str.append(
+                    f"k={msg_to_simple_str(entry[0])}, v={msg_to_simple_str(entry[1])}"
+                )
+        except StopIteration:
+            pass
+        part_str = ", ".join(part_str)
+        return f"Dict<{part_str}...{len(msg)}>"
+    if isinstance(msg, (str, float, int, bool)):
+        return "{!s:.50}".format(msg)
+    return str(type(msg))
+
+
+def _argwrapper_unpickler(serialized_message):
+    return _ArgWrapper(deserialize(*serialized_message))
+
+
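+# Wrapping a message in _ArgWrapper makes ray pickle it via __reduce__, which routes the
+# payload through mars' serialize()/deserialize(); the patched SerializationContext below
+# can then meter message sizes and (de)serialization timings.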
+@dataclass
+class _ArgWrapper:
+    message: Any = None
+
+    def __init__(self, message):
+        self.message = message
+
+    def __reduce__(self):
+        return _argwrapper_unpickler, (
+            serialize(self.message, context={"serializer": "ray"}),
+        )
+
+
+@lazy_import_on_load(ray)
+def _init_ray_serialization_deserialization():
+    _ray_serialize = ray.serialization.SerializationContext.serialize
+    _ray_deserialize_object = ray.serialization.SerializationContext._deserialize_object
+    serialized_bytes_counter = Metrics.counter(
+        "mars.channel_serialized_bytes",
+        "The bytes serialized by mars ray channel.",
+    )
+    deserialized_bytes_counter = Metrics.counter(
+        "mars.channel_deserialized_bytes",
+        "The bytes deserialized by mars ray channel.",
+    )
+    serialization_time_mills = Metrics.counter(
+        "mars.channel_serialization_time_mills",
+        "The time used by mars ray channel serialization.",
+    )
+    deserialization_time_mills = Metrics.counter(
+        "mars.channel_deserialization_time_mills",
+        "The time used by mars ray channel deserialization.",
+    )
+
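+    # Monkey-patch ray's SerializationContext so that _ArgWrapper messages crossing the
+    # channel are counted and timed; oversized or slow messages are logged and reported.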
+    def _serialize(self, value):
+        if type(value) is _ArgWrapper:  # pylint: disable=unidiomatic-typecheck
+            message = value.message
+            with Timer() as timer:
+                serialized_object = _ray_serialize(self, value)
+                bytes_length = serialized_object.total_bytes
+                serialized_bytes_counter.record(bytes_length)
+            serialization_time_mills.record(timer.duration * 1000)
+            if bytes_length > 1 * 1024 * 1024 * 1024:  # pragma: no cover
+                logger.warning(
+                    "Serialize large message (%s bytes > 1GB) through ray channel, message: %s.",
+                    bytes_length,
+                    msg_to_simple_str(message),
+                )
+            if timer.duration * 1000 > SERIALIZATION_TIMEOUT_MILLS:  # pragma: no cover
+                report_event(
+                    "WARNING",
+                    "SERIALIZATION_TIMEOUT",
+                    f"Serialization took {timer.duration} seconds for {bytes_length} sized message {msg_to_simple_str(message)}.",
+                )
+            try:
+                if message.profiling_context is not None:
+                    task_id = message.profiling_context.task_id
+                    ProfilingData[task_id, "serialization"].inc(
+                        "serialize", timer.duration
+                    )
+            except AttributeError:  # pragma: no cover
+                logger.info(
+                    "Profiling serialization got error, the send "
+                    "message %s may not be an instance of message",
+                    type(message),
+                )
+        else:
+            serialized_object = _ray_serialize(self, value)
+        return serialized_object
+
+    def _deserialize_object(self, data, metadata, object_ref):
+        start_time = time.time()
+        bytes_length = 0
+        if data:
+            bytes_length = len(data)
+            deserialized_bytes_counter.record(bytes_length)
+        value = _ray_deserialize_object(self, data, metadata, object_ref)
+        duration = time.time() - start_time
+        deserialization_time_mills.record(duration * 1000)
+        if duration * 1000 > DESERIALIZATION_TIMEOUT_MILLS:  # pragma: no cover
+            report_event(
+                "WARNING",
+                "DESERIALIZATION_TIMEOUT",
+                f"Deserialization took {duration} seconds for "
+                f"{bytes_length} sized msg {msg_to_simple_str(value)}",
+            )
+        if type(value) is _ArgWrapper:  # pylint: disable=unidiomatic-typecheck
+            message = value.message
+            try:
+                if message.profiling_context is not None:
+                    task_id = message.profiling_context.task_id
+                    ProfilingData[task_id, "serialization"].inc(
+                        "deserialize", time.time() - start_time
+                    )
+            except AttributeError:  # pragma: no cover
+                logger.info(
+                    "Profiling serialization got error, the recv "
+                    "message %s may not be an instance of message",
+                    type(message),
+                )
+        return value
+
+    ray.serialization.SerializationContext.serialize = _serialize
+    ray.serialization.SerializationContext._deserialize_object = _deserialize_object
+
+
+class RayChannelException(Exception):
+    def __init__(self, exc_type, exc_value: BaseException, exc_traceback):
+        self.exc_type = exc_type
+        self.exc_value = exc_value
+        self.exc_traceback = exc_traceback
+
+
+class RayChannelBase(Channel, ABC):
+    """
+    Channel for communications between ray processes.
+    """
+
+    __slots__ = "_channel_index", "_channel_id", "_closed"
+
+    name = "ray"
+    _channel_index_gen = itertools.count()
+
+    def __init__(
+        self,
+        local_address: str = None,
+        dest_address: str = None,
+        channel_index: int = None,
+        channel_id: ChannelID = None,
+        compression=None,
+    ):
+        super().__init__(
+            local_address=local_address,
+            dest_address=dest_address,
+            compression=compression,
+        )
+        self._channel_index = channel_index or next(self._channel_index_gen)
+        self._channel_id = channel_id or ChannelID(
+            local_address, _gen_client_id(), self._channel_index, dest_address
+        )
+        self._closed = asyncio.Event()
+
+    @property
+    def channel_id(self) -> ChannelID:
+        return self._channel_id
+
+    @property
+    @implements(Channel.type)
+    def type(self) -> ChannelType:
+        return ChannelType.ray
+
+    @implements(Channel.close)
+    async def close(self):
+        self._closed.set()
+
+    @property
+    @implements(Channel.closed)
+    def closed(self) -> bool:
+        return self._closed.is_set()
+
+
+class RayClientChannel(RayChannelBase):
+    """
+    A channel from a ray driver/actor to a ray actor. The client channel receives via the ray call reply.
+    """
+
+    __slots__ = "_peer_actor", "_done", "_todo"
+
+    def __init__(
+        self,
+        dest_address: str = None,
+        channel_index: int = None,
+        channel_id: ChannelID = None,
+        compression=None,
+    ):
+        super().__init__(None, dest_address, channel_index, channel_id, compression)
+        # ray actor should be created with the address as the name.
+        self._peer_actor: "ray.actor.ActorHandle" = ray.get_actor(dest_address)
+        self._done = asyncio.Queue()
+        self._todo = set()
+
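+    # send() fires a __on_ray_recv__ call on the peer actor; _submit_task keeps the
+    # resulting object ref alive and recv() later pops the first completed reply.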
+    def _submit_task(self, message: Any, object_ref: "ray.ObjectRef"):
+        async def handle_task(message: Any, object_ref: "ray.ObjectRef"):
+            # use `%.500s` to avoid printing overly long messages
+            with debug_async_timeout(
+                "ray_object_retrieval_timeout",
+                "Message that client sent to actor %s is %.500s and object_ref is %s",
+                self.dest_address,
+                message,
+                object_ref,
+            ):
+                try:
+                    result = await object_ref
+                except Exception as e:  # pragma: no cover
+                    # The error ClientObjectRef can't be formatted, so
+                    # we give it a string `ClientObjectRef` instead.
+                    try:
+                        object_ref_str = str(object_ref)
+                    except Exception:
+                        object_ref_str = "ClientObjectRef"
+                    logger.exception(
+                        "Get object %s from %s failed, got exception %s.",
+                        object_ref_str,
+                        self.dest_address,
+                        e,
+                    )
+                    raise
+            if isinstance(result, RayChannelException):
+                raise result.exc_value.with_traceback(result.exc_traceback)
+            return result.message
+
+        def _on_completion(future):
+            self._todo.remove(future)
+            self._done.put_nowait(future)
+
+        future = asyncio.ensure_future(handle_task(message, object_ref))
+        future.add_done_callback(_on_completion)
+        self._todo.add(future)
+
+    @implements(Channel.send)
+    async def send(self, message: Any):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot send message")
+        # Put ray object ref to todo queue
+        task = self._peer_actor.__on_ray_recv__.remote(
+            self.channel_id, _ArgWrapper(message)
+        )
+        self._submit_task(message, task)
+        await asyncio.sleep(0)
+
+    @implements(Channel.recv)
+    async def recv(self):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot recv message")
+        try:
+            # Wait first done.
+            future = await self._done.get()
+            return future.result()
+        except ray.exceptions.RayActorError:
+            if not self._closed.is_set():
+                # raise an EOFError as the SocketChannel does
+                raise EOFError("Server may be closed")
+        except (RuntimeError, ServerClosed) as e:  # pragma: no cover
+            if not self._closed.is_set():
+                raise e
+
+
+class RayServerChannel(RayChannelBase):
+    """
+    A channel from a ray actor to a ray driver/actor. Since a ray actor can't call the
+    ray driver, the server channel sends by replying to the pending ray call. Note that
+    a channel can't send multiple messages for one received message, or the extra send
+    would be taken as the next message's reply.
+    """
+
+    __slots__ = "_in_queue", "_out_queue", "_msg_recv_counter", "_msg_sent_counter"
+
+    def __init__(
+        self,
+        local_address: str = None,
+        channel_index: int = None,
+        channel_id: ChannelID = None,
+        compression=None,
+    ):
+        super().__init__(local_address, None, channel_index, channel_id, compression)
+        self._in_queue = asyncio.Queue()
+        self._out_queue = asyncio.Queue()
+        self._msg_recv_counter = 0
+        self._msg_sent_counter = 0
+
+    @implements(Channel.send)
+    async def send(self, message: Any):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot send message")
+        # The current process is a ray actor; we reply to the pending ray call to send
+        # the message to the ray driver/actor. Note that we can only send once per
+        # received message, otherwise the send would be taken as another message's reply.
+        await self._out_queue.put(message)
+        self._msg_sent_counter += 1
+        assert (
+            self._msg_sent_counter <= self._msg_recv_counter
+        ), "RayServerChannel channel doesn't support send multiple replies for one message."
+
+    @implements(Channel.recv)
+    async def recv(self):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot write message")
+        try:
+            return await self._in_queue.get()
+        except RuntimeError:  # pragma: no cover
+            if not self._closed.is_set():
+                raise
+
+    async def __on_ray_recv__(self, message_wrapper):
+        """This method will be invoked when current process is a ray actor rather than a ray driver"""
+        self._msg_recv_counter += 1
+        await self._in_queue.put(message_wrapper.message)
+        result_message = await self._out_queue.get()
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed")
+        return _ArgWrapper(result_message)
+
+    @implements(Channel.close)
+    async def close(self):
+        await super().close()
+        self._out_queue.put_nowait(None)
+
+
+@register_server
+class RayServer(Server):
+    __slots__ = "_closed", "_channels", "_tasks"
+
+    scheme = "ray"
+    _server_instance = None
+    _ray_actor_started = False
+
+    def __init__(self, address, channel_handler: Callable[[Channel], Coroutine] = None):
+        super().__init__(address, channel_handler)
+        self._closed = asyncio.Event()
+        self._channels: Dict[ChannelID, RayServerChannel] = dict()
+        self._tasks: Dict[ChannelID, asyncio.Task] = dict()
+
+    @classproperty
+    @implements(Server.client_type)
+    def client_type(self) -> Type["Client"]:
+        return RayClient
+
+    @property
+    @implements(Server.channel_type)
+    def channel_type(self) -> ChannelType:
+        return ChannelType.ray
+
+    @classmethod
+    def set_ray_actor_started(cls):
+        cls._ray_actor_started = True
+
+    @classmethod
+    def is_ray_actor_started(cls):
+        return cls._ray_actor_started
+
+    @staticmethod
+    @implements(Server.create)
+    async def create(config: Dict) -> "RayServer":
+        if not RayServer.is_ray_actor_started():
+            logger.warning(
+                "Current process is not a ray actor, the ray server "
+                "will not receive messages from clients."
+            )
+        assert RayServer._server_instance is None
+        config = config.copy()
+        address = config.pop("address")
+        handle_channel = config.pop("handle_channel")
+        if urlparse(address).scheme != RayServer.scheme:  # pragma: no cover
+            raise ValueError(
+                f"Address for RayServer "
+                f'should start with "ray://", '
+                f"got {address}"
+            )
+        if config:  # pragma: no cover
+            raise TypeError(
+                f"Creating RayServer got unexpected " f'arguments: {",".join(config)}'
+            )
+        server = RayServer(address, handle_channel)
+        RayServer._server_instance = server
+        return server
+
+    @classmethod
+    def get_instance(cls):
+        return cls._server_instance
+
+    @classmethod
+    def clear(cls):
+        cls._server_instance = None
+        cls._ray_actor_started = False
+
+    @implements(Server.start)
+    async def start(self):
+        # nothing to do for the ray server
+        pass
+
+    @implements(Server.join)
+    async def join(self, timeout=None):
+        wait_coro = self._closed.wait()
+        try:
+            await asyncio.wait_for(wait_coro, timeout=timeout)
+        except (futures.TimeoutError, asyncio.TimeoutError):  # pragma: no cover
+            pass
+
+    @implements(Server.on_connected)
+    async def on_connected(self, *args, **kwargs):
+        channel = args[0]
+        assert isinstance(channel, RayServerChannel)
+        if kwargs:  # pragma: no cover
+            raise TypeError(
+                f"{type(self).__name__} got unexpected "
+                f'arguments: {",".join(kwargs)}'
+            )
+        await self.channel_handler(channel)
+
+    @implements(Server.stop)
+    async def stop(self):
+        self._closed.set()
+        for task in self._tasks.values():
+            task.cancel()
+        self._tasks = dict()
+        for channel in self._channels.values():
+            await channel.close()
+        self._channels = dict()
+        self.clear()
+
+    @property
+    @implements(Server.stopped)
+    def stopped(self) -> bool:
+        return self._closed.is_set()
+
+    async def __on_ray_recv__(self, channel_id: ChannelID, message):
+        if self.stopped:
+            raise ServerClosed(
+                f"Remote server {self.address} closed, but got message {message} "
+                f"from channel {channel_id}"
+            )
+        channel = self._channels.get(channel_id)
+        if not channel:
+            _, _, peer_channel_index, peer_dest_address = channel_id
+            channel = RayServerChannel(
+                peer_dest_address, peer_channel_index, channel_id
+            )
+            self._channels[channel_id] = channel
+            self._tasks[channel_id] = asyncio.create_task(self.on_connected(channel))
+        return await channel.__on_ray_recv__(message)
+
+
+@register_client
+class RayClient(Client):
+    __slots__ = ()
+
+    scheme = RayServer.scheme
+
+    def __init__(self, local_address: str, dest_address: str, channel: Channel):
+        super().__init__(local_address, dest_address, channel)
+
+    @staticmethod
+    @implements(Client.connect)
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        if urlparse(dest_address).scheme != RayServer.scheme:  # pragma: no cover
+            raise ValueError(
+                f'Destination address should start with "ray://" '
+                f"for RayClient, got {dest_address}"
+            )
+        client_channel = RayClientChannel(dest_address)
+        client = RayClient(local_address, dest_address, client_channel)
+        return client
+
+    @implements(Client.close)
+    async def close(self):
+        await super().close()
+
+
+def _gen_client_id():
+    import uuid
+
+    return uuid.uuid4().hex
diff --git a/python/xorbits/_mars/oscar/backends/ray/driver.py b/python/xorbits/_mars/oscar/backends/ray/driver.py
new file mode 100644
index 000000000..9fe7ddc09
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/driver.py
@@ -0,0 +1,92 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import logging
+import os
+from numbers import Number
+from typing import Dict
+
+from ....utils import lazy_import
+from ...driver import BaseActorDriver
+from .utils import addresses_to_placement_group_info, process_placement_to_address
+
+ray = lazy_import("ray")
+logger = logging.getLogger(__name__)
+
+
+class RayActorDriver(BaseActorDriver):
+    _cluster_info = dict()
+
+    @classmethod
+    def setup_cluster(cls, address_to_resources: Dict[str, Dict[str, Number]]):
+        logger.info("Setup cluster with %s", address_to_resources)
+        # Note: Deep copy the dict to keep the original values, because `bundles`
+        # returned by `addresses_to_placement_group_info()` will be modified
+        # by `ray.util.placement_group()`
+        original_address_to_resources = copy.deepcopy(address_to_resources)
+        pg_name, bundles = addresses_to_placement_group_info(address_to_resources)
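+        # e.g. {"ray://my_pg/0": {"CPU": 2}, "ray://my_pg/1": {"CPU": 4}} maps to the
+        # placement group name "my_pg" with bundles [{"CPU": 2}, {"CPU": 4}],
+        # one bundle per node address ("my_pg" is a hypothetical name).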
+        logger.info("Creating placement group %s with bundles %s.", pg_name, bundles)
+        pg = ray.util.placement_group(name=pg_name, bundles=bundles, strategy="SPREAD")
+        create_pg_timeout = 120
+        done, _ = ray.wait([pg.ready()], timeout=create_pg_timeout)
+        if not done:  # pragma: no cover
+            raise Exception(
+                f"""Can't create placement group {pg.bundle_specs} in {create_pg_timeout} seconds"""
+            )
+        cluster_info = {
+            "original_address_to_resources": original_address_to_resources,
+            "address_to_resources": address_to_resources,
+            "pg_name": pg_name,
+            "pg_group": pg,
+            "main_pool_handles": [],  # Hold actor_handle to avoid actor being freed.
+        }
+        logger.info("Create placement group success.")
+        cls._cluster_info = cluster_info
+
+    @classmethod
+    def stop_cluster(cls):
+        logger.info("Stopping cluster %s.", cls._cluster_info)
+        if not cls._cluster_info:  # pragma: no cover
+            return
+        pg_name = cls._cluster_info["pg_name"]
+        pg = cls._cluster_info["pg_group"]
+        for index, bundle_spec in enumerate(pg.bundle_specs):
+            # The main pool takes one extra process.
+            # If the supervisor is created on the same node as the worker, it takes one more.
+            n_process = int(bundle_spec["CPU"]) + 2
+            for process_index in reversed(range(n_process)):
+                address = process_placement_to_address(
+                    pg_name, index, process_index=process_index
+                )
+                try:
+                    ray_actor = ray.get_actor(address)
+                    if "COV_CORE_SOURCE" in os.environ:  # pragma: no cover
+                        # must clean up first, or coverage info will be lost.
+                        # must save the local reference until this is fixed:
+                        # https://github.com/ray-project/ray/issues/7815
+                        ray.get(ray_actor.cleanup.remote())
+                    ray.kill(ray_actor, no_restart=True)
+                    while True:
+                        try:
+                            ray.get(ray_actor.wait.remote(30))
+                            logger.warning(
+                                "Waiting actor %s to be killed.", ray_actor
+                            )  # pragma: no cover
+                        except ray.exceptions.RayActorError:
+                            break
+                except:  # noqa: E722  # nosec  # pylint: disable=bare-except
+                    pass
+        ray.util.remove_placement_group(pg)
+        cls._cluster_info = dict()
+        logger.info("Stopped cluster %s.", pg_name)
diff --git a/python/xorbits/_mars/oscar/backends/ray/pool.py b/python/xorbits/_mars/oscar/backends/ray/pool.py
new file mode 100644
index 000000000..9d26e4dce
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/pool.py
@@ -0,0 +1,396 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import datetime
+import inspect
+import itertools
+import logging
+import os
+import sys
+import threading
+import time
+import types
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import List, Optional
+
+from ....utils import ensure_coverage, lazy_import, retry_callable
+from ... import ServerClosed
+from ..config import ActorPoolConfig
+from ..message import CreateActorMessage
+from ..pool import (
+    AbstractActorPool,
+    MainActorPoolBase,
+    SubActorPoolBase,
+    _register_message_handler,
+    create_actor_pool,
+)
+from ..router import Router
+from .communication import ChannelID, RayChannelException, RayServer
+from .utils import (
+    get_placement_group,
+    kill_and_wait,
+    process_address_to_placement,
+    process_placement_to_address,
+)
+
+ray = lazy_import("ray")
+logger = logging.getLogger(__name__)
+
+
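+# Lifecycle of a ray pool actor: INIT when constructed, POOL_READY once start() has
+# created the underlying actor pool, SERVICE_READY after mark_service_ready().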
+class RayPoolState(Enum):
+    INIT = 0
+    POOL_READY = 1
+    SERVICE_READY = 2
+
+
+@_register_message_handler
+class RayMainActorPool(MainActorPoolBase):
+    @classmethod
+    def process_index_gen(cls, address):
+        _, __, process_index = process_address_to_placement(address)
+        return itertools.count(process_index)
+
+    @classmethod
+    def get_external_addresses(
+        cls,
+        address: str,
+        n_process: int = None,
+        ports: List[int] = None,
+        schemes: List[str] = None,
+    ):
+        assert (
+            not ports
+        ), f"ports should be none when actor pool running on ray, but got {ports}"
+        pg_name, bundle_index, process_index = process_address_to_placement(address)
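+        # e.g. for address "ray://my_pg/0/0" and n_process=2 this returns
+        # ["ray://my_pg/0/0", "ray://my_pg/0/1", "ray://my_pg/0/2"]: the main pool
+        # address followed by one address per sub pool ("my_pg" is hypothetical).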
+        return [
+            process_placement_to_address(pg_name, bundle_index, process_index + i)
+            for i in range(n_process + 1)
+        ]
+
+    @classmethod
+    def gen_internal_address(
+        cls, process_index: int, external_address: str = None
+    ) -> str:
+        return external_address
+
+    @classmethod
+    def create_sub_pool(
+        cls,
+        main_pool_address,
+        sub_pool_address,
+    ):
+        pg_name, bundle_index, process_index = process_address_to_placement(
+            sub_pool_address
+        )
+        pg = get_placement_group(pg_name) if pg_name else None
+        # Hold actor_handle to avoid actor being freed.
+        actor_handle = (
+            ray.remote(RaySubPool)
+            .options(
+                num_cpus=0,
+                name=sub_pool_address,
+                max_concurrency=10000000,  # Ray's default (1000) would throttle concurrent calls.
+                max_restarts=-1,  # Auto restarts by ray
+                placement_group=pg,
+                placement_group_bundle_index=bundle_index,
+                placement_group_capture_child_tasks=False,
+            )
+            .remote(main_pool_address, process_index)
+        )
+        return actor_handle
+
+    @classmethod
+    async def start_sub_pool(
+        cls,
+        actor_pool_config: ActorPoolConfig,
+        process_index: int,
+        start_method: str = None,
+    ):
+        config = actor_pool_config.get_pool_config(process_index)
+        external_addresses = config["external_address"]
+        assert (
+            len(external_addresses) == 1
+        ), f"Ray pool allows only one external address but got {external_addresses}"
+        external_address = external_addresses[0]
+        pg_name, bundle_index, _process_index = process_address_to_placement(
+            external_address
+        )
+        assert process_index == _process_index, (
+            f"process_index {process_index} is not consistent with index {_process_index} "
+            f"in external_address {external_address}"
+        )
+        actor_handle = config["kwargs"]["sub_pool_handles"][external_address]
+        state = await retry_callable(
+            actor_handle.state.remote, ex_type=ray.exceptions.RayActorError, sync=False
+        )()
+        if state is RayPoolState.SERVICE_READY:  # pragma: no cover
+            logger.info("Ray sub pool %s is alive, kill it first.", external_address)
+            await kill_and_wait(actor_handle, no_restart=False)
+            # Wait sub pool process restarted.
+            await retry_callable(
+                actor_handle.state.remote,
+                ex_type=ray.exceptions.RayActorError,
+                sync=False,
+            )()
+        logger.info("Start to start ray sub pool %s.", external_address)
+        create_sub_pool_timeout = 120
+        try:
+            await asyncio.wait_for(
+                actor_handle.set_actor_pool_config.remote(actor_pool_config),
+                timeout=create_sub_pool_timeout,
+            )
+        except asyncio.TimeoutError:  # pragma: no cover
+            msg = (
+                f"Cannot start ray sub pool {external_address} "
+                f"in {create_sub_pool_timeout} seconds."
+            )
+            logger.error(msg)
+            raise Exception(msg)
+        await actor_handle.start.remote()
+        logger.info("Start ray sub pool %s successfully.", external_address)
+        return actor_handle
+
+    @classmethod
+    async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
+        return [await t for t in create_pool_tasks], None
+
+    async def recover_sub_pool(self, address: str):
+        process = self.sub_processes[address]
+        # ray call will error when actor is restarting
+        await retry_callable(
+            process.state.remote, ex_type=ray.exceptions.RayActorError, sync=False
+        )()
+        await process.start.remote()
+
+        if self._auto_recover == "actor":
+            # need to recover all created actors
+            for _, message in self._allocated_actors[address].values():
+                create_actor_message: CreateActorMessage = message
+                await self.call(address, create_actor_message)
+            await process.mark_service_ready.remote()
+
+    async def kill_sub_pool(
+        self,
+        process: "ray.actor.ActorHandle",
+        force: bool = False,
+        no_restart: bool = False,
+    ):
+        logger.info("Start to kill ray sub pool %s", process)
+        await kill_and_wait(process, no_restart=no_restart)
+
+    async def is_sub_pool_alive(self, process: "ray.actor.ActorHandle"):
+        try:
+            if self._auto_recover == "process":
+                return await process.state.remote() in [
+                    RayPoolState.POOL_READY,
+                    RayPoolState.SERVICE_READY,
+                ]
+            else:
+                return await process.state.remote() == RayPoolState.SERVICE_READY
+        except Exception:
+            logger.info("Detected RaySubPool %s died", process)
+            return False
+
+
+@_register_message_handler
+class RaySubActorPool(SubActorPoolBase):
+    async def stop(self):
+        try:
+            # clean global router
+            Router.get_instance().remove_router(self._router)
+            await self._caller.stop()
+            self._servers = []
+        finally:
+            self._stopped.set()
+
+
+class RayPoolBase(ABC):
+    __slots__ = "_actor_pool", "_ray_server"
+
+    _actor_pool: Optional["AbstractActorPool"]
+    _state: RayPoolState = RayPoolState.INIT
+
+    def __new__(cls, *args, **kwargs):
+        if threading.current_thread() is threading.main_thread():
+            ensure_coverage()
+        # object.__new__ takes no extra arguments, so drop them here.
+        return super().__new__(cls)
+
+    def __init__(self):
+        self._actor_pool = None
+        self._ray_server = None
+        RayServer.set_ray_actor_started()
+
+    @abstractmethod
+    async def start(self):
+        """Start actor pool in ray actor"""
+
+    def _set_ray_server(self, actor_pool: AbstractActorPool):
+        ray_servers = [
+            server for server in actor_pool._servers if isinstance(server, RayServer)
+        ]
+        assert (
+            len(ray_servers) == 1
+        ), f"Ray only support single server but got {ray_servers}."
+        self._ray_server = ray_servers[0]
+
+    async def __on_ray_recv__(self, channel_id: ChannelID, message):
+        """Method for communication based on ray actors"""
+        try:
+            if self._ray_server is None:
+                raise ServerClosed(f"Remote server {channel_id.dest_address} closed")
+            return await self._ray_server.__on_ray_recv__(channel_id, message)
+        except Exception:  # pragma: no cover
+            return RayChannelException(*sys.exc_info())
+
+    async def actor_pool(self, attribute, *args, **kwargs):
+        attr = getattr(self._actor_pool, attribute)
+        if isinstance(attr, types.MethodType):
+            if inspect.iscoroutinefunction(attr):
+                return await attr(*args, **kwargs)
+            return attr(*args, **kwargs)
+        else:
+            return attr
+
+    def state(self):
+        return self._state
+
+    @staticmethod
+    def getpid():
+        return os.getpid()
+
+    async def wait(self, seconds):
+        await asyncio.sleep(seconds)
+
+    def cleanup(self):
+        logger.info("Cleaning up %s of process %s now", self, os.getpid())
+        try:
+            from pytest_cov.embed import cleanup
+
+            cleanup()
+        except ImportError:  # pragma: no cover
+            pass
+
+
+class RayMainPool(RayPoolBase):
+    _actor_pool: RayMainActorPool
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self._args = args
+        self._kwargs = kwargs
+        self._start_timestamp = time.time_ns()
+
+    async def start(self):
+        # Create the mars pool outside the constructor so that a failure here does not
+        # fail ray actor creation, where ray cannot surface the creation exception.
+        address, n_process, sub_pool_handles = self._args
+        assert (
+            self._state == RayPoolState.INIT
+        ), f"The pool {address} is already started, current state is {self._state}"
+        self._actor_pool = await create_actor_pool(
+            address,
+            n_process=n_process,
+            pool_cls=RayMainActorPool,
+            sub_pool_handles=sub_pool_handles,
+            **self._kwargs,
+        )
+        self._set_ray_server(self._actor_pool)
+        self._state = RayPoolState.POOL_READY
+        logger.info("Started main pool %s with %s processes.", address, n_process)
+
+    async def mark_service_ready(self):
+        results = []
+        for _, sub_pool in self._actor_pool.sub_processes.items():
+            r = sub_pool.mark_service_ready.remote()
+            results.append(r)
+        await asyncio.gather(*results)
+        self._state = RayPoolState.SERVICE_READY
+        await self._actor_pool.start_monitor()
+
+    async def alive(self):
+        await asyncio.sleep(30)
+        return self._start_timestamp
+
+
+class RaySubPool(RayPoolBase):
+    _actor_pool: RaySubActorPool
+
+    def __init__(self, *args):
+        super().__init__()
+        self._args = args
+        self._actor_pool_config = None
+        self._check_alive_task = None
+        self._main_pool_start_timestamp = None
+
+    def set_actor_pool_config(self, actor_pool_config):
+        self._actor_pool_config = actor_pool_config
+
+    async def start(self):
+        # Create the mars pool outside the constructor so that a failure here does not
+        # fail ray actor creation, where ray cannot surface the creation exception.
+        main_pool_address, process_index = self._args
+        logger.info(
+            "Start to init sub pool %s for main pool %s.",
+            process_index,
+            main_pool_address,
+        )
+        main_pool = ray.get_actor(main_pool_address)
+        self._check_alive_task = asyncio.create_task(
+            self.check_main_pool_alive(main_pool)
+        )
+        if self._actor_pool_config is None:
+            self._actor_pool_config = await main_pool.actor_pool.remote("_config")
+        pool_config = self._actor_pool_config.get_pool_config(process_index)
+        sub_pool_address = pool_config["external_address"]
+        assert (
+            self._state == RayPoolState.INIT
+        ), f"The pool {sub_pool_address} is already started, current state is {self._state}"
+        env = pool_config["env"]
+        if env:  # pragma: no cover
+            os.environ.update(env)
+        self._actor_pool = await RaySubActorPool.create(
+            {
+                "actor_pool_config": self._actor_pool_config,
+                "process_index": process_index,
+            }
+        )
+        self._set_ray_server(self._actor_pool)
+        await self._actor_pool.start()
+        asyncio.create_task(self._actor_pool.join())
+        self._state = RayPoolState.POOL_READY
+        logger.info("Started sub pool %s.", sub_pool_address)
+
+    def mark_service_ready(self):
+        self._state = RayPoolState.SERVICE_READY
+
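+    # The main pool's alive() returns its start timestamp after a ~30s sleep; if the
+    # timestamp changes (the main pool restarted) or the call raises (the main pool
+    # died), this sub pool process exits immediately.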
+    async def check_main_pool_alive(self, main_pool):
+        try:
+            main_pool_start_timestamp = await main_pool.alive.remote()
+            if self._main_pool_start_timestamp is None:
+                self._main_pool_start_timestamp = main_pool_start_timestamp
+            if (
+                main_pool_start_timestamp != self._main_pool_start_timestamp
+            ):  # pragma: no cover
+                logger.error(
+                    "Main pool %s has restarted at %s, exiting current sub pool now.",
+                    main_pool,
+                    datetime.datetime.fromtimestamp(main_pool_start_timestamp / 1e9),
+                )
+                os._exit(0)
+        except:  # noqa: E722  # pylint: disable=bare-except  # pragma: no cover
+            logger.exception(
+                "Main pool %s has exited, exit current sub pool now.", main_pool
+            )
+            os._exit(0)
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/__init__.py b/python/xorbits/_mars/oscar/backends/ray/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/test_communication.py b/python/xorbits/_mars/oscar/backends/ray/tests/test_communication.py
new file mode 100644
index 000000000..a49314710
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/test_communication.py
@@ -0,0 +1,145 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import inspect
+
+import pytest
+
+from .....tests.core import require_ray
+from .....utils import ensure_coverage, lazy_import
+from ....core import ActorRef
+from ....errors import ServerClosed
+from ...communication.base import ChannelType
+from ...message import SendMessage
+from ..communication import Channel, ChannelID, RayClient, RayServer, msg_to_simple_str
+
+ray = lazy_import("ray")
+
+
+class ServerActor:
+    def __new__(cls, *args, **kwargs):
+        ensure_coverage()
+        # object.__new__ takes no extra arguments, so drop them here.
+        return super().__new__(cls)
+
+    def __init__(self, address):
+        self.address = address
+        self.server = None
+
+    async def start(self):
+        RayServer.set_ray_actor_started()
+        self.server = await RayServer.create(
+            {"address": self.address, "handle_channel": self.on_new_channel}
+        )
+
+    async def on_new_channel(self, channel: Channel):
+        while True:
+            try:
+                message = await channel.recv()
+                await channel.send(message)
+            except EOFError:
+                # no data to read, close the channel
+                await channel.close()
+                return
+            await asyncio.sleep(0.1)
+
+    async def __on_ray_recv__(self, channel_id: ChannelID, message):
+        """Method for communication based on ray actors"""
+        return await self.server.__on_ray_recv__(channel_id, message)
+
+    async def server(self, method_name, *args, **kwargs):
+        result = getattr(self.server, method_name)(*args, **kwargs)
+        if inspect.iscoroutine(result):
+            result = await result
+        return result
+
+
+class ServerCallActor(ServerActor):
+    def __init__(self, address):
+        super().__init__(address)
+
+    async def check(self, dest_address, x):
+        client = await RayClient.connect(dest_address, self.address)
+        await client.send(x)
+        return await client.recv() == x
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_driver_to_actor_channel(ray_start_regular):
+    dest_address = "ray://test_cluster/0/0"
+    server_actor = (
+        ray.remote(ServerActor).options(name=dest_address).remote(dest_address)
+    )
+    await server_actor.start.remote()
+    client = await RayClient.connect(dest_address, None)
+    assert client.channel_type == ChannelType.ray
+    for i in range(10):
+        await client.send(i)
+        assert await client.recv() == i
+    await server_actor.server.remote("stop")
+    with pytest.raises(ServerClosed):
+        await client.send(1)
+        await client.recv()
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_actor_to_actor_channel(ray_start_regular):
+    server1_address, server2_address = (
+        "ray://test_cluster/0/0",
+        "ray://test_cluster/0/1",
+    )
+    server_actor1 = (
+        ray.remote(ServerCallActor)
+        .options(name=server1_address)
+        .remote(server1_address)
+    )
+    server_actor2 = (
+        ray.remote(ServerCallActor)
+        .options(name=server2_address)
+        .remote(server2_address)
+    )
+    await server_actor1.start.remote()
+    await server_actor2.start.remote()
+    for client in [
+        await RayClient.connect(addr, None)
+        for addr in [server1_address, server2_address]
+    ]:
+        for i in range(10):
+            await client.send(i)
+            assert await client.recv() == i
+    for i in range(10):
+        assert await server_actor1.check.remote(server2_address, i)
+        assert await server_actor2.check.remote(server1_address, i)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_msg_to_simple_str(ray_start_regular):
+    assert msg_to_simple_str(1) == "1"
+    assert msg_to_simple_str(True) == "True"
+    assert msg_to_simple_str("a") == "a"
+    assert msg_to_simple_str([1, 2]) == "List<1, 2...2>"
+    assert msg_to_simple_str({1, 2}) == "Set<1, 2...2>"
+    assert msg_to_simple_str((1, 2.0, False)) == "Tuple<1, 2.0, False...3>"
+    assert msg_to_simple_str({"a": [1, 2]}) == "Dict...1>"
+    assert (
+        msg_to_simple_str(
+            SendMessage(
+                message_id=b"abc", actor_ref=ActorRef("addr", b"id"), content="abc"
+            )
+        )
+        == "SendMessage(actor_ref=ActorRef(uid=b'id', address='addr'), content=abc)"
+    )
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_context.py b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_context.py
new file mode 100644
index 000000000..d2d2d3860
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_context.py
@@ -0,0 +1,145 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import time
+
+import pytest
+
+from .....tests.core import require_ray
+from .....utils import lazy_import
+from ...mars.tests import test_mars_actor_context
+from ...router import Router
+from ..backend import RayActorBackend
+from ..communication import RayServer
+from ..pool import RayMainPool
+from ..utils import process_placement_to_address
+
+ray = lazy_import("ray")
+
+
+@pytest.fixture
+async def actor_pool_context():
+    pg_name, n_process = f"ray_cluster_{time.time_ns()}", 2
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    # Hold actor_handle to avoid actor being freed.
+    pg = ray.util.placement_group(
+        name=pg_name, bundles=[{"CPU": n_process}], strategy="SPREAD"
+    )
+    ray.get(pg.ready())
+    pg, _ = ray.util.get_placement_group(pg_name), 0
+    pool_handle = await RayActorBackend._create_ray_pools(address, n_process)
+    await pool_handle.start.remote()
+
+    class ProxyPool:
+        def __init__(self, ray_pool_actor_handle):
+            self.ray_pool_actor_handle = ray_pool_actor_handle
+
+        def __getattr__(self, item):
+            if hasattr(RayMainPool, item) and inspect.isfunction(
+                getattr(RayMainPool, item)
+            ):
+
+                def call(*args, **kwargs):
+                    ray.get(
+                        self.ray_pool_actor_handle.actor_pool.remote(
+                            item, *args, **kwargs
+                        )
+                    )
+
+                return call
+
+            return ray.get(self.ray_pool_actor_handle.actor_pool.remote(item))
+
+    yield ProxyPool(pool_handle)
+    for addr in [
+        process_placement_to_address(pg_name, 0, process_index=i)
+        for i in range(n_process)
+    ]:
+        try:
+            # kill the main pool first to avoid the main pool monitor task recreating the sub pool
+            ray.kill(ray.get_actor(addr))
+        except:  # noqa: E722  # nosec  # pylint: disable=bare-except
+            pass
+    ray.util.remove_placement_group(pg)
+    Router.set_instance(None)
+    RayServer.clear()
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_simple_local_actor_pool(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_simple_local_actor_pool(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_post_create_pre_destroy(
+    ray_start_regular_shared, actor_pool_context
+):
+    await test_mars_actor_context.test_mars_post_create_pre_destroy(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_create_actor(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_create_actor(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_create_actor_error(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_create_actor_error(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_send(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_send(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_send_error(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_send_error(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_tell(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_tell(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_batch_method(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_batch_method(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_destroy_has_actor(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_destroy_has_actor(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_resource_lock(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_resource_lock(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_promise_chain(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_promise_chain(actor_pool_context)
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_driver.py b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_driver.py
new file mode 100644
index 000000000..744d81572
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_driver.py
@@ -0,0 +1,174 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import os
+
+import pytest
+
+from ..... import oscar as mo
+from .....tests.core import require_ray
+from .....utils import lazy_import
+from ..communication import RayServer
+from ..driver import RayActorDriver
+from ..utils import (
+    addresses_to_placement_group_info,
+    get_placement_group,
+    node_address_to_placement,
+    placement_group_info_to_addresses,
+    process_address_to_placement,
+    process_placement_to_address,
+)
+
+ray = lazy_import("ray")
+
+TEST_PLACEMENT_GROUP_NAME = "test_placement_group"
+TEST_PLACEMENT_GROUP_BUNDLES = [{"CPU": 3}, {"CPU": 5}, {"CPU": 7}]
+TEST_ADDRESS_TO_RESOURCES = placement_group_info_to_addresses(
+    TEST_PLACEMENT_GROUP_NAME, TEST_PLACEMENT_GROUP_BUNDLES
+)
+
+
+class DummyActor(mo.Actor):
+    def __init__(self, index):
+        super().__init__()
+        self._index = index
+
+    def getppid(self):
+        return os.getppid()
+
+    def index(self):
+        return self._index
+
+
+@pytest.fixture
+async def mars_cluster():
+    mo.setup_cluster(address_to_resources=TEST_ADDRESS_TO_RESOURCES)
+    main_pool_handles = []  # Hold actor_handle to avoid actor being freed.
+    for index, bundle_spec in enumerate(TEST_PLACEMENT_GROUP_BUNDLES):
+        address = process_placement_to_address(TEST_PLACEMENT_GROUP_NAME, index, 0)
+        actor_handle = await mo.create_actor_pool(address, bundle_spec["CPU"])
+        main_pool_handles.append(actor_handle)
+
+    yield
+
+    RayActorDriver.stop_cluster()
+    RayServer.clear()
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_create_actor_in_placement_group(ray_large_cluster, mars_cluster):
+    actor_refs = []
+    for i, r in enumerate(TEST_PLACEMENT_GROUP_BUNDLES):
+        for _ in range(r["CPU"]):
+            address = process_placement_to_address(TEST_PLACEMENT_GROUP_NAME, i, 0)
+            actor_ref = await mo.create_actor(DummyActor, i, address=address)
+            actor_refs.append(actor_ref)
+    results = []
+    for actor_ref in actor_refs:
+        ppid = await actor_ref.getppid()
+        index = await actor_ref.index()
+        results.append((ppid, index))
+
+    counter = collections.Counter(results)
+    assert len(counter) == len(TEST_PLACEMENT_GROUP_BUNDLES)
+    assert sorted(counter.values()) == sorted(
+        r["CPU"] for r in TEST_PLACEMENT_GROUP_BUNDLES
+    )
+
+
+def test_address_to_pg_bundle():
+    # Missing bundle index.
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://bundle_name")
+    # Extra path is not allowed.
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://bundle_name/0/")
+    # The scheme is not ray
+    with pytest.raises(ValueError):
+        node_address_to_placement("http://bundle_name/0")
+    # The bundle index is not an int string.
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://abc/def")
+    pg_name, bundle_index = node_address_to_placement("ray://bundle_name/0")
+    assert pg_name == "bundle_name"
+    assert bundle_index == 0
+    pg_name, bundle_index = node_address_to_placement("ray://127.0.0.1/1")
+    assert pg_name == "127.0.0.1"
+    assert bundle_index == 1
+    pg_name, bundle_index = node_address_to_placement("ray://127.0.0.1%2F2")
+    assert pg_name == "127.0.0.1"
+    assert bundle_index == 2
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://")
+
+
+def test_addresses_to_placement_group_info():
+    # Missing bundle index 1
+    with pytest.raises(ValueError):
+        addresses_to_placement_group_info(
+            {"ray://127.0.0.1/0": {"CPU": 1}, "ray://127.0.0.1/2": {"CPU": 1}}
+        )
+    # The bundle index does not start from 0
+    with pytest.raises(ValueError):
+        addresses_to_placement_group_info({"ray://127.0.0.1/1": {"CPU": 1}})
+    pg_name, bundles = addresses_to_placement_group_info(
+        {"ray://127.0.0.1/0": {"CPU": 1}}
+    )
+    assert pg_name == "127.0.0.1"
+    assert bundles == [{"CPU": 1}]
+    pg_name, bundles = addresses_to_placement_group_info(
+        {
+            "ray://127.0.0.1/4": {"CPU": 4},
+            "ray://127.0.0.1/2": {"CPU": 2},
+            "ray://127.0.0.1/1": {"CPU": 1},
+            "ray://127.0.0.1/3": {"CPU": 3},
+            "ray://127.0.0.1/0": {"CPU": 0},
+        }
+    )
+    assert pg_name == "127.0.0.1"
+    assert bundles == [{"CPU": 0}, {"CPU": 1}, {"CPU": 2}, {"CPU": 3}, {"CPU": 4}]
+    pg_name, bundles = addresses_to_placement_group_info(TEST_ADDRESS_TO_RESOURCES)
+    assert pg_name == TEST_PLACEMENT_GROUP_NAME
+    assert bundles == TEST_PLACEMENT_GROUP_BUNDLES
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_get_placement_group(ray_large_cluster):
+    pg_name = "test_pg"
+    pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": 1}], strategy="SPREAD")
+    ray.get(pg.ready())
+    pg2 = get_placement_group(pg_name)
+    assert pg2.bundle_specs == pg.bundle_specs
+
+
+def test_address_to_placement():
+    assert process_address_to_placement("ray://test_cluster/0/0") == (
+        "test_cluster",
+        0,
+        0,
+    )
+    with pytest.raises(ValueError):
+        process_address_to_placement("ray://")
+    assert node_address_to_placement("ray://test_cluster/0") == ("test_cluster", 0)
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://")
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://test_cluster")
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://test_cluster/")
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://test_cluster//")
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_pool.py b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_pool.py
new file mode 100644
index 000000000..48b4e270c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_pool.py
@@ -0,0 +1,213 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+
+import pytest
+
+from ..... import oscar as mo
+from .....tests.core import mock, require_ray
+from .....utils import lazy_import
+from ....context import get_context
+from ....errors import ServerClosed
+from ...allocate_strategy import MainPool, ProcessIndex
+from ..backend import RayActorBackend
+from ..pool import RayMainActorPool, RayPoolState, create_actor_pool
+from ..utils import kill_and_wait, process_placement_to_address
+
+ray = lazy_import("ray")
+
+
+class TestActor(mo.Actor):
+    __test__ = False
+
+    async def kill(self, address, uid):
+        actor_ref = await mo.actor_ref(address, uid)
+        task = asyncio.create_task(actor_ref.crash())
+        return await task
+
+    async def crash(self):
+        os._exit(0)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_main_pool(ray_start_regular):
+    pg, pg_name, n_process = None, "ray_cluster", 3
+    if hasattr(ray.util, "get_placement_group"):
+        pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": n_process}])
+        ray.get(pg.ready())
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    addresses = RayMainActorPool.get_external_addresses(address, n_process)
+    assert addresses == [address] + [
+        process_placement_to_address(pg_name, 0, process_index=i + 1)
+        for i in range(n_process)
+    ]
+    assert RayMainActorPool.gen_internal_address(0, address) == address
+
+    pool_handle = await RayActorBackend._create_ray_pools(address, n_process)
+    main_actor_pool = await create_actor_pool(
+        address,
+        n_process=n_process,
+        pool_cls=RayMainActorPool,
+        sub_pool_handles=pool_handle.sub_pools,
+    )
+    async with main_actor_pool:
+        sub_processes = list(main_actor_pool.sub_processes.values())
+        assert len(sub_processes) == n_process
+        await main_actor_pool.kill_sub_pool(sub_processes[0], force=True)
+        assert not (await main_actor_pool.is_sub_pool_alive(sub_processes[0]))
+        await main_actor_pool.kill_sub_pool(sub_processes[1], force=False)
+        assert not (await main_actor_pool.is_sub_pool_alive(sub_processes[1]))
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_shutdown_sub_pool(ray_start_regular):
+    import ray
+
+    pg_name, n_process = "ray_cluster", 2
+    if hasattr(ray.util, "get_placement_group"):
+        pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": n_process}])
+        ray.get(pg.ready())
+    else:
+        pg = None
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    pool_handle = await RayActorBackend._create_ray_pools(address, n_process)
+    actor_handle = pool_handle.main_pool
+    await actor_handle.start.remote()
+    sub_pool_address1 = process_placement_to_address(pg_name, 0, process_index=1)
+    sub_pool_handle1 = ray.get_actor(sub_pool_address1)
+    sub_pool_address2 = process_placement_to_address(pg_name, 0, process_index=2)
+    sub_pool_handle2 = ray.get_actor(sub_pool_address2)
+    await actor_handle.actor_pool.remote(
+        "stop_sub_pool", sub_pool_address1, sub_pool_handle1, force=True
+    )
+    await actor_handle.actor_pool.remote(
+        "stop_sub_pool", sub_pool_address2, sub_pool_handle2, force=False
+    )
+    assert await sub_pool_handle1.state.remote() == RayPoolState.INIT
+    assert await sub_pool_handle2.state.remote() == RayPoolState.INIT
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_server_closed(ray_start_regular):
+    pg_name, n_process = "ray_cluster", 1
+    pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": n_process}])
+    ray.get(pg.ready())
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    # start the actor pool
+    actor_handle = await mo.create_actor_pool(address, n_process=n_process)
+    await actor_handle.mark_service_ready.remote()
+
+    ctx = get_context()
+    actor_main = await ctx.create_actor(
+        TestActor, address=address, uid="Test-main", allocate_strategy=ProcessIndex(0)
+    )
+
+    actor_sub = await ctx.create_actor(
+        TestActor, address=address, uid="Test-sub", allocate_strategy=ProcessIndex(1)
+    )
+
+    # test calling from ray driver to ray actor
+    task = asyncio.create_task(actor_sub.crash())
+
+    with pytest.raises(ServerClosed):
+        # the process has already died,
+        # so ServerClosed will be raised
+        await task
+
+    # wait for the sub pool to recover
+    await ctx.wait_actor_pool_recovered(actor_sub.address, address)
+
+    # test calling from ray actor to ray actor
+    task = asyncio.create_task(actor_main.kill(actor_sub.address, "Test-sub"))
+
+    with pytest.raises(ServerClosed):
+        await task
+
+
+@require_ray
+@pytest.mark.asyncio
+@pytest.mark.parametrize("auto_recover", [False, True, "actor", "process"])
+async def test_auto_recover(ray_start_regular, auto_recover):
+    pg_name, n_process = "ray_cluster", 1
+    pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": n_process}])
+    assert pg.wait(timeout_seconds=20)
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    actor_handle = await mo.create_actor_pool(
+        address, n_process=n_process, auto_recover=auto_recover
+    )
+    await actor_handle.mark_service_ready.remote()
+
+    ctx = get_context()
+
+    # waiting for the main pool to recover always returns immediately
+    await ctx.wait_actor_pool_recovered(address, address)
+
+    # create actor on main
+    actor_ref = await ctx.create_actor(
+        TestActor, address=address, allocate_strategy=MainPool()
+    )
+
+    with pytest.raises(ValueError):
+        # cannot kill actors on main pool
+        await mo.kill_actor(actor_ref)
+
+    # create actor
+    actor_ref = await ctx.create_actor(
+        TestActor, address=address, allocate_strategy=ProcessIndex(1)
+    )
+    # kill_actor will kill the corresponding process
+    await ctx.kill_actor(actor_ref)
+
+    if auto_recover:
+        await ctx.wait_actor_pool_recovered(actor_ref.address, address)
+        sub_pool_address = process_placement_to_address(pg_name, 0, process_index=1)
+        sub_pool_handle = ray.get_actor(sub_pool_address)
+        if auto_recover == "process":
+            assert await sub_pool_handle.state.remote() == RayPoolState.POOL_READY
+        else:
+            assert await sub_pool_handle.state.remote() == RayPoolState.SERVICE_READY
+
+        expect_has_actor = auto_recover in ["actor", True]
+        assert await ctx.has_actor(actor_ref) is expect_has_actor
+    else:
+        with pytest.raises((ServerClosed, ConnectionError)):
+            await ctx.has_actor(actor_ref)
+
+    if "COV_CORE_SOURCE" in os.environ:
+        for addr in [
+            process_placement_to_address(pg_name, 0, process_index=i) for i in range(2)
+        ]:
+            # must save the local reference until this is fixed:
+            # https://github.com/ray-project/ray/issues/7815
+            ray_actor = ray.get_actor(addr)
+            ray.get(ray_actor.cleanup.remote())
+
+
+@require_ray
+@pytest.mark.asyncio
+@mock.patch("ray.kill")
+async def test_kill_and_wait_timeout(fake_ray_kill, ray_start_regular):
+    pg_name, n_process = "ray_cluster", 1
+    pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": n_process}])
+    ray.get(pg.ready())
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    # start the actor pool
+    actor_handle = await mo.create_actor_pool(address, n_process=n_process)
+    with pytest.raises(Exception, match="not died"):
+        await kill_and_wait(actor_handle, timeout=1)
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/test_utils.py b/python/xorbits/_mars/oscar/backends/ray/tests/test_utils.py
new file mode 100644
index 000000000..fa2bfeda4
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/test_utils.py
@@ -0,0 +1,33 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .....tests.core import mock, require_ray
+from .....utils import lazy_import
+from ..utils import report_event
+
+ray = lazy_import("ray")
+
+
+@require_ray
+@mock.patch("ray.report_event")
+def test_report_event(fake_report_event, ray_start_regular):
+    arguments = []
+
+    def _report_event(*args):
+        arguments.extend(args)
+
+    fake_report_event.side_effect = _report_event
+    severity, label, message = "WARNING", "test_label", "test_message"
+    report_event(severity, label, message)
+    assert arguments == [ray.EventSeverity.WARNING, label, message]
diff --git a/python/xorbits/_mars/oscar/backends/ray/utils.py b/python/xorbits/_mars/oscar/backends/ray/utils.py
new file mode 100644
index 000000000..f50c1886c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/utils.py
@@ -0,0 +1,203 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import enum
+import logging
+import os
+import posixpath
+from urllib.parse import unquote, urlparse
+
+from ....utils import lazy_import, lazy_import_on_load
+
+ray = lazy_import("ray")
+
+logger = logging.getLogger(__name__)
+
+
+def get_placement_group(pg_name):  # pragma: no cover
+    return ray.util.get_placement_group(pg_name)
+
+
+def process_address_to_placement(address):
+    """
+    Parameters
+    ----------
+    address: str
+        The address of an actor pool running in a ray actor. It is also the
+        name of the ray actor, e.g. ray://${pg_name}/${bundle_index}/${process_index}
+
+    Returns
+    -------
+    tuple
+        A tuple consisting of placement group name, bundle index, process index.
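+
+    Examples
+    --------
+    A small illustration, mirroring the accompanying tests (the placement
+    group name and indices are arbitrary):
+
+    >>> process_address_to_placement("ray://test_cluster/0/1")
+    ('test_cluster', 0, 1)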
+    """
+    name, parts = _address_to_placement(address)
+    if not parts or len(parts) != 2:
+        raise ValueError(
+            f"Only bundle index and process index path are allowed in ray "
+            f"address {address} but got {parts}."
+        )
+    bundle_index, process_index = parts
+    return name, int(bundle_index), int(process_index)
+
+
+def node_address_to_placement(address):
+    """
+    Parameters
+    ----------
+    address : str
+        The address of a node. ex: ray://${pg_name}/${bundle_index}
+
+    Returns
+    -------
+    tuple
+        A tuple consisting of placement group name, bundle index.
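+
+    Examples
+    --------
+    For instance, mirroring the accompanying tests:
+
+    >>> node_address_to_placement("ray://bundle_name/0")
+    ('bundle_name', 0)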
+    """
+    name, parts = _address_to_placement(address)
+    if not parts or len(parts) != 1:
+        raise ValueError(
+            f"Only bundle index path is allowed in ray address {address} but got {parts}"
+        )
+    bundle_index = parts[0]
+    return name, int(bundle_index)
+
+
+def _address_to_placement(address):
+    """
+
+    Parameters
+    ----------
+    address : str
+        The address of a node or an actor pool which running in a ray actor.
+
+    Returns
+    -------
+    tuple
+        A tuple consisting of placement group name, bundle index, process index.
+    """
+    parsed_url = urlparse(unquote(address))
+    if parsed_url.scheme != "ray":
+        raise ValueError(f"The address scheme is not ray: {address}")
+    # os.path.split will not handle backslashes (\) correctly,
+    # so we use the posixpath.
+    parts = []
+    if parsed_url.netloc:
+        tmp = parsed_url.path
+        while tmp and tmp != "/":
+            tmp2, item = posixpath.split(tmp)
+            parts.append(item)
+            if tmp2 != tmp:
+                tmp = tmp2
+            else:
+                parts.append(tmp2)
+                break
+    parts = list(reversed(parts))
+    return parsed_url.netloc, parts
+
+
+def process_placement_to_address(
+    pg_name: str, bundle_index: int, process_index: int = 0
+):
+    return f"ray://{pg_name}/{bundle_index}/{process_index}"
+
+
+def node_placement_to_address(pg_name, bundle_index):
+    return f"ray://{pg_name}/{bundle_index}"
+
+
+def addresses_to_placement_group_info(address_to_resources):
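+    """
+    Convert a mapping from node addresses to bundle resources into placement
+    group info, i.e. a ``(pg_name, bundles)`` tuple.
+
+    A small sketch of the expected shape (resource values are illustrative):
+
+    >>> addresses_to_placement_group_info(
+    ...     {"ray://pg/0": {"CPU": 1}, "ray://pg/1": {"CPU": 2}}
+    ... )
+    ('pg', [{'CPU': 1}, {'CPU': 2}])
+    """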
+    bundles = {}
+    pg_name = None
+    for address, bundle_resources in address_to_resources.items():
+        name, bundle_index = node_address_to_placement(address)
+        if pg_name is None:
+            pg_name = name
+        else:
+            if name != pg_name:
+                raise ValueError(
+                    "All addresses should have consistent placement group names."
+                )
+        bundles[bundle_index] = bundle_resources
+    sorted_bundle_keys = sorted(bundles.keys())
+    if sorted_bundle_keys != list(range(len(address_to_resources))):
+        raise ValueError("The addresses contains invalid bundle.")
+    bundles = [bundles[k] for k in sorted_bundle_keys]
+    if not pg_name:
+        raise ValueError("Can't find a valid placement group name.")
+    return pg_name, bundles
+
+
+def placement_group_info_to_addresses(pg_name, bundles):
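+    """
+    Inverse of ``addresses_to_placement_group_info``: expand a placement group
+    name and its bundles into a mapping from node addresses to resources.
+
+    A small sketch (resource values are illustrative):
+
+    >>> placement_group_info_to_addresses("pg", [{"CPU": 1}, {"CPU": 2}])
+    {'ray://pg/0': {'CPU': 1}, 'ray://pg/1': {'CPU': 2}}
+    """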
+    addresses = {}
+    for bundle_index, bundle_resources in enumerate(bundles):
+        address = node_placement_to_address(pg_name, bundle_index)
+        addresses[address] = bundle_resources
+    return addresses
+
+
+async def kill_and_wait(
+    actor_handle: "ray.actor.ActorHandle", no_restart=False, timeout: float = 30
+):
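+    """
+    Kill a ray actor and wait until it has actually terminated.
+
+    An exception is raised when the actor is still alive ``timeout`` seconds
+    after ``ray.kill``, e.g. ``await kill_and_wait(actor_handle, timeout=30)``.
+    """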
+    if "COV_CORE_SOURCE" in os.environ:  # pragma: no cover
+        try:
+            # must clean up first, or coverage info lost
+            await actor_handle.cleanup.remote()
+        except:  # noqa: E722  # nosec  # pylint: disable=bare-except
+            pass
+    r = actor_handle.wait.remote(timeout)
+    ray.kill(actor_handle, no_restart=no_restart)
+    ready, _ = await asyncio.wait([r], timeout=timeout)
+    if ready:
+        try:
+            await r
+        except ray.exceptions.RayActorError:
+            return  # We expect a RayActorError; it indicates that the actor has died.
+    raise Exception(
+        f"The actor {actor_handle} is not died after ray.kill {timeout} seconds."
+    )
+
+
+@lazy_import_on_load(ray)
+def _patch_event_security():
+    global ray
+
+    if ray and not hasattr(ray, "report_event"):  # pragma: no cover
+        # lower version of ray doesn't support event
+
+        class EventSeverity(enum.Enum):
+            INFO = 0
+            WARNING = 1
+            ERROR = 2
+            FATAL = 3
+
+        def _report_event(severity, label, message):
+            logger.warning(
+                "severity: %s, label: %s, message: %s.", severity, label, message
+            )
+
+        import ray
+
+        ray.EventSeverity = EventSeverity
+        ray.report_event = _report_event
+
+
+def report_event(severity, label, message):
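+    """
+    Report an event to ray when ray is initialized; a severity given as a string
+    such as ``"WARNING"`` is resolved to the matching ``ray.EventSeverity`` member.
+    """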
+    if ray and ray.is_initialized():
+        severity = (
+            getattr(ray.EventSeverity, severity)
+            if isinstance(severity, str)
+            else severity
+        )
+        ray.report_event(severity, label, message)
diff --git a/python/xorbits/_mars/oscar/backends/router.py b/python/xorbits/_mars/oscar/backends/router.py
new file mode 100644
index 000000000..8b5ca749d
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/router.py
@@ -0,0 +1,134 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import threading
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+from .communication import Client, get_client_type
+
+
+class Router:
+    """
+    Router provides mapping from external address to internal address.
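+
+    A minimal sketch of the mapping behaviour (all addresses are illustrative):
+
+    >>> router = Router(
+    ...     ["127.0.0.1:1111"], "dummy://0", mapping={"127.0.0.1:2222": "dummy://1"}
+    ... )
+    >>> router.get_internal_address("127.0.0.1:1111")
+    'dummy://0'
+    >>> router.get_internal_address("127.0.0.1:2222")
+    'dummy://1'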
+    """
+
+    __slots__ = (
+        "_curr_external_addresses",
+        "_local_mapping",
+        "_mapping",
+        "_comm_config",
+        "_cache_local",
+    )
+
+    _instance: "Router" = None
+
+    @staticmethod
+    def set_instance(router: Optional["Router"]):
+        # Default router is set when an actor pool started
+        Router._instance = router
+
+    @staticmethod
+    def get_instance() -> "Router":
+        return Router._instance
+
+    @staticmethod
+    def get_instance_or_empty() -> "Router":
+        return Router._instance or Router(list(), None)
+
+    def __init__(
+        self,
+        external_addresses: List[str],
+        local_address: Optional[str],
+        mapping: Dict[str, str] = None,
+        comm_config: dict = None,
+    ):
+        self._curr_external_addresses = external_addresses
+        self._local_mapping = dict()
+        for addr in self._curr_external_addresses:
+            self._local_mapping[addr] = local_address
+        if mapping is None:
+            mapping = dict()
+        self._mapping = mapping
+        self._comm_config = comm_config or dict()
+        self._cache_local = threading.local()
+
+    @property
+    def _cache(self) -> Dict[Tuple[str, Any], Client]:
+        try:
+            return self._cache_local.cache
+        except AttributeError:
+            cache = self._cache_local.cache = dict()
+            return cache
+
+    def set_mapping(self, mapping: Dict[str, str]):
+        self._mapping = mapping
+        self._cache_local = threading.local()
+
+    def add_router(self, router: "Router"):
+        self._curr_external_addresses.extend(router._curr_external_addresses)
+        self._local_mapping.update(router._local_mapping)
+        self._mapping.update(router._mapping)
+        self._comm_config.update(router._comm_config)
+        self._cache_local = threading.local()
+
+    def remove_router(self, router: "Router"):
+        for external_address in router._curr_external_addresses:
+            try:
+                self._curr_external_addresses.remove(external_address)
+            except ValueError:
+                pass
+        for addr in router._local_mapping:
+            self._local_mapping.pop(addr, None)
+        for addr in router._mapping:
+            self._mapping.pop(addr, None)
+        self._cache_local = threading.local()
+
+    @property
+    def external_address(self):
+        if self._curr_external_addresses:
+            return self._curr_external_addresses[0]
+
+    def get_internal_address(self, external_address: str) -> str:
+        if external_address in self._curr_external_addresses:
+            # local address, use dummy address
+            return self._local_mapping.get(external_address)
+        # try to lookup inner address from address mapping
+        return self._mapping.get(external_address)
+
+    async def get_client(
+        self, external_address: str, from_who: Any = None, cached: bool = True, **kw
+    ) -> Client:
+        if cached and (external_address, from_who) in self._cache:
+            cached_client = self._cache[external_address, from_who]
+            if cached_client.closed:
+                # closed before, ignore it
+                del self._cache[external_address, from_who]
+            else:
+                return cached_client
+
+        address = self.get_internal_address(external_address)
+        if address is None:
+            # no inner address, just use external address
+            address = external_address
+        client_type: Type[Client] = get_client_type(address)
+        local_address = (
+            self._curr_external_addresses[0] if self._curr_external_addresses else None
+        )
+        config = client_type.parse_config(self._comm_config)
+        if config:
+            kw["config"] = config
+        client = await client_type.connect(address, local_address=local_address, **kw)
+        if cached:
+            self._cache[external_address, from_who] = client
+        return client
diff --git a/python/xorbits/_mars/oscar/backends/test/__init__.py b/python/xorbits/_mars/oscar/backends/test/__init__.py
new file mode 100644
index 000000000..f27d44076
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .backend import TestActorBackend
diff --git a/python/xorbits/_mars/oscar/backends/test/backend.py b/python/xorbits/_mars/oscar/backends/test/backend.py
new file mode 100644
index 000000000..6c13fb02a
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/backend.py
@@ -0,0 +1,33 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...backend import register_backend
+from ..mars.backend import MarsActorBackend, build_pool_kwargs
+from .pool import TestMainActorPool
+
+
+@register_backend
+class TestActorBackend(MarsActorBackend):
+    @staticmethod
+    def name():
+        return "test"
+
+    @classmethod
+    async def create_actor_pool(cls, address: str, n_process: int = None, **kwargs):
+        from ..pool import create_actor_pool
+
+        n_process, kwargs = build_pool_kwargs(n_process, kwargs)
+        return await create_actor_pool(
+            address, pool_cls=TestMainActorPool, n_process=n_process, **kwargs
+        )
diff --git a/python/xorbits/_mars/oscar/backends/test/pool.py b/python/xorbits/_mars/oscar/backends/test/pool.py
new file mode 100644
index 000000000..e73b3fcf4
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/pool.py
@@ -0,0 +1,135 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import multiprocessing
+from typing import Dict, List
+
+from ..communication import DummyServer, gen_local_address
+from ..config import ActorPoolConfig
+from ..mars.pool import MainActorPool, SubActorPool, SubpoolStatus
+from ..pool import ActorPoolType
+
+
+class TestMainActorPool(MainActorPool):
+    @classmethod
+    def get_external_addresses(
+        cls,
+        address: str,
+        n_process: int = None,
+        ports: List[int] = None,
+        schemes: List[str] = None,
+    ):
+        if "://" in address:
+            address = address.split("://", 1)[1]
+        return super().get_external_addresses(address, n_process=n_process, ports=ports)
+
+    @classmethod
+    def gen_internal_address(
+        cls, process_index: int, external_address: str = None
+    ) -> str:
+        return f"dummy://{process_index}"
+
+    @classmethod
+    async def start_sub_pool(
+        cls,
+        actor_pool_config: ActorPoolConfig,
+        process_index: int,
+        start_method: str = None,
+    ):
+        status_queue = multiprocessing.Queue()
+        return (
+            asyncio.create_task(
+                cls._create_sub_pool(actor_pool_config, process_index, status_queue)
+            ),
+            status_queue,
+        )
+
+    @classmethod
+    async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
+        addresses = []
+        tasks = []
+        for t in create_pool_tasks:
+            pool_task, queue = await t
+            tasks.append(pool_task)
+            status = await asyncio.to_thread(queue.get)
+            addresses.append(status.external_addresses)
+        return tasks, addresses
+
+    @classmethod
+    async def _create_sub_pool(
+        cls,
+        actor_config: ActorPoolConfig,
+        process_index: int,
+        status_queue: multiprocessing.Queue,
+    ):
+        pool = await TestSubActorPool.create(
+            {"actor_pool_config": actor_config, "process_index": process_index}
+        )
+        await pool.start()
+        status_queue.put(
+            SubpoolStatus(status=0, external_addresses=[pool.external_address])
+        )
+        actor_config.reset_pool_external_address(process_index, [pool.external_address])
+        await pool.join()
+
+    def _sync_pool_config(self, actor_pool_config: ActorPoolConfig):
+        # test pool does not create routers, thus can skip this step
+        pass
+
+    async def kill_sub_pool(
+        self, process: multiprocessing.Process, force: bool = False
+    ):
+        process.cancel()
+
+    async def is_sub_pool_alive(self, process: multiprocessing.Process):
+        return not process.cancelled()
+
+
+class TestSubActorPool(SubActorPool):
+    def _sync_pool_config(self, actor_pool_config: ActorPoolConfig):
+        # test pool does not create routers, thus can skip this step
+        pass
+
+    @classmethod
+    async def create(cls, config: Dict) -> ActorPoolType:
+        kw = dict()
+        cls._parse_config(config, kw)
+        process_index: int = kw["process_index"]
+        actor_pool_config = kw["config"]  # type: ActorPoolConfig
+        external_addresses = actor_pool_config.get_pool_config(process_index)[
+            "external_address"
+        ]
+
+        def handle_channel(channel):
+            return pool.on_new_channel(channel)
+
+        # create servers
+        server_addresses = external_addresses + [gen_local_address(process_index)]
+        server_addresses = sorted(set(server_addresses))
+        servers = await cls._create_servers(
+            server_addresses, handle_channel, actor_pool_config.get_comm_config()
+        )
+        cls._update_stored_addresses(servers, server_addresses, actor_pool_config, kw)
+
+        # create pool
+        pool = cls(**kw)
+        return pool
+
+    async def stop(self):
+        # do not close dummy server
+        self._servers = [
+            s for s in self._servers[:-1] if not isinstance(s, DummyServer)
+        ]
+        await super().stop()
diff --git a/python/xorbits/_mars/oscar/backends/test/tests/__init__.py b/python/xorbits/_mars/oscar/backends/test/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/backends/test/tests/test_actor_context.py b/python/xorbits/_mars/oscar/backends/test/tests/test_actor_context.py
new file mode 100644
index 000000000..f42e49b51
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/tests/test_actor_context.py
@@ -0,0 +1,61 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+
+import pytest
+
+from ..... import oscar as mo
+
+
+class DummyActor(mo.Actor):
+    def __init__(self, value):
+        super().__init__()
+
+        if value < 0:
+            raise ValueError("value < 0")
+        self.value = value
+
+    async def add(self, value):
+        if not isinstance(value, int):
+            raise TypeError("add number must be int")
+        self.value += value
+        return self.value
+
+
+@pytest.fixture
+async def actor_pool_context():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await mo.create_actor_pool(
+        "test://127.0.0.1", n_process=2, subprocess_start_method=start_method
+    )
+    async with pool:
+        yield pool
+
+
+@pytest.mark.asyncio
+async def test_simple(actor_pool_context):
+    pool = actor_pool_context
+    actor_ref = await mo.create_actor(
+        DummyActor,
+        100,
+        address=pool.external_address,
+        allocate_strategy=mo.allocate_strategy.RandomSubPool(),
+    )
+    assert await actor_ref.add(1) == 101
diff --git a/python/xorbits/_mars/oscar/backends/test/tests/test_message.py b/python/xorbits/_mars/oscar/backends/test/tests/test_message.py
new file mode 100644
index 000000000..3ec18751c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/tests/test_message.py
@@ -0,0 +1,74 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cloudpickle as pickle
+
+from ...message import ErrorMessage
+
+
+def test_as_instanceof_cause():
+    fake_address = "Fake address"
+    fake_pid = 123
+    value = 3
+
+    class CustomException(Exception):
+        def __init__(self, i):
+            self.i = i
+
+        def __str__(self):
+            return "Custom Exception."
+
+    try:
+        raise CustomException(value)
+    except Exception as e:
+        em = ErrorMessage(
+            b"Fake message id", fake_address, fake_pid, type(e), e, e.__traceback__
+        )
+        assert "Fake message id" in repr(em)
+        try:
+            cause = em.as_instanceof_cause()
+            # Test serialization.
+            cause1 = pickle.loads(pickle.dumps(cause))
+            assert type(cause) is type(cause1)
+            raise cause
+        except Exception as e1:
+            e1 = pickle.loads(pickle.dumps(e1))
+            # Check cause exception.
+            assert isinstance(e1, CustomException)
+            assert e1.i == value
+            assert e1.address == fake_address
+            assert e1.pid == fake_pid
+            assert fake_address in str(e1)
+            assert "Custom Exception" in str(e1)
+            assert str(fake_pid) in str(e1)
+            em1 = ErrorMessage(
+                b"Fake message id",
+                fake_address,
+                fake_pid,
+                type(e1),
+                e1,
+                e1.__traceback__,
+            )
+            try:
+                raise em1.as_instanceof_cause()
+            except Exception as e2:
+                e2 = pickle.loads(pickle.dumps(e2))
+                # Check recursive cause exception.
+                assert isinstance(e2, CustomException)
+                assert e2.i == value
+                assert e2.address == fake_address
+                assert e2.pid == fake_pid
+                assert str(e2).count("Custom Exception") == 1
+                assert str(e2).count(fake_address) == 1
+                assert str(e2).count(str(fake_pid)) == 1
diff --git a/python/xorbits/_mars/oscar/batch.py b/python/xorbits/_mars/oscar/batch.py
new file mode 100644
index 000000000..37026e2a4
--- /dev/null
+++ b/python/xorbits/_mars/oscar/batch.py
@@ -0,0 +1,244 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import inspect
+import textwrap
+from collections import namedtuple
+from dataclasses import dataclass
+from typing import Callable, Dict, Optional, Tuple
+
+
+def build_args_binder(func, remove_self: bool = True) -> Optional[Callable]:
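+    """
+    Build a callable that captures the call arguments of ``func`` into a
+    namedtuple, optionally dropping the leading ``self`` argument from the
+    captured fields.
+
+    A rough sketch of the behaviour (``add`` is an illustrative function):
+
+    >>> def add(self, a, b=1):
+    ...     return a + b
+    >>> binder = build_args_binder(add)
+    >>> binder(None, 2, 3)  # called like the method; ``self`` is not captured
+    _Args_add(a=2, b=3)
+    """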
+    try:
+        spec = inspect.getfullargspec(func)
+    except TypeError:  # pragma: no cover
+        return None
+
+    sig_list = list(spec.args)
+    args_list = list(spec.args)
+    if remove_self:
+        args_list = args_list[1:]
+
+    if spec.varargs:
+        sig_list.append(f"*{spec.varargs}")
+        args_list.append(spec.varargs)
+    elif spec.kwonlyargs:
+        sig_list.append("*")
+
+    sig_list.extend(spec.kwonlyargs)
+    args_list.extend(spec.kwonlyargs)
+
+    if spec.varkw:
+        sig_list.append(f"**{spec.varkw}")
+        args_list.append(spec.varkw)
+
+    if getattr(func, "__name__", "").isidentifier():
+        ret_func_name = f"{func.__name__}_binder"
+        ret_type_name = f"_Args_{func.__name__}"
+    else:
+        ret_func_name = f"anon_{id(func)}_binder"
+        ret_type_name = f"_ArgsAnon_{id(func)}"
+
+    func_str = textwrap.dedent(
+        f"""
+    def {ret_func_name}({', '.join(sig_list)}):
+        return {ret_type_name}({', '.join(args_list)})
+    """
+    )
+
+    glob_vars = globals().copy()
+    glob_vars[ret_type_name] = namedtuple(ret_type_name, args_list)
+    loc_vars = dict()
+    exec(func_str, glob_vars, loc_vars)
+    ext_func = loc_vars[ret_func_name]
+    ext_func.__defaults__ = spec.defaults
+    ext_func.__kwdefaults__ = spec.kwonlydefaults
+
+    return ext_func
+
+
+@dataclass
+class _DelayedArgument:
+    args: Tuple
+    kwargs: Dict
+
+
+class _ExtensibleCallable:
+    func: Callable
+    batch_func: Optional[Callable]
+    is_async: bool
+    has_single_func: bool
+
+    def __call__(self, *args, **kwargs):
+        if self.is_async:
+            return self._async_call(*args, **kwargs)
+        else:
+            return self._sync_call(*args, **kwargs)
+
+    async def _async_call(self, *args, **kwargs):
+        try:
+            if self.has_single_func:
+                return await self.func(*args, **kwargs)
+        except NotImplementedError:
+            self.has_single_func = False
+
+        if self.batch_func is not None:
+            ret = await self.batch_func([args], [kwargs])
+            return None if ret is None else ret[0]
+        raise NotImplementedError
+
+    def _sync_call(self, *args, **kwargs):
+        try:
+            if self.has_single_func:
+                return self.func(*args, **kwargs)
+        except NotImplementedError:
+            self.has_single_func = False
+
+        if self.batch_func is not None:
+            return self.batch_func([args], [kwargs])[0]
+        raise NotImplementedError
+
+
+class _ExtensibleWrapper(_ExtensibleCallable):
+    def __init__(
+        self,
+        func: Callable,
+        batch_func: Optional[Callable] = None,
+        bind_func: Optional[Callable] = None,
+        is_async: bool = False,
+    ):
+        self.func = func
+        self.batch_func = batch_func
+        self.bind_func = bind_func
+        self.is_async = is_async
+        self.has_single_func = True
+
+    @staticmethod
+    def delay(*args, **kwargs):
+        return _DelayedArgument(args=args, kwargs=kwargs)
+
+    @staticmethod
+    def _gen_args_kwargs_list(delays):
+        args_list = [delay.args for delay in delays]
+        kwargs_list = [delay.kwargs for delay in delays]
+        return args_list, kwargs_list
+
+    async def _async_batch(self, args_list, kwargs_list):
+        # when there is only one call in the batch, calling the
+        # one-pass method is more efficient
+        if len(args_list) == 0:
+            return []
+        elif len(args_list) == 1:
+            return [await self._async_call(*args_list[0], **kwargs_list[0])]
+        elif self.batch_func:
+            return await self.batch_func(args_list, kwargs_list)
+        else:
+            # this function has no batch implementation
+            # call it separately
+            tasks = [
+                asyncio.create_task(self.func(*args, **kwargs))
+                for args, kwargs in zip(args_list, kwargs_list)
+            ]
+            try:
+                return await asyncio.gather(*tasks)
+            except asyncio.CancelledError:
+                _ = [task.cancel() for task in tasks]
+                return await asyncio.gather(*tasks)
+
+    def _sync_batch(self, args_list, kwargs_list):
+        if len(args_list) == 0:
+            return []
+        elif self.batch_func:
+            return self.batch_func(args_list, kwargs_list)
+        else:
+            # this function has no batch implementation
+            # call it separately
+            return [
+                self.func(*args, **kwargs)
+                for args, kwargs in zip(args_list, kwargs_list)
+            ]
+
+    def batch(self, *delays):
+        args_list, kwargs_list = self._gen_args_kwargs_list(delays)
+        return self.call_with_lists(args_list, kwargs_list)
+
+    def call_with_lists(self, args_list, kwargs_list):
+        if self.is_async:
+            return self._async_batch(args_list, kwargs_list)
+        else:
+            return self._sync_batch(args_list, kwargs_list)
+
+    def bind(self, *args, **kwargs):
+        if self.bind_func is None:
+            raise TypeError(f"bind function not exist for method {self.func.__name__}")
+        return self.bind_func(*args, **kwargs)
+
+
+class _ExtensibleAccessor(_ExtensibleCallable):
+    func: Callable
+    batch_func: Optional[Callable]
+
+    def __init__(self, func: Callable):
+        self.func = func
+        self.batch_func = None
+        self.bind_func = build_args_binder(func, remove_self=True)
+        self.is_async = asyncio.iscoroutinefunction(self.func)
+        self.has_single_func = True
+
+    def batch(self, func: Callable):
+        self.batch_func = func
+        return self
+
+    def __get__(self, instance, owner):
+        if instance is None:
+            # calling from class
+            return self.func
+
+        func = self.func.__get__(instance, owner)
+        batch_func = (
+            self.batch_func.__get__(instance, owner)
+            if self.batch_func is not None
+            else None
+        )
+        bind_func = (
+            self.bind_func.__get__(instance, owner)
+            if self.bind_func is not None
+            else None
+        )
+
+        return _ExtensibleWrapper(
+            func, batch_func=batch_func, bind_func=bind_func, is_async=self.is_async
+        )
+
+
+def extensible(func: Callable):
+    """
+    `extensible` means that the functionality of this function can be extended,
+    especially with batch operations.
+
+    Consider remote function calls: each call may involve operations such as
+    opening and closing a file. Batching the calls helps reduce that cost,
+    especially for remote invocations.
+
+    Parameters
+    ----------
+    func : callable
+        Function
+
+    Returns
+    -------
+    func
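+
+    Examples
+    --------
+    A hedged sketch of the intended usage; the actor class and method names
+    are illustrative only:
+
+    .. code-block:: python
+
+        class ReaderActor(mo.Actor):
+            @extensible
+            async def read(self, path):
+                ...  # one-pass implementation
+
+            @read.batch
+            async def batch_read(self, args_list, kwargs_list):
+                ...  # open once, answer all delayed calls in one pass
+
+    Callers may then combine delayed invocations into a single batched call,
+    e.g. ``await ref.read.batch(ref.read.delay("a"), ref.read.delay("b"))``.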
+    """
+    return _ExtensibleAccessor(func)
diff --git a/python/xorbits/_mars/oscar/context.pxd b/python/xorbits/_mars/oscar/context.pxd
new file mode 100644
index 000000000..d348efcc2
--- /dev/null
+++ b/python/xorbits/_mars/oscar/context.pxd
@@ -0,0 +1,20 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+cdef class BaseActorContext:
+    cdef public str _address
+
+
+cpdef get_context()
diff --git a/python/xorbits/_mars/oscar/context.pyx b/python/xorbits/_mars/oscar/context.pyx
new file mode 100644
index 000000000..9fff1228c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/context.pyx
@@ -0,0 +1,275 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from urllib.parse import urlparse
+
+from .core cimport ActorRef
+from .utils cimport new_actor_id
+
+from .utils import create_actor_ref
+
+
+cdef dict _backend_context_cls = dict()
+
+cdef object _context = None
+
+
+cdef class BaseActorContext:
+    """
+    Base class for actor contexts. Every backend needs to implement
+    its own actor context.
+    """
+
+    # allocate strategy is for Mars backend only
+    support_allocate_strategy = False
+
+    def __init__(self, address: str = None):
+        self._address = address
+
+    async def create_actor(
+        self,
+        object actor_cls,
+        *args,
+        object uid=None,
+        object address=None,
+        **kwargs,
+    ):
+        """
+        Stub method for creating an actor in the current context.
+
+        Parameters
+        ----------
+        actor_cls : Actor
+            Actor class
+        args : tuple
+            args to be passed into actor_cls.__init__
+        uid : identifier
+            Actor identifier
+        address : str
+            Address to locate the actor
+        kwargs : dict
+            kwargs to be passed into actor_cls.__init__
+
+        Returns
+        -------
+        ActorRef
+
+        """
+        raise NotImplementedError
+
+    async def has_actor(self, ActorRef actor_ref):
+        """
+        Check if actor exists in current context
+
+        Parameters
+        ----------
+        actor_ref : ActorRef
+            Reference to an actor
+
+        Returns
+        -------
+        bool
+        """
+        raise NotImplementedError
+
+    async def destroy_actor(self, ActorRef actor_ref):
+        """
+        Destroy an actor by its reference
+
+        Parameters
+        ----------
+        actor_ref : ActorRef
+            Reference to an actor
+
+        Returns
+        -------
+        bool
+        """
+        raise NotImplementedError
+
+    async def kill_actor(self, ActorRef actor_ref):
+        """
+        Forcibly kill an actor. Take care: this is a dangerous operation,
+        as it may cause other actors to be killed as well. Hence, unless
+        you know what you are doing and know how to recover the possibly
+        affected actors, DO NOT USE this method!
+
+        Parameters
+        ----------
+        actor_ref : ActorRef
+            Reference to an actor
+
+        Returns
+        -------
+        bool
+        """
+
+    async def send(
+        self,
+        ActorRef actor_ref,
+        object message,
+        bint wait_response=True,
+        object profiling_context=None,
+    ):
+        """
+        Send a message to given actor by its reference
+
+        Parameters
+        ----------
+        actor_ref : ActorRef
+            Reference to an actor
+        message : object
+            Message to send to an actor, need to comply to Actor.__on_receive__
+        wait_response : bool
+            Whether to wait for responses from the actor.
+        profiling_context: ProfilingContext
+            The profiling context.
+
+        Returns
+        -------
+        object
+        """
+        raise NotImplementedError
+
+    async def actor_ref(self, *args, **kwargs):
+        """
+        Create a reference to an actor
+
+        Returns
+        -------
+        ActorRef
+        """
+        raise NotImplementedError
+
+    async def wait_actor_pool_recovered(self, str address, str main_address = None):
+        """
+        Wait until an actor pool is recovered
+
+        Parameters
+        ----------
+        address
+            address of the actor pool
+        main_address
+            address of the main pool
+        """
+        raise NotImplementedError
+
+    async def get_pool_config(self, str address):
+        """
+        Get config of actor pool with given address
+
+        Parameters
+        ----------
+        address
+            address of the actor pool
+
+        Returns
+        -------
+        The config of the actor pool at the given address.
+        """
+        raise NotImplementedError
+
+
+cdef class ClientActorContext(BaseActorContext):
+    """
+    Default actor context. This context will keep references to other contexts
+    given their protocol scheme (e.g., `ray://xxx`).
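+
+    A minimal sketch of driver-side usage; ``MyActor`` and the address are
+    illustrative:
+
+    .. code-block:: python
+
+        ctx = get_context()
+        ref = await ctx.create_actor(MyActor, address="ray://pg/0/0", uid="my-actor")
+        assert await ctx.has_actor(ref)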
+    """
+    cdef dict _backend_contexts
+
+    def __init__(self, address: str = None):
+        BaseActorContext.__init__(self, address)
+        self._backend_contexts = dict()
+
+    cdef inline object _get_backend_context(self, object address):
+        if address is None:
+            raise ValueError('address has to be provided')
+        if '://' not in address:
+            scheme = None
+        else:
+            scheme = urlparse(address).scheme or None
+        try:
+            return self._backend_contexts[scheme]
+        except KeyError:
+            context = self._backend_contexts[scheme] = \
+                _backend_context_cls[scheme](address)
+            return context
+
+    def create_actor(
+        self,
+        object actor_cls,
+        *args,
+        object uid=None,
+        object address=None,
+        **kwargs,
+    ):
+        context = self._get_backend_context(address)
+        uid = uid or new_actor_id()
+        return context.create_actor(actor_cls, *args, uid=uid, address=address, **kwargs)
+
+    def has_actor(self, ActorRef actor_ref):
+        context = self._get_backend_context(actor_ref.address)
+        return context.has_actor(actor_ref)
+
+    def destroy_actor(self, ActorRef actor_ref):
+        context = self._get_backend_context(actor_ref.address)
+        return context.destroy_actor(actor_ref)
+
+    def kill_actor(self, ActorRef actor_ref):
+        context = self._get_backend_context(actor_ref.address)
+        return context.kill_actor(actor_ref)
+
+    def actor_ref(self, *args, **kwargs):
+        actor_ref = create_actor_ref(*args, **kwargs)
+        context = self._get_backend_context(actor_ref.address)
+        return context.actor_ref(actor_ref)
+
+    def send(
+        self,
+        ActorRef actor_ref,
+        object message,
+        bint wait_response=True,
+        object profiling_context=None
+    ):
+        context = self._get_backend_context(actor_ref.address)
+        return context.send(
+            actor_ref,
+            message,
+            wait_response=wait_response,
+            profiling_context=profiling_context,
+        )
+
+    def wait_actor_pool_recovered(self, str address, str main_address = None):
+        context = self._get_backend_context(address)
+        return context.wait_actor_pool_recovered(address, main_address)
+
+    def get_pool_config(self, str address):
+        context = self._get_backend_context(address)
+        return context.get_pool_config(address)
+
+
+def register_backend_context(scheme, cls):
+    assert issubclass(cls, BaseActorContext)
+    _backend_context_cls[scheme] = cls
+
+
+cpdef get_context():
+    """
+    Get an actor context. If not in an actor environment,
+    ClientActorContext will be used
+    """
+    global _context
+    if _context is None:
+        _context = ClientActorContext()
+    return _context
diff --git a/python/xorbits/_mars/oscar/core.pxd b/python/xorbits/_mars/oscar/core.pxd
new file mode 100644
index 000000000..02f6dab9c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/core.pxd
@@ -0,0 +1,39 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+cdef class ActorRef:
+    cdef object __weakref__
+    cdef public str address
+    cdef public object uid
+    cdef dict _methods
+
+
+cdef class LocalActorRef(ActorRef):
+    cdef object _actor_weakref
+    cdef _weakref_local_actor(self)
+
+
+cdef class _BaseActor:
+    cdef object __weakref__
+    cdef str _address
+    cdef object _lock
+    cdef object _uid
+
+    cpdef ActorRef ref(self)
+
+
+cdef class ActorEnvironment:
+    cdef public dict actor_locks
+    cdef public object address
diff --git a/python/xorbits/_mars/oscar/core.pyx b/python/xorbits/_mars/oscar/core.pyx
new file mode 100644
index 000000000..d07bf181c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/core.pyx
@@ -0,0 +1,549 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import inspect
+import logging
+import sys
+import weakref
+from typing import AsyncGenerator
+
+cimport cython
+
+from .context cimport get_context
+
+from .errors import ActorNotExist, Return
+
+from .utils cimport is_async_generator
+
+CALL_METHOD_DEFAULT = 0
+CALL_METHOD_BATCH = 1
+
+logger = logging.getLogger(__name__)
+
+cdef:
+    bint _log_unhandled_errors = False
+    bint _log_cycle_send = False
+    dict _local_pool_map = dict()
+    object _actor_method_wrapper
+
+
+def set_debug_options(options):
+    global _log_unhandled_errors, _log_cycle_send
+    if options is None:
+        _log_unhandled_errors = _log_cycle_send = False
+    else:
+        _log_unhandled_errors = options.log_unhandled_errors
+        _log_cycle_send = options.log_cycle_send
+
+
+cdef _get_local_actor(address, uid):
+    # Do not expose this method to Python to avoid actor being
+    # referenced everywhere.
+    #
+    # The cycle send detection relies on message sending, so we
+    # disable the local actor proxy when the debug option is on.
+    if _log_cycle_send:
+        return None
+    pool_ref = _local_pool_map.get(address)
+    pool = None if pool_ref is None else pool_ref()
+    if pool is not None:
+        actor = pool._actors.get(uid)
+        if actor is not None:
+            return actor
+    return None
+
+
+def register_local_pool(address, pool):
+    """
+    Register local actor pool for local actor lookup.
+    """
+    _local_pool_map[address] = weakref.ref(
+        pool, lambda _: _local_pool_map.pop(address, None)
+    )
+
+
+cpdef create_local_actor_ref(address, uid):
+    """
+    Create a reference to a local actor.
+
+    Returns
+    -------
+    LocalActorRef or None
+    """
+    actor = _get_local_actor(address, uid)
+    if actor is not None:
+        return LocalActorRef(actor)
+    return None
+
+
+cpdef create_actor_ref(address, uid):
+    """
+    Create an actor reference.
+    TODO(fyrestone): Remove the create_actor_ref in utils.pyx
+
+    Returns
+    -------
+    ActorRef or LocalActorRef
+    """
+    actor = _get_local_actor(address, uid)
+    return ActorRef(address, uid) if actor is None else LocalActorRef(actor)
+
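+# A minimal illustrative sketch (not part of the upstream module) of how the
+# helpers above fit together; the address, uid and `pool` below are hypothetical.
+#
+#     # An actor pool registers itself for its own address ...
+#     register_local_pool("127.0.0.1:12345", pool)
+#     # ... so that refs to actors it hosts become LocalActorRef instances that
+#     # skip message serialization; for any other address a plain ActorRef is
+#     # returned and calls go through the context's send().
+#     ref = create_actor_ref("127.0.0.1:12345", b"my_actor")
+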
+
+cdef class ActorRef:
+    """
+    Reference to an Actor on the user side
+    """
+    def __init__(self, str address, object uid):
+        if isinstance(uid, str):
+            uid = uid.encode()
+        self.uid = uid
+        self.address = address
+        self._methods = dict()
+
+    def destroy(self, object callback=None):
+        ctx = get_context()
+        return ctx.destroy_actor(self)
+
+    def __reduce__(self):
+        return create_actor_ref, (self.address, self.uid)
+
+    def __getattr__(self, item):
+        if item.startswith('_'):
+            return object.__getattribute__(self, item)
+
+        try:
+            return self._methods[item]
+        except KeyError:
+            method = self._methods[item] = ActorRefMethod(self, item)
+            return method
+
+    def __hash__(self):
+        return hash((self.address, self.uid))
+
+    def __eq__(self, other):
+        other_type = type(other)
+        if other_type is ActorRef or other_type is LocalActorRef:
+            return self.address == other.address and self.uid == other.uid
+        return False
+
+    def __repr__(self):
+        return 'ActorRef(uid={!r}, address={!r})'.format(self.uid, self.address)
+
+
+cdef class _DelayedArgument:
+    cdef readonly tuple arguments
+
+    def __init__(self, tuple arguments):
+        self.arguments = arguments
+
+
+cdef class ActorRefMethod:
+    """
+    Wrapper for an Actor method on the client side
+    """
+    cdef ActorRef ref
+    cdef object method_name
+    cdef object _options
+
+    def __init__(self, ref, method_name, options=None):
+        self.ref = ref
+        self.method_name = method_name
+        self._options = options or {}
+
+    def __call__(self, *args, **kwargs):
+        return self.send(*args, **kwargs)
+
+    def options(self, **options):
+        return ActorRefMethod(self.ref, self.method_name, options)
+
+    def send(self, *args, **kwargs):
+        arg_tuple = (self.method_name, CALL_METHOD_DEFAULT, args, kwargs)
+        return get_context().send(self.ref, arg_tuple, **self._options)
+
+    def tell(self, *args, **kwargs):
+        arg_tuple = (self.method_name, CALL_METHOD_DEFAULT, args, kwargs)
+        return get_context().send(self.ref, arg_tuple, wait_response=False, **self._options)
+
+    def delay(self, *args, **kwargs):
+        arg_tuple = (self.method_name, CALL_METHOD_DEFAULT, args, kwargs)
+        return _DelayedArgument(arg_tuple)
+
+    def batch(self, *delays, send=True):
+        cdef:
+            long n_delays = len(delays)
+            bint has_kw = False
+            list args_list
+            list kwargs_list
+            _DelayedArgument delay
+
+        args_list = [None] * n_delays
+        kwargs_list = [None] * n_delays
+
+        last_method = None
+        for idx in range(n_delays):
+            delay = delays[idx]
+            method, _call_method, args, kwargs = delay.arguments
+            if last_method is not None and method != last_method:
+                raise ValueError('Does not support calling multiple methods in batch')
+            last_method = method
+
+            args_list[idx] = args
+            kwargs_list[idx] = kwargs
+            if kwargs:
+                has_kw = True
+
+        if not has_kw:
+            kwargs_list = None
+        if last_method is None:
+            last_method = self.method_name
+
+        message = (last_method, CALL_METHOD_BATCH, (args_list, kwargs_list), None)
+        return get_context().send(self.ref, message, wait_response=send, **self._options)
+
+    def tell_delay(self, *args, delay=None, ignore_conn_fail=True, **kwargs):
+        async def delay_fun():
+            try:
+                await asyncio.sleep(delay)
+                message = (self.method_name, CALL_METHOD_DEFAULT, args, kwargs)
+                await get_context().send(self.ref, message, wait_response=False, **self._options)
+            except Exception as ex:
+                if ignore_conn_fail and isinstance(ex, ConnectionRefusedError):
+                    return
+
+                logger.error(f'Error {type(ex)} occurred when calling {self.method_name} '
+                             f'on {self.ref.uid} at {self.ref.address} with tell_delay')
+                raise
+
+        return asyncio.create_task(delay_fun())
+
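+# A minimal illustrative sketch (assumed actor method name `add`, not part of
+# the upstream module) of the call styles ActorRefMethod exposes on a ref:
+#
+#     result = await ref.add(1)            # send(): wait for the result
+#     await ref.add.tell(2)                # tell(): fire-and-forget
+#     ref.add.tell_delay(3, delay=1.5)     # tell after 1.5 seconds
+#     # delay()/batch() pack several calls of the *same* method into one message
+#     results = await ref.add.batch(
+#         ref.add.delay(4),
+#         ref.add.delay(5),
+#     )
+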
+
+cdef class LocalActorRef(ActorRef):
+    def __init__(self, _BaseActor actor):
+        # Make sure the input actor is an instance of _BaseActor.
+        super().__init__(actor._address, actor._uid)
+        self._actor_weakref = weakref.ref(actor, lambda _: self._methods.clear())
+
+    cdef _weakref_local_actor(self):
+        actor = _get_local_actor(self.address, self.uid)
+        # Make sure the input actor is an instance of _BaseActor.
+        if actor is not None and isinstance(actor, _BaseActor):
+            self._actor_weakref = weakref.ref(actor, lambda _: self._methods.clear())
+            return actor
+        return None
+
+    def __getattr__(self, item):
+        try:
+            return self._methods[item]
+        except KeyError:
+            actor = self._actor_weakref() or self._weakref_local_actor()
+            if actor is None:
+                raise ActorNotExist(f"Actor {self.uid} does not exist") from None
+            # Raise AttributeError early if the actor has no such attribute.
+            getattr(actor, item)
+            method = self._methods[item] = LocalActorRefMethod(self, item)
+            return method
+
+    def __repr__(self):
+        return 'LocalActorRef(uid={!r}, address={!r}, actor_weakref={!r})'.format(
+            self.uid, self.address, self._actor_weakref)
+
+
+async def __pyx_actor_method_wrapper(method, result_handler, lock, args, kwargs):
+    async with lock:
+        result = method(*args, **kwargs)
+        if asyncio.iscoroutine(result):
+            result = await result
+    return await result_handler(result)
+
+# Avoid global lookup.
+_actor_method_wrapper = __pyx_actor_method_wrapper
+
+
+cdef class LocalActorRefMethod:
+    cdef LocalActorRef _local_actor_ref
+    cdef object _method_name
+
+    def __init__(self, LocalActorRef local_actor_ref, method_name):
+        self._local_actor_ref = local_actor_ref
+        self._method_name = method_name
+
+    cdef tuple _get_referent(self):
+        actor = self._local_actor_ref._actor_weakref() or self._local_actor_ref._weakref_local_actor()
+        if actor is None:
+            raise ActorNotExist(f"Actor {self._local_actor_ref.uid} does not exist.")
+        method = getattr(actor, self._method_name)
+        return actor, method
+
+    def __call__(self, *args, **kwargs):
+        actor, method = self._get_referent()
+        return _actor_method_wrapper(
+            method, actor._handle_actor_result, (<_BaseActor>actor)._lock, args, kwargs)
+
+    def options(self, **options):
+        return self
+
+    def send(self, *args, **kwargs):
+        actor, method = self._get_referent()
+        return _actor_method_wrapper(
+            method, actor._handle_actor_result, (<_BaseActor>actor)._lock, args, kwargs)
+
+    def tell(self, *args, **kwargs):
+        actor, method = self._get_referent()
+        coro = _actor_method_wrapper(
+            method, actor._handle_actor_result, (<_BaseActor>actor)._lock, args, kwargs)
+        asyncio.create_task(coro)
+        return asyncio.sleep(0)
+
+    def delay(self, *args, **kwargs):
+        actor, method = self._get_referent()
+        return method.delay(*args, **kwargs)
+
+    def batch(self, *delays, send=True):
+        actor, method = self._get_referent()
+        coro = _actor_method_wrapper(
+            method.batch, actor._handle_actor_result, (<_BaseActor>actor)._lock, delays, dict())
+        if send:
+            return coro
+        else:
+            asyncio.create_task(coro)
+            return asyncio.sleep(0)
+
+    def tell_delay(self, *args, delay=None, ignore_conn_fail=True, **kwargs):
+        async def delay_fun():
+            await asyncio.sleep(delay)
+            await self.tell(*args, **kwargs)
+
+        return asyncio.create_task(delay_fun())
+
+
+cdef class _BaseActor:
+    """
+    Base class for Mars actors; user-defined methods become remotely callable actor methods
+    """
+    def __cinit__(self, *args, **kwargs):
+        self._lock = self._create_lock()
+
+    def _create_lock(self):
+        raise NotImplementedError
+
+    @property
+    def uid(self):
+        return self._uid
+
+    @uid.setter
+    def uid(self, uid):
+        self._uid = uid
+
+    def _set_uid(self, uid):
+        self._uid = uid
+
+    @property
+    def address(self):
+        return self._address
+
+    @address.setter
+    def address(self, addr):
+        self._address = addr
+
+    def _set_address(self, addr):
+        self._address = addr
+
+    cpdef ActorRef ref(self):
+        return create_actor_ref(self._address, self._uid)
+
+    async def _handle_actor_result(self, result):
+        cdef int idx
+        cdef tuple res_tuple
+        cdef list tasks, coros, coro_poses, values
+        cdef object coro
+        cdef bint extract_tuple = False
+        cdef bint cancelled = False
+        cdef set dones, pending
+
+        if inspect.isawaitable(result):
+            result = await result
+        elif is_async_generator(result):
+            result = (result,)
+            extract_tuple = True
+
+        if type(result) is tuple:
+            res_tuple = result
+            coros = []
+            coro_poses = []
+            values = []
+            for idx, res_item in enumerate(res_tuple):
+                if is_async_generator(res_item):
+                    value = self._run_actor_async_generator(res_item)
+                    coros.append(value)
+                    coro_poses.append(idx)
+                elif inspect.isawaitable(res_item):
+                    value = res_item
+                    coros.append(value)
+                    coro_poses.append(idx)
+                else:
+                    value = res_item
+                values.append(value)
+
+            # when there is only one coroutine, skip asyncio.wait,
+            # which would introduce considerable overhead
+            if len(coros) == 1:
+                task_result = await coros[0]
+                if extract_tuple:
+                    result = task_result
+                else:
+                    result = tuple(task_result if t is coros[0] else t for t in values)
+            elif len(coros) > 0:
+                tasks = [asyncio.create_task(t) for t in coros]
+                try:
+                    dones, pending = await asyncio.wait(tasks)
+                except asyncio.CancelledError:
+                    cancelled = True
+                    for task in tasks:
+                        task.cancel()
+                    # wait till all tasks finish after cancellation
+                    dones, pending = await asyncio.wait(tasks)
+
+                if extract_tuple:
+                    result = list(dones)[0].result()
+                else:
+                    for pos in coro_poses:
+                        task = tasks[pos]
+                        values[pos] = task.result()
+                    result = tuple(values)
+
+                if cancelled:
+                    # re-raise in case no CancelledError was raised
+                    raise asyncio.CancelledError
+
+        return result
+
+    async def _run_actor_async_generator(self, gen: AsyncGenerator):
+        """
+        Run an async generator under Actor lock
+        """
+        cdef tuple res_tuple
+        cdef bint is_exception = False
+        cdef object res
+        cdef object message_trace = None, pop_message_trace = None, set_message_trace = None
+
+        from .debug import debug_async_timeout, pop_message_trace, set_message_trace
+        try:
+            res = None
+            while True:
+                async with self._lock:
+                    with debug_async_timeout('actor_lock_timeout',
+                                             'async_generator %r hold lock timeout', gen):
+                        if not is_exception:
+                            res = await gen.asend(res)
+                        else:
+                            res = await gen.athrow(*res)
+                try:
+                    if _log_cycle_send:
+                        message_trace = pop_message_trace()
+
+                    res = await self._handle_actor_result(res)
+                    is_exception = False
+                except:
+                    res = sys.exc_info()
+                    is_exception = True
+                finally:
+                    if _log_cycle_send:
+                        set_message_trace(message_trace)
+        except Return as ex:
+            return ex.value
+        except StopAsyncIteration as ex:
+            return
+
+    async def __post_create__(self):
+        """
+        Method called after actor creation
+        """
+        pass
+
+    async def __pre_destroy__(self):
+        """
+        Method called before the actor is destroyed
+        """
+        pass
+
+    async def __on_receive__(self, tuple message):
+        """
+        Handle messages from other actors and dispatch them to user methods
+
+        Parameters
+        ----------
+        message : tuple
+            Message shall be a tuple of (method_name, call_method, args, kwargs)
+        """
+        from .debug import debug_async_timeout
+        try:
+            method, call_method, args, kwargs = message
+            if call_method == CALL_METHOD_DEFAULT:
+                func = getattr(self, method)
+                async with self._lock:
+                    with debug_async_timeout('actor_lock_timeout',
+                                             "Method %s of actor %s hold lock timeout.",
+                                             method, self.uid):
+                        result = func(*args, **kwargs)
+                        if asyncio.iscoroutine(result):
+                            result = await result
+            elif call_method == CALL_METHOD_BATCH:
+                func = getattr(self, method)
+                async with self._lock:
+                    with debug_async_timeout('actor_lock_timeout',
+                                             "Batch method %s of actor %s hold lock timeout, batch size %s.",
+                                             method, self.uid, len(args)):
+                        args_list, kwargs_list = args
+                        if kwargs_list is None:
+                            kwargs_list = [{}] * len(args_list)
+                        result = func.call_with_lists(args_list, kwargs_list)
+                        if asyncio.iscoroutine(result):
+                            result = await result
+            else:  # pragma: no cover
+                raise ValueError(f'call_method {call_method} not valid')
+
+            return await self._handle_actor_result(result)
+        except Exception as ex:
+            if _log_unhandled_errors:
+                from .debug import logger as debug_logger
+
+                # use `%.500` to avoid printing overly long messages
+                debug_logger.exception('Got unhandled error when handling message %.500r '
+                                       'in actor %s at %s', message, self.uid, self.address)
+            raise ex
+
+
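+# A minimal illustrative sketch (hypothetical actor; `mo` stands for this oscar
+# package) of the result shapes _handle_actor_result() accepts from user methods:
+#
+#     class MyActor(mo.Actor):
+#         async def plain(self):
+#             return 1                              # plain values pass through
+#
+#         async def mixed(self, other_ref):
+#             return other_ref.compute(), 42        # awaitables inside a tuple are
+#                                                   # awaited and substituted in place
+#
+#         async def chained(self, other_ref):
+#             result = yield other_ref.compute()    # async generator: the yielded
+#             raise mo.Return(result + 1)           # awaitable is resolved outside the
+#                                                   # actor lock and its value sent back;
+#                                                   # Return carries the final result
+
+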
+# @cython.binding(True) is needed so that Ray can retrieve class members.
+# The value defaults to True in Cython >= 3.0.0.
+@cython.binding(True)
+cdef class _Actor(_BaseActor):
+    def _create_lock(self):
+        return asyncio.locks.Lock()
+
+
+cdef class _FakeLock:
+    async def __aenter__(self):
+        pass
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+# @cython.binding(True) is needed so that Ray can retrieve class members.
+# The value defaults to True in Cython >= 3.0.0.
+@cython.binding(True)
+cdef class _StatelessActor(_BaseActor):
+    def _create_lock(self):
+        return _FakeLock()
diff --git a/python/xorbits/_mars/oscar/debug.py b/python/xorbits/_mars/oscar/debug.py
new file mode 100644
index 000000000..fc9e893e3
--- /dev/null
+++ b/python/xorbits/_mars/oscar/debug.py
@@ -0,0 +1,182 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio.tasks
+import contextvars
+import json
+import logging
+import os
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import List, Optional  # noqa: F401
+
+from ..utils import dataslots
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+@dataslots
+@dataclass
+class MessageTraceItem:
+    uid: str
+    address: str
+    method: str
+
+
+@dataslots
+@dataclass
+class DebugOptions:
+    actor_call_timeout: int = 10
+    process_message_timeout: int = 30
+    actor_lock_timeout: int = 30
+    ray_object_retrieval_timeout: int = 10
+    log_unhandled_errors: bool = True
+    log_cycle_send: bool = True
+
+
+_debug_opts: Optional[DebugOptions] = None
+
+
+def get_debug_options() -> Optional[DebugOptions]:
+    return _debug_opts
+
+
+def set_debug_options(options: Optional[DebugOptions]):
+    global _debug_opts
+    _debug_opts = options
+
+    # deliver debug config to native codes for optimization
+    from .core import set_debug_options as core_set_debug_options
+
+    core_set_debug_options(options)
+
+
+def reload_debug_opts_from_env():
+    config_str = os.environ.get("DEBUG_OSCAR", "0")
+    if config_str == "0":
+        set_debug_options(None)
+        return
+    config_json = {} if config_str == "1" else json.loads(config_str)
+    set_debug_options(DebugOptions(**config_json))
+
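+# An illustrative sketch of how the DEBUG_OSCAR environment variable read by
+# reload_debug_opts_from_env() above is interpreted (values are examples only):
+#
+#     DEBUG_OSCAR=0    -> debugging disabled (options set to None)
+#     DEBUG_OSCAR=1    -> DebugOptions() with the defaults declared above
+#     DEBUG_OSCAR='{"actor_lock_timeout": 5, "log_cycle_send": false}'
+#                      -> DebugOptions(actor_lock_timeout=5, log_cycle_send=False)
+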
+
+async def _log_timeout(timeout, msg, *args, **kwargs):
+    start_time, rnd = time.time(), 1
+    while True:
+        await asyncio.sleep(timeout * rnd)
+        rnd += 1
+        logger.warning(
+            msg + " (timeout for %.4f seconds).",
+            *args,
+            time.time() - start_time,
+            **kwargs,
+        )
+
+
+@contextmanager
+def debug_async_timeout(option_name: str, msg, *args, **kwargs):
+    if _debug_opts is None:
+        yield
+    else:
+        timeout_val = getattr(_debug_opts, option_name, -1)
+        timeout_task = None
+        if timeout_val and timeout_val > 0:
+            timeout_task = asyncio.create_task(
+                _log_timeout(timeout_val, msg, *args, **kwargs)
+            )
+
+        try:
+            yield
+        finally:
+            if timeout_task is not None:
+                timeout_task.cancel()
+
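+# An illustrative sketch of using debug_async_timeout() above: while the guarded
+# block is still running, a background task logs the given warning with an
+# increasing back-off until the block exits (the coroutine below is hypothetical).
+#
+#     with debug_async_timeout("actor_lock_timeout",
+#                              "actor %s held the lock for too long", uid):
+#         await do_something_slow()
+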
+
+_message_trace_var = contextvars.ContextVar("_message_trace_var")
+
+
+@contextmanager
+def record_message_trace(message):
+    if _debug_opts is None or not _debug_opts.log_cycle_send:
+        yield
+    else:
+        msg_trace = list(message.message_trace or [])
+        msg_trace.append(
+            MessageTraceItem(
+                uid=message.actor_ref.uid,
+                address=message.actor_ref.address,
+                method=message.content[0],
+            )
+        )
+        _message_trace_var.set(msg_trace)
+        try:
+            yield
+        finally:
+            _message_trace_var.set(None)
+
+
+def detect_cycle_send(message, wait_response: bool = True):
+    if _debug_opts is None or not _debug_opts.log_cycle_send or not wait_response:
+        return
+
+    cur_trace = _message_trace_var.get(None) or []  # type: List[MessageTraceItem]
+    message.message_trace = cur_trace
+
+    ref_key = (message.actor_ref.uid, message.actor_ref.address)
+    traced_ref_keys = set((item.uid, item.address) for item in cur_trace)
+    if ref_key in traced_ref_keys:
+        looped_trace = cur_trace + [
+            MessageTraceItem(
+                uid=message.actor_ref.uid,
+                address=message.actor_ref.address,
+                method=message.content[0],
+            )
+        ]
+
+        formatted_trace = "\n    ".join(
+            f"Calling {t.method!r} in actor {t.uid} at {t.address}"
+            for t in looped_trace
+        )
+        logger.warning(
+            "Call cycle detected when sending to actor %s at %s, the trace is\n"
+            "    %s",
+            message.actor_ref.uid,
+            message.actor_ref.address,
+            formatted_trace,
+        )
+
+
+@contextmanager
+def no_message_trace():
+    if _debug_opts is None or not _debug_opts.log_cycle_send:
+        yield
+    else:
+        trace = pop_message_trace()
+        yield
+        set_message_trace(trace)
+
+
+def pop_message_trace():
+    trace = _message_trace_var.get(None)
+    _message_trace_var.set(None)
+    return trace
+
+
+def set_message_trace(message_trace):
+    _message_trace_var.set(message_trace)
+
+
+reload_debug_opts_from_env()
diff --git a/python/xorbits/_mars/oscar/driver.py b/python/xorbits/_mars/oscar/driver.py
new file mode 100644
index 000000000..baadfedee
--- /dev/null
+++ b/python/xorbits/_mars/oscar/driver.py
@@ -0,0 +1,41 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from numbers import Number
+from typing import Dict, Type
+
+
+class BaseActorDriver(ABC):
+    @classmethod
+    @abstractmethod
+    def setup_cluster(cls, address_to_resources: Dict[str, Dict[str, Number]]):
+        """
+        Set up a cluster according to the given resources.
+        Each node's resources are given as a dict, e.g. {'CPU': 3, 'GPU': 1}.
+
+        Parameters
+        ----------
+        address_to_resources: dict
+            Resources required for each node.
+        """
+        pass
+
+
+_backend_driver_cls: Dict[str, Type[BaseActorDriver]] = dict()
+
+
+def register_backend_driver(scheme: str, cls: Type[BaseActorDriver]):
+    assert issubclass(cls, BaseActorDriver)
+    _backend_driver_cls[scheme] = cls
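+
+
+# A minimal illustrative sketch (hypothetical driver class and scheme) of how a
+# backend registers its driver so setup_cluster() can be dispatched by address scheme:
+#
+#     class MyActorDriver(BaseActorDriver):
+#         @classmethod
+#         def setup_cluster(cls, address_to_resources):
+#             ...  # allocate nodes according to the requested resources
+#
+#     register_backend_driver("my_scheme", MyActorDriver)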
diff --git a/python/xorbits/_mars/oscar/errors.py b/python/xorbits/_mars/oscar/errors.py
new file mode 100644
index 000000000..5af99b8e2
--- /dev/null
+++ b/python/xorbits/_mars/oscar/errors.py
@@ -0,0 +1,60 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..core.base import MarsError
+
+
+class ReconstructWorkerError(MarsError):
+    pass
+
+
+class ActorPoolNotStarted(MarsError):
+    pass
+
+
+class ActorNotExist(MarsError):
+    pass
+
+
+class ActorAlreadyExist(MarsError):
+    pass
+
+
+class NoIdleSlot(MarsError):
+    pass
+
+
+class NoFreeSlot(MarsError):
+    pass
+
+
+class SlotStateError(MarsError):
+    pass
+
+
+class ServerClosed(MarsError):
+    pass
+
+
+class CannotCancelTask(MarsError):
+    pass
+
+
+class SendMessageFailed(MarsError):
+    pass
+
+
+class Return(MarsError):
+    def __init__(self, value):
+        self.value = value
diff --git a/python/xorbits/_mars/oscar/profiling.py b/python/xorbits/_mars/oscar/profiling.py
new file mode 100644
index 000000000..c9a0eb880
--- /dev/null
+++ b/python/xorbits/_mars/oscar/profiling.py
@@ -0,0 +1,293 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import copy
+import heapq
+import json
+import logging
+import operator
+import os
+from collections import Counter
+from collections.abc import Mapping
+
+from ..typing import BandType
+from .backends.message import SendMessage, TellMessage
+
+logger = logging.getLogger(__name__)
+
+MARS_ENABLE_PROFILING = int(os.environ.get("MARS_ENABLE_PROFILING", 0))
+
+
+class _ProfilingOptionDescriptor:
+    def __init__(self, _type, default):
+        self._name = None
+        self._type = _type
+        self._default = default
+
+    def __get__(self, obj, cls):
+        if obj is None:
+            return self
+        v = obj._options.get(self._name)
+        if v is None:
+            v = os.environ.get(f"MARS_PROFILING_{self._name.upper()}", self._default)
+        if v is not None:
+            v = self._type(v)
+        # Cache the value.
+        obj.__dict__[self._name] = v
+        return v
+
+    def set_name(self, name: str):
+        self._name = name
+
+
+class _ProfilingOptionsMeta(type):
+    def __init__(cls, name, bases, classdict):
+        super(_ProfilingOptionsMeta, cls).__init__(name, bases, classdict)
+        for k, v in classdict.items():
+            if isinstance(v, _ProfilingOptionDescriptor):
+                v.set_name(k)
+
+
+class _ProfilingOptions(metaclass=_ProfilingOptionsMeta):
+    debug_interval_seconds = _ProfilingOptionDescriptor(float, default=None)
+    slow_calls_duration_threshold = _ProfilingOptionDescriptor(int, default=1)
+    slow_subtasks_duration_threshold = _ProfilingOptionDescriptor(int, default=10)
+
+    def __init__(self, options):
+        if isinstance(options, Mapping):
+            invalid_keys = options.keys() - type(self).__dict__.keys()
+            if invalid_keys:
+                raise ValueError(f"Invalid profiling options: {invalid_keys}")
+            self._options = options
+        elif options in (True, False, None):
+            self._options = {}
+        else:
+            raise ValueError(f"Invalid profiling options: {options}")
+
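+# An illustrative sketch of how the descriptor above resolves an option value:
+# explicit options win over MARS_PROFILING_* environment variables, which win
+# over the declared default, and the resolved value is cached on the instance.
+#
+#     os.environ["MARS_PROFILING_DEBUG_INTERVAL_SECONDS"] = "2"
+#     _ProfilingOptions(True).debug_interval_seconds                             # -> 2.0
+#     _ProfilingOptions({"debug_interval_seconds": 1.0}).debug_interval_seconds  # -> 1.0
+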
+
+class DummyOperator:
+    @staticmethod
+    def set(key, value):
+        pass
+
+    @staticmethod
+    def inc(key, value):
+        pass
+
+    @staticmethod
+    def nest(key):
+        return DummyOperator
+
+    @staticmethod
+    def values():
+        return []
+
+    @staticmethod
+    def empty():
+        return True
+
+
+class ProfilingDataOperator:
+    __slots__ = ("_target",)
+
+    def __init__(self, target):
+        self._target = target
+
+    def set(self, key, value):
+        self._target[key] = value
+
+    def inc(self, key, value):
+        old = self._target.get(key, 0)
+        self._target[key] = old + value
+
+    def nest(self, key):
+        v = self._target.setdefault(key, {})
+        if not isinstance(v, dict):
+            raise TypeError(
+                f"The value type of key {key} is {type(v)}, but a dict is expected."
+            )
+        return ProfilingDataOperator(v)
+
+    def values(self):
+        return self._target.values()
+
+    def empty(self):
+        return len(self._target) == 0
+
+
+class _CallStats:
+    def __init__(self, options: _ProfilingOptions):
+        self._options = options
+        self._call_counter = Counter()
+        self._slow_calls = []
+
+    def collect(self, message, duration: float):
+        key = (message.actor_ref.uid, message.content[0])
+        self._call_counter[key] += 1
+        if duration < self._options.slow_calls_duration_threshold:
+            return
+        key = (
+            duration,
+            message.actor_ref.uid,
+            message.actor_ref.address,
+            message.content,
+        )
+        try:
+            if len(self._slow_calls) < 10:
+                heapq.heappush(self._slow_calls, key)
+            else:
+                heapq.heapreplace(self._slow_calls, key)
+        except TypeError:
+            pass
+
+    def to_dict(self) -> dict:
+        most_calls = {}
+        for name_tuple, count in self._call_counter.most_common(10):
+            uid, method_name = name_tuple
+            most_calls[f"{uid.decode('utf-8')}.{method_name}"] = count
+        slow_calls = {}
+        for duration, uid, address, content in sorted(
+            self._slow_calls, key=operator.itemgetter(0), reverse=True
+        ):
+            method_name, _batch, args, kwargs = content
+            slow_calls[
+                f"[{address}]{uid.decode('utf-8')}.{method_name}(args={args}, kwargs={kwargs})"
+            ] = duration
+        return {"most_calls": most_calls, "slow_calls": slow_calls}
+
+
+class _SubtaskStats:
+    def __init__(self, options: _ProfilingOptions):
+        self._options = options
+        self._band_counter = Counter()
+        self._slow_subtasks = []
+
+    def collect(self, subtask, band: BandType, duration: float):
+        band_address = band[0]
+        self._band_counter[band_address] += 1
+        if duration < self._options.slow_subtasks_duration_threshold:
+            return
+        key = (duration, band_address, subtask)
+        try:
+            if len(self._slow_subtasks) < 10:
+                heapq.heappush(self._slow_subtasks, key)
+            else:
+                heapq.heapreplace(self._slow_subtasks, key)
+        except TypeError:
+            pass
+
+    def to_dict(self) -> dict:
+        band_subtasks = {}
+        key = operator.itemgetter(1)
+        if len(self._band_counter) > 10:
+            items = self._band_counter.items()
+            band_subtasks.update(heapq.nlargest(5, items, key=key))
+            band_subtasks.update(reversed(heapq.nsmallest(5, items, key=key)))
+        else:
+            band_subtasks.update(
+                sorted(self._band_counter.items(), key=key, reverse=True)
+            )
+        slow_subtasks = {}
+        for duration, band, subtask in sorted(
+            self._slow_subtasks, key=operator.itemgetter(0), reverse=True
+        ):
+            slow_subtasks[f"[{band}]{subtask}"] = duration
+        return {"band_subtasks": band_subtasks, "slow_subtasks": slow_subtasks}
+
+
+class _ProfilingData:
+    def __init__(self):
+        self._data = {}
+        self._call_stats = {}
+        self._subtask_stats = {}
+        self._debug_task = {}
+
+    def init(self, task_id: str, options=None):
+        options = _ProfilingOptions(options)
+        logger.info(
+            "Init profiling data for task %s with debug interval seconds %s.",
+            task_id,
+            options.debug_interval_seconds,
+        )
+        self._data[task_id] = {
+            "general": {},
+            "serialization": {},
+            "most_calls": {},
+            "slow_calls": {},
+            "band_subtasks": {},
+            "slow_subtasks": {},
+        }
+        self._call_stats[task_id] = _CallStats(options)
+        self._subtask_stats[task_id] = _SubtaskStats(options)
+
+        async def _debug_profiling_log():
+            while True:
+                try:
+                    r = self._data.get(task_id, None)
+                    if r is None:
+                        logger.info("Profiling debug log break.")
+                        break
+                    r = copy.copy(r)  # shallow copy is enough.
+                    r.update(self._call_stats.get(task_id).to_dict())
+                    r.update(self._subtask_stats.get(task_id).to_dict())
+                    logger.warning("Profiling debug:\n%s", json.dumps(r, indent=4))
+                except Exception:
+                    logger.exception("Profiling debug log failed.")
+                await asyncio.sleep(options.debug_interval_seconds)
+
+        if options.debug_interval_seconds is not None:
+            self._debug_task[task_id] = task = asyncio.create_task(
+                _debug_profiling_log()
+            )
+            task.add_done_callback(lambda _: self._debug_task.pop(task_id, None))
+
+    def pop(self, task_id: str):
+        logger.info("Pop profiling data of task %s.", task_id)
+        debug_task = self._debug_task.pop(task_id, None)
+        if debug_task is not None:
+            debug_task.cancel()
+        r = self._data.pop(task_id, None)
+        if r is not None:
+            r.update(self._call_stats.pop(task_id).to_dict())
+            r.update(self._subtask_stats.pop(task_id).to_dict())
+        return r
+
+    def collect_actor_call(self, message, duration: float):
+        if self._call_stats:
+            message_type = type(message)
+            if message_type is SendMessage or message_type is TellMessage:
+                for stats in self._call_stats.values():
+                    stats.collect(message, duration)
+
+    def collect_subtask(self, subtask, band: BandType, duration: float):
+        if self._subtask_stats:
+            stats = self._subtask_stats.get(subtask.task_id)
+            if stats is not None:
+                stats.collect(subtask, band, duration)
+
+    def __getitem__(self, item):
+        key = item if isinstance(item, tuple) else (item,)
+        v = None
+        d = self._data
+        for k in key:
+            v = d.get(k, None)
+            if v is None:
+                break
+            else:
+                d = v
+        return DummyOperator if v is None else ProfilingDataOperator(v)
+
+
+ProfilingData = _ProfilingData()
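+
+
+# An illustrative sketch of typical use of the module-level ProfilingData
+# singleton (the task id is hypothetical):
+#
+#     ProfilingData.init("task-1", {"debug_interval_seconds": 1.0})
+#     ProfilingData["task-1", "serialization"].inc("total_seconds", 0.2)
+#     stats = ProfilingData.pop("task-1")   # dict merged with call/subtask stats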
diff --git a/python/xorbits/_mars/oscar/tests/__init__.py b/python/xorbits/_mars/oscar/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/tests/test_actorcaller.py b/python/xorbits/_mars/oscar/tests/test_actorcaller.py
new file mode 100644
index 000000000..1e271f7ca
--- /dev/null
+++ b/python/xorbits/_mars/oscar/tests/test_actorcaller.py
@@ -0,0 +1,84 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+
+import pytest
+
+from ...tests.core import mock
+from ..backends.core import ActorCaller
+from ..backends.router import Router
+from ..errors import ServerClosed
+
+
+@pytest.mark.asyncio
+@mock.patch.object(Router, "get_client")
+async def test_send_when_close(fake_get_client):
+    class FakeClient:
+        def __init__(self):
+            self.closed = False
+            self.send_num = 0
+            self._messages = asyncio.Queue()
+            self.dest_address = "test"
+
+        async def send(self, message):
+            await self._messages.put(message)
+            self.send_num += 1
+            if self.send_num >= 3:
+                raise ConnectionError("test")
+
+        async def recv(self, *args, **kwargs):
+            await asyncio.sleep(3)
+            res = await self._messages.get()
+            return res
+
+        async def close(self):
+            self.closed = True
+
+    fake_client = FakeClient()
+    fake_get_client.side_effect = lambda *args, **kwargs: fake_client
+
+    class FakeMessage:
+        def __init__(self, id_num):
+            self.message_id = id_num
+
+    caller = ActorCaller()
+
+    router = Router(
+        external_addresses=["test1"],
+        local_address="test2",
+    )
+    futures = []
+    for index in range(2):
+        futures.append(
+            await caller.call(
+                router=router,
+                dest_address="test1",
+                message=FakeMessage(index),
+                wait=False,
+            )
+        )
+
+    with pytest.raises(ServerClosed):
+        # Just wait for the _listen task to run.
+        await asyncio.sleep(1)
+        await caller.call(
+            router=router, dest_address="test1", message=FakeMessage(2), wait=False
+        )
+
+    res0 = await futures[0]
+    assert res0.message_id == 0
+
+    with pytest.raises(ServerClosed):
+        await futures[1]
diff --git a/python/xorbits/_mars/oscar/tests/test_batch.py b/python/xorbits/_mars/oscar/tests/test_batch.py
new file mode 100644
index 000000000..bf81ad6b1
--- /dev/null
+++ b/python/xorbits/_mars/oscar/tests/test_batch.py
@@ -0,0 +1,216 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import sys
+
+import pytest
+
+from ..batch import build_args_binder, extensible
+
+
+def _wrap_async(use_async):
+    def wrapper(func):
+        async def _wrapped(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        _wrapped.__name__ = func.__name__
+        return _wrapped if use_async else func
+
+    return wrapper
+
+
+def test_args_binder():
+    anon_binder = build_args_binder(lambda x, y=10: None, remove_self=False)
+    assert (20, 10) == anon_binder(20)
+
+    def fun1(a, b=10):
+        pass
+
+    binder1 = build_args_binder(fun1, remove_self=False)
+    assert (20, 10) == binder1(20)
+
+    async def fun2(*, kw_only=10, **kw):
+        pass
+
+    binder2 = build_args_binder(fun2, remove_self=False)
+    assert (20, {"ext_arg": 5}) == binder2(kw_only=20, ext_arg=5)
+
+    async def fun3(x, *args, kw_only=10, **kw):
+        pass
+
+    binder3 = build_args_binder(fun3, remove_self=False)
+    assert 10 == binder3(20, 36, ext_arg=5).kw_only
+    assert (20, (36,), 10, {"ext_arg": 5}) == binder3(20, 36, ext_arg=5)
+
+
+def test_extensible_bind():
+    class TestClass:
+        def __init__(self):
+            self.a_list = []
+            self.b_list = []
+
+        @extensible
+        def method(self, a, b=10):
+            pass
+
+        @method.batch
+        def method(self, args_list, kwargs_list):
+            for args, kwargs in zip(args_list, kwargs_list):
+                a, b = self.method.bind(*args, **kwargs)
+                self.a_list.append(a)
+                self.b_list.append(b)
+
+    test_inst = TestClass()
+    test_inst.method.batch(
+        test_inst.method.delay(20),
+        test_inst.method.delay(30, 5),
+    )
+    assert test_inst.a_list == [20, 30]
+    assert test_inst.b_list == [10, 5]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("use_async", [False, True])
+@pytest.mark.skipif(
+    sys.version_info[:2] < (3, 7), reason="only run with Python 3.7 or greater"
+)
+async def test_extensible_no_batch(use_async):
+    class TestClass:
+        def __init__(self):
+            self.arg_list = []
+            self.kwarg_list = []
+
+        @extensible
+        @_wrap_async(use_async)
+        def method(self, *args, **kwargs):
+            self.arg_list.append(tuple(a - 1 for a in args))
+            self.kwarg_list.append({k: v - 1 for k, v in kwargs.items()})
+            return len(self.kwarg_list)
+
+    test_inst = TestClass()
+    ret = test_inst.method.batch(
+        test_inst.method.delay(12, kwarg=34), test_inst.method.delay(10, kwarg=33)
+    )
+    ret = await ret if use_async else ret
+    assert ret == [1, 2]
+    assert test_inst.arg_list == [(11,), (9,)]
+    assert test_inst.kwarg_list == [{"kwarg": 33}, {"kwarg": 32}]
+
+    if use_async:
+        test_inst = TestClass()
+        ret = await test_inst.method.batch(
+            test_inst.method.delay(12, kwarg=34), test_inst.method.delay(10, kwarg=33)
+        )
+        assert ret == [1, 2]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("use_async", [False, True])
+async def test_extensible_batch_only(use_async):
+    class TestClass:
+        def __init__(self):
+            self.arg_list = []
+            self.kwarg_list = []
+
+        @extensible
+        @_wrap_async(use_async)
+        def not_implemented_method(self, *args, **kw):
+            raise NotImplementedError
+
+        @extensible
+        @_wrap_async(use_async)
+        def method(self, *args, **kwargs):
+            raise NotImplementedError
+
+        @method.batch
+        @_wrap_async(use_async)
+        def method(self, args_list, kwargs_list):
+            self.arg_list.extend(args_list)
+            self.kwarg_list.extend(kwargs_list)
+            return [len(self.kwarg_list)] * len(args_list)
+
+    if use_async:
+        assert asyncio.iscoroutinefunction(TestClass.method)
+
+    test_inst = TestClass()
+    ret = test_inst.method.batch()
+    ret = await ret if use_async else ret
+    assert ret == []
+
+    test_inst = TestClass()
+    ret = test_inst.method.batch(test_inst.method.delay(12))
+    ret = await ret if use_async else ret
+    assert ret == [1]
+
+    test_inst = TestClass()
+    ret = test_inst.method.batch(test_inst.method.delay(12), test_inst.method.delay(10))
+    ret = await ret if use_async else ret
+    assert ret == [2, 2]
+    assert test_inst.arg_list == [(12,), (10,)]
+    assert test_inst.kwarg_list == [{}, {}]
+
+    test_inst = TestClass()
+    for _ in range(2):
+        with pytest.raises(NotImplementedError):
+            ret = test_inst.not_implemented_method()
+            await ret if use_async else ret
+    ret = test_inst.method(12, kwarg=34)
+    ret = await ret if use_async else ret
+    assert ret == 1
+    assert test_inst.arg_list == [(12,)]
+    assert test_inst.kwarg_list == [{"kwarg": 34}]
+
+
+@pytest.mark.asyncio
+@pytest.mark.skipif(
+    sys.version_info[:2] < (3, 7), reason="only run with Python 3.7 or greater"
+)
+@pytest.mark.parametrize("use_async", [False, True])
+async def test_extensible_single_with_batch(use_async):
+    class TestClass:
+        def __init__(self):
+            self.arg_list = []
+            self.kwarg_list = []
+
+        @extensible
+        @_wrap_async(use_async)
+        def method(self, *args, **kwargs):
+            self.arg_list.append(tuple(a * 2 for a in args))
+            self.kwarg_list.append({k: v * 2 for k, v in kwargs.items()})
+            return len(self.kwarg_list)
+
+        @method.batch
+        @_wrap_async(use_async)
+        def method(self, args_list, kwargs_list):
+            self.arg_list.extend([tuple(a * 2 + 1 for a in args) for args in args_list])
+            self.kwarg_list.extend(
+                [{k: v * 2 + 1 for k, v in kwargs.items()} for kwargs in kwargs_list]
+            )
+            return [len(self.kwarg_list)] * len(args_list)
+
+    if use_async:
+        assert asyncio.iscoroutinefunction(TestClass.method)
+
+    test_inst = TestClass()
+    ret = test_inst.method(15, kwarg=56)
+    ret = await ret if use_async else ret
+    assert ret == 1
+    ret = test_inst.method.batch(
+        test_inst.method.delay(16, kwarg=57), test_inst.method.delay(17, kwarg=58)
+    )
+    ret = await ret if use_async else ret
+    assert ret == [3, 3]
+    assert test_inst.arg_list == [(30,), (33,), (35,)]
+    assert test_inst.kwarg_list == [{"kwarg": 112}, {"kwarg": 115}, {"kwarg": 117}]
diff --git a/python/xorbits/_mars/oscar/tests/test_profiling.py b/python/xorbits/_mars/oscar/tests/test_profiling.py
new file mode 100644
index 000000000..75802de9c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/tests/test_profiling.py
@@ -0,0 +1,177 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import dataclasses
+import os
+
+import pytest
+
+from ...tests.core import check_dict_structure_same, mock
+from ..backends.message import SendMessage
+from ..profiling import (
+    DummyOperator,
+    ProfilingData,
+    ProfilingDataOperator,
+    _CallStats,
+    _ProfilingOptions,
+    _SubtaskStats,
+)
+
+
+def test_profiling_data():
+    ProfilingData.init("abc")
+    try:
+        for n in ["general", "serialization"]:
+            assert isinstance(ProfilingData["abc", n], ProfilingDataOperator)
+        assert ProfilingData["def"] is DummyOperator
+        assert ProfilingData["abc", "def"] is DummyOperator
+        assert ProfilingData["abc", "def", 1] is DummyOperator
+        ProfilingData["def"].set("a", 1)
+        ProfilingData["def"].inc("b", 1)
+        assert ProfilingData["def"].empty()
+        assert sum(ProfilingData["def"].nest("a").values()) == 0
+        ProfilingData["abc", "serialization"].set("a", 1)
+        ProfilingData["abc", "serialization"].inc("b", 1)
+        with pytest.raises(TypeError):
+            assert ProfilingData["abc", "serialization"].nest("a")
+        assert sum(ProfilingData["abc", "serialization"].nest("c").values()) == 0
+        assert not ProfilingData["abc", "serialization"].empty()
+    finally:
+        v = ProfilingData.pop("abc")
+        check_dict_structure_same(
+            v,
+            {
+                "general": {},
+                "serialization": {"a": 1, "b": 1, "c": {}},
+                "most_calls": {},
+                "slow_calls": {},
+                "band_subtasks": {},
+                "slow_subtasks": {},
+            },
+        )
+
+
+@pytest.mark.asyncio
+@mock.patch("mars.oscar.profiling.logger.warning")
+async def test_profiling_debug(fake_warning):
+    ProfilingData.init("abc", {"debug_interval_seconds": 0.1})
+    assert len(ProfilingData._debug_task) == 1
+    assert not ProfilingData._debug_task["abc"].done()
+    await asyncio.sleep(0.5)
+    assert fake_warning.call_count > 1
+    ProfilingData.pop("abc")
+    call_count = fake_warning.call_count
+    assert len(ProfilingData._debug_task) == 0
+    await asyncio.sleep(0.5)
+    assert fake_warning.call_count == call_count
+
+    ProfilingData.init("abc", {"debug_interval_seconds": 0.1})
+    assert len(ProfilingData._debug_task) == 1
+    await asyncio.sleep(0.5)
+    assert fake_warning.call_count > call_count
+    ProfilingData._data.clear()
+    call_count = fake_warning.call_count
+    await asyncio.sleep(0.5)
+    assert len(ProfilingData._debug_task) == 0
+    assert fake_warning.call_count == call_count
+
+
+@pytest.mark.asyncio
+async def test_profiling_options():
+    with pytest.raises(ValueError):
+        ProfilingData.init("abc", 1.2)
+    with pytest.raises(ValueError):
+        ProfilingData.init("abc", ["invalid"])
+    with pytest.raises(ValueError):
+        ProfilingData.init("abc", {"invalid": True})
+    with pytest.raises(ValueError):
+        ProfilingData.init("abc", {"debug_interval_seconds": "abc"})
+
+    # Test the priority, options first, then env var.
+    env_key = "MARS_PROFILING_DEBUG_INTERVAL_SECONDS"
+    try:
+        os.environ[env_key] = "2"
+        options = _ProfilingOptions(True)
+        assert options.debug_interval_seconds == 2.0
+        options = _ProfilingOptions({"debug_interval_seconds": 1.0})
+        assert options.debug_interval_seconds == 1.0
+    finally:
+        os.environ.pop(env_key)
+
+    # Test option value cache.
+    d = {"debug_interval_seconds": 1.0}
+    options = _ProfilingOptions(d)
+    assert options.debug_interval_seconds == 1.0
+    d["debug_interval_seconds"] = 2.0
+    assert options.debug_interval_seconds == 1.0
+    try:
+        os.environ[env_key] = "2"
+        assert options.debug_interval_seconds == 1.0
+    finally:
+        os.environ.pop(env_key)
+
+
+def test_collect():
+    options = _ProfilingOptions(
+        {"slow_calls_duration_threshold": 0, "slow_subtasks_duration_threshold": 0}
+    )
+
+    # Test collect message with incomparable arguments.
+    from ..core import ActorRef
+
+    fake_actor_ref = ActorRef("def", b"uid")
+    fake_message1 = SendMessage(b"abc", fake_actor_ref, ["name", {}])
+    fake_message2 = SendMessage(b"abc", fake_actor_ref, ["name", 1])
+
+    cs = _CallStats(options)
+    cs.collect(fake_message1, 1.0)
+    cs.collect(fake_message2, 1.0)
+
+    @dataclasses.dataclass
+    class _FakeSubtask:
+        extra_config: dict
+
+    # Test collect subtask with incomparable arguments.
+    band = ("1.2.3.4", "numa-0")
+    subtask1 = _FakeSubtask({})
+    subtask2 = _FakeSubtask(None)
+    ss = _SubtaskStats(options)
+    ss.collect(subtask1, band, 1.0)
+    ss.collect(subtask2, band, 1.0)
+
+    # Test call stats order.
+    cs = _CallStats(options)
+    for i in range(20):
+        fake_message = SendMessage(
+            f"{i}".encode(), fake_actor_ref, ["name", True, (i,), {}]
+        )
+        cs.collect(fake_message, i)
+    d = cs.to_dict()
+    assert list(d["most_calls"].values())[0] == 20
+    assert list(d["slow_calls"].values()) == list(reversed(range(10, 20)))
+
+    # Test subtask stats order.
+    ss = _SubtaskStats(options)
+    counter = 0
+    for i in range(20):
+        for j in range(i):
+            fake_message = _FakeSubtask(counter)
+            ss.collect(fake_message, (str(j), "numa-0"), counter)
+            counter += 1
+    d = ss.to_dict()
+    assert list(d["band_subtasks"].values()) == [19, 18, 17, 16, 15, 5, 4, 3, 2, 1]
+    assert list(d["slow_subtasks"].values()) == list(
+        reversed(range(counter - 10, counter))
+    )
diff --git a/python/xorbits/_mars/oscar/utils.pxd b/python/xorbits/_mars/oscar/utils.pxd
new file mode 100644
index 000000000..7acd646c1
--- /dev/null
+++ b/python/xorbits/_mars/oscar/utils.pxd
@@ -0,0 +1,16 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cpdef bytes new_actor_id()
+cdef bint is_async_generator(obj)
diff --git a/python/xorbits/_mars/oscar/utils.pyx b/python/xorbits/_mars/oscar/utils.pyx
new file mode 100644
index 000000000..d08e55998
--- /dev/null
+++ b/python/xorbits/_mars/oscar/utils.pyx
@@ -0,0 +1,77 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import AsyncGenerator
+
+from .._utils cimport new_random_id, to_str
+from .core cimport ActorRef, LocalActorRef
+
+
+cpdef bytes new_actor_id():
+    return new_random_id(32)
+
+
+def create_actor_ref(*args, **kwargs):
+    """
+    Create an actor reference.
+
+    Returns
+    -------
+    ActorRef
+    """
+
+    cdef str address
+    cdef object uid
+    cdef ActorRef existing_ref
+
+    address = to_str(kwargs.pop('address', None))
+    uid = kwargs.pop('uid', None)
+
+    if kwargs:
+        raise ValueError('Only `address` or `uid` keywords are supported')
+
+    if len(args) == 2:
+        if address:
+            raise ValueError('address has been specified')
+        address = to_str(args[0])
+        uid = args[1]
+    elif len(args) == 1:
+        tp0 = type(args[0])
+        if tp0 is ActorRef or tp0 is LocalActorRef:
+            existing_ref = <ActorRef>args[0]
+            uid = existing_ref.uid
+            address = to_str(address or existing_ref.address)
+        else:
+            uid = args[0]
+
+    if uid is None:
+        raise ValueError('Actor uid should be provided')
+
+    return ActorRef(address, uid)
+
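+# An illustrative sketch of the calling conventions accepted by
+# create_actor_ref() above (addresses and uids are hypothetical):
+#
+#     create_actor_ref("127.0.0.1:12345", b"my_actor")            # address, uid
+#     create_actor_ref(address="127.0.0.1:12345", uid=b"my_actor")
+#     create_actor_ref(existing_ref)                               # copy an existing ref
+#     create_actor_ref(existing_ref, address="127.0.0.1:23456")    # rebind to another address
+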
+
+cdef set _is_async_generator_typecache = set()
+
+
+cdef bint is_async_generator(obj):
+    cdef type tp = type(obj)
+    if tp in _is_async_generator_typecache:
+        return True
+
+    if isinstance(obj, AsyncGenerator):
+        if len(_is_async_generator_typecache) < 100:
+            _is_async_generator_typecache.add(tp)
+        return True
+    else:
+        return False
diff --git a/python/xorbits/_mars/remote/__init__.py b/python/xorbits/_mars/remote/__init__.py
new file mode 100644
index 000000000..3aecffae3
--- /dev/null
+++ b/python/xorbits/_mars/remote/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# noinspection PyUnresolvedReferences
+from ..core import ExecutableTuple
+from .core import spawn
+from .run_script import run_script
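+
+
+# An illustrative sketch of the `spawn` API re-exported above (session setup
+# omitted; the function and values below are hypothetical):
+#
+#     import xorbits._mars.remote as mr
+#
+#     def add(x, y):
+#         return x + y
+#
+#     a = mr.spawn(add, args=(1, 2))
+#     b = mr.spawn(add, args=(a, 10))   # spawned results can feed later spawns
+#     b.execute()
+#     b.fetch()                         # -> 13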
diff --git a/python/xorbits/_mars/remote/core.py b/python/xorbits/_mars/remote/core.py
new file mode 100644
index 000000000..c5c5a443b
--- /dev/null
+++ b/python/xorbits/_mars/remote/core.py
@@ -0,0 +1,343 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import UserDict
+from collections.abc import Iterable
+from functools import partial
+
+import numpy as np
+
+from .. import opcodes
+from ..core import ENTITY_TYPE, ChunkData, OutputType, Tileable
+from ..core.custom_log import redirect_custom_log
+from ..core.operand import Operand
+from ..dataframe.core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
+from ..serialization.serializables import (
+    BoolField,
+    DictField,
+    FunctionField,
+    Int32Field,
+    ListField,
+)
+from ..tensor.core import TENSOR_TYPE
+from ..utils import (
+    build_fetch_tileable,
+    enter_current_session,
+    find_objects,
+    merge_chunks,
+    merged_chunk_as_tileable_type,
+    replace_objects,
+)
+from .operands import RemoteOperandMixin
+
+
+class RemoteFunction(RemoteOperandMixin, Operand):
+    _op_type_ = opcodes.REMOTE_FUNCATION
+    _op_module_ = "remote"
+
+    function = FunctionField("function")
+    function_args = ListField("function_args")
+    function_kwargs = DictField("function_kwargs")
+    retry_when_fail = BoolField("retry_when_fail")
+    resolve_tileable_input = BoolField("resolve_tileable_input", default=False)
+    n_output = Int32Field("n_output", default=None)
+
+    def __init__(self, output_types=None, **kwargs):
+        super().__init__(_output_types=output_types, **kwargs)
+
+    @property
+    def output_limit(self):
+        return self.n_output or 1
+
+    @property
+    def retryable(self) -> bool:
+        return self.retry_when_fail
+
+    @classmethod
+    def _no_prepare(cls, tileable):
+        return isinstance(
+            tileable, (TENSOR_TYPE, DATAFRAME_TYPE, SERIES_TYPE, INDEX_TYPE)
+        )
+
+    def _set_inputs(self, inputs):
+        raw_inputs = getattr(self, "_inputs", None)
+        super()._set_inputs(inputs)
+
+        function_inputs = iter(inp for inp in self._inputs)
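+        # map each original input to its freshly-set counterpart so that references
+        # inside function_args / function_kwargs can be rewritten below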
+        mapping = {inp: new_inp for inp, new_inp in zip(inputs, self._inputs)}
+        if raw_inputs is not None:
+            for raw_inp in raw_inputs:
+                if self._no_prepare(raw_inp):
+                    if not isinstance(self._inputs[0], ChunkData):
+                        # not in tile, set_inputs from tileable
+                        mapping[raw_inp] = next(function_inputs)
+                    else:
+                        # in tile, set_inputs from chunk
+                        mapping[raw_inp] = build_fetch_tileable(raw_inp)
+                else:
+                    mapping[raw_inp] = next(function_inputs)
+        self.function_args = replace_objects(self.function_args, mapping)
+        self.function_kwargs = replace_objects(self.function_kwargs, mapping)
+
+    def __call__(self):
+        find_inputs = partial(find_objects, types=ENTITY_TYPE)
+        inputs = find_inputs(self.function_args) + find_inputs(self.function_kwargs)
+        if self.n_output is None:
+            return self.new_tileable(inputs)
+        else:
+            return self.new_tileables(
+                inputs, kws=[dict(i=i) for i in range(self.n_output)]
+            )
+
+    @classmethod
+    def tile(cls, op):
+        outs = op.outputs
+        chunk_op = op.copy().reset_key()
+
+        chunk_inputs = []
+        pure_depends = []
+        executed = False
+        for inp in op.inputs:
+            if cls._no_prepare(inp):  # pragma: no cover
+                if not executed:
+                    # trigger execution only once
+                    executed = True
+                    yield
+                # if input is a tensor, DataFrame etc.,
+                # do not prepare data, because the data may be too huge,
+                # and users can choose to fetch a slice of the data themselves
+                pure_depends.extend([not op.resolve_tileable_input] * len(inp.chunks))
+            else:
+                pure_depends.extend([False] * len(inp.chunks))
+            chunk_inputs.extend(inp.chunks)
+        chunk_op._pure_depends = pure_depends
+        # record tileable op key for chunk op
+        chunk_op.tileable_op_key = op.key
+
+        out_chunks = [list() for _ in range(len(outs))]
+        chunk_kws = []
+        for i, (out, out_type) in enumerate(zip(outs, op.output_types)):
+            chunk_params = out.params.copy()
+            chunk_params["i"] = i
+            chunk_kws.append(chunk_params)
+            if out_type == OutputType.dataframe:
+                chunk_params["index"] = (0, 0)
+                chunk_params["shape"] = (np.nan, np.nan)
+            elif out_type == OutputType.series:
+                chunk_params["index"] = (0,)
+                chunk_params["shape"] = (np.nan,)
+            elif out_type == OutputType.df_or_series:
+                chunk_params["index"] = (0, 0)
+                chunk_params["shape"] = (np.nan, np.nan)
+                chunk_params["collapse_axis"] = 1
+            else:
+                chunk_params["index"] = ()
+                chunk_params["shape"] = ()
+        chunks = chunk_op.new_chunks(chunk_inputs, kws=chunk_kws)
+        for i, c in enumerate(chunks):
+            out_chunks[i].append(c)
+
+        kws = []
+        for i, (out, out_type) in enumerate(zip(outs, op.output_types)):
+            params = out.params.copy()
+            params["chunks"] = out_chunks[i]
+            if out_type == OutputType.dataframe:
+                params["nsplits"] = ((np.nan,), (np.nan,))
+            elif out_type == OutputType.series:
+                params["nsplits"] = ((np.nan,),)
+            else:
+                params["nsplits"] = ()
+            kws.append(params)
+        new_op = op.copy()
+        return new_op.new_tileables(op.inputs, kws=kws)
+
+    @classmethod
+    @redirect_custom_log
+    @enter_current_session
+    def execute(cls, ctx, op: "RemoteFunction"):
+        class MapperWrapper(UserDict):
+            def __getitem__(self, item):
+                if op.resolve_tileable_input and isinstance(item, Tileable):
+                    index_chunks = [(c.index, ctx[c.key]) for c in item.chunks]
+                    merged = merge_chunks(index_chunks)
+                    return merged_chunk_as_tileable_type(merged, item)
+                return super().__getitem__(item)
+
+        mapping = MapperWrapper(
+            {
+                inp: ctx[inp.key]
+                for inp, is_pure_dep in zip(op.inputs, op.pure_depends)
+                if not is_pure_dep
+            }
+        )
+
+        function = op.function
+        function_args = replace_objects(op.function_args, mapping)
+        function_kwargs = replace_objects(op.function_kwargs, mapping)
+
+        result = function(*function_args, **function_kwargs)
+
+        if op.n_output is None:
+            ctx[op.outputs[0].key] = result
+        else:
+            if not isinstance(result, Iterable):
+                raise TypeError(
+                    f"Specifying n_output={op.n_output}, "
+                    f"but result is not iterable, got {result}"
+                )
+            result = list(result)
+            if len(result) != op.n_output:
+                raise ValueError(
+                    f"Length of return value should be {op.n_output}, "
+                    f"got {len(result)}"
+                )
+            for out, r in zip(op.outputs, result):
+                ctx[out.key] = r
+
+
+def spawn(
+    func,
+    args=(),
+    kwargs=None,
+    retry_when_fail=False,
+    resolve_tileable_input=False,
+    n_output=None,
+    output_types=None,
+    **kw,
+):
+    """
+    Spawn a function and return a Mars Object which can be executed later.
+
+    Parameters
+    ----------
+    func : function
+        Function to spawn.
+    args: tuple
+        Args to pass to the function.
+    kwargs: dict
+        Kwargs to pass to the function.
+    retry_when_fail: bool, default False
+        If True, retry when the function fails.
+    resolve_tileable_input: bool, default False
+        If True, resolve tileable inputs as values.
+    n_output: int
+        Count of outputs of the function.
+    output_types: str or list, default "object"
+        Specify the type of returned objects.
+
+    Returns
+    -------
+    Object
+        Mars Object.
+
+    Examples
+    --------
+    >>> import mars.remote as mr
+    >>> def inc(x):
+    >>>     return x + 1
+    >>>
+    >>> result = mr.spawn(inc, args=(0,))
+    >>> result
+    Object 
+    >>> result.execute().fetch()
+    1
+
+    A list of spawned functions can be converted to :class:`mars.remote.ExecutableTuple`,
+    and `.execute()` can be called to run them together.
+
+    >>> results = [mr.spawn(inc, args=(i,)) for i in range(10)]
+    >>> mr.ExecutableTuple(results).execute().fetch()
+    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    Mars Objects returned by :meth:`mars.remote.spawn` can be passed
+    as arguments to other spawned functions.
+
+    >>> results = [mr.spawn(inc, args=(i,)) for i in range(10)]   # list of spawned functions
+    >>> def sum_all(xs):
+    >>>     return sum(xs)
+    >>> mr.spawn(sum_all, args=(results,)).execute().fetch()
+    55
+
+    Inside a spawned function, new functions can be spawned.
+
+    >>> def driver():
+    >>>     results = [mr.spawn(inc, args=(i,)) for i in range(10)]
+    >>>     return mr.ExecutableTuple(results).execute().fetch()
+    >>>
+    >>> mr.spawn(driver).execute().fetch()
+    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    Mars tensor, DataFrame and so forth are available in spawned functions as well.
+
+    >>> import mars.tensor as mt
+    >>> def driver2():
+    >>>     t = mt.random.rand(10, 10)
+    >>>     return t.sum().to_numpy()
+    >>>
+    >>> mr.spawn(driver2).execute().fetch()
+    52.47844223908132
+
+    The `n_output` argument indicates that the spawned function will return multiple outputs.
+    This is important when some of the outputs may be passed to different functions.
+
+    >>> def triage(alist):
+    >>>     ret = [], []
+    >>>     for i in alist:
+    >>>         if i < 0.5:
+    >>>             ret[0].append(i)
+    >>>         else:
+    >>>             ret[1].append(i)
+    >>>     return ret
+    >>>
+    >>> def sum_all(xs):
+    >>>     return sum(xs)
+    >>>
+    >>> l = [0.4, 0.7, 0.2, 0.8]
+    >>> la, lb = mr.spawn(triage, args=(l,), n_output=2)
+    >>>
+    >>> sa = mr.spawn(sum_all, args=(la,))
+    >>> sb = mr.spawn(sum_all, args=(lb,))
+    >>> mr.ExecutableTuple([sa, sb]).execute().fetch()
+    [0.6000000000000001, 1.5]
+    """
+    if not isinstance(args, tuple):
+        args = [args]
+    else:
+        args = list(args)
+    if kwargs is None:
+        kwargs = dict()
+    if not isinstance(output_types, (list, tuple)):
+        if output_types is None:
+            output_types = OutputType.object
+        elif isinstance(output_types, str):
+            output_types = getattr(OutputType, output_types)
+        output_types = [output_types] if n_output is None else [output_types] * n_output
+
+    if not isinstance(kwargs, dict):
+        raise TypeError("kwargs has to be a dict")
+
+    op = RemoteFunction(
+        function=func,
+        function_args=args,
+        function_kwargs=kwargs,
+        retry_when_fail=retry_when_fail,
+        resolve_tileable_input=resolve_tileable_input,
+        n_output=n_output,
+        output_types=output_types,
+        **kw,
+    )
+    if op.extra_params:
+        raise ValueError(f"Unexpected kw: {list(op.extra_params)[0]}")
+    return op()
diff --git a/python/xorbits/_mars/remote/operands.py b/python/xorbits/_mars/remote/operands.py
new file mode 100644
index 000000000..b01fc2588
--- /dev/null
+++ b/python/xorbits/_mars/remote/operands.py
@@ -0,0 +1,34 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..core import FuseChunk, FuseChunkData
+from ..core.operand import Fuse, FuseChunkMixin, ObjectOperandMixin
+
+
+class RemoteFuseChunkMixin(ObjectOperandMixin, FuseChunkMixin):
+    __slots__ = ()
+
+    def _create_chunk(self, output_idx, index, **kw):
+        data = FuseChunkData(_index=index, _op=self, **kw)
+
+        return FuseChunk(data)
+
+
+class RemoteFuseChunk(RemoteFuseChunkMixin, Fuse):
+    pass
+
+
+class RemoteOperandMixin(ObjectOperandMixin):
+    def get_fuse_op_cls(self, _):
+        return RemoteFuseChunk
diff --git a/python/xorbits/_mars/remote/run_script.py b/python/xorbits/_mars/remote/run_script.py
new file mode 100644
index 000000000..3284ff0a6
--- /dev/null
+++ b/python/xorbits/_mars/remote/run_script.py
@@ -0,0 +1,248 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+from typing import Any, BinaryIO, Dict, List, TextIO, Union
+
+import numpy as np
+
+from .. import opcodes
+from ..core import TILEABLE_TYPE, OutputType
+from ..core.context import Context
+from ..core.operand import MergeDictOperand
+from ..serialization.serializables import (
+    BoolField,
+    BytesField,
+    DictField,
+    Int32Field,
+    ListField,
+)
+from ..typing import SessionType, TileableType
+from ..utils import build_fetch_tileable, to_binary
+
+
+class RunScript(MergeDictOperand):
+    _op_type_ = opcodes.RUN_SCRIPT
+
+    _code: bytes = BytesField("code")
+    _data: Dict[str, TileableType] = DictField("data")
+    _retry_when_fail: bool = BoolField("retry_when_fail")
+    _command_args: List[str] = ListField("command_args")
+    _world_size: int = Int32Field("world_size")
+    _rank: int = Int32Field("rank")
+
+    def __init__(
+        self,
+        code=None,
+        data=None,
+        world_size=None,
+        rank=None,
+        retry_when_fail=None,
+        command_args=None,
+        **kw
+    ):
+        super().__init__(
+            _code=code,
+            _data=data,
+            _world_size=world_size,
+            _rank=rank,
+            _retry_when_fail=retry_when_fail,
+            _command_args=command_args,
+            **kw
+        )
+        if self.output_types is None:
+            self.output_types = [OutputType.object]
+
+    @property
+    def code(self):
+        return self._code
+
+    @property
+    def data(self):
+        return self._data
+
+    @property
+    def world_size(self):
+        return self._world_size
+
+    @property
+    def rank(self):
+        return self._rank
+
+    @property
+    def command_args(self):
+        return self._command_args or []
+
+    @property
+    def retryable(self):
+        return self._retry_when_fail
+
+    def __call__(self, inputs):
+        return self.new_tileable(inputs)
+
+    @classmethod
+    def _get_chunk_data(cls, op: "RunScript"):
+        new_data = None
+        input_chunks = []
+        inputs_iter = iter(op.inputs)
+        if op.data:
+            new_data = dict()
+            for k, v in op.data.items():
+                if isinstance(v, TILEABLE_TYPE):
+                    v = next(inputs_iter)
+                    new_data[k] = build_fetch_tileable(v)
+                    input_chunks.extend(v.chunks)
+                else:
+                    new_data[k] = v
+        return new_data, input_chunks
+
+    @classmethod
+    def tile(cls, op: "RunScript"):
+        if len(op.inputs) > 0:
+            # trigger inputs to execute
+            yield
+
+        new_data, input_chunks = cls._get_chunk_data(op)
+
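+        # one chunk per rank: every chunk executes the same script,
+        # with the RANK environment variable set to its index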
+        out_chunks = []
+        for i in range(op.world_size):
+            chunk_op = op.copy().reset_key()
+            chunk_op._data = new_data
+            chunk_op._rank = i
+            out_chunks.append(chunk_op.new_chunk(input_chunks, index=(i,)))
+
+        new_op = op.copy()
+        return new_op.new_tileables(
+            op.inputs,
+            chunks=out_chunks,
+            nsplits=(tuple(np.nan for _ in range(len(out_chunks))),),
+        )
+
+    @classmethod
+    def _build_envs(cls, ctx, op):
+        # set mars envs
+        envs = dict()
+        envs["RANK"] = str(op.rank)
+        envs["WORLD_SIZE"] = str(op.world_size)
+        return envs
+
+    @classmethod
+    def _build_locals(cls, ctx: Union[Context, dict], op: "RunScript"):
+        sess = ctx.get_current_session().as_default()
+        local = {"session": sess, "__name__": "__main__"}
+        if op.data is not None:
+            local.update(op.data)
+        return local
+
+    @classmethod
+    def execute(cls, ctx, op):
+        if op.merge:
+            return super().execute(ctx, op)
+
+        old_env = os.environ.copy()
+        envs = cls._build_envs(ctx, op)
+        old_argv = sys.argv.copy()
+
+        # since a new session will be created and set as default, the current default session
+        # needs to be restored after the execution of the script.
+        from ..deploy.oscar.session import get_default_session
+
+        old_default_session = get_default_session()
+
+        try:
+            os.environ.update(envs)
+            sys.argv = ["script"]
+            sys.argv.extend(op.command_args)
+
+            exec(op.code, cls._build_locals(ctx, op))
+
+            if op.rank == 0:
+                ctx[op.outputs[0].key] = {"status": "ok"}
+            else:
+                ctx[op.outputs[0].key] = {}
+        finally:
+            os.environ = old_env
+            sys.argv = old_argv
+            if old_default_session is not None:
+                old_default_session.as_default()
+            sys.stdout.flush()
+
+
+def _extract_inputs(data: Dict[str, TileableType] = None) -> List[TileableType]:
+    if data is not None and not isinstance(data, dict):
+        raise TypeError(
+            "`data` must be a dict whose key is variable name and value is data"
+        )
+
+    inputs = []
+    if data is not None:
+        for v in data.values():
+            if isinstance(v, TILEABLE_TYPE):
+                inputs.append(v)
+
+    return inputs
+
+
+def run_script(
+    script: Union[bytes, str, BinaryIO, TextIO],
+    data: Dict[str, TileableType] = None,
+    n_workers: int = 1,
+    command_argv: List[str] = None,
+    session: SessionType = None,
+    retry_when_fail: bool = False,
+    run_kwargs: Dict[str, Any] = None,
+):
+    """
+    Run script in Mars cluster.
+
+    Parameters
+    ----------
+    script: str or file-like object
+        Script to run.
+    data: dict
+        Variable name to data.
+    n_workers: int
+        Number of workers to run the script.
+    command_argv: list
+        Extra command args for the script.
+    session: Mars session
+        If not provided, the default session will be used.
+    retry_when_fail: bool, default False
+        If True, retry when the function fails.
+    run_kwargs: dict
+        Extra kwargs for session.run.
+
+    Returns
+    -------
+    Object
+        Mars Object.
+
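+
+    Examples
+    --------
+    An illustrative sketch mirroring the tests in this module
+    (assumes a default local session or cluster is available):
+
+    >>> from io import BytesIO
+    >>> import mars.remote as mr
+    >>> script = BytesIO(b"assert session is not None")
+    >>> mr.run_script(script, n_workers=2).fetch()["status"]
+    'ok'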
+    """
+
+    if hasattr(script, "read"):
+        code = script.read()
+    else:
+        with open(os.path.abspath(script), "rb") as f:
+            code = f.read()
+
+    inputs = _extract_inputs(data)
+    op = RunScript(
+        data=data,
+        code=to_binary(code),
+        world_size=n_workers,
+        retry_when_fail=retry_when_fail,
+        command_args=command_argv,
+    )
+    return op(inputs).execute(session=session, **(run_kwargs or {}))
diff --git a/python/xorbits/_mars/remote/tests/__init__.py b/python/xorbits/_mars/remote/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/remote/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/remote/tests/sample_script.py b/python/xorbits/_mars/remote/tests/sample_script.py
new file mode 100644
index 000000000..19e65a9e3
--- /dev/null
+++ b/python/xorbits/_mars/remote/tests/sample_script.py
@@ -0,0 +1,17 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+assert os.environ["WORLD_SIZE"] == "2"
diff --git a/python/xorbits/_mars/remote/tests/test_remote_function.py b/python/xorbits/_mars/remote/tests/test_remote_function.py
new file mode 100644
index 000000000..9abee0520
--- /dev/null
+++ b/python/xorbits/_mars/remote/tests/test_remote_function.py
@@ -0,0 +1,331 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from ... import dataframe as md
+from ... import get_context
+from ... import oscar as mo
+from ... import tensor as mt
+from ...core import tile
+from ...dataframe.core import DATAFRAME_OR_SERIES_TYPE, DATAFRAME_TYPE, SERIES_TYPE
+from ...deploy.oscar.session import get_default_session
+from ...learn.utils import shuffle
+from ...lib.mmh3 import hash as mmh3_hash
+from ...tensor.core import TENSOR_TYPE
+from .. import ExecutableTuple, spawn
+
+
+def test_params():
+    def f(x):
+        return x + 1
+
+    r = spawn(f, args=(1,))
+    c = tile(r).chunks[0]
+    assert isinstance(c.params, dict)
+    c.params = c.get_params_from_data(2)
+    assert isinstance(c.params, dict)
+
+    params = c.params
+    params.pop("index", None)
+    r.params = params
+    r.refresh_params()
+
+
+def test_remote_function(setup):
+    session = setup
+
+    def f1(x):
+        return x + 1
+
+    def f2(x, y, z=None):
+        return x * y * (z[0] + z[1])
+
+    rs = np.random.RandomState(0)
+    raw1 = rs.rand(10, 10)
+    raw2 = rs.rand(10, 10)
+
+    r1 = spawn(f1, raw1)
+    r2 = spawn(f1, raw2)
+    r3 = spawn(f2, (r1, r2), {"z": [r1, r2]})
+
+    result = r3.execute().fetch()
+    expected = (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1)
+    np.testing.assert_almost_equal(result, expected)
+
+    with pytest.raises(TypeError):
+        spawn(f2, (r1, r2), kwargs=())
+
+    with pytest.raises(ValueError, match="Unexpected kw: k"):
+        spawn(f2, (r1, r2), k=1)
+
+    session_id = session.session_id
+
+    def f():
+        assert get_default_session().session_id == session_id
+        return mt.ones((2, 3)).sum().to_numpy()
+
+    assert spawn(f).execute().fetch() == 6
+
+
+def test_specific_output_types(setup):
+    pd_df = pd.DataFrame(np.ones((10, 3)), columns=["a", "b", "c"])
+
+    def f1():
+        return pd_df
+
+    r = spawn(f1, output_types="dataframe").execute()
+
+    assert isinstance(r, DATAFRAME_TYPE)
+    assert r.index_value is not None
+    pd.testing.assert_frame_equal(r.fetch(), pd_df)
+    pd.testing.assert_index_equal(r.columns.to_pandas(), pd_df.columns)
+
+    pd_series = pd.Series(np.ones((10,)), name="a")
+
+    def f2():
+        return pd_series
+
+    r = spawn(f2, output_types="series").execute()
+
+    assert isinstance(r, SERIES_TYPE)
+    assert r.index_value is not None
+    assert r.name == pd_series.name
+    pd.testing.assert_series_equal(r.fetch(), pd_series)
+
+    def f3(v):
+        if v > 0:
+            return pd_series
+        else:
+            return pd_df
+
+    r = spawn(f3, args=(1,), output_types="df_or_series").execute()
+
+    assert isinstance(r, DATAFRAME_OR_SERIES_TYPE)
+    assert r.index_value is not None
+    assert r.name == pd_series.name
+    assert r.shape == pd_series.shape
+    assert getattr(r, "dtypes", None) is None
+    s = r.ensure_data()
+    assert isinstance(s, SERIES_TYPE)
+    pd.testing.assert_series_equal(s.fetch(), pd_series)
+
+    r = spawn(f3, args=(0,), output_types="df_or_series").execute()
+
+    assert isinstance(r, DATAFRAME_OR_SERIES_TYPE)
+    assert r.index_value is not None
+    pd.testing.assert_series_equal(r.dtypes, pd_df.dtypes)
+    assert r.shape == pd_df.shape
+    assert getattr(r, "dtype", None) is None
+    s = r.ensure_data()
+    assert isinstance(s, DATAFRAME_TYPE)
+    pd.testing.assert_frame_equal(s.fetch(), pd_df)
+
+    np_array = np.random.rand(10, 10)
+
+    def f2():
+        return np_array
+
+    r = spawn(f2, output_types="tensor").execute()
+
+    assert isinstance(r, TENSOR_TYPE)
+    assert r.dtype == np_array.dtype
+    np.testing.assert_array_equal(r.fetch(), np_array)
+
+
+def test_context(setup_cluster):
+    def get_workers():
+        ctx = get_context()
+        return ctx.get_worker_addresses()
+
+    def f1(worker: str):
+        ctx = get_context()
+        assert worker == ctx.worker_address
+        return np.random.rand(3, 3)
+
+    def f2(data_key: str, worker: str):
+        ctx = get_context()
+        assert worker == ctx.worker_address
+        meta = ctx.get_chunks_meta([data_key], fields=["bands"])[0]
+        assert len(meta) == 1
+        ctx.get_chunks_result([data_key], fetch_only=True)
+        # fetched, two workers have the data
+        meta = ctx.get_chunks_meta([data_key], fields=["bands"])[0]
+        assert len(meta["bands"]) == 2
+
+    workers = spawn(get_workers).execute().fetch()
+    assert len(workers) == len(set(workers)) > 1
+
+    r1 = spawn(f1, args=(workers[0],), expect_worker=workers[0]).execute()
+    data_key = r1._fetch_infos(fields=["data_key"])["data_key"][0]
+    r2 = spawn(f2, args=(data_key, workers[1]), expect_worker=workers[1])
+    r2.execute()
+
+    def get_bands():
+        ctx = get_context()
+        return [b for b in ctx.get_worker_bands() if b[1].startswith("numa-")]
+
+    def f3(band: tuple):
+        ctx = get_context()
+        assert band == ctx.band
+        return np.random.rand(3, 3)
+
+    def f4(data_key: str, band: tuple):
+        ctx = get_context()
+        assert band == ctx.band
+        meta = ctx.get_chunks_meta([data_key], fields=["bands"])[0]
+        assert len(meta) == 1
+        ctx.get_chunks_result([data_key], fetch_only=True)
+        # fetched, two bands have the data
+        meta = ctx.get_chunks_meta([data_key], fields=["bands"])[0]
+        assert len(meta["bands"]) == 2
+
+    bands = spawn(get_bands).execute().fetch()
+    assert len(bands) == len(set(bands)) > 1
+
+    r3 = spawn(f3, args=(bands[0],), expect_band=bands[0]).execute()
+    data_key = r3._fetch_infos(fields=["data_key"])["data_key"][0]
+    r4 = spawn(f4, args=(data_key, bands[1]), expect_band=bands[1])
+    r4.execute()
+
+
+def test_multi_output(setup):
+    sentences = ["word1 word2", "word2 word3", "word3 word2 word1"]
+
+    def mapper(s):
+        word_to_count = defaultdict(lambda: 0)
+        for word in s.split():
+            word_to_count[word] += 1
+
+        downsides = [defaultdict(lambda: 0), defaultdict(lambda: 0)]
+        for word, count in word_to_count.items():
+            downsides[mmh3_hash(word) % 2][word] += count
+
+        return downsides
+
+    def reducer(word_to_count_list):
+        d = defaultdict(lambda: 0)
+        for word_to_count in word_to_count_list:
+            for word, count in word_to_count.items():
+                d[word] += count
+
+        return dict(d)
+
+    outs = [], []
+    for sentence in sentences:
+        out1, out2 = spawn(mapper, sentence, n_output=2)
+        outs[0].append(out1)
+        outs[1].append(out2)
+
+    rs = []
+    for out in outs:
+        r = spawn(reducer, out)
+        rs.append(r)
+
+    result = dict()
+    for wc in ExecutableTuple(rs).to_object():
+        result.update(wc)
+
+    assert result == {"word1": 2, "word2": 3, "word3": 2}
+
+
+def test_chained_remote(setup):
+    def f(x):
+        return x + 1
+
+    def g(x):
+        return x * 2
+
+    s = spawn(g, spawn(f, 2))
+
+    result = s.execute().fetch()
+    assert result == 6
+
+
+def test_input_tileable(setup):
+    def f(t, x):
+        return (t * x).sum().to_numpy()
+
+    rs = np.random.RandomState(0)
+    raw = rs.rand(5, 4)
+
+    t1 = mt.tensor(raw, chunk_size=3)
+    t2 = t1.sum(axis=0)
+    s = spawn(f, args=(t2, 3))
+
+    result = s.execute().fetch()
+    expected = (raw.sum(axis=0) * 3).sum()
+    assert pytest.approx(result) == expected
+
+    df1 = md.DataFrame(raw, chunk_size=3)
+    df1.execute()
+    df2 = shuffle(df1)
+    df2.execute()
+
+    def f2(input_df):
+        bonus = input_df.iloc[:, 0].fetch().sum()
+        return input_df.sum().to_pandas() + bonus
+
+    for df in [df1, df2]:
+        s = spawn(f2, args=(df,))
+
+        result = s.execute().fetch()
+        expected = pd.DataFrame(raw).sum() + raw[:, 0].sum()
+        pd.testing.assert_series_equal(result, expected)
+
+
+def test_unknown_shape_inputs(setup):
+    def f(t, x):
+        assert all(not np.isnan(s) for s in t.shape)
+        return (t * x).sum().to_numpy(extra_config={"check_nsplits": False})
+
+    rs = np.random.RandomState(0)
+    raw = rs.rand(5, 4)
+
+    t1 = mt.tensor(raw, chunk_size=3)
+    t2 = t1[t1 > 0]
+    s = spawn(f, args=(t2, 3))
+
+    result = s.execute().fetch()
+    expected = (raw[raw > 0] * 3).sum()
+    assert pytest.approx(result) == expected
+
+
+def test_none_outputs(setup):
+    def f(*_args):
+        pass
+
+    r1 = spawn(f, args=(0,))
+    r2 = spawn(f, args=(r1, 1))
+    r3 = spawn(f, args=(r1, 2))
+    r4 = spawn(f, args=(r2, r3))
+
+    assert r4.execute().fetch() is None
+
+
+def test_remote_with_unpickable(setup_cluster):
+    def f(*_):
+        class Unpickleable:
+            def __reduce__(self):
+                raise ValueError
+
+        raise KeyError(Unpickleable())
+
+    with pytest.raises(mo.SendMessageFailed):
+        d = spawn(f, retry_when_fail=False)
+        d.execute()
diff --git a/python/xorbits/_mars/remote/tests/test_run_script.py b/python/xorbits/_mars/remote/tests/test_run_script.py
new file mode 100644
index 000000000..6473b07cc
--- /dev/null
+++ b/python/xorbits/_mars/remote/tests/test_run_script.py
@@ -0,0 +1,65 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from io import BytesIO
+
+import pytest
+
+from ... import dataframe as md
+from ... import tensor as mt
+from .. import run_script
+
+script1 = b"""
+import os
+assert os.environ['WORLD_SIZE'] == '2'
+"""
+
+script2 = b"""
+assert session is not None
+"""
+
+script3 = b"""
+from mars.core.operand import Fetch
+from mars.deploy.oscar.session import AbstractSession
+
+assert AbstractSession.default is not None
+assert isinstance(tensor.op, Fetch)
+assert len(tensor.chunks) > 0
+assert isinstance(tensor.chunks[0].op, Fetch)
+assert tensor.fetch().sum() == df.fetch()['s'].sum()
+"""
+
+
+def test_local_run_script(setup_cluster):
+    s = BytesIO(script1)
+    assert run_script(s, n_workers=2).fetch()["status"] == "ok"
+
+
+def test_local_run_script_with_exec(setup_cluster):
+    s = BytesIO(script2)
+    assert run_script(s, n_workers=2).fetch()["status"] == "ok"
+
+
+def test_local_run_script_with_data(setup_cluster):
+    s = BytesIO(script3)
+    data = {"tensor": mt.arange(10), "df": md.DataFrame({"s": mt.arange(9, 0, -1)})}
+    assert run_script(s, data=data, n_workers=1).fetch()["status"] == "ok"
+
+    pytest.raises(TypeError, run_script, s, data=[])
+
+
+def test_run_with_file(setup_cluster):
+    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "sample_script.py")
+    assert run_script(path, n_workers=2).fetch()["status"] == "ok"
diff --git a/python/xorbits/_mars/resource.py b/python/xorbits/_mars/resource.py
new file mode 100644
index 000000000..65eb56f6b
--- /dev/null
+++ b/python/xorbits/_mars/resource.py
@@ -0,0 +1,413 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import math
+import os
+import subprocess  # nosec
+import sys
+import time
+from collections import namedtuple
+from typing import List, Optional
+
+import psutil
+
+from ._resource import Resource, ZeroResource
+from .lib import nvutils
+from .utils import get_bool_environ
+
+Resource = Resource
+ZeroResource = ZeroResource
+
+logger = logging.getLogger(__name__)
+
+CGROUP_V1_CPU_ACCT_FILE = "/sys/fs/cgroup/cpuacct/cpuacct.usage"
+CGROUP_V1_MEM_STAT_FILE = "/sys/fs/cgroup/memory/memory.stat"
+CGROUP_V2_CPU_STAT_FILE = "/sys/fs/cgroup/cpu.stat"
+CGROUP_V2_MEM_CURRENT_FILE = "/sys/fs/cgroup/memory.current"
+CGROUP_V2_MEM_MAX_FILE = "/sys/fs/cgroup/memory.max"
+
+_is_cgroup_v2 = os.path.exists(CGROUP_V2_CPU_STAT_FILE)
+
+_proc = psutil.Process()
+_timer = getattr(time, "monotonic", time.time)
+
+_use_process_stat = get_bool_environ("MARS_USE_PROCESS_STAT")
+_use_cgroup_stat = get_bool_environ("MARS_USE_CGROUP_STAT")
+_cpu_use_process_stat = get_bool_environ("MARS_CPU_USE_PROCESS_STAT")
+_cpu_use_cgroup_stat = get_bool_environ("MARS_CPU_USE_CGROUP_STAT")
+_mem_use_process_stat = get_bool_environ("MARS_MEM_USE_PROCESS_STAT")
+_mem_use_cgroup_stat = get_bool_environ("MARS_MEM_USE_CGROUP_STAT")
+
+# if general config exists, overwrite individual ones
+if _use_process_stat is not None:
+    _cpu_use_process_stat = _mem_use_process_stat = _use_process_stat
+if _use_cgroup_stat is not None:
+    _cpu_use_cgroup_stat = _mem_use_cgroup_stat = _use_cgroup_stat
+
+if "MARS_CPU_TOTAL" in os.environ:
+    _cpu_total = int(math.ceil(float(os.environ["MARS_CPU_TOTAL"])))
+else:
+    _cpu_total = psutil.cpu_count(logical=True)
+
+if "MARS_MEMORY_TOTAL" in os.environ:
+    _mem_total = int(os.environ["MARS_MEMORY_TOTAL"])
+else:
+    _mem_total = None
+
+_virt_memory_stat = namedtuple("virtual_memory", "total available percent used free")
+
+_shm_path = [
+    pt.mountpoint
+    for pt in psutil.disk_partitions(all=True)
+    if pt.mountpoint in ("/tmp", "/dev/shm") and pt.fstype == "tmpfs"
+]
+if not _shm_path:
+    _shm_path = None
+else:
+    _shm_path = _shm_path[0]
+
+
+def _read_cgroup_stat_file(file_name: str):
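+    # cgroup stat files contain "<key> <value>" lines, e.g. "usage_usec 1234567"
+    # in cgroup v2 cpu.stat; lines that do not match this pattern are ignored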
+    with open(file_name, "r") as cg_file:
+        contents = cg_file.read()
+    kvs = dict()
+    for line in contents.splitlines():
+        parts = line.split(" ")
+        if len(parts) == 2:
+            kvs[parts[0]] = int(parts[1])
+    return kvs
+
+
+_root_pid = None
+
+
+def virtual_memory() -> _virt_memory_stat:
+    global _root_pid
+
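+    # three strategies, selected via the MARS_*_USE_*_STAT environment variables:
+    # cgroup accounting, whole-system psutil stats, or summing RSS over this process tree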
+    sys_mem = psutil.virtual_memory()
+    if _mem_use_cgroup_stat:
+        max_mem = min(_mem_total or sys_mem.total, sys_mem.total)
+        if _is_cgroup_v2:
+            # see Memory section in https://www.kernel.org/doc/Documentation/cgroup-v2.txt
+            with open(CGROUP_V2_MEM_MAX_FILE, "r") as mem_max_file:
+                max_str = mem_max_file.read().strip()
+                total = max_mem if max_str == "max" else int(max_str)
+            with open(CGROUP_V2_MEM_CURRENT_FILE, "r") as mem_current_file:
+                used = int(mem_current_file.read().strip())
+        else:
+            # see section 5.5 in https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
+            cgroup_mem_info = _read_cgroup_stat_file(CGROUP_V1_MEM_STAT_FILE)
+            total = cgroup_mem_info["hierarchical_memory_limit"]
+            total = min(max_mem, total)
+            used = cgroup_mem_info["rss"] + cgroup_mem_info.get("swap", 0)
+
+        if _shm_path:
+            shm_stats = psutil.disk_usage(_shm_path)
+            used += shm_stats.used
+        available = free = total - used
+        percent = 100.0 * (total - available) / total
+        return _virt_memory_stat(total, available, percent, used, free)
+    elif not _mem_use_process_stat:
+        total = min(_mem_total or sys_mem.total, sys_mem.total)
+        used = sys_mem.used + getattr(sys_mem, "shared", 0)
+        available = sys_mem.available
+        free = sys_mem.free
+        percent = 100.0 * (total - available) / total
+        return _virt_memory_stat(total, available, percent, used, free)
+    else:
+        used = 0
+        if _root_pid is None:
+            cur_proc = psutil.Process()
+            while True:
+                par_proc = cur_proc.parent()
+                if par_proc is None:
+                    break
+                try:
+                    cmd = par_proc.cmdline()
+                    if "python" not in " ".join(cmd).lower():
+                        break
+                    cur_proc = par_proc
+                except:  # noqa: E722  # nosec  # pylint: disable=bare-except  # pragma: no cover
+                    break
+            _root_pid = cur_proc.pid
+
+        root_proc = psutil.Process(_root_pid)
+        for p in root_proc.children(True):
+            try:
+                used += p.memory_info().rss
+            except (psutil.NoSuchProcess, psutil.AccessDenied):
+                pass
+
+        if _shm_path:
+            shm_stats = psutil.disk_usage(_shm_path)
+            used += shm_stats.used
+
+        total = min(_mem_total or sys_mem.total, sys_mem.total)
+        # TODO sys_mem.available does not work in container
+        # available = min(sys_mem.available, total - used)
+        available = total - used
+        free = min(sys_mem.free, total - used)
+        percent = 100.0 * (total - available) / total
+        return _virt_memory_stat(total, available, percent, used, free)
+
+
+def cpu_count():
+    return _cpu_total
+
+
+def mem_total():
+    return virtual_memory().total
+
+
+_last_cgroup_cpu_measure = None
+_last_proc_cpu_measure = None
+_last_psutil_measure = None
+_last_cpu_percent = None
+_cpu_percent_interval = 0.1
+
+
+def _take_process_cpu_snapshot():
+    pts = dict()
+    sts = dict()
+    for p in psutil.process_iter():
+        try:
+            pts[p.pid] = p.cpu_times()
+            sts[p.pid] = _timer()
+        except (psutil.NoSuchProcess, psutil.AccessDenied):
+            pass
+    return pts, sts
+
+
+def cpu_percent():
+    global _last_cgroup_cpu_measure, _last_proc_cpu_measure, _last_cpu_percent, _last_psutil_measure
+    if _cpu_use_cgroup_stat:
+        if _is_cgroup_v2:
+            # see CPU section in https://www.kernel.org/doc/Documentation/cgroup-v2.txt
+            cpu_content = _read_cgroup_stat_file(CGROUP_V2_CPU_STAT_FILE)
+            cpu_acct = cpu_content["usage_usec"] * 1000
+        else:
+            # see https://www.kernel.org/doc/Documentation/cgroup-v1/cpuacct.txt
+            with open(CGROUP_V1_CPU_ACCT_FILE, "r") as cgroup_file:
+                cpu_acct = int(cgroup_file.read())
+        sample_time = _timer()
+
+        if _last_cgroup_cpu_measure is None:
+            _last_cgroup_cpu_measure = (cpu_acct, sample_time)
+            return None
+
+        last_cpu_acct, last_sample_time = _last_cgroup_cpu_measure
+        time_delta = sample_time - last_sample_time
+        if time_delta < _cpu_percent_interval:
+            return _last_cpu_percent or 0
+
+        _last_cgroup_cpu_measure = (cpu_acct, sample_time)
+        # cpu_acct is in nanoseconds; converting ns/s to a percentage means dividing by 1e7.
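+        # e.g. 2e9 ns of CPU time over a 1.0 s window: 2e9 / 1.0 / 1e7 = 200.0,
+        # i.e. roughly two fully busy cores (illustrative numbers only)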
+        _last_cpu_percent = round(
+            (cpu_acct - last_cpu_acct) / (sample_time - last_sample_time) / 1e7, 1
+        )
+        return _last_cpu_percent or 0
+    elif _cpu_use_process_stat:
+        pts, sts = _take_process_cpu_snapshot()
+
+        if _last_proc_cpu_measure is None:
+            _last_proc_cpu_measure = (pts, sts)
+            return None
+
+        old_pts, old_sts = _last_proc_cpu_measure
+
+        percents = []
+        for pid in pts:
+            if pid not in old_pts:
+                continue
+            pt1 = old_pts[pid]
+            pt2 = pts[pid]
+            delta_proc = (pt2.user - pt1.user) + (pt2.system - pt1.system)
+            time_delta = sts[pid] - old_sts[pid]
+
+            if time_delta < _cpu_percent_interval:
+                return _last_cpu_percent or 0
+            percents.append((delta_proc / time_delta) * 100)
+        _last_proc_cpu_measure = (pts, sts)
+        _last_cpu_percent = round(sum(percents), 1)
+        return _last_cpu_percent or 0
+    else:
+        measure_time = time.time()
+        if (
+            _last_psutil_measure is not None
+            and measure_time - _last_psutil_measure < _cpu_percent_interval
+        ):
+            return _last_cpu_percent or 0
+        _last_psutil_measure = measure_time
+        _last_cpu_percent = psutil.cpu_percent() * _cpu_total
+        return _last_cpu_percent or 0
+
+
+def disk_usage(d):
+    return psutil.disk_usage(d)
+
+
+def iowait():
+    cpu_percent = psutil.cpu_times_percent()
+    try:
+        return cpu_percent.iowait
+    except AttributeError:
+        return None
+
+
+_last_disk_io_metas = dict()
+_path_to_device = dict()
+_win_diskperf_called = False
+
+
+def get_path_device(path: str):
+    for part in psutil.disk_partitions(all=True):
+        if path.startswith(part.mountpoint):
+            return part.device
+    return None
+
+
+def _get_path_device(path: str):
+    if path in _path_to_device:
+        return _path_to_device[path]
+
+    for part in psutil.disk_partitions(all=True):
+        if path.startswith(part.mountpoint):
+            dev_name = _path_to_device[path] = part.device.replace("/dev/", "")
+            return dev_name
+    _path_to_device[path] = None
+    return None
+
+
+_disk_io_usage_type = namedtuple("_disk_io_usage_type", "reads writes")
+
+
+def disk_io_usage(path=None) -> Optional[_disk_io_usage_type]:
+    global _win_diskperf_called
+
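+    # read/write speeds are byte deltas divided by the elapsed seconds since the previous
+    # call for the same path; the first call only primes the cache and returns None
+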
+    # Needed by psutil.disk_io_counters() under newer versions of Windows:
+    # `diskperf -y` needs to be called, or no disk information can be found.
+    if sys.platform == "win32" and not _win_diskperf_called:
+        CREATE_NO_WINDOW = 0x08000000
+        try:
+            proc = subprocess.Popen(
+                ["diskperf", "-y"], shell=False, creationflags=CREATE_NO_WINDOW
+            )  # nosec
+            proc.wait()
+        except (subprocess.CalledProcessError, OSError):  # pragma: no cover
+            pass
+        _win_diskperf_called = True
+
+    if path is None:
+        disk_counters = psutil.disk_io_counters()
+    else:
+        dev_to_counters = psutil.disk_io_counters(perdisk=True)
+        disk_counters = dev_to_counters.get(_get_path_device(path))
+        if disk_counters is None:
+            return None
+    tst = time.time()
+
+    read_bytes = disk_counters.read_bytes
+    write_bytes = disk_counters.write_bytes
+    if path not in _last_disk_io_metas:
+        _last_disk_io_metas[path] = (read_bytes, write_bytes, tst)
+        return None
+
+    last_read_bytes, last_write_bytes, last_time = _last_disk_io_metas[path]
+    delta_time = tst - last_time
+    if delta_time == 0:
+        return None
+
+    read_speed = (read_bytes - last_read_bytes) / delta_time
+    write_speed = (write_bytes - last_write_bytes) / delta_time
+
+    _last_disk_io_metas[path] = (read_bytes, write_bytes, tst)
+    return _disk_io_usage_type(read_speed, write_speed)
+
+
+_last_net_io_meta = None
+
+
+def net_io_usage():
+    global _last_net_io_meta
+
+    net_counters = psutil.net_io_counters()
+    tst = time.time()
+
+    send_bytes = net_counters.bytes_sent
+    recv_bytes = net_counters.bytes_recv
+    if _last_net_io_meta is None:
+        _last_net_io_meta = (send_bytes, recv_bytes, tst)
+        return None
+
+    last_send_bytes, last_recv_bytes, last_time = _last_net_io_meta
+    delta_time = tst - last_time
+    if delta_time == 0:
+        return None
+
+    recv_speed = (recv_bytes - last_recv_bytes) / delta_time
+    send_speed = (send_bytes - last_send_bytes) / delta_time
+
+    _last_net_io_meta = (send_bytes, recv_bytes, tst)
+    return recv_speed, send_speed
+
+
+_cuda_info = namedtuple("cuda_info", "driver_version cuda_version products gpu_count")
+_cuda_card_stat = namedtuple(
+    "cuda_card_stat", "index product_name gpu_usage temperature fb_mem_info"
+)
+
+
+def cuda_info():  # pragma: no cover
+    driver_info = nvutils.get_driver_info()
+    if not driver_info:
+        return
+    gpu_count = nvutils.get_device_count()
+    return _cuda_info(
+        driver_version=driver_info.driver_version,
+        cuda_version=driver_info.cuda_version,
+        products=[nvutils.get_device_info(idx).name for idx in range(gpu_count)],
+        gpu_count=gpu_count,
+    )
+
+
+def cuda_count():
+    return nvutils.get_device_count() or 0
+
+
+def cuda_card_stats() -> List[_cuda_card_stat]:  # pragma: no cover
+    infos = []
+    device_count = nvutils.get_device_count()
+    if not device_count:
+        return infos
+    for device_idx in range(device_count):
+        device_info = nvutils.get_device_info(device_idx)
+        device_status = nvutils.get_device_status(device_idx)
+
+        infos.append(
+            _cuda_card_stat(
+                index=device_info.index,
+                product_name=device_info.name,
+                gpu_usage=device_status.gpu_util,
+                temperature=device_status.temperature,
+                fb_mem_info=_virt_memory_stat(
+                    total=device_status.fb_total_mem,
+                    used=device_status.fb_used_mem,
+                    free=device_status.fb_free_mem,
+                    available=device_status.fb_free_mem,
+                    percent=device_status.mem_util,
+                ),
+            )
+        )
+    return infos
diff --git a/python/xorbits/_mars/serialization/__init__.py b/python/xorbits/_mars/serialization/__init__.py
new file mode 100644
index 000000000..9dd84d48d
--- /dev/null
+++ b/python/xorbits/_mars/serialization/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import arrow, cuda, exception, mars_objects, numpy, ray, scipy
+from .aio import AioDeserializer, AioSerializer
+from .core import Serializer, deserialize, serialize, serialize_with_spawn
+
+del arrow, cuda, numpy, scipy, mars_objects, ray, exception
diff --git a/python/xorbits/_mars/serialization/aio.py b/python/xorbits/_mars/serialization/aio.py
new file mode 100644
index 000000000..4030da1ee
--- /dev/null
+++ b/python/xorbits/_mars/serialization/aio.py
@@ -0,0 +1,140 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import struct
+from io import BytesIO
+from typing import Any, BinaryIO, Union
+
+import cloudpickle
+import numpy as np
+
+from ..utils import lazy_import
+from .core import deserialize, serialize_with_spawn
+
+rmm = lazy_import("rmm")
+
+DEFAULT_SERIALIZATION_VERSION = 1
+DEFAULT_SPAWN_THRESHOLD = 100
+BUFFER_SIZES_NAME = "buf_sizes"
+
+
+class AioSerializer:
+    def __init__(self, obj: Any, compress=0):
+        self._obj = obj
+        self._compress = compress
+
+    async def _get_buffers(self):
+        headers, buffers = await serialize_with_spawn(
+            self._obj, spawn_threshold=DEFAULT_SPAWN_THRESHOLD
+        )
+
+        def _is_cuda_buffer(buf: Union["rmm.DeviceBuffer", BinaryIO]):
+            return hasattr(buf, "__cuda_array_interface__")
+
+        is_cuda_buffers = [_is_cuda_buffer(buf) for buf in buffers]
+        headers[0]["is_cuda_buffers"] = np.array(is_cuda_buffers)
+
+        # add buffer lengths into headers
+        headers[0][BUFFER_SIZES_NAME] = [
+            buf.nbytes if hasattr(buf, "nbytes") else len(buf) for buf in buffers
+        ]
+        header = cloudpickle.dumps(headers)
+
+        # gen header buffer
+        header_bio = BytesIO()
+        # write version first
+        header_bio.write(struct.pack("B", DEFAULT_SERIALIZATION_VERSION))
+        # write header length
+        header_bio.write(struct.pack("<Q", len(header)))
diff --git a/python/xorbits/_mars/serialization/core.pyi b/python/xorbits/_mars/serialization/core.pyi
new file mode 100644
--- /dev/null
+++ b/python/xorbits/_mars/serialization/core.pyi
+from concurrent.futures import Executor
+from typing import Any, Callable, Dict, List, Tuple
+
+def buffered(func: Callable) -> Callable: ...
+def fast_id(obj: Any) -> int: ...
+
+class Serializer:
+    serializer_id: int
+    def serial(self, obj: Any, context: Dict): ...
+    def deserial(self, serialized: Tuple, context: Dict, subs: List[Any]): ...
+    def on_deserial_error(
+        self,
+        serialized: Tuple,
+        context: Dict,
+        subs_serialized: List,
+        error_index: int,
+        exc: BaseException,
+    ): ...
+    @classmethod
+    def register(cls, obj_type): ...
+    @classmethod
+    def unregister(cls, obj_type): ...
+
+class Placeholder:
+    id: int
+    callbacks: List[Callable]
+    def __init__(self, id_: int): ...
+    def __hash__(self): ...
+    def __eq__(self, other): ...
+
+def serialize(obj: Any, context: Dict = None): ...
+async def serialize_with_spawn(
+    obj: Any,
+    context: Dict = None,
+    spawn_threshold: int = 100,
+    executor: Executor = None,
+): ...
+def deserialize(headers: List, buffers: List, context: Dict = None): ...
diff --git a/python/xorbits/_mars/serialization/core.pyx b/python/xorbits/_mars/serialization/core.pyx
new file mode 100644
index 000000000..e11656b4e
--- /dev/null
+++ b/python/xorbits/_mars/serialization/core.pyx
@@ -0,0 +1,934 @@
+# distutils: language = c++
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import datetime
+import enum
+import hashlib
+import inspect
+import sys
+from functools import partial, wraps
+from typing import Any, Dict, List
+
+import numpy as np
+import pandas as pd
+
+from cpython cimport PyObject
+from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t, uintptr_t
+from libcpp.unordered_map cimport unordered_map
+
+from .._utils import NamedType
+
+from .._utils cimport TypeDispatcher
+
+import cloudpickle
+
+if sys.version_info[:2] < (3, 8):  # pragma: no cover
+    try:
+        import pickle5 as pickle  # nosec  # pylint: disable=import_pickle
+    except ImportError:
+        import pickle  # nosec  # pylint: disable=import_pickle
+else:
+    import pickle  # nosec  # pylint: disable=import_pickle
+
+# resolve pandas pickle compatibility between <1.2 and >=1.3
+try:
+    from pandas.core.internals import blocks as pd_blocks
+    if not hasattr(pd_blocks, "new_block") and hasattr(pd_blocks, "make_block"):
+        # register the missing function to avoid unpickling errors
+        pd_blocks.new_block = pd_blocks.make_block
+except (ImportError, AttributeError):
+    pass
+
+BUFFER_PICKLE_PROTOCOL = max(pickle.DEFAULT_PROTOCOL, 5)
+cdef bint HAS_PICKLE_BUFFER = pickle.HIGHEST_PROTOCOL >= 5
+cdef bint _PANDAS_HAS_MGR = hasattr(pd.Series([0]), "_mgr")
+
+
+cdef TypeDispatcher _serial_dispatcher = TypeDispatcher()
+cdef dict _deserializers = dict()
+
+cdef uint32_t _MAX_STR_PRIMITIVE_LEN = 1024
+# prime modulus for serializer ids
+# use the largest prime number smaller than 32767
+cdef int32_t _SERIALIZER_ID_PRIME = 32749
+
+
+cdef class Serializer:
+    serializer_id = None
+
+    def __cinit__(self):
+        # make the value referenceable from C code
+        self._serializer_id = self.serializer_id
+
+    cpdef serial(self, object obj, dict context):
+        """
+        Returns the intermediate serialization result of a certain object.
+        The returned value can be a Placeholder or a tuple comprising
+        three parts: a header, a group of subcomponents and
+        a finalizing flag.
+
+        * Header is a pickle-serializable tuple
+        * Subcomponents are parts or buffers for iterative
+          serialization.
+        * Flag is a boolean value. If true, subcomponents should be
+          buffers (for instance, bytes, memory views, GPU buffers,
+          etc.) that can be read and written directly. If false,
+          subcomponents will be serialized iteratively.
+
+        Parameters
+        ----------
+        obj: Any
+            Object to serialize
+        context: Dict
+            Serialization context that helps create Placeholder objects
+            to reduce duplicated serialization
+
+        Returns
+        -------
+        result: Placeholder | Tuple[Tuple, List, bool]
+            Intermediate result of serialization
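+
+        As a rough illustration (not the output of any specific built-in
+        serializer), a serializer for a two-element list ``[a, b]`` could
+        return ``((), [a, b], False)``: an empty header, the two elements
+        as subcomponents to be serialized iteratively, and a False flag.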
+        """
+        raise NotImplementedError
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        """
+        Returns deserialized object given serialized headers and
+        deserialized subcomponents.
+
+        Parameters
+        ----------
+        serialized: Tuple
+            Serialized object header as a tuple
+        context
+            Serialization context for instantiation of Placeholder
+            objects
+        subs: List
+            Deserialized subcomponents
+
+        Returns
+        -------
+        result: Any
+            Deserialized object
+        """
+        raise NotImplementedError
+
+    cpdef on_deserial_error(
+        self,
+        tuple serialized,
+        dict context,
+        list subs_serialized,
+        int error_index,
+        object exc,
+    ):
+        """
+        Returns rewritten exception when subcomponent deserialization fails
+
+        Parameters
+        ----------
+        serialized: Tuple
+            Serialized object header as a tuple
+        context
+            Serialization context for instantiation of Placeholder
+            objects
+        subs_serialized: List
+            Serialized subcomponents
+        error_index: int
+            Index of subcomponent causing error
+        exc: BaseException
+            Exception raised
+
+        Returns
+        -------
+        exc: BaseException | None
+            Rewritten exception. If None, original exception is kept.
+        """
+        return None
+
+    @classmethod
+    def calc_default_serializer_id(cls):
+        s = f"{cls.__module__}.{cls.__qualname__}"
+        h = hashlib.md5(s.encode())
+        return int(h.hexdigest(), 16) % _SERIALIZER_ID_PRIME
+
+    @classmethod
+    def register(cls, obj_type, name=None):
+        if (
+            cls.serializer_id is None
+            or cls.serializer_id == getattr(super(cls, cls), "serializer_id", None)
+        ):
+            # a class should have its own serializer_id;
+            # an inherited serializer_id is not acceptable
+            cls.serializer_id = cls.calc_default_serializer_id()
+
+        inst = cls()
+        if name is not None:
+            obj_type = NamedType(name, obj_type)
+        _serial_dispatcher.register(obj_type, inst)
+        if _deserializers.get(cls.serializer_id) is not None:
+            assert type(_deserializers[cls.serializer_id]) is cls
+        else:
+            _deserializers[cls.serializer_id] = inst
+
+    @classmethod
+    def unregister(cls, obj_type, name=None):
+        if name is not None:
+            obj_type = NamedType(name, obj_type)
+        _serial_dispatcher.unregister(obj_type)
+        _deserializers.pop(cls.serializer_id, None)
+
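For readers skimming the contract above, a minimal custom serializer might look like the sketch below (illustrative only: the Point type and its id are made up, and the import path assumes the compiled module is reachable as xorbits._mars.serialization.core):

    from xorbits._mars.serialization.core import Serializer  # path assumed

    class Point:
        def __init__(self, x, y):
            self.x, self.y = x, y

    class PointSerializer(Serializer):
        # any unique id works; leaving it unset makes register() derive one
        # from calc_default_serializer_id()
        serializer_id = 32001

        def serial(self, obj, context):
            # everything fits into the picklable header, no raw buffers: final=True
            return (obj.x, obj.y), [], True

        def deserial(self, serialized, context, subs):
            return Point(*serialized)

    PointSerializer.register(Point)

Once registered, the serialize()/deserialize() helpers defined later in this module route Point instances through PointSerializer.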
+
+cdef inline uint64_t _fast_id(object obj) nogil:
+    return <uintptr_t><PyObject*>obj
+
+
+def fast_id(obj):
+    """C version of id() used for serialization"""
+    return _fast_id(obj)
+
+
+def buffered(func):
+    """
+    Wrapper for serial() method to reduce duplicated serialization
+    """
+    @wraps(func)
+    def wrapped(self, obj: Any, dict context):
+        cdef uint64_t obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(_fast_id(obj))
+        else:
+            context[obj_id] = obj
+            return func(self, obj, context)
+
+    return wrapped
+
+
+def pickle_buffers(obj):
+    cdef list buffers = [None]
+
+    if HAS_PICKLE_BUFFER:
+
+        def buffer_cb(x):
+            x = x.raw()
+            if x.ndim > 1:
+                # ravel n-d memoryview
+                x = x.cast(x.format)
+            buffers.append(memoryview(x))
+
+        buffers[0] = cloudpickle.dumps(
+            obj,
+            buffer_callback=buffer_cb,
+            protocol=BUFFER_PICKLE_PROTOCOL,
+        )
+    else:  # pragma: no cover
+        buffers[0] = cloudpickle.dumps(obj)
+    return buffers
+
+
+def unpickle_buffers(buffers):
+    result = cloudpickle.loads(buffers[0], buffers=buffers[1:])
+
+    # pandas prior to 1.1.0 uses _data instead of _mgr to hold the BlockManager,
+    # so deserializing objects pickled by newer versions may produce malfunctioning
+    # pandas objects; thus this patch is needed
+    if _PANDAS_HAS_MGR:
+        return result
+    else:  # pragma: no cover
+        if hasattr(result, "_mgr") and isinstance(result, (pd.DataFrame, pd.Series)):
+            result._data = getattr(result, "_mgr")
+            delattr(result, "_mgr")
+        return result
+
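A small sketch of what these helpers produce for a NumPy array (illustrative; the import path is an assumption and out-of-band buffers depend on pickle protocol 5 support in the running NumPy):

    import numpy as np

    from xorbits._mars.serialization.core import pickle_buffers, unpickle_buffers

    arr = np.arange(8, dtype="int64")
    bufs = pickle_buffers(arr)  # [pickled metadata, raw data buffer(s)]
    if len(bufs) > 1:
        # the payload stays out-of-band instead of being copied into the pickle
        assert bytes(bufs[1]) == arr.tobytes()
    assert (unpickle_buffers(bufs) == arr).all()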
+
+cdef class PickleSerializer(Serializer):
+    serializer_id = 0
+
+    cpdef serial(self, obj: Any, dict context):
+        cdef uint64_t obj_id
+        obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(obj_id)
+        context[obj_id] = obj
+
+        return (), pickle_buffers(obj), True
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        return unpickle_buffers(subs)
+
+
+cdef set _primitive_types = {
+    type(None),
+    bool,
+    int,
+    float,
+    complex,
+    datetime.datetime,
+    datetime.date,
+    datetime.timedelta,
+    enum.Enum,
+    type(max),  # builtin functions
+    np.dtype,
+    np.number,
+}
+
+
+class PrimitiveSerializer(Serializer):
+    serializer_id = 1
+
+    @buffered
+    def serial(self, obj: Any, context: Dict):
+        return (obj,), [], True
+
+    def deserial(self, tuple obj, context: Dict, subs: List[Any]):
+        return obj[0]
+
+
+cdef class BytesSerializer(Serializer):
+    serializer_id = 2
+
+    cpdef serial(self, obj: Any, dict context):
+        cdef uint64_t obj_id
+        obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(obj_id)
+        context[obj_id] = obj
+
+        return (), [obj], True
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        return subs[0]
+
+
+cdef class StrSerializer(Serializer):
+    serializer_id = 3
+
+    cpdef serial(self, obj: Any, dict context):
+        cdef uint64_t obj_id
+        obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(obj_id)
+        context[obj_id] = obj
+
+        return (), [(<str>obj).encode()], True
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        buffer = subs[0]
+        if type(buffer) is memoryview:
+            buffer = buffer.tobytes()
+        return buffer.decode()
+
+
+cdef class CollectionSerializer(Serializer):
+    obj_type = None
+
+    cdef object _obj_type
+
+    def __cinit__(self):
+        # make the value referenceable from C code
+        self._obj_type = self.obj_type
+
+    cdef tuple _serial_iterable(self, obj: Any):
+        cdef list idx_to_propagate = []
+        cdef list obj_to_propagate = []
+        cdef list obj_list = obj if type(obj) is list else list(obj)
+        cdef int64_t idx
+        cdef object item
+
+        for idx in range(len(obj_list)):
+            item = obj_list[idx]
+
+            if type(item) is bytes and len(item) < _MAX_STR_PRIMITIVE_LEN:
+                # treat short bytes objects as primitives
+                continue
+            elif type(item) is str and len(item) < _MAX_STR_PRIMITIVE_LEN:
+                # treat short strings as primitives
+                continue
+            elif type(item) in _primitive_types:
+                continue
+
+            if obj is obj_list:
+                obj_list = list(obj)
+
+            obj_list[idx] = None
+            idx_to_propagate.append(idx)
+            obj_to_propagate.append(item)
+
+        if self._obj_type is not None and type(obj) is not self._obj_type:
+            obj_type = type(obj)
+        else:
+            obj_type = None
+        return (obj_list, idx_to_propagate, obj_type), obj_to_propagate, False
+
+    cpdef serial(self, obj: Any, dict context):
+        cdef uint64_t obj_id
+        obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(obj_id)
+        context[obj_id] = obj
+
+        return self._serial_iterable(obj)
+
+    cdef list _deserial_iterable(self, tuple serialized, list subs):
+        cdef list res_list, idx_to_propagate
+        cdef int64_t i
+
+        res_list, idx_to_propagate, _ = serialized
+
+        for i in range(len(idx_to_propagate)):
+            res_list[idx_to_propagate[i]] = subs[i]
+        return res_list
+
+
+cdef class TupleSerializer(CollectionSerializer):
+    serializer_id = 4
+    obj_type = tuple
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        cdef list res = self._deserial_iterable(serialized, subs)
+        for v in res:
+            assert type(v) is not Placeholder
+
+        obj_type = serialized[-1] or tuple
+        if hasattr(obj_type, "_fields"):
+            # namedtuple
+            return obj_type(*res)
+        else:
+            return obj_type(res)
+
+
+cdef class ListSerializer(CollectionSerializer):
+    serializer_id = 5
+    obj_type = list
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        cdef int64_t idx
+        cdef list res = self._deserial_iterable(serialized, subs)
+
+        obj_type = serialized[-1]
+        if obj_type is None:
+            result = res
+        else:
+            result = obj_type(res)
+
+        for idx, v in enumerate(res):
+            if type(v) is Placeholder:
+                cb = partial(result.__setitem__, idx)
+                (<Placeholder>v).callbacks.append(cb)
+        return result
+
+
+def _dict_key_replacer(ret, key, real_key):
+    ret[real_key] = ret.pop(key)
+
+
+def _dict_value_replacer(context, ret, key, real_value):
+    if type(key) is Placeholder:
+        key = context[(<Placeholder>key).id]
+    ret[key] = real_value
+
+
+cdef class DictSerializer(CollectionSerializer):
+    serializer_id = 6
+    cdef set _inspected_inherits
+
+    def __cinit__(self):
+        self._inspected_inherits = set()
+
+    cpdef serial(self, obj: Any, dict context):
+        cdef uint64_t obj_id
+        cdef tuple key_obj, value_obj
+        cdef list key_bufs, value_bufs
+
+        if type(obj) is dict and len(obj) == 0:
+            return (), [], True
+
+        obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(obj_id)
+        context[obj_id] = obj
+
+        obj_type = type(obj)
+
+        if obj_type is not dict and obj_type not in self._inspected_inherits:
+            inspect_init = inspect.getfullargspec(obj_type.__init__)
+            if (
+                inspect_init.args == ["self"]
+                and not inspect_init.varargs
+                and not inspect_init.varkw
+            ):
+                # inherited dicts may not have proper initializers
+                # for deserialization;
+                # remove the context entry to generate a real serialized result
+                context.pop(obj_id)
+                return (obj,), [], True
+            else:
+                self._inspected_inherits.add(obj_type)
+
+        key_obj, key_bufs, _ = self._serial_iterable(obj.keys())
+        value_obj, value_bufs, _ = self._serial_iterable(obj.values())
+        if obj_type is dict:
+            obj_type = None
+        ser_obj = (key_obj[:-1], value_obj[:-1], len(key_bufs), obj_type)
+        return ser_obj, key_bufs + value_bufs, False
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        cdef int64_t i, num_key_bufs
+        cdef list key_subs, value_subs, keys, values
+
+        if not serialized:
+            return {}
+        if len(serialized) == 1:
+            # serialized directly
+            return serialized[0]
+
+        key_serialized, value_serialized, num_key_bufs, obj_type = serialized
+        key_subs = subs[:num_key_bufs]
+        value_subs = subs[num_key_bufs:]
+
+        keys = self._deserial_iterable(key_serialized + (None,), key_subs)
+        values = self._deserial_iterable(value_serialized + (None,), value_subs)
+
+        if obj_type is None:
+            ret = dict(zip(keys, values))
+        else:
+            try:
+                ret = obj_type(zip(keys, values))
+            except TypeError:
+                # first arg of defaultdict is a callable
+                ret = obj_type()
+                ret.update(zip(keys, values))
+
+        for i in range(len(keys)):
+            k, v = keys[i], values[i]
+            if type(k) is Placeholder:
+                (<Placeholder>k).callbacks.append(
+                    partial(_dict_key_replacer, ret, k)
+                )
+            if type(v) is Placeholder:
+                (<Placeholder>v).callbacks.append(
+                    partial(_dict_value_replacer, context, ret, k)
+                )
+        return ret
+
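A short sketch of dict round-tripping, including a Python-level dict subclass whose type is preserved (illustrative; it assumes the package re-exports serialize and deserialize from this module, as upstream Mars does):

    from xorbits._mars.serialization import deserialize, serialize

    class AttrDict(dict):
        # toy subclass used only for this illustration
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

    header, buffers = serialize({"a": 1, "b": [2, 3]})
    assert deserialize(header, buffers) == {"a": 1, "b": [2, 3]}

    restored = deserialize(*serialize(AttrDict(a=1)))
    assert type(restored) is AttrDict and restored == {"a": 1}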
+
+cdef class Placeholder:
+    """
+    Placeholder object to reduce duplicated serialization
+
+    The object records the original object's identifier and keeps callbacks
+    to replace itself in parent objects.
+    """
+    cdef public uint64_t id
+    cdef public list callbacks
+
+    def __init__(self, uint64_t id_):
+        self.id = id_
+        self.callbacks = []
+
+    def __hash__(self):
+        return self.id
+
+    def __eq__(self, other):  # pragma: no cover
+        if type(other) is not Placeholder:
+            return False
+        return self.id == other.id
+
+    def __repr__(self):
+        return (
+            f"Placeholder(id={self.id}, "
+            f"callbacks=[list of {len(self.callbacks)}])"
+        )
+
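The effect of Placeholder on duplicated references can be seen in a sketch like the following (illustrative; serialize and deserialize are the module-level helpers defined further down, and the import path is assumed):

    import numpy as np

    from xorbits._mars.serialization import deserialize, serialize

    arr = np.arange(4)
    header, buffers = serialize([arr, arr, {"again": arr}])
    restored = deserialize(header, buffers)
    # the array is serialized once; the other occurrences become Placeholders
    # that are resolved back to the same deserialized object
    assert restored[0] is restored[1] is restored[2]["again"]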
+
+cdef class PlaceholderSerializer(Serializer):
+    serializer_id = 7
+
+    cpdef serial(self, obj: Any, dict context):
+        return (), [], True
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        return Placeholder(0)
+
+
+PickleSerializer.register(object)
+for _primitive in _primitive_types:
+    PrimitiveSerializer.register(_primitive)
+BytesSerializer.register(bytes)
+BytesSerializer.register(memoryview)
+StrSerializer.register(str)
+ListSerializer.register(list)
+TupleSerializer.register(tuple)
+DictSerializer.register(dict)
+PlaceholderSerializer.register(Placeholder)
+
+
+cdef class _SerialStackItem:
+    cdef public tuple serialized
+    cdef public list subs
+    cdef public list subs_serialized
+
+    def __cinit__(self, tuple serialized, list subs):
+        self.serialized = serialized
+        self.subs = subs
+        self.subs_serialized = []
+
+
+cdef class _IdContextHolder:
+    cdef unordered_map[uint64_t, uint64_t] d
+
+
+cdef int _COMMON_HEADER_LEN = 4
+
+
+cdef tuple _serial_single(
+    obj, dict context, _IdContextHolder id_context_holder
+):
+    """Serialize single object and return serialized tuples"""
+    cdef uint64_t obj_id, ordered_id
+    cdef Serializer serializer
+    cdef tuple common_header, serialized
+
+    while True:
+        name = context.get("serializer")
+        obj_type = type(obj) if name is None else NamedType(name, type(obj))
+        serializer = _serial_dispatcher.get_handler(obj_type)
+        ret_serial = serializer.serial(obj, context)
+        if type(ret_serial) is tuple:
+            # object is serialized, form a common header and return
+            serialized, subs, final = ret_serial
+
+            if type(obj) is Placeholder:
+                obj_id = (<Placeholder>obj).id
+                ordered_id = id_context_holder.d[obj_id]
+            else:
+                obj_id = _fast_id(obj)
+                ordered_id = id_context_holder.d.size()
+                id_context_holder.d[obj_id] = ordered_id
+
+            # REMEMBER to change _COMMON_HEADER_LEN when the content of
+            # this header changes
+            common_header = (
+                serializer._serializer_id, ordered_id, len(subs), final
+            )
+            break
+        else:
+            # object is converted into another (usually a Placeholder)
+            obj = ret_serial
+    return common_header + serialized, subs, final
+
+
+class _SerializeObjectOverflow(Exception):
+    def __init__(self, tuple cur_serialized, int num_total_serialized):
+        super(_SerializeObjectOverflow, self).__init__(cur_serialized)
+        self.cur_serialized = cur_serialized
+        self.num_total_serialized = num_total_serialized
+
+
+cpdef object _serialize_with_stack(
+    list serial_stack,
+    tuple serialized,
+    dict context,
+    _IdContextHolder id_context_holder,
+    list result_bufs_list,
+    int64_t num_overflow = 0,
+    int64_t num_total_serialized = 0,
+):
+    cdef _SerialStackItem stack_item
+    cdef list subs
+    cdef bint final
+    cdef int64_t num_sub_serialized
+    cdef bint is_resume = num_total_serialized > 0
+
+    while serial_stack:
+        stack_item = serial_stack[-1]
+        if serialized is not None:
+            # have previously-serialized results, record first
+            stack_item.subs_serialized.append(serialized)
+
+        num_sub_serialized = len(stack_item.subs_serialized)
+        if len(stack_item.subs) == num_sub_serialized:
+            # all subcomponents serialized, serialization of current is done
+            # and we can move to the parent object
+            serialized = stack_item.serialized + tuple(stack_item.subs_serialized)
+            num_total_serialized += 1
+            serial_stack.pop()
+        else:
+            # serialize next subcomponent at stack top
+            serialized, subs, final = _serial_single(
+                stack_item.subs[num_sub_serialized], context, id_context_holder
+            )
+            num_total_serialized += 1
+            if final or not subs:
+                # the subcomponent is a leaf
+                if subs:
+                    result_bufs_list.extend(subs)
+            else:
+                # the subcomponent has its own subcomponents, so we push it
+                # onto the stack and process its children
+                stack_item = _SerialStackItem(serialized, subs)
+                serial_stack.append(stack_item)
+                # note that the serialized header should not be recorded
+                # as we are now processing the subcomponent itself
+                serialized = None
+        if 0 < num_overflow < num_total_serialized:
+            raise _SerializeObjectOverflow(serialized, num_total_serialized)
+
+    # we keep an empty dict for extra metadata required by other modules
+    if is_resume:
+        # record the number of serialized objects when resumed
+        extra_meta = {"_N": num_total_serialized}
+    else:
+        # otherwise do not record the number, to reduce the result size
+        extra_meta = {}
+    return (extra_meta, serialized), result_bufs_list
+
+
+def serialize(obj, dict context = None):
+    """
+    Serialize an object and return a header and buffers.
+    Buffers are intended for zero-copy data manipulation.
+
+    Parameters
+    ----------
+    obj: Any
+        Object to serialize
+    context:
+        Serialization context for instantiation of Placeholder
+        objects
+
+    Returns
+    -------
+    result: Tuple[Tuple, List]
+        Picklable header and buffers
+    """
+    cdef list serial_stack = []
+    cdef list result_bufs_list = []
+    cdef tuple serialized
+    cdef list subs
+    cdef bint final
+    cdef _IdContextHolder id_context_holder = _IdContextHolder()
+
+    context = context if context is not None else dict()
+    serialized, subs, final = _serial_single(obj, context, id_context_holder)
+    if final or not subs:
+        # marked as a leaf node, return directly
+        return ({}, serialized), subs
+
+    serial_stack.append(_SerialStackItem(serialized, subs))
+    return _serialize_with_stack(
+        serial_stack, None, context, id_context_holder, result_bufs_list
+    )
+
+
+async def serialize_with_spawn(
+    obj, dict context = None, int spawn_threshold = 100, object executor = None
+):
+    """
+    Serialize an object and return a header and buffers.
+    Buffers are intended for zero-copy data manipulation.
+
+    Parameters
+    ----------
+    obj: Any
+        Object to serialize
+    context: Dict
+        Serialization context for instantiation of Placeholder
+        objects
+    spawn_threshold: int
+        Number of objects to serialize inline before spawning the rest
+        into a ThreadPoolExecutor
+    executor: ThreadPoolExecutor
+        ThreadPoolExecutor to spawn the rest of the serialization into
+
+    Returns
+    -------
+    result: Tuple[Tuple, List]
+        Picklable header and buffers
+    """
+    cdef list serial_stack = []
+    cdef list result_bufs_list = []
+    cdef tuple serialized
+    cdef list subs
+    cdef bint final
+    cdef _IdContextHolder id_context_holder = _IdContextHolder()
+
+    context = context if context is not None else dict()
+    serialized, subs, final = _serial_single(obj, context, id_context_holder)
+    if final or not subs:
+        # marked as a leaf node, return directly
+        return ({}, serialized), subs
+
+    serial_stack.append(_SerialStackItem(serialized, subs))
+
+    try:
+        result = _serialize_with_stack(
+            serial_stack, None, context, id_context_holder, result_bufs_list, spawn_threshold
+        )
+    except _SerializeObjectOverflow as ex:
+        result = await asyncio.get_running_loop().run_in_executor(
+            executor,
+            _serialize_with_stack,
+            serial_stack,
+            ex.cur_serialized,
+            context,
+            id_context_holder,
+            result_bufs_list,
+            0,
+            ex.num_total_serialized,
+        )
+    return result
+
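A usage sketch for the spawning path (illustrative; the object, threshold, and import path are assumptions):

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    from xorbits._mars.serialization.core import serialize_with_spawn  # path assumed

    async def main():
        big = [list(range(100)) for _ in range(10_000)]
        with ThreadPoolExecutor(max_workers=1) as pool:
            # the first spawn_threshold objects are serialized on the event loop,
            # the remainder inside the executor
            header, buffers = await serialize_with_spawn(
                big, spawn_threshold=100, executor=pool
            )
        return header, buffers

    asyncio.run(main())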
+
+cdef class _DeserialStackItem:
+    cdef public tuple serialized
+    cdef public tuple subs
+    cdef public list subs_deserialized
+
+    def __cinit__(self, tuple serialized, tuple subs):
+        self.serialized = serialized
+        self.subs = subs
+        self.subs_deserialized = []
+
+
+cdef _deserial_single(tuple serialized, dict context, list subs):
+    """Deserialize a single object"""
+    cdef Serializer serializer
+    cdef int64_t num_subs
+
+    serializer_id, obj_id, num_subs, final = serialized[:_COMMON_HEADER_LEN]
+    serializer = _deserializers[serializer_id]
+    res = serializer.deserial(serialized[_COMMON_HEADER_LEN:], context, subs)
+
+    if type(res) is Placeholder:
+        try:
+            res = context[obj_id]
+        except KeyError:
+            (<Placeholder>res).id = obj_id
+
+    # get previously-recorded context values
+    context_val, context[obj_id] = context.get(obj_id), res
+    # if previously recorded object is a Placeholder,
+    # replace it with callbacks
+    if type(context_val) is Placeholder:
+        for cb in (<Placeholder>context_val).callbacks:
+            cb(res)
+    return res
+
+
+def deserialize(tuple serialized, list buffers, dict context = None):
+    """
+    Deserialize an object with serialized headers and buffers
+
+    Parameters
+    ----------
+    serialized: Tuple
+        Serialized object header
+    buffers: List
+        List of buffers extracted from serialize() calls
+    context: Dict
+        Serialization context for replacing Placeholder
+        objects
+
+    Returns
+    -------
+    result: Any
+        Deserialized object
+    """
+    cdef list deserial_stack = []
+    cdef _DeserialStackItem stack_item
+    cdef int64_t num_subs, num_deserialized, buf_pos = 0
+    cdef bint final
+    cdef Serializer serializer
+    cdef object deserialized = None, exc_value = None
+    cdef bint has_deserialized = False
+
+    context = context if context is not None else dict()
+    # drop extra meta field
+    serialized = serialized[-1]
+    serializer_id, obj_id, num_subs, final = serialized[:4]
+    if final or num_subs == 0:
+        # marked as a leaf node, return directly
+        return _deserial_single(serialized, context, buffers)
+
+    deserial_stack.append(
+        _DeserialStackItem(
+            serialized[:-num_subs], serialized[-num_subs:]
+        )
+    )
+
+    while deserial_stack:
+        stack_item = deserial_stack[-1]
+        # the deserialized result can be None, hence we cannot
+        # simply rely on the deserialized value itself
+        if has_deserialized:
+            # have previously-deserialized results, record first
+            stack_item.subs_deserialized.append(deserialized)
+        elif exc_value is not None:
+            # an exception occurred in a successor component; try to rewrite it
+            # and pass it to predecessors
+            serializer_id = stack_item.serialized[0]
+            serializer = _deserializers[serializer_id]
+            new_exc_value = serializer.on_deserial_error(
+                stack_item.serialized[_COMMON_HEADER_LEN:],
+                context,
+                list(stack_item.subs),
+                len(stack_item.subs_deserialized),
+                exc_value,
+            )
+            exc_value = new_exc_value if new_exc_value is not None else exc_value
+            deserial_stack.pop()
+            continue
+
+        num_deserialized = len(stack_item.subs_deserialized)
+        if len(stack_item.subs) == num_deserialized:
+            try:
+                # all subcomponents deserialized, we can deserialize the object itself
+                deserialized = _deserial_single(
+                    stack_item.serialized, context, stack_item.subs_deserialized
+                )
+                has_deserialized = True
+                deserial_stack.pop()
+            except BaseException as ex:
+                has_deserialized = False
+                exc_value = ex
+                deserial_stack.pop()
+        else:
+            # select next subcomponent to process
+            serialized = stack_item.subs[num_deserialized]
+            serializer_id, obj_id, num_subs, final = serialized[:4]
+            if final or num_subs == 0:
+                try:
+                    # next subcomponent is a leaf, just deserialize
+                    deserialized = _deserial_single(
+                        serialized, context, buffers[buf_pos : buf_pos + num_subs]
+                    )
+                    has_deserialized = True
+                    buf_pos += num_subs
+                except BaseException as ex:
+                    has_deserialized = False
+                    exc_value = ex
+            else:
+                # next subcomponent has its own subcomponents, so we push it
+                # onto the stack and start handling its children
+                stack_item = _DeserialStackItem(
+                    serialized[:-num_subs], serialized[-num_subs:]
+                )
+                deserial_stack.append(stack_item)
+                # note that the deserialized state should be cleared
+                # as we are just starting to handle the subcomponent itself
+                has_deserialized = False
+
+    if exc_value is not None:
+        raise exc_value
+    return deserialized
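Putting serialize() and deserialize() together (illustrative; the import path is an assumption):

    import numpy as np
    import pandas as pd

    from xorbits._mars.serialization import deserialize, serialize

    obj = {"arr": np.arange(10), "df": pd.DataFrame({"a": [1, 2, 3]})}
    header, buffers = serialize(obj)
    # header is picklable; buffers carry the raw payloads and can travel
    # out-of-band (sockets, shared memory) without extra copies
    restored = deserialize(header, buffers)
    np.testing.assert_array_equal(restored["arr"], obj["arr"])
    pd.testing.assert_frame_equal(restored["df"], obj["df"])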
diff --git a/python/xorbits/_mars/serialization/cuda.py b/python/xorbits/_mars/serialization/cuda.py
new file mode 100644
index 000000000..f5794bcbe
--- /dev/null
+++ b/python/xorbits/_mars/serialization/cuda.py
@@ -0,0 +1,110 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Tuple
+
+import pandas as pd
+
+from ..utils import lazy_import
+from .core import Serializer, buffered
+
+cupy = lazy_import("cupy")
+cudf = lazy_import("cudf")
+
+
+class CupySerializer(Serializer):
+    @buffered
+    def serial(self, obj: Any, context: Dict):
+        if not (obj.flags["C_CONTIGUOUS"] or obj.flags["F_CONTIGUOUS"]):
+            obj = cupy.array(obj, copy=True)
+
+        header = obj.__cuda_array_interface__.copy()
+        header["strides"] = tuple(obj.strides)
+        header["lengths"] = [obj.nbytes]
+        buffer = cupy.ndarray(
+            shape=(obj.nbytes,), dtype=cupy.dtype("u1"), memptr=obj.data, strides=(1,)
+        )
+        return (header,), [buffer], True
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List):
+        (header,) = serialized
+        return cupy.ndarray(
+            shape=header["shape"],
+            dtype=header["typestr"],
+            memptr=cupy.asarray(subs[0]).data,
+            strides=header["strides"],
+        )
+
+
+class CudfSerializer(Serializer):
+    @staticmethod
+    def _get_ext_index_type(index_obj):
+        import cudf
+
+        multi_index_type = None
+        if isinstance(index_obj, pd.MultiIndex):
+            multi_index_type = "pandas"
+        elif isinstance(index_obj, cudf.MultiIndex):
+            multi_index_type = "cudf"
+
+        if multi_index_type is None:
+            return None
+        return {
+            "index_type": multi_index_type,
+            "names": list(index_obj.names),
+        }
+
+    @staticmethod
+    def _apply_index_type(obj, attr, header):
+        import cudf
+
+        multi_index_cls = (
+            pd.MultiIndex if header["index_type"] == "pandas" else cudf.MultiIndex
+        )
+        original_index = getattr(obj, attr)
+        if isinstance(original_index, (pd.MultiIndex, cudf.MultiIndex)):
+            return
+        new_index = multi_index_cls.from_tuples(original_index, names=header["names"])
+        setattr(obj, attr, new_index)
+
+    def serial(self, obj: Any, context: Dict):
+        header, buffers = obj.device_serialize()
+        if hasattr(obj, "columns"):
+            header["_ext_columns"] = self._get_ext_index_type(obj.columns)
+        if hasattr(obj, "index"):
+            header["_ext_index"] = self._get_ext_index_type(obj.index)
+        return (header,), buffers, True
+
+    def deserial(self, serialized: Tuple, context: Dict, buffers: List):
+        from cudf.core.abc import Serializable
+
+        (header,) = serialized
+        col_header = header.pop("_ext_columns", None)
+        index_header = header.pop("_ext_index", None)
+
+        result = Serializable.device_deserialize(header, buffers)
+
+        if col_header is not None:
+            self._apply_index_type(result, "columns", col_header)
+        if index_header is not None:
+            self._apply_index_type(result, "index", index_header)
+        return result
+
+
+if cupy is not None:
+    CupySerializer.register("cupy.ndarray")
+if cudf is not None:
+    CudfSerializer.register("cudf.DataFrame")
+    CudfSerializer.register("cudf.Series")
+    CudfSerializer.register("cudf.Index")
diff --git a/python/xorbits/_mars/serialization/exception.py b/python/xorbits/_mars/serialization/exception.py
new file mode 100644
index 000000000..6fea0d193
--- /dev/null
+++ b/python/xorbits/_mars/serialization/exception.py
@@ -0,0 +1,46 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pickle  # nosec  # pylint: disable=import_pickle
+from typing import Dict, List, Union
+
+from .core import Serializer, buffered, pickle_buffers, unpickle_buffers
+
+
+class UnpickleableError(Exception):
+    def __init__(self, raw_error: Union[str, Exception]):
+        if isinstance(raw_error, str):
+            super().__init__(raw_error)
+        else:
+            super().__init__(
+                f"Error cannot be pickled, "
+                f"error type: {type(raw_error)}, "
+                f"raw error:\n{raw_error}"
+            )
+
+
+class ExceptionSerializer(Serializer):
+    @buffered
+    def serial(self, obj: Exception, context: Dict):
+        try:
+            buffers = pickle_buffers(obj)
+        except (TypeError, pickle.PicklingError):
+            buffers = pickle_buffers(UnpickleableError(obj))
+        return (), buffers, True
+
+    def deserial(self, serialized: Dict, context: Dict, subs: List):
+        return unpickle_buffers(subs)
+
+
+ExceptionSerializer.register(Exception)
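A sketch of the fallback behaviour for exceptions that cannot be pickled (illustrative; BadError is made up and the import paths are assumptions):

    import threading

    from xorbits._mars.serialization import deserialize, serialize
    from xorbits._mars.serialization.exception import UnpickleableError

    class BadError(Exception):
        def __init__(self):
            super().__init__("boom")
            self.lock = threading.Lock()  # not picklable, even by cloudpickle

    restored = deserialize(*serialize(BadError()))
    # the original error is replaced by a picklable stand-in that keeps its text
    assert isinstance(restored, UnpickleableError)
    assert "BadError" in str(restored)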
diff --git a/python/xorbits/_mars/serialization/mars_objects.py b/python/xorbits/_mars/serialization/mars_objects.py
new file mode 100644
index 000000000..f4f77be46
--- /dev/null
+++ b/python/xorbits/_mars/serialization/mars_objects.py
@@ -0,0 +1,39 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List
+
+try:
+    import scipy.sparse as sps
+except ImportError:  # pragma: no cover
+    sps = None
+
+from ..lib.sparse import SparseNDArray
+from .core import Serializer, buffered, deserialize, serialize
+
+
+class SparseNDArraySerializer(Serializer):
+    @buffered
+    def serial(self, obj: Any, context: Dict):
+        raw_header, raw_buffers = serialize(obj.raw, context)
+        return (raw_header, obj.shape), raw_buffers, True
+
+    def deserial(self, serialized: Dict, context: Dict, subs: List):
+        raw_header, obj_shape = serialized
+        raw_csr = deserialize(raw_header, subs)
+        return SparseNDArray(raw_csr, shape=tuple(obj_shape))
+
+
+if sps:  # pragma: no branch
+    SparseNDArraySerializer.register(SparseNDArray)
diff --git a/python/xorbits/_mars/serialization/numpy.py b/python/xorbits/_mars/serialization/numpy.py
new file mode 100644
index 000000000..02eeeb54c
--- /dev/null
+++ b/python/xorbits/_mars/serialization/numpy.py
@@ -0,0 +1,81 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+from .core import Serializer, buffered, pickle_buffers, unpickle_buffers
+
+
+class NDArraySerializer(Serializer):
+    @buffered
+    def serial(self, obj: np.ndarray, context: Dict):
+        header = {}
+        if obj.dtype.hasobject:
+            header["pickle"] = True
+            buffers = pickle_buffers(obj)
+            return (header,), buffers, True
+
+        order = "C"
+        if obj.flags.f_contiguous:
+            order = "F"
+        elif not obj.flags.c_contiguous:
+            obj = np.ascontiguousarray(obj)
+        try:
+            desc = np.lib.format.dtype_to_descr(obj.dtype)
+            dtype_new_order = None
+        except ValueError:
+            # for structured dtype, array[[field2, field1]] will create a view,
+            # and dtype_to_descr will fail due to the field order
+            fields = obj.dtype.fields
+            new_fields = sorted(fields, key=lambda k: fields[k][1])
+            desc = np.lib.format.dtype_to_descr(obj.dtype[new_fields])
+            dtype_new_order = list(fields)
+        header.update(
+            dict(
+                pickle=False,
+                descr=desc,
+                dtype_new_order=dtype_new_order,
+                shape=list(obj.shape),
+                strides=list(obj.strides),
+                order=order,
+            )
+        )
+        return (header,), [memoryview(obj.ravel(order=order).view("uint8").data)], True
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List[Any]):
+        header = serialized[0]
+        if header["pickle"]:
+            return unpickle_buffers(subs)
+
+        try:
+            dtype = np.lib.format.descr_to_dtype(header["descr"])
+        except AttributeError:  # pragma: no cover
+            # for older numpy versions, descr_to_dtype is not implemented
+            dtype = np.dtype(header["descr"])
+
+        dtype_new_order = header["dtype_new_order"]
+        if dtype_new_order:
+            dtype = dtype[dtype_new_order]
+        return np.ndarray(
+            shape=tuple(header["shape"]),
+            dtype=dtype,
+            buffer=subs[0],
+            strides=tuple(header["strides"]),
+            order=header["order"],
+        )
+
+
+NDArraySerializer.register(np.ndarray)
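A sketch of both paths handled above: numeric arrays travel as raw memory, while object-dtype arrays fall back to cloudpickle (illustrative; the import path is an assumption):

    import numpy as np

    from xorbits._mars.serialization import deserialize, serialize

    dense = np.arange(12, dtype="float64").reshape(3, 4)
    np.testing.assert_array_equal(deserialize(*serialize(dense)), dense)

    obj_arr = np.array([{"a": 1}, None], dtype=object)
    assert deserialize(*serialize(obj_arr))[0] == {"a": 1}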
diff --git a/python/xorbits/_mars/serialization/ray.py b/python/xorbits/_mars/serialization/ray.py
new file mode 100644
index 000000000..a8aeabf8e
--- /dev/null
+++ b/python/xorbits/_mars/serialization/ray.py
@@ -0,0 +1,38 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Tuple
+
+from ..utils import lazy_import
+from .core import Serializer, buffered
+
+ray = lazy_import("ray")
+
+
+class RaySerializer(Serializer):
+    """Return raw object to let ray do serialization."""
+
+    @buffered
+    def serial(self, obj: Any, context: Dict):
+        return (obj,), [], True
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List[Any]):
+        assert not subs
+        return serialized[0]
+
+
+if ray is not None:
+    RaySerializer.register(object, "ray")
+    RaySerializer.register("ray.ObjectRef", "ray")
+    RaySerializer.register("ray.actor.ActorHandle", "ray")
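These registrations are keyed by the name "ray", so they only take effect when a caller opts in through the serialization context; a sketch (illustrative: it requires ray to be installed for the registrations above to run, and the import path is an assumption):

    from xorbits._mars.serialization import deserialize, serialize

    obj = {"payload": list(range(10))}
    header, buffers = serialize(obj, context={"serializer": "ray"})
    assert buffers == []  # the object is kept as-is for Ray to serialize
    assert deserialize(header, buffers) == obj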
diff --git a/python/xorbits/_mars/serialization/scipy.py b/python/xorbits/_mars/serialization/scipy.py
new file mode 100644
index 000000000..fddcecd2c
--- /dev/null
+++ b/python/xorbits/_mars/serialization/scipy.py
@@ -0,0 +1,71 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+try:
+    import scipy.sparse as sps
+except ImportError:  # pragma: no cover
+    sps = None
+
+from .core import Serializer, buffered, deserialize, serialize
+
+
+class CsrMatrixSerializer(Serializer):
+    @buffered
+    def serial(self, obj: Any, context: Dict):
+        data_header, data_buffers = serialize(obj.data)
+        idx_header, idx_buffers = serialize(obj.indices)
+        indptr_header, indptr_buffers = serialize(obj.indptr)
+        header = (
+            data_header,  # data_header
+            len(data_buffers),  # data_buf_num
+            idx_header,  # idx_header
+            len(idx_buffers),  # idx_buf_num
+            indptr_header,  # indptr_header
+            obj.shape,  # shape
+        )
+        return header, data_buffers + idx_buffers + indptr_buffers, True
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List):
+        (
+            data_header,
+            data_buf_num,
+            idx_header,
+            idx_buf_num,
+            indptr_header,
+            shape,
+        ) = serialized
+        data_buffers = subs[:data_buf_num]
+        idx_buffers = subs[data_buf_num : data_buf_num + idx_buf_num]
+        indptr_buffers = subs[data_buf_num + idx_buf_num :]
+
+        data = deserialize(data_header, data_buffers)
+        indices = deserialize(idx_header, idx_buffers)
+        indptr = deserialize(indptr_header, indptr_buffers)
+        shape = tuple(shape)
+
+        empty_arr = np.zeros(0, dtype=data.dtype)
+
+        target_csr = sps.coo_matrix(
+            (empty_arr, (empty_arr,) * 2), dtype=data.dtype, shape=shape
+        ).tocsr()
+        target_csr.data, target_csr.indices, target_csr.indptr = data, indices, indptr
+        return target_csr
+
+
+if sps:  # pragma: no branch
+    CsrMatrixSerializer.register(sps.csr_matrix)
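A round-trip sketch for CSR matrices (illustrative; the import path is an assumption):

    import numpy as np
    import scipy.sparse as sps

    from xorbits._mars.serialization import deserialize, serialize

    mat = sps.random(50, 40, density=0.1, format="csr", dtype=np.float64)
    restored = deserialize(*serialize(mat))
    # reassembled from data/indices/indptr without ever densifying
    assert (restored != mat).nnz == 0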
diff --git a/python/xorbits/_mars/serialization/serializables/__init__.py b/python/xorbits/_mars/serialization/serializables/__init__.py
new file mode 100644
index 000000000..ffc7769b7
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/__init__.py
@@ -0,0 +1,55 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .core import Serializable, SerializableMeta
+from .field import (
+    AnyField,
+    BoolField,
+    BytesField,
+    Complex64Field,
+    Complex128Field,
+    DataFrameField,
+    DataTypeField,
+    Datetime64Field,
+    DictField,
+    Float16Field,
+    Float32Field,
+    Float64Field,
+    FunctionField,
+    IdentityField,
+    IndexField,
+    Int8Field,
+    Int16Field,
+    Int32Field,
+    Int64Field,
+    IntervalArrayField,
+    KeyField,
+    ListField,
+    NamedTupleField,
+    NDArrayField,
+    OneOfField,
+    ReferenceField,
+    SeriesField,
+    SliceField,
+    StringField,
+    Timedelta64Field,
+    TupleField,
+    TZInfoField,
+    UInt8Field,
+    UInt16Field,
+    UInt32Field,
+    UInt64Field,
+)
+from .field_type import FieldTypes
diff --git a/python/xorbits/_mars/serialization/serializables/core.py b/python/xorbits/_mars/serialization/serializables/core.py
new file mode 100644
index 000000000..73a206667
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/core.py
@@ -0,0 +1,245 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import operator
+import weakref
+from typing import Dict, List, Tuple, Type
+
+import cloudpickle
+
+from ..core import Placeholder, Serializer, buffered
+from .field import Field
+from .field_type import (
+    DatetimeType,
+    DictType,
+    DtypeType,
+    ListType,
+    PrimitiveFieldType,
+    TimedeltaType,
+    TupleType,
+    TZInfoType,
+)
+
+_primitive_field_types = (
+    PrimitiveFieldType,
+    DtypeType,
+    DatetimeType,
+    TimedeltaType,
+    TZInfoType,
+)
+
+
+def _is_field_primitive_compound(field: Field):
+    if field.on_serialize is not None or field.on_deserialize is not None:
+        return False
+
+    def check_type(field_type):
+        if isinstance(field_type, _primitive_field_types):
+            return True
+        if isinstance(field_type, (ListType, TupleType)):
+            if all(
+                check_type(element_type) or element_type is Ellipsis
+                for element_type in field_type._field_types
+            ):
+                return True
+        if isinstance(field_type, DictType):
+            if all(
+                isinstance(element_type, _primitive_field_types)
+                or element_type is Ellipsis
+                for element_type in (field_type.key_type, field_type.value_type)
+            ):
+                return True
+        return False
+
+    return check_type(field.field_type)
+
+
+class SerializableMeta(type):
+    def __new__(mcs, name: str, bases: Tuple[Type], properties: Dict):
+        # All the fields including base fields.
+        all_fields = dict()
+
+        for base in bases:
+            if hasattr(base, "_FIELDS"):
+                all_fields.update(base._FIELDS)
+
+        properties_without_fields = {}
+        properties_field_slot_names = []
+        for k, v in properties.items():
+            if not isinstance(v, Field):
+                properties_without_fields[k] = v
+                continue
+
+            field = all_fields.get(k)
+            if field is None:
+                properties_field_slot_names.append(k)
+            else:
+                v.name = field.name
+                v.get = field.get
+                v.set = field.set
+                v.__delete__ = field.__delete__
+            all_fields[k] = v
+
+        # Make field order deterministic to serialize it as list instead of dict.
+        all_fields = dict(sorted(all_fields.items(), key=operator.itemgetter(0)))
+        pickle_fields = []
+        non_pickle_fields = []
+        for v in all_fields.values():
+            if _is_field_primitive_compound(v):
+                pickle_fields.append(v)
+            else:
+                non_pickle_fields.append(v)
+
+        slots = set(properties.pop("__slots__", set()))
+        slots.update(properties_field_slot_names)
+
+        properties = properties_without_fields
+        properties["_FIELDS"] = all_fields
+        properties["_PRIMITIVE_FIELDS"] = pickle_fields
+        properties["_NON_PRIMITIVE_FIELDS"] = non_pickle_fields
+        properties["__slots__"] = tuple(slots)
+
+        clz = type.__new__(mcs, name, bases, properties)
+        # Bind slot member_descriptor with field.
+        for name in properties_field_slot_names:
+            member_descriptor = getattr(clz, name)
+            field = all_fields[name]
+            field.name = member_descriptor.__name__
+            field.get = member_descriptor.__get__
+            field.set = member_descriptor.__set__
+            field.__delete__ = member_descriptor.__delete__
+            setattr(clz, name, field)
+
+        return clz
+
+
+class Serializable(metaclass=SerializableMeta):
+    __slots__ = ("__weakref__",)
+
+    _cache_primitive_serial = False
+
+    _FIELDS: Dict[str, Field]
+    _PRIMITIVE_FIELDS: List[Field]
+    _NON_PRIMITIVE_FIELDS: List[Field]
+
+    def __init__(self, *args, **kwargs):
+        fields = self._FIELDS
+        if args:  # pragma: no cover
+            values = dict(zip(fields, args))
+            values.update(kwargs)
+        else:
+            values = kwargs
+        for k, v in values.items():
+            fields[k].set(self, v)
+
+    def __on_deserialize__(self):
+        pass
+
+    def __repr__(self):
+        values = ", ".join(
+            [
+                "{}={!r}".format(slot, getattr(self, slot, None))
+                for slot in self.__slots__
+            ]
+        )
+        return "{}({})".format(self.__class__.__name__, values)
+
+    def copy(self) -> "Serializable":
+        copied = type(self)()
+        copied_fields = copied._FIELDS
+        for k, field in self._FIELDS.items():
+            try:
+                # Slightly faster than getattr.
+                value = field.get(self, k)
+                copied_fields[k].set(copied, value)
+            except AttributeError:
+                continue
+        return copied
+
+
+_primitive_serial_cache = weakref.WeakKeyDictionary()
+
+
+class _NoFieldValue:
+    pass
+
+
+class SerializableSerializer(Serializer):
+    """
+    Serialize a Serializable by its field values: primitive fields are carried
+    in the header, while compound field values are serialized iteratively.
+    """
+
+    @classmethod
+    def _get_field_values(cls, obj: Serializable, fields):
+        values = []
+        for field in fields:
+            try:
+                value = field.get(obj)
+                if field.on_serialize is not None:
+                    value = field.on_serialize(value)
+            except AttributeError:
+                # Most field values are not None; serializing them as a list is
+                # more efficient than as a dict.
+                value = _NoFieldValue
+            values.append(value)
+        return values
+
+    @buffered
+    def serial(self, obj: Serializable, context: Dict):
+        if obj._cache_primitive_serial and obj in _primitive_serial_cache:
+            primitive_vals = _primitive_serial_cache[obj]
+        else:
+            primitive_vals = self._get_field_values(obj, obj._PRIMITIVE_FIELDS)
+            if obj._cache_primitive_serial:
+                primitive_vals = cloudpickle.dumps(primitive_vals)
+                _primitive_serial_cache[obj] = primitive_vals
+
+        compound_vals = self._get_field_values(obj, obj._NON_PRIMITIVE_FIELDS)
+        return (type(obj), primitive_vals), [compound_vals], False
+
+    @staticmethod
+    def _set_field_value(obj: Serializable, field: Field, value):
+        if value is _NoFieldValue:
+            return
+        if type(value) is Placeholder:
+            if field.on_deserialize is not None:
+                value.callbacks.append(
+                    lambda v: field.set(obj, field.on_deserialize(v))
+                )
+            else:
+                value.callbacks.append(lambda v: field.set(obj, v))
+        else:
+            if field.on_deserialize is not None:
+                field.set(obj, field.on_deserialize(value))
+            else:
+                field.set(obj, value)
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List) -> Serializable:
+        obj_class, primitives = serialized
+
+        if type(primitives) is not list:
+            primitives = cloudpickle.loads(primitives)
+
+        obj = obj_class.__new__(obj_class)
+
+        if primitives:
+            for field, value in zip(obj_class._PRIMITIVE_FIELDS, primitives):
+                self._set_field_value(obj, field, value)
+
+        if obj_class._NON_PRIMITIVE_FIELDS:
+            for field, value in zip(obj_class._NON_PRIMITIVE_FIELDS, subs[0]):
+                self._set_field_value(obj, field, value)
+        obj.__on_deserialize__()
+        return obj
+
+
+SerializableSerializer.register(Serializable)
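A sketch of how a Serializable subclass is defined and round-tripped through this serializer (illustrative; TaskInfo is made up and the import paths are assumptions):

    from xorbits._mars.serialization import deserialize, serialize
    from xorbits._mars.serialization.serializables import (
        Int64Field,
        ListField,
        Serializable,
        StringField,
    )

    class TaskInfo(Serializable):
        name = StringField("name")
        retries = Int64Field("retries", default=0)
        payload = ListField("payload", default_factory=list)

    info = TaskInfo(name="demo", payload=[1, 2, 3])
    restored = deserialize(*serialize(info))
    assert (restored.name, restored.retries, restored.payload) == ("demo", 0, [1, 2, 3])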
diff --git a/python/xorbits/_mars/serialization/serializables/field.py b/python/xorbits/_mars/serialization/serializables/field.py
new file mode 100644
index 000000000..072cb6173
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/field.py
@@ -0,0 +1,579 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+import itertools
+from abc import ABC, ABCMeta, abstractmethod
+from typing import Any, Callable, Optional, Type, Union
+
+from ...utils import _is_ci, no_default
+from .field_type import (
+    AbstractFieldType,
+    DictType,
+    FieldTypes,
+    ListType,
+    ReferenceType,
+    TupleType,
+)
+
+
+class Field(ABC):
+    __slots__ = (
+        "_tag",
+        "_default_value",
+        "_default_factory",
+        "_on_serialize",
+        "_on_deserialize",
+        "name",  # The __name__ of member_descriptor
+        "get",  # The __get__ of member_descriptor
+        "set",  # The __set__ of member_descriptor
+        "__delete__",  # The __delete__ of member_descriptor
+    )
+
+    _tag: str
+    _default_value: Any
+    _default_factory: Optional[Callable]
+
+    def __init__(
+        self,
+        tag: str,
+        default: Any = no_default,
+        default_factory: Optional[Callable] = None,
+        on_serialize: Callable[[Any], Any] = None,
+        on_deserialize: Callable[[Any], Any] = None,
+    ):
+        if (
+            default is not no_default and default_factory is not None
+        ):  # pragma: no cover
+            raise ValueError("default and default_factory can not be specified both")
+
+        self._tag = tag
+        self._default_value = default
+        self._default_factory = default_factory
+        self._on_serialize = on_serialize
+        self._on_deserialize = on_deserialize
+
+    @property
+    def tag(self):
+        return self._tag
+
+    @property
+    def on_serialize(self):
+        return self._on_serialize
+
+    @property
+    def on_deserialize(self):
+        return self._on_deserialize
+
+    @property
+    @abstractmethod
+    def field_type(self) -> AbstractFieldType:
+        """
+        Field type.
+
+        Returns
+        -------
+        field_type : AbstractFieldType
+             Field type.
+        """
+
+    def __get__(self, instance, owner=None):
+        try:
+            return self.get(instance, owner)
+        except AttributeError:
+            if self._default_value is not no_default:
+                val = self._default_value
+                self.set(instance, val)
+                return val
+            elif self._default_factory is not None:
+                val = self._default_factory()
+                self.set(instance, val)
+                return val
+            else:
+                raise
+
+    def __set__(self, instance, value) -> None:
+        if _is_ci:  # pragma: no branch
+            from ...core import is_kernel_mode
+
+            if not is_kernel_mode():
+                field_type = self.field_type
+                try:
+                    to_check_value = value
+                    if to_check_value is not None and self._on_serialize:
+                        to_check_value = self._on_serialize(to_check_value)
+                    field_type.validate(to_check_value)
+                except (TypeError, ValueError) as e:
+                    raise type(e)(
+                        f"Failed to set `{self.name}` for {type(instance).__name__} "
+                        f"when environ CI=true is set: {str(e)}"
+                    )
+        self.set(instance, value)
+
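The difference between default and default_factory in __get__ above can be illustrated with a tiny sketch (Config is made up and the import path is an assumption):

    from xorbits._mars.serialization.serializables import (
        Int64Field,
        ListField,
        Serializable,
    )

    class Config(Serializable):
        limit = Int64Field("limit", default=10)
        tags = ListField("tags", default_factory=list)

    c = Config()
    assert c.limit == 10   # the default is materialized on first access
    c.tags.append("x")     # each instance gets its own list from the factory
    assert Config().tags == []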
+
+class AnyField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.any
+
+
+class IdentityField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.string
+
+
+class BoolField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.bool
+
+
+class Int8Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.int8
+
+
+class Int16Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.int16
+
+
+class Int32Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.int32
+
+
+class Int64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.int64
+
+
+class UInt8Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.uint8
+
+
+class UInt16Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.uint16
+
+
+class UInt32Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.uint32
+
+
+class UInt64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.uint64
+
+
+class Float16Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.float16
+
+
+class Float32Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.float32
+
+
+class Float64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.float64
+
+
+class Complex64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.complex64
+
+
+class Complex128Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.complex128
+
+
+class StringField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.string
+
+
+class BytesField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.bytes
+
+
+class KeyField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.key
+
+
+class NDArrayField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.ndarray
+
+
+class Datetime64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.datetime
+
+
+class Timedelta64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.timedelta
+
+
+class DataTypeField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.dtype
+
+
+class IndexField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.index
+
+
+class SeriesField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.series
+
+
+class DataFrameField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.dataframe
+
+
+class SliceField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.slice
+
+
+class FunctionField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.function
+
+
+class NamedTupleField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.namedtuple
+
+
+class TZInfoField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.tzinfo
+
+
+class IntervalArrayField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.interval_array
+
+
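+# Shared base for ListField/TupleField: a bare element type passed as
+# `field_type` is wrapped into the matching homogeneous collection type,
+# e.g. FieldTypes.int64 becomes ListType(FieldTypes.int64, ...).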
+class _CollectionField(Field, metaclass=ABCMeta):
+    __slots__ = ("_field_type",)
+
+    def __init__(
+        self,
+        tag: str,
+        field_type: AbstractFieldType = None,
+        default: Any = no_default,
+        default_factory: Optional[Callable] = None,
+        on_serialize: Callable[[Any], Any] = None,
+        on_deserialize: Callable[[Any], Any] = None,
+    ):
+        super().__init__(
+            tag,
+            default=default,
+            default_factory=default_factory,
+            on_serialize=on_serialize,
+            on_deserialize=on_deserialize,
+        )
+        if field_type is None:
+            field_type = FieldTypes.any
+        collection_type = self._collection_type()
+        if not isinstance(field_type, collection_type):
+            field_type = collection_type(field_type, ...)
+        self._field_type = field_type
+
+    @classmethod
+    @abstractmethod
+    def _collection_type(cls) -> Type[AbstractFieldType]:
+        """
+        Collection type.
+
+        Returns
+        -------
+        collection_type
+        """
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return self._field_type
+
+
+class ListField(_CollectionField):
+    __slots__ = ()
+
+    @classmethod
+    def _collection_type(cls) -> Type[AbstractFieldType]:
+        return ListType
+
+
+class TupleField(_CollectionField):
+    __slots__ = ()
+
+    @classmethod
+    def _collection_type(cls) -> Type[AbstractFieldType]:
+        return TupleType
+
+
+class DictField(Field):
+    __slots__ = ("_field_type",)
+
+    def __init__(
+        self,
+        tag: str,
+        key_type: AbstractFieldType = None,
+        value_type: AbstractFieldType = None,
+        default: Any = no_default,
+        default_factory: Optional[Callable] = None,
+        on_serialize: Callable[[Any], Any] = None,
+        on_deserialize: Callable[[Any], Any] = None,
+    ):
+        super().__init__(
+            tag,
+            default=default,
+            default_factory=default_factory,
+            on_serialize=on_serialize,
+            on_deserialize=on_deserialize,
+        )
+        self._field_type = DictType(key_type, value_type)
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return self._field_type
+
+
+class ReferenceField(Field):
+    __slots__ = "_reference_type", "_field_type"
+
+    def __init__(
+        self,
+        tag: str,
+        reference_type: Union[str, Type] = None,
+        default: Any = no_default,
+        on_serialize: Callable[[Any], Any] = None,
+        on_deserialize: Callable[[Any], Any] = None,
+    ):
+        super().__init__(
+            tag,
+            default=default,
+            on_serialize=on_serialize,
+            on_deserialize=on_deserialize,
+        )
+        self._reference_type = reference_type
+
+        if not isinstance(reference_type, str):
+            self._field_type = ReferenceType(reference_type)
+        else:
+            # need to bind dynamically
+            self._field_type = None
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return self._field_type
+
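+    # A string reference is resolved lazily: "self" binds to the type of the
+    # owning instance, a dotted path is imported, and a bare name is looked
+    # up in the module that defines the instance.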
+    def get_field_type(self, instance):
+        if self._field_type is None:
+            # bind dynamically
+            if self._reference_type == "self":
+                reference_type = type(instance)
+            elif isinstance(self._reference_type, str) and "." in self._reference_type:
+                module, name = self._reference_type.rsplit(".", 1)
+                reference_type = getattr(importlib.import_module(module), name)
+            else:
+                module = inspect.getmodule(instance)
+                reference_type = getattr(module, self._reference_type)
+            self._field_type = ReferenceType(reference_type)
+        return self._field_type
+
+    def __set__(self, instance, value):
+        if _is_ci:
+            from ...core import is_kernel_mode
+
+            if not is_kernel_mode():
+                field_type = self.get_field_type(instance)
+                try:
+                    to_check_value = value
+                    if to_check_value is not None and self._on_serialize:
+                        to_check_value = self._on_serialize(to_check_value)
+                    field_type.validate(to_check_value)
+                except (TypeError, ValueError) as e:
+                    raise type(e)(
+                        f"Failed to set `{self.name}` for {type(instance).__name__} "
+                        f"when environ CI=true is set: {e}"
+                    )
+        self.set(instance, value)
+
+
+class OneOfField(Field):
+    __slots__ = "_reference_fields"
+
+    def __init__(
+        self,
+        tag: str,
+        default: Any = no_default,
+        on_serialize: Callable[[Any], Any] = None,
+        on_deserialize: Callable[[Any], Any] = None,
+        **tag_to_reference_types,
+    ):
+        super().__init__(
+            tag,
+            default=default,
+            on_serialize=on_serialize,
+            on_deserialize=on_deserialize,
+        )
+        self._reference_fields = [
+            ReferenceField(t, ref_type)
+            for t, ref_type in tag_to_reference_types.items()
+        ]
+
+    @property
+    def reference_fields(self):
+        return self._reference_fields
+
+    @property
+    def field_type(self) -> AbstractFieldType:  # pragma: no cover
+        # has no effect here; just return FieldTypes.any,
+        # the actual check is done in __set__ instead
+        return FieldTypes.any
+
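+    # Each candidate reference field is tried in declaration order; the first
+    # one whose validation succeeds wins, otherwise a TypeError listing all
+    # accepted types is raised.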
+    def __set__(self, instance, value):
+        if not _is_ci:  # pragma: no cover
+            return self.set(instance, value)
+
+        for reference_field in self._reference_fields:
+            try:
+                to_check_value = value
+                if to_check_value is not None and self._on_serialize:
+                    to_check_value = self._on_serialize(to_check_value)
+                reference_field.get_field_type(instance).validate(to_check_value)
+                self.set(instance, value)
+                return
+            except TypeError:
+                continue
+        valid_types = list(
+            itertools.chain(
+                *[
+                    r.get_field_type(instance).valid_types
+                    for r in self._reference_fields
+                ]
+            )
+        )
+        raise TypeError(
+            f"Failed to set `{self.name}` for {type(instance).__name__} "
+            f"when environ CI=true is set: type of instance cannot match any "
+            f"of {valid_types}, got {type(value).__name__}"
+        )
diff --git a/python/xorbits/_mars/serialization/serializables/field_type.py b/python/xorbits/_mars/serialization/serializables/field_type.py
new file mode 100644
index 000000000..24e01308f
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/field_type.py
@@ -0,0 +1,559 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, ABCMeta, abstractmethod
+from datetime import datetime, timedelta, tzinfo
+from enum import Enum
+from typing import Tuple, Type
+
+import numpy as np
+import pandas as pd
+
+from ...utils import lazy_import
+
+cupy = lazy_import("cupy")
+cudf = lazy_import("cudf")
+
+
+class PrimitiveType(Enum):
+    bool = 1
+    int8 = 2
+    int16 = 3
+    int32 = 4
+    int64 = 5
+    uint8 = 6
+    uint16 = 7
+    uint32 = 8
+    uint64 = 9
+    float16 = 10
+    float32 = 11
+    float64 = 12
+    bytes = 13
+    string = 14
+    complex64 = 24
+    complex128 = 25
+
+
+_primitive_type_to_valid_types = {
+    PrimitiveType.bool: (bool, np.bool_),
+    PrimitiveType.int8: (int, np.int8),
+    PrimitiveType.int16: (int, np.int16),
+    PrimitiveType.int32: (int, np.int32),
+    PrimitiveType.int64: (int, np.int64),
+    PrimitiveType.uint8: (int, np.uint8),
+    PrimitiveType.uint16: (int, np.uint16),
+    PrimitiveType.uint32: (int, np.uint32),
+    PrimitiveType.uint64: (int, np.uint64),
+    PrimitiveType.float16: (float, np.float16),
+    PrimitiveType.float32: (float, np.float32),
+    PrimitiveType.float64: (float, np.float64),
+    PrimitiveType.bytes: (bytes, np.bytes_),
+    PrimitiveType.string: (str, np.unicode_),
+    PrimitiveType.complex64: (complex, np.complex64),
+    PrimitiveType.complex128: (complex, np.complex128),
+}
+
+
+class AbstractFieldType(ABC):
+    __slots__ = ()
+
+    @property
+    @abstractmethod
+    def type_name(self) -> str:
+        """
+        Type name.
+
+        Returns
+        -------
+        type_name : str
+        """
+
+    @property
+    def name(self) -> str:
+        """
+        Name of field type instance.
+
+        Returns
+        -------
+        name : str
+        """
+        return self.type_name.capitalize()
+
+    @property
+    @abstractmethod
+    def valid_types(self) -> Tuple[Type, ...]:
+        """
+        Valid types.
+
+        Returns
+        -------
+        valid_types: tuple
+            Valid types.
+        """
+
+    def validate(self, value):
+        if value is not None and not isinstance(value, self.valid_types):
+            raise TypeError(
+                f"value needs to be instance "
+                f"of {self.valid_types}, got {type(value)}"
+            )
+
+    def __call__(self, *args, **kwargs):
+        return type(self)(*args, **kwargs)
+
+
+class SingletonFieldType(AbstractFieldType, metaclass=ABCMeta):
+    __slots__ = ()
+
+    _instance = None
+
+    def __new__(cls, *args, **kw):
+        if cls._instance is None:
+            inst = super().__new__(cls, *args, **kw)
+            cls._instance = inst
+        return cls._instance
+
+
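+# PrimitiveFieldType caches one instance per PrimitiveType member, so e.g.
+# every PrimitiveFieldType(PrimitiveType.int64) is the same object.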
+class PrimitiveFieldType(AbstractFieldType):
+    __slots__ = ("type",)
+
+    _type_to_instances = dict()
+
+    def __new__(cls, *args, **kwargs):
+        primitive_type = args[0]
+        try:
+            return cls._type_to_instances[primitive_type]
+        except KeyError:
+            inst = cls._type_to_instances[primitive_type] = super().__new__(cls)
+            return inst
+
+    def __init__(self, primitive_type: PrimitiveType):
+        self.type = primitive_type
+
+    @property
+    def type_name(self) -> str:
+        return self.type.name
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return _primitive_type_to_valid_types[self.type]
+
+
+class SliceType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "slice"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (slice,)
+
+
+class NDArrayType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "ndarray"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        if cupy is None:
+            return (np.ndarray,)
+        else:
+            return np.ndarray, cupy.ndarray
+
+
+class DtypeType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "dtype"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return np.dtype, pd.api.extensions.ExtensionDtype
+
+
+class KeyType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "dtype"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        from ...core.entity import ENTITY_TYPE
+
+        return ENTITY_TYPE
+
+
+class DatetimeType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "datetime"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return datetime, pd.Timestamp
+
+
+class TimedeltaType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "timedelta"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return timedelta, pd.Timedelta
+
+
+class IndexType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "index"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        if cudf is None:
+            return (pd.Index,)
+        else:
+            return pd.Index, cudf.Index
+
+
+class SeriesType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "series"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        if cudf is None:
+            return (pd.Series,)
+        else:
+            return pd.Series, cudf.Series
+
+
+class DataFrameType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "dataframe"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        if cudf is None:
+            return (pd.DataFrame,)
+        else:
+            return pd.DataFrame, cudf.DataFrame
+
+
+class FunctionType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "function"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:  # pragma: no cover
+        return ()
+
+    def validate(self, value):
+        if value is not None and not callable(value):
+            raise TypeError(f"value should be a function, got {type(value)}")
+
+
+class NamedtupleType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "namedtuple"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (tuple,)
+
+    def validate(self, value):
+        if not (isinstance(value, self.valid_types) and hasattr(value, "_fields")):
+            raise TypeError(
+                f"value should be instance of namedtuple, got {type(value)}"
+            )
+
+
+class TZInfoType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "tzinfo"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (tzinfo,)
+
+
+class IntervalArrayType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "interval_array"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (pd.arrays.IntervalArray,)
+
+
+class AnyType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "any"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:  # pragma: no cover
+        return ()
+
+    def validate(self, value):
+        # any type is valid
+        return
+
+
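+# A collection type declared as (T, ...) is homogeneous: every item is
+# validated against T. Otherwise each position is validated against the
+# field type declared for it, and the lengths must match.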
+class _CollectionType(AbstractFieldType, metaclass=ABCMeta):
+    __slots__ = ("_field_types",)
+
+    def __init__(self, *field_types):
+        self._field_types = field_types
+        if len(field_types) == 0:
+            self._field_types = (AnyType(), Ellipsis)
+
+    @property
+    def name(self) -> str:
+        base_name = super().name
+        if self.is_homogeneous():
+            if isinstance(self._field_types[0], AnyType):
+                return base_name
+            else:
+                return f"{base_name}[{self._field_types[0].name}, ...]"
+        else:
+            field_type_names = ", ".join([ft.name for ft in self._field_types])
+            return f"{base_name}[{field_type_names}]"
+
+    def is_homogeneous(self):
+        return len(self._field_types) == 1 or (
+            len(self._field_types) == 2 and self._field_types[1] is Ellipsis
+        )
+
+    def validate(self, value):
+        if value is None:
+            return
+        if not isinstance(value, self.valid_types):
+            raise TypeError(
+                f"value should be instance of {self.valid_types}, got {type(value)}"
+            )
+        if self.is_homogeneous():
+            field_type: AbstractFieldType = self._field_types[0]
+            if not isinstance(field_type, AnyType):
+                for item in value:
+                    try:
+                        field_type.validate(item)
+                    except TypeError:
+                        raise TypeError(
+                            f"item should be instance of "
+                            f"{field_type.valid_types}, "
+                            f"got {type(item)}"
+                        )
+        else:
+            if len(value) != len(self._field_types):
+                raise ValueError(
+                    f"value should own {len(self._field_types)} items, "
+                    f"got {len(value)} items"
+                )
+            for expect_field_type, item in zip(self._field_types, value):
+                try:
+                    expect_field_type.validate(item)
+                except TypeError:
+                    raise TypeError(
+                        f"item should be instance of "
+                        f"{expect_field_type.valid_types}, "
+                        f"got {type(item)}"
+                    )
+
+
+class ListType(_CollectionType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "list"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (list,)
+
+
+class TupleType(_CollectionType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "tuple"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (tuple,)
+
+
+class DictType(AbstractFieldType):
+    __slots__ = "key_type", "value_type"
+
+    key_type: AbstractFieldType
+    value_type: AbstractFieldType
+
+    def __init__(
+        self, key_type: AbstractFieldType = None, value_type: AbstractFieldType = None
+    ):
+        if key_type is None:
+            key_type = AnyType()
+        if value_type is None:
+            value_type = AnyType()
+        self.key_type = key_type
+        self.value_type = value_type
+
+    @property
+    def type_name(self) -> str:
+        return "dict"
+
+    @property
+    def name(self) -> str:
+        if isinstance(self.key_type, AnyType) and isinstance(self.value_type, AnyType):
+            return "Dict"
+        else:
+            return f"Dict[{self.key_type.name}, {self.value_type.name}]"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (dict,)
+
+    def validate(self, value):
+        super().validate(value)
+        if value is None:
+            return
+        for k, v in value.items():
+            try:
+                self.key_type.validate(k)
+            except TypeError:
+                raise TypeError(
+                    f"key should be instance of "
+                    f"{self.key_type.valid_types}, got {type(k)}"
+                )
+            try:
+                self.value_type.validate(v)
+            except TypeError:
+                raise TypeError(
+                    f"value should be instance of "
+                    f"{self.value_type.valid_types}, got {type(v)}"
+                )
+
+
+class ReferenceType(AbstractFieldType):
+    __slots__ = ("reference_type",)
+
+    reference_type: Type
+
+    def __init__(self, reference_type: Type = None):
+        if reference_type is None:
+            reference_type = object
+        self.reference_type = reference_type
+
+    @property
+    def type_name(self) -> str:
+        return "reference"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (self.reference_type,)
+
+
+class FieldTypes:
+    # primitive type
+    bool = PrimitiveFieldType(PrimitiveType.bool)
+    int8 = PrimitiveFieldType(PrimitiveType.int8)
+    int16 = PrimitiveFieldType(PrimitiveType.int16)
+    int32 = PrimitiveFieldType(PrimitiveType.int32)
+    int64 = PrimitiveFieldType(PrimitiveType.int64)
+    uint8 = PrimitiveFieldType(PrimitiveType.uint8)
+    uint16 = PrimitiveFieldType(PrimitiveType.uint16)
+    uint32 = PrimitiveFieldType(PrimitiveType.uint32)
+    uint64 = PrimitiveFieldType(PrimitiveType.uint64)
+    float16 = PrimitiveFieldType(PrimitiveType.float16)
+    float32 = PrimitiveFieldType(PrimitiveType.float32)
+    float64 = PrimitiveFieldType(PrimitiveType.float64)
+    complex64 = PrimitiveFieldType(PrimitiveType.complex64)
+    complex128 = PrimitiveFieldType(PrimitiveType.complex128)
+    bytes = PrimitiveFieldType(PrimitiveType.bytes)
+    string = PrimitiveFieldType(PrimitiveType.string)
+
+    key = KeyType()
+
+    # Python types
+    slice = SliceType()
+    datetime = DatetimeType()
+    # alias of datetime
+    datatime64 = DatetimeType()
+    timedelta = TimedeltaType()
+    # alias of timedelta
+    timedelta64 = TimedeltaType()
+    tzinfo = TZInfoType()
+    function = FunctionType()
+    namedtuple = NamedtupleType()
+    reference = ReferenceType()
+    any = AnyType()
+    # equivalent to any
+    pickled = AnyType()
+
+    # collection
+    list = ListType()
+    tuple = TupleType()
+    dict = DictType()
+
+    # numpy
+    ndarray = NDArrayType()
+    # alias of ndarray
+    arr = NDArrayType()
+    dtype = DtypeType()
+
+    # pandas
+    index = IndexType()
+    series = SeriesType()
+    dataframe = DataFrameType()
+    interval_array = IntervalArrayType()
+    # alias of interval_array
+    interval_arr = IntervalArrayType()
diff --git a/python/xorbits/_mars/serialization/serializables/tests/__init__.py b/python/xorbits/_mars/serialization/serializables/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/serialization/serializables/tests/test_field_type.py b/python/xorbits/_mars/serialization/serializables/tests/test_field_type.py
new file mode 100644
index 000000000..3dc8275ea
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/tests/test_field_type.py
@@ -0,0 +1,121 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import namedtuple
+from datetime import datetime, timedelta, timezone
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from ....core import EntityData
+from .. import FieldTypes
+
+
+class MyClass(EntityData):
+    __slots__ = ()
+
+    @staticmethod
+    def my_func():
+        """
+        Test function
+        """
+
+
+my_named_tuple = namedtuple("my_named_tuple", "a b")
+
+
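+# Parametrized table consumed by test_field_type below: validate() must
+# accept every valid value and raise TypeError for every invalid one.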
+fields_values = [
+    # field_type, valid values, invalid values
+    [FieldTypes.bool, [True, np.bool_(False)], [1]],
+    [FieldTypes.int8, [8, np.int8(8)], [8.0]],
+    [FieldTypes.int16, [16, np.int16(16)], [16.0]],
+    [FieldTypes.int32, [32, np.int32(32)], [64.0]],
+    [FieldTypes.uint8, [8, np.uint8(8)], [8.0]],
+    [FieldTypes.uint16, [16, np.uint16(16)], [16.0]],
+    [FieldTypes.uint32, [32, np.uint32(32)], [32.0]],
+    [FieldTypes.uint64, [64, np.uint64(64)], [64.0]],
+    [FieldTypes.float16, [16.0, np.float16(16)], [16]],
+    [FieldTypes.float32, [32.0, np.float32(32)], [32]],
+    [FieldTypes.float64, [64.0, np.float64(64)], [64]],
+    [FieldTypes.complex64, [1 + 2j, np.complex64(1 + 2j)], [64]],
+    [FieldTypes.complex128, [1 + 2j, np.complex128(1 + 2j)], [128]],
+    [FieldTypes.bytes, [b"abc", np.bytes_("abc")], ["abc"]],
+    [FieldTypes.string, ["abc", np.str_("abc")], [b"abc"]],
+    [FieldTypes.ndarray, [np.array([1, 2, 3])], [object()]],
+    [FieldTypes.dtype, [np.dtype(np.int32), pd.StringDtype()], [object()]],
+    [FieldTypes.key, [MyClass()], [object()]],
+    [FieldTypes.slice, [slice(1, 10), slice("a", "b")], [object()]],
+    [FieldTypes.datetime, [datetime.now(), pd.Timestamp(0)], [object()]],
+    [FieldTypes.timedelta, [timedelta(days=1), pd.Timedelta(days=1)], [object()]],
+    [FieldTypes.tzinfo, [timezone.utc], [object()]],
+    [FieldTypes.index, [pd.RangeIndex(10), pd.Index([1, 2])], [object()]],
+    [FieldTypes.series, [pd.Series([1, 2, 3])], [object()]],
+    [FieldTypes.dataframe, [pd.DataFrame({"a": [1, 2]})], [object()]],
+    [FieldTypes.interval_array, [pd.arrays.IntervalArray([])], [object()]],
+    [FieldTypes.function, [MyClass.my_func], [object()]],
+    [FieldTypes.namedtuple, [my_named_tuple(a=1, b=2)], [tuple()]],
+    [FieldTypes.reference(MyClass), [MyClass()], [object()]],
+    [
+        FieldTypes.tuple(FieldTypes.int64, ...),
+        [tuple(), tuple([1, 2])],
+        [list(), tuple([1, 2.0])],
+    ],
+    [
+        FieldTypes.list(FieldTypes.int64, FieldTypes.float64),
+        [[1, 1.0]],
+        [tuple(), [1, 1]],
+    ],
+    [
+        FieldTypes.dict(FieldTypes.string, FieldTypes.int64),
+        [{"a": 1}],
+        [{1: "a"}, {"a": 1.0}],
+    ],
+    [FieldTypes.any, [object()], []],
+]
+
+
+@pytest.mark.parametrize("field_type, valid_values, invalid_values", fields_values)
+def test_field_type(field_type, valid_values, invalid_values):
+    assert isinstance(field_type.type_name, str)
+    assert isinstance(field_type.name, str)
+
+    for valid_value in valid_values:
+        field_type.validate(valid_value)
+
+    for invalid_value in invalid_values:
+        with pytest.raises(TypeError):
+            field_type.validate(invalid_value)
+
+
+def test_collection_field_error():
+    with pytest.raises(ValueError):
+        FieldTypes.tuple(FieldTypes.int64, FieldTypes.float32).validate(
+            tuple([1, 3.0, 3.0])
+        )
+
+
+def test_field_name():
+    assert FieldTypes.list().name == "List"
+    assert (
+        FieldTypes.list(FieldTypes.int64, FieldTypes.float32).name
+        == "List[Int64, Float32]"
+    )
+    assert FieldTypes.tuple(FieldTypes.int8, ...).name == "Tuple[Int8, ...]"
+    assert FieldTypes.tuple(FieldTypes.int8).name == "Tuple[Int8, ...]"
+    assert FieldTypes.dict().name == "Dict"
+    assert (
+        FieldTypes.dict(FieldTypes.int8, FieldTypes.float64).name
+        == "Dict[Int8, Float64]"
+    )
diff --git a/python/xorbits/_mars/serialization/serializables/tests/test_serializable.py b/python/xorbits/_mars/serialization/serializables/tests/test_serializable.py
new file mode 100644
index 000000000..a60a0349a
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/tests/test_serializable.py
@@ -0,0 +1,261 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import os
+from collections import namedtuple
+from datetime import timezone
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from ....core import EntityData
+from ....utils import no_default
+from ... import deserialize, serialize
+from .. import (
+    AnyField,
+    BoolField,
+    BytesField,
+    Complex64Field,
+    Complex128Field,
+    DataFrameField,
+    DataTypeField,
+    Datetime64Field,
+    DictField,
+    FieldTypes,
+    Float16Field,
+    Float32Field,
+    Float64Field,
+    FunctionField,
+    IdentityField,
+    IndexField,
+    Int8Field,
+    Int16Field,
+    Int32Field,
+    Int64Field,
+    IntervalArrayField,
+    KeyField,
+    ListField,
+    NamedTupleField,
+    NDArrayField,
+    OneOfField,
+    ReferenceField,
+    Serializable,
+    SeriesField,
+    SliceField,
+    StringField,
+    Timedelta64Field,
+    TupleField,
+    TZInfoField,
+    UInt8Field,
+    UInt16Field,
+    UInt32Field,
+    UInt64Field,
+)
+
+my_namedtuple = namedtuple("my_namedtuple", "a, b")
+
+
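+# Field validation is switched on by the CI environment variable at import
+# time, so the fixture sets CI and reloads the core/field modules to make
+# each parametrized run pick up the new value.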
+@pytest.fixture(autouse=True)
+def set_environ(request):
+    from .. import core, field
+
+    exist_env = os.environ.get("CI", no_default)
+    env_to_set = getattr(request, "param", None) or "true"
+
+    try:
+        os.environ["CI"] = env_to_set
+        core.SerializableSerializer.unregister(core.Serializable)
+        importlib.reload(core)
+        importlib.reload(field)
+        yield
+    finally:
+        if exist_env is no_default:
+            os.environ.pop("CI", None)
+        else:
+            os.environ["CI"] = exist_env
+        core.SerializableSerializer.unregister(core.Serializable)
+        importlib.reload(core)
+        importlib.reload(field)
+
+
+class MyHasKey(EntityData):
+    def __init__(self, key=None, **kw):
+        super().__init__(_key=key, **kw)
+        self._id = "1"
+
+    def __eq__(self, other):
+        return isinstance(other, MyHasKey) and other._key == self._key
+
+
+class MySimpleSerializable(Serializable):
+    _id = IdentityField("id")
+    _int_val = Int64Field("int_val", default=1000)
+    _list_val = ListField("list_val", default_factory=list)
+    _ref_val = ReferenceField("ref_val", "MySimpleSerializable")
+
+
+class MySerializable(Serializable):
+    _id = IdentityField("id")
+    _any_val = AnyField("any_val")
+    _bool_val = BoolField("bool_val")
+    _int8_val = Int8Field("int8_val")
+    _int16_val = Int16Field("int16_val")
+    _int32_val = Int32Field("int32_val")
+    _int64_val = Int64Field("int64_val")
+    _uint8_val = UInt8Field("uint8_val")
+    _uint16_val = UInt16Field("uint16_val")
+    _uint32_val = UInt32Field("uint32_val")
+    _uint64_val = UInt64Field("uint64_val")
+    _float16_val = Float16Field("float16_val")
+    _float32_val = Float32Field(
+        "float32_val", on_serialize=lambda x: x + 1, on_deserialize=lambda x: x - 1
+    )
+    _float64_val = Float64Field("float64_val")
+    _complex64_val = Complex64Field("complex64_val")
+    _complex128_val = Complex128Field("complex128_val")
+    _string_val = StringField("string_val")
+    _bytes_val = BytesField("bytes_val")
+    _key_val = KeyField("key_val")
+    _ndarray_val = NDArrayField("ndarray_val")
+    _datetime64_val = Datetime64Field("datetime64_val")
+    _timedelta64_val = Timedelta64Field("timedelta64_val")
+    _datatype_val = DataTypeField("datatype_val")
+    _index_val = IndexField("index_val")
+    _series_val = SeriesField("series_val")
+    _dataframe_val = DataFrameField("dataframe_val")
+    _interval_array_val = IntervalArrayField("interval_array_val")
+    _slice_val = SliceField("slice_val")
+    _function_val = FunctionField("function_val")
+    _named_tuple_val = NamedTupleField("named_tuple_val")
+    _tzinfo_val = TZInfoField("tzinfo_val")
+    _list_val = ListField("list_val", FieldTypes.int64)
+    _tuple_val = TupleField("tuple_val", FieldTypes.string)
+    _dict_val = DictField("dict_val", FieldTypes.string, FieldTypes.bytes)
+    _ref_val = ReferenceField("ref_val", "self")
+    _ref_val2 = ReferenceField("ref_val2", MySimpleSerializable)
+    _oneof_val = OneOfField(
+        "ref_val",
+        oneof1_val=f"{__name__}.MySerializable",
+        oneof2_val=MySimpleSerializable,
+    )
+
+
+@pytest.mark.parametrize("set_environ", ["false", "true"], indirect=True)
+def test_serializable(set_environ):
+    my_serializable = MySerializable(
+        _id="1",
+        _any_val="any_value",
+        _bool_val=True,
+        _int8_val=-8,
+        _int16_val=np.int16(-16),
+        _int32_val=-32,
+        _int64_val=-64,
+        _uint8_val=8,
+        _uint16_val=16,
+        _uint32_val=np.uint32(32),
+        _uint64_val=64,
+        _float16_val=1.0,
+        _float32_val=np.float32(2.0),
+        _float64_val=2.0,
+        _complex64_val=np.complex64(1 + 2j),
+        _complex128_val=1 + 2j,
+        _string_val="string_value",
+        _bytes_val=b"bytes_value",
+        _key_val=MyHasKey("aaa"),
+        _ndarray_val=np.random.rand(4, 3),
+        _datetime64_val=pd.Timestamp(123),
+        _timedelta64_val=pd.Timedelta(days=1),
+        _datatype_val=np.dtype(np.int32),
+        _index_val=pd.Index([1, 2]),
+        _series_val=pd.Series(["a", "b"]),
+        _dataframe_val=pd.DataFrame({"a": [1, 2, 3]}),
+        _interval_array_val=pd.arrays.IntervalArray([]),
+        _slice_val=slice(1, 10, 2),
+        _function_val=lambda x: x + 1,
+        _named_tuple_val=my_namedtuple(a=1, b=2),
+        _tzinfo_val=timezone.utc,
+        _list_val=[1, 2],
+        _tuple_val=("a", "b"),
+        _dict_val={"a": b"bytes_value"},
+        _ref_val=MySerializable(),
+        _oneof_val=MySerializable(_id="2"),
+    )
+
+    header, buffers = serialize(my_serializable)
+    my_serializable2 = deserialize(header, buffers)
+    _assert_serializable_eq(my_serializable, my_serializable2)
+
+
+def _assert_serializable_eq(my_serializable, my_serializable2):
+    for field_name, field in my_serializable._FIELDS.items():
+        if not hasattr(my_serializable, field.tag):
+            continue
+        expect_value = getattr(my_serializable, field_name)
+        actual_value = getattr(my_serializable2, field_name)
+        if isinstance(expect_value, np.ndarray):
+            np.testing.assert_array_equal(expect_value, actual_value)
+        elif isinstance(expect_value, pd.DataFrame):
+            pd.testing.assert_frame_equal(expect_value, actual_value)
+        elif isinstance(expect_value, pd.Series):
+            pd.testing.assert_series_equal(expect_value, actual_value)
+        elif isinstance(expect_value, pd.Index):
+            pd.testing.assert_index_equal(expect_value, actual_value)
+        elif isinstance(expect_value, pd.api.extensions.ExtensionArray):
+            pd.testing.assert_extension_array_equal(expect_value, actual_value)
+        elif isinstance(expect_value, (MySimpleSerializable, MySerializable)):
+            _assert_serializable_eq(expect_value, actual_value)
+        elif callable(expect_value):
+            assert expect_value(1) == actual_value(1)
+        else:
+            assert expect_value == actual_value
+
+
+def test_fields_errors():
+    my_simple = MySimpleSerializable(_id="1", _ref_val=MySimpleSerializable(_id="2"))
+    my_serializable = MySerializable(_oneof_val=my_simple)
+
+    with pytest.raises(TypeError) as exc_info:
+        my_simple._int_val = "10"
+    assert "_int_val" in str(exc_info.value)
+
+    del my_simple._ref_val
+    with pytest.raises(AttributeError):
+        _ = my_simple._ref_val
+
+    del my_simple._id
+    with pytest.raises(AttributeError):
+        _ = my_simple._id
+
+    assert my_simple._int_val == 1000
+    assert my_simple._list_val == []
+
+    del my_serializable._oneof_val
+    with pytest.raises(AttributeError):
+        _ = my_serializable._oneof_val
+
+    my_serializable._ref_val2 = MySimpleSerializable(_id="3")
+    del my_serializable._ref_val2
+    with pytest.raises(AttributeError):
+        _ = my_serializable._ref_val2
+
+    with pytest.raises(TypeError):
+        my_serializable._ref_val = my_simple
+
+    with pytest.raises(TypeError):
+        my_serializable._oneof_val = 1
+
+    with pytest.raises(AttributeError):
+        del my_serializable._oneof_val
diff --git a/python/xorbits/_mars/serialization/tests/__init__.py b/python/xorbits/_mars/serialization/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/serialization/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/serialization/tests/test_serial.py b/python/xorbits/_mars/serialization/tests/test_serial.py
new file mode 100644
index 000000000..02bc475a5
--- /dev/null
+++ b/python/xorbits/_mars/serialization/tests/test_serial.py
@@ -0,0 +1,322 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import threading
+from collections import OrderedDict, defaultdict
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+import pandas as pd
+import pytest
+
+try:
+    import pyarrow as pa
+except ImportError:
+    pa = None
+try:
+    import scipy.sparse as sps
+except ImportError:
+    sps = None
+
+from ...lib.sparse import SparseMatrix
+from ...tests.core import require_cudf, require_cupy
+from ...utils import lazy_import
+from .. import deserialize, serialize, serialize_with_spawn
+from ..core import ListSerializer, Placeholder
+
+cupy = lazy_import("cupy")
+cudf = lazy_import("cudf")
+
+
+class CustomList(list):
+    pass
+
+
+@pytest.mark.parametrize(
+    "val",
+    [
+        None,
+        False,
+        123,
+        3.567,
+        3.5 + 4.3j,
+        b"abcd",
+        "abcd",
+        ["uvw", ("mno", "sdaf"), 4, 6.7],
+        CustomList([3, 4, CustomList([5, 6])]),
+        {"abc": 5.6, "def": [3.4], "gh": None, "ijk": {}},
+        OrderedDict([("abcd", 5.6)]),
+        defaultdict(lambda: 0, [("abcd", 0)]),
+    ],
+)
+def test_core(val):
+    deserialized = deserialize(*serialize(val))
+    assert type(val) == type(deserialized)
+    assert val == deserialized
+
+
+def test_strings():
+    str_obj = "abcd" * 1024
+    obj = [str_obj, str_obj]
+    header, bufs = serialize(obj)
+    assert len(header) < len(str_obj) * 2
+    bufs = [memoryview(buf) for buf in bufs]
+    assert obj == deserialize(header, bufs)
+
+
+def test_placeholder_obj():
+    assert Placeholder(1024) == Placeholder(1024)
+    assert hash(Placeholder(1024)) == hash(Placeholder(1024))
+    assert Placeholder(1024) != Placeholder(1023)
+    assert hash(Placeholder(1024)) != hash(Placeholder(1023))
+    assert Placeholder(1024) != 1024
+    assert "1024" in repr(Placeholder(1024))
+
+
+def test_nested_list():
+    val = [b"a" * 1200] * 10
+    val[0] = val
+    deserialized = deserialize(*serialize(val))
+    assert deserialized[0] is deserialized
+    assert val[1:] == deserialized[1:]
+
+
+class KeyedDict(dict):
+    def _skeys(self):
+        return set(k for k in self.keys() if isinstance(k, str))
+
+    def __hash__(self):
+        return hash(frozenset(self._skeys()))
+
+    def __eq__(self, other: "KeyedDict"):
+        return self._skeys() == other._skeys()
+
+
+def test_nested_dict():
+    val = {i: "b" * 100 for i in range(10)}
+    val[0] = val
+    deserialized = deserialize(*serialize(val))
+    assert deserialized[0] is deserialized
+
+    val = KeyedDict(abcd="efgh")
+    val[val] = val
+    deserialized = deserialize(*serialize(val))
+    assert deserialized[val] is deserialized
+
+
+class DictWithoutInitArgs(dict):
+    # dict inheritance without args in __init__
+    def __init__(self):
+        super().__init__()
+
+
+def test_dict_without_init_args():
+    val = DictWithoutInitArgs()
+    val["a"] = "b"
+    deserialized = deserialize(*serialize(val))
+    assert deserialized == val
+
+
+@pytest.mark.parametrize(
+    "val",
+    [
+        np.array(np.random.rand(100, 100)),
+        np.array(np.random.rand(100, 100).T),
+        np.array(["a", "bcd", None]),
+    ],
+)
+def test_numpy(val):
+    deserialized = deserialize(*serialize(val))
+    assert type(val) == type(deserialized)
+    np.testing.assert_equal(val, deserialized)
+    if val.flags.f_contiguous:
+        assert deserialized.flags.f_contiguous
+
+
+def test_pandas():
+    val = pd.Series([1, 2, 3, 4])
+    pd.testing.assert_series_equal(val, deserialize(*serialize(val)))
+
+    val = pd.DataFrame(
+        {
+            "a": np.random.rand(1000),
+            "b": np.random.choice(list("abcd"), size=(1000,)),
+            "c": np.random.randint(0, 100, size=(1000,)),
+        }
+    )
+    pd.testing.assert_frame_equal(val, deserialize(*serialize(val)))
+
+
+@pytest.mark.skipif(pa is None, reason="need pyarrow to run the cases")
+def test_arrow():
+    test_df = pd.DataFrame(
+        {
+            "a": np.random.rand(1000),
+            "b": np.random.choice(list("abcd"), size=(1000,)),
+            "c": np.random.randint(0, 100, size=(1000,)),
+        }
+    )
+    test_vals = [
+        pa.RecordBatch.from_pandas(test_df),
+        pa.Table.from_pandas(test_df),
+    ]
+    for val in test_vals:
+        deserialized = deserialize(*serialize(val))
+        assert type(val) is type(deserialized)
+        np.testing.assert_equal(val, deserialized)
+
+
+@pytest.mark.parametrize(
+    "np_val",
+    [np.random.rand(100, 100), np.random.rand(100, 100).T],
+)
+@require_cupy
+def test_cupy(np_val):
+    val = cupy.array(np_val)
+    deserialized = deserialize(*serialize(val))
+    assert type(val) is type(deserialized)
+    cupy.testing.assert_array_equal(val, deserialized)
+
+
+@require_cudf
+def test_cudf():
+    raw_df = pd.DataFrame(
+        {
+            "a": np.random.rand(1000),
+            "b": np.random.choice(list("abcd"), size=(1000,)),
+            "c": np.random.randint(0, 100, size=(1000,)),
+        }
+    )
+    test_df = cudf.DataFrame(raw_df)
+    cudf.testing.assert_frame_equal(test_df, deserialize(*serialize(test_df)))
+
+    raw_df.columns = pd.MultiIndex.from_tuples([("a", "a"), ("a", "b"), ("b", "c")])
+    test_df = cudf.DataFrame(raw_df)
+    cudf.testing.assert_frame_equal(test_df, deserialize(*serialize(test_df)))
+
+
+@pytest.mark.skipif(sps is None, reason="need scipy to run the test")
+def test_scipy_sparse():
+    val = sps.random(100, 100, 0.1, format="csr")
+    deserial = deserialize(*serialize(val))
+    assert (val != deserial).nnz == 0
+
+
+@pytest.mark.skipif(sps is None, reason="need scipy to run the test")
+def test_mars_sparse():
+    val = SparseMatrix(sps.random(100, 100, 0.1, format="csr"))
+    deserial = deserialize(*serialize(val))
+    assert (val.spmatrix != deserial.spmatrix).nnz == 0
+
+
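+# A serializer that deliberately fails during deserialization so the tests
+# can exercise on_deserial_error, which may rewrite a child's error into a
+# new exception chained to the original one.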
+class MockSerializerForErrors(ListSerializer):
+    serializer_id = 25951
+    raises = False
+
+    def on_deserial_error(
+        self,
+        serialized: Tuple,
+        context: Dict,
+        subs_serialized: List,
+        error_index: int,
+        exc: BaseException,
+    ):
+        assert serialized[2] is CustomList  # obj_type field of ListSerializer
+        assert error_index == 1
+        assert subs_serialized[error_index]
+        try:
+            raise SystemError from exc
+        except BaseException as ex:
+            return ex
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List[Any]):
+        if len(subs) == 2 and self.raises:
+            raise TypeError
+        return super().deserial(serialized, context, subs)
+
+
+class UnpickleWithError:
+    def __getstate__(self):
+        return (None,)
+
+    def __setstate__(self, state):
+        raise ValueError
+
+
+def test_deserial_errors():
+    try:
+        MockSerializerForErrors.raises = False
+        MockSerializerForErrors.register(CustomList)
+        ListSerializer.register(CustomList, name="test_name")
+
+        # error of leaf object is raised
+        obj = [1, [[3, UnpickleWithError()]]]
+        with pytest.raises(ValueError):
+            deserialize(*serialize(obj))
+
+        # error of leaf object is rewritten in parent object
+        obj = CustomList([[1], [[3, UnpickleWithError()]]])
+        with pytest.raises(SystemError) as exc_info:
+            deserialize(*serialize(obj))
+        assert isinstance(exc_info.value.__cause__, ValueError)
+
+        MockSerializerForErrors.raises = True
+
+        # error of non-leaf object is raised
+        obj = [CustomList([[1], [[2]]])]
+        with pytest.raises(TypeError):
+            deserialize(*serialize(obj))
+        deserialize(*serialize(obj, {"serializer": "test_name"}))
+
+        # error of non-leaf CustomList is rewritten in parent object
+        obj = CustomList([[1], CustomList([[1], [[2]]]), [2]])
+        with pytest.raises(SystemError) as exc_info:
+            deserialize(*serialize(obj))
+        assert isinstance(exc_info.value.__cause__, TypeError)
+        deserialize(*serialize(obj, {"serializer": "test_name"}))
+    finally:
+        MockSerializerForErrors.unregister(CustomList)
+        ListSerializer.unregister(CustomList, name="test_name")
+        # Above unregister will remove the ListSerializer from deserializers,
+        # so we need to register ListSerializer again to make the
+        # deserializers correct.
+        ListSerializer.register(list)
+
+
+class MockSerializerForSpawn(ListSerializer):
+    thread_calls = defaultdict(lambda: 0)
+
+    def serial(self, obj: Any, context: Dict):
+        self.thread_calls[threading.current_thread().ident] += 1
+        return super().serial(obj, context)
+
+
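+# The test expects serialize_with_spawn to serialize the first
+# `spawn_threshold` nested objects on the calling thread and the remaining
+# ones on other threads, which the per-thread call counts verify.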
+@pytest.mark.asyncio
+async def test_spawn_threshold():
+    try:
+        assert 0 == deserialize(*(await serialize_with_spawn(0)))
+
+        MockSerializerForSpawn.register(CustomList)
+        obj = [CustomList([i]) for i in range(200)]
+        serialized = await serialize_with_spawn(obj, spawn_threshold=100)
+        assert serialized[0][0]["_N"] == 201
+        deserialized = deserialize(*serialized)
+        for s, d in zip(obj, deserialized):
+            assert s[0] == d[0]
+
+        calls = MockSerializerForSpawn.thread_calls
+        assert sum(calls.values()) == 200
+        assert calls[threading.current_thread().ident] == 101
+    finally:
+        MockSerializerForSpawn.unregister(CustomList)
diff --git a/python/xorbits/_mars/services/__init__.py b/python/xorbits/_mars/services/__init__.py
new file mode 100644
index 000000000..c01180bb2
--- /dev/null
+++ b/python/xorbits/_mars/services/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import (
+    NodeRole,
+    create_service_session,
+    destroy_service_session,
+    start_services,
+    stop_services,
+)
diff --git a/python/xorbits/_mars/services/cluster/__init__.py b/python/xorbits/_mars/services/cluster/__init__.py
new file mode 100644
index 000000000..1c7ee3a71
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .api import AbstractClusterAPI, ClusterAPI, MockClusterAPI, WebClusterAPI
+from .backends import AbstractClusterBackend
+from .core import (  # noqa: F401
+    DiskInfo,
+    NodeInfo,
+    NodeRole,
+    QuotaInfo,
+    StorageInfo,
+    WorkerSlotInfo,
+)
diff --git a/python/xorbits/_mars/services/cluster/api/__init__.py b/python/xorbits/_mars/services/cluster/api/__init__.py
new file mode 100644
index 000000000..1812045f1
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/api/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import AbstractClusterAPI
+from .oscar import ClusterAPI, MockClusterAPI
+from .web import WebClusterAPI
diff --git a/python/xorbits/_mars/services/cluster/api/core.py b/python/xorbits/_mars/services/cluster/api/core.py
new file mode 100644
index 000000000..dcd80c613
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/api/core.py
@@ -0,0 +1,190 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+from typing import Dict, List, Set
+
+from ....resource import Resource
+from ....typing import BandType
+from ...core import NodeRole
+from ..core import NodeStatus
+
+
+class AbstractClusterAPI:
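+    # Status filtering precedence: an explicit `statuses` set wins, otherwise
+    # everything outside `exclude_statuses`, and by default only READY nodes.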
+    @staticmethod
+    def _calc_statuses(
+        statuses: Set[NodeStatus] = None, exclude_statuses: Set[NodeStatus] = None
+    ) -> Set[NodeStatus]:
+        if statuses:
+            return statuses
+        elif exclude_statuses is not None:
+            return set(NodeStatus.__members__.values()).difference(exclude_statuses)
+        else:
+            return {NodeStatus.READY}
+
+    @abstractmethod
+    async def get_supervisors(self, filter_ready: bool = True) -> List[str]:
+        """
+        Get supervisor addresses
+
+        Returns
+        -------
+        out
+            list of supervisors
+        """
+
+    @abstractmethod
+    async def watch_supervisors(self):
+        """
+        Watch supervisor addresses
+
+        Returns
+        -------
+        out
+            generator of list of supervisors
+        """
+
+    @abstractmethod
+    async def watch_nodes(
+        self,
+        role: NodeRole,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> List[Dict[str, Dict]]:
+        """
+        Watch changes of workers
+
+        Returns
+        -------
+        out: List[Dict[str, Dict]]
+            dict of worker resources by addresses and bands
+        """
+
+    @abstractmethod
+    async def get_nodes_info(
+        self,
+        nodes: List[str] = None,
+        role: NodeRole = None,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ):
+        """
+        Get worker info
+
+        Parameters
+        ----------
+        nodes
+            address of nodes
+        role
+            roles of nodes
+        env
+            receive env info
+        resource
+            receive resource info
+        detail
+            receive detail info
+
+        Returns
+        -------
+        out: Dict
+            info of worker
+        """
+
+    @abstractmethod
+    async def get_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> Dict[BandType, Resource]:
+        """
+        Get all bands that can be used for computation.
+
+        Returns
+        -------
+        band_to_resource : dict
+            Band to resource.
+        """
+
+    @abstractmethod
+    async def watch_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ):
+        """
+        Watch all bands that can be used for computation.
+
+        Returns
+        -------
+        band_to_resource : dict
+            Band to resource.
+        """
+
+    @abstractmethod
+    async def get_mars_versions(self) -> List[str]:
+        """
+        Get versions used in current Mars cluster
+
+        Returns
+        -------
+        version_list : list
+            List of versions
+        """
+
+    @abstractmethod
+    async def get_node_pool_configs(self, address: str) -> List[Dict]:
+        """
+        Get pool configs of a Mars node
+
+        Returns
+        -------
+        config_list : List[Dict]
+            List of configs for all pool processes
+        """
+
+    async def get_node_thread_stacks(self, address: str) -> List[Dict[int, List[str]]]:
+        """
+        Get current thread pool stacks of a Mars node
+
+        Parameters
+        ----------
+        address
+
+        Returns
+        -------
+
+        """
+
+    async def fetch_node_log(self, size: int, address: str, offset: int) -> str:
+        """
+        Get current log content of a Mars node
+
+        Parameters
+        ----------
+        size
+        address
+        offset
+
+        Returns
+        -------
+
+        """
diff --git a/python/xorbits/_mars/services/cluster/api/oscar.py b/python/xorbits/_mars/services/cluster/api/oscar.py
new file mode 100644
index 000000000..c66877f7e
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/api/oscar.py
@@ -0,0 +1,412 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+from typing import Dict, List, Optional, Set, Type, TypeVar
+
+from .... import oscar as mo
+from ....lib.aio import alru_cache
+from ....resource import Resource
+from ....typing import BandType
+from ...core import NodeRole
+from ..core import (
+    DiskInfo,
+    NodeStatus,
+    QuotaInfo,
+    StorageInfo,
+    WorkerSlotInfo,
+    watch_method,
+)
+from .core import AbstractClusterAPI
+
+APIType = TypeVar("APIType", bound="ClusterAPI")
+logger = logging.getLogger(__name__)
+
+
+class ClusterAPI(AbstractClusterAPI):
+    def __init__(self, address: str):
+        self._address = address
+        self._locator_ref = None
+        self._uploader_ref = None
+
+    async def _init(self):
+        from ..locator import SupervisorLocatorActor
+        from ..uploader import NodeInfoUploaderActor
+
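+        # resolve references to the locator and uploader actors living on this node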
+        self._locator_ref = await mo.actor_ref(
+            SupervisorLocatorActor.default_uid(), address=self._address
+        )
+        self._uploader_ref = await mo.actor_ref(
+            NodeInfoUploaderActor.default_uid(), address=self._address
+        )
+
+    @classmethod
+    @alru_cache(cache_exceptions=False)
+    async def create(cls: Type[APIType], address: str) -> APIType:
+        api_obj = cls(address)
+        await api_obj._init()
+        return api_obj
+
+    @alru_cache(cache_exceptions=False)
+    async def _get_node_info_ref(self):
+        from ..supervisor.node_info import NodeInfoCollectorActor
+
+        [node_info_ref] = await self.get_supervisor_refs(
+            [NodeInfoCollectorActor.default_uid()]
+        )
+        return node_info_ref
+
+    async def get_supervisors(self, filter_ready: bool = True) -> List[str]:
+        return await self._locator_ref.get_supervisors(filter_ready=filter_ready)
+
+    @watch_method
+    async def watch_supervisors(self, version: Optional[int] = None):
+        return await self._locator_ref.watch_supervisors(version=version)
+
+    async def get_supervisors_by_keys(self, keys: List[str]) -> List[str]:
+        """
+        Get supervisor addresses hosting the specified keys
+
+        Parameters
+        ----------
+        keys
+            keys to be mapped to supervisor addresses
+
+        Returns
+        -------
+        out
+            addresses of the supervisors
+        """
+        get_supervisor = self._locator_ref.get_supervisor
+        return await get_supervisor.batch(*(get_supervisor.delay(k) for k in keys))
+
+    @watch_method
+    async def watch_supervisors_by_keys(
+        self, keys: List[str], version: Optional[int] = None
+    ):
+        return await self._locator_ref.watch_supervisors_by_keys(keys, version=version)
+
+    async def get_supervisor_refs(self, uids: List[str]) -> List[mo.ActorRef]:
+        """
+        Get references of the supervisor actors with the specified uids
+
+        Parameters
+        ----------
+        uids
+            uids of the actors to look up
+
+        Returns
+        -------
+        out : List[mo.ActorRef]
+            references of the actors
+        """
+        addrs = await self.get_supervisors_by_keys(uids)
+        if any(addr is None for addr in addrs):
+            none_uid = next(uid for addr, uid in zip(addrs, uids) if addr is None)
+            raise mo.ActorNotExist(f"Actor {none_uid} not exist as no supervisors")
+
+        return await asyncio.gather(
+            *[mo.actor_ref(uid, address=addr) for addr, uid in zip(addrs, uids)]
+        )
+
+    async def watch_supervisor_refs(self, uids: List[str]):
+        async for addrs in self.watch_supervisors_by_keys(uids):
+            yield await asyncio.gather(
+                *[mo.actor_ref(uid, address=addr) for addr, uid in zip(addrs, uids)]
+            )
+
+    @watch_method
+    async def watch_nodes(
+        self,
+        role: NodeRole,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        version: Optional[int] = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> List[Dict[str, Dict]]:
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        node_info_ref = await self._get_node_info_ref()
+        return await node_info_ref.watch_nodes(
+            role,
+            env=env,
+            resource=resource,
+            detail=detail,
+            statuses=statuses,
+            version=version,
+        )
+
+    async def get_nodes_info(
+        self,
+        nodes: List[str] = None,
+        role: NodeRole = None,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> Dict[str, Dict]:
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        node_info_ref = await self._get_node_info_ref()
+        return await node_info_ref.get_nodes_info(
+            nodes=nodes,
+            role=role,
+            env=env,
+            resource=resource,
+            detail=detail,
+            statuses=statuses,
+        )
+
+    async def set_node_status(self, node: str, role: NodeRole, status: NodeStatus):
+        """
+        Set status of node
+
+        Parameters
+        ----------
+        node : str
+            address of node
+        role: NodeRole
+            role of node
+        status : NodeStatus
+            status of node
+        """
+        node_info_ref = await self._get_node_info_ref()
+        await node_info_ref.update_node_info(node, role, status=status)
+
+    async def get_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> Dict[BandType, Resource]:
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        node_info_ref = await self._get_node_info_ref()
+        return await node_info_ref.get_all_bands(role, statuses=statuses)
+
+    @watch_method
+    async def watch_all_bands(
+        self,
+        role: NodeRole = None,
+        version: Optional[int] = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ):
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        node_info_ref = await self._get_node_info_ref()
+        return await node_info_ref.watch_all_bands(
+            role, statuses=statuses, version=version
+        )
+
+    async def get_mars_versions(self) -> List[str]:
+        node_info_ref = await self._get_node_info_ref()
+        return await node_info_ref.get_mars_versions()
+
+    async def get_bands(self) -> Dict:
+        """
+        Get bands that can be used for computation on current node.
+
+        Returns
+        -------
+        band_to_resource : dict
+            Band to resource.
+        """
+        return await self._uploader_ref.get_bands()
+
+    async def mark_node_ready(self):
+        """
+        Mark the current node as ready for workloads
+        """
+        await self._uploader_ref.mark_node_ready()
+
+    async def wait_node_ready(self):
+        """
+        Wait until the current node is ready
+        """
+        await self._uploader_ref.wait_node_ready()
+
+    async def wait_all_supervisors_ready(self):
+        """
+        Wait until all expected supervisors are ready
+        """
+        await self._locator_ref.wait_all_supervisors_ready()
+
+    async def set_band_slot_infos(
+        self, band_name: str, slot_infos: List[WorkerSlotInfo]
+    ):
+        await self._uploader_ref.set_band_slot_infos.tell(band_name, slot_infos)
+
+    async def set_band_quota_info(self, band_name: str, quota_info: QuotaInfo):
+        await self._uploader_ref.set_band_quota_info.tell(band_name, quota_info)
+
+    async def set_node_disk_info(self, disk_info: List[DiskInfo]):
+        await self._uploader_ref.set_node_disk_info(disk_info)
+
+    @mo.extensible
+    async def set_band_storage_info(self, band_name: str, storage_info: StorageInfo):
+        await self._uploader_ref.set_band_storage_info(band_name, storage_info)
+
+    async def request_worker(
+        self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None
+    ) -> str:
+        node_allocator_ref = await self._get_node_allocator_ref()
+        address = await node_allocator_ref.request_worker(
+            worker_cpu, worker_mem, timeout
+        )
+        return address
+
+    async def release_worker(self, address: str):
+        node_allocator_ref = await self._get_node_allocator_ref()
+        await node_allocator_ref.release_worker(address)
+        node_info_ref = await self._get_node_info_ref()
+        await node_info_ref.update_node_info(
+            address, NodeRole.WORKER, status=NodeStatus.STOPPED
+        )
+
+    async def reconstruct_worker(self, address: str):
+        node_allocator_ref = await self._get_node_allocator_ref()
+        await node_allocator_ref.reconstruct_worker(address)
+
+    @alru_cache(cache_exceptions=False)
+    async def _get_node_allocator_ref(self):
+        from ..supervisor.node_allocator import NodeAllocatorActor
+
+        [node_allocator_ref] = await self.get_supervisor_refs(
+            [NodeAllocatorActor.default_uid()]
+        )
+        return node_allocator_ref
+
+    async def _get_process_info_manager_ref(self, address: str = None):
+        from ..procinfo import ProcessInfoManagerActor
+
+        return await mo.actor_ref(
+            ProcessInfoManagerActor.default_uid(), address=address or self._address
+        )
+
+    async def get_node_pool_configs(self, address: str = None) -> List[Dict]:
+        ref = await self._get_process_info_manager_ref(address)
+        return await ref.get_pool_configs()
+
+    async def get_node_thread_stacks(
+        self, address: str = None
+    ) -> List[Dict[int, List[str]]]:
+        ref = await self._get_process_info_manager_ref(address)
+        return await ref.get_thread_stacks()
+
+    async def _get_log_ref(self, address: str = None):
+        from ..file_logger import FileLoggerActor
+
+        return await mo.actor_ref(
+            FileLoggerActor.default_uid(), address=address or self._address
+        )
+
+    async def fetch_node_log(
+        self, size: int, address: str = None, offset: int = 0
+    ) -> str:
+        ref = await self._get_log_ref(address)
+        return await ref.fetch_logs(size, offset)
+
+
+class MockClusterAPI(ClusterAPI):
+    @classmethod
+    async def create(cls: Type[APIType], address: str, **kw) -> APIType:
+        from ..file_logger import FileLoggerActor
+        from ..procinfo import ProcessInfoManagerActor
+        from ..supervisor.locator import SupervisorPeerLocatorActor
+        from ..supervisor.node_allocator import NodeAllocatorActor
+        from ..supervisor.node_info import NodeInfoCollectorActor
+        from ..uploader import NodeInfoUploaderActor
+
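+        # spin up the actors a real cluster would provide; duplicates are tolerated below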
+        create_actor_coros = [
+            mo.create_actor(
+                SupervisorPeerLocatorActor,
+                "fixed",
+                address,
+                uid=SupervisorPeerLocatorActor.default_uid(),
+                address=address,
+            ),
+            mo.create_actor(
+                NodeInfoCollectorActor,
+                uid=NodeInfoCollectorActor.default_uid(),
+                address=address,
+            ),
+            mo.create_actor(
+                NodeAllocatorActor,
+                "fixed",
+                address,
+                uid=NodeAllocatorActor.default_uid(),
+                address=address,
+            ),
+            mo.create_actor(
+                NodeInfoUploaderActor,
+                NodeRole.WORKER,
+                interval=kw.get("upload_interval"),
+                band_to_resource=kw.get("band_to_resource"),
+                use_gpu=kw.get("use_gpu", False),
+                uid=NodeInfoUploaderActor.default_uid(),
+                address=address,
+            ),
+            mo.create_actor(
+                ProcessInfoManagerActor,
+                uid=ProcessInfoManagerActor.default_uid(),
+                address=address,
+            ),
+            mo.create_actor(
+                FileLoggerActor, uid=FileLoggerActor.default_uid(), address=address
+            ),
+        ]
+        dones, _ = await asyncio.wait(
+            [asyncio.ensure_future(coro) for coro in create_actor_coros]
+        )
+
+        for task in dones:
+            try:
+                task.result()
+            except mo.ActorAlreadyExist:  # pragma: no cover
+                pass
+
+        api = await super().create(address=address)
+        await api.mark_node_ready()
+        return api
+
+    @classmethod
+    async def cleanup(cls, address: str):
+        from ..file_logger import FileLoggerActor
+        from ..supervisor.locator import SupervisorPeerLocatorActor
+        from ..supervisor.node_info import NodeInfoCollectorActor
+        from ..uploader import NodeInfoUploaderActor
+
+        await asyncio.gather(
+            mo.destroy_actor(
+                mo.create_actor_ref(
+                    uid=SupervisorPeerLocatorActor.default_uid(), address=address
+                )
+            ),
+            mo.destroy_actor(
+                mo.create_actor_ref(
+                    uid=NodeInfoCollectorActor.default_uid(), address=address
+                )
+            ),
+            mo.destroy_actor(
+                mo.create_actor_ref(
+                    uid=NodeInfoUploaderActor.default_uid(), address=address
+                )
+            ),
+            mo.destroy_actor(
+                mo.create_actor_ref(uid=FileLoggerActor.default_uid(), address=address)
+            ),
+        )
diff --git a/python/xorbits/_mars/services/cluster/api/web.py b/python/xorbits/_mars/services/cluster/api/web.py
new file mode 100644
index 000000000..9755b9293
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/api/web.py
@@ -0,0 +1,378 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import time
+from typing import Callable, Dict, List, Optional, Set
+
+from ....lib.aio import alru_cache
+from ....resource import Resource
+from ....typing import BandType
+from ....utils import deserialize_serializable, serialize_serializable
+from ...core import NodeRole
+from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api
+from ..core import NodeStatus, watch_method
+from .core import AbstractClusterAPI
+
+
+class ClusterWebAPIHandler(MarsServiceWebAPIHandler):
+    _root_pattern = "/api/cluster"
+
+    @alru_cache(cache_exceptions=False)
+    async def _get_cluster_api(self):
+        from ...cluster import ClusterAPI
+
+        return await ClusterAPI.create(self._supervisor_addr)
+
+    @staticmethod
+    def _convert_node_dict(node_info_list: Dict[str, Dict]):
+        res = {}
+        for node_addr, node in node_info_list.items():
+            res_dict = node.copy()
+            res_dict["status"] = res_dict["status"].value
+            res[node_addr] = res_dict
+        return res
+
+    @web_api("nodes", method=["get", "post"], cache_blocking=True)
+    async def get_nodes_info(self):
+        watch = bool(int(self.get_argument("watch", "0")))
+        env = bool(int(self.get_argument("env", "0")))
+        resource = bool(int(self.get_argument("resource", "0")))
+        detail = bool(int(self.get_argument("detail", "0")))
+
+        nodes_arg = self.get_argument("nodes", None)
+        nodes = nodes_arg.split(",") if nodes_arg is not None else None
+
+        role_arg = self.get_argument("role", None)
+        role = NodeRole(int(role_arg)) if role_arg is not None else None
+
+        statuses_arg = self.get_argument("statuses", None)
+        statuses = (
+            set(NodeStatus(int(v)) for v in statuses_arg.split(","))
+            if statuses_arg
+            else None
+        )
+
+        exclude_statuses_arg = self.get_argument("exclude_statuses", None)
+        exclude_statuses = (
+            set(NodeStatus(int(v)) for v in exclude_statuses_arg.split(","))
+            if exclude_statuses_arg
+            else None
+        )
+
+        statuses = WebClusterAPI._calc_statuses(statuses, exclude_statuses)
+
+        cluster_api = await self._get_cluster_api()
+        result = {}
+        if watch:
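+            # long-polling: wait for the first change notification and return it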
+            assert nodes is None
+            version = self.get_argument("version", "") or None
+            if version:
+                version = int(version)
+
+            async for version, node_infos in cluster_api.watch_nodes(
+                role,
+                env=env,
+                resource=resource,
+                detail=detail,
+                statuses=statuses,
+                version=version,
+            ):
+                result["version"] = version
+                result["nodes"] = self._convert_node_dict(node_infos)
+                break
+        else:
+            nodes = await cluster_api.get_nodes_info(
+                nodes=nodes,
+                role=role,
+                env=env,
+                resource=resource,
+                statuses=statuses,
+                detail=detail,
+            )
+            result["nodes"] = self._convert_node_dict(nodes)
+        self.write(json.dumps(result))
+
+    @web_api("bands", method="get", cache_blocking=True)
+    async def get_all_bands(self):
+        role_arg = self.get_argument("role", None)
+        role = NodeRole(int(role_arg)) if role_arg is not None else None
+        watch = bool(int(self.get_argument("watch", "0")))
+
+        statuses_arg = self.get_argument("statuses", None)
+        statuses = (
+            set(NodeStatus(int(v)) for v in statuses_arg.split(","))
+            if statuses_arg
+            else None
+        )
+
+        cluster_api = await self._get_cluster_api()
+        if watch:
+            version = self.get_argument("version", "") or None
+            if version:
+                version = int(version)
+
+            async for version, bands in cluster_api.watch_all_bands(
+                role, statuses=statuses, version=version
+            ):
+                self.write(serialize_serializable((version, bands)))
+                break
+        else:
+            self.write(
+                serialize_serializable(
+                    await cluster_api.get_all_bands(role, statuses=statuses)
+                )
+            )
+
+    @web_api("versions", method="get", cache_blocking=True)
+    async def get_mars_versions(self):
+        cluster_api = await self._get_cluster_api()
+        self.write(json.dumps(list(await cluster_api.get_mars_versions())))
+
+    @web_api("pools", method="get", cache_blocking=True)
+    async def get_node_pool_configs(self):
+        cluster_api = await self._get_cluster_api()
+        address = self.get_argument("address", "") or None
+        pools = list(await cluster_api.get_node_pool_configs(address))
+        # The logging_conf field cannot be serialized to JSON and is not
+        # used by the front end, so it is removed here.
+        for pool in pools:
+            pool.pop("logging_conf", None)
+        self.write(json.dumps({"pools": pools}))
+
+    @web_api("stacks", method="get", cache_blocking=True)
+    async def get_node_thread_stacks(self):
+        cluster_api = await self._get_cluster_api()
+        address = self.get_argument("address", "") or None
+        stacks = list(await cluster_api.get_node_thread_stacks(address))
+        self.write(
+            json.dumps(
+                {
+                    "generate_time": time.time(),
+                    "stacks": stacks,
+                }
+            )
+        )
+
+    @web_api("logs", method="get", cache_blocking=True)
+    async def fetch_node_log(self):
+        cluster_api = await self._get_cluster_api()
+        address = self.get_argument("address", "") or None
+        # 10MB by default
+        size = int(self.get_argument("size", str(10 * 1024 * 1024)))
+        offset = 0
+        content = await cluster_api.fetch_node_log(size, address=address, offset=offset)
+        if size != -1:
+            self.write(json.dumps({"content": content}))
+        # size == -1 means downloading the whole current log file
+        else:
+            self.set_header("Content-Type", "application/octet-stream")
+            self.set_header("Content-Disposition", "attachment")
+
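+            # stream the log in chunks until an empty read signals end of file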
+            while True:
+                if len(content) == 0:  # reached end of file
+                    await self.finish()
+                    break
+                else:
+                    self.write(content)
+                    await self.flush()
+                offset = offset + len(content)
+                content = await cluster_api.fetch_node_log(
+                    size, address=address, offset=offset
+                )
+
+
+web_handlers = {ClusterWebAPIHandler.get_root_pattern(): ClusterWebAPIHandler}
+
+
+class WebClusterAPI(AbstractClusterAPI, MarsWebAPIClientMixin):
+    def __init__(self, address: str, request_rewriter: Callable = None):
+        self._address = address.rstrip("/")
+        self.request_rewriter = request_rewriter
+
+    @staticmethod
+    def _convert_node_dict(node_info_list: Dict[str, Dict]):
+        res = {}
+        for node_addr, node in node_info_list.items():
+            res_dict = node.copy()
+            res_dict["status"] = NodeStatus(res_dict["status"])
+            res[node_addr] = res_dict
+        return res
+
+    async def _get_nodes_info(
+        self,
+        nodes: List[str] = None,
+        role: NodeRole = None,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        watch: bool = False,
+        statuses: Set[NodeStatus] = None,
+        version: Optional[int] = None,
+    ):
+        statuses_str = (
+            ",".join(str(status.value) for status in statuses) if statuses else ""
+        )
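+        # assemble query arguments; entries with value None are dropped below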
+        args = [
+            ("nodes", ",".join(nodes) if nodes else None),
+            ("role", role.value if role is not None else None),
+            ("env", 1 if env else 0),
+            ("resource", 1 if resource else 0),
+            ("detail", 1 if detail else 0),
+            ("watch", 1 if watch else 0),
+            ("statuses", statuses_str),
+            ("version", str(version or "")),
+        ]
+        args_str = "&".join(f"{key}={val}" for key, val in args if val is not None)
+
+        path = f"{self._address}/api/cluster/nodes"
+        res = await self._request_url(
+            path=path,
+            method="POST",
+            data=args_str,
+            headers={"Content-Type": "application/x-www-form-urlencoded"},
+        )
+        result = json.loads(res.body)
+        if watch:
+            return result["version"], self._convert_node_dict(result["nodes"])
+        else:
+            return self._convert_node_dict(result["nodes"])
+
+    async def get_supervisors(self, filter_ready: bool = True) -> List[str]:
+        statuses = (
+            {NodeStatus.READY}
+            if filter_ready
+            else {NodeStatus.STARTING, NodeStatus.READY}
+        )
+        res = await self._get_nodes_info(role=NodeRole.SUPERVISOR, statuses=statuses)
+        return list(res.keys())
+
+    @watch_method
+    async def watch_supervisors(self, version: Optional[int] = None):
+        version, res = await self._get_nodes_info(
+            role=NodeRole.SUPERVISOR, watch=True, version=version
+        )
+        return version, list(res.keys())
+
+    async def get_nodes_info(
+        self,
+        nodes: List[str] = None,
+        role: NodeRole = None,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ):
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        return await self._get_nodes_info(
+            nodes,
+            role=role,
+            env=env,
+            resource=resource,
+            detail=detail,
+            watch=False,
+            statuses=statuses,
+        )
+
+    @watch_method
+    async def watch_nodes(
+        self,
+        role: NodeRole,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+        version: Optional[int] = None,
+    ) -> List[Dict[str, Dict]]:
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        return await self._get_nodes_info(
+            role=role,
+            env=env,
+            resource=resource,
+            detail=detail,
+            watch=True,
+            statuses=statuses,
+            version=version,
+        )
+
+    async def get_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> Dict[BandType, Resource]:
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        statuses_str = (
+            ",".join(str(status.value) for status in statuses) if statuses else ""
+        )
+        params = {}
+        if role is not None:  # pragma: no cover
+            params["role"] = role.value
+        if statuses_str:
+            params["statuses"] = statuses_str
+
+        path = f"{self._address}/api/cluster/bands"
+        res = await self._request_url("GET", path, params=params)
+        return deserialize_serializable(res.body)
+
+    @watch_method
+    async def watch_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+        version: Optional[int] = None,
+    ):
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        statuses_str = (
+            ",".join(str(status.value) for status in statuses) if statuses else ""
+        )
+        params = dict(watch=1, version=str(version or ""))
+        if role is not None:  # pragma: no cover
+            params["role"] = role.value
+        if statuses_str:
+            params["statuses"] = statuses_str
+
+        path = f"{self._address}/api/cluster/bands"
+        res = await self._request_url("GET", path, params=params)
+        return deserialize_serializable(res.body)
+
+    async def get_mars_versions(self) -> List[str]:
+        path = f"{self._address}/api/cluster/versions"
+        res = await self._request_url("GET", path)
+        return list(json.loads(res.body))
+
+    async def get_node_pool_configs(self, address: str) -> List[Dict]:
+        path = f"{self._address}/api/cluster/pools?address={address}"
+        res = await self._request_url("GET", path)
+        return list(json.loads(res.body)["pools"])
+
+    async def get_node_thread_stacks(self, address: str) -> List[Dict]:
+        path = f"{self._address}/api/cluster/stacks?address={address}"
+        res = await self._request_url("GET", path)
+        return list(json.loads(res.body)["stacks"])
+
+    async def fetch_node_log(
+        self, size: int = None, address: str = None, offset: int = 0
+    ) -> str:
+        path = f"{self._address}/api/cluster/logs?address={address}"
+        if size is not None:
+            path += f"&&size={size}"
+        res = await self._request_url("GET", path)
+        if size == -1:
+            return res.body.decode(encoding="utf8")
+        else:
+            return str(json.loads(res.body)["content"])
diff --git a/python/xorbits/_mars/services/cluster/backends/__init__.py b/python/xorbits/_mars/services/cluster/backends/__init__.py
new file mode 100644
index 000000000..4696550f1
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/backends/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import AbstractClusterBackend, get_cluster_backend, register_cluster_backend
+from .fixed import FixedClusterBackend
diff --git a/python/xorbits/_mars/services/cluster/backends/base.py b/python/xorbits/_mars/services/cluster/backends/base.py
new file mode 100644
index 000000000..f44811c1a
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/backends/base.py
@@ -0,0 +1,103 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import AsyncGenerator, Dict, List, Optional, Type
+
+from ..core import NodeRole
+
+
+class AbstractClusterBackend(ABC):
+    name = None
+
+    @classmethod
+    @abstractmethod
+    async def create(
+        cls, node_role: NodeRole, lookup_address: Optional[str], pool_address: str
+    ) -> "AbstractClusterBackend":
+        """
+
+        Parameters
+        ----------
+        node_role
+        lookup_address
+        pool_address
+
+        Returns
+        -------
+
+        """
+
+    @abstractmethod
+    async def watch_supervisors(self) -> AsyncGenerator[List[str], None]:
+        """
+        Watch changes of supervisors
+
+        Returns
+        -------
+        out : AsyncGenerator[List[str]]
+            Generator of lists of supervisors
+        """
+
+    @abstractmethod
+    async def get_supervisors(self, filter_ready: bool = True) -> List[str]:
+        """
+        Get list of supervisors
+
+        Parameters
+        ----------
+        filter_ready : bool
+            if True, return ready nodes only; otherwise return both starting and ready nodes
+
+        Returns
+        -------
+        out : List[str]
+            List of supervisors
+        """
+
+    @abstractmethod
+    async def request_worker(
+        self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None
+    ) -> str:
+        """
+        Create a new worker
+
+        Returns
+        -------
+        Address of the newly created worker
+        """
+
+    @abstractmethod
+    async def release_worker(self, address: str):
+        """
+        Release a worker back to the cluster
+        """
+
+    @abstractmethod
+    async def reconstruct_worker(self, address: str):
+        """
+        Reconstruct a worker
+        """
+
+
+_cluster_backend_types: Dict[str, Type[AbstractClusterBackend]] = dict()
+
+
+def register_cluster_backend(backend: Type[AbstractClusterBackend]):
+    _cluster_backend_types[backend.name] = backend
+    return backend
+
+
+def get_cluster_backend(backend_name: str) -> Type[AbstractClusterBackend]:
+    return _cluster_backend_types[backend_name]
diff --git a/python/xorbits/_mars/services/cluster/backends/fixed.py b/python/xorbits/_mars/services/cluster/backends/fixed.py
new file mode 100644
index 000000000..c5085a6b3
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/backends/fixed.py
@@ -0,0 +1,51 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import AsyncGenerator, List, Optional, Union
+
+from ..core import NodeRole
+from .base import AbstractClusterBackend, register_cluster_backend
+
+
+@register_cluster_backend
+class FixedClusterBackend(AbstractClusterBackend):
+    name = "fixed"
+
+    def __init__(self, lookup_address: Union[List[str], str]):
+        if isinstance(lookup_address, str):
+            lookup_address = lookup_address.split(",")
+        self._supervisors = [n.strip() for n in lookup_address]
+
+    @classmethod
+    async def create(
+        cls, node_role: NodeRole, lookup_address: Optional[str], pool_address: str
+    ):
+        return cls(lookup_address)
+
+    async def watch_supervisors(self) -> AsyncGenerator[List[str], None]:
+        yield self._supervisors
+
+    async def get_supervisors(self, filter_ready: bool = True) -> List[str]:
+        return self._supervisors
+
+    async def request_worker(
+        self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None
+    ) -> str:
+        raise NotImplementedError
+
+    async def release_worker(self, address: str):
+        raise NotImplementedError
+
+    async def reconstruct_worker(self, address: str):
+        raise NotImplementedError
diff --git a/python/xorbits/_mars/services/cluster/core.py b/python/xorbits/_mars/services/cluster/core.py
new file mode 100644
index 000000000..26ef3e4a4
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/core.py
@@ -0,0 +1,126 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import enum
+import functools
+import time
+from dataclasses import dataclass, field
+from typing import (
+    AsyncGenerator,
+    Awaitable,
+    Callable,
+    Dict,
+    Optional,
+    Set,
+    Tuple,
+    TypeVar,
+)
+
+from ...serialization.serializables import (
+    Float64Field,
+    Int32Field,
+    Int64Field,
+    Serializable,
+    StringField,
+)
+from ...storage import StorageLevel
+from ..core import NodeRole
+
+
+class NodeStatus(enum.Enum):
+    STARTING = 0
+    READY = 1
+    DEGENERATED = 2
+    STOPPING = 3
+    STOPPED = -1
+
+
+@dataclass
+class NodeInfo:
+    role: NodeRole
+    status: NodeStatus = NodeStatus.READY
+    update_time: float = field(default_factory=time.time)
+    env: Dict = field(default_factory=dict)
+    resource: Dict = field(default_factory=dict)
+    detail: Dict = field(default_factory=dict)
+
+
+class WatchNotifier:
+    _events: Set[asyncio.Event]
+
+    def __init__(self):
+        self._event = asyncio.Event()
+        self._lock = asyncio.Lock()
+        self._version = 0
+
+    async def watch(self, version: Optional[int] = None):
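+        # if the caller's version is stale, return the current one immediately;
+        # otherwise block until the next notify() call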
+        if version != self._version:
+            return self._version
+        await self._event.wait()
+        return self._version
+
+    async def notify(self):
+        async with self._lock:
+            self._version += 1
+            self._event.set()
+            self._event = asyncio.Event()
+
+
+RetType = TypeVar("RetType")
+
+
+def watch_method(
+    func: Callable[..., Awaitable[Tuple[int, RetType]]]
+) -> Callable[..., AsyncGenerator[RetType, None]]:
+    @functools.wraps(func)
+    async def wrapped(*args, **kwargs):
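+        # a caller supplying an explicit version gets a single watch round;
+        # otherwise loop forever, feeding each returned version into the next call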
+        if "version" in kwargs:
+            yield await func(*args, **kwargs)
+            return
+
+        kwargs["version"] = None
+        while True:
+            version, val = await func(*args, **kwargs)
+            kwargs["version"] = version
+            yield val
+
+    return wrapped
+
+
+class WorkerSlotInfo(Serializable):
+    slot_id: int = Int32Field("slot_id")
+    session_id: str = StringField("session_id")
+    subtask_id: str = StringField("subtask_id")
+    processor_usage: float = Float64Field("processor_usage")
+
+
+class QuotaInfo(Serializable):
+    quota_size: int = Int64Field("quota_size")
+    allocated_size: int = Int64Field("allocated_size")
+    hold_size: int = Int64Field("hold_size")
+
+
+class StorageInfo(Serializable):
+    storage_level: StorageLevel = Int32Field(
+        "storage_level", on_serialize=lambda x: x.value, on_deserialize=StorageLevel
+    )
+    total_size: int = Int64Field("total_size")
+    used_size: int = Int64Field("used_size")
+    pinned_size: int = Int64Field("pinned_size", default=None)
+
+
+class DiskInfo(Serializable):
+    path: str = StringField("path")
+    limit_size: int = Int64Field("limit_size", default=None)
diff --git a/python/xorbits/_mars/services/cluster/file_logger.py b/python/xorbits/_mars/services/cluster/file_logger.py
new file mode 100644
index 000000000..9e1685e4d
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/file_logger.py
@@ -0,0 +1,91 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+
+from ... import oscar as mo
+from ...constants import MARS_LOG_PATH_KEY
+
+logger = logging.getLogger(__name__)
+
+
+class FileLoggerActor(mo.Actor):
+    """
+    Read the log file path from an environment variable (sourced from the YAML config)
+    on each node, supervisors and workers alike, and expose an interface for the web
+    frontend to fetch log content.
+    """
+
+    def __init__(self):
+        file_path = os.environ.get(MARS_LOG_PATH_KEY)
+        self._log_filename = file_path
+
+    def fetch_logs(self, size: int, offset: int) -> str:
+        """
+        Externally exposed interface.
+
+        Parameters
+        ----------
+        size
+        offset
+
+        Returns
+        -------
+
+        """
+        if size != -1:
+            content = self._get_n_bytes_tail_file(size)
+        else:
+            content = self._get_n_bytes_from_pos(10 * 1024 * 1024, offset)
+        return content
+
+    def _get_n_bytes_tail_file(self, bytes_num: int) -> str:
+        """
+        Read last n bytes of file.
+
+        Parameters
+        ----------
+        bytes_num: the bytes to read. -1 means read the whole file.
+
+        Returns
+        -------
+
+        """
+        f_size = os.stat(self._log_filename).st_size
+        target = f_size - bytes_num if f_size > bytes_num else 0
+        with open(self._log_filename) as f:
+            f.seek(target)
+            if target == 0:
+                res = f.read()
+            else:
+                f.readline()
+                res = f.read()
+
+        return res
+
+    def _get_n_bytes_from_pos(self, size: int, offset: int) -> str:
+        """
+        Read n bytes from a position.
+        Parameters
+        ----------
+        size
+        offset
+
+        Returns
+        -------
+
+        """
+        with open(self._log_filename) as f:
+            f.seek(offset)
+            res = f.read(size)
+        return res
diff --git a/python/xorbits/_mars/services/cluster/gather.py b/python/xorbits/_mars/services/cluster/gather.py
new file mode 100644
index 000000000..7feeb0bb8
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/gather.py
@@ -0,0 +1,265 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import platform
+import socket
+import sys
+from typing import Dict, List
+
+import numpy as np
+import pandas as pd
+
+from ...resource import Resource, ZeroResource
+
+try:
+    import scipy
+except ImportError:  # pragma: no cover
+    scipy = None
+
+from ... import resource as mars_resource
+from ...config import options
+from ...storage import StorageLevel
+from ...utils import git_info, lazy_import
+from .core import DiskInfo, QuotaInfo, StorageInfo, WorkerSlotInfo
+
+cp = lazy_import("cupy", rename="cp")
+cudf = lazy_import("cudf")
+
+logger = logging.getLogger(__name__)
+
+_is_initial = True
+
+
+def gather_node_env():
+    from ... import __version__ as mars_version
+    from ...lib.mkl_interface import mkl_get_version
+    from ...lib.nvutils import NVError
+
+    global _is_initial
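+    # prime CPU usage sampling on the first call; the initial reading is typically meaningless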
+    if _is_initial:
+        _is_initial = False
+        mars_resource.cpu_percent()
+
+    mem_stats = mars_resource.virtual_memory()
+
+    node_info = {
+        "command_line": sys.argv,
+        "platform": platform.platform(),
+        "host_name": socket.gethostname(),
+        "python_version": sys.version,
+        "mars_version": mars_version,
+        "cpu_total": mars_resource.cpu_count(),
+        "memory_total": mem_stats.total,
+        "options": options.to_dict(),
+    }
+
+    if "MARS_K8S_POD_NAME" in os.environ:
+        node_info["k8s_pod_name"] = os.environ["MARS_K8S_POD_NAME"]
+    if "CONTAINER_ID" in os.environ:
+        node_info["yarn_container_id"] = os.environ["CONTAINER_ID"]
+
+    try:
+        cuda_info = mars_resource.cuda_info()
+    except NVError:  # pragma: no cover
+        logger.exception("NVError encountered, cannot gather CUDA devices.")
+        cuda_info = None
+
+    if cuda_info:
+        node_info["cuda_info"] = {
+            "driver": cuda_info.driver_version,
+            "cuda": cuda_info.cuda_version,
+            "products": list(cuda_info.products),
+        }
+
+    package_vers = {
+        "numpy": np.__version__,
+        "pandas": pd.__version__,
+    }
+    if hasattr(np, "__mkl_version__") and mkl_get_version:
+        mkl_version = mkl_get_version()
+        package_vers[
+            "mkl"
+        ] = f"{mkl_version.major}.{mkl_version.minor}.{mkl_version.update}"
+
+    if scipy is not None:
+        package_vers["scipy"] = scipy.__version__
+    if cp is not None:
+        package_vers["cupy"] = cp.__version__
+    if cudf is not None:
+        package_vers["cudf"] = cudf.__version__
+
+    node_info["package_versions"] = package_vers
+
+    git = git_info()
+    if git:
+        node_info["git_info"] = {
+            "hash": git.commit_hash,
+            "ref": git.commit_ref,
+        }
+
+    bands = node_info["bands"] = dict()
+
+    cpu_band = {
+        "resources": {
+            "cpu": mars_resource.cpu_count(),
+            "memory": mars_resource.virtual_memory().total,
+        }
+    }
+    # TODO: NUMA can be supported by adding more bands
+    bands["numa-0"] = cpu_band
+
+    for idx, gpu_card_stat in enumerate(
+        mars_resource.cuda_card_stats()
+    ):  # pragma: no cover
+        bands[f"gpu-{idx}"] = {
+            "resources": {
+                "gpu": 1,
+                "memory": gpu_card_stat.fb_mem_info.total,
+            }
+        }
+    return node_info
+
+
+def gather_node_resource(band_to_resource: Dict[str, Resource] = None, use_gpu=True):
+    # TODO: NUMA can be supported by adding more bands
+    res = dict()
+    mem_info = mars_resource.virtual_memory()
+    num_cpu = (
+        mars_resource.cpu_count()
+        if band_to_resource is None
+        else band_to_resource.get("numa-0", ZeroResource).num_cpus
+    )
+    mem_bytes = (
+        mem_info.total
+        if band_to_resource is None
+        else band_to_resource.get("numa-0", ZeroResource).mem_bytes
+    )
+    if num_cpu:  # pragma: no branch
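+        # cpu_avail estimates free cores: total count minus current usage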
+        res["numa-0"] = {
+            "cpu_avail": mars_resource.cpu_count()
+            - mars_resource.cpu_percent() / 100.0,
+            "cpu_total": num_cpu,
+            "memory_avail": mem_info.available,
+            "memory_total": min(mem_info.total, mem_bytes),
+        }
+
+    if use_gpu:
+        for idx, gpu_card_stat in enumerate(
+            mars_resource.cuda_card_stats()
+        ):  # pragma: no cover
+            num_gpu = (
+                1
+                if band_to_resource is None
+                else band_to_resource.get(f"gpu-{idx}", ZeroResource).num_gpus
+            )
+            if not num_gpu:
+                continue
+            res[f"gpu-{idx}"] = {
+                "gpu_avail": 1 - gpu_card_stat.gpu_usage,
+                "gpu_total": num_gpu,
+                "memory_avail": gpu_card_stat.fb_mem_info.available,
+                "memory_total": gpu_card_stat.fb_mem_info.total,
+            }
+    return res
+
+
+def gather_node_details(
+    band_slot_infos: Dict[str, List[WorkerSlotInfo]] = None,
+    band_quota_infos: Dict[str, QuotaInfo] = None,
+    disk_infos: List[DiskInfo] = None,
+    band_storage_infos: Dict[str, Dict[StorageLevel, StorageInfo]] = None,
+):
+    disk_io_usage = mars_resource.disk_io_usage()
+    net_io_usage = mars_resource.net_io_usage()
+    res = {
+        "disk": dict(zip(("reads", "writes"), disk_io_usage))
+        if disk_io_usage
+        else dict(),
+        "network": dict(zip(("receives", "sends"), net_io_usage))
+        if net_io_usage
+        else dict(),
+        "iowait": mars_resource.iowait(),
+    }
+
+    if disk_infos:
+        part_dict = dict()
+        for info in disk_infos:
+            part_dev = mars_resource.get_path_device(info.path)
+            if part_dev in part_dict:
+                continue
+
+            disk_usage_result = mars_resource.disk_usage(info.path)
+            io_usage_result = mars_resource.disk_io_usage(info.path)
+            part_dict[part_dev] = disk_info = {
+                "size_limit": info.limit_size,
+                "size_used": disk_usage_result.used,
+                "size_total": disk_usage_result.total,
+            }
+            if io_usage_result is not None:
+                disk_info.update(
+                    {
+                        "reads": io_usage_result.reads if io_usage_result else None,
+                        "writes": io_usage_result.writes if io_usage_result else None,
+                    }
+                )
+            if not sys.platform.startswith("win"):
+                in_usage_result = os.statvfs(info.path)
+                disk_info.update(
+                    {
+                        "inode_used": in_usage_result.f_files
+                        - in_usage_result.f_favail,
+                        "inode_total": in_usage_result.f_files,
+                    }
+                )
+        res["disk"]["partitions"] = part_dict
+
+    band_slot_infos = band_slot_infos or dict()
+    res["slot"] = {
+        band: [
+            {
+                "slot_id": slot_info.slot_id,
+                "session_id": slot_info.session_id,
+                "subtask_id": slot_info.subtask_id,
+                "processor_usage": slot_info.processor_usage,
+            }
+            for slot_info in slot_infos
+        ]
+        for band, slot_infos in band_slot_infos.items()
+    }
+
+    band_quota_infos = band_quota_infos or dict()
+    res["quota"] = {
+        band: {
+            "quota_size": quota_info.quota_size,
+            "allocated_size": quota_info.allocated_size,
+            "hold_size": quota_info.hold_size,
+        }
+        for band, quota_info in band_quota_infos.items()
+    }
+
+    band_storage_infos = band_storage_infos or dict()
+    res["storage"] = {
+        band: {
+            level.name.lower(): {
+                "size_used": storage_info.used_size,
+                "size_total": storage_info.total_size,
+                "size_pinned": storage_info.pinned_size,
+            }
+            for level, storage_info in storage_infos.items()
+        }
+        for band, storage_infos in band_storage_infos.items()
+    }
+    return res
diff --git a/python/xorbits/_mars/services/cluster/locator.py b/python/xorbits/_mars/services/cluster/locator.py
new file mode 100644
index 000000000..28c34d281
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/locator.py
@@ -0,0 +1,114 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+from typing import List, Optional, Set
+
+from ... import oscar as mo
+from ...lib.uhashring import HashRing
+from .backends import AbstractClusterBackend, get_cluster_backend
+from .core import NodeRole, WatchNotifier
+
+logger = logging.getLogger(__name__)
+
+
+class SupervisorLocatorActor(mo.Actor):
+    _backend: Optional[AbstractClusterBackend]
+    _node_role: NodeRole = None
+
+    def __init__(self, backend_name: str, lookup_address: str):
+        self._backend_name = backend_name
+        self._lookup_address = lookup_address
+        self._backend = None
+        self._supervisors = None
+        self._hash_ring = None
+
+        self._watch_notifier = WatchNotifier()
+        self._watch_task = None
+
+    async def __post_create__(self):
+        backend_cls = get_cluster_backend(self._backend_name)
+        self._backend = await backend_cls.create(
+            self._node_role, self._lookup_address, self.address
+        )
+        await self._set_supervisors(await self._get_supervisors_from_backend())
+
+        self._watch_task = asyncio.create_task(self._watch_supervisor_changes())
+
+    async def __pre_destroy__(self):
+        self._watch_task.cancel()
+
+    async def _set_supervisors(self, supervisors: List[str]):
+        self._supervisors = supervisors
+        self._hash_ring = HashRing(nodes=supervisors, hash_fn="ketama")
+        await self._watch_notifier.notify()
+
+    async def _get_supervisors_from_backend(self, filter_ready: bool = True):
+        raise NotImplementedError
+
+    def _watch_supervisors_from_backend(self):
+        raise NotImplementedError
+
+    def _if_set_supervisors(
+        self, current_supervisors: Set[str], last_supervisors: Set[str]
+    ):
+        return current_supervisors != last_supervisors
+
+    async def _watch_supervisor_changes(self):
+        last_supervisors = set()
+        try:
+            async for sv_list in self._watch_supervisors_from_backend():
+                if self._if_set_supervisors(set(sv_list), last_supervisors):
+                    await self._set_supervisors(sv_list)
+                    last_supervisors = set(sv_list)
+        except asyncio.CancelledError:
+            return
+
+    async def get_supervisors(self, filter_ready: bool = True):
+        if filter_ready:
+            return self._supervisors
+        else:
+            return await self._get_supervisors_from_backend(filter_ready=filter_ready)
+
+    @mo.extensible
+    def get_supervisor(self, key: str, size=1):
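+        # consistent hashing keeps the key-to-supervisor mapping stable across membership changes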
+        if not self._supervisors:
+            return None
+        elif size == 1:
+            return self._hash_ring.get_node(key)
+        else:
+            return tuple(it["nodename"] for it in self._hash_ring.range(key, size=size))
+
+    async def watch_supervisors(self, version: Optional[int] = None):
+        version = yield self._watch_notifier.watch(version)
+        raise mo.Return((version, self._supervisors))
+
+    async def watch_supervisors_by_keys(
+        self, keys: List[str], version: Optional[int] = None
+    ):
+        version = yield self._watch_notifier.watch(version)
+        raise mo.Return((version, [self.get_supervisor(k) for k in keys]))
+
+    async def wait_all_supervisors_ready(self):
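+        # loop until the READY supervisors match the full list reported by the backend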
+        version = None
+        while True:
+            expected_supervisors = await self._get_supervisors_from_backend(
+                filter_ready=False
+            )
+            if self._supervisors and set(self._supervisors) == set(
+                expected_supervisors
+            ):
+                break
+            version = yield self._watch_notifier.watch(version)
diff --git a/python/xorbits/_mars/services/cluster/procinfo.py b/python/xorbits/_mars/services/cluster/procinfo.py
new file mode 100644
index 000000000..2837dcac3
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/procinfo.py
@@ -0,0 +1,95 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import sys
+import threading
+import traceback
+from typing import Dict, List
+
+from ... import oscar as mo
+from ...oscar.backends.allocate_strategy import ProcessIndex
+
+
+class ProcessInfoManagerActor(mo.StatelessActor):
+    _process_refs: List[mo.ActorRef]
+
+    def __init__(self):
+        self._process_refs = []
+        self._pool_configs = []
+
+    async def __post_create__(self):
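+        # probe process indices one by one; actor creation raises IndexError once the
+        # index exceeds the number of processes in the pool, which ends discovery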
+        index = 0
+        while True:
+            try:
+                ref = await mo.create_actor(
+                    ProcessInfoActor,
+                    process_index=index,
+                    uid=ProcessInfoActor.gen_uid(index),
+                    address=self.address,
+                    allocate_strategy=ProcessIndex(index),
+                )
+            except IndexError:
+                break
+
+            index += 1
+            self._process_refs.append(ref)
+
+        self._pool_configs = await asyncio.gather(
+            *[ref.get_pool_config() for ref in self._process_refs]
+        )
+
+    async def get_pool_configs(self) -> List[Dict]:
+        return self._pool_configs
+
+    async def get_thread_stacks(self) -> List[Dict[str, List[str]]]:
+        stack_tasks = [
+            asyncio.create_task(ref.get_thread_stacks()) for ref in self._process_refs
+        ]
+        await asyncio.wait(stack_tasks, return_when=asyncio.ALL_COMPLETED)
+
+        results = []
+        for fut in stack_tasks:
+            try:
+                results.append(fut.result())
+            except (mo.ActorNotExist, mo.ServerClosed):
+                results.append(None)
+        return results
+
+
+class ProcessInfoActor(mo.StatelessActor):
+    def __init__(self, process_index: int = 0):
+        self._process_index = process_index
+        self._pool_config = None
+
+    async def __post_create__(self):
+        self._pool_config = await mo.get_pool_config(self.address)
+
+    @classmethod
+    def gen_uid(cls, process_index: int):
+        return f"process_info_{process_index}"
+
+    def get_pool_config(self) -> dict:
+        idx = self._pool_config.get_process_index(self.address)
+        return self._pool_config.get_pool_config(idx)
+
+    @classmethod
+    def get_thread_stacks(cls) -> Dict[str, List[str]]:
+        frames = sys._current_frames()
+        stacks = dict()
+        for th in threading.enumerate():
+            tid = getattr(th, "native_id", th.ident)
+            stack_key = f"{tid}:{th.name}"
+            stacks[stack_key] = traceback.format_stack(frames[th.ident])
+        return stacks
diff --git a/python/xorbits/_mars/services/cluster/supervisor/__init__.py b/python/xorbits/_mars/services/cluster/supervisor/__init__.py
new file mode 100644
index 000000000..a258409aa
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/supervisor/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .service import ClusterSupervisorService
diff --git a/python/xorbits/_mars/services/cluster/supervisor/locator.py b/python/xorbits/_mars/services/cluster/supervisor/locator.py
new file mode 100644
index 000000000..9e1691648
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/supervisor/locator.py
@@ -0,0 +1,51 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .... import oscar as mo
+from ....lib.aio import alru_cache
+from ..core import NodeRole
+from ..locator import SupervisorLocatorActor
+
+
+class SupervisorPeerLocatorActor(SupervisorLocatorActor):
+    _node_role = NodeRole.SUPERVISOR
+
+    @classmethod
+    def default_uid(cls):
+        return SupervisorLocatorActor.__name__
+
+    async def __post_create__(self):
+        await super().__post_create__()
+
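+        # register all configured supervisors as STARTING so peers appear in node
+        # info before their first heartbeat arrives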
+        supervisors = await self._backend.get_supervisors(filter_ready=False)
+        try:
+            node_info_ref = await self._get_node_info_ref()
+            await node_info_ref.put_starting_nodes(supervisors, NodeRole.SUPERVISOR)
+        except mo.ActorNotExist:  # pragma: no cover
+            pass
+
+    @alru_cache(cache_exceptions=False)
+    async def _get_node_info_ref(self):
+        from .node_info import NodeInfoCollectorActor
+
+        return await mo.actor_ref(
+            uid=NodeInfoCollectorActor.default_uid(), address=self.address
+        )
+
+    async def _get_supervisors_from_backend(self, filter_ready: bool = True):
+        return await self._backend.get_supervisors(filter_ready=filter_ready)
+
+    async def _watch_supervisors_from_backend(self):
+        async for supervisors in self._backend.watch_supervisors():
+            yield supervisors
diff --git a/python/xorbits/_mars/services/cluster/supervisor/node_allocator.py b/python/xorbits/_mars/services/cluster/supervisor/node_allocator.py
new file mode 100644
index 000000000..4a97a273b
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/supervisor/node_allocator.py
@@ -0,0 +1,45 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from .... import oscar as mo
+from ...core import NodeRole
+from ..backends import AbstractClusterBackend, get_cluster_backend
+
+
+class NodeAllocatorActor(mo.StatelessActor):
+    def __init__(self, backend_name: str, lookup_address: str):
+        self._backend_name = backend_name
+        self._lookup_address = lookup_address
+        self._backend: Optional[AbstractClusterBackend] = None
+
+    async def __post_create__(self):
+        backend_cls = get_cluster_backend(self._backend_name)
+        self._backend = await backend_cls.create(
+            NodeRole.WORKER, self._lookup_address, self.address
+        )
+
+    async def request_worker(
+        self, worker_cpu: int, worker_mem: int, timeout: int = None
+    ) -> str:
+        return await self._backend.request_worker(
+            worker_cpu, worker_mem, timeout=timeout
+        )
+
+    async def release_worker(self, address: str):
+        await self._backend.release_worker(address)
+
+    async def reconstruct_worker(self, address: str):
+        await self._backend.reconstruct_worker(address)
diff --git a/python/xorbits/_mars/services/cluster/supervisor/node_info.py b/python/xorbits/_mars/services/cluster/supervisor/node_info.py
new file mode 100644
index 000000000..c36ed92fb
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/supervisor/node_info.py
@@ -0,0 +1,215 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from collections import defaultdict
+from typing import Dict, List, Optional, Set
+
+from .... import oscar as mo
+from ....resource import Resource
+from ....typing import BandType
+from ...core import NodeRole
+from ..core import NodeInfo, NodeStatus, WatchNotifier
+
+DEFAULT_NODE_DEAD_TIMEOUT = 120
+DEFAULT_NODE_CHECK_INTERVAL = 1
+
+
+class NodeInfoCollectorActor(mo.Actor):
+    _node_infos: Dict[str, NodeInfo]
+
+    def __init__(self, timeout=None, check_interval=None):
+        self._role_to_nodes = defaultdict(set)
+        self._role_to_notifier = defaultdict(WatchNotifier)
+
+        self._node_infos = dict()
+
+        self._node_timeout = timeout or DEFAULT_NODE_DEAD_TIMEOUT
+        self._check_interval = check_interval or DEFAULT_NODE_CHECK_INTERVAL
+        self._check_task = None
+
+    async def __post_create__(self):
+        self._check_task = self.ref().check_dead_nodes.tell_delay(
+            delay=self._check_interval
+        )
+
+    async def __pre_destroy__(self):
+        self._check_task.cancel()
+
+    async def check_dead_nodes(self):
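+        # mark READY nodes as STOPPED when no update arrived within the node timeout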
+        affect_roles = set()
+        for address, info in self._node_infos.items():
+            if (
+                info.status == NodeStatus.READY
+                and time.time() - info.update_time > self._node_timeout
+            ):
+                info.status = NodeStatus.STOPPED
+                node_role = info.role
+                affect_roles.add(node_role)
+
+        if affect_roles:
+            await self._notify_roles(affect_roles)
+
+        self._check_task = self.ref().check_dead_nodes.tell_delay(
+            delay=self._check_interval
+        )
+
+    async def _notify_roles(self, roles):
+        for role in roles:
+            await self._role_to_notifier[role].notify()
+
+    async def update_node_info(
+        self,
+        address: str,
+        role: NodeRole,
+        env: Dict = None,
+        resource: Dict = None,
+        detail: Dict = None,
+        status: NodeStatus = None,
+    ):
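+        # the first report from an address, or a status change, triggers watcher notification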
+        need_notify = False
+        if address not in self._node_infos:
+            need_notify = True
+            info = self._node_infos[address] = NodeInfo(role=role, status=status)
+        else:
+            info = self._node_infos[address]
+
+        info.update_time = time.time()
+        if env is not None:
+            info.env.update(env)
+        if resource is not None:
+            info.resource.update(resource)
+        if detail is not None:
+            info.detail.update(detail)
+        if status is not None:
+            need_notify = need_notify or (info.status != status)
+            info.status = status
+
+        if need_notify:
+            self._role_to_nodes[role].add(address)
+            await self._notify_roles([role])
+
+    def get_nodes_info(
+        self,
+        nodes: List[str] = None,
+        role: NodeRole = None,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+    ):
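+        # default to READY nodes; when no explicit node list is given, restrict by role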
+        statuses = statuses or {NodeStatus.READY}
+        if nodes is None:
+            nodes = (
+                self._role_to_nodes.get(role)
+                if role is not None
+                else self._node_infos.keys()
+            )
+            nodes = nodes or []
+        ret_infos = dict()
+        for node in nodes:
+            if node not in self._node_infos:
+                continue
+            info = self._node_infos[node]
+            if info.status not in statuses:
+                continue
+
+            ret_infos[node] = dict(
+                status=info.status,
+                update_time=info.update_time,
+                env=info.env if env else None,
+                resource=info.resource if resource else None,
+                detail=info.detail if detail else None,
+            )
+        return ret_infos
+
+    def get_all_bands(
+        self, role: NodeRole = None, statuses: Set[NodeStatus] = None
+    ) -> Dict[BandType, Resource]:
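+        # "numa-*" bands expose CPU and memory resources, "gpu-*" bands expose GPU counts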
+        statuses = statuses or {NodeStatus.READY}
+        role = role or NodeRole.WORKER
+        nodes = self._role_to_nodes.get(role, [])
+        band_resource = dict()
+        for node in nodes:
+            if self._node_infos[node].status not in statuses:
+                continue
+            node_resource = self._node_infos[node].resource
+            for resource_type, info in node_resource.items():
+                if resource_type.startswith("numa"):
+                    # cpu
+                    band_resource[(node, resource_type)] = Resource(
+                        num_cpus=info["cpu_total"], mem_bytes=info["memory_total"]
+                    )
+                else:  # pragma: no cover
+                    assert resource_type.startswith("gpu")
+                    band_resource[(node, resource_type)] = Resource(
+                        num_gpus=info["gpu_total"]
+                    )
+        return band_resource
+
+    def get_mars_versions(self) -> List[str]:
+        versions = set(info.env["mars_version"] for info in self._node_infos.values())
+        return list(sorted(versions))
+
+    async def watch_nodes(
+        self,
+        role: NodeRole,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        version: Optional[int] = None,
+    ):
+        version = yield self._role_to_notifier[role].watch(version=version)
+        raise mo.Return(
+            (
+                version,
+                self.get_nodes_info(
+                    role=role,
+                    env=env,
+                    resource=resource,
+                    detail=detail,
+                    statuses=statuses,
+                ),
+            )
+        )
+
+    async def watch_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        version: Optional[int] = None,
+    ):
+        role = role or NodeRole.WORKER
+        version = yield self._role_to_notifier[role].watch(version=version)
+        raise mo.Return((version, self.get_all_bands(role=role, statuses=statuses)))
+
+    async def put_starting_nodes(self, nodes: List[str], role: NodeRole):
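+        # record expected nodes as STARTING without overwriting entries that are already live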
+        for node_ep in nodes:
+            if node_ep in self._node_infos and self._node_infos[node_ep].status not in {
+                NodeStatus.STARTING,
+                NodeStatus.STOPPED,
+            }:
+                continue
+            self._node_infos[node_ep] = NodeInfo(
+                role, NodeStatus.STARTING, update_time=time.time()
+            )
+            self._role_to_nodes[role].add(node_ep)
+
+        nodes_set = set(nodes)
+        for node, info in self._node_infos.items():
+            if info.status == NodeStatus.STARTING and node not in nodes_set:
+                info.status = NodeStatus.STOPPED
+
+        await self._role_to_notifier[role].notify()
diff --git a/python/xorbits/_mars/services/cluster/supervisor/service.py b/python/xorbits/_mars/services/cluster/supervisor/service.py
new file mode 100644
index 000000000..4344de160
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/supervisor/service.py
@@ -0,0 +1,109 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .... import oscar as mo
+from ...core import AbstractService, NodeRole
+from ..file_logger import FileLoggerActor
+from ..procinfo import ProcessInfoManagerActor
+from ..uploader import NodeInfoUploaderActor
+from .locator import SupervisorPeerLocatorActor
+from .node_allocator import NodeAllocatorActor
+from .node_info import NodeInfoCollectorActor
+
+
+class ClusterSupervisorService(AbstractService):
+    """
+    Cluster service on supervisor
+
+    Service Configuration
+    ---------------------
+    {
+        "cluster": {
+            "backend": "",
+            "lookup_address": "
", + "node_timeout": timeout seconds of nodes, + "node_check_interval": check interval seconds for nodes + } + } + """ + + async def start(self): + svc_config = self._config["cluster"] + address = self._address + + backend = svc_config.get("backend", "fixed") + lookup_address = svc_config.get( + "lookup_address", address if backend == "fixed" else None + ) + await mo.create_actor( + NodeInfoCollectorActor, + timeout=svc_config.get("node_timeout"), + check_interval=svc_config.get("node_check_interval"), + uid=NodeInfoCollectorActor.default_uid(), + address=address, + ) + await mo.create_actor( + SupervisorPeerLocatorActor, + backend_name=backend, + lookup_address=lookup_address, + uid=SupervisorPeerLocatorActor.default_uid(), + address=address, + ) + await mo.create_actor( + NodeInfoUploaderActor, + role=NodeRole.SUPERVISOR, + interval=svc_config.get("node_check_interval"), + uid=NodeInfoUploaderActor.default_uid(), + address=address, + ) + await mo.create_actor( + NodeAllocatorActor, + backend_name=backend, + lookup_address=lookup_address, + uid=NodeAllocatorActor.default_uid(), + address=address, + ) + await mo.create_actor( + ProcessInfoManagerActor, + uid=ProcessInfoManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + FileLoggerActor, uid=FileLoggerActor.default_uid(), address=address + ) + + async def stop(self): + address = self._address + + await mo.destroy_actor( + mo.create_actor_ref( + uid=NodeInfoCollectorActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref( + uid=SupervisorPeerLocatorActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref( + uid=NodeInfoUploaderActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref(uid=NodeAllocatorActor.default_uid(), address=address) + ) + await mo.destroy_actor( + mo.create_actor_ref(uid=FileLoggerActor.default_uid(), address=address) + ) diff --git a/python/xorbits/_mars/services/cluster/tests/__init__.py b/python/xorbits/_mars/services/cluster/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/cluster/tests/backend.py b/python/xorbits/_mars/services/cluster/tests/backend.py new file mode 100644 index 000000000..e7be76fb1 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/backend.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import os +from typing import AsyncGenerator, List, Optional + +from ... import NodeRole +from ...cluster.backends import AbstractClusterBackend, register_cluster_backend + +logger = logging.getLogger(__name__) + + +@register_cluster_backend +class TestClusterBackend(AbstractClusterBackend): + name = "test" + + def __init__(self, file_path: str): + self._file_path = file_path + self._modify_date = os.path.getmtime(file_path) + + @classmethod + async def create( + cls, node_role: NodeRole, lookup_address: Optional[str], pool_address: str + ) -> "AbstractClusterBackend": + return TestClusterBackend(lookup_address) + + async def get_supervisors(self, filter_ready: bool = True) -> List[str]: + with open(self._file_path, "r") as inp_file: + result = [] + for line in inp_file.read().strip().splitlines(False): + line_parts = line.rsplit(",", 1) + if len(line_parts) == 1 or (filter_ready and int(line_parts[1])): + result.append(line_parts[0]) + return result + + async def watch_supervisors(self) -> AsyncGenerator[List[str], None]: + while True: + mtime = os.path.getmtime(self._file_path) + if mtime != self._modify_date: + self._modify_date = mtime + yield await self.get_supervisors() + await asyncio.sleep(0.1) + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> str: + raise NotImplementedError + + async def release_worker(self, address: str): + raise NotImplementedError + + async def reconstruct_worker(self, address: str): + raise NotImplementedError diff --git a/python/xorbits/_mars/services/cluster/tests/test_api.py b/python/xorbits/_mars/services/cluster/tests/test_api.py new file mode 100644 index 000000000..9dc77a61d --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_api.py @@ -0,0 +1,201 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +import os +import tempfile + +import pytest + +from .... import oscar as mo +from ....constants import MARS_LOG_PATH_KEY, MARS_LOG_PREFIX, MARS_TMP_DIR_PREFIX +from ....utils import clean_mars_tmp_dir, get_next_port +from ... 
import NodeRole +from ...web.supervisor import WebSupervisorService +from ..api import ClusterAPI, MockClusterAPI, WebClusterAPI +from ..api.web import web_handlers +from ..core import NodeStatus + + +@pytest.fixture +async def actor_pool(): + # prepare + mars_tmp_dir = tempfile.mkdtemp(prefix=MARS_TMP_DIR_PREFIX) + _, file_path = tempfile.mkstemp(prefix=MARS_LOG_PREFIX, dir=mars_tmp_dir) + os.environ[MARS_LOG_PATH_KEY] = file_path + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + async with pool: + yield pool + + # clean + clean_mars_tmp_dir() + + +class TestActor(mo.Actor): + __test__ = False + + +async def wait_async_gen(async_gen): + async for _ in async_gen: + pass + + +@pytest.mark.asyncio +async def test_api(actor_pool): + pool_addr = actor_pool.external_address + api = await MockClusterAPI.create(pool_addr, upload_interval=0.1) + + assert await api.get_supervisors() == [pool_addr] + + assert pool_addr in await api.get_supervisors_by_keys(["test_mock"]) + + await mo.create_actor(TestActor, uid=TestActor.default_uid(), address=pool_addr) + assert (await api.get_supervisor_refs([TestActor.default_uid()]))[ + 0 + ].address == pool_addr + + bands = await api.get_all_bands() + assert (pool_addr, "numa-0") in bands + + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(wait_async_gen(api.watch_supervisors()), timeout=0.1) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for( + wait_async_gen(api.watch_supervisor_refs([TestActor.default_uid()])), + timeout=0.1, + ) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for( + wait_async_gen( + api.watch_nodes(NodeRole.WORKER, statuses={NodeStatus.READY}) + ), + timeout=0.1, + ) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for( + wait_async_gen(api.watch_all_bands(statuses={NodeStatus.READY})), + timeout=0.1, + ) + with pytest.raises(NotImplementedError): + await api.request_worker(timeout=1) + with pytest.raises(NotImplementedError): + await api.release_worker("127.0.0.1:1234") + + await api.set_node_status(pool_addr, NodeRole.WORKER, NodeStatus.STOPPING) + assert {} == await api.get_all_bands() + assert {} == await api.get_nodes_info(role=NodeRole.WORKER) + bands = await api.get_all_bands(exclude_statuses={NodeStatus.STOPPED}) + assert (pool_addr, "numa-0") in bands + assert pool_addr in await api.get_nodes_info( + role=NodeRole.WORKER, exclude_statuses={NodeStatus.STOPPED} + ) + + log_ref = await api._get_log_ref() + assert log_ref is not None + + content = await api.fetch_node_log(size=10, address=pool_addr) + assert "" == content + content = await api.fetch_node_log(size=-1, address=pool_addr) + assert type(content) is str + assert "" == content + + await MockClusterAPI.cleanup(pool_addr) + + +@pytest.mark.asyncio +async def test_web_api(actor_pool): + pool_addr = actor_pool.external_address + await MockClusterAPI.create(pool_addr, upload_interval=0.1) + + web_config = { + "web": { + "host": "127.0.0.1", + "port": get_next_port(), + "web_handlers": web_handlers, + } + } + web_service = WebSupervisorService(web_config, pool_addr) + await web_service.start() + + web_api = WebClusterAPI(f'http://127.0.0.1:{web_config["web"]["port"]}') + assert await web_api.get_supervisors() == [pool_addr] + + assert len(await web_api.get_all_bands(statuses={NodeStatus.READY})) > 0 + nodes = await web_api.get_nodes_info( + role=NodeRole.WORKER, statuses={NodeStatus.READY} + ) + assert len(nodes) > 0 + + from .... 
import __version__ as mars_version + + assert await web_api.get_mars_versions() == [mars_version] + + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(wait_async_gen(web_api.watch_supervisors()), timeout=0.1) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for( + wait_async_gen(web_api.watch_nodes(NodeRole.WORKER)), timeout=0.1 + ) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(wait_async_gen(web_api.watch_all_bands()), timeout=0.1) + + proc_info = await web_api.get_node_pool_configs(pool_addr) + assert len(proc_info) > 0 + stacks = await web_api.get_node_thread_stacks(pool_addr) + assert len(stacks) > 0 + + log_content = await web_api.fetch_node_log(size=None, address=pool_addr) + assert len(log_content) == 0 + + log_content = await web_api.fetch_node_log(size=5, address=pool_addr) + assert len(log_content) == 0 + + log_content = await web_api.fetch_node_log(size=-1, address=pool_addr) + assert type(log_content) is str + assert len(log_content) == 0 + + log_file = os.environ[MARS_LOG_PATH_KEY] + with open(log_file, "w") as f: + f.write("foo bar baz") + log_content = await web_api.fetch_node_log(size=-1, address=pool_addr) + assert len(log_content) == 11 + + await MockClusterAPI.cleanup(pool_addr) + + +@pytest.mark.asyncio +async def test_no_supervisor(actor_pool): + pool_addr = actor_pool.external_address + + from ..supervisor.locator import SupervisorPeerLocatorActor + from ..uploader import NodeInfoUploaderActor + + await mo.create_actor( + SupervisorPeerLocatorActor, + "fixed", + [], + uid=SupervisorPeerLocatorActor.default_uid(), + address=pool_addr, + ) + await mo.create_actor( + NodeInfoUploaderActor, + NodeRole.WORKER, + interval=1, + band_to_resource=None, + use_gpu=False, + uid=NodeInfoUploaderActor.default_uid(), + address=pool_addr, + ) + api = await ClusterAPI.create(address=pool_addr) + with pytest.raises(mo.ActorNotExist): + await api.get_supervisor_refs(["KEY"]) diff --git a/python/xorbits/_mars/services/cluster/tests/test_file_logger.py b/python/xorbits/_mars/services/cluster/tests/test_file_logger.py new file mode 100644 index 000000000..0d9ff091a --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_file_logger.py @@ -0,0 +1,93 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import tempfile + +import pytest + +from .... 
import oscar as mo +from ....constants import MARS_LOG_PATH_KEY, MARS_LOG_PREFIX, MARS_TMP_DIR_PREFIX +from ....utils import clean_mars_tmp_dir +from ..file_logger import FileLoggerActor + +full_content = "qwert\nasdfg\nzxcvb\nyuiop\nhjkl;\nnm,./" + + +@pytest.fixture +async def actor_pool(): + # prepare + mars_tmp_dir = tempfile.mkdtemp(prefix=MARS_TMP_DIR_PREFIX) + _, file_path = tempfile.mkstemp(prefix=MARS_LOG_PREFIX, dir=mars_tmp_dir) + os.environ[MARS_LOG_PATH_KEY] = file_path + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + async with pool: + yield pool + + # clean + clean_mars_tmp_dir() + + +@pytest.mark.asyncio +async def test_file_logger(actor_pool): + pool_addr = actor_pool.external_address + logger_ref = await mo.create_actor( + FileLoggerActor, + uid=FileLoggerActor.default_uid(), + address=pool_addr, + ) + + filename = os.environ.get(MARS_LOG_PATH_KEY) + with open(filename, "w", newline="\n") as f: + f.write(full_content) + + byte_num = 5 + expected_data = "" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = 6 + expected_data = "nm,./" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = 11 + expected_data = "nm,./" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = 12 + expected_data = "hjkl;\nnm,./" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = 50 + expected_data = "qwert\nasdfg\nzxcvb\nyuiop\nhjkl;\nnm,./" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = -1 + expected_data = "qwert\nasdfg\nzxcvb\nyuiop\nhjkl;\nnm,./" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = -1 + offset = 1 + expected_data = "wert\nasdfg\nzxcvb\nyuiop\nhjkl;\nnm,./" + content = await logger_ref.fetch_logs(byte_num, offset) + assert content == expected_data + + offset = 35 + expected_data = "" + content = await logger_ref.fetch_logs(byte_num, offset) + assert content == expected_data diff --git a/python/xorbits/_mars/services/cluster/tests/test_gather.py b/python/xorbits/_mars/services/cluster/tests/test_gather.py new file mode 100644 index 000000000..5c5c10b8d --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_gather.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time + +from .. 
import DiskInfo +from ..gather import gather_node_details, gather_node_env, gather_node_resource + + +def test_gather_node_env(): + node_env = gather_node_env() + band_data = node_env["bands"]["numa-0"] + assert band_data["resources"]["cpu"] > 0 + assert band_data["resources"]["memory"] > 0 + + +def test_gather_node_resource(): + node_res = gather_node_resource() + band_res = node_res["numa-0"] + assert band_res["cpu_total"] >= band_res["cpu_avail"] + assert band_res["memory_total"] >= band_res["memory_avail"] + + +def test_gather_node_details(): + gather_node_details() + time.sleep(0.1) + node_details = gather_node_details() + assert not node_details["disk"].get("partitions") + + curdir = os.path.dirname(os.path.abspath(__file__)) + gather_node_details(disk_infos=[DiskInfo(path=curdir)]) + time.sleep(0.1) + node_details = gather_node_details(disk_infos=[DiskInfo(path=curdir)]) + assert node_details["disk"].get("partitions") diff --git a/python/xorbits/_mars/services/cluster/tests/test_locator.py b/python/xorbits/_mars/services/cluster/tests/test_locator.py new file mode 100644 index 000000000..c0b1b9920 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_locator.py @@ -0,0 +1,213 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import tempfile +from typing import List + +import pytest + +from .... 
import oscar as mo +from ....tests.core import flaky +from ....utils import Timer +from ..core import NodeRole, NodeStatus +from ..supervisor.locator import SupervisorPeerLocatorActor +from ..supervisor.node_info import NodeInfoCollectorActor +from ..tests import backend +from ..worker.locator import WorkerSupervisorLocatorActor + +del backend + + +class MockNodeInfoCollectorActor(mo.Actor): + def __init__(self): + self._node_infos = dict() + self._version = 0 + + def set_all_node_infos(self, node_infos): + self._node_infos = node_infos + + def get_nodes_info(self, *args, **kwargs): + return self._node_infos + + async def watch_nodes(self, *args, version=None, **kwargs): + await asyncio.sleep(0.5) + self._version += 1 + return self._version, self._node_infos + + def put_starting_nodes(self, nodes: List[str], role: NodeRole): + for node in nodes: + self._node_infos[node] = NodeStatus.STARTING + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + async with pool: + await mo.create_actor( + MockNodeInfoCollectorActor, + uid=NodeInfoCollectorActor.default_uid(), + address=pool.external_address, + ) + yield pool + + +@pytest.mark.asyncio +async def test_fixed_locator(actor_pool): + addresses = ["1.2.3.4:1234", "1.2.3.4:1235", "1.2.3.4:1236", "1.2.3.4:1237"] + locator_ref = await mo.create_actor( + SupervisorPeerLocatorActor, + "fixed", + ",".join(addresses), + address=actor_pool.external_address, + ) + + assert await locator_ref.get_supervisor("mock_name") in addresses + + dbl_addrs = await locator_ref.get_supervisor("mock_name", 2) + assert len(dbl_addrs) == 2 + assert all(addr in addresses for addr in dbl_addrs) + + with Timer() as timer: + await locator_ref.wait_all_supervisors_ready() + assert timer.duration < 0.1 + + await mo.destroy_actor(locator_ref) + + +@pytest.fixture +def temp_address_file(): + with tempfile.TemporaryDirectory(prefix="mars-test") as dir_name: + yield os.path.join(dir_name, "addresses") + + +@flaky(max_runs=3) +@pytest.mark.asyncio +async def test_supervisor_peer_locator(actor_pool, temp_address_file): + addresses = ["1.2.3.4:1234", "1.2.3.4:1235", "1.2.3.4:1236", "1.2.3.4:1237"] + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses)) + + locator_ref = await mo.create_actor( + SupervisorPeerLocatorActor, + "test", + temp_address_file, + uid=SupervisorPeerLocatorActor.default_uid(), + address=actor_pool.external_address, + ) + + # test starting nodes filled + info_ref = await mo.actor_ref( + uid=NodeInfoCollectorActor.default_uid(), address=actor_pool.external_address + ) + assert set(await info_ref.get_nodes_info()) == set(addresses) + + # test watch nodes changes + version, result = await asyncio.wait_for( + locator_ref.watch_supervisors_by_keys(["mock_name"]), + timeout=30, + ) + assert result[0] in addresses + + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses[2:])) + + version, result = await asyncio.wait_for( + locator_ref.watch_supervisors_by_keys(["mock_name"], version=version), + timeout=30, + ) + assert result[0] in addresses[2:] + + # test wait all supervisors ready + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(f"{a},{idx % 2}" for idx, a in enumerate(addresses))) + + async def delay_read_fun(): + await asyncio.sleep(0.2) + with open(temp_address_file, "w") as file_obj: + file_obj.write( + "\n".join(f"{a},{(idx + 1) % 2}" for idx, a in enumerate(addresses)) + ) + await asyncio.sleep(0.5) + with 
open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses)) + + asyncio.create_task(delay_read_fun()) + + with Timer() as timer: + await asyncio.wait_for(locator_ref.wait_all_supervisors_ready(), timeout=30) + assert timer.duration > 0.4 + + await mo.destroy_actor(locator_ref) + + +@flaky(max_runs=3) +@pytest.mark.asyncio +async def test_worker_supervisor_locator(actor_pool, temp_address_file): + addresses = [actor_pool.external_address] + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses)) + + locator_ref = await mo.create_actor( + WorkerSupervisorLocatorActor, + "test", + temp_address_file, + uid=WorkerSupervisorLocatorActor.default_uid(), + address=actor_pool.external_address, + ) + + info_ref = await mo.actor_ref( + uid=NodeInfoCollectorActor.default_uid(), address=actor_pool.external_address + ) + await info_ref.set_all_node_infos({actor_pool.external_address: NodeStatus.READY}) + + # test watch nodes changes + supervisors = await locator_ref.get_supervisors(filter_ready=False) + assert supervisors == addresses + version, result = await asyncio.wait_for( + locator_ref.watch_supervisors_by_keys(["mock_name"]), + timeout=30, + ) + assert result[0] in addresses + + # test watch without NodeInfoCollectorActor + await info_ref.destroy() + + addresses = ["localhost:1234", "localhost:1235"] + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses)) + version, result = await asyncio.wait_for( + locator_ref.watch_supervisors_by_keys(["mock_name"], version=version), + timeout=30, + ) + assert result[0] in addresses + + # test watch when NodeInfoCollectorActor is created again + info_ref = await mo.create_actor( + MockNodeInfoCollectorActor, + uid=NodeInfoCollectorActor.default_uid(), + address=actor_pool.external_address, + ) + await info_ref.set_all_node_infos({actor_pool.external_address: NodeStatus.READY}) + + addresses = [actor_pool.external_address] + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses)) + + version, result = await asyncio.wait_for( + locator_ref.watch_supervisors_by_keys(["mock_name"], version=version), + timeout=30, + ) + assert result[0] in addresses diff --git a/python/xorbits/_mars/services/cluster/tests/test_procinfo.py b/python/xorbits/_mars/services/cluster/tests/test_procinfo.py new file mode 100644 index 000000000..697ed0246 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_procinfo.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from .... 
import oscar as mo +from ..procinfo import ProcessInfoManagerActor + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool( + "127.0.0.1", n_process=2, labels=["main", "numa-0", "gpu-0"] + ) + async with pool: + yield pool + + +@pytest.mark.asyncio +async def test_proc_info(actor_pool): + address = actor_pool.external_address + manager_ref = await mo.create_actor( + ProcessInfoManagerActor, + uid=ProcessInfoManagerActor.default_uid(), + address=address, + ) # type: ProcessInfoManagerActor | mo.ActorRef + pool_cfgs = await manager_ref.get_pool_configs() + for cfg, expect_label in zip(pool_cfgs, ["main", "numa-0", "gpu-0"]): + assert cfg["label"] == expect_label + stacks = await manager_ref.get_thread_stacks() + assert len(stacks) == len(pool_cfgs) diff --git a/python/xorbits/_mars/services/cluster/tests/test_service.py b/python/xorbits/_mars/services/cluster/tests/test_service.py new file mode 100644 index 000000000..3e1e0160b --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_service.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os + +import pytest + +from .... import oscar as mo +from ....storage import StorageLevel +from ... import NodeRole, start_services, stop_services +from .. 
import ClusterAPI, DiskInfo, QuotaInfo, StorageInfo, WorkerSlotInfo + + +@pytest.fixture +async def actor_pools(): + async def start_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + await pool.start() + return pool + + sv_pool, worker_pool = await asyncio.gather(start_pool(), start_pool()) + try: + yield sv_pool, worker_pool + finally: + await asyncio.gather(sv_pool.stop(), worker_pool.stop()) + + +@pytest.mark.asyncio +async def test_cluster_service(actor_pools): + sv_pool, worker_pool = actor_pools + + config = { + "services": ["cluster"], + "cluster": { + "backend": "fixed", + "lookup_address": sv_pool.external_address, + }, + } + await start_services(NodeRole.SUPERVISOR, config, address=sv_pool.external_address) + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + sv_api = await ClusterAPI.create(sv_pool.external_address) + worker_api = await ClusterAPI.create(worker_pool.external_address) + + await worker_api.set_band_quota_info( + "numa-0", QuotaInfo(quota_size=1024, allocated_size=100, hold_size=100) + ) + await worker_api.set_band_slot_infos( + "numa-0", + [ + WorkerSlotInfo( + slot_id=0, + session_id="test_session", + subtask_id="test_subtask", + processor_usage=1.0, + ) + ], + ) + await worker_api.set_band_storage_info( + "numa-0", + StorageInfo(storage_level=StorageLevel.MEMORY, total_size=1024, used_size=512), + ) + curdir = os.path.dirname(os.path.abspath(__file__)) + await worker_api.set_node_disk_info([DiskInfo(path=curdir)]) + await asyncio.sleep(1.5) + + assert ( + next(iter(await sv_api.get_nodes_info(role=NodeRole.SUPERVISOR))) + == sv_pool.external_address + ) + worker_infos = await sv_api.get_nodes_info(role=NodeRole.WORKER, detail=True) + assert worker_pool.external_address in worker_infos + + info_details = worker_infos[worker_pool.external_address]["detail"] + assert len(info_details["disk"]["partitions"]) > 0 + assert len(info_details["slot"]) > 0 + assert len(info_details["quota"]) > 0 + assert len(info_details["storage"]) > 0 + + await stop_services(NodeRole.WORKER, config, address=worker_pool.external_address) + await stop_services(NodeRole.SUPERVISOR, config, address=sv_pool.external_address) diff --git a/python/xorbits/_mars/services/cluster/tests/test_uploader.py b/python/xorbits/_mars/services/cluster/tests/test_uploader.py new file mode 100644 index 000000000..947bbd383 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_uploader.py @@ -0,0 +1,88 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio + +import pytest + +from .... import oscar as mo +from ... 
import NodeRole +from ..supervisor.locator import SupervisorPeerLocatorActor +from ..supervisor.node_info import NodeInfoCollectorActor +from ..uploader import NodeInfoUploaderActor + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + async with pool: + yield pool + + +@pytest.mark.asyncio +async def test_uploader(actor_pool): + pool_addr = actor_pool.external_address + await mo.create_actor( + SupervisorPeerLocatorActor, + "fixed", + pool_addr, + uid=SupervisorPeerLocatorActor.default_uid(), + address=pool_addr, + ) + node_info_ref = await mo.create_actor( + NodeInfoCollectorActor, + timeout=0.5, + check_interval=0.1, + uid=NodeInfoCollectorActor.default_uid(), + address=pool_addr, + ) + uploader_ref = await mo.create_actor( + NodeInfoUploaderActor, + role=NodeRole.WORKER, + interval=0.1, + uid=NodeInfoUploaderActor.default_uid(), + address=pool_addr, + ) + wait_ready_task = asyncio.create_task(uploader_ref.wait_node_ready()) + await uploader_ref.mark_node_ready() + await asyncio.wait_for(wait_ready_task, timeout=0.1) + + # test empty result + result = await node_info_ref.get_nodes_info(role=NodeRole.WORKER) + assert pool_addr in result + assert all(result[pool_addr].get(k) is None for k in ("env", "resource", "detail")) + + result = await node_info_ref.get_nodes_info( + role=NodeRole.WORKER, env=True, resource=True, detail=True + ) + assert pool_addr in result + assert all( + result[pool_addr].get(k) is not None for k in ("env", "resource", "detail") + ) + + async def watcher(): + version = None + while True: + version, infos = await node_info_ref.watch_nodes( + NodeRole.WORKER, version=version + ) + if not infos: + break + + watch_task = asyncio.create_task(watcher()) + + await uploader_ref.destroy() + assert not await asyncio.wait_for(watch_task, timeout=5) + + await node_info_ref.destroy() diff --git a/python/xorbits/_mars/services/cluster/uploader.py b/python/xorbits/_mars/services/cluster/uploader.py new file mode 100644 index 000000000..86b2b2655 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/uploader.py @@ -0,0 +1,196 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from collections import defaultdict +from typing import Dict, List + +from ... 
import oscar as mo +from ...lib.aio import alru_cache +from ...resource import Resource +from ...storage import StorageLevel +from ...typing import BandType +from .core import DiskInfo, NodeInfo, NodeStatus, QuotaInfo, StorageInfo, WorkerSlotInfo +from .gather import gather_node_details, gather_node_env, gather_node_resource + +logger = logging.getLogger(__name__) + +DEFAULT_INFO_UPLOAD_INTERVAL = 1 + + +class NodeInfoUploaderActor(mo.Actor): + _band_slot_infos: Dict[str, List[WorkerSlotInfo]] + _band_quota_infos: Dict[str, QuotaInfo] + _disk_infos: List[DiskInfo] + _band_storage_infos: Dict[str, Dict[StorageLevel, StorageInfo]] + + def __init__(self, role=None, interval=None, band_to_resource=None, use_gpu=True): + self._info = NodeInfo(role=role) + + self._env_uploaded = False + self._band_to_resource = band_to_resource + + self._interval = interval or DEFAULT_INFO_UPLOAD_INTERVAL + self._upload_task = None + self._upload_enabled = False + self._uploaded_future = asyncio.Future() + self._node_ready_event = asyncio.Event() + + self._use_gpu = use_gpu + + self._band_slot_infos = dict() + self._band_quota_infos = dict() + self._band_storage_infos = defaultdict(dict) + self._disk_infos = [] + + async def __post_create__(self): + self._upload_task = asyncio.create_task(self._periodical_upload_node_info()) + await self._uploaded_future + + async def __pre_destroy__(self): + self._upload_task.cancel() + + @alru_cache(cache_exceptions=False) + async def _get_node_info_ref(self): + from .locator import SupervisorLocatorActor + from .supervisor.node_info import NodeInfoCollectorActor + + locator_ref = await mo.actor_ref( + SupervisorLocatorActor.default_uid(), address=self.address + ) + supervisor_addr = await locator_ref.get_supervisor( + NodeInfoCollectorActor.default_uid() + ) + if supervisor_addr is None: + raise ValueError + + return await mo.actor_ref( + NodeInfoCollectorActor.default_uid(), address=supervisor_addr + ) + + async def _periodical_upload_node_info(self): + while True: + try: + await self.upload_node_info() + if not self._uploaded_future.done(): + self._uploaded_future.set_result(None) + except asyncio.CancelledError: # pragma: no cover + break + except ( + Exception + ) as ex: # pragma: no cover # noqa: E722 # nosec # pylint: disable=bare-except + logger.error(f"Failed to upload node info: {ex}") + if not self._uploaded_future.done(): + self._uploaded_future.set_exception(ex) + try: + await asyncio.sleep(self._interval) + except asyncio.CancelledError: # pragma: no cover + break + + async def mark_node_ready(self): + self._upload_enabled = True + + while True: + try: + # upload info in time to reduce latency + await self.upload_node_info(status=NodeStatus.READY) + break + except (mo.ActorNotExist, ConnectionError): # pragma: no cover + await asyncio.sleep(1) + + self._node_ready_event.set() + + def is_node_ready(self): + return self._node_ready_event.is_set() + + async def wait_node_ready(self): + return self._node_ready_event.wait() + + async def upload_node_info(self, status: NodeStatus = None): + try: + if not self._info.env: + self._info.env = await asyncio.to_thread(gather_node_env) + self._info.detail.update( + await asyncio.to_thread( + gather_node_details, + disk_infos=self._disk_infos, + band_storage_infos=self._band_storage_infos, + band_slot_infos=self._band_slot_infos, + band_quota_infos=self._band_quota_infos, + ) + ) + + band_resources = await asyncio.to_thread( + gather_node_resource, self._band_to_resource, use_gpu=self._use_gpu + ) + + for band, res in 
band_resources.items(): + try: + res_dict = self._info.resource[band] + except KeyError: + res_dict = self._info.resource[band] = dict() + res_dict.update(res) + + if self._upload_enabled: + try: + node_info_ref = await self._get_node_info_ref() + if not self._env_uploaded: + status = status or NodeStatus.READY + await node_info_ref.update_node_info( + address=self.address, + role=self._info.role, + env=self._info.env if not self._env_uploaded else None, + resource=self._info.resource, + detail=self._info.detail, + status=status, + ) + self._env_uploaded = True + except ValueError: + pass + except RuntimeError as ex: # pragma: no cover + if "cannot schedule new futures after interpreter shutdown" not in str(ex): + # when atexit is triggered, the default pool might be shutdown + # and to_thread will fail + raise + except: # noqa: E722 # nosec # pylint: disable=bare-except # pragma: no cover + logger.exception(f"Failed to upload node info") + raise + + def get_bands(self) -> Dict[BandType, int]: + band_resource = dict() + for resource_type, info in self._info.resource.items(): + if resource_type.startswith("numa"): + # cpu + band_resource[(self.address, resource_type)] = Resource( + num_cpus=info["cpu_total"], mem_bytes=info["memory_total"] + ) + else: # pragma: no cover + assert resource_type.startswith("gpu") + band_resource[(self.address, resource_type)] = Resource( + num_gpus=info["gpu_total"] + ) + return band_resource + + def set_node_disk_info(self, node_disk_info: List[DiskInfo]): + self._disk_infos = node_disk_info + + def set_band_storage_info(self, band_name: str, storage_info: StorageInfo): + self._band_storage_infos[band_name][storage_info.storage_level] = storage_info + + def set_band_slot_infos(self, band_name, slot_infos: List[WorkerSlotInfo]): + self._band_slot_infos[band_name] = slot_infos + + def set_band_quota_info(self, band_name, quota_info: QuotaInfo): + self._band_quota_infos[band_name] = quota_info diff --git a/python/xorbits/_mars/services/cluster/worker/__init__.py b/python/xorbits/_mars/services/cluster/worker/__init__.py new file mode 100644 index 000000000..321fd0623 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/worker/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import ClusterWorkerService diff --git a/python/xorbits/_mars/services/cluster/worker/locator.py b/python/xorbits/_mars/services/cluster/worker/locator.py new file mode 100644 index 000000000..37b161ecc --- /dev/null +++ b/python/xorbits/_mars/services/cluster/worker/locator.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import List, Set + +from .... import oscar as mo +from ..core import NodeRole, NodeStatus +from ..locator import SupervisorLocatorActor + +logger = logging.getLogger(__name__) + + +class WorkerSupervisorLocatorActor(SupervisorLocatorActor): + _node_role = NodeRole.WORKER + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._node_info_ref = None + + @classmethod + def default_uid(cls): + return SupervisorLocatorActor.__name__ + + def _if_set_supervisors( + self, current_supervisors: Set[str], last_supervisors: Set[str] + ): + return current_supervisors != last_supervisors or self._node_info_ref is None + + async def _set_supervisors(self, supervisors: List[str]): + await super()._set_supervisors(supervisors) + if supervisors and self._node_info_ref is None: + from ..supervisor.node_info import NodeInfoCollectorActor + + supervisor_addr = self.get_supervisor(NodeInfoCollectorActor.default_uid()) + try: + self._node_info_ref = await mo.actor_ref( + uid=NodeInfoCollectorActor.default_uid(), address=supervisor_addr + ) + except (OSError, mo.ServerClosed, mo.ActorNotExist): + self._node_info_ref = None + + async def _get_supervisors_from_backend(self, filter_ready: bool = True): + try: + assert self._node_info_ref is not None + statuses = ( + {NodeStatus.READY} + if filter_ready + else {NodeStatus.READY, NodeStatus.STARTING} + ) + infos = await self._node_info_ref.get_nodes_info( + role=NodeRole.SUPERVISOR, statuses=statuses + ) + return list(infos) + except (AssertionError, OSError, mo.ServerClosed, mo.ActorNotExist): + self._node_info_ref = None + return await self._backend.get_supervisors(filter_ready=filter_ready) + + async def _watch_supervisor_from_node_info(self): + assert self._node_info_ref is not None + version = None + while True: + version, infos = await self._node_info_ref.watch_nodes( + role=NodeRole.SUPERVISOR, version=version + ) + yield list(infos) + + async def _watch_supervisors_from_backend(self): + while True: + try: + async for supervisors in self._watch_supervisor_from_node_info(): + yield supervisors + except (AssertionError, OSError, mo.ServerClosed, mo.ActorNotExist): + self._node_info_ref = None + + async for supervisors in self._backend.watch_supervisors(): + yield supervisors + if self._node_info_ref is not None: + break diff --git a/python/xorbits/_mars/services/cluster/worker/service.py b/python/xorbits/_mars/services/cluster/worker/service.py new file mode 100644 index 000000000..9222a2130 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/worker/service.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService, NodeRole +from ..file_logger import FileLoggerActor +from ..procinfo import ProcessInfoManagerActor +from ..uploader import NodeInfoUploaderActor +from .locator import WorkerSupervisorLocatorActor + + +class ClusterWorkerService(AbstractService): + """ + Cluster service on worker. + + Service Configuration + --------------------- + { + "disk_dirs": ["List of disk directories"], + "cluster": { + "backend": "<cluster backend name>", + "lookup_address": "<supervisor lookup address>
", + "node_check_interval": check interval seconds for nodes, + "resource": { + "numa-0": Resource(num_cpus=8, mem_bytes=1073741824), + "gpu-0": Resource(num_gpus=1) + } + } + } + """ + + async def start(self): + svc_config = self._config["cluster"] + address = self._address + + backend = svc_config.get("backend", "fixed") + lookup_address = svc_config.get( + "lookup_address", address if backend == "fixed" else None + ) + await mo.create_actor( + WorkerSupervisorLocatorActor, + backend_name=backend, + lookup_address=lookup_address, + uid=WorkerSupervisorLocatorActor.default_uid(), + address=address, + ) + await mo.create_actor( + NodeInfoUploaderActor, + role=NodeRole.WORKER, + interval=svc_config.get("node_check_interval"), + band_to_resource=svc_config.get("resource"), + uid=NodeInfoUploaderActor.default_uid(), + address=address, + ) + await mo.create_actor( + ProcessInfoManagerActor, + uid=ProcessInfoManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + FileLoggerActor, uid=FileLoggerActor.default_uid(), address=address + ) + + async def stop(self): + address = self._address + + await mo.destroy_actor( + mo.create_actor_ref( + uid=NodeInfoUploaderActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref( + uid=WorkerSupervisorLocatorActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref(uid=FileLoggerActor.default_uid(), address=address) + ) diff --git a/python/xorbits/_mars/services/context.py b/python/xorbits/_mars/services/context.py new file mode 100644 index 000000000..b646e8764 --- /dev/null +++ b/python/xorbits/_mars/services/context.py @@ -0,0 +1,301 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from collections import defaultdict +from functools import lru_cache +from typing import Dict, List + +from .. 
import oscar as mo +from ..core.context import Context +from ..lib.aio import new_isolation +from ..storage.base import StorageLevel +from ..typing import BandType, SessionType +from ..utils import implements, is_ray_address +from .cluster import ClusterAPI, NodeRole +from .meta import MetaAPI, WorkerMetaAPI +from .session import SessionAPI +from .storage import StorageAPI +from .subtask import SubtaskAPI + +logger = logging.getLogger(__name__) + + +class ThreadedServiceContext(Context): + _cluster_api: ClusterAPI + _session_api: SessionAPI + _meta_api: MetaAPI + _subtask_api: SubtaskAPI + + def __init__( + self, + session_id: str, + supervisor_address: str, + worker_address: str, + local_address: str, + loop: asyncio.AbstractEventLoop, + band: BandType = None, + ): + super().__init__( + session_id=session_id, + supervisor_address=supervisor_address, + worker_address=worker_address, + local_address=local_address, + band=band, + ) + self._loop = loop + # new isolation with current loop, + # so that session created in tile and execute + # can get the right isolation + new_isolation(loop=self._loop, threaded=False) + + self._running_session_id = None + self._running_op_key = None + + # APIs + self._cluster_api = None + self._session_api = None + self._meta_api = None + self._subtask_api = None + + async def init(self): + self._cluster_api = await ClusterAPI.create(self.supervisor_address) + self._session_api = await SessionAPI.create(self.supervisor_address) + self._meta_api = await MetaAPI.create(self.session_id, self.supervisor_address) + try: + self._subtask_api = await SubtaskAPI.create(self.local_address) + except mo.ActorNotExist: + pass + + def _call(self, coro): + fut = asyncio.run_coroutine_threadsafe(coro, self._loop) + return fut.result() + + @implements(Context.get_current_session) + def get_current_session(self) -> SessionType: + from ..deploy.oscar.session import new_session + + return new_session( + self.supervisor_address, self.session_id, new=False, default=False + ) + + @implements(Context.get_local_host_ip) + def get_local_host_ip(self) -> str: + local_address = self.local_address + if is_ray_address(local_address): + import ray + + return ray.util.get_node_ip_address() + else: + return local_address.split(":", 1)[0] + + @implements(Context.get_supervisor_addresses) + def get_supervisor_addresses(self) -> List[str]: + return self._call(self._cluster_api.get_supervisors()) + + @implements(Context.get_worker_addresses) + def get_worker_addresses(self) -> List[str]: + return list(self._call(self._cluster_api.get_nodes_info(role=NodeRole.WORKER))) + + @implements(Context.get_worker_bands) + def get_worker_bands(self) -> List[BandType]: + return list(self._call(self._cluster_api.get_all_bands(NodeRole.WORKER))) + + @implements(Context.get_total_n_cpu) + def get_total_n_cpu(self) -> int: + all_bands = self._call(self._cluster_api.get_all_bands()) + n_cpu = 0 + for band, resource in all_bands.items(): + _, band_name = band + if band_name.startswith("numa-"): + n_cpu += resource.num_cpus + return n_cpu + + @implements(Context.get_slots) + def get_slots(self) -> int: + worker_bands = self._call(self._get_worker_bands()) + resource = worker_bands[self.band] + return int(resource.num_cpus or resource.num_gpus) + + async def _get_worker_bands(self): + worker_cluster_api = await ClusterAPI.create(self.worker_address) + return await worker_cluster_api.get_bands() + + async def _get_chunks_meta( + self, data_keys: List[str], fields: List[str] = None, error: str = "raise" + ) -> 
List[Dict]: + # get chunks meta + get_metas = [] + for data_key in data_keys: + meta = self._meta_api.get_chunk_meta.delay( + data_key, fields=["bands"], error=error + ) + get_metas.append(meta) + supervisor_metas = await self._meta_api.get_chunk_meta.batch(*get_metas) + key_to_supervisor_metas = dict(zip(data_keys, supervisor_metas)) + api_to_keys_calls = defaultdict(lambda: (list(), list())) + for data_key, meta in zip(data_keys, supervisor_metas): + addr = meta["bands"][0][0] + worker_meta_api = await WorkerMetaAPI.create(self.session_id, addr) + keys, calls = api_to_keys_calls[worker_meta_api] + keys.append(data_key) + calls.append( + worker_meta_api.get_chunk_meta.delay( + data_key, fields=fields, error=error + ) + ) + coros = [] + for api, (keys, calls) in api_to_keys_calls.items(): + coros.append(api.get_chunk_meta.batch(*calls)) + all_metas = await asyncio.gather(*coros) + key_to_meta = dict() + for (keys, _), metas in zip(api_to_keys_calls.values(), all_metas): + for k, meta in zip(keys, metas): + meta["bands"] = key_to_supervisor_metas[k]["bands"] + key_to_meta[k] = meta + return [key_to_meta[k] for k in data_keys] + + async def _get_chunks_result(self, data_keys: List[str]) -> List: + metas = await self._get_chunks_meta(data_keys, fields=["bands"]) + addresses = [meta["bands"][0][0] for meta in metas] + + storage_api_to_gets = defaultdict(lambda: (list(), list())) + for data_key, address in zip(data_keys, addresses): + storage_api = await StorageAPI.create(self.session_id, address) + storage_api_to_gets[storage_api][0].append(data_key) + storage_api_to_gets[storage_api][1].append(storage_api.get.delay(data_key)) + results = dict() + for storage_api, (keys, gets) in storage_api_to_gets.items(): + chunks_data = await storage_api.get.batch(*gets) + for chunk_key, chunk_data in zip(keys, chunks_data): + results[chunk_key] = chunk_data + return [results[key] for key in data_keys] + + async def _fetch_chunks(self, data_keys: List[str]): + metas = await self._get_chunks_meta(data_keys, fields=["bands"]) + bands = [meta["bands"][0] for meta in metas] + + storage_api = await StorageAPI.create(self.session_id, self.local_address) + fetches = [] + for data_key, (address, band_name) in zip(data_keys, bands): + fetches.append( + storage_api.fetch.delay( + data_key, remote_address=address, band_name=band_name + ) + ) + await storage_api.fetch.batch(*fetches) + + @implements(Context.get_chunks_result) + def get_chunks_result(self, data_keys: List[str], fetch_only: bool = False) -> List: + if not fetch_only: + return self._call(self._get_chunks_result(data_keys)) + else: + return self._call(self._fetch_chunks(data_keys)) + + @implements(Context.get_chunks_meta) + def get_chunks_meta( + self, data_keys: List[str], fields: List[str] = None, error="raise" + ) -> List[Dict]: + return self._call(self._get_chunks_meta(data_keys, fields=fields, error=error)) + + async def _get_backend_info( + self, address: str = None, level: StorageLevel = StorageLevel.MEMORY + ) -> dict: + if address is None: + address = self.worker_address + storage_api = await StorageAPI.create(self.session_id, address) + return await storage_api.get_storage_info(level) + + @implements(Context.get_storage_info) + def get_storage_info( + self, address: str = None, level: StorageLevel = StorageLevel.MEMORY + ): + return self._call(self._get_backend_info(address, level)) + + @implements(Context.create_remote_object) + def create_remote_object(self, name: str, object_cls, *args, **kwargs): + ref = self._call( + 
self._session_api.create_remote_object( + self.session_id, name, object_cls, *args, **kwargs + ) + ) + return _RemoteObjectWrapper(ref, self._loop) + + @implements(Context.get_remote_object) + def get_remote_object(self, name: str): + ref = self._call(self._session_api.get_remote_object(self.session_id, name)) + return _RemoteObjectWrapper(ref, self._loop) + + @implements(Context.destroy_remote_object) + def destroy_remote_object(self, name: str): + return self._call( + self._session_api.destroy_remote_object(self.session_id, name) + ) + + @implements(Context.register_custom_log_path) + def register_custom_log_path( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key: str, + worker_address: str, + log_path: str, + ): + return self._call( + self._session_api.register_custom_log_path( + session_id, tileable_op_key, chunk_op_key, worker_address, log_path + ) + ) + + @implements(Context.new_custom_log_dir) + @lru_cache(50) + def new_custom_log_dir(self) -> str: + return self._call( + self._session_api.new_custom_log_dir(self.local_address, self.session_id) + ) + + def set_running_operand_key(self, session_id: str, op_key: str): + self._running_session_id = session_id + self._running_op_key = op_key + + def set_progress(self, progress: float): + if ( + self._running_op_key is None or self._subtask_api is None + ): # pragma: no cover + return + return self._call( + self._subtask_api.set_running_operand_progress( + session_id=self._running_session_id, + op_key=self._running_op_key, + slot_address=self.local_address, + progress=progress, + ) + ) + + +class _RemoteObjectWrapper: + def __init__(self, ref: mo.ActorRef, loop: asyncio.AbstractEventLoop): + self._ref = ref + self._loop = loop + + def __getattr__(self, attr): + func = getattr(self._ref, attr) + + def wrap(*args, **kwargs): + coro = func(*args, **kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, loop=self._loop) + return fut.result() + + return wrap diff --git a/python/xorbits/_mars/services/core.py b/python/xorbits/_mars/services/core.py new file mode 100644 index 000000000..8a25bf24f --- /dev/null +++ b/python/xorbits/_mars/services/core.py @@ -0,0 +1,201 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
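+# Rough sketch of how services are discovered and started (the service list and
+# address below are illustrative, not defaults): ``start_services`` walks
+# ``config["services"]``, imports ``<module>.<service>.supervisor`` or
+# ``<module>.<service>.worker`` depending on the node role, instantiates every
+# concrete ``AbstractService`` subclass found there and awaits ``start()`` on each.
+#
+#     config = {
+#         "services": ["cluster", "session", "meta", "lifecycle", "web"],
+#         "cluster": {"backend": "fixed", "lookup_address": "127.0.0.1:7777"},
+#     }
+#     await start_services(NodeRole.SUPERVISOR, config, address="127.0.0.1:7777")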
+ +import abc +import asyncio +import enum +import importlib +import inspect +import warnings +from typing import Dict, Iterable, List, Union + +_ModulesType = Union[List, str, None] + + +class NodeRole(enum.Enum): + SUPERVISOR = 0 + WORKER = 1 + + +class AbstractService(abc.ABC): + _instances = dict() + + def __init__(self, config: Dict, address: str): + self._config = config + self._address = address + + @classmethod + def get_instance(cls, address: str, config: Dict = None): + type_addr = (cls, address) + if type_addr not in cls._instances: + inst = cls._instances[type_addr] = cls(config, address) + else: + inst = cls._instances[type_addr] + return inst + + @classmethod + def clear(cls): + cls._instances = dict() + + @abc.abstractmethod + async def start(self): + raise NotImplementedError + + @abc.abstractmethod + async def stop(self): + raise NotImplementedError + + async def create_session(self, session_id: str): + pass + + async def destroy_session(self, session_id: str): + pass + + +class EmptyService(AbstractService): + async def start(self): + pass + + async def stop(self): + pass + + +def _find_service_entries(node_role: NodeRole, services: List, modules: List): + svc_entries_list = [] + + web_handlers = {} + for svc_names in services: + if isinstance(svc_names, str): + svc_names = [svc_names] + svc_entries = [] + for svc_name in svc_names: + svc_mod = None + for mod_name in modules: + try: + full_mod_name = f"{mod_name}.{svc_name}.{node_role.name.lower()}" + svc_mod = importlib.import_module(full_mod_name) + + abstract_derivatives = [] + valid_derivatives = [] + for attr_name in dir(svc_mod): + obj = getattr(svc_mod, attr_name) + if ( + obj is not AbstractService + and isinstance(obj, type) + and issubclass(obj, AbstractService) + ): + if inspect.isabstract(obj): + abstract_derivatives.append(obj) + else: + valid_derivatives.append(obj) + + svc_entries.extend(valid_derivatives) + if not valid_derivatives and abstract_derivatives: + warnings.warn( + f"Module {full_mod_name} does not have non-abstract " + f"service classes, but abstract classes " + f"{abstract_derivatives} found.", + RuntimeWarning, + ) + + try: + web_mod = importlib.import_module( + mod_name + "." 
+ svc_name + ".api.web" + ) + web_handlers.update(getattr(web_mod, "web_handlers", {})) + except ImportError: + pass + except ImportError: + pass + if svc_mod is None: + raise ImportError(f"Cannot discover {node_role} for service {svc_name}") + svc_entries_list.append(svc_entries) + + return svc_entries_list, web_handlers + + +def _normalize_modules(modules: _ModulesType): + if modules is None: + modules = [] + elif isinstance(modules, str): + modules = [modules] + else: + modules = list(modules) + modules = [__name__.rsplit(".", 1)[0]] + modules + return modules + + +def _iter_service_instances( + node_role: NodeRole, config: Dict, address: str = None, reverse: bool = False +) -> Iterable[List[AbstractService]]: + modules = _normalize_modules(config.get("modules")) + service_names = config["services"] + if reverse: + service_names = service_names[::-1] + + svc_entries_list, _ = _find_service_entries(node_role, service_names, modules) + for entries in svc_entries_list: + yield [svc_entry.get_instance(address, config) for svc_entry in entries] + + +async def start_services( + node_role: NodeRole, config: Dict, address: str = None, mark_ready: bool = True +): + modules = _normalize_modules(config.get("modules")) + + # discover services + service_names = config["services"] + + svc_entries_list, web_handlers = _find_service_entries( + node_role, service_names, modules + ) + + if "web" in service_names: + try: + web_config = config["web"] + except KeyError: + web_config = config["web"] = dict() + + web_config["web_handlers"] = web_handlers + + for entries in svc_entries_list: + instances = [svc_entry.get_instance(address, config) for svc_entry in entries] + await asyncio.gather(*[inst.start() for inst in instances]) + + if mark_ready and "cluster" in service_names: + from .cluster import ClusterAPI + + cluster_api = await ClusterAPI.create(address) + await cluster_api.mark_node_ready() + + +async def stop_services(node_role: NodeRole, config: Dict, address: str = None): + for instances in _iter_service_instances(node_role, config, address, reverse=True): + await asyncio.gather(*[inst.stop() for inst in instances]) + + AbstractService.clear() + + +async def create_service_session( + node_role: NodeRole, config: Dict, session_id: str = None, address: str = None +): + for instances in _iter_service_instances(node_role, config, address): + await asyncio.gather(*[inst.create_session(session_id) for inst in instances]) + + +async def destroy_service_session( + node_role: NodeRole, config: Dict, session_id: str = None, address: str = None +): + for instances in _iter_service_instances(node_role, config, address, reverse=True): + await asyncio.gather(*[inst.destroy_session(session_id) for inst in instances]) diff --git a/python/xorbits/_mars/services/lifecycle/__init__.py b/python/xorbits/_mars/services/lifecycle/__init__.py new file mode 100644 index 000000000..bd908f4fa --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import AbstractLifecycleAPI, LifecycleAPI, MockLifecycleAPI, WebLifecycleAPI +from .errors import TileableNotTracked diff --git a/python/xorbits/_mars/services/lifecycle/api/__init__.py b/python/xorbits/_mars/services/lifecycle/api/__init__.py new file mode 100644 index 000000000..579637d58 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractLifecycleAPI +from .oscar import LifecycleAPI, MockLifecycleAPI +from .web import WebLifecycleAPI diff --git a/python/xorbits/_mars/services/lifecycle/api/core.py b/python/xorbits/_mars/services/lifecycle/api/core.py new file mode 100644 index 000000000..fadcaf26e --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/api/core.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Dict, List + + +class AbstractLifecycleAPI(ABC): + @abstractmethod + async def decref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + """ + Decref tileables. + + Parameters + ---------- + tileable_keys : list + List of tileable keys. + counts: list + List of ref count. + """ + + @abstractmethod + async def get_all_chunk_ref_counts(self) -> Dict[str, int]: + """ + Get all chunk keys' ref counts. + + Returns + ------- + key_to_ref_counts: dict + """ diff --git a/python/xorbits/_mars/services/lifecycle/api/oscar.py b/python/xorbits/_mars/services/lifecycle/api/oscar.py new file mode 100644 index 000000000..02f85bd1a --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/api/oscar.py @@ -0,0 +1,188 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List + +from .... 
import oscar as mo +from ....lib.aio import alru_cache +from ..supervisor.tracker import LifecycleTrackerActor +from .core import AbstractLifecycleAPI + + +class LifecycleAPI(AbstractLifecycleAPI): + def __init__( + self, + session_id: str, + lifecycle_tracker_ref: mo.ActorRefType[LifecycleTrackerActor], + ): + self._session_id = session_id + self._lifecycle_tracker_ref = lifecycle_tracker_ref + + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls, session_id: str, address: str) -> "LifecycleAPI": + """ + Create Lifecycle API. + + Parameters + ---------- + session_id : str + Session ID. + address : str + Supervisor address. + + Returns + ------- + lifecycle_api + Lifecycle API. + """ + lifecycle_tracker_ref = await mo.actor_ref( + address, LifecycleTrackerActor.gen_uid(session_id) + ) + return LifecycleAPI(session_id, lifecycle_tracker_ref) + + @mo.extensible + async def track(self, tileable_key: str, chunk_keys: List[str]): + """ + Track tileable. + + Parameters + ---------- + tileable_key : str + Tileable key. + chunk_keys : list + List of chunk keys. + """ + return await self._lifecycle_tracker_ref.track(tileable_key, chunk_keys) + + @track.batch + async def batch_track(self, args_list, kwargs_list): + tracks = [] + for args, kwargs in zip(args_list, kwargs_list): + tracks.append(self._lifecycle_tracker_ref.track.delay(*args, **kwargs)) + return await self._lifecycle_tracker_ref.track.batch(*tracks) + + async def incref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + """ + Incref tileables. + + Parameters + ---------- + tileable_keys : list + List of tileable keys. + counts: list + List of ref count. + """ + return await self._lifecycle_tracker_ref.incref_tileables( + tileable_keys, counts=counts + ) + + async def decref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + """ + Decref tileables. + + Parameters + ---------- + tileable_keys : list + List of tileable keys. + counts: list + List of ref count. + """ + return await self._lifecycle_tracker_ref.decref_tileables(tileable_keys) + + async def get_tileable_ref_counts(self, tileable_keys: List[str]) -> List[int]: + """ + Get ref counts of tileables. + + Parameters + ---------- + tileable_keys : list + List of tileable keys. + + Returns + ------- + ref_counts : list + List of ref counts. + """ + return await self._lifecycle_tracker_ref.get_tileable_ref_counts(tileable_keys) + + async def incref_chunks(self, chunk_keys: List[str], counts: List[int] = None): + """ + Incref chunks. + + Parameters + ---------- + chunk_keys : list + List of chunk keys. + counts: list + List of ref count. + """ + return await self._lifecycle_tracker_ref.incref_chunks( + chunk_keys, counts=counts + ) + + async def decref_chunks(self, chunk_keys: List[str], counts: List[int] = None): + """ + Decref chunks + + Parameters + ---------- + chunk_keys : list + List of chunk keys. + counts: list + List of ref count. + """ + return await self._lifecycle_tracker_ref.decref_chunks( + chunk_keys, counts=counts + ) + + async def get_chunk_ref_counts(self, chunk_keys: List[str]) -> List[int]: + """ + Get ref counts of chunks. + + Parameters + ---------- + chunk_keys : list + List of chunk keys. + + Returns + ------- + ref_counts : list + List of ref counts. + """ + return await self._lifecycle_tracker_ref.get_chunk_ref_counts(chunk_keys) + + async def get_all_chunk_ref_counts(self) -> Dict[str, int]: + """ + Get all chunk keys' ref counts. 
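+ +        Only chunks whose current ref count is greater than zero are included.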
+ + Returns + ------- + key_to_ref_counts: dict + """ + return await self._lifecycle_tracker_ref.get_all_chunk_ref_counts() + + +class MockLifecycleAPI(LifecycleAPI): + @classmethod + async def create(cls, session_id: str, address: str) -> "LifecycleAPI": + from ..supervisor.service import LifecycleSupervisorService + + service = LifecycleSupervisorService({}, address) + await service.create_session(session_id) + return await super().create(session_id=session_id, address=address) diff --git a/python/xorbits/_mars/services/lifecycle/api/web.py b/python/xorbits/_mars/services/lifecycle/api/web.py new file mode 100644 index 000000000..870c0e6e5 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/api/web.py @@ -0,0 +1,78 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Dict, List + +from ....utils import deserialize_serializable, serialize_serializable +from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api +from .core import AbstractLifecycleAPI + + +class LifecycleWebAPIHandler(MarsServiceWebAPIHandler): + _root_pattern = "/api/session/(?P<session_id>[^/]+)/lifecycle" + + async def _get_oscar_lifecycle_api(self, session_id: str): + from .oscar import LifecycleAPI + + return await self._get_api_by_key(LifecycleAPI, session_id) + + @web_api("", method="post", arg_filter={"action": "decref_tileables"}) + async def decref_tileables(self, session_id: str): + tileable_keys = self.get_argument("tileable_keys").split(",") + counts = self.get_argument("counts", None) + if counts: + counts = [int(c) for c in counts.split(",")] + + oscar_api = await self._get_oscar_lifecycle_api(session_id) + await oscar_api.decref_tileables(tileable_keys, counts=counts) + + @web_api("", method="get", arg_filter={"action": "get_all_chunk_ref_counts"}) + async def get_all_chunk_ref_counts(self, session_id: str): + oscar_api = await self._get_oscar_lifecycle_api(session_id) + res = await oscar_api.get_all_chunk_ref_counts() + self.write(serialize_serializable(res)) + + +web_handlers = {LifecycleWebAPIHandler.get_root_pattern(): LifecycleWebAPIHandler} + + +class WebLifecycleAPI(AbstractLifecycleAPI, MarsWebAPIClientMixin): + def __init__( + self, session_id: str, address: str, request_rewriter: Callable = None + ): + self._session_id = session_id + self._address = address.rstrip("/") + self.request_rewriter = request_rewriter + + async def decref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + path = f"{self._address}/api/session/{self._session_id}/lifecycle" + params = dict(action="decref_tileables") + counts = ( + f"&counts={','.join(str(c) for c in counts)}" if counts is not None else "" + ) + await self._request_url( + path=path, + method="POST", + params=params, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + data="tileable_keys=" + ",".join(tileable_keys) + counts, + ) + + async def get_all_chunk_ref_counts(self) -> Dict[str, int]: + params = dict(action="get_all_chunk_ref_counts") +
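# issue a GET with an ``action`` query parameter; the handler above writes back a serialized dict which is deserialized below +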
path = f"{self._address}/api/session/{self._session_id}/lifecycle" + res = await self._request_url("GET", path, params=params) + return deserialize_serializable(res.body) diff --git a/python/xorbits/_mars/services/lifecycle/errors.py b/python/xorbits/_mars/services/lifecycle/errors.py new file mode 100644 index 000000000..4b020c556 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/errors.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core.base import MarsError + + +class TileableNotTracked(MarsError): + pass diff --git a/python/xorbits/_mars/services/lifecycle/supervisor/__init__.py b/python/xorbits/_mars/services/lifecycle/supervisor/__init__.py new file mode 100644 index 000000000..3491eec41 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/supervisor/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import LifecycleSupervisorService diff --git a/python/xorbits/_mars/services/lifecycle/supervisor/service.py b/python/xorbits/_mars/services/lifecycle/supervisor/service.py new file mode 100644 index 000000000..bc4985bea --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/supervisor/service.py @@ -0,0 +1,40 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... 
import oscar as mo +from ...core import AbstractService +from .tracker import LifecycleTrackerActor + + +class LifecycleSupervisorService(AbstractService): + async def start(self): + pass + + async def stop(self): + pass + + async def create_session(self, session_id: str): + await mo.create_actor( + LifecycleTrackerActor, + session_id, + address=self._address, + uid=LifecycleTrackerActor.gen_uid(session_id), + ) + + async def destroy_session(self, session_id: str): + await mo.destroy_actor( + mo.create_actor_ref( + uid=LifecycleTrackerActor.gen_uid(session_id), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/lifecycle/supervisor/tests/__init__.py b/python/xorbits/_mars/services/lifecycle/supervisor/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/supervisor/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/lifecycle/supervisor/tests/test_tracker.py b/python/xorbits/_mars/services/lifecycle/supervisor/tests/test_tracker.py new file mode 100644 index 000000000..c4bc49c25 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/supervisor/tests/test_tracker.py @@ -0,0 +1,110 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ..... import oscar as mo +from ..... import tensor as mt +from .....core import tile +from ....cluster import MockClusterAPI +from ....meta import MockMetaAPI +from ....session import MockSessionAPI +from ....storage import DataNotExist, MockStorageAPI +from ....task.supervisor.manager import TaskManagerActor +from ... 
import TileableNotTracked +from ...supervisor.tracker import LifecycleTrackerActor + + +class FakeTaskManager(TaskManagerActor): + def __init__(self, session_id: str): + super().__init__(session_id) + self._remove_tileables = [] + + async def __post_create__(self): + pass + + def remove_tileables(self, tileable_keys): + self._remove_tileables.extend(tileable_keys) + + def get_removed_tileables(self): + return self._remove_tileables + + +@pytest.mark.asyncio +async def test_tracker(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + addr = pool.external_address + session_id = "test_session" + await MockClusterAPI.create(addr) + await MockSessionAPI.create(addr, session_id=session_id) + meta_api = await MockMetaAPI.create(session_id, addr) + storage_api = await MockStorageAPI.create(session_id, addr) + + try: + task_manager = await mo.create_actor( + FakeTaskManager, + session_id, + uid=FakeTaskManager.gen_uid(session_id), + address=pool.external_address, + ) + + tracker = await mo.create_actor( + LifecycleTrackerActor, + session_id, + uid=LifecycleTrackerActor.gen_uid(session_id), + address=pool.external_address, + ) + + t = mt.random.rand(15, 5, chunk_size=5) + t = tile(t) + + tileable_key = t.key + chunk_keys = [] + for c in t.chunks: + chunk_keys.append(c.key) + await meta_api.set_chunk_meta(c, bands=[(addr, "numa-0")]) + await storage_api.put(c.key, np.random.rand(5, 5)) + + await tracker.track(tileable_key, chunk_keys) + await tracker.incref_tileables([tileable_key]) + await tracker.incref_tileables([tileable_key], [2]) + await tracker.incref_chunks(chunk_keys[:2]) + await tracker.incref_chunks(chunk_keys[:2], [3, 3]) + await tracker.decref_chunks(chunk_keys[:2]) + await tracker.decref_chunks(chunk_keys[:2], [3, 3]) + await tracker.decref_tileables([tileable_key]) + await tracker.decref_tileables([tileable_key], [2]) + assert len(await tracker.get_all_chunk_ref_counts()) == 0 + assert await task_manager.get_removed_tileables() == [tileable_key] + + with pytest.raises(ValueError): + await tracker.incref_tileables([tileable_key], [2, 3]) + + for chunk_key in chunk_keys: + with pytest.raises(KeyError): + await meta_api.get_chunk_meta(chunk_key) + for chunk_key in chunk_keys: + with pytest.raises(DataNotExist): + await storage_api.get(chunk_key) + + with pytest.raises(TileableNotTracked): + await tracker.incref_tileables(["not_tracked"]) + with pytest.raises(TileableNotTracked): + await tracker.decref_tileables(["not_tracked"]) + finally: + await MockStorageAPI.cleanup(pool.external_address) + await MockClusterAPI.cleanup(pool.external_address) diff --git a/python/xorbits/_mars/services/lifecycle/supervisor/tracker.py b/python/xorbits/_mars/services/lifecycle/supervisor/tracker.py new file mode 100644 index 000000000..00a8d6c7a --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/supervisor/tracker.py @@ -0,0 +1,258 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
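+# Reference-counting usage sketch (illustrative; the tracker is normally created
+# per session by LifecycleSupervisorService, and the key names below are made up):
+#
+#     tracker = await mo.create_actor(
+#         LifecycleTrackerActor, session_id,
+#         uid=LifecycleTrackerActor.gen_uid(session_id), address=supervisor_address,
+#     )
+#     await tracker.track(tileable_key, chunk_keys)   # register the tileable's chunks
+#     await tracker.incref_tileables([tileable_key])  # also increfs its chunks
+#     await tracker.decref_tileables([tileable_key])  # chunks hitting zero are removed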
+ +import asyncio +import itertools +import logging +from collections import defaultdict +from typing import Dict, List, Optional + +from .... import oscar as mo +from ....lib.aio import alru_cache +from ...meta.api import MetaAPI +from ...storage.api import StorageAPI +from ..errors import TileableNotTracked + +logger = logging.getLogger(__name__) + + +class LifecycleTrackerActor(mo.Actor): + _meta_api: MetaAPI + + def __init__(self, session_id: str): + self._session_id = session_id + self._tileable_key_to_chunk_keys = dict() + self._tileable_ref_counts = defaultdict(lambda: 0) + self._chunk_ref_counts = defaultdict(lambda: 0) + + self._meta_api: Optional[MetaAPI] = None + + async def __post_create__(self): + self._meta_api = await MetaAPI.create(self._session_id, self.address) + + async def __pre_destroy__(self): + chunk_keys = [ + chunk_key + for chunk_key, ref_count in self._chunk_ref_counts.items() + if ref_count > 0 + ] + # remove all chunks + await self._remove_chunks(chunk_keys) + + @alru_cache + async def _get_task_api(self): + from ...task.api import TaskAPI + + return await TaskAPI.create(self._session_id, self.address) + + @staticmethod + def gen_uid(session_id): + return f"{session_id}_lifecycle_tracker" + + def _track(self, tileable_key: str, chunk_keys: List[str]): + if tileable_key not in self._tileable_key_to_chunk_keys: + self._tileable_key_to_chunk_keys[tileable_key] = [] + chunk_keys_set = set(self._tileable_key_to_chunk_keys[tileable_key]) + incref_chunk_keys = [] + tileable_ref_count = self._tileable_ref_counts.get(tileable_key, 0) + for chunk_key in chunk_keys: + if chunk_key in chunk_keys_set: + continue + if tileable_ref_count > 0: + incref_chunk_keys.extend([chunk_key] * tileable_ref_count) + self._tileable_key_to_chunk_keys[tileable_key].append(chunk_key) + if incref_chunk_keys: + self._incref_chunks(incref_chunk_keys) + + @mo.extensible + async def track(self, tileable_key: str, chunk_keys: List[str]): + return await asyncio.to_thread(self._track, tileable_key, chunk_keys) + + @classmethod + def _check_ref_counts(cls, keys: List[str], ref_counts: List[int]): + if ref_counts is not None and len(keys) != len(ref_counts): + raise ValueError( + f"`ref_counts` should have same size as `keys`, expect {len(keys)}, got {len(ref_counts)}" + ) + + def _incref_chunks(self, chunk_keys: List[str], counts: List[int] = None): + counts = counts if counts is not None else itertools.repeat(1) + for chunk_key, count in zip(chunk_keys, counts): + self._chunk_ref_counts[chunk_key] += count + + async def incref_chunks(self, chunk_keys: List[str], counts: List[int] = None): + self._check_ref_counts(chunk_keys, counts) + return await asyncio.to_thread(self._incref_chunks, chunk_keys, counts=counts) + + def _get_remove_chunk_keys(self, chunk_keys: List[str], counts: List[int] = None): + to_remove_chunk_keys = [] + counts = counts if counts is not None else itertools.repeat(1) + for chunk_key, count in zip(chunk_keys, counts): + ref_count = self._chunk_ref_counts[chunk_key] + ref_count -= count + assert ref_count >= 0, f"chunk key {chunk_key} will have negative ref count" + self._chunk_ref_counts[chunk_key] = ref_count + if ref_count == 0: + # remove + to_remove_chunk_keys.append(chunk_key) + return to_remove_chunk_keys + + async def decref_chunks(self, chunk_keys: List[str], counts: List[int] = None): + self._check_ref_counts(chunk_keys, counts) + to_remove_chunk_keys = await asyncio.to_thread( + self._get_remove_chunk_keys, chunk_keys, counts=counts + ) + # make _remove_chunks release 
actor lock so that multiple `decref_chunks` can run concurrently. + yield self._remove_chunks(to_remove_chunk_keys) + + async def _remove_chunks(self, to_remove_chunk_keys: List[str]): + # get meta + logger.debug( + "Remove chunks %.500s with a refcount of zero", to_remove_chunk_keys + ) + get_metas = [] + for to_remove_chunk_key in to_remove_chunk_keys: + get_metas.append( + self._meta_api.get_chunk_meta.delay( + to_remove_chunk_key, fields=["bands"], error="ignore" + ) + ) + metas = await self._meta_api.get_chunk_meta.batch(*get_metas) + + # filter chunks that not exist + new_to_remove_chunk_keys = [] + new_metas = [] + for to_remove_chunk_key, meta in zip(to_remove_chunk_keys, metas): + if meta is not None: + new_to_remove_chunk_keys.append(to_remove_chunk_key) + new_metas.append(meta) + to_remove_chunk_keys = new_to_remove_chunk_keys + metas = new_metas + + all_bands = [meta["bands"] for meta in metas] + key_to_addresses = dict() + for to_remove_chunk_key, bands in zip(to_remove_chunk_keys, all_bands): + key_to_addresses[to_remove_chunk_key] = bands + + # remove data via storage API + storage_api_to_deletes = defaultdict(list) + for key, bands in key_to_addresses.items(): + for band in bands: + # storage API is cached for same arguments + storage_api = await StorageAPI.create( + self._session_id, band[0], band[1] + ) + storage_api_to_deletes[storage_api].append( + storage_api.delete.delay(key, error="ignore") + ) + await asyncio.gather( + *[ + storage_api.delete.batch(*deletes) + for storage_api, deletes in storage_api_to_deletes.items() + ] + ) + + # delete meta + delete_metas = [] + for to_remove_chunk_key in to_remove_chunk_keys: + delete_metas.append( + self._meta_api.del_chunk_meta.delay(to_remove_chunk_key) + ) + await self._meta_api.del_chunk_meta.batch(*delete_metas) + + def get_chunk_ref_counts(self, chunk_keys: List[str]) -> List[int]: + return [self._chunk_ref_counts[chunk_key] for chunk_key in chunk_keys] + + def get_all_chunk_ref_counts(self) -> Dict[str, int]: + result = dict() + for chunk_key, ref_count in self._chunk_ref_counts.items(): + if ref_count > 0: + result[chunk_key] = ref_count + return result + + def _incref_tileables(self, tileable_keys: List[str], counts: List[int] = None): + counts = counts if counts is not None else itertools.repeat(1) + for tileable_key, count in zip(tileable_keys, counts): + if tileable_key not in self._tileable_key_to_chunk_keys: + raise TileableNotTracked(f"tileable {tileable_key} not tracked before") + self._tileable_ref_counts[tileable_key] += count + incref_chunk_keys = self._tileable_key_to_chunk_keys[tileable_key] + # incref chunks for this tileable + logger.debug( + "Incref chunks %.500s while increfing tileable %s", + incref_chunk_keys, + tileable_key, + ) + chunk_counts = None if count == 1 else [count] * len(incref_chunk_keys) + self._incref_chunks(incref_chunk_keys, counts=chunk_counts) + + async def incref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + self._check_ref_counts(tileable_keys, counts) + return await asyncio.to_thread( + self._incref_tileables, tileable_keys, counts=counts + ) + + def _get_decref_chunk_keys( + self, tileable_keys: List[str], counts: List[int] = None + ) -> Dict[str, int]: + decref_chunk_keys = dict() + counts = counts if counts is not None else itertools.repeat(1) + for tileable_key, count in zip(tileable_keys, counts): + if tileable_key not in self._tileable_key_to_chunk_keys: + raise TileableNotTracked(f"tileable {tileable_key} not tracked before") + 
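# lower the tileable-level refcount first, then fan the decref out to each chunk key registered for this tileable +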
self._tileable_ref_counts[tileable_key] -= count + + for chunk_key in self._tileable_key_to_chunk_keys[tileable_key]: + if chunk_key not in decref_chunk_keys: + decref_chunk_keys[chunk_key] = count + else: + decref_chunk_keys[chunk_key] += count + logger.debug( + "Decref chunks %.500s while decrefing tileables %s", + decref_chunk_keys, + tileable_keys, + ) + return decref_chunk_keys + + async def decref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + self._check_ref_counts(tileable_keys, counts) + decref_chunk_key_to_counts = await asyncio.to_thread( + self._get_decref_chunk_keys, tileable_keys, counts=counts + ) + to_remove_chunk_keys = await asyncio.to_thread( + self._get_remove_chunk_keys, + list(decref_chunk_key_to_counts), + counts=list(decref_chunk_key_to_counts.values()), + ) + to_remove_tileable_keys = await asyncio.to_thread( + list, (key for key in tileable_keys if self._tileable_ref_counts[key] <= 0) + ) + coros = [] + if to_remove_chunk_keys: + coros.append(self._remove_chunks(to_remove_chunk_keys)) + if to_remove_tileable_keys: + task_api = await self._get_task_api() + coros.append(task_api.remove_tileables(to_remove_tileable_keys)) + if coros: + # release actor lock + yield asyncio.gather(*coros) + + def get_tileable_ref_counts(self, tileable_keys: List[str]) -> List[int]: + return [ + self._tileable_ref_counts[tileable_key] for tileable_key in tileable_keys + ] diff --git a/python/xorbits/_mars/services/lifecycle/worker/__init__.py b/python/xorbits/_mars/services/lifecycle/worker/__init__.py new file mode 100644 index 000000000..55b7ebca7 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/worker/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import EmptyService + + +class TaskWorkerService(EmptyService): + pass diff --git a/python/xorbits/_mars/services/meta/__init__.py b/python/xorbits/_mars/services/meta/__init__.py new file mode 100644 index 000000000..5d962a663 --- /dev/null +++ b/python/xorbits/_mars/services/meta/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .api import ( + AbstractMetaAPI, + MetaAPI, + MockMetaAPI, + MockWorkerMetaAPI, + WebMetaAPI, + WorkerMetaAPI, +) diff --git a/python/xorbits/_mars/services/meta/api/__init__.py b/python/xorbits/_mars/services/meta/api/__init__.py new file mode 100644 index 000000000..74aaea7fd --- /dev/null +++ b/python/xorbits/_mars/services/meta/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractMetaAPI +from .oscar import MetaAPI, MockMetaAPI, MockWorkerMetaAPI, WorkerMetaAPI +from .web import WebMetaAPI diff --git a/python/xorbits/_mars/services/meta/api/core.py b/python/xorbits/_mars/services/meta/api/core.py new file mode 100644 index 000000000..f54daba31 --- /dev/null +++ b/python/xorbits/_mars/services/meta/api/core.py @@ -0,0 +1,38 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + + +class AbstractMetaAPI(ABC): + @abstractmethod + async def get_chunk_meta( + self, object_id: str, fields: List[str] = None, error: str = "raise" + ) -> Optional[Dict]: + """ + Get chunk meta + + Parameters + ---------- + object_id + Object ID + fields + Fields to obtain + error + Way to handle errors, 'raise' by default + Returns + ------- + Dict with fields as keys + """ diff --git a/python/xorbits/_mars/services/meta/api/oscar.py b/python/xorbits/_mars/services/meta/api/oscar.py new file mode 100644 index 000000000..ac35ab33d --- /dev/null +++ b/python/xorbits/_mars/services/meta/api/oscar.py @@ -0,0 +1,335 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Any, Dict, List + +from .... 
import oscar as mo +from ....core import ChunkType +from ....core.operand import Fuse +from ....lib.aio import alru_cache +from ....typing import BandType +from ....utils import get_chunk_params +from ..core import get_meta_type +from ..store import AbstractMetaStore +from ..supervisor.core import MetaStoreActor, MetaStoreManagerActor +from ..worker.core import WorkerMetaStoreManagerActor +from .core import AbstractMetaAPI + + +class BaseMetaAPI(AbstractMetaAPI): + def __init__(self, session_id: str, meta_store: mo.ActorRefType[AbstractMetaStore]): + # make sure all meta types registered + from .. import metas + + del metas + + self._session_id = session_id + self._meta_store = meta_store + + @mo.extensible + async def set_tileable_meta( + self, tileable, memory_size: int = None, store_size: int = None, **extra + ): + from ....dataframe.core import ( + DATAFRAME_GROUPBY_TYPE, + DATAFRAME_TYPE, + SERIES_GROUPBY_TYPE, + ) + + params = tileable.params.copy() + if isinstance( + tileable, (DATAFRAME_TYPE, DATAFRAME_GROUPBY_TYPE, SERIES_GROUPBY_TYPE) + ): + # dataframe needs some special process for now + del params["columns_value"] + del params["dtypes"] + params.pop("key_dtypes", None) + params["dtypes_value"] = tileable.dtypes_value + params["nsplits"] = tileable.nsplits + params.update(extra) + meta = get_meta_type(type(tileable))( + object_id=tileable.key, + **params, + memory_size=memory_size, + store_size=store_size + ) + return await self._meta_store.set_meta(tileable.key, meta) + + @mo.extensible + async def get_tileable_meta( + self, object_id: str, fields: List[str] = None + ) -> Dict[str, Any]: + return await self._meta_store.get_meta(object_id, fields=fields) + + @mo.extensible + async def del_tileable_meta(self, object_id: str): + return await self._meta_store.del_meta(object_id) + + @classmethod + def _extract_chunk_meta( + cls, + chunk: ChunkType, + memory_size: int = None, + store_size: int = None, + bands: List[BandType] = None, + fields: List[str] = None, + exclude_fields: List[str] = None, + **extra + ): + if isinstance(chunk.op, Fuse): + # fuse op + chunk = chunk.chunk + params = get_chunk_params(chunk) + chunk_key = extra.pop("chunk_key", chunk.key) + object_ref = extra.pop("object_ref", None) + params.update(extra) + + if object_ref: + object_refs = ( + [object_ref] if not isinstance(object_ref, list) else object_ref + ) + else: + object_refs = [] + + if fields is not None: + fields = set(fields) + params = {k: v for k, v in params.items() if k in fields} + elif exclude_fields is not None: + exclude_fields = set(exclude_fields) + params = {k: v for k, v in params.items() if k not in exclude_fields} + + return get_meta_type(type(chunk))( + object_id=chunk_key, + **params, + bands=bands, + memory_size=memory_size, + store_size=store_size, + object_refs=object_refs + ) + + @mo.extensible + async def set_chunk_meta( + self, + chunk: ChunkType, + memory_size: int = None, + store_size: int = None, + bands: List[BandType] = None, + fields: List[str] = None, + exclude_fields: List[str] = None, + **extra + ): + """ + Parameters + ---------- + chunk: ChunkType + chunk to set meta + memory_size: int + memory size for chunk data + store_size: int + serialized size for chunk data + bands: + chunk data bands + fields: list + fields to include in meta + exclude_fields: list + fields to exclude in meta + extra + + Returns + ------- + + """ + meta = self._extract_chunk_meta( + chunk, + memory_size=memory_size, + store_size=store_size, + bands=bands, + fields=fields, + 
exclude_fields=exclude_fields, + **extra + ) + return await self._meta_store.set_meta(meta.object_id, meta) + + @set_chunk_meta.batch + async def batch_set_chunk_meta(self, args_list, kwargs_list): + set_chunk_metas = [] + for args, kwargs in zip(args_list, kwargs_list): + meta = self._extract_chunk_meta(*args, **kwargs) + set_chunk_metas.append( + self._meta_store.set_meta.delay(meta.object_id, meta) + ) + return await self._meta_store.set_meta.batch(*set_chunk_metas) + + @mo.extensible + async def get_chunk_meta( + self, object_id: str, fields: List[str] = None, error="raise" + ): + return await self._meta_store.get_meta(object_id, fields=fields, error=error) + + @get_chunk_meta.batch + async def batch_get_chunk_meta(self, args_list, kwargs_list): + get_chunk_metas = [] + for args, kwargs in zip(args_list, kwargs_list): + get_chunk_metas.append(self._meta_store.get_meta.delay(*args, **kwargs)) + return await self._meta_store.get_meta.batch(*get_chunk_metas) + + @mo.extensible + async def del_chunk_meta(self, object_id: str): + """ + Parameters + ---------- + object_id: str + chunk id + """ + return await self._meta_store.del_meta(object_id) + + @del_chunk_meta.batch + async def batch_del_chunk_meta(self, args_list, kwargs_list): + del_chunk_metas = [] + for args, kwargs in zip(args_list, kwargs_list): + del_chunk_metas.append(self._meta_store.del_meta.delay(*args, **kwargs)) + return await self._meta_store.del_meta.batch(*del_chunk_metas) + + @mo.extensible + async def add_chunk_bands(self, object_id: str, bands: List[BandType]): + return await self._meta_store.add_chunk_bands(object_id, bands) + + @add_chunk_bands.batch + async def batch_add_chunk_bands(self, args_list, kwargs_list): + add_chunk_bands_tasks = [] + for args, kwargs in zip(args_list, kwargs_list): + add_chunk_bands_tasks.append( + self._meta_store.add_chunk_bands.delay(*args, **kwargs) + ) + return await self._meta_store.add_chunk_bands.batch(*add_chunk_bands_tasks) + + @mo.extensible + async def remove_chunk_bands(self, object_id: str, bands: List[BandType]): + return await self._meta_store.remove_chunk_bands(object_id, bands) + + @remove_chunk_bands.batch + async def batch_remove_chunk_bands(self, args_list, kwargs_list): + remove_chunk_bands_tasks = [] + for args, kwargs in zip(args_list, kwargs_list): + remove_chunk_bands_tasks.append( + self._meta_store.remove_chunk_bands.delay(*args, **kwargs) + ) + return await self._meta_store.remove_chunk_bands.batch( + *remove_chunk_bands_tasks + ) + + @mo.extensible + async def get_band_chunks(self, band: BandType) -> List[str]: + return await self._meta_store.get_band_chunks(band) + + +class MetaAPI(BaseMetaAPI): + @classmethod + @alru_cache(maxsize=1024, cache_exceptions=False) + async def create(cls, session_id: str, address: str) -> "MetaAPI": + """ + Create Meta API. + + Parameters + ---------- + session_id : str + Session ID. + address : str + Supervisor address. + + Returns + ------- + meta_api + Meta api. 
+ """ + meta_store_ref = await mo.actor_ref(address, MetaStoreActor.gen_uid(session_id)) + + return MetaAPI(session_id, meta_store_ref) + + +class MockMetaAPI(MetaAPI): + @classmethod + async def create(cls, session_id: str, address: str) -> "MetaAPI": + # create an Actor for mock + try: + meta_store_manager_ref = await mo.create_actor( + MetaStoreManagerActor, + "dict", + dict(), + address=address, + uid=MetaStoreManagerActor.default_uid(), + ) + except mo.ActorAlreadyExist: + # ignore if actor exists + meta_store_manager_ref = await mo.actor_ref( + MetaStoreManagerActor, + address=address, + uid=MetaStoreManagerActor.default_uid(), + ) + try: + await meta_store_manager_ref.new_session_meta_store(session_id) + except mo.ActorAlreadyExist: + pass + return await super().create(session_id=session_id, address=address) + + +class WorkerMetaAPI(BaseMetaAPI): + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls, session_id: str, address: str) -> "WorkerMetaAPI": + """ + Create worker meta API. + + Parameters + ---------- + session_id : str + Session ID. + address : str + Worker address. + + Returns + ------- + meta_api + Worker meta api. + """ + worker_meta_store_manager_ref = await mo.actor_ref( + uid=WorkerMetaStoreManagerActor.default_uid(), address=address + ) + worker_meta_store_ref = ( + await worker_meta_store_manager_ref.new_session_meta_store(session_id) + ) + return WorkerMetaAPI(session_id, worker_meta_store_ref) + + +class MockWorkerMetaAPI(WorkerMetaAPI): + @classmethod + async def create(cls, session_id: str, address: str) -> "WorkerMetaAPI": + # create an Actor for mock + try: + await mo.create_actor( + WorkerMetaStoreManagerActor, + "dict", + dict(), + address=address, + uid=WorkerMetaStoreManagerActor.default_uid(), + ) + except mo.ActorAlreadyExist: + # ignore if actor exists + await mo.actor_ref( + WorkerMetaStoreManagerActor, + address=address, + uid=WorkerMetaStoreManagerActor.default_uid(), + ) + return await super().create(session_id, address) diff --git a/python/xorbits/_mars/services/meta/api/web.py b/python/xorbits/_mars/services/meta/api/web.py new file mode 100644 index 000000000..193206cf1 --- /dev/null +++ b/python/xorbits/_mars/services/meta/api/web.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Dict, List, Optional + +from .... 
import oscar as mo
+from ....utils import deserialize_serializable, serialize_serializable
+from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api
+from .core import AbstractMetaAPI
+
+
+class MetaWebAPIHandler(MarsServiceWebAPIHandler):
+    _root_pattern = "/api/session/(?P<session_id>[^/]+)/meta"
+
+    async def _get_oscar_meta_api(self, session_id: str):
+        from .oscar import MetaAPI
+
+        return await self._get_api_by_key(MetaAPI, session_id)
+
+    @web_api("(?P<data_key>[^/]+)", method="get")
+    async def get_chunk_meta(self, session_id: str, data_key: str):
+        fields_str = self.get_argument("fields", None)
+        error = self.get_argument("error", "raise")
+        fields = fields_str.split(",") if fields_str else None
+
+        oscar_api = await self._get_oscar_meta_api(session_id)
+        result = await oscar_api.get_chunk_meta(data_key, fields=fields, error=error)
+        self.write(serialize_serializable(result))
+
+    @web_api("", method="post")
+    async def get_chunks_meta(self, session_id: str):
+        body_args = deserialize_serializable(self.request.body)
+        oscar_api = await self._get_oscar_meta_api(session_id)
+        get_metas = []
+        for data_key, fields, error in body_args:
+            get_metas.append(oscar_api.get_chunk_meta.delay(data_key, fields, error))
+        results = await oscar_api.get_chunk_meta.batch(*get_metas)
+        self.write(serialize_serializable(results))
+
+
+web_handlers = {MetaWebAPIHandler.get_root_pattern(): MetaWebAPIHandler}
+
+
+class WebMetaAPI(AbstractMetaAPI, MarsWebAPIClientMixin):
+    def __init__(
+        self, session_id: str, address: str, request_rewriter: Callable = None
+    ):
+        # make sure all meta types registered
+        from .. import metas
+
+        del metas
+
+        self._session_id = session_id
+        self._address = address.rstrip("/")
+        self.request_rewriter = request_rewriter
+
+    @mo.extensible
+    async def get_chunk_meta(
+        self, object_id: str, fields: List[str] = None, error: str = "raise"
+    ) -> Optional[Dict]:
+        params = dict(error=error)
+        req_addr = f"{self._address}/api/session/{self._session_id}/meta/{object_id}"
+        if fields:
+            params["fields"] = ",".join(fields)
+        res = await self._request_url("GET", req_addr, params=params)
+        return deserialize_serializable(res.body)
+
+    @get_chunk_meta.batch
+    async def get_chunks_meta(self, args_list, kwargs_list):
+        get_chunk_metas = []
+        for args, kwargs in zip(args_list, kwargs_list):
+            object_id, fields, error = self.get_chunk_meta.bind(*args, **kwargs)
+            get_chunk_metas.append([object_id, fields, error])
+
+        req_addr = f"{self._address}/api/session/{self._session_id}/meta"
+        res = await self._request_url(
+            "POST", req_addr, data=serialize_serializable(get_chunk_metas)
+        )
+        return deserialize_serializable(res.body)
diff --git a/python/xorbits/_mars/services/meta/core.py b/python/xorbits/_mars/services/meta/core.py new file mode 100644 index 000000000..9fb16b578 --- /dev/null +++ b/python/xorbits/_mars/services/meta/core.py @@ -0,0 +1,77 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
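For illustration only, and not part of the patch itself: a rough sketch of how the chunk-meta web endpoints defined in web.py above might be exercised from the client side through WebMetaAPI. The supervisor web address, session id and chunk keys below are hypothetical, and a running web service holding meta for those chunks is assumed; the delay/batch calls mirror the extensible pattern already used inside web.py.

import asyncio

from xorbits._mars.services.meta import WebMetaAPI


async def main():
    # Talks to MetaWebAPIHandler over HTTP; the address is a placeholder.
    web_meta_api = WebMetaAPI("my_session", "http://127.0.0.1:7103")

    # Single lookup, translated into
    # GET /api/session/my_session/meta/<chunk_key>?fields=shape,bands
    meta = await web_meta_api.get_chunk_meta("chunk_key", fields=["shape", "bands"])

    # Batched lookups are folded into a single POST request by the handler above.
    metas = await web_meta_api.get_chunk_meta.batch(
        web_meta_api.get_chunk_meta.delay("chunk_key", ["shape"], "ignore"),
        web_meta_api.get_chunk_meta.delay("other_chunk_key", ["bands"], "ignore"),
    )
    return meta, metas


asyncio.run(main())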
+ +from dataclasses import dataclass +from typing import Any, Dict, List, Tuple, Type, Union + +import numpy as np +import pandas as pd + +from ...typing import BandType +from ...utils import TypeDispatcher, dataslots + +PandasDtypeType = Union[np.dtype, pd.api.extensions.ExtensionDtype] + +_meta_class_dispatcher = TypeDispatcher() + + +def register_meta_type(object_types: Tuple): + def _call(meta_type: Type["_CommonMeta"]): + _meta_class_dispatcher.register(object_types, meta_type) + return meta_type + + return _call + + +def get_meta_type(object_type: Type) -> Type["_CommonMeta"]: + return _meta_class_dispatcher.get_handler(object_type) + + +@dataslots +@dataclass +class _CommonMeta: + """ + Class for common meta, for both tileable and chunk, or DataFrame, tensor etc. + """ + + object_id: str + name: Any = None + memory_size: int = None # size in memory + store_size: int = None # size that stored in storage + extra: Dict = None + + def merge_from(self, value: "_CommonMeta"): + return self + + +@dataslots +@dataclass +class _TileableMeta(_CommonMeta): + nsplits: Tuple[Tuple[int]] = None + + +@dataslots +@dataclass +class _ChunkMeta(_CommonMeta): + index: Tuple[int] = None + bands: List[BandType] = None + # needed by ray ownership to keep object alive when worker died. + object_refs: List[Any] = None + + def merge_from(self, value: "_ChunkMeta"): + if value.bands: + self.bands = list(set(self.bands) | set(value.bands)) + if value.object_refs: + self.object_refs = list(set(self.object_refs) | set(value.object_refs)) + return self diff --git a/python/xorbits/_mars/services/meta/metas.py b/python/xorbits/_mars/services/meta/metas.py new file mode 100644 index 000000000..d166c4fe3 --- /dev/null +++ b/python/xorbits/_mars/services/meta/metas.py @@ -0,0 +1,207 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
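A small sketch, illustrative only and not part of the patch, of how the dispatcher defined in core.py above ties object types to meta classes; metas.py below does exactly this for the real tensor and DataFrame types. FakeTensor and FakeTensorMeta are made-up names, and the private base classes are imported here just as metas.py imports them.

from dataclasses import dataclass

from xorbits._mars.services.meta.core import (
    _ChunkMeta,
    _TileableMeta,
    get_meta_type,
    register_meta_type,
)


class FakeTensor:  # stand-in for a real tileable type such as TENSOR_TYPE
    pass


@register_meta_type((FakeTensor,))
@dataclass
class FakeTensorMeta(_TileableMeta):
    shape: tuple = None


# get_meta_type() resolves the registered meta class for an object type ...
meta_cls = get_meta_type(FakeTensor)
assert meta_cls is FakeTensorMeta

# ... which the meta APIs then instantiate and hand to a meta store.
meta = meta_cls(object_id="t1", shape=(10, 10), nsplits=((10,), (10,)))

# merge_from() unions band info when meta for the same chunk is set twice.
old = _ChunkMeta(object_id="c1", bands=[("worker-1:1234", "numa-0")])
new = _ChunkMeta(object_id="c1", bands=[("worker-2:1234", "numa-0")])
assert len(new.merge_from(old).bands) == 2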
+ +from dataclasses import dataclass +from typing import Any, Dict, List, Tuple + +import numpy as np + +from ...core import OBJECT_CHUNK_TYPE, OBJECT_TYPE +from ...dataframe.core import ( + CATEGORICAL_CHUNK_TYPE, + CATEGORICAL_TYPE, + DATAFRAME_CHUNK_TYPE, + DATAFRAME_GROUPBY_CHUNK_TYPE, + DATAFRAME_GROUPBY_TYPE, + DATAFRAME_OR_SERIES_CHUNK_TYPE, + DATAFRAME_OR_SERIES_TYPE, + DATAFRAME_TYPE, + INDEX_CHUNK_TYPE, + INDEX_TYPE, + SERIES_CHUNK_TYPE, + SERIES_GROUPBY_CHUNK_TYPE, + SERIES_GROUPBY_TYPE, + SERIES_TYPE, + DtypesValue, + IndexValue, +) +from ...tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, TensorOrder +from ...utils import dataslots +from .core import PandasDtypeType, _ChunkMeta, _TileableMeta, register_meta_type + +""" +Create a separate module for metas to avoid direct +dependency on mars.dataframe +""" + + +@register_meta_type(TENSOR_TYPE) +@dataslots +@dataclass +class TensorMeta(_TileableMeta): + shape: Tuple[int] = None + dtype: np.dtype = None + order: TensorOrder = None + + +@register_meta_type(DATAFRAME_TYPE) +@dataslots +@dataclass +class DataFrameMeta(_TileableMeta): + shape: Tuple[int] = None + dtypes_value: DtypesValue = None + index_value: IndexValue = None + + +@register_meta_type(SERIES_TYPE) +@dataslots +@dataclass +class SeriesMeta(_TileableMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + index_value: IndexValue = None + + +@register_meta_type(INDEX_TYPE) +@dataslots +@dataclass +class IndexMeta(_TileableMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + index_value: IndexValue = None + + +@register_meta_type(DATAFRAME_GROUPBY_TYPE) +@dataslots +@dataclass +class DataFrameGroupByMeta(_TileableMeta): + shape: Tuple[int] = None + dtypes_value: DtypesValue = None + index_value: IndexValue = None + selection: List = None + + +@register_meta_type(SERIES_GROUPBY_TYPE) +@dataslots +@dataclass +class SeriesGroupByMeta(_TileableMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + index_value: IndexValue = None + selection: List = None + + +@register_meta_type(CATEGORICAL_TYPE) +@dataslots +@dataclass +class CategoricalMeta(_TileableMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + categories_value: IndexValue = None + + +@register_meta_type(OBJECT_TYPE) +@dataslots +@dataclass +class ObjectMeta(_TileableMeta): + pass + + +@register_meta_type(TENSOR_CHUNK_TYPE) +@dataslots +@dataclass +class TensorChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtype: np.dtype = None + order: TensorOrder = None + + +@register_meta_type(DATAFRAME_CHUNK_TYPE) +@dataslots +@dataclass +class DataFrameChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtypes_value: DtypesValue = None + index_value: IndexValue = None + + +@register_meta_type(SERIES_CHUNK_TYPE) +@dataslots +@dataclass +class SeriesChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + index_value: IndexValue = None + + +@register_meta_type(INDEX_CHUNK_TYPE) +@dataslots +@dataclass +class IndexChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + index_value: IndexValue = None + + +@register_meta_type(DATAFRAME_GROUPBY_CHUNK_TYPE) +@dataslots +@dataclass +class DataFrameGroupByChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtypes_value: DtypesValue = None + index_value: IndexValue = None + selection: List = None + + +@register_meta_type(SERIES_GROUPBY_CHUNK_TYPE) +@dataslots +@dataclass +class SeriesGroupByChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + 
index_value: IndexValue = None + selection: List = None + + +@register_meta_type(CATEGORICAL_CHUNK_TYPE) +@dataslots +@dataclass +class CategoricalChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + categories_value: IndexValue = None + + +@register_meta_type(OBJECT_CHUNK_TYPE) +@dataslots +@dataclass +class ObjectChunkMeta(_ChunkMeta): + pass + + +@register_meta_type(DATAFRAME_OR_SERIES_TYPE) +@dataslots +@dataclass +class DataFrameOrSeriesMeta(_TileableMeta): + data_type: str = None + data_params: Dict[str, Any] = None + + +@register_meta_type(DATAFRAME_OR_SERIES_CHUNK_TYPE) +@dataslots +@dataclass +class DataFrameOrSeriesChunkMeta(_ChunkMeta): + collapse_axis: int = None + data_type: str = None + data_params: Dict[str, Any] = None diff --git a/python/xorbits/_mars/services/meta/store/__init__.py b/python/xorbits/_mars/services/meta/store/__init__.py new file mode 100644 index 000000000..e238d3b3c --- /dev/null +++ b/python/xorbits/_mars/services/meta/store/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import AbstractMetaStore, get_meta_store +from .dictionary import DictMetaStore diff --git a/python/xorbits/_mars/services/meta/store/base.py b/python/xorbits/_mars/services/meta/store/base.py new file mode 100644 index 000000000..24c05e1f0 --- /dev/null +++ b/python/xorbits/_mars/services/meta/store/base.py @@ -0,0 +1,139 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Dict, List, Type + +from ....typing import BandType +from ..core import _CommonMeta + + +class AbstractMetaStore(ABC): + name = None + + def __init__(self, session_id: str, **kw): + # make sure all meta types registered + from .. import metas + + del metas + + self._session_id = session_id + + @classmethod + @abstractmethod + async def create(cls, config) -> Dict: + """ + Create a meta store. Do some initialization work. + For instance, for database backend, + db files including tables may be created first. + This should be done when service starting. + + Parameters + ---------- + config : dict + config. + + Returns + ------- + kwargs : dict + kwargs to create a meta store. + """ + + @abstractmethod + async def set_meta(self, object_id: str, meta: _CommonMeta): + """ + Set meta. + + Parameters + ---------- + object_id : str + Object ID. + meta : _CommonMeta + Meta. 
+ """ + + @abstractmethod + async def get_meta( + self, object_id: str, fields: List[str] = None, error="raise" + ) -> Dict: + """ + Get meta. + + Parameters + ---------- + object_id : str + Object ID. + fields : list + Fields to filter, if not provided, get all fields. + error : str + 'raise' or 'ignore' + + Returns + ------- + meta: dict + Meta. + """ + + @abstractmethod + async def del_meta(self, object_id: str): + """ + Delete meta. + + Parameters + ---------- + object_id : str + Object ID. + """ + + @abstractmethod + async def add_chunk_bands(self, object_id: str, bands: List[BandType]): + """ + Add band to chunk. + + Parameters + ---------- + object_id : str + Object ID. + bands : List[BandType] + Band of chunk to add, shall be tuple of (worker, band). + """ + + @abstractmethod + async def remove_chunk_bands(self, object_id: str, bands: List[BandType]): + """ + Remove bands from chunk. + + Parameters + ---------- + object_id : str + Object ID. + bands : List[BandType] + Bands of chunk to remove, shall be tuple of (worker, band). + """ + + @abstractmethod + async def get_band_chunks(self, band: BandType) -> List[str]: + """Get chunks key of band""" + + +_meta_store_types: Dict[str, Type[AbstractMetaStore]] = dict() + + +def register_meta_store(meta_store: Type[AbstractMetaStore]): + _meta_store_types[meta_store.name] = meta_store + return meta_store + + +def get_meta_store(meta_store_name: str) -> Type[AbstractMetaStore]: + return _meta_store_types[meta_store_name] diff --git a/python/xorbits/_mars/services/meta/store/dictionary.py b/python/xorbits/_mars/services/meta/store/dictionary.py new file mode 100644 index 000000000..fbdbdc5d0 --- /dev/null +++ b/python/xorbits/_mars/services/meta/store/dictionary.py @@ -0,0 +1,159 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +from collections import defaultdict +from dataclasses import fields as dataclass_fields +from typing import Dict, List + +from .... import oscar as mo +from ....lib.ordered_set import OrderedSet +from ....typing import BandType +from ....utils import implements +from ..core import _ChunkMeta, _CommonMeta +from .base import AbstractMetaStore, register_meta_store + + +@functools.lru_cache(100) +def _get_meta_fields(meta_cls): + return [f.name for f in dataclass_fields(meta_cls)] + + +@register_meta_store +class DictMetaStore(AbstractMetaStore): + name = "dict" + + def __init__(self, session_id: str, **kw): + super().__init__(session_id) + self._store: Dict[str, _CommonMeta] = dict() + # For shuffle data, we use main key to record them, here uses + # OrderedSet to make sure that the first band in set stores complete + # data, other bands may only have part data, so when reducers fetch data, + # we always choose the first band to avoid unexpected absence. 
+ self._band_chunks: Dict[BandType, OrderedSet] = defaultdict(OrderedSet) + if kw: # pragma: no cover + raise TypeError(f"Keyword arguments {kw!r} cannot be recognized.") + + @classmethod + @implements(AbstractMetaStore.create) + async def create(cls, config) -> Dict: + # Nothing needs to do for dict-based meta store. + # no extra kwargs. + return dict() + + def _set_meta(self, object_id: str, meta: _CommonMeta): + if isinstance(meta, _ChunkMeta): + for band in meta.bands: + self._band_chunks[band].add(object_id) + prev_meta = self._store.get(object_id) + if prev_meta: + meta = meta.merge_from(prev_meta) + self._store[object_id] = meta + + @implements(AbstractMetaStore.set_meta) + @mo.extensible + async def set_meta(self, object_id: str, meta: _CommonMeta): + self._set_meta(object_id, meta) + + @set_meta.batch + async def batch_set_meta(self, args_list, kwargs_list): + for args, kwargs in zip(args_list, kwargs_list): + self._set_meta(*args, **kwargs) + + def _get_meta( + self, object_id: str, fields: List[str] = None, error: str = "raise" + ) -> Dict: + if error not in ("raise", "ignore"): # pragma: no cover + raise ValueError("error must be raise or ignore") + try: + meta = self._store[object_id] + if fields is None: + fields = _get_meta_fields(type(meta)) + return {k: getattr(meta, k) for k in fields} + except KeyError: + if error == "raise": + raise + else: + return + + @implements(AbstractMetaStore.get_meta) + @mo.extensible + async def get_meta( + self, object_id: str, fields: List[str] = None, error: str = "raise" + ) -> Dict: + return self._get_meta(object_id, fields=fields, error=error) + + @get_meta.batch + async def batch_get_meta(self, args_list, kwargs_list): + metas = [] + for args, kwargs in zip(args_list, kwargs_list): + metas.append(self._get_meta(*args, **kwargs)) + return metas + + def _del_meta(self, object_id: str): + meta = self._store[object_id] + if isinstance(meta, _ChunkMeta): + for band in meta.bands: + chunks = self._band_chunks[band] + chunks.remove(object_id) + if len(chunks) == 0: + del self._band_chunks[band] + del self._store[object_id] + + @implements(AbstractMetaStore.del_meta) + @mo.extensible + async def del_meta(self, object_id: str): + self._del_meta(object_id) + + @del_meta.batch + async def batch_del_meta(self, args_list, kwargs_list): + for args, kwargs in zip(args_list, kwargs_list): + self._del_meta(*args, **kwargs) + + def _add_chunk_bands(self, object_id: str, bands: List[BandType]): + meta = self._store[object_id] + assert isinstance(meta, _ChunkMeta) + meta.bands = list(OrderedSet(meta.bands) | OrderedSet(bands)) + for band in bands: + self._band_chunks[band].add(object_id) + + @implements(AbstractMetaStore.add_chunk_bands) + @mo.extensible + async def add_chunk_bands(self, object_id: str, bands: List[BandType]): + self._add_chunk_bands(object_id, bands) + + @add_chunk_bands.batch + async def batch_add_chunk_bands(self, args_list, kwargs_list): + for args, kwargs in zip(args_list, kwargs_list): + self._add_chunk_bands(*args, **kwargs) + + def _remove_chunk_bands(self, object_id: str, bands: List[BandType]): + meta = self._store[object_id] + assert isinstance(meta, _ChunkMeta) + meta.bands = list(OrderedSet(meta.bands) - OrderedSet(bands)) + for band in bands: + self._band_chunks[band].remove(object_id) + + @implements(AbstractMetaStore.remove_chunk_bands) + @mo.extensible + async def remove_chunk_bands(self, object_id: str, bands: List[BandType]): + self._remove_chunk_bands(object_id, bands) + + @remove_chunk_bands.batch + async def 
batch_remove_chunk_bands(self, args_list, kwargs_list): + for args, kwargs in zip(args_list, kwargs_list): + self._remove_chunk_bands(*args, **kwargs) + + async def get_band_chunks(self, band: BandType) -> List[str]: + return list(self._band_chunks[band]) diff --git a/python/xorbits/_mars/services/meta/store/tests/__init__.py b/python/xorbits/_mars/services/meta/store/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/meta/store/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/meta/store/tests/test_meta_store.py b/python/xorbits/_mars/services/meta/store/tests/test_meta_store.py new file mode 100644 index 000000000..8ccfa96b9 --- /dev/null +++ b/python/xorbits/_mars/services/meta/store/tests/test_meta_store.py @@ -0,0 +1,54 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ..... import tensor as mt +from .....core import tile +from ...metas import TensorMeta +from ...store import get_meta_store + + +@pytest.mark.asyncio +async def test_mock_meta_store(): + meta_store = get_meta_store("dict")("mock_session_id") + + t = mt.random.rand(10, 10) + t = tile(t) + + await meta_store.set_meta( + t.key, + TensorMeta( + object_id=t.key, + shape=t.shape, + dtype=t.dtype, + order=t.order, + nsplits=t.nsplits, + ), + ) + + meta = await meta_store.get_meta(t.key) + assert meta["shape"] == t.shape + assert meta["order"] == t.order + assert meta["dtype"] == t.dtype + + meta = await meta_store.get_meta(t.key, fields=["shape", "order"]) + assert meta["shape"] == t.shape + assert meta["order"] == t.order + assert "dtype" not in meta + + await meta_store.del_meta(t.key) + + with pytest.raises(KeyError): + await meta_store.get_meta(t.key) diff --git a/python/xorbits/_mars/services/meta/supervisor/__init__.py b/python/xorbits/_mars/services/meta/supervisor/__init__.py new file mode 100644 index 000000000..5f21cf2a3 --- /dev/null +++ b/python/xorbits/_mars/services/meta/supervisor/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import MetaSupervisorService diff --git a/python/xorbits/_mars/services/meta/supervisor/core.py b/python/xorbits/_mars/services/meta/supervisor/core.py new file mode 100644 index 000000000..9def8e47b --- /dev/null +++ b/python/xorbits/_mars/services/meta/supervisor/core.py @@ -0,0 +1,73 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +from typing import Dict + +from .... import oscar as mo +from ...session import SessionAPI +from ..store import get_meta_store + + +class MetaStoreManagerActor(mo.Actor): + def __init__(self, meta_store_name: str, config: Dict): + self._meta_store_name = meta_store_name + self._meta_store_type = get_meta_store(meta_store_name) + self._config = config + self._meta_init_kwargs = None + + # API + self._session_api = None + + async def __post_create__(self): + self._meta_init_kwargs = await self._meta_store_type.create(self._config) + self._session_api = await SessionAPI.create(self.address) + + async def new_session_meta_store(self, session_id: str) -> mo.ActorRef: + session_address = await self._session_api.get_session_address(session_id) + allocate_strategy = mo.allocate_strategy.AddressSpecified(session_address) + return await mo.create_actor( + MetaStoreActor, + self._meta_store_name, + session_id, + address=self.address, + uid=MetaStoreActor.gen_uid(session_id), + allocate_strategy=allocate_strategy, + **self._meta_init_kwargs, + ) + + +class MetaStoreActor(mo.Actor): + def __init__(self, meta_store_name: str, session_id: str, **meta_store_kwargs): + meta_store_type = get_meta_store(meta_store_name) + self._store = meta_store_type(session_id, **meta_store_kwargs) + self._worker_meta_store_refs = [] + + def add_worker_meta_store(self, ref: mo.ActorRef): + self._worker_meta_store_refs.append(ref) + + async def __pre_destroy__(self): + await asyncio.gather( + *[ + mo.destroy_actor(mo.create_actor_ref(ref)) + for ref in self._worker_meta_store_refs + ] + ) + + @staticmethod + def gen_uid(session_id: str): + return f"{session_id}_meta" + + def __getattr__(self, attr): + return getattr(self._store, attr) diff --git a/python/xorbits/_mars/services/meta/supervisor/service.py b/python/xorbits/_mars/services/meta/supervisor/service.py new file mode 100644 index 000000000..82f043b30 --- /dev/null +++ b/python/xorbits/_mars/services/meta/supervisor/service.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .core import MetaStoreActor, MetaStoreManagerActor + + +class MetaSupervisorService(AbstractService): + """ + Meta service on supervisor. + + Service Configuration + --------------------- + { + "meta" : { + "store": "", + # other config related to each store + } + } + """ + + async def start(self): + service_config = self._config["meta"] + meta_store_name = service_config.get("meta", "dict") + extra_config = service_config.copy() + extra_config.pop("meta", None) + await mo.create_actor( + MetaStoreManagerActor, + meta_store_name, + extra_config, + uid=MetaStoreManagerActor.default_uid(), + address=self._address, + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + uid=MetaStoreManagerActor.default_uid(), address=self._address + ) + ) + + async def create_session(self, session_id: str): + # get MetaStoreManagerActor ref. + meta_store_manager_ref = await mo.actor_ref( + self._address, MetaStoreManagerActor.default_uid() + ) + await meta_store_manager_ref.new_session_meta_store(session_id) + + async def destroy_session(self, session_id: str): + meta_store_ref = await mo.actor_ref( + self._address, MetaStoreActor.gen_uid(session_id) + ) + await mo.destroy_actor(meta_store_ref) diff --git a/python/xorbits/_mars/services/meta/tests/__init__.py b/python/xorbits/_mars/services/meta/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/meta/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/meta/tests/test_api.py b/python/xorbits/_mars/services/meta/tests/test_api.py new file mode 100644 index 000000000..6adb32a9a --- /dev/null +++ b/python/xorbits/_mars/services/meta/tests/test_api.py @@ -0,0 +1,171 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import pytest + +from .... import dataframe as md +from .... import oscar as mo +from .... import remote as mr +from .... 
import tensor as mt +from ....core import tile +from ....utils import get_next_port +from ... import NodeRole, start_services, stop_services +from ...cluster import MockClusterAPI +from ...session import MockSessionAPI, SessionAPI +from .. import MetaAPI, MockMetaAPI, WebMetaAPI, WorkerMetaAPI + +t = mt.random.rand(10, 10) +df = md.DataFrame(t) +series = df[0] +index = df.index +obj = mr.spawn(lambda: 3) +t, df, series, index, obj = tile(t, df, series, index, obj) + +test_objects = [t, df, series, index, obj] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("obj", test_objects) +async def test_meta_mock_api(obj): + start_method = "fork" if sys.platform != "win32" else None + pool = await mo.create_actor_pool( + "127.0.0.1", 2, subprocess_start_method=start_method + ) + async with pool: + session_id = "mock_session_id" + + await MockClusterAPI.create(pool.external_address) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + meta_api = await MockMetaAPI.create( + session_id=session_id, address=pool.external_address + ) + + await meta_api.set_tileable_meta(obj) + meta = await meta_api.get_tileable_meta(obj.key, fields=["nsplits"]) + assert meta["nsplits"] == obj.nsplits + await meta_api.del_tileable_meta(obj.key) + with pytest.raises(KeyError): + await meta_api.get_tileable_meta(obj.key) + + chunk = obj.chunks[0] + + await meta_api.set_chunk_meta(chunk, bands=[(pool.external_address, "numa-0")]) + meta = await meta_api.get_chunk_meta(chunk.key, fields=["index", "bands"]) + assert meta["index"] == chunk.index + assert meta["bands"] == [(pool.external_address, "numa-0")] + + for i in range(2): + band = (f"1.2.3.{i}:1234", "numa-0") + await meta_api.add_chunk_bands(chunk.key, [band]) + meta = await meta_api.get_chunk_meta(chunk.key, fields=["bands"]) + assert band in meta["bands"] + meta = await meta_api.get_chunk_meta(chunk.key, fields=["bands"]) + band = meta["bands"][0] + chunks = await meta_api.get_band_chunks(band) + assert chunk.key in chunks + await meta_api.remove_chunk_bands(chunk.key, [band]) + meta = await meta_api.get_chunk_meta(chunk.key, fields=["bands"]) + assert band not in meta["bands"] + + await meta_api.del_chunk_meta(chunk.key) + with pytest.raises(KeyError): + await meta_api.get_chunk_meta(chunk.key) + + await MockClusterAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_worker_meta_api(): + supervisor_pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + worker_pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with supervisor_pool, worker_pool: + config = { + "services": ["cluster", "session", "meta", "web"], + "cluster": { + "backend": "fixed", + "lookup_address": supervisor_pool.external_address, + }, + "meta": {"store": "dict"}, + } + await start_services( + NodeRole.SUPERVISOR, config, address=supervisor_pool.external_address + ) + await start_services( + NodeRole.WORKER, config, address=worker_pool.external_address + ) + + session_id = "test_session" + session_api = await SessionAPI.create(supervisor_pool.external_address) + await session_api.create_session(session_id) + + worker_meta_api = await WorkerMetaAPI.create( + session_id=session_id, address=worker_pool.external_address + ) + await worker_meta_api.set_tileable_meta(t) + meta = await worker_meta_api.get_tileable_meta(t.key, fields=["nsplits"]) + assert meta["nsplits"] == t.nsplits + await worker_meta_api.del_tileable_meta(t.key) + with pytest.raises(KeyError): + await worker_meta_api.get_tileable_meta(t.key) + + await 
stop_services( + NodeRole.WORKER, config, address=worker_pool.external_address + ) + await stop_services( + NodeRole.SUPERVISOR, config, address=supervisor_pool.external_address + ) + + +@pytest.mark.asyncio +async def test_meta_web_api(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + web_port = get_next_port() + + async with pool: + config = { + "services": ["cluster", "session", "meta", "web"], + "cluster": { + "backend": "fixed", + "lookup_address": pool.external_address, + }, + "meta": {"store": "dict"}, + "web": { + "port": web_port, + }, + } + await start_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + + session_id = "test_session" + session_api = await SessionAPI.create(pool.external_address) + await session_api.create_session(session_id) + + t = mt.random.rand(10, 10) + t = tile(t) + + meta_api = await MetaAPI.create(session_id, pool.external_address) + web_api = WebMetaAPI(session_id, f"http://localhost:{web_port}") + + await meta_api.set_chunk_meta( + t.chunks[0], bands=[(pool.external_address, "numa-0")] + ) + meta = await web_api.get_chunk_meta(t.chunks[0].key, fields=["shape", "bands"]) + assert set(meta.keys()) == {"shape", "bands"} + + with pytest.raises(KeyError): + await web_api.get_chunk_meta("non-exist-key") + + await stop_services(NodeRole.SUPERVISOR, config, address=pool.external_address) diff --git a/python/xorbits/_mars/services/meta/tests/test_service.py b/python/xorbits/_mars/services/meta/tests/test_service.py new file mode 100644 index 000000000..6df7acbaf --- /dev/null +++ b/python/xorbits/_mars/services/meta/tests/test_service.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from .... import oscar as mo +from ... 
import NodeRole, start_services, stop_services +from ...session.api import SessionAPI +from ..api import MetaAPI, WorkerMetaAPI +from ..supervisor import MetaSupervisorService + + +@pytest.mark.asyncio +async def test_meta_service(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + worker_pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool, worker_pool: + config = { + "services": ["cluster", "session", "meta"], + "cluster": { + "backend": "fixed", + "lookup_address": pool.external_address, + }, + "meta": {"store": "dict"}, + } + await start_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + await start_services( + NodeRole.WORKER, config, address=worker_pool.external_address + ) + + session_id = "test_session" + session_api = await SessionAPI.create(pool.external_address) + await session_api.create_session(session_id) + # get session store + meta_api = await MetaAPI.create(session_id, pool.external_address) + # get worker meta store + worker_meta_api = await WorkerMetaAPI.create( + session_id, worker_pool.external_address + ) + + # destroy session + service = MetaSupervisorService({}, pool.external_address) + await service.destroy_session(session_id) + with pytest.raises(mo.ActorNotExist): + await service.destroy_session(session_id) + with pytest.raises(mo.ActorNotExist): + # actor already destroyed + await worker_meta_api.get_tileable_meta("any_id") + + # test alru_cache + assert await MetaAPI.create(session_id, pool.external_address) is meta_api + + await stop_services(NodeRole.SUPERVISOR, config, address=pool.external_address) diff --git a/python/xorbits/_mars/services/meta/worker/__init__.py b/python/xorbits/_mars/services/meta/worker/__init__.py new file mode 100644 index 000000000..8ea51a17f --- /dev/null +++ b/python/xorbits/_mars/services/meta/worker/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import MetaWorkerService diff --git a/python/xorbits/_mars/services/meta/worker/core.py b/python/xorbits/_mars/services/meta/worker/core.py new file mode 100644 index 000000000..d7660333e --- /dev/null +++ b/python/xorbits/_mars/services/meta/worker/core.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +from .... 
import oscar as mo +from ....lib.aio import alru_cache +from ...cluster import ClusterAPI +from ..store import get_meta_store + + +class WorkerMetaStoreManagerActor(mo.Actor): + def __init__(self, meta_store_name: str, config: Dict): + self._meta_store_name = meta_store_name + self._meta_store_type = get_meta_store(meta_store_name) + self._config = config + self._meta_init_kwargs = None + + self._cluster_api = None + + async def __post_create__(self): + self._meta_init_kwargs = await self._meta_store_type.create(self._config) + self._cluster_api = await ClusterAPI.create(self.address) + + @alru_cache(cache_exceptions=False) + async def _get_supervisor_address(self, session_id: str): + [address] = await self._cluster_api.get_supervisors_by_keys([session_id]) + return address + + async def new_session_meta_store(self, session_id: str) -> mo.ActorRef: + from ..supervisor.core import MetaStoreActor + + try: + ref = await mo.create_actor( + WorkerMetaStoreActor, + self._meta_store_name, + session_id, + uid=WorkerMetaStoreActor.gen_uid(session_id), + address=self.address, + **self._meta_init_kwargs, + ) + supervisor_address = await self._get_supervisor_address(session_id) + supervisor_meta_store_ref = await mo.actor_ref( + uid=MetaStoreActor.gen_uid(session_id), address=supervisor_address + ) + # register worker meta store, + # when session destroyed, this worker meta store actor will be removed + await supervisor_meta_store_ref.add_worker_meta_store(ref) + except mo.ActorAlreadyExist: + ref = await mo.actor_ref( + uid=WorkerMetaStoreActor.gen_uid(session_id), address=self.address + ) + return ref + + +class WorkerMetaStoreActor(mo.Actor): + def __init__(self, meta_store_name: str, session_id: str, **meta_store_kwargs): + meta_store_type = get_meta_store(meta_store_name) + self._store = meta_store_type(session_id, **meta_store_kwargs) + + @staticmethod + def gen_uid(session_id: str): + return f"{session_id}_worker_meta" + + def __getattr__(self, attr): + return getattr(self._store, attr) diff --git a/python/xorbits/_mars/services/meta/worker/service.py b/python/xorbits/_mars/services/meta/worker/service.py new file mode 100644 index 000000000..8b412c6e0 --- /dev/null +++ b/python/xorbits/_mars/services/meta/worker/service.py @@ -0,0 +1,52 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .core import WorkerMetaStoreManagerActor + + +class MetaWorkerService(AbstractService): + """ + Meta service on worker. 
+ + Service Configuration + --------------------- + { + "meta" : { + "store": "", + # other config related to each store + } + } + """ + + async def start(self): + service_config = self._config["meta"] + meta_store_name = service_config.get("meta", "dict") + extra_config = service_config.copy() + extra_config.pop("meta", None) + await mo.create_actor( + WorkerMetaStoreManagerActor, + meta_store_name, + extra_config, + uid=WorkerMetaStoreManagerActor.default_uid(), + address=self._address, + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + uid=WorkerMetaStoreManagerActor.default_uid(), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/mutable/__init__.py b/python/xorbits/_mars/services/mutable/__init__.py new file mode 100644 index 000000000..10c5a5cb6 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import AbstractMutableAPI, MockMutableAPI, MutableAPI, WebMutableAPI +from .core import MutableTensor +from .supervisor import MutableObjectManagerActor, MutableTensorActor diff --git a/python/xorbits/_mars/services/mutable/api/__init__.py b/python/xorbits/_mars/services/mutable/api/__init__.py new file mode 100644 index 000000000..3ba73c389 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractMutableAPI +from .oscar import MockMutableAPI, MutableAPI +from .web import WebMutableAPI diff --git a/python/xorbits/_mars/services/mutable/api/core.py b/python/xorbits/_mars/services/mutable/api/core.py new file mode 100644 index 000000000..1e9cfa9fa --- /dev/null +++ b/python/xorbits/_mars/services/mutable/api/core.py @@ -0,0 +1,126 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
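For illustration, not part of the patch: the configuration shape that the meta service classes above (MetaSupervisorService and MetaWorkerService) read under the "meta" key, mirroring the dict used by the tests in this diff; the addresses are placeholders.

from xorbits._mars.services import NodeRole, start_services

config = {
    "services": ["cluster", "session", "meta"],
    "cluster": {
        "backend": "fixed",
        "lookup_address": "127.0.0.1:11111",  # supervisor address, placeholder
    },
    # consumed by MetaSupervisorService / MetaWorkerService at service start
    "meta": {"store": "dict"},
}

# inside a running actor pool (see tests/test_service.py):
#   await start_services(NodeRole.SUPERVISOR, config, address="127.0.0.1:11111")
#   await start_services(NodeRole.WORKER, config, address="127.0.0.1:22222")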
+ +from abc import ABC, abstractmethod +from typing import Tuple, Union + +import numpy as np + +from ..core import MutableTensorInfo + + +class AbstractMutableAPI(ABC): + @abstractmethod + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ) -> MutableTensorInfo: + """ + Create a mutable tensor. + + Parameters + ---------- + shape: tuple + Shape of the mutable tensor. + + dtype: np.dtype or str + Data type of the mutable tensor. + + chunk_size: int or tuple + Chunk size of the mutable tensor. + + name: str, optional + Name of the mutable tensor, a random name will be used if not specified. + + default_value: optional + Default value of the mutable tensor. Default is 0. + + Returns + ------- + MutableTensorInfo + """ + + @abstractmethod + async def get_mutable_tensor(self, name: str) -> MutableTensorInfo: + """ + Get the mutable tensor by name. + + Parameters + ---------- + name: str + Name of the mutable tensor to get. + + Returns + ------- + MutableTensorInfo + """ + + @abstractmethod + async def seal_mutable_tensor(self, name: str, timestamp=None): + """ + Seal the mutable tensor by name. + + Parameters + ---------- + name: str + Name of the mutable tensor to seal. + + timestamp: optional + Operations that happened before timestamp will be sealed, and later ones will be discard. + + Returns + ------- + object + """ + + @abstractmethod + async def read(self, name: str, index: object, timestamp=None): + """ + Read value from mutable tensor. + + Parameters + ---------- + name: str + Name of mutable tensor to read. + + index: + Index to read from the tensor. + + timestamp: optional + Timestamp to read value that happened before then. + """ + + @abstractmethod + async def write(self, name: str, index: object, value: object, timestamp=None): + """ + Write value to mutable tensor. + + Parameters + ---------- + name: str + Name of the mutable tensor to write. + + index: + Index to write to the tensor. + + value: + The value that will be filled into the mutable tensor according to `index`. + + timestamp: optional + Timestamp to associated with the newly touched value. + """ diff --git a/python/xorbits/_mars/services/mutable/api/oscar.py b/python/xorbits/_mars/services/mutable/api/oscar.py new file mode 100644 index 000000000..1c5283d21 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/api/oscar.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple, Type, TypeVar, Union + +import numpy as np + +from .... 
import oscar as mo +from ....lib.aio import alru_cache +from ..core import MutableTensorInfo +from ..supervisor import MutableObjectManagerActor, MutableTensorActor +from .core import AbstractMutableAPI + +APIType = TypeVar("APIType", bound="MutableAPI") + + +class MutableAPI(AbstractMutableAPI): + def __init__( + self, + address: str, + mutable_mananger: mo.ActorRefType[MutableObjectManagerActor], + ): + self._address = address + self._mutable_manager_ref = mutable_mananger + + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls, session_id: str, address: str) -> "MutableAPI": + mutable_manager = await mo.actor_ref( + address, MutableObjectManagerActor.gen_uid(session_id) + ) + return MutableAPI(address, mutable_manager) + + @alru_cache(cache_exceptions=False) + async def _get_mutable_tensor_ref( + self, name: str + ) -> mo.ActorRefType[MutableTensorActor]: + return await self._mutable_manager_ref.get_mutable_tensor(name) + + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ) -> MutableTensorInfo: + actor_ref = await self._mutable_manager_ref.create_mutable_tensor( + name=name, + shape=shape, + dtype=dtype, + chunk_size=chunk_size, + default_value=default_value, + ) + return await actor_ref.info() + + @alru_cache(cache_exceptions=False) + async def get_mutable_tensor(self, name: str): + actor_ref = await self._mutable_manager_ref.get_mutable_tensor(name) + return await actor_ref.info() + + async def seal_mutable_tensor(self, name: str, timestamp=None): + # invalidate the `get_mutable_tensor` cache first. + self.get_mutable_tensor.invalidate() + return await self._mutable_manager_ref.seal_mutable_tensor( + name, timestamp=timestamp + ) + + async def read(self, name: str, index, timestamp=None): + tensor_ref = await self._get_mutable_tensor_ref(name) + return await tensor_ref.read(index, timestamp) + + async def write(self, name: str, index, value, timestamp=None): + tensor_ref = await self._get_mutable_tensor_ref(name) + return await tensor_ref.write(index, value, timestamp) + + +class MockMutableAPI(MutableAPI): + @classmethod + async def create(cls: Type[APIType], session_id: str, address: str) -> "MutableAPI": + mutable_managger = await mo.create_actor( + MutableObjectManagerActor, + session_id, + address=address, + uid=MutableObjectManagerActor.gen_uid(session_id), + ) + return MockMutableAPI(address, mutable_managger) + + @classmethod + async def cleanup(cls: Type[APIType], session_id: str, address: str): + await mo.destroy_actor( + await mo.actor_ref(address, MutableObjectManagerActor.gen_uid(session_id)) + ) diff --git a/python/xorbits/_mars/services/mutable/api/web.py b/python/xorbits/_mars/services/mutable/api/web.py new file mode 100644 index 000000000..abcb000a9 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/api/web.py @@ -0,0 +1,186 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Union + +import numpy as np + +from ....lib.aio import alru_cache +from ....utils import deserialize_serializable, serialize_serializable +from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api +from .core import AbstractMutableAPI + + +class MutableWebAPIHandler(MarsServiceWebAPIHandler): + _root_pattern = "/api/session/(?P[^/]+)/mutable" + + @alru_cache(cache_exceptions=False) + async def _get_cluster_api(self): + from ...cluster import ClusterAPI + + return await ClusterAPI.create(self._supervisor_addr) + + @alru_cache(cache_exceptions=False) + async def _get_oscar_mutable_api(self, session_id: str): + from .oscar import MutableAPI + + cluster_api = await self._get_cluster_api() + [address] = await cluster_api.get_supervisors_by_keys([session_id]) + return await MutableAPI.create(session_id, address) + + @web_api("", method="post") + async def create_mutable_tensor(self, session_id: str): + body_args = ( + deserialize_serializable(self.request.body) if self.request.body else None + ) + shape = body_args.get("shape") + dtype = body_args.get("dtype") + name = body_args.get("name") + default_value = body_args.get("default_value") + chunk_size = body_args.get("chunk_size") + + oscar_api = await self._get_oscar_mutable_api(session_id) + res = await oscar_api.create_mutable_tensor( + shape, dtype, name, default_value, chunk_size + ) + self.write(serialize_serializable(res)) + + @web_api("(?P[^/]+)", method="get") + async def get_mutable_tensor(self, session_id: str, name: str): + oscar_api = await self._get_oscar_mutable_api(session_id) + res = await oscar_api.get_mutable_tensor(name) + self.write(serialize_serializable(res)) + + @web_api("(?P[^/]+)/seal", method="post") + async def seal_mutable_tensor(self, session_id: str, name: str): # pragma: no cover + body_args = ( + deserialize_serializable(self.request.body) if self.request.body else None + ) + timestamp = body_args.get("timestamp") + + oscar_api = await self._get_oscar_mutable_api(session_id) + res = await oscar_api.seal_mutable_tensor(name, timestamp) + self.write(serialize_serializable(res)) + + @web_api("(?P[^/]+)/read", method="post") + async def read_mutable(self, session_id: str, name: str): # pragma: no cover + body_args = ( + deserialize_serializable(self.request.body) if self.request.body else None + ) + index = body_args.get("index") + timestamp = body_args.get("timestamp") + + oscar_api = await self._get_oscar_mutable_api(session_id) + res = await oscar_api.read(name, index, timestamp) + self.write(serialize_serializable(res)) + + @web_api("(?P[^/]+)/write", method="post") + async def write_mutable(self, session_id: str, name: str): # pragma: no cover + body_args = ( + deserialize_serializable(self.request.body) if self.request.body else None + ) + index = body_args.get("index") + value = body_args.get("value") + timestamp = body_args.get("timestamp") + + oscar_api = await self._get_oscar_mutable_api(session_id) + res = await oscar_api.write(name, index, value, timestamp) + self.write(serialize_serializable(res)) + + +web_handlers = { + MutableWebAPIHandler.get_root_pattern(): MutableWebAPIHandler, +} + + +class WebMutableAPI(AbstractMutableAPI, MarsWebAPIClientMixin): + def __init__( + self, session_id: str, address: str, request_rewriter: Callable = None + ): + self._session_id = session_id + self._address = address.rstrip("/") + self.request_rewriter = 
request_rewriter + + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[tuple, int] = None, + ): + path = f"{self._address}/api/session/{self._session_id}/mutable" + params = dict( + shape=shape, + dtype=dtype, + name=name, + default_value=default_value, + chunk_size=chunk_size, + ) + body = serialize_serializable(params) + res = await self._request_url( + path=path, + method="POST", + data=body, + headers={"Content-Type": "application/octet-stream"}, + ) + return deserialize_serializable(res.body) + + async def get_mutable_tensor(self, name: str): + path = f"{self._address}/api/session/{self._session_id}/mutable/{name}" + res = await self._request_url( + path=path, + method="GET", + headers={"Content-Type": "application/octet-stream"}, + ) + return deserialize_serializable(res.body) + + async def seal_mutable_tensor(self, name: str, timestamp=None): + path = f"{self._address}/api/session/{self._session_id}/mutable/{name}/seal" + params = dict(timestamp=timestamp) + body = serialize_serializable(params) + res = await self._request_url( + path=path, + method="POST", + data=body, + headers={"Content-Type": "application/octet-stream"}, + request_timeout=3600, + ) + return deserialize_serializable(res.body) + + async def read(self, name: str, index, timestamp=None): + path = f"{self._address}/api/session/{self._session_id}/mutable/{name}/read" + params = dict(index=index, timestamp=timestamp) + body = serialize_serializable(params) + res = await self._request_url( + path=path, + method="POST", + data=body, + headers={"Content-Type": "application/octet-stream"}, + request_timeout=3600, + ) + return deserialize_serializable(res.body) + + async def write(self, name: str, index, value, timestamp=None): + path = f"{self._address}/api/session/{self._session_id}/mutable/{name}/write" + params = dict(index=index, value=value, timestamp=timestamp) + body = serialize_serializable(params) + res = await self._request_url( + path=path, + method="POST", + data=body, + headers={"Content-Type": "application/octet-stream"}, + request_timeout=3600, + ) + return deserialize_serializable(res.body) diff --git a/python/xorbits/_mars/services/mutable/core.py b/python/xorbits/_mars/services/mutable/core.py new file mode 100644 index 000000000..43213332f --- /dev/null +++ b/python/xorbits/_mars/services/mutable/core.py @@ -0,0 +1,191 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio + + +class MutableTensorInfo: + """ + Why `MutableTensorInfo`? + + We need a cluster to transfer meta information of mutable tensor, between + server and client, as over the HTTP web session. + + Thus we design an internal-only `MutableTensorInfo` type as a container + for those information. 
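+
+    On the web session this info object is exactly what crosses the wire: the
+    handlers in `api/web.py` serialize it with `serialize_serializable`, and the
+    client deserializes it before building a `MutableTensor` around it.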
+
+    A `MutableTensor` can be initialized from
+
+    - an info, which contains the metadata
+    - a `mutable_api`, which will be used to request the backend API
+    - a `loop`, which will be used to execute `__setitem__` (and `__getitem__`)
+      synchronously to make the API more user-friendly.
+    """
+
+    def __init__(self, shape, dtype, name, default_value):
+        self._shape = shape
+        self._dtype = dtype
+        self._name = name
+        self._default_value = default_value
+
+    @property
+    def shape(self):
+        return self._shape
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def default_value(self):
+        return self._default_value
+
+
+class MutableTensor:
+    def __init__(self, info, mutable_api, loop):
+        self._info = info
+        self._mutable_api = mutable_api
+        self._loop = loop
+
+    @classmethod
+    def create(
+        cls,
+        info: "MutableTensorInfo",
+        mutable_api,  # no type signature, to avoid cycle imports
+        loop: asyncio.AbstractEventLoop,
+    ) -> "MutableTensor":
+        return MutableTensor(info, mutable_api, loop)
+
+    @property
+    def shape(self):
+        """
+        Get the shape of the mutable tensor.
+
+        Returns
+        -------
+        Tuple
+        """
+        return self._info.shape
+
+    @property
+    def dtype(self):
+        """
+        Get the dtype of the mutable tensor.
+
+        Returns
+        -------
+        np.dtype or str
+        """
+        return self._info.dtype
+
+    @property
+    def name(self):
+        """
+        Get the name of the mutable tensor.
+
+        Returns
+        -------
+        str
+        """
+        return self._info.name
+
+    @property
+    def default_value(self):
+        """
+        Get the default value of the mutable tensor.
+
+        Returns
+        -------
+        int or float
+        """
+        return self._info.default_value
+
+    async def read(self, index, timestamp=None):
+        """
+        Read value from mutable tensor.
+
+        Parameters
+        ----------
+        index:
+            Index to read from the tensor.
+
+        timestamp: optional
+            Timestamp to read values written at or before it.
+        """
+        return await self._mutable_api.read(self.name, index, timestamp)
+
+    async def write(self, index, value, timestamp=None):
+        """
+        Write value to mutable tensor.
+
+        Parameters
+        ----------
+        index:
+            Index to write to the tensor.
+
+        value:
+            The value that will be filled into the mutable tensor according to `index`.
+
+        timestamp: optional
+            Timestamp to be associated with the newly written value.
+        """
+        return await self._mutable_api.write(self.name, index, value, timestamp)
+
+    def __getitem__(self, index):
+        """
+        Read value from mutable tensor with a synchronous API.
+
+        Parameters
+        ----------
+        index:
+            Index to read from the tensor.
+        """
+        coro = self.read(index)
+        fut = asyncio.run_coroutine_threadsafe(coro, self._loop)
+        return fut.result()
+
+    def __setitem__(self, index, value):
+        """
+        Write value to mutable tensor with a synchronous API.
+
+        Parameters
+        ----------
+        index:
+            Index to write to the tensor.
+
+        value:
+            The value that will be filled into the mutable tensor according to `index`.
+        """
+        coro = self.write(index, value)
+        fut = asyncio.run_coroutine_threadsafe(coro, self._loop)
+        return fut.result()
+
+    async def seal(self, timestamp=None):
+        """
+        Seal the mutable tensor.
+
+        Parameters
+        ----------
+        timestamp: optional
+            Operations that happened before timestamp will be sealed, and later ones will be discarded.
+ + Returns + ------- + object + """ + return await self._mutable_api.seal_mutable_tensor(self.name, timestamp) diff --git a/python/xorbits/_mars/services/mutable/supervisor/__init__.py b/python/xorbits/_mars/services/mutable/supervisor/__init__.py new file mode 100644 index 000000000..af3be8fff --- /dev/null +++ b/python/xorbits/_mars/services/mutable/supervisor/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import MutableObjectManagerActor, MutableTensorActor +from .service import MutableObjectManagerSupervisorService diff --git a/python/xorbits/_mars/services/mutable/supervisor/core.py b/python/xorbits/_mars/services/mutable/supervisor/core.py new file mode 100644 index 000000000..0cedbdeb3 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/supervisor/core.py @@ -0,0 +1,245 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np + +from .... 
import oscar as mo +from ....core import tile +from ....utils import build_fetch +from ...cluster import ClusterAPI +from ...core import NodeRole +from ...meta import MetaAPI +from ..core import MutableTensorInfo +from ..utils import ( + getitem_to_records, + normalize_name, + normalize_timestamp, + setitem_to_records, +) +from ..worker import MutableTensorChunkActor + + +class MutableObjectManagerActor(mo.Actor): + def __init__(self, session_id: str): + self._session_id = session_id + self._cluster_api: Optional[ClusterAPI] = None + + self._mutable_objects = dict() + + async def __post_create__(self): + self._cluster_api = await ClusterAPI.create(self.address) + + async def __pre_destroy__(self): + await asyncio.gather( + *[mo.destroy_actor(ref) for ref in self._mutable_objects.values()] + ) + + @classmethod + def gen_uid(cls, session_id: str): + return f"mutable-object-manager-{session_id}" + + async def create_mutable_tensor(self, *args, name: Optional[str] = None, **kwargs): + name = normalize_name(name) + if name in self._mutable_objects: + raise ValueError(f"Mutable tensor {name} already exists!") + + workers: List[str] = list( + await self._cluster_api.get_nodes_info(role=NodeRole.WORKER) + ) + + tensor_ref = await mo.create_actor( + MutableTensorActor, + self._session_id, + name, + workers, + *args, + **kwargs, + address=self.address, + uid=MutableTensorActor.gen_uid(self._session_id, name), + ) + self._mutable_objects[name] = tensor_ref + return tensor_ref + + async def get_mutable_tensor(self, name: str): + tensor_ref = self._mutable_objects.get(name, None) + if tensor_ref is None: + raise ValueError(f"Mutable tensor {name} doesn't exist!") + return tensor_ref + + async def seal_mutable_tensor(self, name: str, timestamp=None): + tensor_ref = self._mutable_objects.get(name, None) + if tensor_ref is None: + raise ValueError(f"Mutable tensor {name} doesn't exist!") + tensor = await tensor_ref.seal(timestamp) + await mo.destroy_actor(tensor_ref) + self._mutable_objects.pop(name) + return tensor + + +class MutableTensorActor(mo.Actor): + def __init__( + self, + session_id: str, + name: str, + workers: List[str], + shape: Tuple, + dtype: Union[np.dtype, str], + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + self._session_id = session_id + self._name = name + self._workers = workers + self._shape = shape + self._dtype = dtype + self._default_value = default_value + self._chunk_size = chunk_size + + self._sealed = False + + self._fetch = None + self._chunk_actors = [] + # chunk to actor: {chunk index -> actor uid} + self._chunk_to_actor: Dict[ + Tuple, mo.ActorRefType[MutableTensorChunkActor] + ] = dict() + + async def __post_create__(self): + self._meta_api = await MetaAPI.create(self._session_id, self.address) + + # tiling a random tensor to generate keys, but we doesn't actually execute + # the random generator + from ....tensor.random import rand + + self._fetch = build_fetch( + tile(rand(*self._shape, dtype=self._dtype, chunk_size=self._chunk_size)) + ) + + chunk_groups = np.array_split(self._fetch.chunks, len(self._workers)) + for idx, (worker, chunks) in enumerate(zip(self._workers, chunk_groups)): + if len(chunks) == 0: + break + chunk_actor_ref = await mo.create_actor( + MutableTensorChunkActor, + self._session_id, + self.address, + list(chunks), + dtype=self._dtype, + default_value=self._default_value, + address=worker, + uid=MutableTensorChunkActor.gen_uid(self._session_id, self._name, idx), + ) + self._chunk_actors.append(chunk_actor_ref) 
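+            # Record which chunk actor owns each chunk index, so that later
+            # reads and writes can be routed directly to the worker holding
+            # that chunk.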
+ for chunk in chunks: + self._chunk_to_actor[chunk.index] = chunk_actor_ref + + async def __pre_destroy__(self): + await asyncio.gather(*[mo.destroy_actor(ref) for ref in self._chunk_actors]) + + @classmethod + def gen_uid(cls, session_id, name): + return f"mutable-tensor-{session_id}-{name}" + + async def info(self) -> "MutableTensorInfo": + return MutableTensorInfo( + self._shape, self._dtype, self._name, self._default_value + ) + + @mo.extensible + async def _read_chunk( + self, chunk_actor_ref, chunk_index, records, chunk_value_shape, timestamp + ): + return await chunk_actor_ref.read( + chunk_index, records, chunk_value_shape, timestamp + ) + + async def read(self, index, timestamp=None): + """ + Read value from mutable tensor. + + Parameters + ---------- + index: + Index to read from the tensor. + + timestamp: optional + Timestamp to read value that happened before then. + """ + timestamp = normalize_timestamp(timestamp) + records, output_shape = getitem_to_records(self._fetch, index) + + read_tasks, chunk_indices = [], [] + for chunk_index, (records, chunk_value_shape, indices) in records.items(): + chunk_actor_ref = self._chunk_to_actor[chunk_index] + read_tasks.append( + self._read_chunk.delay( + chunk_actor_ref, chunk_index, records, chunk_value_shape, timestamp + ) + ) + chunk_indices.append(indices) + chunks = await self._read_chunk.batch(*read_tasks) + result = np.full(output_shape, fill_value=self._default_value) + for chunk, indices in zip(chunks, chunk_indices): + result[indices] = chunk + return result + + @mo.extensible + async def _write_chunk(self, chunk_actor_ref, chunk_index, records): + await chunk_actor_ref.write(chunk_index, records) + + async def write(self, index, value, timestamp=None): + """ + Write value to mutable tensor. + + Parameters + ---------- + index: + Index to write to the tensor. + + value: + The value that will be filled into the mutable tensor according to `index`. + + timestamp: optional + Timestamp to associated with the newly touched value. + """ + timestamp = normalize_timestamp(timestamp) + records = setitem_to_records(self._fetch, index, value, timestamp) + + write_tasks = [] + for chunk_index, records in records.items(): + chunk_actor_ref = self._chunk_to_actor[chunk_index] + write_tasks.append( + self._write_chunk.delay(chunk_actor_ref, chunk_index, records) + ) + await self._write_chunk.batch(*write_tasks) + + @mo.extensible + async def _seal_chunk(self, chunk_actor_ref, timestamp): + await chunk_actor_ref.seal(timestamp) + + async def seal(self, timestamp=None): + if self._sealed: + return self._fetch + + timestamp = normalize_timestamp(timestamp) + self._sealed = True + seal_tasks = [] + for chunk_actor_ref in self._chunk_actors: + seal_tasks.append(self._seal_chunk.delay(chunk_actor_ref, timestamp)) + await self._seal_chunk.batch(*seal_tasks) + self._chunk_actors = [] + return self._fetch diff --git a/python/xorbits/_mars/services/mutable/supervisor/service.py b/python/xorbits/_mars/services/mutable/supervisor/service.py new file mode 100644 index 000000000..9e0302347 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/supervisor/service.py @@ -0,0 +1,40 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .core import MutableObjectManagerActor + + +class MutableObjectManagerSupervisorService(AbstractService): + async def start(self): + pass + + async def stop(self): + pass + + async def create_session(self, session_id: str): + await mo.create_actor( + MutableObjectManagerActor, + session_id, + address=self._address, + uid=MutableObjectManagerActor.gen_uid(session_id), + ) + + async def destroy_session(self, session_id: str): + await mo.destroy_actor( + mo.create_actor_ref( + uid=MutableObjectManagerActor.gen_uid(session_id), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/mutable/tests/__init__.py b/python/xorbits/_mars/services/mutable/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/mutable/tests/test_mutable.py b/python/xorbits/_mars/services/mutable/tests/test_mutable.py new file mode 100644 index 000000000..87fc8d510 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/tests/test_mutable.py @@ -0,0 +1,247 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
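+
+# An illustrative sketch of the flow exercised below (shapes and chunk sizes
+# are made up for brevity; inside an async test function):
+#
+#     tensor = session.create_mutable_tensor(
+#         shape=(4, 4), dtype=np.int64, default_value=0, chunk_size=2
+#     )
+#     tensor[0, 0] = 1                 # synchronous write via __setitem__
+#     block = tensor[:]                # synchronous read via __getitem__
+#     sealed = await tensor.seal()     # freeze into an ordinary tensor
+#     session.execute(sealed)
+#     value = session.fetch(sealed)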
+ +import asyncio +import sys +import uuid + +import numpy as np +import pytest + +from ....deploy.oscar.local import new_cluster +from ....deploy.oscar.session import AsyncSession, SyncSession +from ..core import MutableTensor +from ..utils import normalize_timestamp + +_is_windows = sys.platform.lower().startswith("win") + + +@pytest.fixture +async def create_cluster(): + client = await new_cluster(n_worker=2, n_cpu=2, web=True) + async with client: + yield client + + +@pytest.mark.skipif(_is_windows, reason="FIXME") +@pytest.mark.parametrize( + "session_type", + ["async_session", "async_web_session", "sync_session", "sync_web_session"], +) +@pytest.mark.asyncio +async def test_mutable_tensor(create_cluster, session_type): + is_web = "web" in session_type + is_async = "async" in session_type + + if is_web: + session_id = str(uuid.uuid4()) + session = await AsyncSession.init(create_cluster.web_address, session_id) + else: + session = create_cluster.session + if not is_async: + session = SyncSession.from_isolated_session(session) + + tensor_useless: MutableTensor = session.create_mutable_tensor( # noqa: F841 + shape=(10, 30, 50), dtype=np.int64, default_value=100, chunk_size=(20, 20, 20) + ) + if is_async: + tensor_useless = await tensor_useless + + tensor: MutableTensor = session.create_mutable_tensor( + shape=(10, 30, 50), + dtype=np.int64, + name="mytensor", + default_value=100, + chunk_size=(20, 20, 20), + ) + if is_async: + tensor = await tensor + + assert tensor.shape == (10, 30, 50) + assert tensor.dtype == np.int64 + assert tensor.name == "mytensor" + assert tensor.default_value == 100 + + assert tensor_useless.name != tensor.name + + # non exists + with pytest.raises(ValueError): + tensor1 = session.get_mutable_tensor("notensor") + if is_async: + tensor1 = await tensor1 + + # create with duplicate name + with pytest.raises(ValueError): + tensor2 = session.create_mutable_tensor( + shape=(10, 30, 50), + dtype=np.int64, + name="mytensor", + default_value=100, + chunk_size=(20, 20, 20), + ) + if is_async: + tensor2 = await tensor2 + + tensor3: MutableTensor = session.get_mutable_tensor("mytensor") + if is_async: + tensor3 = await tensor3 + assert tensor3.shape == (10, 30, 50) + assert tensor3.dtype == np.int64 + assert tensor3.name == "mytensor" + assert tensor3.default_value == 100 + + # test using read/write + + expected = np.full((10, 30, 50), fill_value=100) + xs = await tensor3.read(slice(None, None, None)) + np.testing.assert_array_equal(expected, xs) + + await tensor.write(slice(None, None, None), 1) + expected[:] = 1 + xs = await tensor3.read(slice(None, None, None)) + np.testing.assert_array_equal(expected, xs) + + await tensor.write((9, 2, 3), 2) + expected[9, 2, 3] = 2 + xs = await tensor3.read((9, 2, 3)) + assert expected[9, 2, 3] == xs + + await tensor.write((slice(2, 9, 3), slice(5, 15, None), slice(8, 50, 9)), 4) + expected[2:9:3, 5:15, 8:50:9] = 4 + xs = await tensor3.read(slice(None, None, None)) + np.testing.assert_array_equal(expected, xs) + + # test using __getitem__/__setitem__ + + # reset + tensor[:] = 100 + + expected = np.full((10, 30, 50), fill_value=100) + xs = tensor3[:] + np.testing.assert_array_equal(expected, xs) + + tensor[:] = 1 + expected[:] = 1 + xs = tensor3[:] + np.testing.assert_array_equal(expected, xs) + + tensor[9, 2, 3] = 2 + expected[9, 2, 3] = 2 + xs = tensor3[9, 2, 3] + assert expected[9, 2, 3] == xs + + tensor[2:19:3, 5:15, 8:50:9] = 4 + expected[2:19:3, 5:15, 8:50:9] = 4 + xs = tensor3[:] + np.testing.assert_array_equal(expected, xs) + + # 
seal + + if is_async: + sealed = await tensor.seal() + info = await session.execute(sealed) + await info + value = await session.fetch(sealed) + else: + sealed = await tensor.seal() + session.execute(sealed) + value = session.fetch(sealed) + np.testing.assert_array_equal(expected, value) + + # non exists after sealed + with pytest.raises(ValueError): + await tensor.seal() + with pytest.raises(ValueError): + await tensor3.seal() + + # TODO: real fancy index not supported yet, as `TensorConcatenate` involved + # + # await tensor.write(([11, 2, 3, 50], [14, 5, 6, 50], [17, 8, 9, 50]), 3) + # expected[[11, 2, 3, 50], [14, 5, 6, 50], [17, 8, 9, 50]] = 3 + # xs = await tensor1[:] + # np.testing.assert_array_equal(expected, xs) + + +@pytest.mark.skipif(_is_windows, reason="FIXME") +@pytest.mark.parametrize( + "session_type", + ["async_session", "async_web_session", "sync_session", "sync_web_session"], +) +@pytest.mark.asyncio +async def test_mutable_tensor_timestamp(create_cluster, session_type): + is_web = "web" in session_type + is_async = "async" in session_type + + if is_web: + session_id = str(uuid.uuid4()) + session = await AsyncSession.init(create_cluster.web_address, session_id) + else: + session = create_cluster.session + if not is_async: + session = SyncSession.from_isolated_session(session) + + tensor: MutableTensor = session.create_mutable_tensor( + shape=(2, 4), dtype=np.int64, default_value=0, chunk_size=(1, 3) + ) + if is_async: + tensor = await tensor + + assert tensor.shape == (2, 4) + assert tensor.dtype == np.int64 + assert tensor.default_value == 0 + + t0 = normalize_timestamp() + await asyncio.sleep(5) + t1 = normalize_timestamp() + + # write with earlier timestamp + await tensor.write((slice(0, 2, 1), slice(0, 2, 1)), 1, timestamp=t1) + + # read staled value + actual = await tensor.read(slice(None, None, None), t0) + expected = np.array([[0, 0, 0, 0], [0, 0, 0, 0]]) + np.testing.assert_array_equal(expected, actual) + + # read current value + actual = await tensor.read(slice(None, None, None), t1) + expected = np.array([[1, 1, 0, 0], [1, 1, 0, 0]]) + np.testing.assert_array_equal(expected, actual) + + # read new value + t2 = normalize_timestamp() + actual = await tensor.read(slice(None, None, None), t2) + expected = np.array([[1, 1, 0, 0], [1, 1, 0, 0]]) + np.testing.assert_array_equal(expected, actual) + + # read latest value + actual = await tensor.read(slice(None, None, None)) + expected = np.array([[1, 1, 0, 0], [1, 1, 0, 0]]) + np.testing.assert_array_equal(expected, actual) + + # seal on staled value + if is_async: + sealed = await tensor.seal(timestamp=t0) + info = await session.execute(sealed) + await info + actual = await session.fetch(sealed) + else: + sealed = await tensor.seal(timestamp=t0) + session.execute(sealed) + actual = session.fetch(sealed) + expected = np.array([[0, 0, 0, 0], [0, 0, 0, 0]]) + np.testing.assert_array_equal(expected, actual) + + # non exists after sealed + with pytest.raises(ValueError): + await tensor.seal() diff --git a/python/xorbits/_mars/services/mutable/utils.py b/python/xorbits/_mars/services/mutable/utils.py new file mode 100644 index 000000000..2be686205 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/utils.py @@ -0,0 +1,219 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import uuid +from datetime import datetime +from numbers import Integral +from typing import Optional + +import numpy as np + +from ...core import tile + + +def indexing_to_chunk_indices(output_chunk): + """ + Compute input_indices and value_indices when read from or write to + a tensor chunk. + + Parameters + ---------- + output_chunk: + A chunk in the output of the `__setitem__` op. + + Returns + ------- + The indices in the input chunk, and value_indices in the value block + that will be assigned. + """ + input_indices = [] # index in the chunk of the mutable tensor + value_indices = [] # index in the chunk of the assigned value + for d, s in zip(output_chunk.op.indexes, output_chunk.op.inputs[0].shape): + # expand the index (slice) + idx = np.r_[slice(*d.indices(s)) if isinstance(d, slice) else d] + input_indices.append(idx) + if not isinstance(d, Integral): + value_indices.append(np.arange(len(idx))) + return input_indices, value_indices + + +def compute_output_of_indexing(tensor, tensor_index): + """ + Compute the output information of `__{set,get}item__` on tensor for every chunk. + """ + from ...tensor.indexing.core import calc_shape, process_index + from ...tensor.indexing.getitem import TensorIndex + + tensor_index = process_index(tensor.ndim, tensor_index) + output_shape = calc_shape(tensor.shape, tensor_index) + + index_tensor_op = TensorIndex( + dtype=tensor.dtype, sparse=False, indexes=list(tensor_index) + ) + index_tensor = tile(index_tensor_op.new_tensor([tensor], shape=tuple(output_shape))) + output_chunks = index_tensor.chunks + + nsplits_acc = [ + np.cumsum( + (0,) + + tuple( + c.shape[i] + for c in output_chunks + if all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + ) + for i in range(len(output_chunks[0].shape)) + ] + return output_shape, output_chunks, nsplits_acc + + +def setitem_on_chunk_to_records(nsplits_acc, output_chunk, value, ts, is_scalar): + """ + Turns a `__setitem__` on chunk to a list of index-value records. + + Parameters + ---------- + nsplits_acc: + Accumulate nsplits arrays of the output tensor chunks. + + Returns + ------- + A list of `(index, value, timestamp)`, where `index` is the in-chunk index. + """ + input_indices, value_indices = indexing_to_chunk_indices(output_chunk) + + # normalize assigned value + if is_scalar: + chunk_value = value + else: + chunk_value_slice = tuple( + slice( + nsplits_acc[i][output_chunk.index[i]], + nsplits_acc[i][output_chunk.index[i] + 1], + ) + for i in range(len(output_chunk.index)) + ) + chunk_value = value[chunk_value_slice] + + records = [] + for chunk_idx, value_idx in zip( + itertools.product(*input_indices), itertools.product(*value_indices) + ): + new_value = chunk_value if is_scalar else chunk_value[value_idx] + index_in_chunk = np.ravel_multi_index( + chunk_idx, output_chunk.op.inputs[0].shape + ) + records.append((index_in_chunk, new_value, ts)) + return records + + +def setitem_to_records(tensor, tensor_index, value, timestamp): + """ + Compute the records of `__setitem__` on tensor for every chunk. 
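+
+    The assignment is tiled like a regular indexing operation; every output
+    chunk is then translated into `(flat in-chunk index, value, timestamp)`
+    records, grouped by the index of the mutable tensor chunk they fall into.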
+ + Returns + ------- + dict, a dict of chunk index to records in that chunk. + """ + output_shape, output_chunks, nsplits_acc = compute_output_of_indexing( + tensor, tensor_index + ) + + is_scalar = ( + np.isscalar(value) + or isinstance(value, tuple) + and tensor.dtype.fields is not None + ) + if not is_scalar: + value = np.broadcast_to(value, output_shape).astype(tensor.dtype) + + records = dict() + for output_chunk in output_chunks: + records_in_chunk = setitem_on_chunk_to_records( + nsplits_acc, output_chunk, value, timestamp, is_scalar=is_scalar + ) + records[output_chunk.op.inputs[0].index] = records_in_chunk + return records + + +def getitem_on_chunk_to_records(nsplits_acc, output_chunk): + """ + Turns a `__getitem__` on chunk to a list of index-value records. + + Parameters + ---------- + nsplits_acc: + Accumulate nsplits arrays of the output tensor chunks. + + Returns + ------- + records: A list of `(index, value_index)`, where `index` is the in-chunk index, and + `value_index` is the index in the final result block. + chunk_value_shape: shape of result of this chunk. + chunk_value_slice: index of result of this chunk in the whole result tensor. + """ + input_indices, value_indices = indexing_to_chunk_indices(output_chunk) + + chunk_value_slice = tuple( + slice( + nsplits_acc[i][output_chunk.index[i]], + nsplits_acc[i][output_chunk.index[i] + 1], + ) + for i in range(len(output_chunk.index)) + ) + + records = [] + for chunk_idx, value_idx in zip( + itertools.product(*input_indices), itertools.product(*value_indices) + ): + index_in_chunk = np.ravel_multi_index( + chunk_idx, output_chunk.op.inputs[0].shape + ) + records.append((index_in_chunk, value_idx)) + return records, output_chunk.shape, chunk_value_slice + + +def getitem_to_records(tensor, tensor_index): + """ + Compute the records of `__getitem__` on tensor for every chunk. + + Returns + ------- + records and output_chunk dict, records is a dict of chunk index to records + in that chunk. + """ + output_shape, output_chunks, nsplits_acc = compute_output_of_indexing( + tensor, tensor_index + ) + + records = dict() + for output_chunk in output_chunks: + records_in_chunk = getitem_on_chunk_to_records(nsplits_acc, output_chunk) + records[output_chunk.op.inputs[0].index] = records_in_chunk + return records, output_shape + + +def normalize_timestamp(timestamp=None): + if timestamp is None: + timestamp = np.datetime64(datetime.now()) + if isinstance(timestamp, datetime): + timestamp = np.datetime64(timestamp) + return timestamp + + +def normalize_name(name: Optional[str] = None): + if not name: + return str(uuid.uuid4()) + return name diff --git a/python/xorbits/_mars/services/mutable/worker/__init__.py b/python/xorbits/_mars/services/mutable/worker/__init__.py new file mode 100644 index 000000000..e6334983a --- /dev/null +++ b/python/xorbits/_mars/services/mutable/worker/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .core import MutableTensorChunkActor diff --git a/python/xorbits/_mars/services/mutable/worker/core.py b/python/xorbits/_mars/services/mutable/worker/core.py new file mode 100644 index 000000000..926747a68 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/worker/core.py @@ -0,0 +1,142 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bisect +import sys +from collections import defaultdict +from typing import List, Union + +import numpy as np + +from .... import oscar as mo +from ....typing import ChunkType + + +class MutableTensorChunkActor(mo.Actor): + def __init__( + self, + session_id: str, + manager_address: str, + chunks: List, + dtype: Union[np.dtype, str], + default_value: Union[int, float] = 0, + ) -> None: + self._session_id = session_id + self._manager_address = manager_address + self._chunks = chunks + self._dtype = dtype + self._default_value = default_value + + self._storage_api = None + self._meta_api = None + + self._index_to_chunk = None + + @classmethod + def gen_uid(cls, session_id: str, name: str, index: int): + return f"mutable-tensor-chunk-{session_id}-{name}-{index}" + + async def __post_create__(self): + from ...meta import MetaAPI + from ...storage import StorageAPI + + self._storage_api = await StorageAPI.create(self._session_id, self.address) + self._meta_api = await MetaAPI.create(self._session_id, self._manager_address) + + self._index_to_chunk = { + chunk.index: MutableTensorChunk( + chunk, + self._manager_address, + self.address, + default_value=self._default_value, + ) + for chunk in self._chunks + } + + async def write(self, chunk_index, records): + chunk: MutableTensorChunk = self._index_to_chunk[chunk_index] + await chunk.write(records) + + async def read(self, chunk_index, records, chunk_value_shape, timestamp): + chunk: MutableTensorChunk = self._index_to_chunk[chunk_index] + return await chunk.read(records, chunk_value_shape, timestamp) + + async def seal(self, timestamp): + for _, chunk in self._index_to_chunk.items(): + chunk_data = await chunk.seal(timestamp) + await self._storage_api.put(chunk.chunk.key, chunk_data) + await self._meta_api.set_chunk_meta( + chunk.chunk, bands=[(self.address, "numa-0")] + ) + + +class MutableTensorChunk: + def __init__( + self, + chunk: ChunkType, + manager_address: str, + worker_address: str, + default_value: Union[int, float] = 0, + ) -> None: + self._chunk = chunk + self._manager_address = manager_address + self._worker_address = worker_address + self._default_value = default_value + + self._records = defaultdict(list) + + @property + def chunk(self): + return self._chunk + + async def write(self, records): + for flat_index, value, ts in records: + self._records[flat_index].append((ts, value)) + + async def read(self, records, chunk_value_shape, timestamp): + result = np.full(shape=chunk_value_shape, fill_value=self._default_value) + for flat_index, value_index in records: + if flat_index not in self._records: + continue + # Find the newest one. 
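+            # Each entry of `self._records[flat_index]` is a (timestamp, value)
+            # pair, so sorting and bisecting on the timestamp yields the last
+            # value written at or before the requested timestamp.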
+ # + # FIXME Python doesn't have things like SortedDict or SortedList, + # we trigger a `sorted` here to ensure the correct semantic and try + # to be as efficient as possible. + self._records[flat_index].sort() + # bitsect will compare on first element in the tuple. + index = bisect.bisect_right( + self._records[flat_index], (timestamp, sys.float_info.max) + ) + if index == 0: + continue + result[value_index] = self._records[flat_index][index - 1][ + 1 + ] # take the value + return result + + async def seal(self, timestamp): + result = np.full(self._chunk.shape, self._default_value) + for flat_index, values in self._records.items(): + if flat_index not in self._records: + continue + # compute value + values.sort() + index = bisect.bisect_right(values, (timestamp, sys.float_info.max)) + if index == 0: + continue + # compute value index + value_index = np.unravel_index(flat_index, self._chunk.shape) + result[value_index] = values[index - 1][1] # take the value + return result diff --git a/python/xorbits/_mars/services/scheduling/__init__.py b/python/xorbits/_mars/services/scheduling/__init__.py new file mode 100644 index 000000000..679cdc435 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import MockSchedulingAPI, SchedulingAPI diff --git a/python/xorbits/_mars/services/scheduling/api/__init__.py b/python/xorbits/_mars/services/scheduling/api/__init__.py new file mode 100644 index 000000000..da695e1bf --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/api/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .oscar import MockSchedulingAPI, SchedulingAPI diff --git a/python/xorbits/_mars/services/scheduling/api/core.py b/python/xorbits/_mars/services/scheduling/api/core.py new file mode 100644 index 000000000..41773db67 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/api/core.py @@ -0,0 +1,37 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import List, Optional + +from ..core import SubtaskScheduleSummary + + +class AbstractSchedulingAPI(ABC): + @abstractmethod + def get_subtask_schedule_summaries( + self, task_id: Optional[str] = None + ) -> List[SubtaskScheduleSummary]: + """ + Get details of scheduling for tasks + + Parameters + ---------- + task_id + + Returns + ------- + details + List of details for subtasks + """ diff --git a/python/xorbits/_mars/services/scheduling/api/oscar.py b/python/xorbits/_mars/services/scheduling/api/oscar.py new file mode 100644 index 000000000..96e3ea43f --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/api/oscar.py @@ -0,0 +1,202 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple, Type, TypeVar, Union + +from .... import oscar as mo +from ....lib.aio import alru_cache +from ...subtask import Subtask +from ..core import SubtaskScheduleSummary +from .core import AbstractSchedulingAPI + +APIType = TypeVar("APIType", bound="SchedulingAPI") + + +class SchedulingAPI(AbstractSchedulingAPI): + def __init__( + self, + session_id: str, + address: str, + manager_ref=None, + queueing_ref=None, + autoscaler_ref=None, + ): + self._session_id = session_id + self._address = address + + self._manager_ref = manager_ref + self._queueing_ref = queueing_ref + self._autoscaler = autoscaler_ref + + @classmethod + @alru_cache + async def create(cls: Type[APIType], session_id: str, address: str) -> APIType: + from ..supervisor.manager import SubtaskManagerActor + + manager_ref = await mo.actor_ref( + SubtaskManagerActor.gen_uid(session_id), address=address + ) + from ..supervisor.queueing import SubtaskQueueingActor + + queueing_ref = await mo.actor_ref( + SubtaskQueueingActor.gen_uid(session_id), address=address + ) + + from ...cluster import ClusterAPI + from ..supervisor.autoscale import AutoscalerActor + + cluster_api = await ClusterAPI.create(address) + [autoscaler] = await cluster_api.get_supervisor_refs( + [AutoscalerActor.default_uid()] + ) + scheduling_api = SchedulingAPI( + session_id, address, manager_ref, queueing_ref, autoscaler + ) + return scheduling_api + + async def get_subtask_schedule_summaries( + self, task_id: Optional[str] = None + ) -> List[SubtaskScheduleSummary]: + return await self._manager_ref.get_schedule_summaries(task_id) + + async def add_subtasks( + self, subtasks: List[Subtask], priorities: Optional[List[Tuple]] = None + ): + """ + Submit subtasks into scheduling service + + Parameters + ---------- + subtasks + list of subtasks to be submitted to service + priorities + list of priorities of subtasks + """ + if priorities is None: + priorities = [subtask.priority or tuple() for subtask in subtasks] + await self._manager_ref.add_subtasks(subtasks, priorities) + + @mo.extensible + async def update_subtask_priority(self, subtask_id: str, priority: 
Tuple): + """ + Update priorities of subtasks + + Parameters + ---------- + subtask_id + id of subtask to update priority + priority + list of priority of subtasks + """ + raise NotImplementedError + + @update_subtask_priority.batch + async def update_subtask_priority(self, args_list, kwargs_list): + await self._queueing_ref.update_subtask_priority.batch( + *( + self._queueing_ref.update_subtask_priority.delay(*args, **kwargs) + for args, kwargs in zip(args_list, kwargs_list) + ) + ) + + async def cancel_subtasks( + self, subtask_ids: List[str], kill_timeout: Union[float, int] = None + ): + """ + Cancel pending and running subtasks. + + Parameters + ---------- + subtask_ids + ids of subtasks to cancel + kill_timeout + timeout seconds to kill actor process forcibly + """ + await self._manager_ref.cancel_subtasks(subtask_ids, kill_timeout=kill_timeout) + + async def finish_subtasks( + self, + subtask_ids: List[str], + bands: List[Tuple] = None, + schedule_next: bool = True, + ): + """ + Mark subtasks as finished, letting scheduling service to schedule + next tasks in the ready queue + + Parameters + ---------- + subtask_ids + ids of subtasks to mark as finished + bands + bands of subtasks to mark as finished + schedule_next + whether to schedule succeeding subtasks + """ + await self._manager_ref.finish_subtasks(subtask_ids, bands, schedule_next) + + async def disable_autoscale_in(self): + """Disable autoscale in""" + await self._autoscaler.disable_autoscale_in() + + async def try_enable_autoscale_in(self): + """Try to enable autoscale in, the autoscale-in will be enabled only when last call corresponding + `disable_autoscale_in` has been invoked.""" + await self._autoscaler.try_enable_autoscale_in() + + +class MockSchedulingAPI(SchedulingAPI): + @classmethod + async def create(cls: Type[APIType], session_id: str, address: str) -> APIType: + from ..supervisor import AutoscalerActor, GlobalResourceManagerActor + + await mo.create_actor( + GlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + AutoscalerActor, {}, uid=AutoscalerActor.default_uid(), address=address + ) + + from .... import resource as mars_resource + from ..worker import ( + SubtaskExecutionActor, + WorkerQuotaManagerActor, + WorkerSlotManagerActor, + ) + + await mo.create_actor( + SubtaskExecutionActor, + subtask_max_retries=0, + uid=SubtaskExecutionActor.default_uid(), + address=address, + ) + await mo.create_actor( + WorkerSlotManagerActor, + uid=WorkerSlotManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + WorkerQuotaManagerActor, + {"quota_size": mars_resource.virtual_memory().total}, + uid=WorkerQuotaManagerActor.default_uid(), + address=address, + ) + + from ..supervisor import SchedulingSupervisorService + + service = SchedulingSupervisorService({}, address) + await service.create_session(session_id) + return await super().create(session_id, address) diff --git a/python/xorbits/_mars/services/scheduling/api/web.py b/python/xorbits/_mars/services/scheduling/api/web.py new file mode 100644 index 000000000..725fc99a8 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/api/web.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from typing import Callable, List, Optional + +from ....lib.aio import alru_cache +from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api +from ..core import SubtaskScheduleSummary +from .core import AbstractSchedulingAPI + + +class SchedulingWebAPIHandler(MarsServiceWebAPIHandler): + _root_pattern = "/api/session/(?P[^/]+)/scheduling" + + @alru_cache(cache_exceptions=False) + async def _get_cluster_api(self): + from ...cluster import ClusterAPI + + return await ClusterAPI.create(self._supervisor_addr) + + @alru_cache(cache_exceptions=False) + async def _get_oscar_scheduling_api(self, session_id: str): + from ..api import SchedulingAPI + + cluster_api = await self._get_cluster_api() + [address] = await cluster_api.get_supervisors_by_keys([session_id]) + return await SchedulingAPI.create(session_id, address) + + @web_api("subtasks", method="get") + async def get_subtask_schedule_summaries(self, session_id: str): + oscar_api = await self._get_oscar_scheduling_api(session_id) + task_id = self.get_argument("task_id", None) or None + + result = await oscar_api.get_subtask_schedule_summaries(task_id) + self.write( + json.dumps( + { + summary.subtask_id: { + "task_id": summary.task_id, + "subtask_id": summary.subtask_id, + "bands": [ + { + "endpoint": band[0], + "band_name": band[1], + } + for band in summary.bands + ], + "num_reschedules": summary.num_reschedules, + "is_finished": summary.is_finished, + "is_cancelled": summary.is_cancelled, + } + for summary in result + } + ) + ) + + +web_handlers = {SchedulingWebAPIHandler.get_root_pattern(): SchedulingWebAPIHandler} + + +class WebSchedulingAPI(AbstractSchedulingAPI, MarsWebAPIClientMixin): + def __init__( + self, session_id: str, address: str, request_rewriter: Callable = None + ): + self._session_id = session_id + self._address = address.rstrip("/") + self.request_rewriter = request_rewriter + + async def get_subtask_schedule_summaries( + self, task_id: Optional[str] = None + ) -> List[SubtaskScheduleSummary]: + task_id = task_id or "" + path = ( + f"{self._address}/api/session/{self._session_id}/scheduling/subtasks" + f"?task_id={task_id}" + ) + + res = await self._request_url("GET", path) + res_json = json.loads(res.body) + + return [ + SubtaskScheduleSummary( + task_id=summary_json["task_id"], + subtask_id=summary_json["subtask_id"], + bands=[ + (band_json["endpoint"], band_json["band_name"]) + for band_json in summary_json["bands"] + ], + num_reschedules=summary_json["num_reschedules"], + is_finished=summary_json["is_finished"], + is_cancelled=summary_json["is_cancelled"], + ) + for summary_json in res_json.values() + ] diff --git a/python/xorbits/_mars/services/scheduling/core.py b/python/xorbits/_mars/services/scheduling/core.py new file mode 100644 index 000000000..7fd47fda0 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/core.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from ...serialization.serializables import ( + BoolField, + FieldTypes, + Int32Field, + ListField, + Serializable, + StringField, +) +from ...typing import BandType + + +class SubtaskScheduleSummary(Serializable): + task_id: str = StringField("task_id") + subtask_id: str = StringField("subtask_id") + bands: List[BandType] = ListField("bands", FieldTypes.tuple(FieldTypes.string)) + is_finished: bool = BoolField("is_finished", default=False) + is_cancelled: bool = BoolField("is_cancelled", default=False) + num_reschedules: int = Int32Field("num_reschedules", default=0) diff --git a/python/xorbits/_mars/services/scheduling/errors.py b/python/xorbits/_mars/services/scheduling/errors.py new file mode 100644 index 000000000..c06337481 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/errors.py @@ -0,0 +1,27 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core.base import MarsError + + +class NoMatchingSlots(MarsError): + def __init__(self, slot_prefix): + self.slot_prefix = slot_prefix + + def __str__(self): + return str(self.slot_prefix) + + +class NoAvailableBand(MarsError): + pass diff --git a/python/xorbits/_mars/services/scheduling/supervisor/__init__.py b/python/xorbits/_mars/services/scheduling/supervisor/__init__.py new file mode 100644 index 000000000..aac494d7e --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .assigner import AssignerActor +from .autoscale import AutoscalerActor +from .globalresource import GlobalResourceManagerActor +from .manager import SubtaskManagerActor +from .queueing import SubtaskQueueingActor +from .service import SchedulingSupervisorService diff --git a/python/xorbits/_mars/services/scheduling/supervisor/assigner.py b/python/xorbits/_mars/services/scheduling/supervisor/assigner.py new file mode 100644 index 000000000..d5d004d01 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/assigner.py @@ -0,0 +1,292 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import itertools +from collections import defaultdict +from typing import Dict, List, Set + +import numpy as np + +from .... import oscar as mo +from ....core.operand import Fetch, FetchShuffle +from ....typing import BandType +from ...core import NodeRole +from ...subtask import Subtask +from ..errors import NoAvailableBand, NoMatchingSlots + + +class AssignerActor(mo.Actor): + _bands: List[BandType] + + @classmethod + def gen_uid(cls, session_id: str): + return f"{session_id}_assigner" + + def __init__(self, session_id: str): + self._session_id = session_id + self._slots_ref = None + + self._cluster_api = None + self._meta_api = None + + self._bands = [] + self._address_to_bands = dict() + self._device_type_to_bands = dict() + self._band_watch_task = None + + async def __post_create__(self): + from ...cluster.api import ClusterAPI + from ...meta.api import MetaAPI + + self._cluster_api = await ClusterAPI.create(self.address) + self._meta_api = await MetaAPI.create( + session_id=self._session_id, address=self.address + ) + + async def watch_bands(): + async for bands in self._cluster_api.watch_all_bands(NodeRole.WORKER): + self._update_bands(list(bands)) + + self._band_watch_task = asyncio.create_task(watch_bands()) + + async def __pre_destroy__(self): + if self._band_watch_task is not None: # pragma: no branch + self._band_watch_task.cancel() + + def _update_bands(self, bands: List[BandType]): + self._bands = bands + + grouped_bands = itertools.groupby(sorted(self._bands), key=lambda b: b[0]) + self._address_to_bands = {k: list(v) for k, v in grouped_bands} + + grouped_bands = itertools.groupby( + sorted(("numa" if b[1].startswith("numa") else "gpu", b) for b in bands), + key=lambda tp: tp[0], + ) + self._device_type_to_bands = { + k: [v[1] for v in tps] for k, tps in grouped_bands + } + + def _get_device_bands(self, is_gpu: bool): + band_prefix = "numa" if not is_gpu else "gpu" + filtered_bands = self._device_type_to_bands.get(band_prefix) or [] + if not filtered_bands: + raise NoMatchingSlots("gpu" if is_gpu else "cpu") + return filtered_bands + + def _get_random_band( + self, + is_gpu: bool, + exclude_bands: Set[BandType] = None, + random_when_unavailable: bool = True, + ): + bands = self._get_device_bands(is_gpu) + if exclude_bands: + avail_bands = [band for band in bands if band not in exclude_bands] + if avail_bands: + return 
avail_bands[np.random.choice(len(avail_bands))] + elif not random_when_unavailable: + raise NoAvailableBand( + f"No bands available after excluding bands {exclude_bands}" + ) + return bands[np.random.choice(len(bands))] + + async def assign_subtasks( + self, + subtasks: List[Subtask], + exclude_bands: Set[BandType] = None, + random_when_unavailable: bool = True, + ): + exclude_bands = exclude_bands or set() + inp_keys = set() + broadcaster_keys = set() + selected_bands = dict() + + if not self._bands: + self._update_bands( + list(await self._cluster_api.get_all_bands(NodeRole.WORKER)) + ) + + for subtask in subtasks: + is_gpu = any(c.op.gpu for c in subtask.chunk_graph) + if subtask.expect_bands: + # exclude expected but unready bands + expect_available_bands = [ + expect_band + for expect_band in subtask.expect_bands + if expect_band in self._bands and expect_band not in exclude_bands + ] + # fill in if all expected bands are unready + if not expect_available_bands: + expect_available_bands = [ + self._get_random_band( + is_gpu, exclude_bands, random_when_unavailable + ) + ] + selected_bands[subtask.subtask_id] = expect_available_bands + continue + for indep_chunk in subtask.chunk_graph.iter_indep(): + if isinstance(indep_chunk.op, Fetch): + if indep_chunk.is_broadcaster: + broadcaster_keys.add(indep_chunk.key) + inp_keys.add(indep_chunk.key) + elif isinstance(indep_chunk.op, FetchShuffle): + selected_bands[subtask.subtask_id] = [ + self._get_random_band( + is_gpu, exclude_bands, random_when_unavailable + ) + ] + break + + fields = ["store_size", "bands"] + inp_keys = list(inp_keys) + metas = await self._meta_api.get_chunk_meta.batch( + *(self._meta_api.get_chunk_meta.delay(key, fields) for key in inp_keys) + ) + + inp_metas = dict(zip(inp_keys, metas)) + if broadcaster_keys: + # set broadcaster's size as 0 to avoid assigning all successors to same band. 
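+            # A broadcast chunk feeds many (or all) successor subtasks, so counting
+            # its size in the per-band input totals below would pull every successor
+            # onto the single band that stores it.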
+ for key in broadcaster_keys: + inp_metas[key]["store_size"] = 0 + assigns = [] + for subtask in subtasks: + is_gpu = any(c.op.gpu for c in subtask.chunk_graph) + band_prefix = "numa" if not is_gpu else "gpu" + filtered_bands = self._get_device_bands(is_gpu) + + if subtask.subtask_id in selected_bands: + bands = selected_bands[subtask.subtask_id] + else: + band_sizes = defaultdict(lambda: 0) + for inp in subtask.chunk_graph.iter_indep(): + if not isinstance(inp.op, Fetch): # pragma: no cover + continue + meta = inp_metas[inp.key] + for band in meta["bands"]: + if not band[1].startswith(band_prefix): + sel_bands = [ + b + for b in self._address_to_bands[band[0]] + if b[1].startswith(band_prefix) + and b not in exclude_bands + ] + if sel_bands: + band = sel_bands[np.random.choice(len(sel_bands))] + if band not in filtered_bands or band in exclude_bands: + band = self._get_random_band( + is_gpu, exclude_bands, random_when_unavailable + ) + band_sizes[band] += meta["store_size"] + bands = [] + max_size = -1 + for band, size in band_sizes.items(): + if size > max_size: + bands = [band] + max_size = size + elif size == max_size: + bands.append(band) + band = bands[np.random.choice(len(bands))] + if ( + not random_when_unavailable and band in exclude_bands + ): # pragma: no cover + raise NoAvailableBand( + f"No bands available for subtask {subtask.subtask_id} after " + f"excluded {exclude_bands}" + ) + if subtask.bands_specified and band not in subtask.expect_bands: + raise NoAvailableBand( + f"No bands available for subtask {subtask.subtask_id} on bands {subtask.expect_bands} " + f"after excluded {exclude_bands}" + ) + assigns.append(band) + return assigns + + async def reassign_subtasks( + self, band_to_queued_num: Dict[BandType, int] + ) -> Dict[BandType, int]: + move_queued_subtasks = {} + for is_gpu in (False, True): + band_name_prefix = "numa" if not is_gpu else "gpu" + + filtered_bands = [ + band for band in self._bands if band[1].startswith(band_name_prefix) + ] + filtered_band_to_queued_num = { + k: v + for k, v in band_to_queued_num.items() + if k[1].startswith(band_name_prefix) + } + + if not filtered_bands: + continue + + num_used_bands = len(filtered_band_to_queued_num.keys()) + if num_used_bands == 1: + [(band, length)] = filtered_band_to_queued_num.items() + if length == 0: + move_queued_subtasks.update({band: 0}) + continue + # no need to balance when there's only one band initially + if len(filtered_bands) == 1 and band == filtered_bands[0]: + move_queued_subtasks.update({band: 0}) + continue + # unready bands recorded in band_num_queued_subtasks, some of them may hold 0 subtasks + unready_bands = list( + set(filtered_band_to_queued_num.keys()) - set(filtered_bands) + ) + # ready bands not recorded in band_num_queued_subtasks, all of them hold 0 subtasks + new_ready_bands = list( + set(filtered_bands) - set(filtered_band_to_queued_num.keys()) + ) + # when there are new ready bands, make all bands hold same amount of subtasks + # when there are no new ready bands now, move out subtasks left on them + if not new_ready_bands and unready_bands: + filtered_band_to_queued_num = { + k: filtered_band_to_queued_num[k] for k in unready_bands + } + # approximate total of subtasks moving to each ready band + num_all_subtasks = sum(filtered_band_to_queued_num.values()) + mean = int(num_all_subtasks / len(filtered_bands)) + # all_bands (namely) includes: + # a. ready bands recorded in band_num_queued_subtasks + # b. ready bands not recorded in band_num_queued_subtasks + # c. 
unready bands recorded in band_num_queued_subtasks + # a. + b. = self._bands, a. + c. = bands in band_num_queued_subtasks + all_bands = list( + set(filtered_bands) | set(filtered_band_to_queued_num.keys()) + ) + # calculate the differential steps of moving subtasks + # move < 0 means subtasks should move out and vice versa + # unready bands no longer hold subtasks + # assuming bands not recorded in band_num_queued_subtasks hold 0 subtasks + band_move_nums = {} + for band in all_bands: + if band in filtered_bands: + band_move_nums[band] = mean - filtered_band_to_queued_num.get( + band, 0 + ) + else: + band_move_nums[band] = -filtered_band_to_queued_num.get(band, 0) + # ensure the balance of moving in and out + total_move = sum(band_move_nums.values()) + # int() is going to be closer to zero, so `mean` is no more than actual mean value + # total_move = mean * len(self._bands) - num_all_subtasks + # <= actual_mean * len(self._bands) - num_all_subtasks = 0 + assert total_move <= 0 + if total_move != 0: + band_move_nums[self._get_random_band(False)] -= total_move + move_queued_subtasks.update(band_move_nums) + return dict(sorted(move_queued_subtasks.items(), key=lambda item: item[1])) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/autoscale.py b/python/xorbits/_mars/services/scheduling/supervisor/autoscale.py new file mode 100644 index 000000000..a442a69a6 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/autoscale.py @@ -0,0 +1,441 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import importlib +import logging +import random +import time +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import Any, Dict, List, Optional, Set + +from .... 
import oscar as mo +from ....lib.aio import alru_cache +from ....typing import BandType +from ...cluster.api import ClusterAPI +from ...cluster.core import NodeRole, NodeStatus +from ..errors import NoAvailableBand + +logger = logging.getLogger(__name__) + + +class AutoscalerActor(mo.Actor): + def __init__(self, autoscale_conf: Dict[str, Any]): + self._enabled = autoscale_conf.get("enabled", False) + self._autoscale_conf = autoscale_conf + self._cluster_api = None + self.queueing_refs = dict() + self.global_resource_ref = None + self._dynamic_workers: Set[str] = set() + self._autoscale_in_disable_counter = 0 + + async def __post_create__(self): + strategy = self._autoscale_conf.get("strategy") + if strategy: # pragma: no cover + module, name = strategy.rsplit(".", 1) + strategy_cls = getattr(importlib.import_module(module), name) + else: + strategy_cls = PendingTaskBacklogStrategy + from ..supervisor import GlobalResourceManagerActor + + self.global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=self.address + ) + self._cluster_api = await ClusterAPI.create(self.address) + self._strategy = await strategy_cls.create(self._autoscale_conf, self) + if self._enabled: + logger.info(f"Auto scale strategy %s started", self._strategy) + await self._strategy.start() + + async def __pre_destroy__(self): + if self._enabled: + await self._strategy.stop() + + async def register_session(self, session_id: str, address: str): + from .queueing import SubtaskQueueingActor + + self.queueing_refs[session_id] = await mo.actor_ref( + SubtaskQueueingActor.gen_uid(session_id), address=address + ) + + async def unregister_session(self, session_id: str): + self.queueing_refs.pop(session_id, None) + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> str: + start_time = time.time() + worker_address = await self._cluster_api.request_worker( + worker_cpu, worker_mem, timeout + ) + if worker_address: + self._dynamic_workers.add(worker_address) + logger.warning( + "Requested new worker %s in %.4f seconds, current dynamic worker nums is %s", + worker_address, + time.time() - start_time, + self.get_dynamic_worker_nums(), + ) + return worker_address + else: + logger.warning( + "Request worker with resource %s failed in %.4f seconds.", + dict(worker_cpu=worker_cpu, worker_mem=worker_mem), + time.time() - start_time, + ) + + async def disable_autoscale_in(self): + self._autoscale_in_disable_counter += 1 + if self._enabled: + logger.info("Disabled autoscale_in") + + async def try_enable_autoscale_in(self): + self._autoscale_in_disable_counter -= 1 + if self._autoscale_in_disable_counter == 0 and self._enabled: + logger.info("Enabled autoscale_in") + + async def release_workers(self, addresses: List[str]) -> List[str]: + """ + Release a group of worker nodes. + Parameters + ---------- + addresses : List[str] + The addresses of the specified node. + """ + if self._autoscale_in_disable_counter > 0: + return [] + workers_bands = { + address: await self.get_worker_bands(address) for address in addresses + } + logger.info( + "Start to release workers %s which have bands %s.", + addresses, + workers_bands, + ) + for address in addresses: + await self._cluster_api.set_node_status( + node=address, role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + # Ensure global_slot_manager get latest bands timely, so that we can invoke `wait_band_idle` + # to ensure there won't be new tasks scheduled to the stopping worker. 
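+        # Each worker is then drained in turn: wait for all of its bands to become
+        # idle, migrate the data they hold to other bands, and only after that
+        # release the node from the cluster (see `release_worker` below).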
+ await self.global_resource_ref.refresh_bands() + excluded_bands = set(b for bands in workers_bands.values() for b in bands) + + async def release_worker(address): + logger.info("Start to release worker %s.", address) + worker_bands = workers_bands[address] + await asyncio.gather( + *[ + self.global_resource_ref.wait_band_idle(band) + for band in worker_bands + ] + ) + await self._migrate_data_of_bands(worker_bands, excluded_bands) + await self._cluster_api.release_worker(address) + self._dynamic_workers.remove(address) + logger.info("Released worker %s.", address) + + # Release workers one by one to ensure others workers which the current is moving data to + # is not being releasing. + for address in addresses: + await release_worker(address) + return addresses + + def get_dynamic_workers(self) -> Set[str]: + return self._dynamic_workers + + def get_dynamic_worker_nums(self) -> int: + return len(self._dynamic_workers) + + async def get_worker_bands(self, worker_address) -> List[BandType]: + node_info = ( + await self._cluster_api.get_nodes_info( + [worker_address], resource=True, exclude_statuses=set() + ) + )[worker_address] + return [ + (worker_address, resource_type) + for resource_type in node_info["resource"].keys() + ] + + async def _migrate_data_of_bands( + self, bands: List[BandType], excluded_bands: Set[BandType] + ): + """Move data from `bands` to other available bands""" + session_ids = list(self.queueing_refs.keys()) + for session_id in session_ids: + from ...meta import MetaAPI + + meta_api = await MetaAPI.create(session_id, self.address) + + batch_fetch, batch_delete = defaultdict(list), defaultdict(list) + batch_add_chunk_bands, batch_remove_chunk_bands = [], [] + for src_band in bands: + band_data_keys = await meta_api.get_band_chunks(src_band) + for data_key in band_data_keys: + dest_band = await self._select_target_band( + src_band, data_key, excluded_bands + ) + logger.debug( + "Move chunk % from band %s to band %s.", + data_key, + src_band, + dest_band, + ) + dest_storage_api = await self._get_storage_api( + session_id, dest_band[0] + ) + # For ray backend, there will only be meta update rather than data transfer + batch_fetch[dest_storage_api].append( + dest_storage_api.fetch.delay( + data_key, band_name=src_band[1], remote_address=src_band[0] + ) + ) + src_storage_api = await self._get_storage_api( + session_id, src_band[0] + ) + batch_delete[src_storage_api].append( + src_storage_api.delete.delay(data_key) + ) + batch_add_chunk_bands.append( + meta_api.add_chunk_bands.delay(data_key, [dest_band]) + ) + batch_remove_chunk_bands.append( + meta_api.remove_chunk_bands.delay(data_key, [src_band]) + ) + await asyncio.gather( + *[api.fetch.batch(*fetches) for api, fetches in batch_fetch.items()] + ) + await meta_api.add_chunk_bands.batch(*batch_add_chunk_bands) + await meta_api.remove_chunk_bands.batch(*batch_remove_chunk_bands) + await asyncio.gather( + *[api.delete.batch(*deletes) for api, deletes in batch_delete.items()] + ) + + async def _select_target_band( + self, band: BandType, data_key: str, excluded_bands: Set[BandType] + ): + all_bands = await self._cluster_api.get_all_bands() + bands = list( + b + for b in all_bands.keys() + if (b[1] == band[1] and b != band and b not in excluded_bands) + ) + if not bands: # pragma: no cover + raise NoAvailableBand( + f"No bands to migrate data to, " + f"all available bands is {all_bands}, " + f"current band is {band}, " + f"excluded bands are {excluded_bands}." 
+ ) + # TODO select band based on remaining store space size of other bands + return random.choice(bands) + + @alru_cache(cache_exceptions=False) + async def _get_storage_api(self, session_id: str, address: str): + from ...storage import StorageAPI + + return await StorageAPI.create(session_id, address) + + +class AbstractScaleStrategy(ABC): + @classmethod + @abstractmethod + async def create(cls, autoscale_conf: Dict[str, Any], autoscaler): + """Create a autoscale strategy which will decide when to scale in/.out""" + + @abstractmethod + async def start(self): + """Start auto scale""" + + @abstractmethod + async def stop(self): + """Stop auto scale""" + + +class PendingTaskBacklogStrategy(AbstractScaleStrategy): + _task: Optional[asyncio.Task] + + def __init__(self, autoscale_conf: Dict[str, Any], autoscaler): + self._autoscaler = autoscaler + self._scheduler_check_interval = autoscale_conf.get( + "scheduler_check_interval", 1 + ) + self._scheduler_backlog_timeout = autoscale_conf.get( + "scheduler_backlog_timeout", 20 + ) + self._sustained_scheduler_backlog_timeout = autoscale_conf.get( + "sustained_scheduler_backlog_timeout", self._scheduler_backlog_timeout + ) + # Make worker_idle_timeout greater than scheduler_backlog_timeout to + # avoid cluster fluctuate back and forth。 + self._worker_idle_timeout = autoscale_conf.get( + "worker_idle_timeout", 2 * self._scheduler_backlog_timeout + ) + self._min_workers = autoscale_conf.get("min_workers", 1) + assert self._min_workers >= 1, "Mars need at least 1 worker." + self._max_workers = autoscale_conf.get("max_workers", 100) + self._task = None + + @classmethod + async def create(cls, autoscale_conf: Dict[str, Any], autoscaler): + return cls(autoscale_conf, autoscaler) + + async def start(self): + self._task = asyncio.create_task(self._run()) + + async def _run(self): + try: + delta = self._min_workers - self._autoscaler.get_dynamic_worker_nums() + while delta > 0: + logger.info(f"Start to request %s initial workers.", delta) + initial_worker_addresses = await asyncio.gather( + *[self._autoscaler.request_worker() for _ in range(delta)] + ) + initial_worker_addresses = [ + addr for addr in initial_worker_addresses if addr is not None + ] + logger.info( + f"Requested %s initial workers %s", + len(initial_worker_addresses), + initial_worker_addresses, + ) + delta = self._min_workers - self._autoscaler.get_dynamic_worker_nums() + while True: + await asyncio.sleep(self._scheduler_check_interval) + await self._run_round() + except asyncio.CancelledError: # pragma: no cover + logger.info("Canceled pending task backlog strategy.") + except Exception as e: # pragma: no cover + logger.exception("Exception occurred when try to auto scale") + raise e + + async def _run_round(self): + queueing_refs = list(self._autoscaler.queueing_refs.values()) + if any([await queueing_ref.all_bands_busy() for queueing_ref in queueing_refs]): + await self._scale_out(queueing_refs) + else: + await self._scale_in() + + async def _scale_out(self, queueing_refs): + logger.info( + "Try to scale out, current dynamic workers %s", + self._autoscaler.get_dynamic_worker_nums(), + ) + start_time = time.time() + while not await self._autoscaler.request_worker(): + logger.warning( + "Request worker failed, wait %s seconds and retry.", + self._scheduler_check_interval, + ) + await asyncio.sleep(self._scheduler_check_interval) + await asyncio.sleep(self._scheduler_backlog_timeout) + rnd = 1 + while any( + [await queueing_ref.all_bands_busy() for queueing_ref in queueing_refs] + ): + 
worker_num = 2**rnd + if ( + self._autoscaler.get_dynamic_worker_nums() + worker_num + > self._max_workers + ): + worker_num = ( + self._max_workers - self._autoscaler.get_dynamic_worker_nums() + ) + while set( + await asyncio.gather( + *[self._autoscaler.request_worker() for _ in range(worker_num)] + ) + ) == {None}: + logger.warning( + "Request %s workers all failed, wait %s seconds and retry.", + worker_num, + self._scheduler_check_interval, + ) + await asyncio.sleep(self._scheduler_check_interval) + rnd += 1 + await asyncio.sleep(self._sustained_scheduler_backlog_timeout) + logger.info( + "Scale out finished in %s round, took %s seconds, current dynamic workers %s", + rnd, + time.time() - start_time, + self._autoscaler.get_dynamic_worker_nums(), + ) + + async def _scale_in(self): + idle_bands = set( + await self._autoscaler.global_resource_ref.get_idle_bands( + self._worker_idle_timeout + ) + ) + # exclude non-dynamic created workers and ensure all bands of the worker are idle + idle_bands = { + band + for band in idle_bands + if band[0] in self._autoscaler.get_dynamic_workers() + and idle_bands.issuperset( + set(await self._autoscaler.get_worker_bands(band[0])) + ) + } + worker_addresses = set(band[0] for band in idle_bands) + if worker_addresses: + logger.debug( + "Bands %s of workers % has been idle for as least %s seconds.", + idle_bands, + worker_addresses, + self._worker_idle_timeout, + ) + while ( + worker_addresses + and self._autoscaler.get_dynamic_worker_nums() - len(worker_addresses) + < self._min_workers + ): + worker_address = worker_addresses.pop() + logger.debug( + "Skip offline idle worker %s to keep at least %s dynamic workers. " + "Current total dynamic workers is %s.", + worker_address, + self._min_workers, + self._autoscaler.get_dynamic_worker_nums(), + ) + idle_bands.difference_update( + set(await self._autoscaler.get_worker_bands(worker_address)) + ) + if worker_addresses: + start_time = time.time() + logger.info( + "Try to offline idle workers %s with bands %s.", + worker_addresses, + idle_bands, + ) + try: + worker_addresses = await self._autoscaler.release_workers( + worker_addresses + ) + logger.info( + "Finished offline workers %s in %.4f seconds", + worker_addresses, + time.time() - start_time, + ) + except NoAvailableBand as e: # pragma: no cover + logger.warning( + "No enough bands, offline workers %s failed with exception %s.", + worker_addresses, + e, + ) + + async def stop(self): + self._task.cancel() + await self._task diff --git a/python/xorbits/_mars/services/scheduling/supervisor/globalresource.py b/python/xorbits/_mars/services/scheduling/supervisor/globalresource.py new file mode 100644 index 000000000..8e26889a6 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/globalresource.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import time +from collections import defaultdict +from typing import DefaultDict, Dict, List, Tuple + +from .... 
import oscar as mo +from ....resource import Resource, ZeroResource +from ....typing import BandType + +logger = logging.getLogger(__name__) + + +class GlobalResourceManagerActor(mo.Actor): + # {(address, resource_type): {(session_id, subtask_id): Resource(...)}} + _band_stid_resources: DefaultDict[BandType, Dict[Tuple[str, str], Resource]] + _band_used_resources: Dict[BandType, Resource] + _band_total_resources: Dict[BandType, Resource] + + def __init__(self): + self._band_stid_resources = defaultdict(dict) + self._band_used_resources = defaultdict(lambda: ZeroResource) + self._band_idle_start_time = dict() + self._band_idle_events = dict() + self._band_total_resources = dict() + self._cluster_api = None + self._band_watch_task = None + + async def __post_create__(self): + from ...cluster.api import ClusterAPI + + self._cluster_api = await ClusterAPI.create(self.address) + + async def watch_bands(): + async for bands in self._cluster_api.watch_all_bands(): + old_bands = set(self._band_total_resources.keys()) + self._band_total_resources = bands + new_bands = set(bands.keys()) - old_bands + for band in new_bands: + self._update_band_usage(band, ZeroResource) + + self._band_watch_task = asyncio.create_task(watch_bands()) + + async def __pre_destroy__(self): + self._band_watch_task.cancel() + + async def refresh_bands(self): + self._band_total_resources = await self._cluster_api.get_all_bands() + + @mo.extensible + async def apply_subtask_resources( + self, + band: BandType, + session_id: str, + subtask_ids: List[str], + subtask_resources: List[Resource], + ) -> List[str]: + if ( + not self._band_total_resources or band not in self._band_total_resources + ): # pragma: no cover + await self.refresh_bands() + idx = 0 + # only ready bands will pass + if band in self._band_total_resources: + total_resource = self._band_total_resources[band] + for stid, subtask_resource in zip(subtask_ids, subtask_resources): + band_used_resource = self._band_used_resources[band] + if band_used_resource + subtask_resource > total_resource: + break + self._band_stid_resources[band][(session_id, stid)] = subtask_resource + self._update_band_usage(band, subtask_resource) + idx += 1 + if idx == 0: + logger.debug( + "No resources available, status: %r, request: %r", + self._band_used_resources, + subtask_resources, + ) + return subtask_ids[:idx] + + @mo.extensible + def update_subtask_resources( + self, band: BandType, session_id: str, subtask_id: str, resource: Resource + ): + session_subtask_id = (session_id, subtask_id) + subtask_resources = self._band_stid_resources[band] + if session_subtask_id not in subtask_resources: + return + + resource_delta = resource - subtask_resources[session_subtask_id] + subtask_resources[session_subtask_id] = resource + self._update_band_usage(band, resource_delta) + + @mo.extensible + def release_subtask_resource( + self, band: BandType, session_id: str, subtask_id: str + ): + # todo ensure slots released when subtasks ends in all means + resource_delta = self._band_stid_resources[band].pop( + (session_id, subtask_id), ZeroResource + ) + self._update_band_usage(band, -resource_delta) + + def _update_band_usage(self, band: BandType, band_usage_delta: Resource): + self._band_used_resources[band] += band_usage_delta + # some code path doesn't call `apply_subtask_resources` + band_total_resource = self._band_total_resources.get(band) + if ( + band_total_resource is not None + and self._band_used_resources[band] > band_total_resource + ): # pragma: no cover + raise Exception( + 
f"Resource exceed: band used resource {self._band_used_resources[band]} " + f"band total resource {self._band_total_resources[band]}" + ) + if self._band_used_resources[band] <= ZeroResource: + self._band_used_resources.pop(band) + self._band_idle_start_time[band] = time.time() + if band in self._band_idle_events: + self._band_idle_events.pop(band).set() + else: + self._band_idle_start_time[band] = -1 + + def get_used_resources(self) -> Dict[BandType, Resource]: + return self._band_used_resources + + def get_remaining_resources(self) -> Dict[BandType, Resource]: + resources = {} + for band, resource in self._band_total_resources.items(): + used_resource = self.get_used_resources()[band] + resources[band] = resource - used_resource + return resources + + async def get_idle_bands(self, idle_duration: int): + """Return a band list which all bands has been idle for at least `idle_duration` seconds.""" + now = time.time() + idle_bands = [] + for band in self._band_total_resources.keys(): + idle_start_time = self._band_idle_start_time.get(band) + if idle_start_time is None: # pragma: no cover + # skip new requested band for this round scale in. + self._band_idle_start_time[band] = now + elif idle_start_time > 0 and now >= idle_start_time + idle_duration: + idle_bands.append(band) + return idle_bands + + async def wait_band_idle(self, band: BandType): + if self._band_idle_start_time[band] <= 0: + if band in self._band_idle_events: + event = self._band_idle_events[band] + else: + event = asyncio.Event() + self._band_idle_events[band] = event + return event.wait() diff --git a/python/xorbits/_mars/services/scheduling/supervisor/manager.py b/python/xorbits/_mars/services/scheduling/supervisor/manager.py new file mode 100644 index 000000000..8424937ea --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/manager.py @@ -0,0 +1,446 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import time +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple, Union + +from .... import oscar as mo +from ....lib.aio import alru_cache +from ....metrics import Metrics +from ....oscar.backends.context import ProfilingContext +from ....oscar.errors import MarsError +from ....oscar.profiling import MARS_ENABLE_PROFILING, ProfilingData +from ....typing import BandType +from ....utils import Timer, dataslots +from ...subtask import Subtask, SubtaskResult, SubtaskStatus +from ...task import TaskAPI +from ..core import SubtaskScheduleSummary +from ..utils import redirect_subtask_errors + +logger = logging.getLogger(__name__) + + +# the default times to reschedule subtask. 
+DEFAULT_SUBTASK_MAX_RESCHEDULES = 0 + + +@dataslots +@dataclass +class SubtaskScheduleInfo: + subtask: Subtask + band_futures: Dict[BandType, asyncio.Future] = field(default_factory=dict) + start_time: int = -1 + end_time: int = -1 + max_reschedules: int = 0 + num_reschedules: int = 0 + num_speculative_concurrent_run: int = 0 + + def to_summary(self, **kwargs) -> SubtaskScheduleSummary: + return SubtaskScheduleSummary( + task_id=self.subtask.task_id, + subtask_id=self.subtask.subtask_id, + bands=list(self.band_futures.keys()), + num_reschedules=self.num_reschedules, + **kwargs, + ) + + +class SubtaskManagerActor(mo.Actor): + _subtask_infos: Dict[str, SubtaskScheduleInfo] # subtask id -> schedule info + _subtask_summaries: Dict[str, SubtaskScheduleSummary] # subtask id -> summary + + @classmethod + def gen_uid(cls, session_id: str): + return f"{session_id}_subtask_manager" + + def __init__( + self, + session_id: str, + subtask_max_reschedules: int = DEFAULT_SUBTASK_MAX_RESCHEDULES, + subtask_cancel_timeout: int = 5, + speculation_config: Dict[str, object] = None, + ): + self._session_id = session_id + self._subtask_infos = dict() + self._subtask_summaries = dict() + self._subtask_max_reschedules = subtask_max_reschedules + self._subtask_cancel_timeout = subtask_cancel_timeout + self._speculation_config = speculation_config or {} + self._queueing_ref = None + self._global_resource_ref = None + self._submitted_subtask_count = Metrics.counter( + "mars.scheduling.submitted_subtask_count", + "The count of submitted subtasks to all bands.", + ("session_id", "task_id", "stage_id"), + ) + self._finished_subtask_count = Metrics.counter( + "mars.scheduling.finished_subtask_count", + "The count of finished subtasks of all bands.", + ("session_id", "task_id", "stage_id"), + ) + self._canceled_subtask_count = Metrics.counter( + "mars.scheduling.canceled_subtask_count", + "The count of canceled subtasks of all bands.", + ("session_id", "task_id", "stage_id"), + ) + logger.info( + "Created SubtaskManager with subtask_max_reschedules %s, " + "speculation_config %s", + self._subtask_max_reschedules, + speculation_config, + ) + + async def __post_create__(self): + from .queueing import SubtaskQueueingActor + + self._queueing_ref = await mo.actor_ref( + SubtaskQueueingActor.gen_uid(self._session_id), address=self.address + ) + from ..supervisor import GlobalResourceManagerActor + + self._global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=self.address + ) + from .speculation import SpeculativeScheduler + + self._speculation_execution_scheduler = SpeculativeScheduler( + self._queueing_ref, self._global_resource_ref, self._speculation_config + ) + await self._speculation_execution_scheduler.start() + + async def __pre_destroy__(self): + await self._speculation_execution_scheduler.stop() + + @alru_cache + async def _get_task_api(self): + return await TaskAPI.create(self._session_id, self.address) + + async def add_subtasks(self, subtasks: List[Subtask], priorities: List[Tuple]): + async with redirect_subtask_errors(self, subtasks): + for subtask in subtasks: + # the extra_config may be None. the extra config overwrites the default value. 
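+                # Per-subtask `extra_config["subtask_max_reschedules"]` takes precedence;
+                # otherwise fall back to the manager-wide default passed in at creation.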
+ subtask_max_reschedules = ( + subtask.extra_config.get("subtask_max_reschedules") + if subtask.extra_config + else None + ) + if subtask_max_reschedules is None: + subtask_max_reschedules = self._subtask_max_reschedules + if subtask.subtask_id in self._subtask_infos: # pragma: no cover + raise KeyError(f"Subtask {subtask.subtask_id} already added.") + self._subtask_infos[subtask.subtask_id] = SubtaskScheduleInfo( + subtask, max_reschedules=subtask_max_reschedules + ) + + virtual_subtasks = [subtask for subtask in subtasks if subtask.virtual] + for subtask in virtual_subtasks: + task_api = await self._get_task_api() + await task_api.set_subtask_result( + SubtaskResult( + subtask_id=subtask.subtask_id, + session_id=subtask.session_id, + task_id=subtask.task_id, + stage_id=subtask.stage_id, + progress=1.0, + status=SubtaskStatus.succeeded, + ) + ) + await self._queueing_ref.add_subtasks( + [subtask for subtask in subtasks if not subtask.virtual], priorities + ) + await self._queueing_ref.submit_subtasks.tell() + + @alru_cache(maxsize=10000) + async def _get_execution_ref(self, band: BandType): + from ..worker.execution import SubtaskExecutionActor + + return await mo.actor_ref(SubtaskExecutionActor.default_uid(), address=band[0]) + + async def finish_subtasks( + self, + subtask_ids: List[str], + bands: List[BandType] = None, + schedule_next: bool = True, + ): + logger.debug("Finished subtasks %s.", subtask_ids) + band_tasks = defaultdict(lambda: 0) + bands = bands or [None] * len(subtask_ids) + for subtask_id, subtask_band in zip(subtask_ids, bands): + subtask_info = self._subtask_infos.get(subtask_id, None) + if subtask_info is not None: + self._finished_subtask_count.record( + 1, + { + "session_id": self._session_id, + "task_id": subtask_info.subtask.task_id, + "stage_id": subtask_info.subtask.stage_id, + }, + ) + self._subtask_summaries[subtask_id] = subtask_info.to_summary( + is_finished=True + ) + subtask_info.end_time = time.time() + self._speculation_execution_scheduler.finish_subtask(subtask_info) + # Cancel subtask on other bands. + aio_task = subtask_info.band_futures.pop(subtask_band, None) + if aio_task: + await aio_task + if schedule_next: + band_tasks[subtask_band] += 1 + if subtask_info.band_futures: + # Cancel subtask here won't change subtask status. + # See more in `TaskProcessorActor.set_subtask_result` + logger.info( + "Try to cancel subtask %s on bands %s.", + subtask_id, + set(subtask_info.band_futures.keys()), + ) + # Cancel subtask can be async and may need to kill slot which need more time. + # Can't use `tell` here because next line remove subtask info which is needed by + # `cancel_subtasks`. + yield self.ref().cancel_subtasks([subtask_id]) + # cancel subtask first then pop subtask info. 
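+                # Popping first would make the `cancel_subtasks` call above a no-op,
+                # since it looks the subtask up in `self._subtask_infos`.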
+ self._subtask_infos.pop(subtask_id, None) + if schedule_next: + for band in subtask_info.band_futures.keys(): + band_tasks[band] += 1 + await self._queueing_ref.remove_queued_subtasks(subtask_ids) + if band_tasks: + tasks = [] + for band, subtask_count in band_tasks.items(): + task = asyncio.ensure_future( + self._queueing_ref.submit_subtasks.tell(band, subtask_count) + ) + tasks.append(task) + await asyncio.wait(tasks) + + def _get_subtasks_by_ids(self, subtask_ids: List[str]) -> List[Optional[Subtask]]: + subtasks = [] + for stid in subtask_ids: + try: + subtasks.append(self._subtask_infos[stid].subtask) + except KeyError: + subtasks.append(None) + return subtasks + + async def submit_subtask_to_band(self, subtask_id: str, band: BandType): + if subtask_id not in self._subtask_infos: # pragma: no cover + logger.info( + "Subtask %s is not in added subtasks set, it may be finished or canceled, skip it.", + subtask_id, + ) + return + async with redirect_subtask_errors( + self, self._get_subtasks_by_ids([subtask_id]) + ): + try: + subtask_info = self._subtask_infos[subtask_id] + execution_ref = await self._get_execution_ref(band) + extra_config = subtask_info.subtask.extra_config + enable_profiling = MARS_ENABLE_PROFILING or ( + extra_config and extra_config.get("enable_profiling") + ) + profiling_context = ( + ProfilingContext(subtask_info.subtask.task_id) + if enable_profiling + else None + ) + self._submitted_subtask_count.record( + 1, + { + "session_id": self._session_id, + "task_id": subtask_info.subtask.task_id, + "stage_id": subtask_info.subtask.stage_id, + }, + ) + logger.debug("Start run subtask %s in band %s.", subtask_id, band) + with Timer() as timer: + task = asyncio.create_task( + execution_ref.run_subtask.options( + profiling_context=profiling_context + ).send(subtask_info.subtask, band[1], self.address) + ) + subtask_info.band_futures[band] = task + subtask_info.start_time = time.time() + self._speculation_execution_scheduler.add_subtask(subtask_info) + result = yield task + ProfilingData.collect_subtask( + subtask_info.subtask, band, timer.duration + ) + task_api = await self._get_task_api() + logger.debug("Finished subtask %s with result %s.", subtask_id, result) + await task_api.set_subtask_result(result) + except (OSError, MarsError) as ex: + # TODO: We should handle ServerClosed Error. 
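+                # OSError / MarsError are treated as potentially transient (e.g. the
+                # worker process went away); if the subtask is retryable and still has
+                # reschedules left, re-queue it while excluding the bands it already ran on.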
+ if ( + subtask_info.subtask.retryable + and subtask_info.num_reschedules < subtask_info.max_reschedules + ): + logger.error( + "Reschedule subtask %s due to %s", + subtask_info.subtask.subtask_id, + ex, + ) + subtask_info.num_reschedules += 1 + await self._queueing_ref.add_subtasks( + [subtask_info.subtask], + [subtask_info.subtask.priority or tuple()], + exclude_bands=set(subtask_info.band_futures.keys()), + ) + else: + raise ex + except asyncio.CancelledError: + raise + except Exception as ex: + if ( + subtask_info.subtask.retryable + and subtask_info.num_reschedules < subtask_info.max_reschedules + ): + logger.error( + "Failed to reschedule subtask %s, " + "num_reschedules: %s, max_reschedules: %s, unhandled exception: %s", + subtask_info.subtask.subtask_id, + subtask_info.num_reschedules, + subtask_info.max_reschedules, + ex, + ) + raise ex + finally: + # make sure slot is released before marking tasks as finished + await self._global_resource_ref.release_subtask_resource( + band, + subtask_info.subtask.session_id, + subtask_info.subtask.subtask_id, + ) + logger.debug( + "Slot released for band %s after subtask %s", + band, + subtask_info.subtask.subtask_id, + ) + # We should call submit_subtasks after the resource is released. + # If submit_subtasks runs before release_subtask_resource + # then the rescheduled subtask may not be submitted due to + # no available resource. The mars will hangs. + if subtask_info.num_reschedules > 0: + await self._queueing_ref.submit_subtasks.tell() + + async def cancel_subtasks( + self, subtask_ids: List[str], kill_timeout: Union[float, int] = None + ): + kill_timeout = kill_timeout or self._subtask_cancel_timeout + logger.info( + "Start to cancel subtasks %s, kill timeout is %s.", + subtask_ids, + kill_timeout, + ) + queued_subtask_ids = [] + single_cancel_tasks = [] + + task_api = await self._get_task_api() + + async def cancel_single_task(subtask, raw_tasks, cancel_tasks): + if cancel_tasks: + await asyncio.wait(cancel_tasks) + if raw_tasks: + dones, _ = await asyncio.wait(raw_tasks) + else: + dones = [] + if not dones or all(fut.cancelled() for fut in dones): + await task_api.set_subtask_result( + SubtaskResult( + subtask_id=subtask.subtask_id, + session_id=subtask.session_id, + task_id=subtask.task_id, + stage_id=subtask.stage_id, + status=SubtaskStatus.cancelled, + ) + ) + + for subtask_id in subtask_ids: + if subtask_id not in self._subtask_infos: + # subtask may already finished or not submitted at all + logger.info( + "Skip cancel subtask %s, it may already finished or not submitted at all", + subtask_id, + ) + continue + + subtask_info = self._subtask_infos[subtask_id] + raw_tasks_to_cancel = list(subtask_info.band_futures.values()) + + if not raw_tasks_to_cancel: + queued_subtask_ids.append(subtask_id) + single_cancel_tasks.append( + asyncio.create_task( + cancel_single_task(subtask_info.subtask, [], []) + ) + ) + else: + cancel_tasks = [] + for band in subtask_info.band_futures.keys(): + execution_ref = await self._get_execution_ref(band) + cancel_tasks.append( + asyncio.create_task( + execution_ref.cancel_subtask( + subtask_id, kill_timeout=kill_timeout + ) + ) + ) + single_cancel_tasks.append( + asyncio.create_task( + cancel_single_task( + subtask_info.subtask, raw_tasks_to_cancel, cancel_tasks + ) + ) + ) + if queued_subtask_ids: + # Don't use `finish_subtasks` because it may remove queued + await self._queueing_ref.remove_queued_subtasks(queued_subtask_ids) + if single_cancel_tasks: + yield asyncio.wait(single_cancel_tasks) + + for 
subtask_id in subtask_ids: + subtask_info = self._subtask_infos.pop(subtask_id, None) + if subtask_info is not None: + self._subtask_summaries[subtask_id] = subtask_info.to_summary( + is_finished=True, is_cancelled=True + ) + self._canceled_subtask_count.record( + 1, + { + "session_id": self._session_id, + "task_id": subtask_info.subtask.task_id, + "stage_id": subtask_info.subtask.stage_id, + }, + ) + await self._queueing_ref.submit_subtasks.tell() + logger.info("Subtasks %s canceled.", subtask_ids) + + def get_schedule_summaries(self, task_id: Optional[str] = None): + if task_id is not None: + summaries = { + subtask_id: summary + for subtask_id, summary in self._subtask_summaries.items() + if summary.task_id == task_id + } + else: + summaries = dict(self._subtask_summaries) + for info in self._subtask_infos.values(): + if task_id is None or info.subtask.task_id == task_id: + summaries[info.subtask.subtask_id] = info.to_summary() + return list(summaries.values()) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/queueing.py b/python/xorbits/_mars/services/scheduling/supervisor/queueing.py new file mode 100644 index 000000000..4f55f26ca --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/queueing.py @@ -0,0 +1,350 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import copy +import heapq +import logging +from collections import defaultdict +from dataclasses import dataclass +from typing import DefaultDict, Dict, List, Optional, Set, Tuple, Union + +from .... import oscar as mo +from ....lib.aio import alru_cache +from ....metrics import Metrics +from ....resource import ZeroResource +from ....utils import dataslots +from ...subtask import Subtask +from ...task import TaskAPI +from ..utils import redirect_subtask_errors + +logger = logging.getLogger(__name__) + +_DEFAULT_SUBMIT_PERIOD = 0 + + +@dataslots +@dataclass +class HeapItem: + subtask: Subtask + priority: Tuple + + def __lt__(self, other: "HeapItem"): + return self.priority > other.priority + + +class SubtaskQueueingActor(mo.Actor): + _stid_to_bands: DefaultDict[str, List[Tuple]] + _stid_to_items: Dict[str, HeapItem] + _band_queues: DefaultDict[Tuple, List[HeapItem]] + + @classmethod + def gen_uid(cls, session_id: str): + return f"{session_id}_subtask_queueing" + + def __init__(self, session_id: str, submit_period: Union[float, int] = None): + self._session_id = session_id + self._stid_to_bands = defaultdict(list) + self._stid_to_items = dict() + # Note that we need to ensure top item in every band heap queue is valid, + # so that we can ensure band queue is busy if the band queue is not empty. 
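+        # Each band queue is a heap of `HeapItem`s; `HeapItem.__lt__` deliberately
+        # compares priorities with `>`, so the stdlib min-heap behaves as a max-heap
+        # and `heappop` always yields the highest-priority subtask of the band.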
+        self._band_queues = defaultdict(list)
+
+        self._cluster_api = None
+        self._slots_ref = None
+        self._assigner_ref = None
+
+        self._band_to_resource = dict()
+        self._band_watch_task = None
+        self._max_enqueue_id = 0
+
+        self._periodical_submit_task = None
+        self._submit_period = submit_period or _DEFAULT_SUBMIT_PERIOD
+        self._submitted_subtask_number = Metrics.gauge(
+            "mars.band.submitted_subtask_number",
+            "The number of subtasks submitted to a band.",
+            ("session_id", "band"),
+        )
+        self._unsubmitted_subtask_number = Metrics.gauge(
+            "mars.band.unsubmitted_subtask_number",
+            "The number of subtasks not yet submitted to a band.",
+            ("session_id", "band"),
+        )
+
+    async def __post_create__(self):
+        from ...cluster import ClusterAPI
+
+        self._cluster_api = await ClusterAPI.create(self.address)
+        self._band_to_resource = {}
+
+        async def watch_bands():
+            async for bands in self._cluster_api.watch_all_bands():
+                # only act when the set of ready bands actually changed
+                if bands != self._band_to_resource:
+                    old_band_resource = self._band_to_resource
+                    self._band_to_resource = copy.deepcopy(bands)
+                    if self._band_queues:
+                        await self.balance_queued_subtasks()
+                        # Refresh the global slot manager so it sees the latest bands
+                        # and every band that received reassigned subtasks can get at
+                        # least one subtask submitted successfully.
+                        await self._slots_ref.refresh_bands()
+                    all_bands = {*bands.keys(), *old_band_resource.keys()}
+                    bands_delta = {}
+                    for b in all_bands:
+                        new_resource = bands.get(b, ZeroResource)
+                        old_resource = old_band_resource.get(b, ZeroResource)
+                        delta = new_resource - old_resource
+                        if delta != ZeroResource:
+                            bands_delta[b] = delta
+                    # Submit subtasks on new bands manually, otherwise some subtasks
+                    # will never get submitted. Note that we must ensure every new
+                    # band gets at least one subtask submitted successfully; later
+                    # submissions on a band are triggered by the success of previous
+                    # subtasks on the same band.
+ logger.info( + "Bands changed with delta %s, submit all bands.", + bands_delta, + ) + await self.ref().submit_subtasks() + + self._band_watch_task = asyncio.create_task(watch_bands()) + + from .globalresource import GlobalResourceManagerActor + + [self._slots_ref] = await self._cluster_api.get_supervisor_refs( + [GlobalResourceManagerActor.default_uid()] + ) + from .assigner import AssignerActor + + self._assigner_ref = await mo.actor_ref( + AssignerActor.gen_uid(self._session_id), address=self.address + ) + + if self._submit_period > 0: + self._periodical_submit_task = self.ref().periodical_submit.tell_delay( + delay=self._submit_period + ) + + async def __pre_destroy__(self): + self._band_watch_task.cancel() + if self._periodical_submit_task is not None: # pragma: no branch + self._periodical_submit_task.cancel() + + async def periodical_submit(self): + await self.ref().submit_subtasks.tell() + self._periodical_submit_task = self.ref().periodical_submit.tell_delay( + delay=self._submit_period + ) + + @alru_cache + async def _get_task_api(self): + return await TaskAPI.create(self._session_id, self.address) + + @alru_cache(cache_exceptions=False) + async def _get_manager_ref(self): + from .manager import SubtaskManagerActor + + return await mo.actor_ref( + SubtaskManagerActor.gen_uid(self._session_id), address=self.address + ) + + async def add_subtasks( + self, + subtasks: List[Subtask], + priorities: List[Tuple], + exclude_bands: Set[Tuple] = None, + random_when_unavailable: bool = True, + ): + bands = await self._assigner_ref.assign_subtasks( + subtasks, exclude_bands, random_when_unavailable + ) + for subtask, band, priority in zip(subtasks, bands, priorities): + assert band is not None + self._stid_to_bands[subtask.subtask_id].append(band) + heap_item = self._stid_to_items[subtask.subtask_id] = HeapItem( + subtask, priority + (self._max_enqueue_id,) + ) + self._max_enqueue_id += 1 + heapq.heappush(self._band_queues[band], heap_item) + logger.debug( + "Subtask %s enqueued to band %s excluded from %s.", + subtask.subtask_id, + band, + exclude_bands, + ) + logger.debug("%d subtasks enqueued", len(subtasks)) + + async def submit_subtasks(self, band: Tuple = None, limit: Optional[int] = None): + logger.debug("Submitting subtasks with limit %s", limit) + + if not limit and band not in self._band_to_resource: + self._band_to_resource = await self._cluster_api.get_all_bands() + + bands = [band] if band is not None else list(self._band_to_resource.keys()) + submit_aio_tasks = [] + manager_ref = await self._get_manager_ref() + + apply_delays = [] + submit_items_list = [] + submitted_bands = [] + + for band in bands: + band_limit = limit or ( + self._band_to_resource[band].num_cpus + or self._band_to_resource[band].num_gpus + ) + task_queue = self._band_queues[band] + submit_items = dict() + while ( + self._ensure_top_item_valid(task_queue) + and len(submit_items) < band_limit + ): + item = heapq.heappop(task_queue) + submit_items[item.subtask.subtask_id] = item + + subtask_ids = list(submit_items) + if not subtask_ids: + continue + + submitted_bands.append(band) + submit_items_list.append(submit_items) + + # Before hbo, when a manager finish a subtask, it will schedule one subtask successfully because + # there is a slot idle. But now we have memory requirements, so the subtask may apply resource + # from supervisor failed. In such cases, those subtasks will never got scheduled. + # TODO We can use `_periodical_submit_task` to submit those subtasks. 
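+            # Resource requests for the popped items are batched per band; the global
+            # resource manager grants only the prefix that fits its free resources, and
+            # anything it rejects is pushed back onto the band queue further below.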
+ subtask_resources = [ + item.subtask.required_resource for item in submit_items.values() + ] + apply_delays.append( + self._slots_ref.apply_subtask_resources.delay( + band, self._session_id, subtask_ids, subtask_resources + ) + ) + + async with redirect_subtask_errors( + self, + [ + item.subtask + for submit_items in submit_items_list + for item in submit_items.values() + ], + ): + submitted_ids_list = await self._slots_ref.apply_subtask_resources.batch( + *apply_delays + ) + + for band, submit_items, submitted_ids in zip( + submitted_bands, submit_items_list, submitted_ids_list + ): + subtask_ids = list(submit_items) + task_queue = self._band_queues[band] + + async with redirect_subtask_errors( + self, [item.subtask for item in submit_items.values()] + ): + non_submitted_ids = [k for k in submit_items if k not in submitted_ids] + tags = { + "session_id": self._session_id, + "band": band[0] if band else "", + } + self._submitted_subtask_number.record(len(submitted_ids), tags) + self._unsubmitted_subtask_number.record(len(non_submitted_ids), tags) + if submitted_ids: + for stid in subtask_ids: + if stid not in submitted_ids: + continue + item = submit_items[stid] + logger.debug("Submit subtask %r to band %r", item.subtask, band) + submit_aio_tasks.append( + asyncio.create_task( + manager_ref.submit_subtask_to_band.tell( + item.subtask.subtask_id, band + ) + ) + ) + await asyncio.sleep(0) + self.remove_queued_subtasks([item.subtask.subtask_id]) + else: + logger.debug("No slots available") + + for stid in non_submitted_ids: + # TODO if subtasks submit failed due to lacking memory/cpu/gpu resources, lower the priority so that + # other subtasks can be submitted. + heapq.heappush(task_queue, submit_items[stid]) + + if submit_aio_tasks: + yield asyncio.gather(*submit_aio_tasks) + + def _ensure_top_item_valid(self, task_queue): + """Clean invalid subtask item from the queue to ensure that when the queue is not empty, + there is always some subtasks waiting being scheduled.""" + while ( + task_queue and task_queue[0].subtask.subtask_id not in self._stid_to_items + ): + # skip removed items (as they may be re-pushed into the queue) + heapq.heappop(task_queue) + return bool(task_queue) + + @mo.extensible + def update_subtask_priority(self, subtask_id: str, priority: Tuple): + if subtask_id not in self._stid_to_bands: + return + for band in self._stid_to_bands[subtask_id]: + new_item = HeapItem(self._stid_to_items[subtask_id].subtask, priority) + self._stid_to_items[subtask_id] = new_item + heapq.heappush(self._band_queues[band], new_item) + + def remove_queued_subtasks(self, subtask_ids: List[str]): + for stid in subtask_ids: + bands = self._stid_to_bands.pop(stid, []) + self._stid_to_items.pop(stid, None) + for band in bands: + band_queue = self._band_queues.get(band) + self._ensure_top_item_valid(band_queue) + + async def all_bands_busy(self) -> bool: + """Return True if all bands queue has tasks waiting to be submitted.""" + bands = set(self._band_to_resource.keys()) + if set(self._band_queues.keys()).issuperset(bands): + return all(len(self._band_queues[band]) > 0 for band in bands) + return False + + async def balance_queued_subtasks(self): + # record length of band queues + band_num_queued_subtasks = { + band: len(queue) for band, queue in self._band_queues.items() + } + move_queued_subtasks = await self._assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + items = [] + # rewrite band queues according to feedbacks from assigner + for band, move in move_queued_subtasks.items(): + 
task_queue = self._band_queues[band] + assert move + len(task_queue) >= 0 + for _ in range(abs(move)): + if move < 0: + # TODO: pop item of low priority + item = heapq.heappop(task_queue) + self._stid_to_bands[item.subtask.subtask_id].remove(band) + items.append(item) + elif move > 0: + item = items.pop() + self._stid_to_bands[item.subtask.subtask_id].append(band) + heapq.heappush(task_queue, item) + if len(task_queue) == 0: + self._band_queues.pop(band) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/service.py b/python/xorbits/_mars/services/scheduling/supervisor/service.py new file mode 100644 index 000000000..4ec202daf --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/service.py @@ -0,0 +1,146 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio + +from .... import oscar as mo +from ...core import AbstractService +from .autoscale import AutoscalerActor +from .manager import DEFAULT_SUBTASK_MAX_RESCHEDULES + + +class SchedulingSupervisorService(AbstractService): + """ + Scheduling service on supervisor. + + Scheduling Configuration + ------------------------ + { + "scheduling" : { + "submit_period": 1, + "autoscale" : { + "enabled": false, + "scheduler_backlog_timeout": 20, + "sustained_scheduler_backlog_timeout": 20, + "worker_idle_timeout": 40, + "min_workers": 1, + "max_workers": 100 + } + } + } + """ + + async def start(self): + from .globalresource import GlobalResourceManagerActor + + await mo.create_actor( + GlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=self._address, + ) + + autoscale_config = self._config.get("scheduling", {}).get("autoscale", {}) + await mo.create_actor( + AutoscalerActor, + autoscale_config, + uid=AutoscalerActor.default_uid(), + address=self._address, + ) + + async def stop(self): + from .autoscale import AutoscalerActor + + await mo.destroy_actor( + mo.create_actor_ref( + uid=AutoscalerActor.default_uid(), address=self._address + ) + ) + + from .globalresource import GlobalResourceManagerActor + + await mo.destroy_actor( + mo.create_actor_ref( + uid=GlobalResourceManagerActor.default_uid(), address=self._address + ) + ) + + async def create_session(self, session_id: str): + service_config = self._config or dict() + scheduling_config = service_config.get("scheduling", {}) + subtask_max_reschedules = scheduling_config.get( + "subtask_max_reschedules", DEFAULT_SUBTASK_MAX_RESCHEDULES + ) + subtask_cancel_timeout = scheduling_config.get("subtask_cancel_timeout", 5) + speculation_config = scheduling_config.get("speculation", {}) + + from .assigner import AssignerActor + + assigner_coro = mo.create_actor( + AssignerActor, + session_id, + address=self._address, + uid=AssignerActor.gen_uid(session_id), + ) + + from .queueing import SubtaskQueueingActor + + queueing_coro = mo.create_actor( + SubtaskQueueingActor, + session_id, + scheduling_config.get("submit_period"), + address=self._address, + uid=SubtaskQueueingActor.gen_uid(session_id), + 
) + + await asyncio.gather(assigner_coro, queueing_coro) + + from .manager import SubtaskManagerActor + + await mo.create_actor( + SubtaskManagerActor, + session_id, + subtask_max_reschedules, + subtask_cancel_timeout, + speculation_config, + address=self._address, + uid=SubtaskManagerActor.gen_uid(session_id), + ) + + from ...cluster import ClusterAPI + from .autoscale import AutoscalerActor + + cluster_api = await ClusterAPI.create(self._address) + [autoscaler_ref] = await cluster_api.get_supervisor_refs( + [AutoscalerActor.default_uid()] + ) + await autoscaler_ref.register_session(session_id, self._address) + + async def destroy_session(self, session_id: str): + from .assigner import AssignerActor + from .autoscale import AutoscalerActor + from .manager import SubtaskManagerActor + from .queueing import SubtaskQueueingActor + + autoscaler_ref = await mo.actor_ref( + AutoscalerActor.default_uid(), address=self._address + ) + await autoscaler_ref.unregister_session(session_id) + + destroy_tasks = [] + for actor_cls in [SubtaskManagerActor, SubtaskQueueingActor, AssignerActor]: + ref = await mo.actor_ref( + actor_cls.gen_uid(session_id), address=self._address + ) + destroy_tasks.append(asyncio.create_task(ref.destroy())) + await asyncio.gather(*destroy_tasks) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/speculation.py b/python/xorbits/_mars/services/scheduling/supervisor/speculation.py new file mode 100644 index 000000000..5cdecfed0 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/speculation.py @@ -0,0 +1,277 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import time +from collections import defaultdict +from typing import Dict + +import numpy as np + +from ....utils import create_task_with_error_log, parse_readable_size +from ..errors import NoAvailableBand +from .manager import SubtaskScheduleInfo + +logger = logging.getLogger(__name__) + +# the default times for speculative subtask execution. 
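+# The constants below are fall-back values for the `speculation` section of the
+# scheduling config. An illustrative (not authoritative) configuration that
+# mirrors these defaults while enabling the feature could look like:
+#
+#     "speculation": {
+#         "enabled": True,
+#         "dry": False,
+#         "threshold": 0.75,
+#         "interval": 5,
+#         "min_task_runtime": 3,
+#         "multiplier": 1.5,
+#         "max_concurrent_run": 3,
+#     }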
+DEFAULT_SUBTASK_SPECULATION_THRESHOLD = 0.75 +DEFAULT_SUBTASK_SPECULATION_INTERVAL = 5 # time unit: seconds +DEFAULT_SUBTASK_SPECULATION_MIN_TASK_RUNTIME = 3 +DEFAULT_SUBTASK_SPECULATION_MULTIPLIER = 1.5 +DEFAULT_SUBTASK_MAX_CONCURRENT_RUN = 3 + + +class SpeculativeScheduler: + _grouped_unfinished_subtasks: Dict[ + str, Dict[str, SubtaskScheduleInfo] + ] # key is subtask logic key + _grouped_finished_subtasks: Dict[ + str, Dict[str, SubtaskScheduleInfo] + ] # key is subtask logic key + + def __init__( + self, queueing_ref, global_resource_ref, speculation_config: Dict[str, any] + ): + self._grouped_unfinished_subtasks = defaultdict(dict) + self._grouped_finished_subtasks = defaultdict(dict) + self._queueing_ref = queueing_ref + self._global_resource_ref = global_resource_ref + self._speculation_config = speculation_config + self._subtask_speculation_enabled = speculation_config.get("enabled", False) + assert self._subtask_speculation_enabled in (True, False) + self._subtask_speculation_dry = speculation_config.get("dry", False) + self._subtask_speculation_threshold = parse_readable_size( + speculation_config.get("threshold", DEFAULT_SUBTASK_SPECULATION_THRESHOLD) + )[0] + self._subtask_speculation_interval = speculation_config.get( + "interval", DEFAULT_SUBTASK_SPECULATION_INTERVAL + ) + self._subtask_speculation_min_task_runtime = speculation_config.get( + "min_task_runtime", DEFAULT_SUBTASK_SPECULATION_MIN_TASK_RUNTIME + ) + self._subtask_speculation_multiplier = speculation_config.get( + "multiplier", DEFAULT_SUBTASK_SPECULATION_MULTIPLIER + ) + self._subtask_speculation_max_concurrent_run = speculation_config.get( + "max_concurrent_run", DEFAULT_SUBTASK_MAX_CONCURRENT_RUN + ) + if self._subtask_speculation_enabled: + assert 1 >= self._subtask_speculation_threshold > 0 + assert self._subtask_speculation_interval > 0 + assert self._subtask_speculation_min_task_runtime > 0 + assert self._subtask_speculation_multiplier > 0 + assert self._subtask_speculation_max_concurrent_run > 0 + self._speculation_execution_task = None + + async def start(self): + if self._subtask_speculation_enabled: + self._speculation_execution_task = create_task_with_error_log( + self._speculative_execution_loop() + ) + logger.info( + "Speculative execution started with config %s.", + self._speculation_config, + ) + + async def stop(self): + if self._subtask_speculation_enabled: + self._speculation_execution_task.cancel() + try: + await self._speculation_execution_task + except asyncio.CancelledError: + pass + logger.info("Speculative execution stopped.") + + def add_subtask(self, subtask_info: SubtaskScheduleInfo): + # duplicate subtask add will be handled in `_speculative_execution`. 
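        # Bookkeeping note: subtasks sharing a logic_key form one group. An entry stays in
        # _grouped_unfinished_subtasks until finish_subtask() moves it to
        # _grouped_finished_subtasks; the speculation loop scans the finished groups to
        # decide when the remaining members look slow.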
+ subtask = subtask_info.subtask + self._grouped_unfinished_subtasks[subtask.logic_key][ + subtask.subtask_id + ] = subtask_info + + def finish_subtask(self, subtask_info: SubtaskScheduleInfo): + subtask = subtask_info.subtask + grouped_finished_subtasks = self._grouped_finished_subtasks[subtask.logic_key] + grouped_finished_subtasks[subtask.subtask_id] = subtask_info + self._grouped_unfinished_subtasks[subtask.logic_key].pop( + subtask.subtask_id, None + ) + if len(grouped_finished_subtasks) == subtask.logic_parallelism: + self._grouped_finished_subtasks.pop(subtask.logic_key) + self._grouped_unfinished_subtasks.pop(subtask.logic_key, None) + logger.info( + "Subtask group with logic key %s parallelism %s finished.", + subtask.logic_key, + subtask.logic_parallelism, + ) + + async def _speculative_execution_loop(self): + while True: + # check subtasks in the same group which has same logic key periodically, if some subtasks hasn't been + # finished in a considerably longer duration, then those subtasks maybe slow/hang subtasks, try resubmit + # it to other bands too. + await asyncio.sleep(self._subtask_speculation_interval) + await self._speculative_execution() + + async def _speculative_execution(self): + for logic_key, subtask_infos_dict in dict( + self._grouped_finished_subtasks + ).items(): + if not subtask_infos_dict: # pragma: no cover + continue + subtask_infos = subtask_infos_dict.values() + one_subtask = next(iter(subtask_infos)).subtask + parallelism = one_subtask.logic_parallelism + spec_threshold = max( + 1, int(self._subtask_speculation_threshold * parallelism) + ) + # if finished subtasks reached the spec_threshold, try to find slow/hang unfinished subtasks + if parallelism > len(subtask_infos) >= spec_threshold: + unfinished_subtask_infos = self._grouped_unfinished_subtasks[ + logic_key + ].values() + # sort finished subtasks by running time + duration_array = np.sort( + np.array( + [info.end_time - info.start_time for info in subtask_infos] + ) + ) + median = np.percentile(duration_array, 50) + duration_threshold = max( + median * self._subtask_speculation_multiplier, + self._subtask_speculation_min_task_runtime, + ) + now = time.time() + # find subtasks whose duration is large enough so that can be took as slow/hang subtasks + unfinished_subtask_infos = [ + info + for info in unfinished_subtask_infos + if info not in subtask_infos + and now - info.start_time > duration_threshold + ] + if not unfinished_subtask_infos: # pragma: no cover + continue + exclude_bands = set() + for info in unfinished_subtask_infos: + exclude_bands.update(info.band_futures.keys()) + remaining_resources = ( + await self._global_resource_ref.get_remaining_resources() + ) + logger.warning( + "%s subtasks in %s for group %s has not been finished in %s seconds on bands %s, " + "median duration is %s, average duration for %s finished subtasks " + "is %s. trying speculative running. " + "Current cluster remaining resources %s", + len(unfinished_subtask_infos), + parallelism, + logic_key, + duration_threshold, + exclude_bands, + median, + len(subtask_infos), + duration_array.mean(), + remaining_resources, + ) + # TODO(chaokunyang) If too many subtasks got stale on same node, mark the node as slow node. 
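                # Worked example (illustrative numbers, not from this change): with
                # parallelism=10, threshold=0.75 and multiplier=1.5, speculation starts once
                # max(1, int(0.75 * 10)) = 7 subtasks have finished; if their median runtime
                # is 20s and min_task_runtime=3, any still-running subtask older than
                # max(20 * 1.5, 3) = 30s was selected above as slow and is resubmitted by
                # the loop below.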
+ for subtask_info in unfinished_subtask_infos: + subtask = subtask_info.subtask + if subtask.retryable: + logger.warning( + "Subtask %s has not been finished in %s seconds on bands %s, " + "trying speculative running.", + subtask.subtask_id, + now - subtask_info.start_time, + list(subtask_info.band_futures.keys()), + ) + await self._submit_speculative_subtask( + subtask_info, exclude_bands + ) + else: + logger.warning( + "Unretryable subtask %s has not been finished in %s seconds " + "on bands %s, median duration is %s, it may hang.", + subtask.subtask_id, + (now - subtask_info.start_time), + list(subtask_info.band_futures.keys()), + median, + ) + await self._queueing_ref.submit_subtasks.tell() + + async def _submit_speculative_subtask(self, subtask_info, exclude_bands): + subtask = subtask_info.subtask + if ( + subtask_info.num_speculative_concurrent_run + == self._subtask_speculation_max_concurrent_run + ): + logger.debug( + "Subtask %s speculative run has reached max limit %s, " + "won't submit another speculative run.", + subtask.subtask_id, + self._subtask_speculation_max_concurrent_run, + ) + return + if not self._subtask_speculation_dry: + if ( + len(subtask_info.band_futures) + < subtask_info.num_speculative_concurrent_run + 1 + ): + # ensure same subtask won't be submitted to same worker. + logger.info( + "Speculative execution for subtask %s has not been submitted to worker," + "waiting for being submitted to worker." + "Cluster resources may be not enough after excluded %s", + subtask.subtask_id, + exclude_bands, + ) + return + try: + await self._queueing_ref.add_subtasks( + [subtask], + [subtask.priority or tuple()], + exclude_bands=exclude_bands, + random_when_unavailable=False, + ) + logger.info( + "Added subtask %s to queue excluded from %s.", + subtask.subtask_id, + exclude_bands, + ) + subtask_info.num_speculative_concurrent_run += 1 + if ( + subtask_info.num_speculative_concurrent_run + == self._subtask_speculation_max_concurrent_run + ): + logger.info( + "Subtask %s reached max speculative execution: %s", + subtask.subtask_id, + self._subtask_speculation_max_concurrent_run, + ) + except NoAvailableBand: + logger.warning( + "No bands available for subtask %s after excluded bands %s, " + "try resubmit later.", + subtask.subtask_id, + exclude_bands, + ) + except KeyError as e: # pragma: no cover + # if the subtask happen to be finished, it's input chunk may got gc, if assigning to band + # needs to know input meta, we'll get KeyError or something else, just ignore it. + logger.warning( + "Subtask %s may happen to be finished just now, cannot add it to " + "subtask queue, got error %s, just ignore it.", + subtask.subtask_id, + e, + ) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/__init__.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_assigner.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_assigner.py new file mode 100644 index 000000000..23dd8fcc9 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_assigner.py @@ -0,0 +1,375 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio + +import numpy as np +import pytest + +from ..... import oscar as mo +from .....core import ChunkGraph +from .....tensor.arithmetic import TensorTreeAdd +from .....tensor.fetch import TensorFetch +from ....cluster import ClusterAPI +from ....cluster.core import NodeRole, NodeStatus +from ....cluster.supervisor.locator import SupervisorPeerLocatorActor +from ....cluster.supervisor.node_info import NodeInfoCollectorActor +from ....cluster.uploader import NodeInfoUploaderActor +from ....meta import MockMetaAPI +from ....session import MockSessionAPI +from ....subtask import Subtask +from ...errors import NoAvailableBand, NoMatchingSlots +from ...supervisor import AssignerActor + + +class MockNodeInfoCollectorActor(NodeInfoCollectorActor): + def __init__(self, timeout=None, check_interval=None, with_gpu=False): + super().__init__(timeout=timeout, check_interval=check_interval) + self.ready_bands = { + ("address0", "numa-0"): 2, + ("address1", "numa-0"): 2, + ("address2", "numa-0"): 2, + ("address3", "numa-0"): 2, + } + if with_gpu: + self.ready_bands[("address0", "gpu-0")] = 1 + self.all_bands = self.ready_bands.copy() + + async def update_node_info( + self, address, role, env=None, resource=None, detail=None, status=None + ): + if "address" in address and status == NodeStatus.STOPPING: + del self.ready_bands[(address, "numa-0")] + await super().update_node_info(address, role, env, resource, detail, status) + + def get_all_bands(self, role=None, statuses=None): + if statuses == {NodeStatus.READY}: + return self.ready_bands + else: + return self.all_bands + + +class FakeClusterAPI(ClusterAPI): + @classmethod + async def create(cls, address: str, **kw): + dones, _ = await asyncio.wait( + [ + mo.create_actor( + SupervisorPeerLocatorActor, + "fixed", + address, + uid=SupervisorPeerLocatorActor.default_uid(), + address=address, + ), + mo.create_actor( + MockNodeInfoCollectorActor, + with_gpu=kw.get("with_gpu", False), + uid=NodeInfoCollectorActor.default_uid(), + address=address, + ), + mo.create_actor( + NodeInfoUploaderActor, + NodeRole.WORKER, + interval=kw.get("upload_interval"), + band_to_resource=kw.get("band_to_resource"), + use_gpu=kw.get("use_gpu", False), + uid=NodeInfoUploaderActor.default_uid(), + address=address, + ), + ] + ) + + for task in dones: + try: + task.result() + except mo.ActorAlreadyExist: # pragma: no cover + pass + + api = await super().create(address=address) + await api.mark_node_ready() + return api + + +@pytest.fixture +async def 
actor_pool(request): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + with_gpu = request.param + + async with pool: + session_id = "test_session" + cluster_api = await FakeClusterAPI.create( + pool.external_address, with_gpu=with_gpu + ) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + meta_api = await MockMetaAPI.create(session_id, pool.external_address) + assigner_ref = await mo.create_actor( + AssignerActor, + session_id, + uid=AssignerActor.gen_uid(session_id), + address=pool.external_address, + ) + + try: + yield pool, session_id, assigner_ref, cluster_api, meta_api + finally: + await mo.destroy_actor(assigner_ref) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [False], indirect=True) +async def test_assign_cpu_tasks(actor_pool): + pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool + + input1 = TensorFetch(key="a", source_key="a", dtype=np.dtype(int)).new_chunk([]) + input2 = TensorFetch(key="b", source_key="b", dtype=np.dtype(int)).new_chunk([]) + input3 = TensorFetch(key="c", source_key="c", dtype=np.dtype(int)).new_chunk([]) + result_chunk = TensorTreeAdd(args=[input1, input2, input3]).new_chunk( + [input1, input2, input3] + ) + + chunk_graph = ChunkGraph([result_chunk]) + chunk_graph.add_node(input1) + chunk_graph.add_node(input2) + chunk_graph.add_node(input3) + chunk_graph.add_node(result_chunk) + chunk_graph.add_edge(input1, result_chunk) + chunk_graph.add_edge(input2, result_chunk) + chunk_graph.add_edge(input3, result_chunk) + + await meta_api.set_chunk_meta( + input1, memory_size=200, store_size=200, bands=[("address0", "numa-0")] + ) + await meta_api.set_chunk_meta( + input2, memory_size=400, store_size=400, bands=[("address1", "numa-0")] + ) + await meta_api.set_chunk_meta( + input3, memory_size=400, store_size=400, bands=[("address2", "numa-0")] + ) + + await cluster_api.set_node_status( + node="address1", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + await cluster_api.set_node_status( + node="address3", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + + subtask = Subtask("test_task", session_id, chunk_graph=chunk_graph) + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result in (("address0", "numa-0"), ("address2", "numa-0")) + + subtask.expect_bands = [("address0", "numa-0")] + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result == ("address0", "numa-0") + + subtask.expect_bands = [("address0", "numa-0"), ("address1", "numa-0")] + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result == ("address0", "numa-0") + + subtask.expect_bands = [("address1", "numa-0")] + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result in (("address0", "numa-0"), ("address2", "numa-0")) + + [result] = await assigner_ref.assign_subtasks( + [subtask], exclude_bands={("address0", "numa-0"), ("address2", "numa-0")} + ) + assert result in (("address0", "numa-0"), ("address2", "numa-0")) + [result] = await assigner_ref.assign_subtasks( + [subtask], exclude_bands={("address0", "numa-0")}, random_when_unavailable=False + ) + assert result == ("address2", "numa-0") + with pytest.raises(NoAvailableBand): + await assigner_ref.assign_subtasks( + [subtask], + exclude_bands={("address0", "numa-0"), ("address2", "numa-0")}, + random_when_unavailable=False, + ) + subtask.bands_specified = True + assert result == ("address2", "numa-0") + with pytest.raises(NoAvailableBand): + await assigner_ref.assign_subtasks([subtask]) + 
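    # Editor's note (inference from this test, not an authoritative statement of the
    # assigner API): with bands_specified=True the subtask appears pinned to its
    # expect_bands, so a STOPPING "address1" leaves no candidate band and the call above
    # raises NoAvailableBand; clearing the flag below restores the normal fallback behavior.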
subtask.bands_specified = False + + result_chunk.op.gpu = True + subtask = Subtask("test_task", session_id, chunk_graph=chunk_graph) + with pytest.raises(NoMatchingSlots) as err: + await assigner_ref.assign_subtasks([subtask]) + assert "gpu" in str(err.value) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [False], indirect=True) +async def test_assign_broadcaster(actor_pool): + pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool + + broadcaster = TensorFetch(key="x", source_key="x", dtype=np.dtype(int)).new_chunk( + [], is_broadcaster=True + ) + input_chunk = TensorFetch(key="a", source_key="a", dtype=np.dtype(int)).new_chunk( + [] + ) + result_chunk = TensorTreeAdd(args=[broadcaster, input_chunk]).new_chunk( + [broadcaster, input_chunk] + ) + + chunk_graph = ChunkGraph([result_chunk]) + chunk_graph.add_node(broadcaster) + chunk_graph.add_node(input_chunk) + chunk_graph.add_node(result_chunk) + chunk_graph.add_edge(broadcaster, result_chunk) + chunk_graph.add_edge(input_chunk, result_chunk) + + await meta_api.set_chunk_meta( + broadcaster, memory_size=1000, store_size=200, bands=[("address0", "numa-0")] + ) + await meta_api.set_chunk_meta( + input_chunk, memory_size=200, store_size=200, bands=[("address1", "numa-0")] + ) + + subtask = Subtask("test_task", session_id, chunk_graph=chunk_graph) + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result == ("address1", "numa-0") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [True], indirect=True) +async def test_assign_gpu_tasks(actor_pool): + pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool + + input1 = TensorFetch(key="a", source_key="a", dtype=np.dtype(int)).new_chunk([]) + input2 = TensorFetch(key="b", source_key="b", dtype=np.dtype(int)).new_chunk([]) + result_chunk = TensorTreeAdd(args=[input1, input2], gpu=True).new_chunk( + [input1, input2] + ) + + chunk_graph = ChunkGraph([result_chunk]) + chunk_graph.add_node(input1) + chunk_graph.add_node(input2) + chunk_graph.add_node(result_chunk) + chunk_graph.add_edge(input1, result_chunk) + chunk_graph.add_edge(input2, result_chunk) + + await meta_api.set_chunk_meta( + input1, memory_size=200, store_size=200, bands=[("address0", "numa-0")] + ) + await meta_api.set_chunk_meta( + input2, memory_size=200, store_size=200, bands=[("address0", "numa-0")] + ) + + subtask = Subtask("test_task", session_id, chunk_graph=chunk_graph) + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result[1].startswith("gpu") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [False], indirect=True) +async def test_reassign_subtasks(actor_pool): + pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool + + # ('address0', 'numa-0'), ('address1', 'numa-0'), ('address2', 'numa-0') are ready + await cluster_api.set_node_status( + node="address3", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + + band_num_queued_subtasks = {("address0", "numa-0"): 3, ("address1", "numa-0"): 4} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks in ( + { + ("address1", "numa-0"): -1, + ("address0", "numa-0"): -1, + ("address2", "numa-0"): 2, + }, + { + ("address1", "numa-0"): -2, + ("address0", "numa-0"): 0, + ("address2", "numa-0"): 2, + }, + { + ("address1", "numa-0"): -2, + ("address0", "numa-0"): -1, + ("address2", "numa-0"): 3, + }, + ) + + # ('address0', 'numa-0'), ('address2', 'numa-0') are ready + await 
cluster_api.set_node_status( + node="address1", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + + band_num_queued_subtasks = { + ("address0", "numa-0"): 9, + ("address1", "numa-0"): 7, + ("address2", "numa-0"): 0, + } + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks in ( + { + ("address1", "numa-0"): -7, + ("address0", "numa-0"): 3, + ("address2", "numa-0"): 4, + }, + { + ("address1", "numa-0"): -7, + ("address0", "numa-0"): 4, + ("address2", "numa-0"): 3, + }, + ) + + band_num_queued_subtasks = {("address0", "numa-0"): 9, ("address1", "numa-0"): 7} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks == { + ("address1", "numa-0"): -7, + ("address0", "numa-0"): -1, + ("address2", "numa-0"): 8, + } + + band_num_queued_subtasks = {("address1", "numa-0"): 8} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks == { + ("address1", "numa-0"): -8, + ("address0", "numa-0"): 4, + ("address2", "numa-0"): 4, + } + + band_num_queued_subtasks = {("address1", "numa-0"): 0} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks == {("address1", "numa-0"): 0} + + # only ('address0', 'numa-0') is ready, i.e. there's only one band initially + await cluster_api.set_node_status( + node="address2", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + band_num_queued_subtasks = {("address0", "numa-0"): 8} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks == {("address0", "numa-0"): 0} + + band_num_queued_subtasks = {("address1", "numa-0"): 8} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks == { + ("address1", "numa-0"): -8, + ("address0", "numa-0"): 8, + } diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_globalresource.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_globalresource.py new file mode 100644 index 000000000..84bb27a8e --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_globalresource.py @@ -0,0 +1,82 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio + +import pytest + +from ..... 
import oscar as mo +from .....resource import Resource +from ....cluster import ClusterAPI, MockClusterAPI +from ....session import MockSessionAPI +from ...supervisor import GlobalResourceManagerActor + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + session_id = "test_session" + await MockClusterAPI.create(pool.external_address) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + + global_resource_ref = await mo.create_actor( + GlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + + try: + yield pool, session_id, global_resource_ref + finally: + await mo.destroy_actor(global_resource_ref) + await MockClusterAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_global_resource(actor_pool): + pool, session_id, global_resource_ref = actor_pool + + cluster_api = await ClusterAPI.create(pool.external_address) + bands = await cluster_api.get_all_bands() + band = (pool.external_address, "numa-0") + band_resource = bands[band] + + assert band in await global_resource_ref.get_idle_bands(0) + assert ["subtask0"] == await global_resource_ref.apply_subtask_resources( + band, session_id, ["subtask0"], [Resource(num_cpus=1)] + ) + assert band not in await global_resource_ref.get_idle_bands(0) + + await global_resource_ref.update_subtask_resources( + band, session_id, "subtask0", band_resource + ) + assert [] == await global_resource_ref.apply_subtask_resources( + band, session_id, ["subtask1"], [Resource(num_cpus=1)] + ) + + wait_coro = global_resource_ref.wait_band_idle(band) + (done, pending) = await asyncio.wait([wait_coro], timeout=0.5) + assert not done + await global_resource_ref.release_subtask_resource(band, session_id, "subtask0") + (done, pending) = await asyncio.wait([wait_coro], timeout=0.5) + assert done + assert band in await global_resource_ref.get_idle_bands(0) + assert ["subtask1"] == await global_resource_ref.apply_subtask_resources( + band, session_id, ["subtask1"], [Resource(num_cpus=1)] + ) + assert (await global_resource_ref.get_remaining_resources())[ + band + ] == band_resource - Resource(num_cpus=1) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_manager.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_manager.py new file mode 100644 index 000000000..d16c993f0 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_manager.py @@ -0,0 +1,200 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +from collections import defaultdict +from typing import List, Set, Tuple + +import pytest + +from ..... 
import oscar as mo +from .....typing import BandType +from ....cluster import MockClusterAPI +from ....subtask import Subtask, SubtaskResult, SubtaskStatus +from ....task.supervisor.manager import TaskManagerActor +from ...supervisor import ( + GlobalResourceManagerActor, + SubtaskManagerActor, + SubtaskQueueingActor, +) +from ...worker import SubtaskExecutionActor + + +class MockTaskManagerActor(mo.Actor): + def __init__(self): + self._results = dict() + + def set_subtask_result(self, result: SubtaskResult): + self._results[result.subtask_id] = result + + def get_result(self, subtask_id: str) -> SubtaskResult: + return self._results[subtask_id] + + +class MockSubtaskQueueingActor(mo.Actor): + def __init__(self): + self._subtasks = dict() + self._error = None + + def add_subtasks( + self, + subtasks: List[Subtask], + priorities: List[Tuple], + exclude_bands: Set[Tuple] = None, + random_when_unavailable: bool = True, + ): + if self._error is not None: + raise self._error + for subtask, priority in zip(subtasks, priorities): + self._subtasks[subtask.subtask_id] = (subtask, priority) + + def submit_subtasks(self, band: BandType, limit: int): + pass + + def remove_queued_subtasks(self, subtask_ids: List[str]): + for stid in subtask_ids: + self._subtasks.pop(stid) + + def set_error(self, error): + self._error = error + + +class MockSubtaskExecutionActor(mo.StatelessActor): + def __init__(self): + self._subtask_aiotasks = defaultdict(dict) + self._run_subtask_events = {} + + async def set_run_subtask_event(self, subtask_id, event): + self._run_subtask_events[subtask_id] = event + + async def run_subtask( + self, subtask: Subtask, band_name: str, supervisor_address: str + ): + self._run_subtask_events[subtask.subtask_id].set() + task = self._subtask_aiotasks[subtask.subtask_id][ + band_name + ] = asyncio.create_task(asyncio.sleep(20)) + return await task + + def cancel_subtask(self, subtask_id: str, kill_timeout: int = 5): + for task in self._subtask_aiotasks[subtask_id].values(): + task.cancel() + + async def wait_subtask(self, subtask_id: str, band_name: str): + try: + yield self._subtask_aiotasks[subtask_id][band_name] + except asyncio.CancelledError: + pass + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + session_id = "test_session" + await MockClusterAPI.create(pool.external_address) + queue_ref = await mo.create_actor( + MockSubtaskQueueingActor, + uid=SubtaskQueueingActor.gen_uid(session_id), + address=pool.external_address, + ) + slots_ref = await mo.create_actor( + GlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + task_manager_ref = await mo.create_actor( + MockTaskManagerActor, + uid=TaskManagerActor.gen_uid(session_id), + address=pool.external_address, + ) + execution_ref = await mo.create_actor( + MockSubtaskExecutionActor, + uid=SubtaskExecutionActor.default_uid(), + address=pool.external_address, + ) + submitter_ref = await mo.create_actor( + SubtaskManagerActor, + session_id, + uid=SubtaskManagerActor.gen_uid(session_id), + address=pool.external_address, + ) + + try: + yield pool, session_id, execution_ref, submitter_ref, queue_ref, task_manager_ref + finally: + await mo.destroy_actor(slots_ref) + await MockClusterAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_subtask_manager(actor_pool): + ( + pool, + session_id, + execution_ref, + manager_ref, + queue_ref, + task_manager_ref, + ) = actor_pool + + subtask1 
= Subtask("subtask1", session_id) + subtask2 = Subtask("subtask2", session_id) + + await manager_ref.add_subtasks([subtask1, subtask2], [(1,), (2,)]) + run_subtask1_event, run_subtask2_event = asyncio.Event(), asyncio.Event() + await execution_ref.set_run_subtask_event(subtask1.subtask_id, run_subtask1_event) + await execution_ref.set_run_subtask_event(subtask2.subtask_id, run_subtask2_event) + + submit1 = asyncio.create_task( + manager_ref.submit_subtask_to_band( + subtask1.subtask_id, (pool.external_address, "gpu-0") + ) + ) + submit2 = asyncio.create_task( + manager_ref.submit_subtask_to_band( + subtask2.subtask_id, (pool.external_address, "gpu-1") + ) + ) + + await asyncio.gather(run_subtask1_event.wait(), run_subtask2_event.wait()) + + await manager_ref.cancel_subtasks([subtask1.subtask_id, subtask2.subtask_id]) + await asyncio.wait_for( + asyncio.gather( + execution_ref.wait_subtask(subtask1.subtask_id, "gpu-0"), + execution_ref.wait_subtask(subtask2.subtask_id, "gpu-1"), + ), + timeout=10, + ) + with pytest.raises(asyncio.CancelledError): + await submit1 + with pytest.raises(asyncio.CancelledError): + await submit2 + assert ( + await task_manager_ref.get_result(subtask1.subtask_id) + ).status == SubtaskStatus.cancelled + assert ( + await task_manager_ref.get_result(subtask2.subtask_id) + ).status == SubtaskStatus.cancelled + + subtask3 = Subtask("subtask3", session_id) + + await queue_ref.set_error(ValueError()) + await manager_ref.add_subtasks.tell([subtask3], [(3,)]) + await asyncio.sleep(0.1) + subtask3_result = await task_manager_ref.get_result(subtask3.subtask_id) + assert subtask3_result.status == SubtaskStatus.errored + assert isinstance(subtask3_result.error, ValueError) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queue_balance.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queue_balance.py new file mode 100644 index 000000000..53f294678 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queue_balance.py @@ -0,0 +1,238 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +from collections import defaultdict +from typing import List, Tuple + +import pytest + +from ..... 
import oscar as mo +from .....resource import Resource +from ....cluster import ClusterAPI +from ....cluster.core import NodeRole, NodeStatus +from ....cluster.supervisor.locator import SupervisorPeerLocatorActor +from ....cluster.supervisor.node_info import NodeInfoCollectorActor +from ....cluster.uploader import NodeInfoUploaderActor +from ....subtask import Subtask +from ...supervisor import ( + AssignerActor, + GlobalResourceManagerActor, + SubtaskManagerActor, + SubtaskQueueingActor, +) + + +class MockNodeInfoCollectorActor(NodeInfoCollectorActor): + def __init__(self, timeout=None, check_interval=None): + super().__init__(timeout=timeout, check_interval=check_interval) + self.ready_nodes = { + ("address0", "numa-0"): 2, + ("address1", "numa-0"): 2, + ("address2", "numa-0"): 2, + } + + async def update_node_info( + self, address, role, env=None, resource=None, detail=None, status=None + ): + if "address" in address and status == NodeStatus.STOPPING: + del self.ready_nodes[(address, "numa-0")] + await super().update_node_info(address, role, env, resource, detail, status) + + def get_all_bands(self, role=None, statuses=None): + if statuses == {NodeStatus.READY}: + return self.ready_nodes + else: + return { + ("address0", "numa-0"): 2, + ("address1", "numa-0"): 2, + ("address2", "numa-0"): 2, + } + + +class FakeClusterAPI(ClusterAPI): + @classmethod + async def create(cls, address: str, **kw): + dones, _ = await asyncio.wait( + [ + mo.create_actor( + SupervisorPeerLocatorActor, + "fixed", + address, + uid=SupervisorPeerLocatorActor.default_uid(), + address=address, + ), + mo.create_actor( + MockNodeInfoCollectorActor, + uid=NodeInfoCollectorActor.default_uid(), + address=address, + ), + mo.create_actor( + NodeInfoUploaderActor, + NodeRole.WORKER, + interval=kw.get("upload_interval"), + band_to_resource=kw.get("band_to_resource"), + use_gpu=kw.get("use_gpu", False), + uid=NodeInfoUploaderActor.default_uid(), + address=address, + ), + ] + ) + + for task in dones: + try: + task.result() + except mo.ActorAlreadyExist: # pragma: no cover + pass + + api = await super().create(address=address) + await api.mark_node_ready() + return api + + +class MockSlotsActor(mo.Actor): + @mo.extensible + def apply_subtask_resources( + self, + band: Tuple, + session_id: str, + subtask_ids: List[str], + subtask_slots: List[Resource], + ): + return subtask_ids + + def refresh_bands(self): + pass + + def get_used_resources(self): + return {} + + +class MockAssignerActor(mo.Actor): + def assign_subtasks( + self, subtasks: List[Subtask], exclude_bands=None, random_when_unavailable=True + ): + return [subtask.expect_bands[0] for subtask in subtasks] + + def reassign_subtasks(self, band_num_queued_subtasks): + if len(band_num_queued_subtasks.keys()) == 1: + [(band, _)] = band_num_queued_subtasks.items() + return {band: 0} + return { + ("address1", "numa-0"): -8, + ("address0", "numa-0"): 0, + ("address2", "numa-0"): 8, + } + + +class MockSubtaskManagerActor(mo.Actor): + def __init__(self): + self._submitted_subtask_ids = defaultdict(list) + + @mo.extensible + def submit_subtask_to_band(self, subtask_id: str, band: Tuple): + print(f"submit subtask {subtask_id} to band {band}") + self._submitted_subtask_ids[band].append(subtask_id) + + def dump_data(self): + return self._submitted_subtask_ids + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + session_id = "test_session" + cluster_api = await FakeClusterAPI.create(pool.external_address) + + 
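        # Editor's note (descriptive only): this fixture pairs a real SubtaskQueueingActor
        # with mocked assigner/manager/slots actors, so the balancing test exercises only
        # the queueing logic while reassign_subtasks returns a canned redistribution.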
# create assigner actor + await mo.create_actor( + MockAssignerActor, + uid=AssignerActor.gen_uid(session_id), + address=pool.external_address, + ) + # create queueing actor + manager_ref = await mo.create_actor( + MockSubtaskManagerActor, + uid=SubtaskManagerActor.gen_uid(session_id), + address=pool.external_address, + ) + # create slots actor + slots_ref = await mo.create_actor( + MockSlotsActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + # create queueing actor + queueing_ref = await mo.create_actor( + SubtaskQueueingActor, + session_id, + 1, + uid=SubtaskQueueingActor.gen_uid(session_id), + address=pool.external_address, + ) + + try: + yield pool, session_id, cluster_api, queueing_ref, slots_ref, manager_ref + finally: + await mo.destroy_actor(queueing_ref) + + +async def _queue_subtasks(num_subtasks, expect_bands, queueing_ref): + if not num_subtasks: + return + subtasks = [Subtask(expect_bands[0] + "-" + str(i)) for i in range(num_subtasks)] + for subtask in subtasks: + subtask.expect_bands = [expect_bands] + subtask.required_resource = Resource(num_cpus=1) + priorities = [(i,) for i in range(num_subtasks)] + + await queueing_ref.add_subtasks(subtasks, priorities) + + +@pytest.mark.asyncio +async def test_subtask_queueing(actor_pool): + _pool, session_id, cluster_api, queueing_ref, slots_ref, manager_ref = actor_pool + nums_subtasks = [9, 8, 1] + expects_bands = [ + ("address0", "numa-0"), + ("address1", "numa-0"), + ("address2", "numa-0"), + ] + for num_subtasks, expect_bands in zip(nums_subtasks, expects_bands): + await _queue_subtasks(num_subtasks, expect_bands, queueing_ref) + + await cluster_api.set_node_status( + node="address1", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + + # 9 subtasks on ('address0', 'numa-0') + await queueing_ref.submit_subtasks(band=("address0", "numa-0"), limit=10) + commited_subtask_ids = (await manager_ref.dump_data())[("address0", "numa-0")] + assert ( + len(commited_subtask_ids) == 9 + ), f"commited_subtask_ids {commited_subtask_ids}" + + # 0 subtasks on ('address1', 'numa-0') + await queueing_ref.submit_subtasks(band=("address1", "numa-0"), limit=10) + commited_subtask_ids = (await manager_ref.dump_data())[("address0", "numa-0")] + assert ( + len(commited_subtask_ids) == 9 + ), f"commited_subtask_ids {commited_subtask_ids}" + + # 9 subtasks on ('address2', 'numa-0') + await queueing_ref.submit_subtasks(band=("address2", "numa-0"), limit=10) + submitted_subtask_ids = await manager_ref.dump_data() + assert sum(len(v) for v in submitted_subtask_ids.values()) == 18 diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queueing.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queueing.py new file mode 100644 index 000000000..d6032cc74 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queueing.py @@ -0,0 +1,141 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List, Tuple + +import pytest + +from ..... import oscar as mo +from .....resource import Resource +from ....cluster import MockClusterAPI +from ....subtask import Subtask +from ...supervisor import ( + AssignerActor, + GlobalResourceManagerActor, + SubtaskManagerActor, + SubtaskQueueingActor, +) + + +class MockSlotsActor(mo.Actor): + def __init__(self): + self._capacity = -1 + + def set_capacity(self, capacity: int): + self._capacity = capacity + + @mo.extensible + def apply_subtask_resources( + self, + band: Tuple, + session_id: str, + subtask_ids: List[str], + subtask_resources: List[Resource], + ): + idx = ( + min(self._capacity, len(subtask_ids)) + if self._capacity >= 0 + else len(subtask_ids) + ) + return subtask_ids[:idx] + + +class MockAssignerActor(mo.Actor): + def assign_subtasks( + self, subtasks: List[Subtask], exclude_bands=None, random_when_unavailable=True + ): + return [(self.address, "numa-0")] * len(subtasks) + + +class MockSubtaskManagerActor(mo.Actor): + def __init__(self): + self._subtask_ids, self._bands = [], [] + + @mo.extensible + def submit_subtask_to_band(self, subtask_id: str, band: Tuple): + self._subtask_ids.append(subtask_id) + self._bands.append(band) + + def dump_data(self): + return self._subtask_ids, self._bands + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + session_id = "test_session" + await MockClusterAPI.create(pool.external_address) + + # create assigner actor + await mo.create_actor( + MockAssignerActor, + uid=AssignerActor.gen_uid(session_id), + address=pool.external_address, + ) + # create queueing actor + manager_ref = await mo.create_actor( + MockSubtaskManagerActor, + uid=SubtaskManagerActor.gen_uid(session_id), + address=pool.external_address, + ) + # create slots actor + slots_ref = await mo.create_actor( + MockSlotsActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + # create queueing actor + queueing_ref = await mo.create_actor( + SubtaskQueueingActor, + session_id, + uid=SubtaskQueueingActor.gen_uid(session_id), + address=pool.external_address, + ) + try: + yield pool, session_id, queueing_ref, slots_ref, manager_ref + finally: + await mo.destroy_actor(queueing_ref) + await MockClusterAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_subtask_queueing(actor_pool): + _pool, session_id, queueing_ref, slots_ref, manager_ref = actor_pool + await slots_ref.set_capacity(2) + + subtasks = [Subtask(str(i)) for i in range(5)] + priorities = [(i,) for i in range(5)] + + await queueing_ref.add_subtasks(subtasks, priorities) + # queue: [4 3 2 1 0] + assert await queueing_ref.all_bands_busy() + await queueing_ref.submit_subtasks() + # queue: [2 1 0] + commited_subtask_ids, _commited_bands = await manager_ref.dump_data() + assert commited_subtask_ids == ["4", "3"] + + await queueing_ref.remove_queued_subtasks(["1"]) + # queue: [2 0] + await queueing_ref.update_subtask_priority.batch( + queueing_ref.update_subtask_priority.delay("0", (3,)), + queueing_ref.update_subtask_priority.delay("4", (5,)), + ) + # queue: [0(3) 2] + await queueing_ref.submit_subtasks() + # queue: [] + commited_subtasks, _commited_bands = await manager_ref.dump_data() + assert commited_subtasks == ["4", "3", "0", "2"] + assert not await queueing_ref.all_bands_busy() diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_speculation.py 
b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_speculation.py new file mode 100644 index 000000000..ed666a483 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_speculation.py @@ -0,0 +1,151 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +from typing import List, Set, Tuple + +import pytest + +from ..... import oscar as mo +from ....cluster import MockClusterAPI +from ....subtask import Subtask +from ...errors import NoAvailableBand +from ...supervisor import GlobalResourceManagerActor +from ..manager import SubtaskScheduleInfo +from ..speculation import SpeculativeScheduler + + +class MockSubtaskQueueingActor(mo.Actor): + def __init__(self): + self._subtasks = [] + self._exceptions = [] + + async def add_subtasks( + self, + subtasks: List[Subtask], + priorities: List[Tuple], + exclude_bands: Set[Tuple] = None, + random_when_unavailable: bool = True, + ): + if { + ("addr0", "numa-0"), + ("addr1", "numa-0"), + ("addr2", "numa-0"), + } - exclude_bands == set(): + self._exceptions.append(NoAvailableBand()) + raise self._exceptions[-1] + self._subtasks.extend(subtasks) + + async def get_subtasks(self): + return self._subtasks + + async def get_exceptions(self): + return self._exceptions + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + session_id = "test_session" + cluster_api = await MockClusterAPI.create(pool.external_address) + slots_ref = await mo.create_actor( + GlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + queue_ref = await mo.create_actor( + MockSubtaskQueueingActor, + address=pool.external_address, + ) + try: + yield pool, cluster_api, session_id, slots_ref, queue_ref + finally: + await mo.destroy_actor(queue_ref) + await MockClusterAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_speculation(actor_pool): + pool, cluster_api, session_id, slots_ref, queue_ref = actor_pool + speculation_conf = { + "enabled": True, + "interval": 1000, + "threshold": 0.2, + "min_task_runtime": 0.01, + "multiplier": 1.5, + "max_concurrent_run": 2, + } + speculative_scheduler = SpeculativeScheduler(queue_ref, slots_ref, speculation_conf) + await speculative_scheduler.start() + await speculative_scheduler._speculative_execution() + total_subtasks = 5 + subtasks = [ + Subtask(str(i), retryable=False, logic_key=f"logic_key1", logic_parallelism=5) + for i in range(total_subtasks) + ] + subtask_infos = [ + SubtaskScheduleInfo(subtask, max_reschedules=3) for subtask in subtasks + ] + # add unfinished subtasks + for subtask_info in subtask_infos: + speculative_scheduler.add_subtask(subtask_info) + await speculative_scheduler._speculative_execution() + assert len(speculative_scheduler._grouped_finished_subtasks.values()) == 0 + # finished some subtasks + for subtask_info in subtask_infos[:-1]: + 
speculative_scheduler.finish_subtask(subtask_info) + assert ( + len(next(iter(speculative_scheduler._grouped_finished_subtasks.values()))) + == total_subtasks - 1 + ) + assert ( + len(next(iter(speculative_scheduler._grouped_unfinished_subtasks.values()))) + == 1 + ) + await speculative_scheduler._speculative_execution() + subtask_infos[-1].subtask.retryable = True + # pretend subtask has been running on a band. + subtask_infos[-1].band_futures[("addr0", "numa-0")] = asyncio.ensure_future( + asyncio.sleep(1) + ) + await speculative_scheduler._speculative_execution() + submitted = await queue_ref.get_subtasks() + # assert stale subtasks resubmitted + assert subtask_infos[-1].subtask in submitted + await speculative_scheduler._speculative_execution() + # if resubmitted subtasks not running, don't resubmitted again. + assert 1 == len(await queue_ref.get_subtasks()) + # pretend subtask has been running on a band. + subtask_infos[-1].band_futures[("addr1", "numa-0")] = asyncio.ensure_future( + asyncio.sleep(1) + ) + await speculative_scheduler._speculative_execution() + # stale subtasks resubmitted again + assert 2 == len(await queue_ref.get_subtasks()) + # pretend subtask has been running on another band. + subtask_infos[-1].band_futures[("addr2", "numa-0")] = asyncio.ensure_future( + asyncio.sleep(1) + ) + # speculative run reached max limit `max_concurrent_run`, i.e. 2 + await speculative_scheduler._speculative_execution() + # assert raise queue_ref raise NoAvailableBand + speculative_scheduler._subtask_speculation_max_concurrent_run += 1 + await speculative_scheduler._speculative_execution() + assert isinstance((await queue_ref.get_exceptions())[0], NoAvailableBand) + # finish subtasks + speculative_scheduler.finish_subtask(subtask_infos[-1]) + assert len(speculative_scheduler._grouped_unfinished_subtasks) == 0 + await speculative_scheduler.stop() diff --git a/python/xorbits/_mars/services/scheduling/tests/__init__.py b/python/xorbits/_mars/services/scheduling/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/scheduling/tests/test_service.py b/python/xorbits/_mars/services/scheduling/tests/test_service.py new file mode 100644 index 000000000..4da089e25 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/tests/test_service.py @@ -0,0 +1,332 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import time +from collections import defaultdict + +import numpy as np +import pytest + +from .... import oscar as mo +from .... import remote as mr +from .... import tensor as mt +from ....core.graph import ChunkGraphBuilder, TileableGraph, TileableGraphBuilder +from ....resource import Resource +from ... import NodeRole, start_services, stop_services +from ...session import SessionAPI +from ...storage import MockStorageAPI, StorageAPI +from ...subtask import Subtask, SubtaskResult, SubtaskStatus +from ...task import new_task_id +from ...task.supervisor.manager import TaskManagerActor +from ...web import WebActor +from .. import SchedulingAPI +from ..api.web import WebSchedulingAPI +from ..supervisor import GlobalResourceManagerActor + + +class FakeTaskManager(TaskManagerActor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._events = defaultdict(list) + self._results = dict() + + def set_subtask_result(self, subtask_result: SubtaskResult): + self._results[subtask_result.subtask_id] = subtask_result + for event in self._events[subtask_result.subtask_id]: + event.set() + self._events.pop(subtask_result.subtask_id, None) + + def _return_result(self, subtask_id: str): + result = self._results[subtask_id] + if result.status == SubtaskStatus.cancelled: + raise asyncio.CancelledError + elif result.status == SubtaskStatus.errored: + raise result.error.with_traceback(result.traceback) + return result + + async def wait_subtask_result(self, subtask_id: str): + if subtask_id in self._results: + return self._return_result(subtask_id) + + event = asyncio.Event() + self._events[subtask_id].append(event) + + async def waiter(): + await event.wait() + return self._return_result(subtask_id) + + return waiter() + + +def _gen_subtask(t, session_id): + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + + chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + subtask = Subtask(new_task_id(), session_id, new_task_id(), chunk_graph) + subtask.required_resource = Resource(num_cpus=1) + + return subtask + + +def _approx_resource(actual, expect): + return ( + pytest.approx(actual.num_cpus) == expect.num_cpus + and pytest.approx(actual.num_gpus) == expect.num_cpus + and pytest.approx(actual.mem_bytes) == expect.mem_bytes + ) + + +@pytest.fixture +async def actor_pools(): + async def start_pool(is_worker: bool): + if is_worker: + kw = dict( + n_process=2, + labels=["main"] + ["numa-0"] * 2, + subprocess_start_method="spawn", + ) + else: + kw = dict(n_process=0, subprocess_start_method="spawn") + pool = await mo.create_actor_pool("127.0.0.1", **kw) + await pool.start() + return pool + + sv_pool, worker_pool = await asyncio.gather(start_pool(False), start_pool(True)) + + config = { + "services": [ + "cluster", + "session", + "meta", + "lifecycle", + "scheduling", + "subtask", + "task", + "mutable", + "web", + ], + "cluster": { + "backend": "fixed", + "lookup_address": sv_pool.external_address, + "resource": {"numa-0": Resource(num_cpus=2)}, + }, + "meta": {"store": "dict"}, + "scheduling": {}, + "subtask": {}, + } + await start_services(NodeRole.SUPERVISOR, config, address=sv_pool.external_address) + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + session_id = "test_session" + session_api = await SessionAPI.create(sv_pool.external_address) + await 
session_api.create_session(session_id) + ref = await mo.actor_ref( + FakeTaskManager.gen_uid(session_id), address=sv_pool.external_address + ) + await mo.destroy_actor(ref) + task_manager_ref = await mo.create_actor( + FakeTaskManager, + session_id, + uid=FakeTaskManager.gen_uid(session_id), + address=sv_pool.external_address, + ) + await MockStorageAPI.create(session_id, worker_pool.external_address) + + try: + yield sv_pool, worker_pool, session_id, task_manager_ref + finally: + await session_api.delete_session(session_id) + await MockStorageAPI.cleanup(worker_pool.external_address) + await stop_services( + NodeRole.WORKER, config, address=worker_pool.external_address + ) + await stop_services( + NodeRole.SUPERVISOR, config, address=sv_pool.external_address + ) + + await asyncio.gather(sv_pool.stop(), worker_pool.stop()) + + +async def _get_subtask_summaries_by_web(sv_pool_address, session_id, task_id=None): + web_actor = await mo.actor_ref(WebActor.default_uid(), address=sv_pool_address) + web_address = await web_actor.get_web_address() + web_scheduling_api = WebSchedulingAPI(session_id, web_address) + return await web_scheduling_api.get_subtask_schedule_summaries(task_id) + + +@pytest.mark.asyncio +async def test_schedule_success(actor_pools): + sv_pool, worker_pool, session_id, task_manager_ref = actor_pools + global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=sv_pool.external_address + ) + + scheduling_api = await SchedulingAPI.create(session_id, sv_pool.external_address) + storage_api = await StorageAPI.create(session_id, worker_pool.external_address) + + a = mt.ones((10, 10), chunk_size=10) + b = a + 1 + + subtask = _gen_subtask(b, session_id) + subtask.expect_bands = [(worker_pool.external_address, "numa-0")] + await scheduling_api.add_subtasks([subtask], [(0,)]) + await task_manager_ref.wait_subtask_result(subtask.subtask_id) + await scheduling_api.finish_subtasks([subtask.subtask_id]) + + result_key = next(subtask.chunk_graph.iter_indep(reverse=True)).key + result = await storage_api.get(result_key) + np.testing.assert_array_equal(np.ones((10, 10)) + 1, result) + + assert _approx_resource( + (await global_resource_ref.get_used_resources()).get( + (worker_pool.external_address, "numa-0"), Resource() + ), + Resource(), + ) + + [summary] = await _get_subtask_summaries_by_web( + sv_pool.external_address, session_id, subtask.task_id + ) + assert summary.is_finished + assert subtask.expect_bands[0] in summary.bands + + +@pytest.mark.asyncio +async def test_schedule_queue(actor_pools): + sv_pool, worker_pool, session_id, task_manager_ref = actor_pools + global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=sv_pool.external_address + ) + scheduling_api = await SchedulingAPI.create(session_id, sv_pool.external_address) + + finish_ids, finish_time = [], [] + + def _remote_fun(secs): + time.sleep(secs) + return secs + + async def _waiter_fun(subtask_id): + await task_manager_ref.wait_subtask_result(subtask_id) + await scheduling_api.finish_subtasks([subtask_id]) + finish_ids.append(subtask_id) + finish_time.append(time.time()) + + subtasks = [] + wait_tasks = [] + for task_id in range(6): + a = mr.spawn(_remote_fun, args=(0.5 + 0.01 * task_id,)) + subtask = _gen_subtask(a, session_id) + subtask.subtask_id = f"test_schedule_queue_subtask_{task_id}" + subtask.expect_bands = [(worker_pool.external_address, "numa-0")] + subtask.priority = (4 - task_id,) + 
wait_tasks.append(asyncio.create_task(_waiter_fun(subtask.subtask_id))) + subtasks.append(subtask) + + await scheduling_api.add_subtasks(subtasks) + await scheduling_api.update_subtask_priority(subtasks[-1].subtask_id, (6,)) + await asyncio.gather(*wait_tasks) + + assert _approx_resource( + (await global_resource_ref.get_used_resources()).get( + (worker_pool.external_address, "numa-0"), Resource() + ), + Resource(), + ) + + +@pytest.mark.asyncio +async def test_schedule_error(actor_pools): + sv_pool, worker_pool, session_id, task_manager_ref = actor_pools + global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=sv_pool.external_address + ) + scheduling_api = await SchedulingAPI.create(session_id, sv_pool.external_address) + + exc_types = [ValueError, asyncio.CancelledError, GeneratorExit] + for exc_type in exc_types: + + def _remote_fun(): + raise exc_type + + a = mr.spawn(_remote_fun) + subtask = _gen_subtask(a, session_id) + subtask.expect_bands = [(worker_pool.external_address, "numa-0")] + + await scheduling_api.add_subtasks([subtask]) + with pytest.raises(exc_type): + await task_manager_ref.wait_subtask_result(subtask.subtask_id) + + assert _approx_resource( + (await global_resource_ref.get_used_resources()).get( + (worker_pool.external_address, "numa-0"), Resource() + ), + Resource(), + ) + + +@pytest.mark.asyncio +async def test_schedule_cancel(actor_pools): + sv_pool, worker_pool, session_id, task_manager_ref = actor_pools + global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=sv_pool.external_address + ) + scheduling_api = await SchedulingAPI.create(session_id, sv_pool.external_address) + + def _remote_fun(secs): + time.sleep(secs) + return secs + + async def _waiter_fun(subtask_id): + await task_manager_ref.wait_subtask_result(subtask_id) + await scheduling_api.finish_subtasks([subtask_id]) + + subtasks = [] + wait_tasks = [] + for task_id in range(6): + a = mr.spawn(_remote_fun, args=(1 - 0.01 * task_id,)) + subtask = _gen_subtask(a, session_id) + subtask.subtask_id = f"test_schedule_queue_subtask_{task_id}" + subtask.expect_bands = [(worker_pool.external_address, "numa-0")] + subtask.priority = (4 - task_id,) + wait_tasks.append(asyncio.create_task(_waiter_fun(subtask.subtask_id))) + subtasks.append(subtask) + + await scheduling_api.add_subtasks(subtasks) + await asyncio.gather(*wait_tasks[:2]) + + await scheduling_api.cancel_subtasks( + [subtask.subtask_id for subtask in subtasks], kill_timeout=0.1 + ) + + for wait_task in wait_tasks[2:]: + with pytest.raises(asyncio.CancelledError): + await wait_task + + summaries = await _get_subtask_summaries_by_web( + sv_pool.external_address, session_id + ) + assert all( + summary.is_finished and summary.is_cancelled for summary in summaries[2:] + ) + # `cancel_subtask` will invoke `task_api.set_subtask_result` which is async, wait 1 second so that slot can be + # released. + await asyncio.sleep(1) + assert _approx_resource( + (await global_resource_ref.get_used_resources()).get( + (worker_pool.external_address, "numa-0"), Resource() + ), + Resource(), + ) diff --git a/python/xorbits/_mars/services/scheduling/utils.py b/python/xorbits/_mars/services/scheduling/utils.py new file mode 100644 index 000000000..271dd9ec3 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/utils.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. 
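The FakeTaskManager above resolves `wait_subtask_result` with one asyncio.Event per pending subtask id: `set_subtask_result` stores the result and wakes every registered waiter. A minimal, self-contained sketch of that register-and-wait pattern (class and method names here are illustrative, not part of the service API):

import asyncio
from collections import defaultdict


class ResultWaiter:
    """Toy version of the event-per-key wait pattern used by FakeTaskManager."""

    def __init__(self):
        self._events = defaultdict(list)
        self._results = {}

    def set_result(self, key, value):
        # store the result first, then wake every waiter registered for the key
        self._results[key] = value
        for event in self._events.pop(key, []):
            event.set()

    async def wait(self, key):
        if key in self._results:
            return self._results[key]
        event = asyncio.Event()
        self._events[key].append(event)
        await event.wait()
        return self._results[key]


async def _demo():
    waiter = ResultWaiter()
    pending = asyncio.create_task(waiter.wait("subtask-0"))
    waiter.set_result("subtask-0", 42)
    assert await pending == 42


asyncio.run(_demo())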
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import contextlib +import sys + +from ... import oscar as mo +from ...lib.aio import alru_cache +from ..subtask import SubtaskResult, SubtaskStatus +from ..task import TaskAPI + + +@alru_cache +async def _get_task_api(actor: mo.Actor): + return await TaskAPI.create(getattr(actor, "_session_id"), actor.address) + + +@contextlib.asynccontextmanager +async def redirect_subtask_errors(actor: mo.Actor, subtasks): + try: + yield + except: # noqa: E722 # pylint: disable=bare-except + _, error, traceback = sys.exc_info() + status = ( + SubtaskStatus.cancelled + if isinstance(error, asyncio.CancelledError) + else SubtaskStatus.errored + ) + task_api = await _get_task_api(actor) + coros = [] + for subtask in subtasks: + if subtask is None: # pragma: no cover + continue + coros.append( + task_api.set_subtask_result( + SubtaskResult( + subtask_id=subtask.subtask_id, + session_id=subtask.session_id, + task_id=subtask.task_id, + stage_id=subtask.stage_id, + progress=1.0, + status=status, + error=error, + traceback=traceback, + ) + ) + ) + tasks = [asyncio.ensure_future(coro) for coro in coros] + await asyncio.wait(tasks) + raise diff --git a/python/xorbits/_mars/services/scheduling/worker/__init__.py b/python/xorbits/_mars/services/scheduling/worker/__init__.py new file mode 100644 index 000000000..42c23612a --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .execution import SubtaskExecutionActor +from .quota import MemQuotaActor, QuotaActor, WorkerQuotaManagerActor +from .service import SchedulingWorkerService +from .workerslot import ( + BandSlotControlActor, + BandSlotManagerActor, + WorkerSlotManagerActor, +) diff --git a/python/xorbits/_mars/services/scheduling/worker/execution.py b/python/xorbits/_mars/services/scheduling/worker/execution.py new file mode 100644 index 000000000..09791d18b --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/execution.py @@ -0,0 +1,552 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
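`redirect_subtask_errors` above turns whatever escapes the wrapped block into a SubtaskResult with `cancelled` or `errored` status before re-raising. A stripped-down sketch of the same async-context-manager idiom, assuming only a generic `report` callback in place of the real task API:

import asyncio
import contextlib
import sys


@contextlib.asynccontextmanager
async def report_then_reraise(report):
    # run the wrapped block; on failure, report a status and re-raise unchanged
    try:
        yield
    except:  # noqa: E722
        _, error, _tb = sys.exc_info()
        status = (
            "cancelled" if isinstance(error, asyncio.CancelledError) else "errored"
        )
        await report(status, error)
        raise


async def _demo():
    async def report(status, error):
        print(f"reported {status}: {error!r}")

    try:
        async with report_then_reraise(report):
            raise ValueError("boom")
    except ValueError:
        pass


asyncio.run(_demo())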
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import functools +import logging +import operator +import pprint +import sys +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict, Optional + +from .... import oscar as mo +from ....core import ExecutionError +from ....core.graph import DAG +from ....core.operand import Fetch, FetchShuffle +from ....lib.aio import alru_cache +from ....metrics import Metrics +from ....oscar.errors import MarsError +from ....storage import StorageLevel +from ....utils import dataslots, get_chunk_key_to_data_keys, wrap_exception +from ...cluster import ClusterAPI +from ...meta import MetaAPI +from ...storage import StorageAPI +from ...subtask import Subtask, SubtaskAPI, SubtaskResult, SubtaskStatus +from .quota import QuotaActor +from .workerslot import BandSlotManagerActor + +logger = logging.getLogger(__name__) + +# the default times to run subtask. +DEFAULT_SUBTASK_MAX_RETRIES = 0 + + +@dataslots +@dataclass +class SubtaskExecutionInfo: + aio_task: asyncio.Task + band_name: str + supervisor_address: str + result: SubtaskResult = field(default_factory=SubtaskResult) + cancelling: bool = False + max_retries: int = 0 + num_retries: int = 0 + slot_id: Optional[int] = None + kill_timeout: Optional[int] = None + + +async def _retry_run( + subtask: Subtask, subtask_info: SubtaskExecutionInfo, target_async_func, *args +): + assert subtask_info.num_retries >= 0 + assert subtask_info.max_retries >= 0 + + while True: + try: + return await target_async_func(*args) + except (OSError, MarsError) as ex: + if subtask_info.num_retries < subtask_info.max_retries: + logger.error( + "Rerun[%s/%s] the %s of subtask %s due to %s.", + subtask_info.num_retries, + subtask_info.max_retries, + target_async_func, + subtask.subtask_id, + ex, + ) + subtask_info.num_retries += 1 + continue + if subtask_info.max_retries > 0: + message = ( + f"Exceed max rerun[{subtask_info.num_retries}/{subtask_info.max_retries}]:" + f" {target_async_func} of subtask {subtask.subtask_id} due to {ex}." + ) + logger.error(message) + + raise wrap_exception(ex, wrap_name="_ExceedMaxRerun", message=message) + else: + raise ex + except asyncio.CancelledError: + raise + except Exception as ex: + if subtask_info.max_retries > 0: + message = ( + f"Failed to rerun the {target_async_func} of subtask {subtask.subtask_id}, " + f"num_retries: {subtask_info.num_retries}, max_retries: {subtask_info.max_retries} " + f"due to unhandled exception: {ex}." 
+ ) + logger.error(message) + + raise wrap_exception( + ex, wrap_name="_UnhandledException", message=message + ) + else: + raise ex + + +def _fill_subtask_result_with_exception( + subtask: Subtask, subtask_info: SubtaskExecutionInfo +): + _, exc, tb = sys.exc_info() + if isinstance(exc, ExecutionError): + exc = exc.nested_error + tb = exc.__traceback__ + + exc_info = (type(exc), exc, tb) + if isinstance(exc, asyncio.CancelledError): + status = SubtaskStatus.cancelled + logger.exception( + "Cancel run subtask %s on band %s", + subtask.subtask_id, + subtask_info.band_name, + exc_info=exc_info, + ) + else: + status = SubtaskStatus.errored + logger.exception( + "Failed to run subtask %s on band %s", + subtask.subtask_id, + subtask_info.band_name, + exc_info=exc_info, + ) + subtask_info.result.status = status + subtask_info.result.progress = 1.0 + subtask_info.result.error = exc + subtask_info.result.traceback = tb + + +class SubtaskExecutionActor(mo.StatelessActor): + _subtask_info: Dict[str, SubtaskExecutionInfo] + + def __init__( + self, + subtask_max_retries: int = DEFAULT_SUBTASK_MAX_RETRIES, + enable_kill_slot: bool = True, + data_prepare_timeout: int = 600, + ): + self._cluster_api = None + self._global_resource_ref = None + self._subtask_max_retries = subtask_max_retries + self._enable_kill_slot = enable_kill_slot + self._data_prepare_timeout = data_prepare_timeout + + self._subtask_info = dict() + self._submitted_subtask_count = Metrics.counter( + "mars.band.submitted_subtask_count", + "The count of submitted subtasks to the current band.", + ("band",), + ) + self._finished_subtask_count = Metrics.counter( + "mars.band.finished_subtask_count", + "The count of finished subtasks of the current band.", + ("band",), + ) + + async def __post_create__(self): + self._cluster_api = await ClusterAPI.create(self.address) + + @alru_cache(cache_exceptions=False) + async def _get_slot_manager_ref( + self, band: str + ) -> mo.ActorRefType[BandSlotManagerActor]: + return await mo.actor_ref( + BandSlotManagerActor.gen_uid(band), address=self.address + ) + + @alru_cache(cache_exceptions=False) + async def _get_band_quota_ref(self, band: str) -> mo.ActorRefType[QuotaActor]: + return await mo.actor_ref(QuotaActor.gen_uid(band), address=self.address) + + async def _prepare_input_data(self, subtask: Subtask, band_name: str): + queries = [] + shuffle_queries = [] + storage_api = await StorageAPI.create( + subtask.session_id, address=self.address, band_name=band_name + ) + chunk_key_to_data_keys = get_chunk_key_to_data_keys(subtask.chunk_graph) + for chunk in subtask.chunk_graph: + if chunk.key in subtask.pure_depend_keys: + continue + if chunk.op.gpu: # pragma: no cover + to_fetch_band = band_name + else: + to_fetch_band = "numa-0" + if isinstance(chunk.op, Fetch): + queries.append( + storage_api.fetch.delay(chunk.key, band_name=to_fetch_band) + ) + elif isinstance(chunk.op, FetchShuffle): + for key in chunk_key_to_data_keys[chunk.key]: + shuffle_queries.append( + storage_api.fetch.delay( + key, band_name=to_fetch_band, error="ignore" + ) + ) + if queries: + await storage_api.fetch.batch(*queries) + if shuffle_queries: + # TODO(hks): The batch method doesn't accept different error arguments, + # combine them when it can. 
+ + await storage_api.fetch.batch(*shuffle_queries) + + async def _collect_input_sizes( + self, subtask: Subtask, supervisor_address: str, band_name: str + ): + graph = subtask.chunk_graph + sizes = dict() + + fetch_keys = list( + set( + n.key + for n in graph.iter_indep() + if isinstance(n.op, Fetch) and n.key not in subtask.pure_depend_keys + ) + ) + if not fetch_keys: + return sizes + + storage_api = await StorageAPI.create( + subtask.session_id, address=self.address, band_name=band_name + ) + meta_api = await MetaAPI.create(subtask.session_id, address=supervisor_address) + + fetch_metas = await meta_api.get_chunk_meta.batch( + *( + meta_api.get_chunk_meta.delay(k, fields=["memory_size", "store_size"]) + for k in fetch_keys + ) + ) + data_infos = await storage_api.get_infos.batch( + *(storage_api.get_infos.delay(k) for k in fetch_keys) + ) + + # compute memory quota size. when data located in shared memory, the cost + # should be differences between deserialized memory cost and serialized cost, + # otherwise we should take deserialized memory cost + for key, meta, infos in zip(fetch_keys, fetch_metas, data_infos): + level = functools.reduce(operator.or_, (info.level for info in infos)) + if level & StorageLevel.MEMORY: + mem_cost = max(0, meta["memory_size"] - meta["store_size"]) + else: + mem_cost = meta["memory_size"] + sizes[key] = (meta["store_size"], mem_cost) + + return sizes + + @classmethod + def _estimate_sizes(cls, subtask: Subtask, input_sizes: Dict): + size_context = dict(input_sizes.items()) + graph = subtask.chunk_graph + + key_to_ops = defaultdict(set) + chunk_key_to_sizes = defaultdict(lambda: 0) + for n in graph: + key_to_ops[n.op.key].add(n.op) + chunk_key_to_sizes[n.key] += 1 + key_to_ops = {k: list(v) for k, v in key_to_ops.items()} + + # condense op key graph + op_key_graph = DAG() + for n in graph.topological_iter(): + if n.key in subtask.pure_depend_keys: + continue + if n.op.key not in op_key_graph: + op_key_graph.add_node(n.op.key) + for succ in graph.iter_successors(n): + if succ.op.key not in op_key_graph: + op_key_graph.add_node(succ.op.key) + op_key_graph.add_edge(n.op.key, succ.op.key) + + key_stack = list(op_key_graph.iter_indep()) + pred_ref_count = {k: op_key_graph.count_predecessors(k) for k in op_key_graph} + succ_ref_count = {k: op_key_graph.count_successors(k) for k in op_key_graph} + + visited_op_keys = set() + total_memory_cost = 0 + max_memory_cost = sum(calc_size for _, calc_size in size_context.values()) + while key_stack: + key = key_stack.pop() + op = key_to_ops[key][0] + + if not isinstance(op, Fetch): + op.estimate_size(size_context, op) + + calc_cost = sum(size_context[out.key][1] for out in op.outputs) + total_memory_cost += calc_cost + max_memory_cost = max(total_memory_cost, max_memory_cost) + + if not isinstance(op, Fetch): + # when calculation result is stored, memory cost of calculation + # can be replaced with result memory cost + result_cost = sum(size_context[out.key][0] for out in op.outputs) + total_memory_cost += result_cost - calc_cost + + visited_op_keys.add(key) + + for succ_op_key in op_key_graph.iter_successors(key): + pred_ref_count[succ_op_key] -= 1 + if pred_ref_count[succ_op_key] == 0: + key_stack.append(succ_op_key) + + for pred_op_key in op_key_graph.iter_predecessors(key): + succ_ref_count[pred_op_key] -= 1 + if succ_ref_count[pred_op_key] == 0: + pred_op = key_to_ops[pred_op_key][0] + outs = key_to_ops[pred_op_key][0].outputs + for out in outs: + chunk_key_to_sizes[out.key] -= 1 + # when clearing fetches, subtract 
memory size, otherwise subtract store size + account_idx = 1 if isinstance(pred_op, Fetch) else 0 + pop_result_cost = 0 + for out in outs: + # corner case exist when a fetch op and another op has same chunk key + # but their op keys are different + if chunk_key_to_sizes[out.key] == 0: + pop_result_cost += size_context.pop(out.key, (0, 0))[ + account_idx + ] + else: + pop_result_cost += size_context.get(out.key, (0, 0))[ + account_idx + ] + total_memory_cost -= pop_result_cost + return sum(t[0] for t in size_context.values()), max_memory_cost + + @classmethod + def _check_cancelling(cls, subtask_info: SubtaskExecutionInfo): + if subtask_info.cancelling: + raise asyncio.CancelledError + + async def internal_run_subtask(self, subtask: Subtask, band_name: str): + subtask_api = SubtaskAPI(self.address) + subtask_info = self._subtask_info[subtask.subtask_id] + subtask_info.result = SubtaskResult( + subtask_id=subtask.subtask_id, + session_id=subtask.session_id, + task_id=subtask.task_id, + stage_id=subtask.stage_id, + status=SubtaskStatus.pending, + ) + try: + logger.debug("Preparing data for subtask %s", subtask.subtask_id) + prepare_data_task = asyncio.create_task( + _retry_run( + subtask, subtask_info, self._prepare_input_data, subtask, band_name + ) + ) + await asyncio.wait_for( + prepare_data_task, timeout=self._data_prepare_timeout + ) + + input_sizes = await self._collect_input_sizes( + subtask, subtask_info.supervisor_address, band_name + ) + _store_size, calc_size = await asyncio.to_thread( + self._estimate_sizes, subtask, input_sizes + ) + self._check_cancelling(subtask_info) + + batch_quota_req = {(subtask.session_id, subtask.subtask_id): calc_size} + logger.debug("Start actual running of subtask %s", subtask.subtask_id) + subtask_info.result = await self._retry_run_subtask( + subtask, band_name, subtask_api, batch_quota_req + ) + except: # noqa: E722 # pylint: disable=bare-except + _fill_subtask_result_with_exception(subtask, subtask_info) + finally: + # make sure new slot usages are uploaded in time + try: + slot_manager_ref = await self._get_slot_manager_ref(band_name) + await slot_manager_ref.upload_slot_usages(periodical=False) + except: # noqa: E722 # pylint: disable=bare-except + _fill_subtask_result_with_exception(subtask, subtask_info) + finally: + # pop the subtask info at the end is to cancel the job. 
+ self._subtask_info.pop(subtask.subtask_id, None) + return subtask_info.result + + async def _retry_run_subtask( + self, subtask: Subtask, band_name: str, subtask_api: SubtaskAPI, batch_quota_req + ): + quota_ref = await self._get_band_quota_ref(band_name) + slot_manager_ref = await self._get_slot_manager_ref(band_name) + subtask_info = self._subtask_info[subtask.subtask_id] + assert subtask_info.num_retries >= 0 + assert subtask_info.max_retries >= 0 + + async def _run_subtask_once(): + aiotask = None + slot_id = None + try: + await quota_ref.request_batch_quota(batch_quota_req) + self._check_cancelling(subtask_info) + + slot_id = await slot_manager_ref.acquire_free_slot( + (subtask.session_id, subtask.subtask_id) + ) + subtask_info.slot_id = slot_id + self._check_cancelling(subtask_info) + + subtask_info.result.status = SubtaskStatus.running + aiotask = asyncio.create_task( + subtask_api.run_subtask_in_slot(band_name, slot_id, subtask) + ) + return await asyncio.shield(aiotask) + except asyncio.CancelledError as ex: + try: + if aiotask is not None: + logger.info( + "Start to cancel subtask %s in slot %s on band %s.", + subtask.subtask_id, + slot_id, + band_name, + ) + await asyncio.wait_for( + asyncio.shield( + subtask_api.cancel_subtask_in_slot(band_name, slot_id) + ), + subtask_info.kill_timeout, + ) + except asyncio.TimeoutError: + logger.info( + "Wait for subtask to cancel timed out (%s). " + "Start killing slot %d", + subtask_info.kill_timeout, + slot_id, + ) + await slot_manager_ref.kill_slot(slot_id) + sub_pool_address = await slot_manager_ref.get_slot_address(slot_id) + await mo.wait_actor_pool_recovered(sub_pool_address, self.address) + except: # pragma: no cover + logger.exception("Unexpected errors raised when handling cancel") + raise + finally: + raise ex + except (OSError, MarsError) as ex: + if slot_id is not None: + # may encounter subprocess memory error + sub_pool_address = await slot_manager_ref.get_slot_address(slot_id) + await mo.wait_actor_pool_recovered(sub_pool_address, self.address) + raise ex + finally: + # make sure allocated slots are traced + if slot_id is None: # pragma: no cover + slot_id = await slot_manager_ref.get_subtask_slot( + (subtask.session_id, subtask.subtask_id) + ) + logger.debug( + "Subtask %s running ended, slot_id=%r", subtask.subtask_id, slot_id + ) + if slot_id is not None: + await slot_manager_ref.release_free_slot( + slot_id, (subtask.session_id, subtask.subtask_id) + ) + logger.debug( + "Released slot %d for subtask %s", slot_id, subtask.subtask_id + ) + await quota_ref.release_quotas(tuple(batch_quota_req.keys())) + + # TODO(fyrestone): For the retryable op, we should rerun it when + # any exceptions occurred. + if subtask.retryable: + return await _retry_run(subtask, subtask_info, _run_subtask_once) + else: + try: + return await _run_subtask_once() + except Exception as e: + unretryable_op = [ + chunk.op + for chunk in subtask.chunk_graph + if not getattr(chunk.op, "retryable", True) + ] + message = ( + f"Run subtask failed due to {e}, the subtask {subtask.subtask_id} is " + f"not retryable, it contains unretryable op: \n" + f"{pprint.pformat(unretryable_op)}" + ) + logger.error(message) + + raise wrap_exception( + e, wrap_name="_UnretryableException", message=message + ) + + async def run_subtask( + self, subtask: Subtask, band_name: str, supervisor_address: str + ): + if subtask.subtask_id in self._subtask_info: # pragma: no cover + raise Exception( + f"Subtask {subtask.subtask_id} is already running on this band[{self.address}]." 
+ ) + logger.debug( + "Start to schedule subtask %s on %s.", subtask.subtask_id, self.address + ) + self._submitted_subtask_count.record(1, {"band": self.address}) + with mo.debug.no_message_trace(): + task = asyncio.create_task( + self.ref().internal_run_subtask(subtask, band_name) + ) + + logger.debug("Subtask %r accepted in worker %s", subtask, self.address) + # the extra_config may be None. the extra config overwrites the default value. + subtask_max_retries = ( + subtask.extra_config.get("subtask_max_retries") + if subtask.extra_config + else None + ) + if subtask_max_retries is None: + subtask_max_retries = self._subtask_max_retries + + self._subtask_info[subtask.subtask_id] = SubtaskExecutionInfo( + task, band_name, supervisor_address, max_retries=subtask_max_retries + ) + result = await task + self._subtask_info.pop(subtask.subtask_id, None) + self._finished_subtask_count.record(1, {"band": self.address}) + logger.debug("Subtask %s finished with result %s", subtask.subtask_id, result) + return result + + async def cancel_subtask(self, subtask_id: str, kill_timeout: Optional[int] = 5): + try: + subtask_info = self._subtask_info[subtask_id] + except KeyError: + logger.info("Subtask %s not exists, skip cancel.", subtask_id) + return + logger.info( + "Start to cancel subtask %s in slot %s, kill_timeout is %s", + subtask_id, + subtask_info.slot_id, + kill_timeout, + ) + + kill_timeout = kill_timeout if self._enable_kill_slot else None + if not subtask_info.cancelling: + subtask_info.kill_timeout = kill_timeout + subtask_info.cancelling = True + subtask_info.aio_task.cancel() + + await subtask_info.aio_task + self._subtask_info.pop(subtask_id, None) diff --git a/python/xorbits/_mars/services/scheduling/worker/quota.py b/python/xorbits/_mars/services/scheduling/worker/quota.py new file mode 100644 index 000000000..f5e048071 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/quota.py @@ -0,0 +1,428 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import itertools +import logging +import time +from collections import OrderedDict, namedtuple +from dataclasses import dataclass +from typing import Dict, Optional, Tuple, Union + +from .... import oscar as mo +from .... 
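Cancellation in the execution actor above relies on `asyncio.shield`: cancelling the actor-side task raises CancelledError in the wrapper without killing the running subtask, which is first asked to stop gracefully and only escalated after `kill_timeout`. A self-contained sketch of that shield-plus-timeout idiom (function names are illustrative; the real actor escalates by killing the worker slot):

import asyncio


async def run_shielded(work_coro, graceful_stop, kill_timeout=1.0):
    # Shield the real work: cancelling this wrapper raises CancelledError here
    # without cancelling `work` itself, so a graceful stop can be attempted first.
    work = asyncio.ensure_future(work_coro)
    try:
        return await asyncio.shield(work)
    except asyncio.CancelledError:
        try:
            await asyncio.wait_for(graceful_stop(), kill_timeout)
        except asyncio.TimeoutError:
            # graceful stop took too long; escalate by cancelling the work outright
            work.cancel()
        raise


async def _demo():
    async def work():
        await asyncio.sleep(10)

    async def graceful_stop():
        await asyncio.sleep(0.05)

    runner = asyncio.create_task(run_shielded(work(), graceful_stop))
    await asyncio.sleep(0.1)
    runner.cancel()
    try:
        await runner
    except asyncio.CancelledError:
        print("wrapper cancelled; graceful stop attempted")


asyncio.run(_demo())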
import resource as mars_resource +from ....typing import BandType +from ...cluster import QuotaInfo + +logger = logging.getLogger(__name__) + +QuotaDumpType = namedtuple("QuotaDumpType", "allocations requests hold_sizes") + + +@dataclass +class QuotaRequest: + req_size: Tuple + delta: int + req_time: float + event: asyncio.Event + + +class QuotaActor(mo.Actor): + @classmethod + def gen_uid(cls, band_name: str): + return f"{band_name}_quota" + + def __init__(self, band: BandType, quota_size: int, **kw): + super().__init__() + self._requests = OrderedDict() + + self._cluster_api = None + + self._band = band + self._band_name = band[1] + + self._quota_size = quota_size + self._allocations = dict() + self._total_allocated = 0 + + self._hold_sizes = dict() + self._total_hold = 0 + + if kw: # pragma: no cover + logger.warning("Keywords for QuotaActor %r not used", list(kw.keys())) + + async def __post_create__(self): + from ...cluster.api import ClusterAPI + + try: + self._cluster_api = await ClusterAPI.create(self.address) + self._report_quota_info() + except mo.ActorNotExist: + pass + + async def _has_space(self, delta: int): + return self._total_allocated + delta <= self._quota_size + + def _log_allocate(self, msg: str, *args, **kwargs): + args += (self._total_allocated, self._quota_size) + logger.debug(msg + " Allocated: %s, Total size: %s", *args, **kwargs) + + def _report_quota_info(self): + if self._cluster_api is not None: + quota_info = QuotaInfo( + quota_size=self._quota_size, + allocated_size=self._total_allocated, + hold_size=self._total_hold, + ) + asyncio.create_task( + self._cluster_api.set_band_quota_info(self._band_name, quota_info) + ) + + async def request_batch_quota(self, batch: Dict): + """ + Request for resources in a batch + :param batch: the request dict in form {request_key: request_size, ...} + :return: if request is returned immediately, return True, otherwise False + """ + all_allocated = True + # check if the request is already allocated + for key, size in batch.items(): + if key not in self._allocations or size > self._allocations.get(key): + all_allocated = False + break + + self._log_allocate("Receive batch quota request %r on %s.", batch, self.uid) + sorted_req = sorted(batch.items(), key=lambda tp: tp[0]) + keys = tuple(tp[0] for tp in sorted_req) + quota_sizes = tuple(tp[1] for tp in sorted_req) + delta = sum(v - self._allocations.get(k, 0) for k, v in batch.items()) + + # if all requested and allocation can still be applied, apply directly + if all_allocated and await self._has_space(delta): + self._log_allocate("Quota request %r already allocated.", batch) + return + + if delta > self._quota_size: + raise ValueError( + f"Cannot allocate quota size {delta} " + f"larger than total capacity {self._quota_size}." 
+ ) + + if keys in self._requests: + event = self._requests[keys].event + else: + has_space = await self._has_space(delta) + if has_space and not self._requests: + # if no previous requests, we can apply directly + self._log_allocate( + "Quota request met for key %r on %s.", keys, self.uid + ) + await self.alter_allocations(keys, quota_sizes, allocate=True) + return + else: + # current free space cannot satisfy the request, the request is queued + if not has_space: + self._log_allocate( + "Quota request unmet for key %r on %s.", keys, self.uid + ) + else: + self._log_allocate( + "Quota request queued for key %r on %s.", keys, self.uid + ) + event = asyncio.Event() + quota_request = QuotaRequest(quota_sizes, delta, time.time(), event) + if keys not in self._requests: + self._requests[keys] = quota_request + + async def waiter(): + try: + await event.wait() + except asyncio.CancelledError as ex: + await self.ref().remove_requests.tell(keys) + raise ex + + return waiter() + + async def remove_requests(self, keys: Tuple): + self._requests.pop(keys, None) + await self._process_requests() + + def hold_quotas(self, keys: Tuple): + """ + Mark request quota as already been hold + + Parameters + ---------- + keys : Tuple + request keys + """ + for key in keys: + try: + alloc_size = self._allocations[key] + except KeyError: + continue + self._total_hold += alloc_size - self._hold_sizes.get(key, 0) + self._hold_sizes[key] = alloc_size + + async def release_quotas(self, keys: Tuple): + """ + Release allocated quota in batch + + Parameters + ---------- + keys : Tuple + request keys + """ + total_alloc_size = 0 + + for key in keys: + try: + alloc_size = self._allocations.pop(key) + total_alloc_size += alloc_size + except KeyError: + continue + self._total_hold -= self._hold_sizes.pop(key, 0) + + self._total_allocated -= total_alloc_size + if total_alloc_size: + await self._process_requests() + + self._report_quota_info() + self._log_allocate("Quota keys %s released on %s.", keys, self.uid) + + def dump_data(self): + return QuotaDumpType(self._allocations, self._requests, self._hold_sizes) + + def get_allocated_size(self): + # get total allocated size, for debug purpose + return self._total_allocated + + async def alter_allocations( + self, + keys: Tuple, + quota_sizes: Tuple, + handle_shrink: bool = True, + allocate: bool = False, + ): + """ + Alter multiple requests + + Parameters + ---------- + keys : Tuple + keys to update + quota_sizes : Tuple + new quota sizes, if None, no changes will be made + handle_shrink : bool + if True and the quota size less than the original, process requests in the queue + allocate : bool + if True, will allocate resources for new items + """ + quota_sizes = quota_sizes or itertools.repeat(None) + total_old_size, total_diff = 0, 0 + for k, s in zip(keys, quota_sizes): + old_size = self._allocations.get(k, 0) + size_diff = 0 + + if not allocate and k not in self._allocations: + total_old_size += old_size + continue + + if s != old_size: + s = int(s) + size_diff = s - old_size + self._total_allocated += size_diff + self._allocations[k] = s + try: + self._total_hold += s - self._hold_sizes[k] + self._hold_sizes[k] = s + except KeyError: + pass + + total_old_size += old_size + total_diff += size_diff + if handle_shrink and total_diff < 0: + await self._process_requests() + + self._report_quota_info() + self._log_allocate( + "Quota keys %r applied on %s. 
Total old Size: %s, Total diff: %s,", + keys, + self.uid, + total_old_size, + total_diff, + ) + + async def _process_requests(self): + """ + Process quota requests in the queue + """ + removed = [] + for k, req in self._requests.items(): + if await self._has_space(req.delta): + await self.alter_allocations( + k, req.req_size, handle_shrink=False, allocate=True + ) + req.event.set() + removed.append(k) + else: + # Quota left cannot satisfy the next request, we quit + break + for k in removed: + self._requests.pop(k, None) + + +class MemQuotaActor(QuotaActor): + """ + Actor handling worker memory quota + """ + + def __init__( + self, + band: BandType, + quota_size: int, + hard_limit: int = None, + refresh_time: Union[int, float] = None, + enable_kill_slot: bool = True, + ): + super().__init__(band, quota_size) + self._hard_limit = hard_limit + self._last_memory_available = 0 + self._refresh_time = refresh_time or 1 + + self._enable_kill_slot = enable_kill_slot + + self._stat_refresh_task = None + self._slot_manager_ref = None + + async def __post_create__(self): + await super().__post_create__() + self._stat_refresh_task = self.ref().update_mem_stats.tell_delay( + delay=self._refresh_time + ) + + from .workerslot import BandSlotManagerActor + + try: + self._slot_manager_ref = await mo.actor_ref( + uid=BandSlotManagerActor.gen_uid(self._band[1]), address=self.address + ) + except mo.ActorNotExist: # pragma: no cover + pass + + async def __pre_destroy__(self): + self._stat_refresh_task.cancel() + + async def update_mem_stats(self): + """ + Refresh memory usage + """ + cur_mem_available = mars_resource.virtual_memory().available + if cur_mem_available > self._last_memory_available: + # memory usage reduced: try reallocate existing requests + await self._process_requests() + self._last_memory_available = cur_mem_available + self._report_quota_info() + self._stat_refresh_task = self.ref().update_mem_stats.tell_delay( + delay=self._refresh_time + ) + + async def _has_space(self, delta: int): + if self._hard_limit is None: + return await super()._has_space(delta) + + mem_stats = mars_resource.virtual_memory() + # calc available physical memory + available_size = ( + mem_stats.available + - max(0, mem_stats.total - self._hard_limit) + - (self._total_allocated - self._total_hold) + ) + if max(delta, 0) >= available_size: + logger.warning( + "%s met hard memory limitation: request %d, available %d, hard limit %d", + self.uid, + delta, + available_size, + self._hard_limit, + ) + + if self._enable_kill_slot and self._slot_manager_ref is not None: + logger.info("Restarting free slots to obtain more memory") + await self._slot_manager_ref.restart_free_slots() + return False + return await super()._has_space(delta) + + def _log_allocate(self, msg: str, *args, **kwargs): # pragma: no cover + if logger.getEffectiveLevel() > logging.DEBUG: + return + + if self._hard_limit is None: + return super()._log_allocate(msg, *args, **kwargs) + + mem_stats = mars_resource.virtual_memory() + # calc available physical memory + available_size = ( + mem_stats.available + - max(0, mem_stats.total - self._hard_limit) + - (self._total_allocated - self._total_hold) + ) + args += ( + self._total_allocated, + self._quota_size, + mem_stats.available, + available_size, + self._hard_limit, + self._total_hold, + ) + + logger.debug( + msg + + " Allocated: %s, Quota size: %s, Phy available: %s, Hard available: %s," + " Hard limit: %s, Holding: %s", + *args, + **kwargs, + ) + + +class WorkerQuotaManagerActor(mo.Actor): + def 
__init__(self, default_config: Dict, band_configs: Optional[Dict] = None): + self._cluster_api = None + self._default_config = default_config + self._band_configs = band_configs or dict() + + self._band_quota_refs = dict() # type: Dict[str, mo.ActorRef] + + async def __post_create__(self): + from ...cluster.api import ClusterAPI + + self._cluster_api = await ClusterAPI.create(self.address) + + band_to_resource = await self._cluster_api.get_bands() + for band in band_to_resource.keys(): + band_config = self._band_configs.get(band[1], self._default_config) + hard_limit = band_config.get("hard_limit") + actor_cls = MemQuotaActor if hard_limit else QuotaActor + self._band_quota_refs[band] = await mo.create_actor( + actor_cls, + band, + **band_config, + uid=MemQuotaActor.gen_uid(band[1]), + address=self.address, + ) + + async def __pre_destroy__(self): + await asyncio.gather( + *[mo.destroy_actor(ref) for ref in self._band_quota_refs.values()] + ) diff --git a/python/xorbits/_mars/services/scheduling/worker/service.py b/python/xorbits/_mars/services/scheduling/worker/service.py new file mode 100644 index 000000000..208deb96f --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/service.py @@ -0,0 +1,100 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ....utils import calc_size_by_str +from ...core import AbstractService +from .execution import DEFAULT_SUBTASK_MAX_RETRIES, SubtaskExecutionActor +from .quota import WorkerQuotaManagerActor +from .workerslot import WorkerSlotManagerActor + + +class SchedulingWorkerService(AbstractService): + """ + Scheduling service on worker. + + Service Configuration + --------------------- + { + "scheduling": { + "mem_quota_size": "80%", + "mem_hard_limit": "95%", + "enable_kill_slot": true, + "data_prepare_timeout": 600, + "subtask_max_retries": 1 + } + } + """ + + async def start(self): + from .... 
import resource as mars_resource + + scheduling_config = self._config.get("scheduling", {}) + address = self._address + + total_mem = mars_resource.virtual_memory().total + mem_quota_size = calc_size_by_str( + scheduling_config.get("mem_quota_size", "80%"), total_mem + ) + mem_hard_limit = calc_size_by_str( + scheduling_config.get("mem_hard_limit", "95%"), total_mem + ) + enable_kill_slot = scheduling_config.get("enable_kill_slot", True) + subtask_max_retries = scheduling_config.get( + "subtask_max_retries", DEFAULT_SUBTASK_MAX_RETRIES + ) + data_prepare_timeout = scheduling_config.get("data_prepare_timeout", 600) + + await mo.create_actor( + WorkerSlotManagerActor, + uid=WorkerSlotManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + WorkerQuotaManagerActor, + default_config=dict( + quota_size=mem_quota_size, + hard_limit=mem_hard_limit, + enable_kill_slot=enable_kill_slot, + ), + uid=WorkerQuotaManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + SubtaskExecutionActor, + subtask_max_retries=subtask_max_retries, + enable_kill_slot=enable_kill_slot, + data_prepare_timeout=data_prepare_timeout, + uid=SubtaskExecutionActor.default_uid(), + address=address, + ) + + async def stop(self): + address = self._address + + await mo.destroy_actor( + mo.create_actor_ref( + uid=SubtaskExecutionActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref( + uid=WorkerQuotaManagerActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref( + uid=WorkerSlotManagerActor.default_uid(), address=address + ) + ) diff --git a/python/xorbits/_mars/services/scheduling/worker/tests/__init__.py b/python/xorbits/_mars/services/scheduling/worker/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/scheduling/worker/tests/test_execution.py b/python/xorbits/_mars/services/scheduling/worker/tests/test_execution.py new file mode 100644 index 000000000..cee67f91f --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/tests/test_execution.py @@ -0,0 +1,534 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
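The worker service above derives its memory quota and hard limit from percentage strings via `calc_size_by_str`. A rough sketch of the assumed conversion, written outside the real utility (percent-of-total semantics and the helper name are assumptions here):

def size_from_str(value, total):
    # assumed behaviour: "NN%" means NN percent of `total`, otherwise raw bytes
    if isinstance(value, str) and value.endswith("%"):
        return int(total * float(value[:-1]) / 100)
    return int(value)


total_mem = 16 * 1024 ** 3  # pretend the worker sees 16 GiB of RAM
mem_quota_size = size_from_str("80%", total_mem)  # soft budget for quota requests
mem_hard_limit = size_from_str("95%", total_mem)  # hard ceiling used by MemQuotaActor
print(mem_quota_size, mem_hard_limit)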
+ +import asyncio +import os +import tempfile +import time +import uuid +from contextlib import asynccontextmanager +from typing import Tuple + +import numpy as np +import pandas as pd +import pytest + +from ..... import oscar as mo +from ..... import remote as mr +from .....core import ( + ChunkGraph, + ChunkGraphBuilder, + OutputType, + TileableGraph, + TileableGraphBuilder, +) +from .....remote.core import RemoteFunction +from .....resource import Resource +from .....tensor.arithmetic import TensorTreeAdd +from .....tensor.fetch import TensorFetch +from .....utils import Timer +from ....cluster import MockClusterAPI +from ....lifecycle import MockLifecycleAPI +from ....meta import MockMetaAPI, MockWorkerMetaAPI +from ....mutable import MockMutableAPI +from ....session import MockSessionAPI +from ....storage import MockStorageAPI +from ....storage.handler import StorageHandlerActor +from ....subtask import MockSubtaskAPI, Subtask, SubtaskStatus +from ....task.supervisor.manager import TaskManagerActor +from ...supervisor import GlobalResourceManagerActor +from ...worker import BandSlotManagerActor, QuotaActor, SubtaskExecutionActor + + +class CancelDetectActorMixin: + @asynccontextmanager + async def _delay_method(self): + delay_fetch_event = getattr(self, "_delay_fetch_event", None) + delay_wait_event = getattr(self, "_delay_wait_event", None) + try: + if delay_fetch_event is not None: + delay_fetch_event.set() + if delay_wait_event is not None: + await delay_wait_event.wait() + yield + except asyncio.CancelledError: + self._is_cancelled = True + raise + + def set_delay_fetch_event( + self, fetch_event: asyncio.Event, wait_event: asyncio.Event + ): + setattr(self, "_delay_fetch_event", fetch_event) + setattr(self, "_delay_wait_event", wait_event) + + def get_is_cancelled(self): + return getattr(self, "_is_cancelled", False) + + +class MockStorageHandlerActor(StorageHandlerActor, CancelDetectActorMixin): + async def fetch_batch(self, *args, **kwargs): + async with self._delay_method(): + return super().fetch_batch(*args, **kwargs) + + +class MockQuotaActor(QuotaActor, CancelDetectActorMixin): + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + self._batch_quota_reqs = [] + + async def request_batch_quota(self, batch): + self._batch_quota_reqs.append(batch) + async with self._delay_method(): + return super().request_batch_quota(batch) + + def get_batch_quota_reqs(self): + return self._batch_quota_reqs + + +class MockBandSlotManagerActor(BandSlotManagerActor, CancelDetectActorMixin): + async def acquire_free_slot(self, session_stid: Tuple[str, str], block=True): + if getattr(self, "_delay_function", None) != "acquire_free_slot": + return super().acquire_free_slot(session_stid, block) + else: + async with self._delay_method(): + return super().acquire_free_slot(session_stid, block) + + async def upload_slot_usages(self, periodical: bool = False): + if ( + getattr(self, "_delay_function", None) != "upload_slot_usages" + or periodical is True + ): + return super().upload_slot_usages(periodical) + else: + async with self._delay_method(): + return super().upload_slot_usages(periodical) + + def set_delay_function(self, name): + self._delay_function = name + + +class MockGlobalResourceManagerActor( + GlobalResourceManagerActor, CancelDetectActorMixin +): + async def __post_create__(self): + pass + + async def __pre_destroy__(self): + pass + + @mo.extensible + async def update_subtask_resources( + self, band, session_id: str, subtask_id: str, resources: Resource + ): + pass + + 
+class MockTaskManager(mo.Actor): + def __init__(self): + self._results = [] + + def set_subtask_result(self, result): + self._results.append(result) + + def get_results(self): + return self._results + + +@pytest.fixture +async def actor_pool(request): + n_slots, enable_kill = request.param + pool = await mo.create_actor_pool( + "127.0.0.1", labels=[None] + ["numa-0"] * n_slots, n_process=n_slots + ) + + async with pool: + session_id = "test_session" + await MockClusterAPI.create( + pool.external_address, + band_to_resource={"numa-0": Resource(num_cpus=n_slots)}, + ) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + meta_api = await MockMetaAPI.create(session_id, pool.external_address) + worker_meta_api = await MockWorkerMetaAPI.create( + session_id, pool.external_address + ) + await MockLifecycleAPI.create(session_id, pool.external_address) + await MockSubtaskAPI.create(pool.external_address) + await MockMutableAPI.create(session_id, pool.external_address) + storage_api = await MockStorageAPI.create( + session_id, + pool.external_address, + storage_handler_cls=MockStorageHandlerActor, + ) + + # create assigner actor + execution_ref = await mo.create_actor( + SubtaskExecutionActor, + subtask_max_retries=0, + enable_kill_slot=enable_kill, + uid=SubtaskExecutionActor.default_uid(), + address=pool.external_address, + ) + # create quota actor + quota_ref = await mo.create_actor( + MockQuotaActor, + "numa-0", + 102400, + uid=QuotaActor.gen_uid("numa-0"), + address=pool.external_address, + ) + # create dispatcher actor + band_slot_ref = await mo.create_actor( + MockBandSlotManagerActor, + (pool.external_address, "numa-0"), + n_slots, + uid=BandSlotManagerActor.gen_uid("numa-0"), + address=pool.external_address, + ) + + # create global slot manager actor + global_resource_ref = await mo.create_actor( + MockGlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + + # create mock task manager actor + task_manager_ref = await mo.create_actor( + MockTaskManager, + uid=TaskManagerActor.gen_uid(session_id), + address=pool.external_address, + ) + + try: + yield pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref + finally: + await mo.destroy_actor(task_manager_ref) + await mo.destroy_actor(band_slot_ref) + await mo.destroy_actor(global_resource_ref) + await mo.destroy_actor(quota_ref) + await mo.destroy_actor(execution_ref) + await MockStorageAPI.cleanup(pool.external_address) + await MockSubtaskAPI.cleanup(pool.external_address) + await MockClusterAPI.cleanup(pool.external_address) + await MockMutableAPI.cleanup(session_id, pool.external_address) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [(1, True)], indirect=True) +async def test_execute_tensor(actor_pool): + pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = actor_pool + + data1 = np.random.rand(10, 10) + data2 = np.random.rand(10, 10) + + input1 = TensorFetch( + key="input1", source_key="input2", dtype=np.dtype(int) + ).new_chunk([]) + input2 = TensorFetch( + key="input2", source_key="input2", dtype=np.dtype(int) + ).new_chunk([]) + result_chunk = TensorTreeAdd(args=[input1, input2]).new_chunk( + [input1, input2], shape=data1.shape, dtype=data1.dtype + ) + + await meta_api.set_chunk_meta( + input1, + memory_size=data1.nbytes, + store_size=data1.nbytes, + bands=[(pool.external_address, "numa-0")], + ) + await meta_api.set_chunk_meta( + input2, + memory_size=data1.nbytes, + store_size=data2.nbytes, + 
bands=[(pool.external_address, "numa-0")], + ) + # todo use different storage level when storage ready + await storage_api.put(input1.key, data1) + await storage_api.put(input2.key, data2) + + chunk_graph = ChunkGraph([result_chunk]) + chunk_graph.add_node(input1) + chunk_graph.add_node(input2) + chunk_graph.add_node(result_chunk) + chunk_graph.add_edge(input1, result_chunk) + chunk_graph.add_edge(input2, result_chunk) + + subtask = Subtask("test_subtask", session_id=session_id, chunk_graph=chunk_graph) + await execution_ref.run_subtask(subtask, "numa-0", pool.external_address) + + # check if results are correct + result = await storage_api.get(result_chunk.key) + np.testing.assert_array_equal(data1 + data2, result) + + # check if quota computations are correct + quota_ref = await mo.actor_ref( + QuotaActor.gen_uid("numa-0"), address=pool.external_address + ) + [quota] = await quota_ref.get_batch_quota_reqs() + assert quota[(subtask.session_id, subtask.subtask_id)] == data1.nbytes + + # check if metas are correct + result_meta = await worker_meta_api.get_chunk_meta(result_chunk.key) + assert result_meta["object_id"] == result_chunk.key + assert result_meta["shape"] == result.shape + + +_cancel_phases = [ + "prepare", + "quota", + "slot", + "execute", + "finally", + "immediately", +] + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "actor_pool,cancel_phase", + [((1, True), phase) for phase in _cancel_phases], + indirect=["actor_pool"], +) +async def test_execute_with_cancel(actor_pool, cancel_phase): + pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = actor_pool + delay_fetch_event = asyncio.Event() + delay_wait_event = asyncio.Event() + + # config for different phases + ref_to_delay = None + if cancel_phase == "prepare": + ref_to_delay = await mo.actor_ref( + StorageHandlerActor.gen_uid("numa-0"), address=pool.external_address + ) + elif cancel_phase == "quota": + ref_to_delay = await mo.actor_ref( + QuotaActor.gen_uid("numa-0"), address=pool.external_address + ) + elif cancel_phase == "slot": + ref_to_delay = await mo.actor_ref( + BandSlotManagerActor.gen_uid("numa-0"), address=pool.external_address + ) + await ref_to_delay.set_delay_function("acquire_free_slot") + elif cancel_phase == "finally": + ref_to_delay = await mo.actor_ref( + BandSlotManagerActor.gen_uid("numa-0"), address=pool.external_address + ) + await ref_to_delay.set_delay_function("upload_slot_usages") + if ref_to_delay: + await ref_to_delay.set_delay_fetch_event(delay_fetch_event, delay_wait_event) + else: + delay_fetch_event.set() + + def delay_fun(delay, _inp1): + if not ref_to_delay: + time.sleep(delay) + return (delay,) + + input1 = TensorFetch( + key="input1", source_key="input1", dtype=np.dtype(int) + ).new_chunk([]) + remote_result = RemoteFunction( + function=delay_fun, function_args=[100, input1], function_kwargs={}, n_output=1 + ).new_chunk([input1]) + + data1 = np.random.rand(10, 10) + await meta_api.set_chunk_meta( + input1, + memory_size=data1.nbytes, + store_size=data1.nbytes, + bands=[(pool.external_address, "numa-0")], + ) + await storage_api.put(input1.key, data1) + + chunk_graph = ChunkGraph([remote_result]) + chunk_graph.add_node(input1) + chunk_graph.add_node(remote_result) + chunk_graph.add_edge(input1, remote_result) + + subtask = Subtask( + f"test_subtask_{uuid.uuid4()}", session_id=session_id, chunk_graph=chunk_graph + ) + aiotask = asyncio.create_task( + execution_ref.run_subtask(subtask, "numa-0", pool.external_address) + ) + if ref_to_delay: + await 
delay_fetch_event.wait() + else: + if cancel_phase != "immediately": + await asyncio.sleep(1) + + with Timer() as timer: + await asyncio.wait_for( + execution_ref.cancel_subtask(subtask.subtask_id, kill_timeout=1), + timeout=30, + ) + r = await asyncio.wait_for(aiotask, timeout=30) + assert r.status == SubtaskStatus.cancelled + assert timer.duration < 15 + + # check for different phases + if ref_to_delay is not None: + assert await ref_to_delay.get_is_cancelled() + delay_wait_event.set() + + # test if slot is restored + remote_tileable = mr.spawn(delay_fun, args=(0.5, None)) + graph = TileableGraph([remote_tileable.data]) + next(TileableGraphBuilder(graph).build()) + + chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + + subtask = Subtask( + f"test_subtask2_{uuid.uuid4()}", session_id=session_id, chunk_graph=chunk_graph + ) + await asyncio.wait_for( + execution_ref.run_subtask(subtask, "numa-0", pool.external_address), timeout=30 + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [(1, True)], indirect=True) +async def test_execute_with_pure_deps(actor_pool): + pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = actor_pool + + dep = TensorFetch(key="input1", dtype=np.dtype(int)).new_chunk([]) + + def main_fun(): + return session_id + + remote_result = RemoteFunction( + function=main_fun, function_args=[], function_kwargs={} + ).new_chunk([dep]) + # mark `dep` as pure dependency + remote_result.op._pure_depends = [True] + chunk_graph = ChunkGraph([remote_result]) + chunk_graph.add_node(dep) + chunk_graph.add_node(remote_result) + chunk_graph.add_edge(dep, remote_result) + + subtask = Subtask( + f"test_subtask_{uuid.uuid4()}", session_id=session_id, chunk_graph=chunk_graph + ) + # subtask shall run well without data of `dep` available + await execution_ref.run_subtask(subtask, "numa-0", pool.external_address) + res = await storage_api.get(remote_result.key) + assert res == session_id + + +def test_estimate_size(): + from .....dataframe.arithmetic import DataFrameAdd + from .....dataframe.fetch import DataFrameFetch + from .....dataframe.utils import parse_index + from ..execution import SubtaskExecutionActor + + index_value = parse_index(pd.Index([10, 20, 30], dtype=np.int64)) + + input1 = DataFrameFetch( + output_types=[OutputType.series], + ).new_chunk( + [], _key="INPUT1", shape=(np.nan,), dtype=np.dtype("O"), index_value=index_value + ) + input2 = DataFrameFetch( + output_types=[OutputType.series], + ).new_chunk( + [], _key="INPUT2", shape=(np.nan,), dtype=np.dtype("O"), index_value=index_value + ) + result_chunk = DataFrameAdd( + axis=0, output_types=[OutputType.series], lhs=input1, rhs=input2 + ).new_chunk( + [input1, input2], + _key="ADD_RESULT", + shape=(np.nan,), + dtype=np.dtype("O"), + index_value=index_value, + ) + + chunk_graph = ChunkGraph([result_chunk]) + chunk_graph.add_node(input1) + chunk_graph.add_node(input2) + chunk_graph.add_node(result_chunk) + chunk_graph.add_edge(input1, result_chunk) + chunk_graph.add_edge(input2, result_chunk) + + input_sizes = { + "INPUT1": (1024, 1024), + "INPUT2": (1024, 1024), + } + + subtask = Subtask("test_subtask", session_id="session_id", chunk_graph=chunk_graph) + result = SubtaskExecutionActor._estimate_sizes(subtask, input_sizes) + assert result[0] == 1024 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [(1, False)], indirect=True) +async def test_cancel_without_kill(actor_pool): + pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = 
actor_pool + executed_file = os.path.join( + tempfile.gettempdir(), f"mars_test_cancel_without_kill_{os.getpid()}.tmp" + ) + + def delay_fun(delay): + import mars + + open(executed_file, "w").close() + time.sleep(delay) + mars._slot_marker = 1 + return delay + + def check_fun(): + import mars + + return getattr(mars, "_slot_marker", False) + + remote_result = RemoteFunction( + function=delay_fun, function_args=[2], function_kwargs={} + ).new_chunk([]) + chunk_graph = ChunkGraph([remote_result]) + chunk_graph.add_node(remote_result) + + subtask = Subtask( + f"test_subtask_{uuid.uuid4()}", session_id=session_id, chunk_graph=chunk_graph + ) + aiotask = asyncio.create_task( + execution_ref.run_subtask(subtask, "numa-0", pool.external_address) + ) + await asyncio.sleep(0.5) + + await asyncio.wait_for( + execution_ref.cancel_subtask(subtask.subtask_id, kill_timeout=1), + timeout=30, + ) + r = await asyncio.wait_for(aiotask, timeout=30) + assert r.status == SubtaskStatus.cancelled + + remote_result = RemoteFunction( + function=check_fun, function_args=[], function_kwargs={} + ).new_chunk([]) + chunk_graph = ChunkGraph([remote_result]) + chunk_graph.add_node(remote_result) + + subtask = Subtask( + f"test_subtask_{uuid.uuid4()}", session_id=session_id, chunk_graph=chunk_graph + ) + await asyncio.wait_for( + execution_ref.run_subtask(subtask, "numa-0", pool.external_address), timeout=30 + ) + + # check if slots not killed (or slot assignment may be cancelled) + if os.path.exists(executed_file): + assert await storage_api.get(remote_result.key) + os.unlink(executed_file) diff --git a/python/xorbits/_mars/services/scheduling/worker/tests/test_quota.py b/python/xorbits/_mars/services/scheduling/worker/tests/test_quota.py new file mode 100644 index 000000000..9a9143c41 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/tests/test_quota.py @@ -0,0 +1,183 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys +import time + +import pytest + +from ..... 
import oscar as mo +from .....tests.core import mock +from .....utils import get_next_port +from ...worker import BandSlotManagerActor, MemQuotaActor, QuotaActor + + +class MockBandSlotManagerActor(mo.Actor): + def get_restart_record(self): + return getattr(self, "_restart_record", False) + + def restart_free_slots(self): + self._restart_record = True + + +@pytest.fixture +async def actor_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "fork") if sys.platform != "win32" else None + ) + pool = await mo.create_actor_pool( + f"127.0.0.1:{get_next_port()}", + n_process=0, + subprocess_start_method=start_method, + ) + await pool.start() + try: + yield pool + finally: + await pool.stop() + + +@pytest.mark.asyncio +async def test_quota(actor_pool): + quota_ref = await mo.create_actor( + QuotaActor, + (actor_pool.external_address, "numa-0"), + 300, + uid=QuotaActor.gen_uid("cpu-0"), + address=actor_pool.external_address, + ) # type: mo.ActorRefType[QuotaActor] + + # test quota options with non-existing keys + await quota_ref.hold_quotas(["non_exist"]) + await quota_ref.release_quotas(["non_exist"]) + + with pytest.raises(ValueError): + await quota_ref.request_batch_quota({"ERROR": 1000}) + + # test quota request with immediate return + await quota_ref.request_batch_quota({"0": 100}) + await quota_ref.request_batch_quota({"0": 50}) + await quota_ref.request_batch_quota({"0": 200}) + + # test request with process_quota=True + await quota_ref.request_batch_quota({"0": 200}) + await quota_ref.alter_allocations(["0"], [190]) + assert (await quota_ref.dump_data()).allocations["0"] == 190 + + await quota_ref.hold_quotas(["0"]) + assert "0" in (await quota_ref.dump_data()).hold_sizes + + req_task1 = asyncio.create_task(quota_ref.request_batch_quota({"1": 150})) + req_task2 = asyncio.create_task(quota_ref.request_batch_quota({"2": 50})) + asyncio.create_task(quota_ref.request_batch_quota({"3": 200})) + asyncio.create_task(quota_ref.request_batch_quota({"3": 180})) + + await asyncio.sleep(0.1) + assert "2" not in (await quota_ref.dump_data()).allocations + + req_task1.cancel() + with pytest.raises(asyncio.CancelledError): + await req_task1 + + await asyncio.wait_for(req_task2, timeout=1) + assert "1" not in (await quota_ref.dump_data()).allocations + assert "2" in (await quota_ref.dump_data()).allocations + assert "3" not in (await quota_ref.dump_data()).allocations + + await quota_ref.release_quotas(["0"]) + assert "3" in (await quota_ref.dump_data()).allocations + + req_task4 = asyncio.create_task(quota_ref.request_batch_quota({"4": 180})) + await asyncio.sleep(0) + assert "4" not in (await quota_ref.dump_data()).allocations + + await quota_ref.alter_allocations(["3"], [50]) + await req_task4 + assert "4" in (await quota_ref.dump_data()).allocations + + +@pytest.mark.asyncio +async def test_batch_quota_allocation(actor_pool): + quota_ref = await mo.create_actor( + QuotaActor, + (actor_pool.external_address, "numa-0"), + 300, + uid=QuotaActor.gen_uid("cpu-0"), + address=actor_pool.external_address, + ) # type: mo.ActorRefType[QuotaActor] + + end_time = [] + + async def task_fun(b): + await quota_ref.request_batch_quota(b) + await asyncio.sleep(0.5) + assert set(b.keys()) == set((await quota_ref.dump_data()).allocations.keys()) + await quota_ref.release_quotas(list(b.keys())) + end_time.append(time.time()) + + tasks = [] + for idx in (0, 1): + keys = [f"{idx}_0", f"{idx}_1"] + batch = dict((k, 100) for k in keys) + tasks.append(asyncio.create_task(task_fun(batch))) + await 
asyncio.wait_for(asyncio.gather(*tasks), timeout=10) + + assert abs(end_time[0] - end_time[1]) > 0.4 + assert await quota_ref.get_allocated_size() == 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("enable_kill_slot", [False, True]) +async def test_mem_quota_allocation(actor_pool, enable_kill_slot): + from .....utils import AttributeDict + + mock_mem_stat = AttributeDict(dict(total=300, available=50, used=0, free=50)) + mock_band_slot_manager_ref = await mo.create_actor( + MockBandSlotManagerActor, + uid=BandSlotManagerActor.gen_uid("numa-0"), + address=actor_pool.external_address, + ) + quota_ref = await mo.create_actor( + MemQuotaActor, + (actor_pool.external_address, "numa-0"), + 300, + hard_limit=300, + refresh_time=0.1, + enable_kill_slot=enable_kill_slot, + uid=MemQuotaActor.gen_uid("cpu-0"), + address=actor_pool.external_address, + ) # type: mo.ActorRefType[QuotaActor] + + with mock.patch("mars.resource.virtual_memory", new=lambda: mock_mem_stat): + time_recs = [time.time()] + + async def task_fun(): + await quota_ref.request_batch_quota({"req": 100}) + await quota_ref.release_quotas(["req"]) + time_recs.append(time.time()) + + task = asyncio.create_task(task_fun()) + await asyncio.sleep(0.2) + assert "req" not in (await quota_ref.dump_data()).allocations + + mock_mem_stat["available"] = 150 + mock_mem_stat["free"] = 150 + await asyncio.wait_for(task, timeout=1) + assert 0.15 < abs(time_recs[0] - time_recs[1]) < 1 + assert ( + bool(await mock_band_slot_manager_ref.get_restart_record()) + == enable_kill_slot + ) diff --git a/python/xorbits/_mars/services/scheduling/worker/tests/test_workerslot.py b/python/xorbits/_mars/services/scheduling/worker/tests/test_workerslot.py new file mode 100644 index 000000000..9f8d17b1b --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/tests/test_workerslot.py @@ -0,0 +1,353 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys +import time +from typing import Tuple + +import pandas as pd +import psutil +import pytest + +from ..... 
import oscar as mo +from .....oscar import ServerClosed +from .....oscar.backends.allocate_strategy import IdleLabel +from .....oscar.errors import NoFreeSlot, SlotStateError +from .....resource import Resource +from .....tests.core import wait_for_condition +from .....utils import get_next_port +from ...supervisor import GlobalResourceManagerActor +from ...worker import BandSlotControlActor, BandSlotManagerActor + + +class MockGlobalResourceManagerActor(mo.Actor): + def __init__(self): + self._result = None + + @mo.extensible + def update_subtask_resources( + self, band: Tuple, session_id: str, subtask_id: str, resources: Resource + ): + self._result = (band, session_id, subtask_id, resources) + + def get_result(self): + return self._result + + +@pytest.fixture +async def actor_pool(request): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + n_slots = request.param + pool = await mo.create_actor_pool( + f"127.0.0.1:{get_next_port()}", + n_process=n_slots, + labels=[None] + ["numa-0"] * n_slots, + subprocess_start_method=start_method, + ) + + async with pool: + global_resource_ref = await mo.create_actor( + MockGlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + slot_manager_ref = await mo.create_actor( + BandSlotManagerActor, + (pool.external_address, "numa-0"), + n_slots, + global_resource_ref, + uid=BandSlotManagerActor.gen_uid("numa-0"), + address=pool.external_address, + ) + try: + yield pool, slot_manager_ref + finally: + await slot_manager_ref.destroy() + + +ActorPoolType = Tuple[mo.MainActorPoolType, mo.ActorRefType[BandSlotManagerActor]] + + +class TaskActor(mo.Actor): + def __init__(self, call_logs, slot_id=0): + self._call_logs = call_logs + self._dispatch_ref = None + self._slot_id = slot_id + + @classmethod + def gen_uid(cls, slot_id): + return f"{slot_id}_task_actor" + + async def __post_create__(self): + self._dispatch_ref = await mo.actor_ref( + BandSlotManagerActor.gen_uid("numa-0"), address=self.address + ) + await self._dispatch_ref.register_slot.tell(self._slot_id, os.getpid()) + + async def queued_call(self, key, session_stid, delay): + try: + self._call_logs[key] = time.time() + await asyncio.sleep(delay) + finally: + if session_stid is not None: + await self._dispatch_ref.release_free_slot(self._slot_id, session_stid) + + def get_call_logs(self): + return self._call_logs + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [0], indirect=True) +async def test_slot_assign(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + + call_logs = dict() + group_size = 4 + delay = 1 + await asyncio.gather( + *( + mo.create_actor( + TaskActor, + call_logs, + slot_id=slot_id, + uid=TaskActor.gen_uid(slot_id), + address=pool.external_address, + ) + for slot_id in range(group_size) + ) + ) + assert len((await slot_manager_ref.dump_data()).free_slots) == group_size + + async def task_fun(idx): + session_stid = ("session_id", f"subtask_id{idx}") + slot_id = await slot_manager_ref.acquire_free_slot(session_stid) + assert slot_id == await slot_manager_ref.get_subtask_slot(session_stid) + ref = await mo.actor_ref( + uid=TaskActor.gen_uid(slot_id), address=pool.external_address + ) + await ref.queued_call(idx, session_stid, delay) + + tasks = [] + start_time = time.time() + for idx in range(group_size + 1): + tasks.append(asyncio.create_task(task_fun(idx))) + await asyncio.gather(*tasks) + + log_series = 
pd.Series(call_logs).sort_index() - start_time + assert len(log_series) == group_size + 1 + assert log_series.iloc[:group_size].max() < delay / 4 + assert log_series.iloc[group_size:].min() > delay / 4 + + call_logs.clear() + tasks = [] + start_time = time.time() + for idx in range(group_size * 2 + 1): + tasks.append(asyncio.create_task(task_fun(idx))) + await asyncio.sleep(delay / 10) + tasks[group_size].cancel() + await asyncio.wait(tasks) + + with pytest.raises(asyncio.CancelledError): + tasks[group_size].result() + + log_series = pd.Series(call_logs).sort_index() - start_time + + assert len(log_series) == group_size * 2 + assert log_series.iloc[:group_size].max() < delay / 4 + assert log_series.iloc[group_size:].min() > delay / 4 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [1], indirect=True) +async def test_slot_kill(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + + strategy = IdleLabel("numa-0", "task_actor") + task_ref = await mo.create_actor( + TaskActor, {}, allocate_strategy=strategy, address=pool.external_address + ) + + assert await mo.actor_ref( + BandSlotControlActor.gen_uid("numa-0", 0), address=pool.external_address + ) + delayed_task = asyncio.create_task(task_ref.queued_call("key", None, 10)) + await asyncio.sleep(0.1) + + # check if process hosting the actor is closed + kill_task = asyncio.create_task(slot_manager_ref.kill_slot(0)) + await asyncio.sleep(0) + kill_task2 = asyncio.create_task(slot_manager_ref.kill_slot(0)) + + with pytest.raises(ServerClosed): + await delayed_task + + # check if slot actor is restored + await kill_task + # check if secondary task makes no change + await kill_task2 + + assert await mo.actor_ref( + BandSlotControlActor.gen_uid("numa-0", 0), address=pool.external_address + ) + + async def check_alive(): + assert await mo.actor_ref(task_ref) + return True + + await wait_for_condition(check_alive) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [3], indirect=True) +async def test_slot_restart(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + + strategy = IdleLabel("numa-0", "task_actor") + task_refs = [] + for idx in range(3): + ref = await mo.create_actor( + TaskActor, + {}, + slot_id=idx, + allocate_strategy=strategy, + address=pool.external_address, + ) + await ref.queued_call("idx", None, idx) + task_refs.append(ref) + + await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id1")) + slot_id2 = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id2")) + await slot_manager_ref.release_free_slot(slot_id2, ("session_id", "subtask_id2")) + + async def record_finish_time(coro): + await coro + return time.time() + + restart_task1 = asyncio.create_task( + record_finish_time(slot_manager_ref.restart_free_slots()) + ) + await asyncio.sleep(0) + restart_task2 = asyncio.create_task( + record_finish_time(slot_manager_ref.restart_free_slots()) + ) + acquire_task = asyncio.create_task( + record_finish_time( + slot_manager_ref.acquire_free_slot(("session_id", "subtask_id3")) + ) + ) + + await asyncio.gather(restart_task1, restart_task2, acquire_task) + + # check only slots with running records are restarted + assert len(await task_refs[0].get_call_logs()) > 0 + assert len(await task_refs[1].get_call_logs()) == 0 + assert len(await task_refs[2].get_call_logs()) > 0 + + assert abs(restart_task1.result() - acquire_task.result()) < 0.1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [1], indirect=True) +async def 
test_report_usage(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + + await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id")) + await asyncio.sleep(1.3) + + global_resource_ref = await mo.actor_ref( + uid=GlobalResourceManagerActor.default_uid(), address=pool.external_address + ) + _band, session_id, subtask_id, resources = await global_resource_ref.get_result() + assert resources.num_cpus == pytest.approx(1.0) + assert session_id == "session_id" + assert subtask_id == "subtask_id" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [1], indirect=True) +async def test_slot_fault_tolerance(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + # acquire -> slot restarted = can't acquire more. + slot_id = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id")) + await slot_manager_ref.register_slot(slot_id, os.getpid()) + with pytest.raises(NoFreeSlot): + await slot_manager_ref.acquire_free_slot( + ("session_id", "subtask_id"), block=False + ) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id")) + + # acquire -> release -> slot restarted = can only acquire once. + slot_id = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id2")) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id2")) + await slot_manager_ref.register_slot(slot_id, os.getpid()) + await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id2")) + with pytest.raises(NoFreeSlot): + await slot_manager_ref.acquire_free_slot( + ("session_id", "subtask_id2"), block=False + ) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id2")) + + # acquire -> release -> acquire -> slot restarted = can't acquire more. + slot_id = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id3")) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id3")) + await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id3")) + await slot_manager_ref.register_slot(slot_id, os.getpid()) + with pytest.raises(NoFreeSlot): + await slot_manager_ref.acquire_free_slot( + ("session_id", "subtask_id3"), block=False + ) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id3")) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [1], indirect=True) +async def test_slot_exception(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + + # make sure the BandSlotControlActor has registered. + slot_id = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id")) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id")) + + if sys.platform == "win32": + with pytest.raises(ValueError): + await slot_manager_ref.register_slot(1, -1) + else: + with pytest.raises((psutil.AccessDenied, psutil.NoSuchProcess)): + await slot_manager_ref.register_slot(1, 0) + + dump_data = await slot_manager_ref.dump_data() + # after the register_slot is correctly handled, + # we can assert 1 not in free slots. + assert 1 in dump_data.free_slots + + slot_id = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id")) + with pytest.raises(SlotStateError): + # release session_stid not matched the acquired value. + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id1")) + + dump_data = await slot_manager_ref.dump_data() + # the slot is not released. 
+ assert slot_id not in dump_data.free_slots + + not_acquired_slot = next(iter(dump_data.free_slots)) + with pytest.raises(SlotStateError): + await slot_manager_ref.release_free_slot( + not_acquired_slot, ("session_id", "subtask_id1") + ) diff --git a/python/xorbits/_mars/services/scheduling/worker/workerslot.py b/python/xorbits/_mars/services/scheduling/worker/workerslot.py new file mode 100644 index 000000000..60134f1f5 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/workerslot.py @@ -0,0 +1,339 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import os +import time +from typing import Dict, List, NamedTuple, Set, Tuple + +import psutil + +from .... import oscar as mo +from ....oscar.backends.allocate_strategy import IdleLabel +from ....oscar.errors import NoFreeSlot, SlotStateError +from ....resource import Resource +from ....typing import BandType +from ...cluster import ClusterAPI, WorkerSlotInfo + +logger = logging.getLogger(__name__) + + +class DispatchDumpType(NamedTuple): + free_slots: Set + fresh_slots: Set + + +class WorkerSlotManagerActor(mo.Actor): + _band_slot_infos: Dict[str, List[WorkerSlotInfo]] + + def __init__(self): + self._cluster_api = None + self._global_resource_ref = None + + self._band_slot_managers = dict() # type: Dict[str, mo.ActorRef] + + async def __post_create__(self): + self._cluster_api = await ClusterAPI.create(self.address) + + band_to_resource = await self._cluster_api.get_bands() + for band, resource in band_to_resource.items(): + self._band_slot_managers[band] = await mo.create_actor( + BandSlotManagerActor, + band, + int(resource.num_cpus or resource.num_gpus), + self._global_resource_ref, + uid=BandSlotManagerActor.gen_uid(band[1]), + address=self.address, + ) + + async def __pre_destroy__(self): + await asyncio.gather( + *[mo.destroy_actor(ref) for ref in self._band_slot_managers.values()] + ) + + +class BandSlotManagerActor(mo.Actor): + _free_slots: Set[int] + _fresh_slots: Set[int] + + @classmethod + def gen_uid(cls, band_name: str): + return f"{band_name}_band_slot_manager" + + def __init__( + self, band: BandType, n_slots: int, global_resource_ref: mo.ActorRef = None + ): + super().__init__() + self._cluster_api = None + + self._band = band + self._band_name = band[1] + self._global_resource_ref = global_resource_ref + self._n_slots = n_slots + + self._semaphore = asyncio.Semaphore(0) + self._slot_control_refs = dict() + self._free_slots = set() + self._fresh_slots = set() + self._slot_kill_events = dict() + + self._restarting = False + self._restart_done_event = asyncio.Event() + + self._session_stid_to_slot = dict() + self._slot_to_session_stid = dict() + self._last_report_time = time.time() + + self._slot_to_proc = dict() + self._usage_upload_task = None + + async def __post_create__(self): + try: + self._cluster_api = await ClusterAPI.create(self.address) + except mo.ActorNotExist: + pass + + strategy = IdleLabel(self._band_name, f"worker_slot_control") 
+ for slot_id in range(self._n_slots): + self._slot_control_refs[slot_id] = await mo.create_actor( + BandSlotControlActor, + self.ref(), + self._band_name, + slot_id, + uid=BandSlotControlActor.gen_uid(self._band_name, slot_id), + address=self.address, + allocate_strategy=strategy, + ) + self._fresh_slots.add(slot_id) + + self._upload_slot_usage_with_delay() + + async def __pre_destroy__(self): + self._usage_upload_task.cancel() + + async def _get_global_resource_ref(self): + if self._global_resource_ref is not None: + return self._global_resource_ref + + from ..supervisor import GlobalResourceManagerActor + + try: + [self._global_resource_ref] = await self._cluster_api.get_supervisor_refs( + [GlobalResourceManagerActor.default_uid()] + ) + except mo.ActorNotExist: + self._global_resource_ref = None + return self._global_resource_ref + + def get_slot_address(self, slot_id: int): + return self._slot_control_refs[slot_id].address + + def get_subtask_slot(self, session_stid: Tuple[str, str]): + return self._session_stid_to_slot.get(session_stid) + + async def acquire_free_slot(self, session_stid: Tuple[str, str], block=True): + if not block and self._semaphore.locked(): + raise NoFreeSlot(f"No free slot for {session_stid}") + yield self._semaphore.acquire() + if self._restarting: + yield self._restart_done_event.wait() + + slot_id = self._free_slots.pop() + self._fresh_slots.difference_update([slot_id]) + self._slot_to_session_stid[slot_id] = session_stid + self._session_stid_to_slot[session_stid] = slot_id + logger.debug("Slot %d acquired for subtask %r", slot_id, session_stid) + raise mo.Return(slot_id) + + def release_free_slot(self, slot_id: int, session_stid: Tuple[str, str]): + acquired_session_stid = self._slot_to_session_stid.pop(slot_id, None) + if acquired_session_stid is None: + raise SlotStateError(f"Slot {slot_id} is not acquired.") + if acquired_session_stid != session_stid: + raise SlotStateError( + f"Slot {slot_id} releasing state incorrect, " + f"the acquired session_stid: {acquired_session_stid}, " + f"the releasing session_stid: {session_stid}" + ) + acquired_slot_id = self._session_stid_to_slot.pop(acquired_session_stid) + assert ( + acquired_slot_id == slot_id + ), f"{acquired_session_stid}: acquired_slot_id {acquired_slot_id} != slot_id {slot_id}" + + logger.debug("Slot %d released", slot_id) + + if slot_id not in self._free_slots: + self._free_slots.add(slot_id) + self._semaphore.release() + + def register_slot(self, slot_id: int, pid: int): + try: + self._fresh_slots.add(slot_id) + if slot_id in self._slot_kill_events: + event = self._slot_kill_events.pop(slot_id) + event.set() + if slot_id in self._slot_to_session_stid: + # We should release the slot by one role, if the slot is + # acquired by the SubtaskExecutionActor, then the slot + # should be released by it, too. + session_stid = self._slot_to_session_stid[slot_id] + logger.info( + "Slot %s registered by pid %s, current acquired session_stid is %s", + slot_id, + pid, + session_stid, + ) + else: + if slot_id not in self._free_slots: + self._free_slots.add(slot_id) + self._semaphore.release() + finally: + # psutil may raises exceptions, but currently we can't handle the register exception, + # so put it to the finally. + # TODO(fyrestone): handle register_slot failure. 
+ self._slot_to_proc[slot_id] = proc = psutil.Process(pid) + # collect initial stats for the process + proc.cpu_percent(interval=None) + + async def _kill_slot(self, slot_id: int): + if slot_id in self._slot_kill_events: + await self._slot_kill_events[slot_id].wait() + return + + event = self._slot_kill_events[slot_id] = asyncio.Event() + # TODO(fyrestone): Make it more reliable. e.g. kill_actor + # success but the actor does not restart. + try: + await mo.kill_actor(self._slot_control_refs[slot_id]) + except ConnectionError: + pass + await event.wait() + + async def kill_slot(self, slot_id: int): + self._free_slots.difference_update([slot_id]) + yield self._kill_slot(slot_id) + + async def restart_free_slots(self): + if self._restarting: + yield self._restart_done_event.wait() + return + + self._restart_done_event = asyncio.Event() + self._restarting = True + slot_ids = [ + slot_id for slot_id in self._free_slots if slot_id not in self._fresh_slots + ] + if slot_ids: + yield asyncio.gather(*[self._kill_slot(slot_id) for slot_id in slot_ids]) + logger.info("%d idle slots restarted", len(slot_ids)) + + self._restarting = False + self._restart_done_event.set() + + def _upload_slot_usage_with_delay(self, delay: int = 1): + self._usage_upload_task = self.ref().upload_slot_usages.tell_delay( + periodical=True, delay=delay + ) + + async def upload_slot_usages(self, periodical: bool = False): + delays = [] + slot_infos = [] + global_resource_ref = await self._get_global_resource_ref() + + if global_resource_ref is None: # pragma: no cover + if periodical: + self._upload_slot_usage_with_delay() + return + + for slot_id, proc in self._slot_to_proc.items(): + if slot_id not in self._slot_to_session_stid: + continue + session_id, subtask_id = self._slot_to_session_stid[slot_id] + cpu_usage, gpu_usage, processor_usage = 0, 0, 0 + if self._band_name.startswith("gpu"): + processor_usage = gpu_usage = 1 + else: + try: + processor_usage = cpu_usage = ( + proc.cpu_percent(interval=None) / 100.0 + ) + except psutil.NoSuchProcess: # pragma: no cover + continue + except psutil.AccessDenied as e: # pragma: no cover + logger.warning("Access denied when getting cpu percent: %s", e) + processor_usage = cpu_usage = 0.0 + + slot_infos.append( + WorkerSlotInfo( + slot_id=slot_id, + session_id=session_id, + subtask_id=subtask_id, + processor_usage=processor_usage, + ) + ) + + if global_resource_ref is not None: # pragma: no branch + # FIXME fix band slot mistake + delays.append( + global_resource_ref.update_subtask_resources.delay( + self._band[1], + session_id, + subtask_id, + Resource( + num_cpus=max(1.0, cpu_usage), num_gpus=max(1.0, gpu_usage) + ), + ) + ) + + if delays: # pragma: no branch + yield global_resource_ref.update_subtask_resources.batch(*delays) + if self._cluster_api is not None: + await self._cluster_api.set_band_slot_infos(self._band_name, slot_infos) + + if periodical: + self._upload_slot_usage_with_delay() + + def dump_data(self): + """ + Get all refs of slots of a queue + """ + return DispatchDumpType(self._free_slots, self._fresh_slots) + + +class BandSlotControlActor(mo.Actor): + @classmethod + def gen_uid(cls, band_name: str, slot_id: int): + return f"{band_name}_{slot_id}_band_slot_control" + + def __init__(self, manager_ref, band_name, slot_id: int): + self._manager_ref = manager_ref + self._band_name = band_name + self._slot_id = slot_id + self._report_task = None + + async def __post_create__(self): + self._report_task = asyncio.create_task(self._report_slot_ready()) + + async def 
_report_slot_ready(self): + from ...cluster.api import ClusterAPI + + try: + self._cluster_api = await ClusterAPI.create(self.address) + await self._cluster_api.wait_node_ready() + except mo.ActorNotExist: + pass + + await mo.wait_actor_pool_recovered(self.address) + await self._manager_ref.register_slot.tell(self._slot_id, os.getpid()) diff --git a/python/xorbits/_mars/services/session/__init__.py b/python/xorbits/_mars/services/session/__init__.py new file mode 100644 index 000000000..d46c9cf03 --- /dev/null +++ b/python/xorbits/_mars/services/session/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import AbstractSessionAPI, MockSessionAPI, SessionAPI, WebSessionAPI +from .supervisor import SessionActor, SessionManagerActor diff --git a/python/xorbits/_mars/services/session/api/__init__.py b/python/xorbits/_mars/services/session/api/__init__.py new file mode 100644 index 000000000..ed925376b --- /dev/null +++ b/python/xorbits/_mars/services/session/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractSessionAPI +from .oscar import MockSessionAPI, SessionAPI +from .web import WebSessionAPI diff --git a/python/xorbits/_mars/services/session/api/core.py b/python/xorbits/_mars/services/session/api/core.py new file mode 100644 index 000000000..e8bc35687 --- /dev/null +++ b/python/xorbits/_mars/services/session/api/core.py @@ -0,0 +1,110 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Dict, List, Union + +from ..core import SessionInfo + + +class AbstractSessionAPI(ABC): + @abstractmethod + async def get_sessions(self) -> List[SessionInfo]: + """ + Get information of all sessions + + Returns + ------- + session_infos : List[SessionInfo] + List of session infos. 
+ """ + + @abstractmethod + async def create_session(self, session_id: str) -> str: + """ + Create session and return address. + + Parameters + ---------- + session_id : str + Session ID + + Returns + ------- + address : str + Session address. + """ + + @abstractmethod + async def delete_session(self, session_id: str): + """ + Delete session. + + Parameters + ---------- + session_id : str + Session ID. + """ + + @abstractmethod + async def delete_all_sessions(self): + """ + Delete all sessions. + """ + + @abstractmethod + async def get_last_idle_time( + self, session_id: Union[str, None] = None + ) -> Union[float, None]: + """ + Get session last idle time. + + Parameters + ---------- + session_id : str, None + Session ID. None for all sessions. + + Returns + ------- + last_idle_time: str + The last idle time if the session(s) is idle else None. + """ + + @abstractmethod + async def fetch_tileable_op_logs( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key_to_offsets: Dict[str, List[int]], + chunk_op_key_to_sizes: Dict[str, List[int]], + ) -> Dict: + """ + Fetch tileable op's logs + + Parameters + ---------- + session_id : str + Session ID. + tileable_op_key : str + Tileable op key. + chunk_op_key_to_offsets : str or int or list of int + Fetch offsets. + chunk_op_key_to_sizes : str or int or list of int + Fetch sizes. + + Returns + ------- + logs : dict + chunk op key to result. + """ diff --git a/python/xorbits/_mars/services/session/api/oscar.py b/python/xorbits/_mars/services/session/api/oscar.py new file mode 100644 index 000000000..69d0a3c1d --- /dev/null +++ b/python/xorbits/_mars/services/session/api/oscar.py @@ -0,0 +1,207 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Union + +from .... import oscar as mo +from ....lib.aio import alru_cache +from ....utils import parse_readable_size +from ..core import SessionInfo +from ..supervisor import CustomLogMetaActor, SessionActor, SessionManagerActor +from ..worker import CustomLogActor +from .core import AbstractSessionAPI + + +class SessionAPI(AbstractSessionAPI): + def __init__( + self, address: str, session_manager: mo.ActorRefType[SessionManagerActor] + ): + self._address = address + self._session_manager_ref = session_manager + + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls, address: str, **kwargs) -> "SessionAPI": + if kwargs: # pragma: no cover + raise TypeError(f"SessionAPI.create got unknown arguments: {list(kwargs)}") + session_manager = await mo.actor_ref(address, SessionManagerActor.default_uid()) + return SessionAPI(address, session_manager) + + async def create_session(self, session_id: str) -> str: + session_actor_ref = await self._session_manager_ref.create_session(session_id) + return session_actor_ref.address + + async def get_sessions(self) -> List[SessionInfo]: + return await self._session_manager_ref.get_sessions() + + async def has_session(self, session_id: str) -> bool: + """ + Check if session created. 
+ + Parameters + ---------- + session_id : str + Session ID. + + Returns + ------- + if_exists : bool + """ + return await self._session_manager_ref.has_session(session_id) + + async def delete_session(self, session_id: str): + await self._session_manager_ref.delete_session(session_id) + + async def delete_all_sessions(self): + await self._session_manager_ref.delete_all_sessions() + + @alru_cache(cache_exceptions=False) + async def get_session_address(self, session_id: str) -> str: + """ + Get session address. + + Parameters + ---------- + session_id : str + Session ID. + + Returns + ------- + address : str + Session address. + """ + return (await self._session_manager_ref.get_session_ref(session_id)).address + + async def get_last_idle_time( + self, session_id: Union[str, None] = None + ) -> Union[float, None]: + return await self._session_manager_ref.get_last_idle_time(session_id) + + @alru_cache(cache_exceptions=False) + async def _get_session_ref(self, session_id: str) -> mo.ActorRefType[SessionActor]: + return await self._session_manager_ref.get_session_ref(session_id) + + async def create_remote_object( + self, session_id: str, name: str, object_cls, *args, **kwargs + ): + session = await self._get_session_ref(session_id) + return await session.create_remote_object(name, object_cls, *args, **kwargs) + + async def get_remote_object(self, session_id: str, name: str): + session = await self._get_session_ref(session_id) + return await session.get_remote_object(name) + + async def destroy_remote_object(self, session_id: str, name: str): + session = await self._get_session_ref(session_id) + return await session.destroy_remote_object(name) + + @alru_cache(cache_exceptions=False) + async def _get_custom_log_meta_ref( + self, session_id: str + ) -> mo.ActorRefType[CustomLogMetaActor]: + session = await self._get_session_ref(session_id) + return await mo.actor_ref( + mo.ActorRef(session.address, CustomLogMetaActor.gen_uid(session_id)) + ) + + async def register_custom_log_path( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key: str, + worker_address: str, + log_path: str, + ): + custom_log_meta_ref = await self._get_custom_log_meta_ref(session_id) + return await custom_log_meta_ref.register_custom_log_path( + tileable_op_key, chunk_op_key, worker_address, log_path + ) + + @classmethod + async def new_custom_log_dir(cls, address: str, session_id: str): + try: + ref = await mo.actor_ref(mo.ActorRef(address, CustomLogActor.default_uid())) + except mo.ActorNotExist: + return + return await ref.new_custom_log_dir(session_id) + + async def fetch_tileable_op_logs( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key_to_offsets: Dict[str, List[int]], + chunk_op_key_to_sizes: Dict[str, List[int]], + ) -> Dict: + custom_log_meta_ref = await self._get_custom_log_meta_ref(session_id) + chunk_op_key_to_arr_paths = await custom_log_meta_ref.get_tileable_op_log_paths( + tileable_op_key + ) + if chunk_op_key_to_arr_paths is None: + return + worker_to_kwds = dict() + for chunk_op_key, addr_path in chunk_op_key_to_arr_paths.items(): + worker_address, log_path = addr_path + if isinstance(chunk_op_key_to_offsets, dict): + offset = chunk_op_key_to_offsets.get(chunk_op_key, 0) + elif isinstance(chunk_op_key_to_offsets, str): + offset = int(parse_readable_size(chunk_op_key_to_offsets)[0]) + elif isinstance(chunk_op_key_to_offsets, int): + offset = chunk_op_key_to_offsets + else: + offset = 0 + if isinstance(chunk_op_key_to_sizes, dict): + size = chunk_op_key_to_sizes.get(chunk_op_key, 
-1) + elif isinstance(chunk_op_key_to_sizes, str): + size = int(parse_readable_size(chunk_op_key_to_sizes)[0]) + elif isinstance(chunk_op_key_to_sizes, int): + size = chunk_op_key_to_sizes + else: + size = -1 + if worker_address not in worker_to_kwds: + worker_to_kwds[worker_address] = { + "chunk_op_keys": [], + "log_paths": [], + "offsets": [], + "sizes": [], + } + kwds = worker_to_kwds[worker_address] + kwds["chunk_op_keys"].append(chunk_op_key) + kwds["log_paths"].append(log_path) + kwds["offsets"].append(offset) + kwds["sizes"].append(size) + result = dict() + for worker, kwds in worker_to_kwds.items(): + custom_log_ref = await mo.actor_ref( + mo.ActorRef(worker, CustomLogActor.default_uid()) + ) + chunk_op_keys = kwds.pop("chunk_op_keys") + logs = await custom_log_ref.fetch_logs(**kwds) + for chunk_op_key, log_result in zip(chunk_op_keys, logs): + result[chunk_op_key] = log_result + return result + + +class MockSessionAPI(SessionAPI): + @classmethod + async def create(cls, address: str, **kwargs) -> "SessionAPI": + session_id = kwargs.pop("session_id") + if kwargs: # pragma: no cover + raise TypeError(f"SessionAPI.create got unknown arguments: {list(kwargs)}") + + session_manager = await mo.create_actor( + SessionManagerActor, address=address, uid=SessionManagerActor.default_uid() + ) + if session_id: + await session_manager.create_session(session_id, create_services=False) + return MockSessionAPI(address, session_manager) diff --git a/python/xorbits/_mars/services/session/api/web.py b/python/xorbits/_mars/services/session/api/web.py new file mode 100644 index 000000000..4fdda8b23 --- /dev/null +++ b/python/xorbits/_mars/services/session/api/web.py @@ -0,0 +1,176 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +from typing import Callable, Dict, List, Union + +from ....utils import parse_readable_size +from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api +from ..core import SessionInfo +from .core import AbstractSessionAPI + + +def _encode_size(size: Union[str, Dict[str, List[int]]]) -> str: + if not isinstance(size, dict): + return size + else: + return ",".join(f"{k}={v}" for k, v in size.items()) + + +def _decode_size(encoded: str) -> Union[int, str, Dict[str, Union[int, List[int]]]]: + if not encoded: + return 0 + if "," not in encoded and "=" not in encoded: + try: + return int(encoded) + except ValueError: + return int(parse_readable_size(encoded)[0]) + else: + ret = dict() + for kv in encoded.split(","): + k, v = kv.split("=", 1) + ret[k] = int(parse_readable_size(v)[0]) + return ret + + +class SessionWebAPIBaseHandler(MarsServiceWebAPIHandler): + async def _get_oscar_session_api(self): + from .oscar import SessionAPI + + return await self._get_api_by_key(SessionAPI, "Session", with_key_arg=False) + + +class SessionWebAPIHandler(SessionWebAPIBaseHandler): + @classmethod + def get_root_pattern(cls): + return "/api/session(?:/(?P[^/]*)$|$)" + + @web_api("(?P[^/]+)", method="put") + async def create_session(self, session_id: str): + oscar_api = await self._get_oscar_session_api() + addr = await oscar_api.create_session(session_id) + self.write(addr) + + @web_api("(?P[^/]+)", method="delete") + async def delete_session(self, session_id: str): + oscar_api = await self._get_oscar_session_api() + await oscar_api.delete_session(session_id) + + @web_api("", method="delete") + async def delete_all_sessions(self): + oscar_api = await self._get_oscar_session_api() + await oscar_api.delete_all_sessions() + + @web_api( + "(?P[^/]+)", method="get", arg_filter={"action": "check_exist"} + ) + async def has_session(self, session_id: str): + oscar_api = await self._get_oscar_session_api() + res = await oscar_api.has_session(session_id) + self.write("1" if res else "0") + + @web_api( + "(?P[^/]*)", + method="get", + arg_filter={"action": "get_last_idle_time"}, + ) + async def get_last_idle_time(self, session_id: str): + session_id = session_id or None + oscar_api = await self._get_oscar_session_api() + res = await oscar_api.get_last_idle_time(session_id) + self.write(str(res) if res else "") + + @web_api("", method="get") + async def get_sessions(self): + oscar_api = await self._get_oscar_session_api() + res = await oscar_api.get_sessions() + self.write( + json.dumps({"sessions": [{"session_id": info.session_id} for info in res]}) + ) + + +class SessionWebLogAPIHandler(SessionWebAPIBaseHandler): + _root_pattern = "/api/session/(?P[^/]+)/op/(?P[^/]+)/log" + + @web_api("", method="get") + async def fetch_tileable_op_logs(self, session_id: str, op_key: str): + oscar_api = await self._get_oscar_session_api() + offsets = _decode_size(self.get_argument("offsets", None)) + sizes = _decode_size(self.get_argument("sizes", None)) + log_result = await oscar_api.fetch_tileable_op_logs( + session_id, op_key, offsets, sizes + ) + self.write(json.dumps(log_result)) + + +web_handlers = { + SessionWebAPIHandler.get_root_pattern(): SessionWebAPIHandler, + SessionWebLogAPIHandler.get_root_pattern(): SessionWebLogAPIHandler, +} + + +class WebSessionAPI(AbstractSessionAPI, MarsWebAPIClientMixin): + def __init__(self, address: str, request_rewriter: Callable = None): + self._address = address.rstrip("/") + self.request_rewriter = request_rewriter + + async def get_sessions(self) -> 
List[SessionInfo]: + addr = f"{self._address}/api/session" + res = await self._request_url("GET", addr) + res_obj = json.loads(res.body.decode()) + return [SessionInfo(**kw) for kw in res_obj["sessions"]] + + async def create_session(self, session_id: str) -> str: + addr = f"{self._address}/api/session/{session_id}" + res = await self._request_url(path=addr, method="PUT", data=b"") + return res.body.decode() + + async def delete_session(self, session_id: str): + addr = f"{self._address}/api/session/{session_id}" + await self._request_url(path=addr, method="DELETE") + + async def delete_all_sessions(self): + addr = f"{self._address}/api/session" + await self._request_url(path=addr, method="DELETE") + + async def has_session(self, session_id: str): + addr = f"{self._address}/api/session/{session_id}" + params = dict(action="check_exist") + res = await self._request_url("GET", addr, params=params) + return bool(int(res.body.decode())) + + async def get_last_idle_time( + self, session_id: Union[str, None] = None + ) -> Union[float, None]: + session_id = session_id or "" + addr = f"{self._address}/api/session/{session_id}" + params = dict(action="get_last_idle_time") + res = await self._request_url("GET", addr, params=params) + content = res.body.decode() + return float(content) if content else None + + async def fetch_tileable_op_logs( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key_to_offsets: Dict[str, List[int]], + chunk_op_key_to_sizes: Dict[str, List[int]], + ) -> Dict: + addr = f"{self._address}/api/session/{session_id}/op/{tileable_op_key}/log" + params = dict( + offsets=_encode_size(chunk_op_key_to_offsets), + sizes=_encode_size(chunk_op_key_to_sizes), + ) + res = await self._request_url("GET", addr, params=params) + return json.loads(res.body.decode()) diff --git a/python/xorbits/_mars/services/session/core.py b/python/xorbits/_mars/services/session/core.py new file mode 100644 index 000000000..417c92618 --- /dev/null +++ b/python/xorbits/_mars/services/session/core.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...serialization.serializables import Serializable, StringField + + +class SessionInfo(Serializable): + session_id: str = StringField("session_id") diff --git a/python/xorbits/_mars/services/session/supervisor/__init__.py b/python/xorbits/_mars/services/session/supervisor/__init__.py new file mode 100644 index 000000000..c229ed8d4 --- /dev/null +++ b/python/xorbits/_mars/services/session/supervisor/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import SessionActor, SessionManagerActor +from .custom_log import CustomLogMetaActor +from .service import SessionSupervisorService diff --git a/python/xorbits/_mars/services/session/supervisor/core.py b/python/xorbits/_mars/services/session/supervisor/core.py new file mode 100644 index 000000000..f8398cd62 --- /dev/null +++ b/python/xorbits/_mars/services/session/supervisor/core.py @@ -0,0 +1,236 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import functools +import logging +import os +import time +from typing import Dict, List, Optional + +from .... import oscar as mo +from ....utils import to_binary +from ...cluster import ClusterAPI +from ...core import NodeRole, create_service_session, destroy_service_session +from ..core import SessionInfo + +logger = logging.getLogger(__name__) + + +class SessionManagerActor(mo.Actor): + def __init__(self, service_config: Optional[Dict] = None): + self._session_refs: Dict[str, mo.ActorRef] = dict() + self._cluster_api: Optional[ClusterAPI] = None + self._service_config = service_config or dict() + self._stored_last_idle_time = None + + async def __post_create__(self): + self._cluster_api = await ClusterAPI.create(self.address) + self._stored_last_idle_time = time.time() + + async def __pre_destroy__(self): + await asyncio.gather( + *[mo.destroy_actor(ref) for ref in self._session_refs.values()] + ) + + async def create_session(self, session_id: str, create_services: bool = True): + if session_id in self._session_refs: + raise mo.Return(self._session_refs[session_id]) + + [address] = await self._cluster_api.get_supervisors_by_keys([session_id]) + try: + session_actor_ref = await mo.create_actor( + SessionActor, + session_id, + self._service_config, + address=address, + uid=SessionActor.gen_uid(session_id), + allocate_strategy=mo.allocate_strategy.RandomSubPool(), + ) + except IndexError: + # when there is only one supervisor process, strategy RandomSubPool + # fails with IndexError. So we need to retry using strategy Random. 
+ session_actor_ref = await mo.create_actor( + SessionActor, + session_id, + self._service_config, + address=address, + uid=SessionActor.gen_uid(session_id), + allocate_strategy=mo.allocate_strategy.Random(), + ) + self._session_refs[session_id] = session_actor_ref + + # sync ref to other managers + for supervisor_address in await self._cluster_api.get_supervisors(): + if supervisor_address == self.address: + continue + session_manager_ref = await mo.actor_ref( + supervisor_address, SessionManagerActor.default_uid() + ) + await session_manager_ref.add_session_ref(session_id, session_actor_ref) + + # let session actor create session-related services + if create_services: + yield session_actor_ref.create_services() + + raise mo.Return(session_actor_ref) + + def get_sessions(self) -> List[SessionInfo]: + return [ + SessionInfo(session_id=session_id) + for session_id in self._session_refs.keys() + ] + + def get_session_ref(self, session_id: str): + return self._session_refs[session_id] + + def add_session_ref(self, session_id: str, session_actor_ref: mo.ActorRef): + self._session_refs[session_id] = session_actor_ref + + def remove_session_ref(self, session_id: str): + del self._session_refs[session_id] + + def has_session(self, session_id: str): + return session_id in self._session_refs + + async def delete_session(self, session_id): + session_actor_ref = self._session_refs.pop(session_id) + await session_actor_ref.remove() + await mo.destroy_actor(session_actor_ref) + + # sync removing to other managers + for supervisor_address in await self._cluster_api.get_supervisors(): + if supervisor_address == self.address: + continue + session_manager_ref = await mo.actor_ref( + supervisor_address, SessionManagerActor.default_uid() + ) + await session_manager_ref.remove_session_ref(session_id) + + async def delete_all_sessions(self): + for session_id in list(self._session_refs): + await self.delete_session(session_id) + + async def get_last_idle_time(self, session_id=None): + if session_id is not None: + session = self._session_refs[session_id] + raise mo.Return(await session.get_last_idle_time()) + else: + all_last_idle_time = yield asyncio.gather( + *[ + session.get_last_idle_time() + for session in self._session_refs.values() + ] + ) + if any(last_idle_time is None for last_idle_time in all_last_idle_time): + raise mo.Return(None) + else: + self._stored_last_idle_time = max( + [self._stored_last_idle_time] + all_last_idle_time + ) + raise mo.Return(self._stored_last_idle_time) + + +class SessionActor(mo.Actor): + def __init__(self, session_id: str, service_config: Dict): + self._session_id = session_id + + self._meta_api = None + self._lifecycle_api = None + self._task_api = None + self._scheduling_api = None + + self._service_config = service_config + + self._custom_log_meta_ref = None + + @classmethod + def gen_uid(cls, session_id): + return f"{session_id}_session_actor" + + async def __post_create__(self): + from .custom_log import CustomLogMetaActor + + self._custom_log_meta_ref = await mo.create_actor( + CustomLogMetaActor, + self._session_id, + address=self.address, + uid=CustomLogMetaActor.gen_uid(self._session_id), + ) + logger.debug( + "Session %s actor created on pid: %s", + self._session_id, + os.getpid(), + ) + + async def remove(self): + await destroy_service_session( + NodeRole.SUPERVISOR, self._service_config, self._session_id, self.address + ) + + async def __pre_destroy__(self): + await mo.destroy_actor(self._custom_log_meta_ref) + + async def create_services(self): + from ...task 
import TaskAPI + + await create_service_session( + NodeRole.SUPERVISOR, self._service_config, self._session_id, self.address + ) + if "task" in self._service_config["services"]: + self._task_api = await TaskAPI.create( + session_id=self._session_id, address=self.address + ) + + async def get_last_idle_time(self): + if self._task_api is None: + return None + return await self._task_api.get_last_idle_time() + + async def create_remote_object(self, name: str, object_cls, *args, **kwargs): + return await mo.create_actor( + RemoteObjectActor, + object_cls, + args, + kwargs, + address=self.address, + uid=to_binary(name), + ) + + async def get_remote_object(self, name: str): + return await mo.actor_ref(mo.ActorRef(self.address, to_binary(name))) + + async def destroy_remote_object(self, name: str): + return await mo.destroy_actor(mo.ActorRef(self.address, to_binary(name))) + + +class RemoteObjectActor(mo.Actor): + def __init__(self, object_cls, args, kwargs): + self._object = object_cls(*args, **kwargs) + + def __getattr__(self, attr): + func = getattr(self._object, attr) + if not callable(func): # pragma: no cover + return object.__getattribute__(self._object, attr) + + @functools.wraps(func) + async def wrap(*args, **kwargs): + # return coroutine to not block current actor + if asyncio.iscoroutinefunction(func): + return func(*args, **kwargs) + else: + # for sync call, running in thread + return asyncio.to_thread(func, *args, **kwargs) + + return wrap diff --git a/python/xorbits/_mars/services/session/supervisor/custom_log.py b/python/xorbits/_mars/services/session/supervisor/custom_log.py new file mode 100644 index 000000000..b09abc456 --- /dev/null +++ b/python/xorbits/_mars/services/session/supervisor/custom_log.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path +from collections import defaultdict +from typing import Dict, Tuple + +from .... 
import oscar as mo + + +class CustomLogMetaActor(mo.Actor): + # {tileable_op_key -> {chunk_op_key -> (worker_addr, path)}} + _custom_log_path_store: Dict[str, Dict[str, Tuple[str, str]]] + + def __init__(self, session_id: str): + self._session_id = session_id + self._custom_log_path_store = dict() + + @classmethod + def gen_uid(cls, session_id: str): + return f"custom_log_{session_id}" + + async def __post_create__(self): + from ..worker.custom_log import CustomLogActor + + worker_address_to_paths = defaultdict(set) + for address, path in self._custom_log_path_store.values(): + log_dir = os.path.dirname(path) + worker_address_to_paths[address].add(log_dir) + for address, paths in worker_address_to_paths.items(): + ref = await mo.actor_ref(address, CustomLogActor.default_uid()) + await ref.clear_custom_log_dirs(list(paths)) + + def register_custom_log_path( + self, + tileable_op_key: str, + chunk_op_key: str, + worker_address: str, + log_path: str, + ): + if tileable_op_key not in self._custom_log_path_store: + self._custom_log_path_store[tileable_op_key] = dict() + self._custom_log_path_store[tileable_op_key][chunk_op_key] = ( + worker_address, + log_path, + ) + + def get_tileable_op_log_paths( + self, tileable_op_key: str + ) -> Dict[str, Tuple[str, str]]: + return self._custom_log_path_store.get(tileable_op_key) diff --git a/python/xorbits/_mars/services/session/supervisor/service.py b/python/xorbits/_mars/services/session/supervisor/service.py new file mode 100644 index 000000000..f3a865451 --- /dev/null +++ b/python/xorbits/_mars/services/session/supervisor/service.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .core import SessionManagerActor + + +class SessionSupervisorService(AbstractService): + """ + Session service on supervisor. + + Session Configuration + --------------------- + { + "session" : { + } + } + """ + + async def start(self): + await mo.create_actor( + SessionManagerActor, + self._config, + uid=SessionManagerActor.default_uid(), + address=self._address, + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + uid=SessionManagerActor.default_uid(), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/session/tests/__init__.py b/python/xorbits/_mars/services/session/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/session/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/session/tests/test_service.py b/python/xorbits/_mars/services/session/tests/test_service.py new file mode 100644 index 000000000..7d9b7b086 --- /dev/null +++ b/python/xorbits/_mars/services/session/tests/test_service.py @@ -0,0 +1,206 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import threading +import time + +import numpy as np +import pytest + +from .... import oscar as mo +from .... import remote as mr +from ....core import TileableGraph, TileableGraphBuilder +from ....resource import Resource +from ....utils import get_next_port +from ... import NodeRole, start_services, stop_services +from ...task.api import TaskAPI +from .. import SessionAPI, WebSessionAPI + + +@pytest.mark.parametrize("test_web", [False, True]) +@pytest.mark.asyncio +async def test_session_service(test_web): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + config = { + "services": ["cluster", "session", "meta"], + "cluster": { + "backend": "fixed", + "lookup_address": pool.external_address, + }, + "meta": {"store": "dict"}, + } + if test_web: + config["services"] += ["web"] + config["web"] = {"port": get_next_port()} + + await start_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + + if not test_web: + session_api = await SessionAPI.create(pool.external_address) + else: + session_api = WebSessionAPI(f'http://127.0.0.1:{config["web"]["port"]}') + session_id = "test_session" + session_address = await session_api.create_session(session_id) + assert session_address == pool.external_address + assert await session_api.has_session(session_id) is True + assert (await session_api.get_sessions())[0].session_id == session_id + if not test_web: + assert await session_api.get_session_address(session_id) == session_address + await session_api.delete_session(session_id) + assert await session_api.has_session(session_id) is False + assert await session_api.get_sessions() == [] + await session_api.delete_all_sessions() + assert await session_api.has_session(session_id) is False + + await stop_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + + +@pytest.mark.asyncio +async def test_get_last_idle_time(): + sv_pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + worker_pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=2, + labels=["main"] + ["numa-0"] * 2, + subprocess_start_method="spawn", + ) + async with sv_pool, worker_pool: + config = { + "services": [ + "cluster", + "session", + "meta", + "lifecycle", + "scheduling", + "subtask", + "task", + "mutable", + ], + "cluster": { + "backend": "fixed", + "lookup_address": sv_pool.external_address, + "resource": {"numa-0": Resource(num_cpus=2)}, + }, + "meta": {"store": "dict"}, + } + await start_services( + NodeRole.SUPERVISOR, config, address=sv_pool.external_address + ) + await start_services( + 
NodeRole.WORKER, config, address=worker_pool.external_address + ) + + start_time = time.time() + session_api = await SessionAPI.create(sv_pool.external_address) + assert await session_api.get_last_idle_time() < start_time + + session_id = "test_session" + await session_api.create_session(session_id) + # check last idle time is not None + last_idle_time = await session_api.get_last_idle_time(session_id) + assert last_idle_time is not None + assert await session_api.get_last_idle_time(session_id) == last_idle_time + # submit a task + task_api = await TaskAPI.create(session_id, sv_pool.external_address) + + def f1(): + return np.arange(5) + + def f2(): + return np.arange(5, 10) + + def f3(f1r, f2r): + return np.concatenate([f1r, f2r]).sum() + + r1 = mr.spawn(f1) + r2 = mr.spawn(f2) + r3 = mr.spawn(f3, args=(r1, r2)) + + graph = TileableGraph([r3.data]) + next(TileableGraphBuilder(graph).build()) + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + await task_api.wait_task(task_id) + task_result = await task_api.get_task_result(task_id) + + # the error is Actor b'StorageHandlerActor' does not exist + assert task_result.error is not None + + # the last idle time is changed + new_last_idle_time = await session_api.get_last_idle_time() + assert new_last_idle_time is not None + assert new_last_idle_time != last_idle_time + assert await session_api.get_last_idle_time() == new_last_idle_time + assert new_last_idle_time > last_idle_time + + # blocking task. + def f4(): + import time + + time.sleep(10) + + r4 = mr.spawn(f4) + graph = TileableGraph([r4.data]) + next(TileableGraphBuilder(graph).build()) + await task_api.submit_tileable_graph(graph, fuse_enabled=False) + assert await session_api.get_last_idle_time() is None + + await stop_services( + NodeRole.WORKER, config, address=worker_pool.external_address + ) + await stop_services( + NodeRole.SUPERVISOR, config, address=sv_pool.external_address + ) + + +@pytest.mark.asyncio +async def test_dmap(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + config = { + "services": [ + "cluster", + "session", + "meta", + "lifecycle", + "scheduling", + "subtask", + "task", + "mutable", + ], + "cluster": { + "backend": "fixed", + "lookup_address": pool.external_address, + }, + "meta": {"store": "dict"}, + } + await start_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + + session_api = await SessionAPI.create(pool.external_address) + + session_id = "test_session" + await session_api.create_session(session_id) + lock = await session_api.create_remote_object( + session_id, "my_lock", threading.Lock + ) + await lock.acquire() + lock = await session_api.get_remote_object(session_id, "my_lock") + await lock.release() + with pytest.raises(AttributeError): + await lock.abc() + await session_api.destroy_remote_object(session_id, "my_lock") + + await stop_services(NodeRole.SUPERVISOR, config, address=pool.external_address) diff --git a/python/xorbits/_mars/services/session/worker/__init__.py b/python/xorbits/_mars/services/session/worker/__init__.py new file mode 100644 index 000000000..1467357ae --- /dev/null +++ b/python/xorbits/_mars/services/session/worker/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
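Editor's note: a minimal sketch of how the last-idle-time probe exercised by the test above could be used, e.g. to decide whether a cluster can be torn down. The import path and the helper name supervisor_idle_for are assumptions; the None/timestamp semantics follow the test.

    import time

    from xorbits._mars.services.session import SessionAPI


    async def supervisor_idle_for(supervisor_address: str, seconds: float) -> bool:
        # get_last_idle_time() returns None while a task is still running,
        # otherwise an epoch timestamp of when the supervisor last became idle
        session_api = await SessionAPI.create(supervisor_address)
        last_idle_time = await session_api.get_last_idle_time()
        return last_idle_time is not None and time.time() - last_idle_time >= seconds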
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .custom_log import CustomLogActor +from .service import SessionWorkerService diff --git a/python/xorbits/_mars/services/session/worker/custom_log.py b/python/xorbits/_mars/services/session/worker/custom_log.py new file mode 100644 index 000000000..8441a86d3 --- /dev/null +++ b/python/xorbits/_mars/services/session/worker/custom_log.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile +from typing import Any, Dict, List + +from .... import oscar as mo +from ....lib.aio import AioFileObject + + +class CustomLogActor(mo.Actor): + def __init__(self, custom_log_dir: str): + self._custom_log_dir = custom_log_dir + + @staticmethod + def _get_custom_log_dir(custom_log_dir: str, session_id: str): + if custom_log_dir == "auto": + return tempfile.mkdtemp(prefix=f"marslog-{session_id}") + elif custom_log_dir is None: + return + else: + return os.path.join(custom_log_dir, session_id) + + def new_custom_log_dir(self, session_id: str): + custom_log_dir = self._get_custom_log_dir(self._custom_log_dir, session_id) + if custom_log_dir: + os.makedirs(custom_log_dir, exist_ok=True) + return custom_log_dir + + @classmethod + def clear_custom_log_dirs(cls, paths: List[str]): + _ = [shutil.rmtree(path, ignore_errors=True) for path in paths] + + @classmethod + async def fetch_logs( + cls, log_paths: List[str], offsets: List[int], sizes: List[int] + ) -> List[Dict[str, Any]]: + result = [] + for i, log_path in enumerate(log_paths): + log_result = dict() + + offset = offsets[i] + size = sizes[i] + + async with AioFileObject(open(log_path)) as f: + if offset < 0: + # process negative offset + offset = max(os.path.getsize(log_path) + offset, 0) + + if offset: + await f.seek(offset) + + log_result["log"] = await f.read(size) + log_result["offset"] = await f.tell() + + result.append(log_result) + + return result diff --git a/python/xorbits/_mars/services/session/worker/service.py b/python/xorbits/_mars/services/session/worker/service.py new file mode 100644 index 000000000..6cf44f496 --- /dev/null +++ b/python/xorbits/_mars/services/session/worker/service.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
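Editor's note: a small sketch of the negative-offset convention implemented by CustomLogActor.fetch_logs above (offsets below zero are taken relative to the end of the file, like a tail read). It assumes this patch is installed so the module is importable and uses a throwaway temp file.

    import asyncio
    import tempfile

    from xorbits._mars.services.session.worker.custom_log import CustomLogActor


    async def demo():
        with tempfile.NamedTemporaryFile("w", suffix=".log", delete=False) as f:
            f.write("line1\nline2\nline3\n")
            path = f.name
        # a negative offset is resolved against the file size, so this reads
        # the last 12 bytes ("line2\nline3\n") and reports the new offset
        [result] = await CustomLogActor.fetch_logs([path], [-12], [1024])
        print(result["log"], result["offset"])


    asyncio.run(demo())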
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .custom_log import CustomLogActor + + +class SessionWorkerService(AbstractService): + """ + Session service on worker. + + Service Configuration + --------------------- + { + "session" : { + } + } + """ + + async def start(self): + session_config = self._config.get("session", dict()) + custom_log_dir = session_config.get("custom_log_dir") + await mo.create_actor( + CustomLogActor, + custom_log_dir, + address=self._address, + uid=CustomLogActor.default_uid(), + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref(uid=CustomLogActor.default_uid(), address=self._address) + ) diff --git a/python/xorbits/_mars/services/storage/__init__.py b/python/xorbits/_mars/services/storage/__init__.py new file mode 100644 index 000000000..9743eaac7 --- /dev/null +++ b/python/xorbits/_mars/services/storage/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import MockStorageAPI, StorageAPI, WebStorageAPI +from .core import DataInfo +from .errors import DataNotExist diff --git a/python/xorbits/_mars/services/storage/api/__init__.py b/python/xorbits/_mars/services/storage/api/__init__.py new file mode 100644 index 000000000..5c0ef6c3f --- /dev/null +++ b/python/xorbits/_mars/services/storage/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractStorageAPI +from .oscar import MockStorageAPI, StorageAPI +from .web import WebStorageAPI diff --git a/python/xorbits/_mars/services/storage/api/core.py b/python/xorbits/_mars/services/storage/api/core.py new file mode 100644 index 000000000..401859776 --- /dev/null +++ b/python/xorbits/_mars/services/storage/api/core.py @@ -0,0 +1,81 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
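Editor's note: a sketch of the worker-side service config consumed by SessionWorkerService above. The cluster/meta values mirror the test configs elsewhere in this diff; the lookup address is a placeholder. "custom_log_dir" may be "auto" (a per-session temp dir), a base directory, or left unset to skip creating log dirs.

    from xorbits._mars.services import NodeRole, start_services

    config = {
        "services": ["cluster", "session", "meta"],
        # placeholder supervisor address; "fixed" backend as in the tests
        "cluster": {"backend": "fixed", "lookup_address": "127.0.0.1:7777"},
        "meta": {"store": "dict"},
        # read by SessionWorkerService and passed to CustomLogActor
        "session": {"custom_log_dir": "auto"},
    }
    # inside an async context on the worker pool:
    # await start_services(NodeRole.WORKER, config, address=worker_pool_address)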
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Any, List
+
+from ....storage.base import StorageLevel
+from ..core import DataInfo
+
+
+class AbstractStorageAPI(ABC):
+    @abstractmethod
+    async def get(
+        self, data_key: str, conditions: List = None, error: str = "raise"
+    ) -> Any:
+        """
+        Get object by data key.
+
+        Parameters
+        ----------
+        data_key: str
+            data key to get.
+
+        conditions: List
+            Index conditions to push down
+
+        error: str
+            raise or ignore
+
+        Returns
+        -------
+        object
+        """
+
+    @abstractmethod
+    async def put(
+        self, data_key: str, obj: object, level: StorageLevel = StorageLevel.MEMORY
+    ) -> DataInfo:
+        """
+        Put object into storage.
+
+        Parameters
+        ----------
+        data_key: str
+            data key to put.
+        obj: object
+            object to put.
+        level: StorageLevel
+            the storage level to put into, MEMORY as default
+
+        Returns
+        -------
+        object information: ObjectInfo
+            the put object information
+        """
+
+    @abstractmethod
+    async def get_infos(self, data_key: str) -> List[DataInfo]:
+        """
+        Get data information items for specific data key
+
+        Parameters
+        ----------
+        data_key
+
+        Returns
+        -------
+        out
+            List of information for specified key
+        """
diff --git a/python/xorbits/_mars/services/storage/api/oscar.py b/python/xorbits/_mars/services/storage/api/oscar.py
new file mode 100644
index 000000000..18922151b
--- /dev/null
+++ b/python/xorbits/_mars/services/storage/api/oscar.py
@@ -0,0 +1,363 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from typing import Any, List, Tuple, Type, TypeVar, Union
+
+from ....
import oscar as mo +from ....lib.aio import alru_cache +from ....storage.base import StorageFileObject, StorageLevel +from ...cluster import StorageInfo +from ..core import ( + DataInfo, + DataManagerActor, + StorageManagerActor, + WrappedStorageFileObject, +) +from ..handler import StorageHandlerActor +from .core import AbstractStorageAPI + +_is_windows = sys.platform.lower().startswith("win") +APIType = TypeVar("APIType", bound="StorageAPI") + + +class StorageAPI(AbstractStorageAPI): + _storage_handler_ref: mo.ActorRefType[StorageHandlerActor] + _data_manager_ref: mo.ActorRefType[DataManagerActor] + + def __init__(self, address: str, session_id: str, band_name: str): + self._address = address + self._session_id = session_id + self._band_name = band_name + + async def _init(self): + self._storage_handler_ref = await mo.actor_ref( + self._address, StorageHandlerActor.gen_uid(self._band_name) + ) + self._data_manager_ref = await mo.actor_ref( + self._address, DataManagerActor.default_uid() + ) + + @classmethod + @alru_cache(cache_exceptions=False) + async def create( + cls: Type[APIType], + session_id: str, + address: str, + band_name: str = "numa-0", + **kwargs, + ) -> APIType: + """ + Create storage API. + + Parameters + ---------- + session_id: str + session id + + address: str + worker address + + band_name: str + name of band, default as 'numa-0' + + Returns + ------- + storage_api + Storage api. + """ + if kwargs: # pragma: no cover + raise TypeError(f'Got unexpected arguments: {",".join(kwargs)}') + api = StorageAPI(address, session_id, band_name) + await api._init() + return api + + async def is_seekable(self, storage_level: StorageLevel = None) -> bool: + """ + If storage backend is seekable. + """ + return await self._storage_handler_ref.is_seekable(storage_level) + + @mo.extensible + async def get( + self, data_key: str, conditions: List = None, error: str = "raise" + ) -> Any: + return await self._storage_handler_ref.get( + self._session_id, data_key, conditions, error + ) + + @get.batch + async def batch_get(self, args_list, kwargs_list): + gets = [] + for args, kwargs in zip(args_list, kwargs_list): + gets.append( + self._storage_handler_ref.get.delay(self._session_id, *args, **kwargs) + ) + return await self._storage_handler_ref.get.batch(*gets) + + @mo.extensible + async def put( + self, data_key: str, obj: object, level: StorageLevel = None + ) -> DataInfo: + return await self._storage_handler_ref.put( + self._session_id, data_key, obj, level + ) + + @put.batch + async def batch_put(self, args_list, kwargs_list): + puts = [] + for args, kwargs in zip(args_list, kwargs_list): + puts.append( + self._storage_handler_ref.put.delay(self._session_id, *args, **kwargs) + ) + return await self._storage_handler_ref.put.batch(*puts) + + @mo.extensible + async def get_infos(self, data_key: str) -> List[DataInfo]: + """ + Get data information items for specific data key + + Parameters + ---------- + data_key + + Returns + ------- + out + List of information for specified key + """ + return await self._data_manager_ref.get_data_infos( + self._session_id, data_key, self._band_name + ) + + @mo.extensible + async def delete(self, data_key: str, error: str = "raise"): + """ + Delete object. 
+ + Parameters + ---------- + data_key: str + object key to delete + error: str + raise or ignore + """ + await self._storage_handler_ref.delete(self._session_id, data_key, error=error) + + @delete.batch + async def batch_delete(self, args_list, kwargs_list): + deletes = [] + for args, kwargs in zip(args_list, kwargs_list): + deletes.append( + self._storage_handler_ref.delete.delay( + self._session_id, *args, **kwargs + ) + ) + return await self._storage_handler_ref.delete.batch(*deletes) + + @mo.extensible + async def fetch( + self, + data_key: Union[str, Tuple], + level: StorageLevel = None, + band_name: str = None, + remote_address: str = None, + error: str = "raise", + ): + """ + Fetch object from remote worker or load object from disk. + + Parameters + ---------- + data_key: str or tuple + data key(tuple when is shuffle key) to fetch to current worker + with specific level. + level: StorageLevel + the storage level to put into, MEMORY as default + band_name: BandType + put data on specific band + remote_address: + remote address that stores the data + error: str + raise or ignore + """ + await self._storage_handler_ref.fetch_batch( + self._session_id, [data_key], level, band_name, remote_address, error + ) + + @fetch.batch + async def batch_fetch(self, args_list, kwargs_list): + extracted_args = [] + data_keys = [] + for args, kwargs in zip(args_list, kwargs_list): + data_key, level, band_name, dest_address, error = self.fetch.bind( + *args, **kwargs + ) + if extracted_args: + assert extracted_args == (level, band_name, dest_address, error) + extracted_args = (level, band_name, dest_address, error) + data_keys.append(data_key) + await self._storage_handler_ref.fetch_batch( + self._session_id, data_keys, *extracted_args + ) + + @mo.extensible + async def unpin(self, data_key: str, error: str = "raise"): + """ + Unpin the data, allow storage to release the data. + + Parameters + ---------- + data_key: str + data key to unpin + error: str + raise or ignore + """ + await self._storage_handler_ref.unpin(self._session_id, data_key, error) + + @unpin.batch + async def batch_unpin(self, args_list, kwargs_list): + unpins = [] + for args, kwargs in zip(args_list, kwargs_list): + data_key, error = self.unpin.bind(*args, **kwargs) + unpins.append( + self._storage_handler_ref.unpin.delay(self._session_id, data_key, error) + ) + return await self._storage_handler_ref.unpin.batch(*unpins) + + async def open_reader(self, data_key: str) -> StorageFileObject: + """ + Return a file-like object for reading. + + Parameters + ---------- + data_key: str + data key + + Returns + ------- + return a file-like object. + """ + return await self._storage_handler_ref.open_reader(self._session_id, data_key) + + async def open_writer( + self, data_key: Union[Tuple, str], size: int, level: StorageLevel = None + ) -> WrappedStorageFileObject: + """ + Return a file-like object for writing data. + + Parameters + ---------- + data_key: str or tuple + data key + size: int + the total size of data + level: StorageLevel + the storage level to write + + Returns + ------- + return a file-like object. + """ + return await self._storage_handler_ref.open_writer( + self._session_id, data_key, size, level + ) + + async def list(self, level: StorageLevel) -> List: + """ + List all stored data_keys in storage. 
+ + Parameters + ---------- + level: StorageLevel + the storage level to list all objects + + Returns + ------- + list of data keys + """ + return await self._storage_handler_ref.list(level=level) + + async def get_storage_level_info(self, level: StorageLevel) -> StorageInfo: + """ + Get storage level's info. + + Parameters + ---------- + level : StorageLevel + Storage level. + + Returns + ------- + storage_level_info : StorageInfo + """ + return await self._storage_handler_ref.get_storage_level_info(level) + + async def get_storage_info(self, level: StorageLevel) -> dict: + """ + Get the customized storage backend info of requested storage backend. + + Parameters + ---------- + level: StorageLevel + the storage level to fetch the backend info. + + Returns + ------- + info : dict + Customized storage backend info dict. + """ + return await self._storage_handler_ref.get_storage_backend_info(level) + + +class MockStorageAPI(StorageAPI): + @classmethod + async def create( + cls: Type[APIType], session_id: str, address: str, **kwargs + ) -> APIType: + from ..core import StorageManagerActor + + storage_configs = kwargs.get("storage_configs") + if not storage_configs: + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + plasma_setup_params = dict( + store_memory=10 * 1024 * 1024, + plasma_directory=plasma_dir, + check_dir_size=False, + ) + if _is_windows: + storage_configs = {"shared_memory": {}} + else: + storage_configs = { + "plasma": plasma_setup_params, + } + + storage_handler_cls = kwargs.pop("storage_handler_cls", StorageHandlerActor) + await mo.create_actor( + StorageManagerActor, + storage_configs, + storage_handler_cls=storage_handler_cls, + uid=StorageManagerActor.default_uid(), + address=address, + ) + return await super().create(address=address, session_id=session_id) + + @classmethod + async def cleanup(cls: Type[APIType], address: str): + await mo.destroy_actor( + await mo.actor_ref(address, StorageManagerActor.default_uid()) + ) diff --git a/python/xorbits/_mars/services/storage/api/web.py b/python/xorbits/_mars/services/storage/api/web.py new file mode 100644 index 000000000..bca97ff7d --- /dev/null +++ b/python/xorbits/_mars/services/storage/api/web.py @@ -0,0 +1,170 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from typing import Any, Callable, List + +from .... 
import oscar as mo
+from ....storage import StorageLevel
+from ....utils import deserialize_serializable, serialize_serializable
+from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api
+from ..core import DataInfo
+from .core import AbstractStorageAPI
+
+
+class StorageWebAPIHandler(MarsServiceWebAPIHandler):
+    _root_pattern = "/api/session/(?P<session_id>[^/]+)/storage"
+
+    async def _get_oscar_meta_api(self, session_id: str):
+        from ...meta import MetaAPI
+
+        return await self._get_api_by_key(MetaAPI, session_id)
+
+    async def _get_storage_api_by_object_id(self, session_id: str, object_id: str):
+        from .oscar import StorageAPI
+
+        meta_api = await self._get_oscar_meta_api(session_id)
+        bands = (await meta_api.get_chunk_meta(object_id, ["bands"])).get("bands")
+        if not bands:
+            raise KeyError
+        return await StorageAPI.create(session_id, bands[0][0], bands[0][1])
+
+    @web_api("(?P<data_key>[^/]+)", method="get")
+    async def get_data(self, session_id: str, data_key: str):
+        oscar_api = await self._get_storage_api_by_object_id(session_id, data_key)
+        result = await oscar_api.get(data_key)
+        self.write(serialize_serializable(result))
+
+    @web_api("batch/get", method="post")
+    async def get_batch_data(self, session_id: str):
+        body_args = deserialize_serializable(self.request.body)
+        storage_api_to_gets = defaultdict(list)
+        storage_api_to_idx = defaultdict(list)
+        results = [None] * len(body_args)
+        for i, (data_key, conditions, error) in enumerate(body_args):
+            oscar_api = await self._get_storage_api_by_object_id(session_id, data_key)
+            storage_api_to_idx[oscar_api].append(i)
+            storage_api_to_gets[oscar_api].append(
+                oscar_api.get.delay(data_key, conditions=conditions, error=error)
+            )
+        for api, fetches in storage_api_to_gets.items():
+            data_list = await api.get.batch(*fetches)
+            for idx, data in zip(storage_api_to_idx[api], data_list):
+                results[idx] = data
+        res_data = serialize_serializable(results)
+        self.write(res_data)
+
+    @web_api("(?P<data_key>[^/]+)", method="post")
+    async def get_data_by_post(self, session_id: str, data_key: str):
+        body_args = (
+            deserialize_serializable(self.request.body) if self.request.body else None
+        )
+        conditions = body_args.get("conditions")
+
+        oscar_api = await self._get_storage_api_by_object_id(session_id, data_key)
+        result = await oscar_api.get(data_key, conditions)
+        self.write(serialize_serializable(result))
+
+    @web_api("(?P<data_key>[^/]+)", method="put")
+    async def put_data(self, session_id: str, data_key: str):
+        level = self.get_argument("level", None) or "MEMORY"
+        level = getattr(StorageLevel, level.upper())
+
+        oscar_api = await self._get_storage_api_by_object_id(session_id, data_key)
+        res = await oscar_api.put(
+            data_key, deserialize_serializable(self.request.body), level
+        )
+        self.write(serialize_serializable(res))
+
+    @web_api("(?P<data_key>[^/]+)", method="get", arg_filter={"action": "get_infos"})
+    async def get_infos(self, session_id: str, data_key: str):
+        oscar_api = await self._get_storage_api_by_object_id(session_id, data_key)
+        res = await oscar_api.get_infos(data_key)
+        self.write(serialize_serializable(res))
+
+
+web_handlers = {StorageWebAPIHandler.get_root_pattern(): StorageWebAPIHandler}
+
+
+class WebStorageAPI(AbstractStorageAPI, MarsWebAPIClientMixin):
+    def __init__(
+        self,
+        session_id: str,
+        address: str,
+        band_name: str,
+        request_rewriter: Callable = None,
+    ):
+        self._session_id = session_id
+        self._address = address.rstrip("/")
+        self._band_name = band_name
+        self.request_rewriter = request_rewriter
+
+    @mo.extensible
+    async def get(
+ self, data_key: str, conditions: List = None, error: str = "raise" + ) -> Any: + path = f"{self._address}/api/session/{self._session_id}/storage/{data_key}" + params = dict(error=error) + if conditions is not None: + params["conditions"] = conditions + body = serialize_serializable(params) + res = await self._request_url( + path=path, + method="POST", + headers={"Content-Type": "application/octet-stream"}, + data=body, + ) + return deserialize_serializable(res.body) + + @get.batch + async def get_batch(self, args_list, kwargs_list): + get_chunks = [] + for args, kwargs in zip(args_list, kwargs_list): + data_key, conditions, error = self.get.bind(*args, **kwargs) + get_chunks.append([data_key, conditions, error]) + + path = f"{self._address}/api/session/{self._session_id}/storage/batch/get" + res = await self._request_url( + path=path, + method="POST", + data=serialize_serializable(get_chunks), + ) + return deserialize_serializable(res.body) + + @mo.extensible + async def put( + self, data_key: str, obj: object, level: StorageLevel = StorageLevel.MEMORY + ) -> DataInfo: + params = dict(level=level.name.lower()) + path = f"{self._address}/api/session/{self._session_id}/storage/{data_key}" + res = await self._request_url( + path=path, + method="PUT", + params=params, + headers={"Content-Type": "application/octet-stream"}, + data=serialize_serializable(obj), + ) + return deserialize_serializable(res.body) + + @mo.extensible + async def get_infos(self, data_key: str) -> List[DataInfo]: + path = f"{self._address}/api/session/{self._session_id}/storage/{data_key}" + params = dict(action="get_infos") + res = await self._request_url( + path=path, + method="GET", + headers={"Content-Type": "application/octet-stream"}, + params=params, + ) + return deserialize_serializable(res.body) diff --git a/python/xorbits/_mars/services/storage/core.py b/python/xorbits/_mars/services/storage/core.py new file mode 100644 index 000000000..caded2679 --- /dev/null +++ b/python/xorbits/_mars/services/storage/core.py @@ -0,0 +1,654 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +from ... 
import oscar as mo +from ...lib.aio import AioFileObject +from ...oscar.backends.allocate_strategy import IdleLabel, NoIdleSlot +from ...resource import cuda_card_stats +from ...storage import StorageLevel, get_storage_backend +from ...storage.base import ObjectInfo, StorageBackend +from ...storage.core import StorageFileObject +from ...utils import dataslots +from .errors import DataNotExist, StorageFull + +logger = logging.getLogger(__name__) + + +def build_data_info(storage_info: ObjectInfo, level, size, band_name=None): + # todo handle multiple + if band_name is None: + band_name = ( + "numa-0" if storage_info.device is None else f"gpu-{storage_info.device}" + ) + if storage_info.size is None: + store_size = size + else: + store_size = storage_info.size + return DataInfo(storage_info.object_id, level, size, store_size, band_name) + + +class WrappedStorageFileObject(AioFileObject): + """ + Wrap to hold ref after write close + """ + + def __init__( + self, + file: StorageFileObject, + level: StorageLevel, + size: int, + session_id: str, + data_key: Union[str, Tuple], + data_manager: mo.ActorRefType["DataManagerActor"], + storage_handler: StorageBackend, + ): + self._object_id = file.object_id + super().__init__(file) + self._size = size + self._level = level + self._session_id = session_id + self._data_key = data_key + self._data_manager = data_manager + self._storage_handler = storage_handler + # infos for multiple data + self._sub_key_infos = dict() + + def __getattr__(self, item): + return getattr(self._file, item) + + def commit_once(self, sub_key: Tuple, offset: int, size: int): + self._sub_key_infos[sub_key] = (offset, size) + + async def clean_up(self): + self._file.close() + + async def close(self): + self._file.close() + if self._object_id is None: + # for some backends like vineyard, + # object id is generated after write close + self._object_id = self._file.object_id + if "w" in self._file.mode: + object_info = await self._storage_handler.object_info(self._object_id) + object_info.size = self._size + data_info = build_data_info(object_info, self._level, self._size) + await self._data_manager.put_data_info( + self._session_id, + self._data_key, + data_info, + object_info, + self._sub_key_infos, + ) + + +class StorageQuotaActor(mo.Actor): + def __init__( + self, + data_manager: mo.ActorRefType["DataManagerActor"], + level: StorageLevel, + total_size: Optional[Union[int, float]], + ): + self._data_manager = data_manager + self._total_size = total_size if total_size is None else total_size * 0.95 + self._used_size = 0 + self._level = level + + @classmethod + def gen_uid(cls, band_name: str, level: StorageLevel): + return f"storage_quota_{band_name}_{level}" + + def update_quota(self, size: int): + self._used_size += size + logger.debug( + "Update %s bytes of %s, used size now is %s", + size, + self._level, + self._used_size, + ) + + def request_quota(self, size: int) -> bool: + if self._total_size is not None and size > self._total_size: # pragma: no cover + raise StorageFull( + f"Request size {size} is larger than total size {self._total_size}" + ) + if self._total_size is not None and self._used_size + size > self._total_size: + logger.debug( + "Request %s bytes of %s, used size now is %s," + "space is not enough for the request", + size, + self._level, + self._used_size, + ) + return False + else: + self._used_size += size + logger.debug( + "Request %s bytes of %s, used size now is %s, total size is %s", + size, + self._level, + self._used_size, + self._total_size, + ) + return 
True + + def release_quota(self, size: int): + self._used_size -= size + logger.debug( + "Release %s bytes of %s, used size now is %s, total size is %s", + size, + self._level, + self._used_size, + self._total_size, + ) + + def get_quota(self) -> Tuple[float, float]: + return self._total_size, self._used_size + + +@dataslots +@dataclass +class DataInfo: + object_id: object + level: StorageLevel + memory_size: int + store_size: int + band: str = None + offset: int = None + + +@dataslots +@dataclass +class InternalDataInfo: + data_info: DataInfo + object_info: ObjectInfo + + +@dataslots +@dataclass +class SubInfo: + store_key: str + offset: int + size: int + + +class DataManagerActor(mo.Actor): + _data_key_to_infos: Dict[Tuple, List[InternalDataInfo]] + _data_info_list: Dict[Tuple, Dict] + _spill_strategy: Dict[Tuple, Any] + _sub_key_to_sub_info: Dict[Tuple, SubInfo] + _store_key_to_sub_infos: Dict[Tuple, Dict[Tuple, SubInfo]] + + def __init__(self, bands: List): + from .spill import FIFOStrategy + + # mapping key is (session_id, data_key) + # mapping value is list of InternalDataInfo + self._bands = bands + self._data_key_to_infos = defaultdict(list) + self._data_info_list = dict() + self._spill_strategy = dict() + # data key may be a tuple in shuffle cases, + # we record the mapping from main key to sub keys, + # it's used when decref mapper data using main key + self._main_key_to_sub_keys = defaultdict(set) + # we may store multiple small data into one file, + # it records offset and size. + self._sub_key_to_sub_info = dict() + self._store_key_to_sub_infos = dict() + for level in StorageLevel.__members__.values(): + for band_name in bands: + self._data_info_list[level, band_name] = dict() + self._spill_strategy[level, band_name] = FIFOStrategy(level) + + @mo.extensible + def get_data_infos( + self, + session_id: str, + data_key: Union[str, Tuple], + band_name: str, + error: str = "raise", + ) -> Optional[Union[List[DataInfo], Dict]]: + if (session_id, data_key) in self._data_key_to_infos: + available_infos = [] + for info in self._data_key_to_infos[session_id, data_key]: + info_band = info.data_info.band + if info_band.startswith("gpu-"): # pragma: no cover + # not available for different GPU bands + if info_band == band_name: + available_infos.append(info.data_info) + else: + available_infos.append(info.data_info) + return available_infos + else: + if error == "raise": + raise DataNotExist(f"Data key {session_id, data_key} not exists.") + else: + return + + @mo.extensible + def get_data_info( + self, + session_id: str, + data_key: Union[str, Tuple], + band_name: str = None, + error: str = "raise", + ) -> Union[DataInfo, None]: + sub_info = None + if (session_id, data_key) in self._sub_key_to_sub_info: + sub_info = self._sub_key_to_sub_info[(session_id, data_key)] + data_key = sub_info.store_key + + # if the data is stored in multiply levels, + # return the lowest level info + infos = self.get_data_infos(session_id, data_key, band_name, error) + if not infos: + return + info = sorted(infos, key=lambda x: x.level)[0] + if sub_info is not None: + return DataInfo( + info.object_id, + info.level, + sub_info.size, + sub_info.size, + info.band, + sub_info.offset, + ) + else: + return info + + @mo.extensible + def put_data_info( + self, + session_id: str, + data_key: Union[str, Tuple], + data_info: DataInfo, + object_info: ObjectInfo = None, + sub_key_infos: Dict = None, + ): + info = InternalDataInfo(data_info, object_info) + self._data_key_to_infos[(session_id, data_key)].append(info) + 
self._data_info_list[data_info.level, data_info.band][ + (session_id, data_key) + ] = object_info + self._spill_strategy[data_info.level, data_info.band].record_put_info( + (session_id, data_key), data_info.store_size + ) + if sub_key_infos: + for key, (offset, size) in sub_key_infos.items(): + self._sub_key_to_sub_info[(session_id, key)] = SubInfo( + data_key, offset, size + ) + self._store_key_to_sub_infos[(session_id, data_key)] = sub_key_infos + if isinstance(data_key, tuple): + self._main_key_to_sub_keys[(session_id, data_key[0])].add(data_key) + + @mo.extensible + def delete_data_info( + self, + session_id: str, + data_key: Union[str, Tuple], + level: StorageLevel, + band_name: str, + ): + if (session_id, data_key) in self._data_key_to_infos: + self._data_info_list[level, band_name].pop((session_id, data_key)) + self._spill_strategy[level, band_name].record_delete_info( + (session_id, data_key) + ) + infos = self._data_key_to_infos[(session_id, data_key)] + rest = [info for info in infos if info.data_info.level != level] + if len(rest) == 0: + del self._data_key_to_infos[(session_id, data_key)] + else: # pragma: no cover + self._data_key_to_infos[(session_id, data_key)] = rest + + @mo.extensible + def get_store_key(self, session_id: str, data_key: Union[str, Tuple, List]): + if (session_id, data_key) in self._sub_key_to_sub_info: + return self._sub_key_to_sub_info[(session_id, data_key)].store_key + elif (session_id, data_key) in self._main_key_to_sub_keys: + # only into when delete mapper main key + return list(self._main_key_to_sub_keys[(session_id, data_key)]) + else: + return data_key + + @mo.extensible + def get_sub_infos(self, session_id: str, store_key: str): + if (session_id, store_key) in self._store_key_to_sub_infos: + return self._store_key_to_sub_infos[(session_id, store_key)] + else: + return None + + def list(self, level: StorageLevel, band_name: str): + return list(self._data_info_list[level, band_name].keys()) + + @mo.extensible + def pin(self, session_id, data_key, band_name, error="raise"): + info = self.get_data_info(session_id, data_key, band_name, error=error) + if info is not None: + self._spill_strategy[info.level, info.band].pin_data((session_id, data_key)) + + @mo.extensible + def unpin( + self, + session_id: str, + data_keys: List[str], + band_name: str, + error: str = "raise", + ): + if error not in ("raise", "ignore"): # pragma: no cover + raise ValueError("error must be raise or ignore") + levels = set() + for data_key in data_keys: + info = self.get_data_info(session_id, data_key, band_name, error) + if info: + level = info.level + self._spill_strategy[level, info.band].unpin_data( + (session_id, data_key) + ) + levels.add(level) + return list(levels) + + def get_spillable_size(self, level: StorageLevel, band_name: str): + return self._spill_strategy[level, band_name].get_spillable_size() + + async def get_spill_keys(self, level: StorageLevel, band_name: str, size: int): + return self._spill_strategy[level, band_name].get_spill_keys(size) + + +class StorageManagerActor(mo.StatelessActor): + """ + Storage manager actor, created only on main process, mainly to setup storage backends + and create all the necessary actors for storage service. 
+ """ + + _data_manager: mo.ActorRefType[DataManagerActor] + + def __init__( + self, storage_configs: Dict, transfer_block_size: int = None, **kwargs + ): + from .handler import StorageHandlerActor + + self._handler_cls = kwargs.pop("storage_handler_cls", StorageHandlerActor) + self._storage_configs = storage_configs + self._all_bands = None + self._cluster_api = None + self._upload_task = None + + # params to init and teardown + self._init_params = defaultdict(dict) + self._teardown_params = defaultdict(dict) + self._supervisor_address = None + + # transfer config + self._transfer_block_size = transfer_block_size + self._quotas = None + self._spill_managers = None + + async def __post_create__(self): + from ..cluster.api import ClusterAPI + from .handler import StorageHandlerActor + + try: + self._cluster_api = cluster_api = await ClusterAPI.create(self.address) + band_to_resource = await cluster_api.get_bands() + self._all_bands = [band[1] for band in band_to_resource] + except mo.ActorNotExist: + # in some test cases, cluster service is not available + self._all_bands = ["numa-0"] + + # stores the mapping from data key to storage info + self._data_manager = await mo.create_actor( + DataManagerActor, + self._all_bands, + uid=DataManagerActor.default_uid(), + address=self.address, + ) + + # setup storage backend + await self._setup_storage_backends() + + # create in main process + default_band_name = "numa-0" + await mo.create_actor( + self._handler_cls, + self._init_params[default_band_name], + self._data_manager, + self._spill_managers[default_band_name], + self._quotas[default_band_name], + default_band_name, + uid=StorageHandlerActor.gen_uid(default_band_name), + address=self.address, + ) + + # create handler actors for every process + await self._create_storage_handler_actors() + # create actor for transfer + await self._create_transfer_actors() + await self.upload_disk_info() + # create task for uploading storage usages + self._upload_task = asyncio.create_task(self.upload_storage_info()) + + async def __pre_destroy__(self): + if self._upload_task: + self._upload_task.cancel() + for _, params in self._teardown_params.items(): + for backend, teardown_params in params.items(): + backend_cls = get_storage_backend(backend) + await backend_cls.teardown(**teardown_params) + + async def _setup_storage_backends(self): + from .spill import SpillManagerActor + + self._quotas = quotas = defaultdict(dict) + self._spill_managers = spill_managers = defaultdict(dict) + for backend, setup_params in self._storage_configs.items(): + if backend == "cuda": # pragma: no cover + cuda_infos = await asyncio.to_thread(cuda_card_stats) + storage_bands = [s for s in self._all_bands if s.startswith("gpu-")] + clients = [] + for gpu_band in storage_bands: + index = int(gpu_band[4:]) + size = cuda_infos[index].fb_mem_info.available + params = dict(size=size, **setup_params) + clients.append(await self._setup_storage(gpu_band, backend, params)) + else: + storage_bands = ["numa-0"] + clients = [ + await self._setup_storage(band_name, backend, setup_params) + for band_name in storage_bands + ] + + for level in StorageLevel.__members__.values(): + for client, storage_band in zip(clients, storage_bands): + if client.level & level: + logger.debug( + "Create quota manager for %s, total size is %s", + level, + client.size, + ) + quotas[storage_band][level] = await mo.create_actor( + StorageQuotaActor, + self._data_manager, + level, + client.size, + uid=StorageQuotaActor.gen_uid(storage_band, level), + 
address=self.address, + ) + spill_managers[storage_band][level] = await mo.create_actor( + SpillManagerActor, + level, + uid=SpillManagerActor.gen_uid(storage_band, level), + address=self.address, + ) + + async def _create_storage_handler_actors(self): + from .handler import StorageHandlerActor + from .transfer import ReceiverManagerActor, SenderManagerActor + + for band_name in self._init_params: + strategy = IdleLabel(band_name, "StorageHandler") + sender_strategy = IdleLabel(band_name, "sender") + receiver_strategy = IdleLabel(band_name, "receiver") + init_params = self._get_band_init_params(band_name) + band_spill_managers = self._get_band_spill_managers(band_name) + band_quotas = self._get_band_quota_refs(band_name) + while True: + try: + handler_ref = await mo.create_actor( + self._handler_cls, + init_params, + self._data_manager, + band_spill_managers, + band_quotas, + band_name, + uid=StorageHandlerActor.gen_uid(band_name), + address=self.address, + allocate_strategy=strategy, + ) + # create transfer actor for GPU bands + if band_name.startswith("gpu-"): # pragma: no cover + await mo.create_actor( + SenderManagerActor, + band_name, + data_manager_ref=self._data_manager, + storage_handler_ref=handler_ref, + uid=SenderManagerActor.gen_uid(band_name), + address=self.address, + allocate_strategy=sender_strategy, + ) + await mo.create_actor( + ReceiverManagerActor, + band_quotas, + handler_ref, + address=self.address, + uid=ReceiverManagerActor.gen_uid(band_name), + allocate_strategy=receiver_strategy, + ) + except NoIdleSlot: + break + + async def _create_transfer_actors(self): + from .handler import StorageHandlerActor + from .transfer import ReceiverManagerActor, SenderManagerActor + + default_band_name = "numa-0" + sender_strategy = IdleLabel("io", "sender") + receiver_strategy = IdleLabel("io", "receiver") + handler_strategy = IdleLabel("io", "handler") + while True: + try: + handler_ref = await mo.create_actor( + self._handler_cls, + self._init_params[default_band_name], + self._data_manager, + self._spill_managers[default_band_name], + self._quotas[default_band_name], + default_band_name, + uid=StorageHandlerActor.gen_uid(default_band_name), + address=self.address, + allocate_strategy=handler_strategy, + ) + await mo.create_actor( + SenderManagerActor, + data_manager_ref=self._data_manager, + storage_handler_ref=handler_ref, + uid=SenderManagerActor.gen_uid(default_band_name), + address=self.address, + allocate_strategy=sender_strategy, + ) + + await mo.create_actor( + ReceiverManagerActor, + self._quotas[default_band_name], + handler_ref, + address=self.address, + uid=ReceiverManagerActor.gen_uid(default_band_name), + allocate_strategy=receiver_strategy, + ) + except NoIdleSlot: + break + + def _get_band_init_params(self, band_name): + init_params = self._init_params["numa-0"].copy() + init_params.update(self._init_params[band_name]) + return init_params + + def _get_band_quota_refs(self, band_name): + band_quotas = self._quotas[band_name].copy() + band_quotas.update(self._quotas["numa-0"]) + return band_quotas + + def _get_band_spill_managers(self, band_name): + band_spill_managers = self._spill_managers[band_name].copy() + band_spill_managers.update(self._spill_managers["numa-0"]) + return band_spill_managers + + async def _setup_storage( + self, band_name: str, storage_backend: str, storage_config: Dict + ): + backend = get_storage_backend(storage_backend) + storage_config = storage_config or dict() + init_params, teardown_params = await backend.setup(**storage_config) + 
client = backend(**init_params) + self._init_params[band_name][storage_backend] = init_params + self._teardown_params[band_name][storage_backend] = teardown_params + return client + + def get_client_params(self): + return self._init_params + + async def upload_storage_info(self): + from ..cluster import StorageInfo + + if self._cluster_api is not None: + while True: + upload_tasks = [] + for band, level_to_quota in self._quotas.items(): + for level, quota_ref in level_to_quota.items(): + total, used = await quota_ref.get_quota() + used = int(used) + if total is not None: + total = int(total) + storage_info = StorageInfo( + storage_level=level, total_size=total, used_size=used + ) + upload_tasks.append( + self._cluster_api.set_band_storage_info.delay( + band, storage_info + ) + ) + await self._cluster_api.set_band_storage_info.batch(*upload_tasks) + await asyncio.sleep(0.5) + + async def upload_disk_info(self): + from ..cluster import DiskInfo + + disk_infos = [] + if ( + self._cluster_api is not None + and "filesystem" in self._init_params["numa-0"] + ): + if self._init_params["numa-0"]["filesystem"]["level"] == StorageLevel.DISK: + params = self._init_params["numa-0"]["filesystem"] + size = params["size"] + for path in params["root_dirs"]: + disk_infos.append(DiskInfo(path=path, limit_size=size)) + await self._cluster_api.set_node_disk_info(disk_infos) diff --git a/python/xorbits/_mars/services/storage/errors.py b/python/xorbits/_mars/services/storage/errors.py new file mode 100644 index 000000000..3c31adce6 --- /dev/null +++ b/python/xorbits/_mars/services/storage/errors.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core.base import MarsError +from ...storage.errors import DataNotExist + +DataNotExist = DataNotExist + + +class NoDataToSpill(MarsError): + pass + + +class StorageFull(MarsError): + pass diff --git a/python/xorbits/_mars/services/storage/handler.py b/python/xorbits/_mars/services/storage/handler.py new file mode 100644 index 000000000..f8ad75e49 --- /dev/null +++ b/python/xorbits/_mars/services/storage/handler.py @@ -0,0 +1,700 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from collections import defaultdict +from typing import Any, Dict, List, Union + +from ... 
import oscar as mo +from ...serialization import AioDeserializer +from ...storage import StorageLevel, get_storage_backend +from ...storage.core import StorageFileObject +from ...typing import BandType +from ...utils import calc_data_size, lazy_import +from ..cluster import ClusterAPI, StorageInfo +from ..meta import MetaAPI +from .core import ( + DataInfo, + DataManagerActor, + StorageQuotaActor, + WrappedStorageFileObject, + build_data_info, +) +from .errors import DataNotExist, NoDataToSpill + +cupy = lazy_import("cupy") +cudf = lazy_import("cudf") + +logger = logging.getLogger(__name__) + + +class StorageHandlerActor(mo.Actor): + """ + Storage handler actor, provide methods like `get`, `put`, etc. + This actor is stateful and created on worker's sub pools. + """ + + def __init__( + self, + storage_init_params: Dict, + data_manager_ref: mo.ActorRefType[DataManagerActor], + spill_manager_refs, + quota_refs: Dict[StorageLevel, mo.ActorRefType[StorageQuotaActor]], + band_name: str = "numa-0", + ): + from .spill import SpillManagerActor + + self._storage_init_params = storage_init_params + self._data_manager_ref = data_manager_ref + self._spill_manager_refs: Dict[ + StorageLevel, mo.ActorRefType[SpillManagerActor] + ] = spill_manager_refs + self._quota_refs = quota_refs + self._band_name = band_name + self._supervisor_address = None + + @classmethod + def gen_uid(cls, band_name: str): + return f"storage_handler_{band_name}" + + @property + def highest_level(self): + return min(self._quota_refs) + + async def __post_create__(self): + self._clients = clients = dict() + for backend, init_params in self._storage_init_params.items(): + logger.debug("Start storage %s with params %s", backend, init_params) + storage_cls = get_storage_backend(backend) + client = storage_cls(**init_params) + for level in StorageLevel.__members__.values(): + if client.level & level: + clients[level] = client + + def is_seekable(self, level: StorageLevel): + if level is None: + level = self.highest_level + return self._clients[level].is_seekable + + async def _get_data(self, data_info: DataInfo, conditions: List[Any]): + if data_info.offset is not None: + reader = await self._clients[data_info.level].open_reader( + data_info.object_id + ) + await reader.seek(data_info.offset) + res = await AioDeserializer(reader).run() + if conditions is not None: + try: + res = res.iloc[tuple(conditions)] + except AttributeError: # pragma: no cover + res = res[tuple(conditions)] + elif conditions is None: + res = yield self._clients[data_info.level].get(data_info.object_id) + else: + try: + res = yield self._clients[data_info.level].get( + data_info.object_id, conditions=conditions + ) + except NotImplementedError: + data = yield self._clients[data_info.level].get(data_info.object_id) + try: + sliced_value = data.iloc[tuple(conditions)] + except AttributeError: + sliced_value = data[tuple(conditions)] + res = sliced_value + raise mo.Return(res) + + @mo.extensible + async def get( + self, + session_id: str, + data_key: str, + conditions: List = None, + error: str = "raise", + ): + try: + data_info = await self._data_manager_ref.get_data_info( + session_id, data_key, self._band_name + ) + data = yield self._get_data(data_info, conditions) + raise mo.Return(data) + except DataNotExist: + if error == "raise": + raise + + def _get_data_info( + self, + session_id: str, + data_key: str, + conditions: List = None, + error: str = "raise", + ): + info = self._data_manager_ref.get_data_info.delay( + session_id, data_key, self._band_name, error 
+ ) + return info, conditions + + @get.batch + async def batch_get(self, args_list, kwargs_list): + infos = [] + conditions_list = [] + for args, kwargs in zip(args_list, kwargs_list): + info, conditions = self._get_data_info(*args, **kwargs) + infos.append(info) + conditions_list.append(conditions) + data_infos = await self._data_manager_ref.get_data_info.batch(*infos) + results = [] + writer_args = [ + (info.object_id, info.level) + for info in data_infos + if info is not None and info.offset is not None + ] + object_id_to_reader = dict() + for object_id, level in writer_args: + object_id_to_reader[object_id] = await self._clients[level].open_reader( + object_id + ) + for data_info, conditions in zip(data_infos, conditions_list): + if data_info is None: + results.append(None) + elif data_info.offset is not None: + reader = object_id_to_reader[data_info.object_id] + await reader.seek(data_info.offset) + result = await AioDeserializer(reader).run() + results.append(result) + else: + result = yield self._get_data(data_info, conditions) + results.append(result) + raise mo.Return(results) + + def _get_default_level(self, obj): + obj = obj[0] if isinstance(obj, (list, tuple)) else obj + if self.highest_level != StorageLevel.GPU: + return self.highest_level + else: # pragma: no cover + if cudf is not None and isinstance( + obj, (cudf.DataFrame, cudf.Series, cudf.Index) + ): + return StorageLevel.GPU + elif cupy is not None and isinstance(obj, cupy.ndarray): + return StorageLevel.GPU + else: + return StorageLevel.MEMORY + + @mo.extensible + async def put( + self, session_id: str, data_key: str, obj: object, level: StorageLevel = None + ) -> DataInfo: + if level is None: + level = self._get_default_level(obj) + size = await asyncio.to_thread(calc_data_size, obj) + await self.request_quota_with_spill(level, size) + object_info = await self._clients[level].put(obj) + data_info = build_data_info(object_info, level, size, self._band_name) + await self._data_manager_ref.put_data_info( + session_id, data_key, data_info, object_info + ) + if object_info.size is not None and data_info.memory_size != object_info.size: + await self._quota_refs[level].update_quota( + object_info.size - data_info.memory_size + ) + await self.notify_spillable_space(level) + return data_info + + @put.batch + async def batch_put(self, args_list, kwargs_list): + objs = [] + data_keys = [] + session_id = None + level = last_level = None + sizes = [] + for args, kwargs in zip(args_list, kwargs_list): + session_id, data_key, obj, level = self.put.bind(*args, **kwargs) + if level is None: + level = self._get_default_level(obj) + size = await asyncio.to_thread(calc_data_size, obj) + if last_level is not None: + assert last_level == level + last_level = level + objs.append(obj) + data_keys.append(data_key) + sizes.append(size) + + await self.request_quota_with_spill(level, sum(sizes)) + + data_infos = [] + put_infos = [] + quota_delta = 0 + for size, data_key, obj in zip(sizes, data_keys, objs): + object_info = await self._clients[level].put(obj) + data_info = build_data_info(object_info, level, size, self._band_name) + data_infos.append(data_info) + if ( + object_info.size is not None + and data_info.memory_size != object_info.size + ): + # we request memory size before putting, when put finishes, + # update quota to the true store size + quota_delta += object_info.size - data_info.memory_size + put_infos.append( + self._data_manager_ref.put_data_info.delay( + session_id, data_key, data_info, object_info + ) + ) + await 
self._quota_refs[level].update_quota(quota_delta) + await self._data_manager_ref.put_data_info.batch(*put_infos) + await self.notify_spillable_space(level) + return data_infos + + async def delete_object( + self, + session_id: str, + data_key: Any, + data_size: Union[int, float], + object_id: Any, + level: StorageLevel, + ): + data_key = await self._data_manager_ref.get_store_key(session_id, data_key) + await self._data_manager_ref.delete_data_info( + session_id, data_key, level, self._band_name + ) + await self._clients[level].delete(object_id) + await self._quota_refs[level].release_quota(data_size) + + @mo.extensible + async def delete(self, session_id: str, data_key: str, error: str = "raise"): + if error not in ("raise", "ignore"): # pragma: no cover + raise ValueError("error must be raise or ignore") + + data_key = await self._data_manager_ref.get_store_key(session_id, data_key) + if isinstance(data_key, list): + # delete mapper main key + data_keys = data_key + else: + data_keys = [data_key] + for data_key in data_keys: + all_infos = await self._data_manager_ref.get_data_infos( + session_id, data_key, self._band_name, error + ) + if not all_infos: + return + + key_to_infos = ( + all_infos if isinstance(all_infos, dict) else {data_key: all_infos} + ) + + for key, infos in key_to_infos.items(): + for info in infos: + level = info.level + await self._data_manager_ref.delete_data_info( + session_id, key, level, self._band_name + ) + await self._clients[level].delete(info.object_id) + await self._quota_refs[level].release_quota(info.store_size) + + @delete.batch + async def batch_delete(self, args_list, kwargs_list): + session_id = None + error = None + data_keys = [] + for args, kwargs in zip(args_list, kwargs_list): + session_id, data_key, error = self.delete.bind(*args, **kwargs) + data_keys.append( + self._data_manager_ref.get_store_key.delay(session_id, data_key) + ) + store_keys = await self._data_manager_ref.get_store_key.batch(*data_keys) + data_keys = set() + for k in store_keys: + if isinstance(k, list): + data_keys.update(set(k)) + else: + data_keys.add(k) + + infos_list = await self._data_manager_ref.get_data_infos.batch( + *[ + self._data_manager_ref.get_data_infos.delay( + session_id, data_key, self._band_name, error + ) + for data_key in data_keys + ] + ) + + delete_infos = [] + to_removes = [] + level_sizes = defaultdict(lambda: 0) + for all_infos, data_key in zip(infos_list, data_keys): + if not all_infos: + # data not exist and error == 'ignore' + continue + key_to_infos = ( + all_infos if isinstance(all_infos, dict) else {data_key: all_infos} + ) + + for key, infos in key_to_infos.items(): + for info in infos: + level = info.level + delete_infos.append( + self._data_manager_ref.delete_data_info.delay( + session_id, key, level, info.band + ) + ) + to_removes.append((level, info.object_id)) + level_sizes[level] += info.store_size + + if not delete_infos: + # no data to remove + return + + await self._data_manager_ref.delete_data_info.batch(*delete_infos) + await asyncio.gather( + *[self._clients[level].delete(object_id) for level, object_id in to_removes] + ) + for level, size in level_sizes.items(): + await self._quota_refs[level].release_quota(size) + + @mo.extensible + async def open_reader(self, session_id: str, data_key: str) -> StorageFileObject: + data_info = await self._data_manager_ref.get_data_info( + session_id, data_key, self._band_name + ) + reader = await self._clients[data_info.level].open_reader(data_info.object_id) + return reader + + @open_reader.batch 
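+ # Like the other batch handlers in this actor (e.g. batch_get, batch_put), the method below receives + # the (args, kwargs) pairs collected through open_reader.delay(...) and serves them in one batched call.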
+ async def batch_open_readers(self, args_list, kwargs_list): + get_data_infos = [] + for args, kwargs in zip(args_list, kwargs_list): + get_data_infos.append( + self._data_manager_ref.get_data_info.delay( + *args, band_name=self._band_name, **kwargs + ) + ) + data_infos = await self._data_manager_ref.get_data_info.batch(*get_data_infos) + return await asyncio.gather( + *[ + self._clients[data_info.level].open_reader(data_info.object_id) + for data_info in data_infos + ] + ) + + @mo.extensible + async def open_writer( + self, + session_id: str, + data_key: str, + size: int, + level: StorageLevel, + request_quota=True, + ) -> WrappedStorageFileObject: + if level is None: + level = self.highest_level + if request_quota: + await self.request_quota_with_spill(level, size) + writer = await self._clients[level].open_writer(size) + return WrappedStorageFileObject( + writer, + level, + size, + session_id, + data_key, + self._data_manager_ref, + self._clients[level], + ) + + @open_writer.batch + async def batch_open_writers(self, args_list, kwargs_list): + extracted_args = None + data_keys, sizes = [], [] + for args, kwargs in zip(args_list, kwargs_list): + session_id, data_key, size, level, request_quota = self.open_writer.bind( + *args, **kwargs + ) + if extracted_args: + assert extracted_args == (session_id, level, request_quota) + extracted_args = (session_id, level, request_quota) + data_keys.append(data_key) + sizes.append(size) + session_id, level, request_quota = extracted_args + if level is None: # pragma: no cover + level = self.highest_level + if request_quota: # pragma: no cover + await self.request_quota_with_spill(level, sum(sizes)) + writers = await asyncio.gather( + *[self._clients[level].open_writer(size) for size in sizes] + ) + wrapped_writers = [] + for writer, size, data_key in zip(writers, sizes, data_keys): + wrapped_writers.append( + WrappedStorageFileObject( + writer, + level, + size, + session_id, + data_key, + self._data_manager_ref, + self._clients[level], + ) + ) + return wrapped_writers + + async def _get_meta_api(self, session_id: str): + if self._supervisor_address is None: + cluster_api = await ClusterAPI.create(self.address) + [self._supervisor_address] = await cluster_api.get_supervisors_by_keys( + [session_id] + ) + + return await MetaAPI.create( + session_id=session_id, address=self._supervisor_address + ) + + async def _fetch_remote( + self, + session_id: str, + data_keys: List[Union[str, tuple]], + remote_band: BandType, + error: str, + ): + remote_manager_ref: mo.ActorRefType[DataManagerActor] = await mo.actor_ref( + uid=DataManagerActor.default_uid(), address=remote_band[0] + ) + get_data_infos = [] + for data_key in data_keys: + get_data_infos.append( + remote_manager_ref.get_data_info.delay(session_id, data_key, error) + ) + data_infos = await remote_manager_ref.get_data_info.batch(*get_data_infos) + data_infos, data_keys = zip( + *[ + (data_info, data_key) + for data_info, data_key in zip(data_infos, data_keys) + if data_info is not None + ] + ) + put_data_info_delays = [] + fetch_tasks = [] + for data_info, data_key in zip(data_infos, data_keys): + put_data_info_delays.append( + self._data_manager_ref.put_data_info.delay( + session_id, data_key, data_info, None + ) + ) + fetch_tasks.append( + self._clients[StorageLevel.REMOTE].fetch(data_info.object_id) + ) + await self._data_manager_ref.put_data_info.batch(*put_data_info_delays) + await asyncio.gather(*fetch_tasks) + + async def _fetch_via_transfer( + self, + session_id: str, + data_keys: 
List[Union[str, tuple]], + level: StorageLevel, + remote_band: BandType, + fetch_band_name: str, + error: str, + ): + from .transfer import SenderManagerActor + + logger.debug("Begin to fetch %s from band %s", data_keys, remote_band) + sender_ref: mo.ActorRefType[SenderManagerActor] = await mo.actor_ref( + address=remote_band[0], uid=SenderManagerActor.gen_uid(remote_band[1]) + ) + await sender_ref.send_batch_data( + session_id, + data_keys, + self._data_manager_ref.address, + level, + fetch_band_name, + error=error, + ) + logger.debug("Finish fetching %s from band %s", data_keys, remote_band) + + async def fetch_batch( + self, + session_id: str, + data_keys: List[str], + level: StorageLevel, + band_name: str, + address: str, + error: str, + ): + if error not in ("raise", "ignore"): # pragma: no cover + raise ValueError("error must be raise or ignore") + + meta_api = await self._get_meta_api(session_id) + remote_keys = defaultdict(set) + missing_keys = [] + get_metas = [] + get_info_delays = [] + for data_key in data_keys: + get_info_delays.append( + self._data_manager_ref.get_data_info.delay( + session_id, data_key, band_name, error="ignore" + ) + ) + data_infos = await self._data_manager_ref.get_data_info.batch(*get_info_delays) + pin_delays = [] + for data_key, info in zip(data_keys, data_infos): + # for gpu bands, need transfer between gpu cards + if info is not None: + if band_name and band_name != info.band: + missing_keys.append(data_key) + else: + pin_delays.append( + self._data_manager_ref.pin.delay( + session_id, data_key, self._band_name + ) + ) + else: + # Not exists in local, fetch from remote worker + missing_keys.append(data_key) + if address is None or band_name is None: + # some mapper keys are absent, specify error='ignore' + # remember that meta only records those main keys + get_metas = [ + ( + meta_api.get_chunk_meta.delay( + data_key[0] if isinstance(data_key, tuple) else data_key, + fields=["bands"], + error="ignore", + ) + ) + for data_key in missing_keys + ] + await self._data_manager_ref.pin.batch(*pin_delays) + + if get_metas: + metas = await meta_api.get_chunk_meta.batch(*get_metas) + else: # pragma: no cover + metas = [{"bands": [(address, band_name)]}] * len(missing_keys) + assert len(metas) == len(missing_keys) + for data_key, bands in zip(missing_keys, metas): + if bands is not None: + remote_keys[bands["bands"][0]].add(data_key) + transfer_tasks = [] + fetch_keys = [] + for band, keys in remote_keys.items(): + if StorageLevel.REMOTE in self._quota_refs: + # if storage support remote level, just fetch object id + transfer_tasks.append( + self._fetch_remote(session_id, list(keys), band, error) + ) + else: + # fetch via transfer + transfer_tasks.append( + self._fetch_via_transfer( + session_id, list(keys), level, band, band_name or band[1], error + ) + ) + fetch_keys.extend(list(keys)) + + await asyncio.gather(*transfer_tasks) + + set_meta_keys = set() + for data_key in fetch_keys: + # skip shuffle keys + if isinstance(data_key, tuple): + set_meta_keys.add(data_key[0]) + else: + set_meta_keys.add(data_key) + append_bands_delays = [ + meta_api.add_chunk_bands.delay(key, [(self.address, self._band_name)]) + for key in set_meta_keys + ] + + if append_bands_delays: + await meta_api.add_chunk_bands.batch(*append_bands_delays) + + async def request_quota_with_spill(self, level: StorageLevel, size: int): + if await self._quota_refs[level].request_quota(size): + return + else: + total, used = await self._quota_refs[level].get_quota() + await self.spill(level, int(used 
+ size - total), size) + await self._quota_refs[level].request_quota(size) + logger.debug( + "Spill is triggered, request %s bytes of %s finished", size, level + ) + + async def notify_spillable_space(self, level): + if await self._spill_manager_refs[level].has_spill_task(): + total, used = await self._quota_refs[level].get_quota() + tasks = [] + if total is not None: + spillable_size = await self._data_manager_ref.get_spillable_size( + level, self._band_name + ) + tasks.append( + self._spill_manager_refs[level].notify_spillable_space( + spillable_size, total - used + ) + ) + await asyncio.gather(*tasks) + + async def spill(self, level: StorageLevel, request_size: int, object_size: int): + from .spill import spill + + try: + await spill( + request_size, level, self._band_name, self._data_manager_ref, self + ) + except NoDataToSpill: + logger.warning( + "No data to spill %s bytes, waiting more space", request_size + ) + size = await self._spill_manager_refs[level].wait_for_space(object_size) + await spill(size, level, self._band_name, self._data_manager_ref, self) + + async def list(self, level: StorageLevel) -> List: + return await self._data_manager_ref.list(level, self._band_name) + + @mo.extensible + async def unpin(self, session_id: str, data_key: str, error: str = "raise"): + levels = await self._data_manager_ref.unpin( + session_id, [data_key], self._band_name, error + ) + if levels: + await self.notify_spillable_space(levels[0]) + + @unpin.batch + async def batch_unpin(self, args_list, kwargs_list): + extracted_args = [] + data_keys = [] + for args, kw in zip(args_list, kwargs_list): + session_id, data_key, error = self.unpin.bind(*args, **kw) + if extracted_args: + assert extracted_args == (session_id, error) + extracted_args = session_id, error + data_keys.append(data_key) + if extracted_args: + session_id, error = extracted_args + levels = await self._data_manager_ref.unpin( + session_id, data_keys, self._band_name, error + ) + for level in levels: + await self.notify_spillable_space(level) + + async def get_storage_level_info(self, level: StorageLevel) -> StorageInfo: + quota_ref = self._quota_refs[level] + total_size, used_size = await quota_ref.get_quota() + return StorageInfo( + storage_level=level, + total_size=int(total_size) if total_size else total_size, + used_size=int(used_size), + ) + + async def get_storage_backend_info(self, level: StorageLevel) -> dict: + return self._clients[level].backend_info diff --git a/python/xorbits/_mars/services/storage/spill.py b/python/xorbits/_mars/services/storage/spill.py new file mode 100644 index 000000000..544d77a91 --- /dev/null +++ b/python/xorbits/_mars/services/storage/spill.py @@ -0,0 +1,209 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import List, Tuple + +from ... 
import oscar as mo +from ...storage import StorageLevel +from .core import DataManagerActor +from .errors import NoDataToSpill +from .handler import StorageHandlerActor + +logger = logging.getLogger(__name__) + +DEFAULT_SPILL_BLOCK_SIZE = 128 * 1024 + + +class SpillStrategy(ABC): + @abstractmethod + def record_put_info(self, key, data_size: int): + """ + Record the data key and data size when putting into storage + """ + + @abstractmethod + def record_delete_info(self, key): + """ + Record the data key that is removed from storage + """ + + @abstractmethod + def get_spill_keys(self, size: int) -> Tuple[List, List]: + """ + Return sizes and keys for spilling according to the requested spill size + """ + + +class FIFOStrategy(SpillStrategy): + def __init__(self, level: StorageLevel): + self._level = level + self._data_sizes = dict() + self._pinned_keys = defaultdict(int) + self._spilling_keys = set() + + def record_put_info(self, key, data_size: int): + self._data_sizes[key] = data_size + + def record_delete_info(self, key): + self._data_sizes.pop(key, None) + if key in self._spilling_keys: + self._spilling_keys.remove(key) + + def pin_data(self, key): + self._pinned_keys[key] += 1 + + def unpin_data(self, key): + if key not in self._pinned_keys: + return + self._pinned_keys[key] -= 1 + if self._pinned_keys[key] <= 0: + del self._pinned_keys[key] + + def get_spillable_size(self): + total_size = 0 + for data_key, data_size in self._data_sizes.items(): + if ( + data_key not in self._pinned_keys + and data_key not in self._spilling_keys + ): + total_size += data_size + return total_size + + def get_spill_keys(self, size: int) -> Tuple[List, List]: + spill_sizes = [] + spill_keys = [] + spill_size = 0 + for data_key, data_size in self._data_sizes.items(): + if spill_size >= size: + break + if data_key in self._pinned_keys: + continue + if data_key in self._spilling_keys: + continue + spill_sizes.append(data_size) + spill_keys.append(data_key) + spill_size += data_size + + if spill_size < size: # pragma: no cover + pinned_sizes = dict((k, self._data_sizes[k]) for k in self._pinned_keys) + spilling_keys = dict((k, self._data_sizes[k]) for k in self._spilling_keys) + logger.debug( + "No data can be spilled for level: %s, pinned keys: %s," + " spilling keys: %s", + self._level, + pinned_sizes, + spilling_keys, + ) + raise NoDataToSpill(f"No data can be spilled for level: {self._level}") + self._spilling_keys.update(set(spill_keys)) + return spill_sizes, spill_keys + + +class SpillManagerActor(mo.StatelessActor): + """ + The actor that handles the race condition when NoDataToSpill happens. + Spill raises `NoDataToSpill` in two situations: quota has been allocated but the + objects have not been put into storage yet, or some objects are pinned and cannot + be spilled. When there are not enough objects to spill, we create an asyncio event; + whenever a put or unpin happens we re-check the spillable size, and if it is large + enough for spilling we call event.set() to wake up the waiting spill task.
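+ The concrete flow is: StorageHandlerActor.spill catches `NoDataToSpill` (raised from + `get_spill_keys`) and calls `wait_for_space(size)` on this actor, which creates the event; + later put/unpin operations on the handler invoke `notify_spillable_space` here, which sets + the event once enough spillable bytes become available so that the blocked spill can retry.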
+ """ + + def __init__(self, level: StorageLevel): + self._level = level + self._event = None + self._lock = asyncio.Lock() + + @classmethod + def gen_uid(cls, band_name: str, level: StorageLevel): + return f"spill_manager_{band_name}_{level}" + + def has_spill_task(self): + return self._event is not None + + def notify_spillable_space(self, spillable_size: int, quota_left: int): + event = self._event + if event is None: + return + logger.debug("Notify to check if has space for spilling") + if spillable_size + quota_left > event.size: + logger.debug( + "Check pass, wake up spill task, spill bytes is %s", + event.size - quota_left, + ) + event.size = event.size - quota_left + event.set() + + async def wait_for_space(self, size: int): + # make sure only one spilling task is waiting the event + async with self._lock: + self._event = event = asyncio.Event() + event.size = size + await self._event.wait() + size = self._event.size + self._event = None + return size + + +async def spill( + request_size: int, + level: StorageLevel, + band_name: str, + data_manager: mo.ActorRefType[DataManagerActor], + storage_handler: mo.ActorRefType[StorageHandlerActor], + block_size=None, + multiplier=1.1, +): + logger.debug( + "%s is full, need to spill %s bytes, multiplier is %s", + level, + request_size, + multiplier, + ) + request_size *= multiplier + block_size = block_size or DEFAULT_SPILL_BLOCK_SIZE + spill_level = level.spill_level() + spill_sizes, spill_keys = await data_manager.get_spill_keys( + level, band_name, request_size + ) + logger.debug( + "Decide to spill %s bytes, data keys are %s", sum(spill_sizes), spill_keys + ) + + for (session_id, key), size in zip(spill_keys, spill_sizes): + reader = await storage_handler.open_reader(session_id, key) + writer = await storage_handler.open_writer(session_id, key, size, spill_level) + async with reader: + async with writer: + while True: + block_data = await reader.read(block_size) + if not block_data: + break + else: + await writer.write(block_data) + try: + await storage_handler.delete_object( + session_id, key, size, reader.object_id, level + ) + except KeyError: # pragma: no cover + # workaround for the case that the object + # has been deleted during spill + logger.debug("Data %s %s is deleted during spill", session_id, key) + await storage_handler.delete(session_id, key, error="ignore") + logger.debug("Spill finishes, release %s bytes of %s", sum(spill_sizes), level) diff --git a/python/xorbits/_mars/services/storage/supervisor/__init__.py b/python/xorbits/_mars/services/storage/supervisor/__init__.py new file mode 100644 index 000000000..cb61fbb57 --- /dev/null +++ b/python/xorbits/_mars/services/storage/supervisor/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...core import EmptyService + + +class StorageSupervisorService(EmptyService): + pass diff --git a/python/xorbits/_mars/services/storage/tests/__init__.py b/python/xorbits/_mars/services/storage/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/storage/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/storage/tests/test_api.py b/python/xorbits/_mars/services/storage/tests/test_api.py new file mode 100644 index 000000000..a0d926339 --- /dev/null +++ b/python/xorbits/_mars/services/storage/tests/test_api.py @@ -0,0 +1,191 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import tempfile + +import numpy as np +import pandas as pd +import pytest + +from .... import oscar as mo +from .... 
import tensor as mt +from ....core import tile +from ....serialization import AioDeserializer, AioSerializer +from ....storage import StorageLevel +from ....tests.core import require_ray +from ....utils import get_next_port, lazy_import +from ...cluster import MockClusterAPI +from ...meta import MockMetaAPI +from ...session import MockSessionAPI +from ...web import WebActor +from ..api import MockStorageAPI, WebStorageAPI + +ray = lazy_import("ray") +vineyard = lazy_import("vineyard") + +require_lib = lambda x: x +storage_configs = [] + +# plasma backend +plasma_storage_size = 10 * 1024 * 1024 +if sys.platform == "darwin": + plasma_dir = "/tmp" +else: + plasma_dir = "/dev/shm" +plasma_setup_params = dict( + store_memory=plasma_storage_size, plasma_directory=plasma_dir, check_dir_size=False +) +if not sys.platform.lower().startswith("win"): + storage_configs.append({"plasma": plasma_setup_params}) + +# ray backend +if ray is not None: + require_lib = require_ray + storage_configs.append({"ray": dict()}) + +# vineyard +if vineyard is not None: + storage_configs.append({"vineyard": dict(vineyard_size="256M")}) + +# shared_memory +storage_configs.append({"shared_memory": dict()}) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("storage_configs", storage_configs) +@pytest.mark.parametrize( + "ray_start_regular", [{"enable": ray is not None}], indirect=True +) +@require_lib +async def test_storage_mock_api(ray_start_regular, storage_configs): + start_method = "fork" if sys.platform != "win32" else None + pool = await mo.create_actor_pool( + "127.0.0.1", + 2, + labels=["main", "numa-0", "io"], + subprocess_start_method=start_method, + ) + async with pool: + session_id = "mock_session_id" + storage_api = await MockStorageAPI.create( + address=pool.external_address, + session_id=session_id, + storage_configs=storage_configs, + ) + + # test put and get + value1 = np.random.rand(10, 10) + await storage_api.put("data1", value1) + get_value1 = await storage_api.get("data1") + np.testing.assert_array_equal(value1, get_value1) + + value2 = pd.DataFrame( + { + "col1": [str(i) for i in range(10)], + "col2": np.random.randint(0, 100, (10,)), + } + ) + await storage_api.put("data2", value2) + get_value2 = await storage_api.get("data2") + pd.testing.assert_frame_equal(value2, get_value2) + + sliced_value = await storage_api.get( + "data2", conditions=[slice(3, 5), slice(None, None)] + ) + pd.testing.assert_frame_equal(value2.iloc[3:5, :], sliced_value) + + infos = await storage_api.get_infos("data2") + assert infos[0].store_size > 0 + + await storage_api.delete("data2") + buffers = await AioSerializer(value2).run() + size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + # test open_reader and open_writer + writer = await storage_api.open_writer("write_key", size, StorageLevel.MEMORY) + async with writer: + for buf in buffers: + await writer.write(buf) + + reader = await storage_api.open_reader("write_key") + async with reader: + read_value = await AioDeserializer(reader).run() + + pd.testing.assert_frame_equal(value2, read_value) + + await MockStorageAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_web_storage_api(): + from ..api.web import StorageWebAPIHandler + + tempdir = tempfile.mkdtemp() + start_method = "fork" if sys.platform != "win32" else None + pool = await mo.create_actor_pool( + "127.0.0.1", 1, subprocess_start_method=start_method + ) + async with pool: + session_id = "mock_session_id" + await MockClusterAPI.create(address=pool.external_address) + await 
MockSessionAPI.create( + session_id=session_id, address=pool.external_address + ) + meta_api = await MockMetaAPI.create( + session_id=session_id, address=pool.external_address + ) + await MockStorageAPI.create( + address=pool.external_address, + session_id=session_id, + storage_configs={ + "shared_memory": dict(), + "disk": dict(root_dirs=[tempdir]), + }, + ) + + web_config = { + "port": get_next_port(), + "web_handlers": { + StorageWebAPIHandler.get_root_pattern(): StorageWebAPIHandler + }, + } + await mo.create_actor(WebActor, web_config, address=pool.external_address) + + web_storage_api = WebStorageAPI( + session_id, f'http://127.0.0.1:{web_config["port"]}', "numa-0" + ) + + value = np.random.rand(10, 10) + t = mt.random.rand(10, 10) + t = tile(t) + await meta_api.set_chunk_meta( + t.chunks[0], bands=[(pool.external_address, "numa-0")] + ) + await web_storage_api.put(t.chunks[0].key, value) + + ret_value = await web_storage_api.get(t.chunks[0].key) + np.testing.assert_array_equal(value, ret_value) + + sliced_value = await web_storage_api.get( + t.chunks[0].key, conditions=[slice(3, 5), slice(None, None)] + ) + np.testing.assert_array_equal(value[3:5, :], sliced_value) + + infos = await web_storage_api.get_infos(t.chunks[0].key) + assert len(infos) == 1 + assert infos[0].level == StorageLevel.MEMORY + assert infos[0].memory_size == t.chunks[0].nbytes + + await MockStorageAPI.cleanup(pool.external_address) + await MockClusterAPI.cleanup(pool.external_address) diff --git a/python/xorbits/_mars/services/storage/tests/test_service.py b/python/xorbits/_mars/services/storage/tests/test_service.py new file mode 100644 index 000000000..63fa2285f --- /dev/null +++ b/python/xorbits/_mars/services/storage/tests/test_service.py @@ -0,0 +1,209 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +import numpy as np +import pandas as pd +import pytest + +from .... import oscar as mo +from ....resource import Resource +from ....serialization import AioDeserializer, AioSerializer +from ....storage import StorageLevel +from ....tests.core import require_cudf, require_cupy +from ... import NodeRole, start_services, stop_services +from ...cluster import MockClusterAPI +from .. 
import StorageAPI + +_is_windows = sys.platform.lower().startswith("win") + + +@pytest.fixture +async def actor_pools(): + async def start_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=2, + subprocess_start_method=start_method, + labels=["main", "numa-0", "io"], + ) + await pool.start() + return pool + + worker_pool = await start_pool() + try: + yield worker_pool + finally: + await worker_pool.stop() + + +@pytest.mark.asyncio +async def test_storage_service(actor_pools): + worker_pool = actor_pools + + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + plasma_setup_params = dict( + store_memory=10 * 1024 * 1024, plasma_directory=plasma_dir, check_dir_size=False + ) + + config = { + "services": ["storage"], + "storage": { + "backends": ["plasma" if not _is_windows else "shared_memory"], + "plasma": plasma_setup_params, + }, + } + + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + api = await StorageAPI.create("mock_session", worker_pool.external_address) + value1 = np.random.rand(10, 10) + await api.put("data1", value1) + get_value1 = await api.get("data1") + np.testing.assert_array_equal(value1, get_value1) + + # test api in subpool + subpool_address = list(worker_pool._sub_processes.keys())[0] + api2 = await StorageAPI.create("mock_session", subpool_address) + assert api2._storage_handler_ref.address == subpool_address + + get_value1 = await api2.get("data1") + np.testing.assert_array_equal(value1, get_value1) + + sliced_value = await api2.get("data1", conditions=[slice(None, None), slice(0, 4)]) + np.testing.assert_array_equal(value1[:, :4], sliced_value) + + await api.unpin("data1") + + value2 = pd.DataFrame(value1) + await api2.put("data2", value2) + + get_value2 = await api.get("data2") + pd.testing.assert_frame_equal(value2, get_value2) + + # test writer and read + buffers = await AioSerializer(value2).run() + size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + # test open_reader and open_writer + writer = await api.open_writer("write_key", size, StorageLevel.MEMORY) + async with writer: + for buf in buffers: + await writer.write(buf) + + reader = await api.open_reader("write_key") + async with reader: + read_value = await AioDeserializer(reader).run() + + pd.testing.assert_frame_equal(value2, read_value) + + await stop_services( + NodeRole.WORKER, address=worker_pool.external_address, config=config + ) + + +@pytest.fixture +async def actor_pools_with_gpu(): + async def start_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=3, + subprocess_start_method=start_method, + labels=["main", "numa-0", "gpu-0", "io"], + ) + await pool.start() + return pool + + worker_pool = await start_pool() + try: + yield worker_pool + finally: + await worker_pool.stop() + + +@require_cupy +@require_cudf +@pytest.mark.asyncio +async def test_storage_service_with_cuda(actor_pools_with_gpu): + import cudf + import cupy + + worker_pool = actor_pools_with_gpu + + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + plasma_setup_params = dict( + store_memory=10 * 1024 * 1024, plasma_directory=plasma_dir, check_dir_size=False + ) + + config = { + "services": ["storage"], + "storage": { + "backends": ["plasma" if not 
_is_windows else "shared_memory", "cuda"], + "plasma": plasma_setup_params, + "cuda": dict(), + }, + } + + await MockClusterAPI.create( + worker_pool.external_address, + band_to_resource={ + "numa-0": Resource(num_cpus=1), + "gpu-0": Resource(num_gpus=1), + }, + use_gpu=True, + ) + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + storage_api = await StorageAPI.create( + "mock_session", worker_pool.external_address, band_name="gpu-0" + ) + data1 = cupy.asarray(np.random.rand(10, 10)) + await storage_api.put("mock_cupy_key", data1, level=StorageLevel.GPU) + get_data1 = await storage_api.get("mock_cupy_key") + assert isinstance(get_data1, cupy.ndarray) + cupy.testing.assert_array_equal(data1, get_data1) + + data2 = cudf.DataFrame( + pd.DataFrame( + { + "col1": np.arange(10), + "col2": [f"str{i}" for i in range(10)], + "col3": np.random.rand(10), + }, + ) + ) + await storage_api.put("mock_cudf_key", data2, level=StorageLevel.GPU) + get_data2 = await storage_api.get("mock_cudf_key") + assert isinstance(get_data2, cudf.DataFrame) + cudf.testing.assert_frame_equal(data2, get_data2) + + await MockClusterAPI.cleanup(worker_pool.external_address) + await stop_services(NodeRole.WORKER, config, address=worker_pool.external_address) diff --git a/python/xorbits/_mars/services/storage/tests/test_spill.py b/python/xorbits/_mars/services/storage/tests/test_spill.py new file mode 100644 index 000000000..002799739 --- /dev/null +++ b/python/xorbits/_mars/services/storage/tests/test_spill.py @@ -0,0 +1,236 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys +import tempfile + +import numpy as np +import pytest + +from .... 
import oscar as mo +from ....storage import PlasmaStorage, StorageLevel +from ....utils import calc_data_size +from ...cluster import MockClusterAPI +from ...cluster.supervisor.node_info import NodeInfoCollectorActor +from ...cluster.uploader import NodeInfoUploaderActor +from ..core import StorageManagerActor, StorageQuotaActor, build_data_info +from ..handler import StorageHandlerActor + +# todo enable this test module when spill support added +# on storage quotas +if sys.platform.lower().startswith("win"): + pytestmark = pytest.mark.skip + +MEMORY_SIZE = 100 * 1024 + + +@pytest.fixture +async def actor_pool(): + async def start_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=2, + labels=["main", "numa-0", "io"], + subprocess_start_method=start_method, + ) + await pool.start() + return pool + + worker_pool = await start_pool() + try: + yield worker_pool + finally: + await worker_pool.stop() + + +def _build_storage_config(): + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + plasma_setup_params = dict( + store_memory=MEMORY_SIZE, plasma_directory=plasma_dir, check_dir_size=False + ) + tempdir = tempfile.mkdtemp() + disk_setup_params = dict(root_dirs=tempdir, level="disk") + storage_configs = {"plasma": plasma_setup_params, "filesystem": disk_setup_params} + return storage_configs + + +@pytest.fixture +async def create_actors(actor_pool): + _ = await MockClusterAPI.create(address=actor_pool.external_address) + storage_configs = _build_storage_config() + manager_ref = await mo.create_actor( + StorageManagerActor, + storage_configs, + uid=StorageManagerActor.default_uid(), + address=actor_pool.external_address, + ) + + sub_processes = list(actor_pool.sub_processes) + yield actor_pool.external_address, sub_processes[0], sub_processes[1] + await mo.destroy_actor(manager_ref) + + +@pytest.mark.asyncio +async def test_spill(create_actors): + worker_address, _, _ = create_actors + storage_handler = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address + ) + + storage_manager = await mo.actor_ref( + uid=StorageManagerActor.default_uid(), address=worker_address + ) + + init_params = (await storage_manager.get_client_params())["numa-0"] + plasma_init_params = init_params["plasma"] + plasma_handler = PlasmaStorage(**plasma_init_params) + memory_quota = await mo.actor_ref( + StorageQuotaActor, + StorageLevel.MEMORY, + MEMORY_SIZE, + address=worker_address, + uid=StorageQuotaActor.gen_uid("numa-0", StorageLevel.MEMORY), + ) + + # fill to trigger spill + session_id = "mock_session" + data_list = [] + key_list = [] + for i in range(10): + data = np.random.randint(0, 10000, (8000,), np.int16) + key = f"mock_key_{i}" + await storage_handler.put(session_id, key, data, StorageLevel.MEMORY) + used = (await memory_quota.get_quota())[1] + assert used < MEMORY_SIZE + data_list.append(data) + key_list.append(key) + + memory_object_list = await storage_handler.list(StorageLevel.MEMORY) + disk_object_list = await storage_handler.list(StorageLevel.DISK) + assert len(memory_object_list) == 3 + assert len(disk_object_list) == 7 + + for key, data in zip(key_list, data_list): + get_data = await storage_handler.get(session_id, key) + np.testing.assert_array_equal(data, get_data) + + plasma_list = await plasma_handler.list() + assert len(plasma_list) == len(memory_object_list) + + +@pytest.mark.asyncio +async 
def test_disk_info(create_actors): + worker_address, _, _ = create_actors + uploader_ref = await mo.actor_ref( + address=worker_address, uid=NodeInfoUploaderActor.default_uid() + ) + await uploader_ref.upload_node_info() + collector_ref = await mo.actor_ref( + address=worker_address, uid=NodeInfoCollectorActor.default_uid() + ) + storage_manager = await mo.actor_ref( + uid=StorageManagerActor.default_uid(), address=worker_address + ) + init_params = (await storage_manager.get_client_params())["numa-0"] + assert "filesystem" in init_params + assert "level" in init_params["filesystem"] + assert init_params["filesystem"]["level"] == StorageLevel.DISK + + node_info = await collector_ref.get_nodes_info(detail=True) + disk_partitions = node_info[worker_address]["detail"]["disk"]["partitions"] + assert disk_partitions + for _, info in disk_partitions.items(): + assert "inode_used" in info + + +class DelayPutStorageHandler(StorageHandlerActor): + async def put( + self, session_id: str, data_key: str, obj: object, level: StorageLevel + ): + size = calc_data_size(obj) + await self.request_quota_with_spill(level, size) + # sleep to trigger `NoDataToSpill` + await asyncio.sleep(0.5) + object_info = await self._clients[level].put(obj) + data_info = build_data_info(object_info, level, size) + await self._data_manager_ref.put_data_info( + session_id, data_key, data_info, object_info + ) + if object_info.size is not None and data_info.memory_size != object_info.size: + await self._quota_refs[level].update_quota( + object_info.size - data_info.memory_size + ) + await self.notify_spillable_space(level) + return data_info + + +@pytest.fixture +async def create_actors_with_delay(actor_pool): + storage_configs = _build_storage_config() + manager_ref = await mo.create_actor( + StorageManagerActor, + storage_configs, + storage_handler_cls=DelayPutStorageHandler, + uid=StorageManagerActor.default_uid(), + address=actor_pool.external_address, + ) + + sub_processes = list(actor_pool.sub_processes) + yield actor_pool.external_address, sub_processes[0], sub_processes[1] + await mo.destroy_actor(manager_ref) + + +@pytest.mark.asyncio +async def test_spill_event(create_actors_with_delay): + worker_address, sub_pool_address1, sub_pool_address2 = create_actors_with_delay + storage_handler1 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=sub_pool_address1 + ) + storage_handler2 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=sub_pool_address2 + ) + # total store size is 65536, single data size is around 40000 + # we put two data simultaneously + data = np.random.randint(0, 10000, (5000,)) + session_id = "mock_session" + key1 = "mock_key1" + key2 = "mock_key2" + put1 = asyncio.create_task( + storage_handler1.put(session_id, key1, data, StorageLevel.MEMORY) + ) + put2 = asyncio.create_task( + storage_handler2.put(session_id, key2, data, StorageLevel.MEMORY) + ) + await asyncio.gather(put1, put2) + + get_data = await storage_handler2.get(session_id, key1) + np.testing.assert_array_equal(data, get_data) + get_data = await storage_handler1.get(session_id, key2) + np.testing.assert_array_equal(data, get_data) + + memory_object_list = await storage_handler1.list(StorageLevel.MEMORY) + disk_object_list = await storage_handler1.list(StorageLevel.DISK) + assert len(memory_object_list) == 1 + assert len(disk_object_list) == 1 diff --git a/python/xorbits/_mars/services/storage/tests/test_transfer.py b/python/xorbits/_mars/services/storage/tests/test_transfer.py new file mode 
100644 index 000000000..2e19be19c --- /dev/null +++ b/python/xorbits/_mars/services/storage/tests/test_transfer.py @@ -0,0 +1,322 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys + +import numpy as np +import pandas as pd +import pytest + +from .... import oscar as mo +from ....oscar.backends.allocate_strategy import IdleLabel +from ....storage import StorageLevel +from ..core import DataManagerActor, StorageManagerActor, StorageQuotaActor +from ..errors import DataNotExist +from ..handler import StorageHandlerActor +from ..transfer import ReceiverManagerActor, SenderManagerActor + +_is_windows = sys.platform.lower().startswith("win") + + +@pytest.fixture +async def actor_pools(): + async def start_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=2, + labels=["main", "numa-0", "io"], + subprocess_start_method=start_method, + ) + await pool.start() + return pool + + worker_pool_1 = await start_pool() + worker_pool_2 = await start_pool() + try: + yield worker_pool_1, worker_pool_2 + finally: + await worker_pool_1.stop() + await worker_pool_2.stop() + + +@pytest.fixture +async def create_actors(actor_pools): + worker_pool_1, worker_pool_2 = actor_pools + + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + plasma_setup_params = dict( + store_memory=5 * 1024 * 1024, plasma_directory=plasma_dir, check_dir_size=False + ) + storage_configs = ( + {"plasma": plasma_setup_params} if not _is_windows else {"shared_memory": {}} + ) + + manager_ref1 = await mo.create_actor( + StorageManagerActor, + storage_configs, + uid=StorageManagerActor.default_uid(), + address=worker_pool_1.external_address, + ) + + manager_ref2 = await mo.create_actor( + StorageManagerActor, + storage_configs, + uid=StorageManagerActor.default_uid(), + address=worker_pool_2.external_address, + ) + yield worker_pool_1.external_address, worker_pool_2.external_address + await mo.destroy_actor(manager_ref1) + await mo.destroy_actor(manager_ref2) + + +@pytest.mark.asyncio +async def test_simple_transfer(create_actors): + worker_address_1, worker_address_2 = create_actors + + session_id = "mock_session" + data1 = np.random.rand(100, 100) + data2 = pd.DataFrame(np.random.randint(0, 100, (500, 10))) + + storage_handler1 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_1 + ) + storage_handler2 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_2 + ) + + await storage_handler1.put(session_id, "data_key1", data1, StorageLevel.MEMORY) + await storage_handler1.put(session_id, "data_key2", data2, StorageLevel.MEMORY) + await storage_handler2.put(session_id, "data_key3", data2, StorageLevel.MEMORY) + + sender_actor = await mo.actor_ref( + address=worker_address_1, uid=SenderManagerActor.gen_uid("numa-0") + ) + + # send 
data to worker2 from worker1 + await sender_actor.send_batch_data( + session_id, + ["data_key1"], + worker_address_2, + StorageLevel.MEMORY, + block_size=1000, + ) + + await sender_actor.send_batch_data( + session_id, + ["data_key2"], + worker_address_2, + StorageLevel.MEMORY, + block_size=1000, + ) + + get_data1 = await storage_handler2.get(session_id, "data_key1") + np.testing.assert_array_equal(data1, get_data1) + + get_data2 = await storage_handler2.get(session_id, "data_key2") + pd.testing.assert_frame_equal(data2, get_data2) + + # send data to worker1 from worker2 + sender_actor = await mo.actor_ref( + address=worker_address_2, uid=SenderManagerActor.gen_uid("numa-0") + ) + await sender_actor.send_batch_data( + session_id, ["data_key3"], worker_address_1, StorageLevel.MEMORY + ) + get_data3 = await storage_handler1.get(session_id, "data_key3") + pd.testing.assert_frame_equal(data2, get_data3) + + +# test for cancelling happens when writing +class MockReceiverManagerActor(ReceiverManagerActor): + async def do_write(self, *args, **kw): + await asyncio.sleep(3) + await super().do_write(*args, **kw) + + +class MockSenderManagerActor(SenderManagerActor): + @staticmethod + async def get_receiver_ref(address: str, band_name: str): + return await mo.actor_ref( + address=address, uid=MockReceiverManagerActor.default_uid() + ) + + +# test for cancelling happens when creating writer +class MockReceiverManagerActor2(ReceiverManagerActor): + async def create_writers(self, session_id, data_keys, data_sizes, level, sub_infos): + await asyncio.sleep(3) + return await super().create_writers( + session_id, data_keys, data_sizes, level, sub_infos + ) + + +class MockSenderManagerActor2(SenderManagerActor): + @staticmethod + async def get_receiver_ref(address: str, band_name: str): + return await mo.actor_ref( + address=address, uid=MockReceiverManagerActor2.default_uid() + ) + + +@pytest.mark.parametrize( + "mock_sender, mock_receiver", + [ + (MockSenderManagerActor, MockReceiverManagerActor), + (MockSenderManagerActor2, MockReceiverManagerActor2), + ], +) +@pytest.mark.asyncio +async def test_cancel_transfer(create_actors, mock_sender, mock_receiver): + worker_address_1, worker_address_2 = create_actors + + quota_refs = { + StorageLevel.MEMORY: await mo.actor_ref( + StorageQuotaActor, + StorageLevel.MEMORY, + 5 * 1024 * 1024, + address=worker_address_2, + uid=StorageQuotaActor.gen_uid("numa-0", StorageLevel.MEMORY), + ) + } + data_manager_ref = await mo.actor_ref( + uid=DataManagerActor.default_uid(), address=worker_address_1 + ) + storage_handler1 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_1 + ) + storage_handler2 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_2 + ) + + sender_actor = await mo.create_actor( + mock_sender, + data_manager_ref=data_manager_ref, + uid=mock_sender.default_uid(), + address=worker_address_1, + allocate_strategy=IdleLabel("io", "mock_sender"), + ) + await mo.create_actor( + mock_receiver, + quota_refs, + uid=mock_receiver.default_uid(), + address=worker_address_2, + allocate_strategy=IdleLabel("io", "mock_receiver"), + ) + + data1 = np.random.rand(10, 10) + await storage_handler1.put("mock", "data_key1", data1, StorageLevel.MEMORY) + data2 = pd.DataFrame(np.random.rand(100, 100)) + await storage_handler1.put("mock", "data_key2", data2, StorageLevel.MEMORY) + + used_before = (await quota_refs[StorageLevel.MEMORY].get_quota())[1] + + send_task = asyncio.create_task( + 
sender_actor.send_batch_data( + "mock", ["data_key1"], worker_address_2, StorageLevel.MEMORY + ) + ) + + await asyncio.sleep(0.5) + send_task.cancel() + + with pytest.raises(asyncio.CancelledError): + await send_task + + used = (await quota_refs[StorageLevel.MEMORY].get_quota())[1] + assert used == used_before + + with pytest.raises(DataNotExist): + await storage_handler2.get("mock", "data_key1") + + send_task = asyncio.create_task( + sender_actor.send_batch_data( + "mock", ["data_key1"], worker_address_2, StorageLevel.MEMORY + ) + ) + await send_task + get_data = await storage_handler2.get("mock", "data_key1") + np.testing.assert_array_equal(data1, get_data) + + # cancel when fetch the same data Simultaneously + if mock_sender is MockSenderManagerActor: + send_task1 = asyncio.create_task( + sender_actor.send_batch_data( + "mock", ["data_key2"], worker_address_2, StorageLevel.MEMORY + ) + ) + send_task2 = asyncio.create_task( + sender_actor.send_batch_data( + "mock", ["data_key2"], worker_address_2, StorageLevel.MEMORY + ) + ) + await asyncio.sleep(0.5) + send_task1.cancel() + with pytest.raises(asyncio.CancelledError): + await send_task1 + await send_task2 + get_data2 = await storage_handler2.get("mock", "data_key2") + pd.testing.assert_frame_equal(get_data2, data2) + + +@pytest.mark.asyncio +async def test_transfer_same_data(create_actors): + worker_address_1, worker_address_2 = create_actors + + session_id = "mock_session" + data1 = np.random.rand(100, 100) + storage_handler1 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_1 + ) + storage_handler2 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_2 + ) + + await storage_handler1.put(session_id, "data_key1", data1, StorageLevel.MEMORY) + sender_actor = await mo.actor_ref( + address=worker_address_1, uid=SenderManagerActor.gen_uid("numa-0") + ) + + # send data to worker2 from worker1 + task1 = asyncio.create_task( + sender_actor.send_batch_data( + session_id, + ["data_key1"], + worker_address_2, + StorageLevel.MEMORY, + block_size=1000, + ) + ) + task2 = asyncio.create_task( + sender_actor.send_batch_data( + session_id, + ["data_key1"], + worker_address_2, + StorageLevel.MEMORY, + block_size=1000, + ) + ) + await asyncio.gather(task1, task2) + get_data1 = await storage_handler2.get(session_id, "data_key1") + np.testing.assert_array_equal(data1, get_data1) diff --git a/python/xorbits/_mars/services/storage/transfer.py b/python/xorbits/_mars/services/storage/transfer.py new file mode 100644 index 000000000..3af253cf9 --- /dev/null +++ b/python/xorbits/_mars/services/storage/transfer.py @@ -0,0 +1,351 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from dataclasses import dataclass +from typing import Dict, List + +from ... 
import oscar as mo +from ...lib.aio import alru_cache +from ...storage import StorageLevel +from ...utils import dataslots +from .core import DataManagerActor, WrappedStorageFileObject +from .handler import StorageHandlerActor + +DEFAULT_TRANSFER_BLOCK_SIZE = 4 * 1024**2 + + +logger = logging.getLogger(__name__) + + +class SenderManagerActor(mo.StatelessActor): + def __init__( + self, + band_name: str = "numa-0", + transfer_block_size: int = None, + data_manager_ref: mo.ActorRefType[DataManagerActor] = None, + storage_handler_ref: mo.ActorRefType[StorageHandlerActor] = None, + ): + self._band_name = band_name + self._data_manager_ref = data_manager_ref + self._storage_handler = storage_handler_ref + self._transfer_block_size = transfer_block_size or DEFAULT_TRANSFER_BLOCK_SIZE + + @classmethod + def gen_uid(cls, band_name: str): + return f"sender_manager_{band_name}" + + async def __post_create__(self): + if self._storage_handler is None: # for test + self._storage_handler = await mo.actor_ref( + self.address, StorageHandlerActor.gen_uid("numa-0") + ) + + @staticmethod + @alru_cache + async def get_receiver_ref(address: str, band_name: str): + return await mo.actor_ref( + address=address, uid=ReceiverManagerActor.gen_uid(band_name) + ) + + async def _send_data( + self, + receiver_ref: mo.ActorRefType["ReceiverManagerActor"], + session_id: str, + data_keys: List[str], + block_size: int, + ): + class BufferedSender: + def __init__(self): + self._buffers = [] + self._send_keys = [] + self._eof_marks = [] + + async def flush(self): + if self._buffers: + await receiver_ref.receive_part_data( + self._buffers, session_id, self._send_keys, self._eof_marks + ) + + self._buffers = [] + self._send_keys = [] + self._eof_marks = [] + + async def send(self, buffer, eof_mark, key): + self._eof_marks.append(eof_mark) + self._buffers.append(buffer) + self._send_keys.append(key) + if sum(len(b) for b in self._buffers) >= block_size: + await self.flush() + + sender = BufferedSender() + open_reader_tasks = [] + for data_key in data_keys: + open_reader_tasks.append( + self._storage_handler.open_reader.delay(session_id, data_key) + ) + readers = await self._storage_handler.open_reader.batch(*open_reader_tasks) + + for data_key, reader in zip(data_keys, readers): + while True: + part_data = await reader.read(block_size) + # Notes on [How to decide whether the reader reaches EOF?] + # + # In some storage backend, e.g., the reported memory usage (i.e., the + # `store_size`) may not same with the byte size that need to be transferred + # when moving to a remote worker. Thus, we think the reader reaches EOF + # when a `read` request returns nothing, rather than comparing the `sent_size` + # and the `store_size`. 
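+ # For example, a backend may pad or page-align its allocations, so the store_size it + # reports can differ from the length of the serialized stream read back here; an empty + # read is therefore a more reliable end-of-stream signal than a byte count.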
+ # + is_eof = not part_data # can be non-empty bytes, empty bytes and None + await sender.send(part_data, is_eof, data_key) + if is_eof: + break + await sender.flush() + + @mo.extensible + async def send_batch_data( + self, + session_id: str, + data_keys: List[str], + address: str, + level: StorageLevel, + band_name: str = "numa-0", + block_size: int = None, + error: str = "raise", + ): + logger.debug( + "Begin to send data (%s, %s) to %s", session_id, data_keys, address + ) + + tasks = [] + for key in data_keys: + tasks.append(self._data_manager_ref.get_store_key.delay(session_id, key)) + data_keys = await self._data_manager_ref.get_store_key.batch(*tasks) + data_keys = list(set(data_keys)) + sub_infos = await self._data_manager_ref.get_sub_infos.batch( + *[ + self._data_manager_ref.get_sub_infos.delay(session_id, key) + for key in data_keys + ] + ) + + block_size = block_size or self._transfer_block_size + receiver_ref: mo.ActorRefType[ + ReceiverManagerActor + ] = await self.get_receiver_ref(address, band_name) + get_infos = [] + pin_tasks = [] + for data_key in data_keys: + get_infos.append( + self._data_manager_ref.get_data_info.delay( + session_id, data_key, self._band_name, error + ) + ) + pin_tasks.append( + self._data_manager_ref.pin.delay( + session_id, data_key, self._band_name, error + ) + ) + await self._data_manager_ref.pin.batch(*pin_tasks) + infos = await self._data_manager_ref.get_data_info.batch(*get_infos) + filtered = [ + (data_info, data_key) + for data_info, data_key in zip(infos, data_keys) + if data_info is not None + ] + if filtered: + infos, data_keys = zip(*filtered) + else: # pragma: no cover + # no data to be transferred + return + data_sizes = [info.store_size for info in infos] + if level is None: + level = infos[0].level + is_transferring_list = await receiver_ref.open_writers( + session_id, data_keys, data_sizes, level, sub_infos + ) + to_send_keys = [] + to_wait_keys = [] + for data_key, is_transferring in zip(data_keys, is_transferring_list): + if is_transferring: + to_wait_keys.append(data_key) + else: + to_send_keys.append(data_key) + + if to_send_keys: + await self._send_data(receiver_ref, session_id, to_send_keys, block_size) + if to_wait_keys: + await receiver_ref.wait_transfer_done(session_id, to_wait_keys) + unpin_tasks = [] + for data_key in data_keys: + unpin_tasks.append( + self._data_manager_ref.unpin.delay( + session_id, [data_key], self._band_name, error="ignore" + ) + ) + await self._data_manager_ref.unpin.batch(*unpin_tasks) + logger.debug( + "Finish sending data (%s, %s) to %s, total size is %s", + session_id, + data_keys, + address, + sum(data_sizes), + ) + + +@dataslots +@dataclass +class WritingInfo: + writer: WrappedStorageFileObject + size: int + level: StorageLevel + event: asyncio.Event + ref_counts: int + + +class ReceiverManagerActor(mo.StatelessActor): + def __init__( + self, + quota_refs: Dict, + storage_handler_ref: mo.ActorRefType[StorageHandlerActor] = None, + ): + self._quota_refs = quota_refs + self._storage_handler = storage_handler_ref + self._writing_infos: Dict[tuple, WritingInfo] = dict() + self._lock = asyncio.Lock() + + async def __post_create__(self): + if self._storage_handler is None: # for test + self._storage_handler = await mo.actor_ref( + self.address, StorageHandlerActor.gen_uid("numa-0") + ) + + @classmethod + def gen_uid(cls, band_name: str): + return f"receiver_manager_{band_name}" + + def _decref_writing_key(self, session_id: str, data_key: str): + self._writing_infos[(session_id, data_key)].ref_counts 
-= 1 + if self._writing_infos[(session_id, data_key)].ref_counts == 0: + del self._writing_infos[(session_id, data_key)] + + async def create_writers( + self, + session_id: str, + data_keys: List[str], + data_sizes: List[int], + level: StorageLevel, + sub_infos: List, + ): + tasks = dict() + key_to_sub_infos = dict() + data_key_to_size = dict() + being_processed = [] + for data_key, data_size, sub_info in zip(data_keys, data_sizes, sub_infos): + data_key_to_size[data_key] = data_size + if (session_id, data_key) not in self._writing_infos: + being_processed.append(False) + tasks[data_key] = self._storage_handler.open_writer.delay( + session_id, data_key, data_size, level, request_quota=False + ) + key_to_sub_infos[data_key] = sub_info + else: + being_processed.append(True) + self._writing_infos[(session_id, data_key)].ref_counts += 1 + if tasks: + writers = await self._storage_handler.open_writer.batch( + *tuple(tasks.values()) + ) + for data_key, writer in zip(tasks, writers): + self._writing_infos[(session_id, data_key)] = WritingInfo( + writer, data_key_to_size[data_key], level, asyncio.Event(), 1 + ) + if key_to_sub_infos[data_key] is not None: + writer._sub_key_infos = key_to_sub_infos[data_key] + return being_processed + + async def open_writers( + self, + session_id: str, + data_keys: List[str], + data_sizes: List[int], + level: StorageLevel, + sub_infos: List, + ): + async with self._lock: + await self._storage_handler.request_quota_with_spill(level, sum(data_sizes)) + future = asyncio.create_task( + self.create_writers(session_id, data_keys, data_sizes, level, sub_infos) + ) + try: + return await future + except asyncio.CancelledError: + await self._quota_refs[level].release_quota(sum(data_sizes)) + future.cancel() + raise + + async def do_write( + self, data: list, session_id: str, data_keys: List[str], eof_marks: List[bool] + ): + # close may be a high-cost operation, use create_task + close_tasks = [] + finished_keys = [] + for data, data_key, is_eof in zip(data, data_keys, eof_marks): + writer = self._writing_infos[(session_id, data_key)].writer + if data: + await writer.write(data) + if is_eof: + close_tasks.append(writer.close()) + finished_keys.append(data_key) + await asyncio.gather(*close_tasks) + async with self._lock: + for data_key in finished_keys: + event = self._writing_infos[(session_id, data_key)].event + event.set() + self._decref_writing_key(session_id, data_key) + + async def receive_part_data( + self, data: list, session_id: str, data_keys: List[str], eof_marks: List[bool] + ): + write_task = asyncio.create_task( + self.do_write(data, session_id, data_keys, eof_marks) + ) + try: + await asyncio.shield(write_task) + except asyncio.CancelledError: + async with self._lock: + for data_key in data_keys: + if (session_id, data_key) in self._writing_infos: + if self._writing_infos[(session_id, data_key)].ref_counts == 1: + info = self._writing_infos[(session_id, data_key)] + await self._quota_refs[info.level].release_quota(info.size) + await self._storage_handler.delete( + session_id, data_key, error="ignore" + ) + await info.writer.clean_up() + info.event.set() + self._decref_writing_key(session_id, data_key) + write_task.cancel() + await write_task + raise + + async def wait_transfer_done(self, session_id, data_keys): + await asyncio.gather( + *[self._writing_infos[(session_id, key)].event.wait() for key in data_keys] + ) + async with self._lock: + for data_key in data_keys: + self._decref_writing_key(session_id, data_key) diff --git 
a/python/xorbits/_mars/services/storage/worker/__init__.py b/python/xorbits/_mars/services/storage/worker/__init__.py new file mode 100644 index 000000000..d128f2194 --- /dev/null +++ b/python/xorbits/_mars/services/storage/worker/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import StorageWorkerService diff --git a/python/xorbits/_mars/services/storage/worker/service.py b/python/xorbits/_mars/services/storage/worker/service.py new file mode 100644 index 000000000..d47d25225 --- /dev/null +++ b/python/xorbits/_mars/services/storage/worker/service.py @@ -0,0 +1,78 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from ..core import StorageManagerActor + + +class StorageWorkerService(AbstractService): + """ + Storage service on worker + + Service Configuration + --------------------- + { + "storage": { + "backends": ["plasma"], + "": "", + } + } + """ + + async def start(self): + storage_configs = self._config["storage"] + backends = storage_configs.get("backends") + options = storage_configs.get("default_config", dict()) + transfer_block_size = options.get("transfer_block_size", None) + backend_config = {} + for backend in backends: + storage_config = storage_configs.get(backend, dict()) + backend_config[backend] = storage_config + if backend == "ray": + # Specifying the supervisor as the Ray owner would be costly when Mars does a shuffle, since there + # would be m*n objects that need the supervisor as their owner. So this is enabled only when autoscaling + # is on, to avoid losing data when scaling in. This limit can be removed once Ray supports ownership transfer. + if ( + self._config.get("scheduling", {}) + .get("autoscale", {}) + .get("enabled", False) + ): + try: + from ...cluster.api import ClusterAPI + + cluster_api = await ClusterAPI.create(self._address) + supervisor_address = (await cluster_api.get_supervisors())[0] + # The Ray storage backend needs to set the supervisor as owner to avoid losing data when a worker dies.
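+ # (Editorial note) this path is only reached with a config shaped roughly like
+ # {"storage": {"backends": ["ray"], ...}, "scheduling": {"autoscale": {"enabled": True}}};
+ # otherwise, or when no supervisor can be found, the worker's own address is
+ # used as the owner below.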
+ owner = supervisor_address + except mo.ActorNotExist: + owner = self._address + else: + owner = self._address + storage_config["owner"] = owner + + await mo.create_actor( + StorageManagerActor, + backend_config, + transfer_block_size, + uid=StorageManagerActor.default_uid(), + address=self._address, + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + address=self._address, uid=StorageManagerActor.default_uid() + ) + ) diff --git a/python/xorbits/_mars/services/subtask/__init__.py b/python/xorbits/_mars/services/subtask/__init__.py new file mode 100644 index 000000000..bc8e89895 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import MockSubtaskAPI, SubtaskAPI +from .core import Subtask, SubtaskGraph, SubtaskResult, SubtaskStatus +from .errors import SlotOccupiedAlready, SubtaskNotExist diff --git a/python/xorbits/_mars/services/subtask/api.py b/python/xorbits/_mars/services/subtask/api.py new file mode 100644 index 000000000..f2c3b83d0 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/api.py @@ -0,0 +1,115 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import oscar as mo +from ...lib.aio import alru_cache +from ...oscar.backends.context import ProfilingContext +from ...oscar.profiling import MARS_ENABLE_PROFILING +from .core import Subtask + + +class SubtaskAPI: + def __init__(self, address: str): + self._address = address + + @classmethod + async def create(cls, address: str) -> "SubtaskAPI": + return SubtaskAPI(address) + + @alru_cache(cache_exceptions=False) + async def _get_runner_ref(self, band_name: str, slot_id: int): + from .worker.runner import SubtaskRunnerActor + + return await mo.actor_ref( + SubtaskRunnerActor.gen_uid(band_name, slot_id), address=self._address + ) + + @alru_cache(cache_exceptions=False) + async def _get_subtask_processor_ref(self, session_id: str, slot_address: str): + from .worker.processor import SubtaskProcessorActor + + return await mo.actor_ref( + SubtaskProcessorActor.gen_uid(session_id), address=slot_address + ) + + async def run_subtask_in_slot(self, band_name: str, slot_id: int, subtask: Subtask): + """ + Run subtask in current worker + + Parameters + ---------- + band_name + subtask + slot_id + + Returns + ------- + + """ + ref = await self._get_runner_ref(band_name, slot_id) + extra_config = subtask.extra_config + enable_profiling = MARS_ENABLE_PROFILING or ( + extra_config and extra_config.get("enable_profiling") + ) + profiling_context = ( + ProfilingContext(task_id=subtask.task_id) if enable_profiling else None + ) + return await ref.run_subtask.options(profiling_context=profiling_context).send( + subtask + ) + + async def cancel_subtask_in_slot(self, band_name: str, slot_id: int): + """ + Cancel subtask running in a worker slot and wait until it is cancelled + + Parameters + ---------- + band_name : str + name of a worker band, for instance, 'numa-0' + slot_id : int + index of a slot in a band + """ + ref = await self._get_runner_ref(band_name, slot_id) + await ref.cancel_subtask() + + async def set_running_operand_progress( + self, session_id: str, op_key: str, slot_address: str, progress: float + ): + ref = await self._get_subtask_processor_ref(session_id, slot_address) + await ref.set_running_op_progress(op_key, progress) + + +class MockSubtaskAPI(SubtaskAPI): + @classmethod + async def create(cls, address: str) -> "SubtaskAPI": + from .worker.manager import SubtaskRunnerManagerActor + + await mo.create_actor( + SubtaskRunnerManagerActor, + address, + None, + uid=SubtaskRunnerManagerActor.default_uid(), + address=address, + ) + return await super().create(address) + + @classmethod + async def cleanup(cls, address: str): + from .worker.manager import SubtaskRunnerManagerActor + + await mo.destroy_actor( + mo.create_actor_ref( + uid=SubtaskRunnerManagerActor.default_uid(), address=address + ) + ) diff --git a/python/xorbits/_mars/services/subtask/core.py b/python/xorbits/_mars/services/subtask/core.py new file mode 100644 index 000000000..a7c862231 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/core.py @@ -0,0 +1,223 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum +from typing import Iterable, List, Optional, Set, Tuple + +from ...core import DAG, ChunkData, ChunkGraph +from ...resource import Resource +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FieldTypes, + Float64Field, + Int32Field, + Int64Field, + ListField, + ReferenceField, + Serializable, + StringField, + TupleField, +) +from ...serialization.serializables.field_type import TupleType +from ...typing import BandType, ChunkType + + +class SubtaskStatus(Enum): + pending = 0 + running = 1 + succeeded = 2 + errored = 3 + cancelled = 4 + + @property + def is_done(self) -> bool: + return self in ( + SubtaskStatus.succeeded, + SubtaskStatus.errored, + SubtaskStatus.cancelled, + ) + + +class Subtask(Serializable): + __slots__ = ("_repr", "_pure_depend_keys", "runtime") + + subtask_id: str = StringField("subtask_id") + subtask_name: str = StringField("subtask_name") + session_id: str = StringField("session_id") + task_id: str = StringField("task_id") + chunk_graph: ChunkGraph = ReferenceField("chunk_graph", ChunkGraph) + expect_bands: List[BandType] = ListField( + "expect_bands", TupleType(FieldTypes.string, FieldTypes.string) + ) + virtual: bool = BoolField("virtual") + retryable: bool = BoolField("retryable") + priority: Tuple[int, int] = TupleField("priority", FieldTypes.int32) + extra_config: dict = DictField("extra_config") + stage_id: str = StringField("stage_id") + # chunks that need meta updated + update_meta_chunks: List[ChunkType] = ListField( + "update_meta_chunks", FieldTypes.reference(ChunkData) + ) + # A unique and deterministic key for subtask compute logic. See logic_key in operator.py. + logic_key: str = StringField("logic_key") + # index for subtask with same compute logic. + logic_index: int = Int32Field("logic_index") + # parallelism for subtask with same compute logic. + logic_parallelism: int = Int32Field("logic_parallelism") + # subtask can only run in specified bands in `expect_bands` + bands_specified: bool = BoolField("bands_specified") + required_resource: Resource = AnyField("required_resource", Resource) + # The count of result chunks that are the stage's results. 
+ stage_n_outputs: int = Int32Field("stage_n_outputs") + + def __init__( + self, + subtask_id: str = None, + session_id: str = None, + task_id: str = None, + chunk_graph: ChunkGraph = None, + subtask_name: str = None, + expect_bands: List[BandType] = None, + priority: Tuple[int, int] = None, + virtual: bool = False, + retryable: bool = True, + extra_config: dict = None, + stage_id: str = None, + update_meta_chunks: List[ChunkType] = None, + logic_key: str = None, + logic_index: int = None, + logic_parallelism: int = None, + bands_specified: bool = False, + required_resource: Resource = None, + stage_n_outputs: int = 0, + ): + super().__init__( + subtask_id=subtask_id, + subtask_name=subtask_name, + session_id=session_id, + task_id=task_id, + chunk_graph=chunk_graph, + expect_bands=expect_bands, + priority=priority, + virtual=virtual, + retryable=retryable, + extra_config=extra_config, + stage_id=stage_id, + update_meta_chunks=update_meta_chunks, + logic_key=logic_key, + logic_index=logic_index, + logic_parallelism=logic_parallelism, + bands_specified=bands_specified, + required_resource=required_resource, + stage_n_outputs=stage_n_outputs, + ) + self._pure_depend_keys = None + self._repr = None + self.runtime = None + + def __on_deserialize__(self): + super(Subtask, self).__on_deserialize__() + self._pure_depend_keys = None + self._repr = None + self.runtime = None + + @property + def expect_band(self): + if self.expect_bands: + return self.expect_bands[0] + + @property + def pure_depend_keys(self) -> Set[str]: + if self._pure_depend_keys is not None: + return self._pure_depend_keys + pure_dep_keys = set() + for n in self.chunk_graph: + pure_dep_keys.update( + inp.key + for inp, pure_dep in zip(n.inputs, n.op.pure_depends) + if pure_dep + ) + self._pure_depend_keys = pure_dep_keys + return pure_dep_keys + + def __repr__(self): + if self._repr is not None: + return self._repr + + if self.chunk_graph: + result_chunk_repr = " ".join( + [ + f"{type(chunk.op).__name__}({chunk.key})" + for chunk in self.chunk_graph.result_chunks + ] + ) + else: # pragma: no cover + result_chunk_repr = None + self._repr = f"" + return self._repr + + +class SubtaskResult(Serializable): + subtask_id: str = StringField("subtask_id") + session_id: str = StringField("session_id") + task_id: str = StringField("task_id") + stage_id: str = StringField("stage_id") + status: SubtaskStatus = ReferenceField("status", SubtaskStatus) + progress: float = Float64Field("progress", default=0.0) + data_size: int = Int64Field("data_size", default=None) + bands: List[BandType] = ListField("band", FieldTypes.tuple, default=None) + error = AnyField("error", default=None) + traceback = AnyField("traceback", default=None) + # The following is the execution information of the subtask + execution_start_time: float = Float64Field("execution_start_time") + execution_end_time: float = Float64Field("execution_end_time") + + def update(self, result: Optional["SubtaskResult"]): + if result and result.bands: + bands = self.bands or [] + self.bands = sorted(set(bands + result.bands)) + self.execution_start_time = result.execution_start_time + if hasattr(result, "execution_end_time"): + self.execution_end_time = result.execution_end_time + return self + + +class SubtaskGraph(DAG, Iterable[Subtask]): + """ + Subtask graph. 
+ """ + + def __init__(self): + super().__init__() + self._proxy_subtasks = [] + + @classmethod + def _extract_operands(cls, node: Subtask): + from ...core.operand import Fetch, FetchShuffle + + for node in node.chunk_graph: + if isinstance(node.op, (Fetch, FetchShuffle)): + continue + yield node.op + + def add_shuffle_proxy_subtask(self, proxy_subtask): + self._proxy_subtasks.append(proxy_subtask) + + def num_shuffles(self) -> int: + return len(self._proxy_subtasks) + + def get_shuffle_proxy_subtasks(self): + return self._proxy_subtasks diff --git a/python/xorbits/_mars/services/subtask/errors.py b/python/xorbits/_mars/services/subtask/errors.py new file mode 100644 index 000000000..c27607ad6 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/errors.py @@ -0,0 +1,21 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class SubtaskNotExist(Exception): + pass + + +class SlotOccupiedAlready(Exception): + pass diff --git a/python/xorbits/_mars/services/subtask/supervisor/__init__.py b/python/xorbits/_mars/services/subtask/supervisor/__init__.py new file mode 100644 index 000000000..62c018d71 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/supervisor/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import EmptyService + + +class SubtaskSupervisorService(EmptyService): + pass diff --git a/python/xorbits/_mars/services/subtask/tests/__init__.py b/python/xorbits/_mars/services/subtask/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/services/subtask/tests/test_service.py b/python/xorbits/_mars/services/subtask/tests/test_service.py new file mode 100644 index 000000000..928aab651 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/tests/test_service.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import time + +import numpy as np +import pytest + +from .... import oscar as mo +from .... import remote as mr +from .... import tensor as mt +from ....core.graph import ChunkGraphBuilder, TileableGraph, TileableGraphBuilder +from ....resource import Resource +from ....utils import Timer +from ... import NodeRole, start_services, stop_services +from ...meta import MetaAPI +from ...session import SessionAPI +from ...storage import MockStorageAPI +from ...task import new_task_id +from ...task.supervisor.manager import TaskManagerActor +from .. import Subtask, SubtaskAPI, SubtaskResult + + +class FakeTaskManager(TaskManagerActor): + def set_subtask_result(self, subtask_result: SubtaskResult): + return + + +def _gen_subtask(t, session_id): + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + + chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + subtask = Subtask(new_task_id(), session_id, new_task_id(), chunk_graph) + + return subtask + + +@pytest.fixture +async def actor_pools(): + async def start_pool(is_worker: bool): + if is_worker: + kw = dict( + n_process=2, + labels=["main"] + ["numa-0"] * 2, + subprocess_start_method="spawn", + ) + else: + kw = dict(n_process=0, subprocess_start_method="spawn") + pool = await mo.create_actor_pool("127.0.0.1", **kw) + await pool.start() + return pool + + try: + sv_pool, worker_pool = await asyncio.gather(start_pool(False), start_pool(True)) + yield sv_pool, worker_pool + finally: + await asyncio.gather(sv_pool.stop(), worker_pool.stop()) + + +@pytest.mark.asyncio +async def test_subtask_service(actor_pools): + sv_pool, worker_pool = actor_pools + + config = { + "services": [ + "cluster", + "session", + "meta", + "lifecycle", + "scheduling", + "subtask", + "task", + "mutable", + ], + "cluster": { + "backend": "fixed", + "lookup_address": sv_pool.external_address, + "resource": {"numa-0": Resource(num_cpus=2)}, + }, + "meta": {"store": "dict"}, + "scheduling": {}, + "subtask": {}, + } + await start_services(NodeRole.SUPERVISOR, config, address=sv_pool.external_address) + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + session_id = "test_session" + session_api = await SessionAPI.create(sv_pool.external_address) + await session_api.create_session(session_id) + ref = await mo.actor_ref( + FakeTaskManager.gen_uid(session_id), address=sv_pool.external_address + ) + await mo.destroy_actor(ref) + await mo.create_actor( + FakeTaskManager, + session_id, + uid=FakeTaskManager.gen_uid(session_id), + address=sv_pool.external_address, + ) + + subtask_api = await 
SubtaskAPI.create(worker_pool.external_address) + # create mock meta and storage APIs + meta_api = await MetaAPI.create(session_id, sv_pool.external_address) + storage_api = await MockStorageAPI.create(session_id, worker_pool.external_address) + + a = mt.ones((10, 10), chunk_size=10) + b = a + 1 + + subtask = _gen_subtask(b, session_id) + assert "TensorAdd" in repr(subtask) + await subtask_api.run_subtask_in_slot("numa-0", 0, subtask) + + # check storage + expected = np.ones((10, 10)) + 1 + result_key = subtask.chunk_graph.results[0].key + result = await storage_api.get(result_key) + np.testing.assert_array_equal(expected, result) + + # check meta + chunk_meta = await meta_api.get_chunk_meta(result_key) + assert chunk_meta is not None + assert chunk_meta["bands"][0] == (worker_pool.external_address, "numa-0") + + def sleep(timeout: int): + time.sleep(timeout) + return timeout + + b = mr.spawn(sleep, 1) + + subtask2 = _gen_subtask(b, session_id) + asyncio.create_task(subtask_api.run_subtask_in_slot("numa-0", 0, subtask2)) + await asyncio.sleep(0.2) + with Timer() as timer: + # normal cancel by cancel asyncio Task + await asyncio.wait_for( + subtask_api.cancel_subtask_in_slot("numa-0", 0), timeout=2 + ) + # need 1 sec to reach timeout, then killing actor and wait for auto recovering + # the time would not be over 5 sec + assert timer.duration < 2 + + await MockStorageAPI.cleanup(worker_pool.external_address) + await stop_services(NodeRole.WORKER, config, address=worker_pool.external_address) + await stop_services(NodeRole.SUPERVISOR, config, address=sv_pool.external_address) diff --git a/python/xorbits/_mars/services/subtask/utils.py b/python/xorbits/_mars/services/subtask/utils.py new file mode 100644 index 000000000..f71893933 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/utils.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, Dict, Iterator, List, Tuple + +from ...core import ChunkGraph +from ...core.operand import Fetch, FetchShuffle, MapReduceOperand, VirtualOperand +from .core import Subtask + + +def iter_input_data_keys( + subtask: Subtask, + chunk_graph: ChunkGraph, + chunk_key_to_data_keys: Dict[str, List[str]], +) -> Iterator[Tuple[str, bool]]: + """An iterator that yields (input data key, is shuffle) pairs.""" + data_keys = set() + for chunk in chunk_graph.iter_indep(): + if isinstance(chunk.op, Fetch) and chunk.key not in subtask.pure_depend_keys: + data_keys.add(chunk.key) + yield chunk.key, False + elif isinstance(chunk.op, FetchShuffle): + for key in chunk_key_to_data_keys[chunk.key]: + if key not in data_keys: + data_keys.add(key) + yield key, True + + +def get_mapper_data_keys(key: str, context: Dict[str, Any]) -> List[str]: + """Get the mapper data keys of `key` from the context.""" + return [ + store_key + for store_key in context + if isinstance(store_key, tuple) and store_key[0] == key + ] + + +def iter_output_data( + chunk_graph: ChunkGraph, context: Dict[str, Any] +) -> Iterator[Tuple[str, Any, bool]]: + """An iterator that yields (output chunk key, output data, is shuffle) tuples.""" + data_keys = set() + for result_chunk in chunk_graph.result_chunks: + # skip virtual operands for result chunks + if isinstance(result_chunk.op, VirtualOperand): + continue + key = result_chunk.key + if key in context: + # non shuffle op + data = context[key] + # update meta + if not isinstance(data, tuple): + result_chunk.params = result_chunk.get_params_from_data(data) + # check key after update meta + if key in data_keys: + continue + yield key, data, False + data_keys.add(key) + else: + assert isinstance(result_chunk.op, MapReduceOperand) + keys = get_mapper_data_keys(key, context) + for key in keys: + if key in data_keys: + continue + # shuffle op + data = context[key] + yield key, data, True + data_keys.add(key) diff --git a/python/xorbits/_mars/services/subtask/worker/__init__.py b/python/xorbits/_mars/services/subtask/worker/__init__.py new file mode 100644 index 000000000..41ac782f1 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +from typing import Type + +from .... import oscar as mo +from ....oscar.backends.allocate_strategy import IdleLabel +from .runner import SubtaskRunnerActor + + +class SubtaskRunnerManagerActor(mo.Actor): + def __init__(self, worker_address: str, subtask_processor_cls: Type): + # specify subtask process class + # for test purpose + self._worker_address = worker_address + self._subtask_processor_cls = subtask_processor_cls + self._cluster_api = None + + self._band_slot_runner_refs = dict() + + async def __post_create__(self): + from ...cluster.api import ClusterAPI + + self._cluster_api = await ClusterAPI.create(self.address) + + band_to_resource = await self._cluster_api.get_bands() + for band, resource in band_to_resource.items(): + await self._create_band_runner_actors( + band[1], int(resource.num_cpus or resource.num_gpus) + ) + + async def _create_band_runner_actors(self, band_name: str, n_slots: int): + strategy = IdleLabel(band_name, "subtask_runner") + band = (self.address, band_name) + for slot_id in range(n_slots): + self._band_slot_runner_refs[(band_name, slot_id)] = await mo.create_actor( + SubtaskRunnerActor, + band, + worker_address=self._worker_address, + subtask_processor_cls=self._subtask_processor_cls, + uid=SubtaskRunnerActor.gen_uid(band_name, slot_id), + address=self.address, + allocate_strategy=strategy, + ) + + async def __pre_destroy__(self): + await asyncio.gather( + *[mo.destroy_actor(ref) for ref in self._band_slot_runner_refs.values()] + ) diff --git a/python/xorbits/_mars/services/subtask/worker/processor.py b/python/xorbits/_mars/services/subtask/worker/processor.py new file mode 100644 index 000000000..1f8242695 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/processor.py @@ -0,0 +1,763 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import sys +import time +from collections import defaultdict +from typing import Any, Dict, List, Optional, Set, Tuple, Type + +from .... 
import oscar as mo +from ....core import ChunkGraph, ExecutionError, OperandType, enter_mode +from ....core.context import get_context +from ....core.operand import Fetch, FetchShuffle, execute +from ....lib.aio import alru_cache +from ....metrics import Metrics +from ....optimization.physical import optimize +from ....serialization import AioSerializer +from ....typing import BandType, ChunkType +from ....utils import calc_data_size, get_chunk_key_to_data_keys +from ...context import ThreadedServiceContext +from ...meta.api import MetaAPI, WorkerMetaAPI +from ...session import SessionAPI +from ...storage import StorageAPI +from ...task import TaskAPI, task_options +from ..core import Subtask, SubtaskResult, SubtaskStatus +from ..utils import get_mapper_data_keys, iter_input_data_keys, iter_output_data + +logger = logging.getLogger(__name__) + + +class ProcessorContext(dict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._current_chunk = None + + def __getattr__(self, attr): + ctx = get_context() + return getattr(ctx, attr) + + def set_current_chunk(self, chunk: ChunkType): + """Set current executing chunk.""" + self._current_chunk = chunk + + def get_current_chunk(self) -> ChunkType: + """Get current executing chunk.""" + return self._current_chunk + + +BASIC_META_FIELDS = ["memory_size", "store_size", "bands", "object_ref"] + + +class SubtaskProcessor: + _chunk_graph: ChunkGraph + _chunk_key_to_data_keys: Dict[str, List[str]] + + def __init__( + self, + subtask: Subtask, + session_api: SessionAPI, + storage_api: StorageAPI, + meta_api: MetaAPI, + worker_meta_api: WorkerMetaAPI, + band: BandType, + supervisor_address: str, + engines: List[str] = None, + ): + self.subtask = subtask + self._session_id = self.subtask.session_id + self._chunk_graph = subtask.chunk_graph + self._actual_chunk_count = len( + [ + chunk + for chunk in subtask.chunk_graph + if not isinstance(chunk.op, (Fetch, FetchShuffle)) + ] + ) + self._band = band + self._supervisor_address = supervisor_address + self._engines = engines if engines is not None else task_options.runtime_engines + + # result + self.result = SubtaskResult( + subtask_id=subtask.subtask_id, + session_id=subtask.session_id, + task_id=subtask.task_id, + stage_id=subtask.stage_id, + status=SubtaskStatus.pending, + bands=[self._band], + progress=0.0, + execution_start_time=time.time(), + ) + self.is_done = asyncio.Event() + + # status and intermediate states + # operand progress, from op key to progress + self._op_progress: Dict[str, float] = defaultdict(lambda: 0.0) + # temp data store that holds chunk data during computation + self._processor_context = ProcessorContext() + # chunk key to real data keys + self._chunk_key_to_data_keys = dict() + + # other service APIs + self._session_api = session_api + self._storage_api = storage_api + self._meta_api = meta_api + self._worker_meta_api = worker_meta_api + + # add metrics + self._subtask_execution_time = Metrics.gauge( + "mars.subtask_execution_time_secs", + "Time consuming in seconds to execute a subtask", + ("session_id", "subtask_id"), + ) + + @property + def status(self): + return self.result.status + + @property + def subtask_id(self): + return self.subtask.subtask_id + + async def _load_input_data(self): + keys, gets, accept_nones = [], [], [] + for key, is_shuffle in iter_input_data_keys( + self.subtask, self._chunk_graph, self._chunk_key_to_data_keys + ): + keys.append(key) + accept_nones.append(not is_shuffle) + gets_params = {"error": "ignore"} if is_shuffle 
else {} + gets.append(self._storage_api.get.delay(key, **gets_params)) + if keys: + logger.debug( + "Start getting input data, keys: %.500s, subtask id: %s", + keys, + self.subtask.subtask_id, + ) + inputs = await self._storage_api.get.batch(*gets) + self._processor_context.update( + { + key: get + for key, get, accept_none in zip(keys, inputs, accept_nones) + if accept_none or get is not None + } + ) + logger.debug( + "Finish getting input data keys: %.500s, subtask id: %s", + keys, + self.subtask.subtask_id, + ) + return keys + + @staticmethod + async def notify_task_manager_result( + supervisor_address: str, result: SubtaskResult + ): + task_api = await TaskAPI.create(result.session_id, supervisor_address) + # notify task service + await task_api.set_subtask_result(result) + + def _init_ref_counts(self) -> Dict[str, int]: + chunk_graph = self._chunk_graph + ref_counts = defaultdict(lambda: 0) + # set 1 for result chunks + for result_chunk in chunk_graph.result_chunks: + ref_counts[result_chunk.key] += 1 + # iter graph to set ref counts + for chunk in chunk_graph: + ref_counts[chunk.key] += chunk_graph.count_successors(chunk) + return ref_counts + + async def _async_execute_operand(self, ctx: Dict[str, Any], op: OperandType): + if not isinstance(op, (Fetch, FetchShuffle)): + self._op_progress[op.key] = 0.0 + get_context().set_running_operand_key(self._session_id, op.key) + return asyncio.to_thread(self._execute_operand, ctx, op) + + def set_op_progress(self, op_key: str, progress: float): + if op_key in self._op_progress: # pragma: no branch + self._op_progress[op_key] = progress + + @enter_mode(build=False, kernel=True) + def _execute_operand( + self, ctx: Dict[str, Any], op: OperandType + ): # noqa: R0201 # pylint: disable=no-self-use + try: + return execute(ctx, op) + except BaseException as ex: + # wrap exception in execution to avoid side effects + raise ExecutionError(ex).with_traceback(ex.__traceback__) from None + + async def _execute_graph(self, chunk_graph: ChunkGraph): + loop = asyncio.get_running_loop() + ref_counts = self._init_ref_counts() + + # from data_key to results + for chunk in chunk_graph.topological_iter(): + if chunk.key not in self._processor_context: + # since `op.execute` may be a time-consuming operation, + # we make it run in a thread pool to not block current thread. 
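+ # (Editorial note) `_async_execute_operand` returns an `asyncio.to_thread(...)`
+ # coroutine, so the `asyncio.create_task(await ...)` call below runs the blocking
+ # `op.execute` on the default thread-pool executor while this coroutine keeps the
+ # event loop free, e.g. to observe cancellation.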
+ logger.debug( + "Start executing operand: %s, chunk: %s, subtask id: %s", + chunk.op, + chunk, + self.subtask.subtask_id, + ) + self._processor_context.set_current_chunk(chunk) + future = asyncio.create_task( + await self._async_execute_operand(self._processor_context, chunk.op) + ) + to_wait = loop.create_future() + + def cb(fut): + if not to_wait.done(): + if fut.exception(): + to_wait.set_exception(fut.exception()) + else: + to_wait.set_result(fut.result()) + + future.add_done_callback(cb) + + try: + await to_wait + logger.debug( + "Finish executing operand: %s, chunk: %s, subtask id: %s", + chunk.op, + chunk, + self.subtask.subtask_id, + ) + except asyncio.CancelledError: + logger.debug( + "Receive cancel instruction for operand: %s," + "chunk: %s, subtask id: %s", + chunk.op, + chunk, + self.subtask.subtask_id, + ) + # wait for this computation to finish + await future + # if cancelled, stop next computation + logger.debug( + "Cancelled operand: %s, chunk: %s, subtask id: %s", + chunk.op, + chunk, + self.subtask.subtask_id, + ) + self.result.status = SubtaskStatus.cancelled + raise + + self.set_op_progress(chunk.op.key, 1.0) + + for inp in chunk_graph.iter_predecessors(chunk): + ref_counts[inp.key] -= 1 + if ref_counts[inp.key] == 0: + # ref count reaches 0, remove it + for key in self._chunk_key_to_data_keys[inp.key]: + if key in self._processor_context: + del self._processor_context[key] + + async def _unpin_data(self, data_keys): + # unpin input keys + unpins = [] + shuffle_unpins = [] + for key in data_keys: + if isinstance(key, tuple): + # a tuple key means it's a shuffle key, + # some shuffle data is None and not stored in storage + shuffle_unpins.append( + self._storage_api.unpin.delay(key, error="ignore") + ) + else: + unpins.append(self._storage_api.unpin.delay(key)) + if unpins: + await self._storage_api.unpin.batch(*unpins) + if shuffle_unpins: + # TODO(hks): The batch method doesn't accept different error arguments, + # combine them when it can. 
+ await self._storage_api.unpin.batch(*shuffle_unpins) + + async def _store_data(self, chunk_graph: ChunkGraph): + # store data into storage + data_key_to_puts = {} + shuffle_key_to_data = {} + is_storage_seekable = await self._storage_api.is_seekable() + for key, data, _ in iter_output_data(chunk_graph, self._processor_context): + if isinstance(key, tuple) and is_storage_seekable: + shuffle_key_to_data[key] = data + else: + put = self._storage_api.put.delay(key, data) + data_key_to_puts[key] = put + + stored_keys = list(data_key_to_puts.keys()) + puts = data_key_to_puts.values() + logger.debug( + "Start putting data keys: %s, subtask id: %s", + stored_keys, + self.subtask.subtask_id, + ) + data_key_to_store_size = dict() + data_key_to_memory_size = dict() + data_key_to_object_id = dict() + if puts: + put_infos = asyncio.create_task(self._storage_api.put.batch(*puts)) + try: + store_infos = await put_infos + for store_key, store_info in zip(stored_keys, store_infos): + data_key_to_store_size[store_key] = store_info.store_size + data_key_to_memory_size[store_key] = store_info.memory_size + data_key_to_object_id[store_key] = store_info.object_id + logger.debug( + "Finish putting data keys: %s, subtask id: %s", + stored_keys, + self.subtask.subtask_id, + ) + except asyncio.CancelledError: + logger.debug( + "Cancelling put data keys: %s, subtask id: %s", + stored_keys, + self.subtask.subtask_id, + ) + put_infos.cancel() + + logger.debug( + "Cancelled put data keys: %s, subtask id: %s", + stored_keys, + self.subtask.subtask_id, + ) + self.result.status = SubtaskStatus.cancelled + raise + + if shuffle_key_to_data: + await self._store_mapper_data( + shuffle_key_to_data, + data_key_to_store_size, + data_key_to_memory_size, + data_key_to_object_id, + ) + # clear data + self._processor_context = ProcessorContext() + return ( + stored_keys, + data_key_to_store_size, + data_key_to_memory_size, + data_key_to_object_id, + ) + + async def _write_aggregated_mapper_data( + self, key_and_band: Tuple, objects: List, data_keys: List + ): + serialization_tasks = [AioSerializer(obj).run() for obj in objects] + + def calc_memory_size(objs): + return sum(calc_data_size(obj) for obj in objs) + + memory_size = await asyncio.to_thread(calc_memory_size, objects) + + buffer_list = await asyncio.gather(*serialization_tasks) + sizes = [ + sum(b.size if hasattr(b, "size") else len(b) for b in buf) + for buf in buffer_list + ] + writer = await self._storage_api.open_writer(key_and_band, sum(sizes)) + offset = 0 + for buffers, size, data_key in zip(buffer_list, sizes, data_keys): + for buf in buffers: + await writer.write(buf) + writer.commit_once(data_key, offset, size) + offset += size + await writer.close() + return key_and_band, memory_size, sum(sizes), writer._object_id + + async def _store_mapper_data( + self, + shuffle_key_to_data: Dict, + data_key_to_store_size: Dict, + data_key_to_memory_size: Dict, + data_key_to_object_id: Dict, + ): + band_to_mapper_key = defaultdict(list) + for result_chunk in self._chunk_graph.result_chunks: + map_reduce_id = getattr(result_chunk, "extra_params", dict()).get( + "analyzer_map_reduce_id" + ) + if map_reduce_id is None: + continue + reducer_index_to_bands = await self._gen_reducer_index_to_bands( + self._session_id, + self._supervisor_address, + self.subtask.task_id, + map_reduce_id, + ) + for reducer_index, band in reducer_index_to_bands.items(): + # mapper key is a tuple + band_to_mapper_key[(result_chunk.key, band)].append( + (result_chunk.key, reducer_index) + ) + + 
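+ # (Editorial note) mapper outputs destined for the same (result chunk key, band)
+ # pair are serialized and written through a single writer in
+ # `_write_aggregated_mapper_data`; `commit_once` records each reducer key's
+ # (offset, size) within the aggregated block.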
write_tasks = [] + for key_and_band, shuffle_keys in band_to_mapper_key.items(): + objects = [shuffle_key_to_data[key] for key in shuffle_keys] + write_tasks.append( + self._write_aggregated_mapper_data(key_and_band, objects, shuffle_keys) + ) + infos = await asyncio.gather(*write_tasks) + for key, memory_size, store_size, object_id in infos: + data_key_to_memory_size[key] = memory_size + data_key_to_store_size[key] = store_size + data_key_to_object_id[key] = object_id + + async def _store_meta( + self, + chunk_graph: ChunkGraph, + data_key_to_store_size: Dict, + data_key_to_memory_size: Dict, + data_key_to_object_id: Dict, + update_meta_chunks: Set[ChunkType], + ): + # store meta + set_chunk_metas = [] + set_worker_chunk_metas = [] + result_data_size = 0 + set_meta_keys = [] + for result_chunk in chunk_graph.result_chunks: + chunk_key = result_chunk.key + set_meta_keys.append(chunk_key) + if chunk_key in data_key_to_store_size: + # normal chunk + store_size = data_key_to_store_size[chunk_key] + memory_size = data_key_to_memory_size[chunk_key] + result_data_size += memory_size + object_ref = data_key_to_object_id[chunk_key] + else: + # mapper chunk + mapper_keys = get_mapper_data_keys(chunk_key, data_key_to_store_size) + store_size = sum(data_key_to_store_size[k] for k in mapper_keys) + memory_size = sum(data_key_to_memory_size[k] for k in mapper_keys) + # Skip meta for shuffle + object_ref = None + # for worker, if chunk in update_meta_chunks + # save meta including dtypes_value etc, otherwise, + # save basic meta only + if result_chunk in update_meta_chunks: + set_worker_chunk_metas.append( + self._worker_meta_api.set_chunk_meta.delay( + result_chunk, + memory_size=memory_size, + store_size=store_size, + bands=[self._band], + chunk_key=chunk_key, + exclude_fields=["object_ref"], + ) + ) + # for supervisor, only save basic meta that is small like memory_size etc + set_chunk_metas.append( + self._meta_api.set_chunk_meta.delay( + result_chunk, + memory_size=memory_size, + store_size=store_size, + bands=[self._band], + chunk_key=chunk_key, + object_ref=object_ref, + fields=BASIC_META_FIELDS, + ) + ) + logger.debug( + "Start storing chunk metas for data keys: %s, subtask id: %s", + set_meta_keys, + self.subtask.subtask_id, + ) + if set_chunk_metas: + f = asyncio.get_running_loop().create_future() + + async def set_chunks_meta(): + coros = [] + if set_worker_chunk_metas: + coros.append( + self._worker_meta_api.set_chunk_meta.batch( + *set_worker_chunk_metas + ) + ) + coros.append(self._meta_api.set_chunk_meta.batch(*set_chunk_metas)) + await asyncio.gather(*coros) + logger.debug( + "Finish store chunk metas for data keys: %s, subtask id: %s", + set_meta_keys, + self.subtask.subtask_id, + ) + f.set_result(None) + + try: + # Since we don't delete chunk data on this worker, + # we need to ensure chunk meta are recorded + # in meta service, so that `processor.decref_stage` + # can delete the chunk data finally. 
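+ # (Editorial note) `asyncio.shield` lets `set_chunks_meta()` keep running even if
+ # this coroutine is cancelled; the handler below then awaits the future `f`, which
+ # is only resolved at the end of `set_chunks_meta`, so the metas are fully recorded
+ # before the cancellation propagates.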
+ await asyncio.shield(set_chunks_meta()) + except asyncio.CancelledError: # pragma: no cover + await f + raise + # set result data size + self.result.data_size = result_data_size + + @classmethod + @alru_cache(cache_exceptions=False) + async def _gen_reducer_index_to_bands( + cls, session_id: str, supervisor_address: str, task_id: str, map_reduce_id: int + ) -> Dict[Tuple[int], BandType]: + task_api = await TaskAPI.create(session_id, supervisor_address) + map_reduce_info = await task_api.get_map_reduce_info(task_id, map_reduce_id) + assert len(map_reduce_info.reducer_indexes) == len( + map_reduce_info.reducer_bands + ) + return { + reducer_index: band + for reducer_index, band in zip( + map_reduce_info.reducer_indexes, map_reduce_info.reducer_bands + ) + } + + async def done(self): + if self.result.status == SubtaskStatus.running: + self.result.status = SubtaskStatus.succeeded + # Only update end time when subtask succeeded + self.result.execution_end_time = time.time() + self.result.progress = 1.0 + self.is_done.set() + + async def run(self): + self.result.status = SubtaskStatus.running + input_keys = None + unpinned = False + try: + raw_result_chunks = list(self._chunk_graph.result_chunks) + chunk_graph = optimize(self._chunk_graph, self._engines) + self._chunk_key_to_data_keys = get_chunk_key_to_data_keys(chunk_graph) + report_progress = asyncio.create_task(self.report_progress_periodically()) + + result_chunk_to_optimized = { + c: o for c, o in zip(raw_result_chunks, chunk_graph.result_chunks) + } + raw_update_meta_chunks = self.subtask.update_meta_chunks + if raw_update_meta_chunks is None: + raw_update_meta_chunks = raw_result_chunks + update_meta_chunks = { + result_chunk_to_optimized[c] for c in raw_update_meta_chunks + } + + # load inputs data + input_keys = await self._load_input_data() + try: + # execute chunk graph + await self._execute_graph(chunk_graph) + finally: + # unpin inputs data + unpinned = True + await self._unpin_data(input_keys) + # store results data + ( + stored_keys, + store_sizes, + memory_sizes, + data_key_to_object_id, + ) = await self._store_data(chunk_graph) + # store meta + await self._store_meta( + chunk_graph, + store_sizes, + memory_sizes, + data_key_to_object_id, + update_meta_chunks, + ) + except asyncio.CancelledError: + self.result.status = SubtaskStatus.cancelled + self.result.progress = 1.0 + raise + except ( + BaseException + ) as ex: # noqa: E722 # nosec # pylint: disable=bare-except + self.result.status = SubtaskStatus.errored + self.result.progress = 1.0 + if isinstance(ex, ExecutionError): + self.result.error = ex.nested_error + self.result.traceback = ex.nested_error.__traceback__ + else: # pragma: no cover + _, self.result.error, self.result.traceback = sys.exc_info() + await self.done() + raise + finally: + if input_keys is not None and not unpinned: + await self._unpin_data(input_keys) + + await self.done() + if self.result.status == SubtaskStatus.succeeded: + cost_time_secs = ( + self.result.execution_end_time - self.result.execution_start_time + ) + logger.info( + "Time consuming to execute a subtask is %ss with session_id %s, subtask_id %s", + cost_time_secs, + self._session_id, + self.subtask.subtask_id, + ) + self._subtask_execution_time.record( + cost_time_secs, + {"session_id": self._session_id, "subtask_id": self.subtask.subtask_id}, + ) + report_progress.cancel() + try: + await report_progress + except asyncio.CancelledError: + pass + return self.result + + async def report_progress_periodically(self, interval=0.5, eps=0.001): + 
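+ # (Editorial note) poll the aggregated operand progress every `interval` seconds
+ # and push an update to the task manager whenever it has moved by at least `eps`
+ # since the previous poll.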
last_progress = self.result.progress + while not self.result.status.is_done: + size = self._actual_chunk_count + progress = sum(self._op_progress.values()) / size + assert progress <= 1 + self.result.progress = progress + if abs(last_progress - progress) >= eps: + # report progress + if not self.result.status.is_done: + fut = self.notify_task_manager_result( + self._supervisor_address, self.result + ) + if fut: + await fut + await asyncio.sleep(interval) + last_progress = progress + + +class SubtaskProcessorActor(mo.Actor): + _session_api: Optional[SessionAPI] + _storage_api: Optional[StorageAPI] + _meta_api: Optional[MetaAPI] + _worker_meta_api: Optional[WorkerMetaAPI] + _processor: Optional[SubtaskProcessor] + _last_processor: Optional[SubtaskProcessor] + _running_aio_task: Optional[asyncio.Task] + + def __init__( + self, + session_id: str, + band: BandType, + supervisor_address: str, + worker_address: str, + subtask_processor_cls: Type[SubtaskProcessor], + ): + self._session_id = session_id + self._band = band + self._supervisor_address = supervisor_address + self._worker_address = worker_address + self._subtask_processor_cls = subtask_processor_cls + + # current processor + self._processor = None + self._last_processor = None + self._running_aio_task = None + + self._session_api = None + self._storage_api = None + self._meta_api = None + self._worker_meta_api = None + + @classmethod + def gen_uid(cls, session_id: str): + return f"{session_id}_subtask_processor" + + async def __post_create__(self): + coros = [ + SessionAPI.create(self._supervisor_address), + StorageAPI.create(self._session_id, self.address, self._band[1]), + MetaAPI.create(self._session_id, self._supervisor_address), + WorkerMetaAPI.create(self._session_id, self.address), + ] + coros = [asyncio.ensure_future(coro) for coro in coros] + await asyncio.gather(*coros) + self._session_api, self._storage_api, self._meta_api, self._worker_meta_api = [ + coro.result() for coro in coros + ] + + async def _init_context(self, session_id: str) -> ThreadedServiceContext: + loop = asyncio.get_running_loop() + context = ThreadedServiceContext( + session_id, + self._supervisor_address, + self._worker_address, + self.address, + loop, + band=self._band, + ) + await context.init() + return context + + async def run(self, subtask: Subtask): + logger.info( + "Start to run subtask: %r on %s. 
chunk graph contains %s", + subtask, + self.address, + [c for c in subtask.chunk_graph], + ) + + assert subtask.session_id == self._session_id + + # init context + ctx = await self._init_context(self._session_id) + with ctx: + processor = self._subtask_processor_cls( + subtask, + self._session_api, + self._storage_api, + self._meta_api, + self._worker_meta_api, + self._band, + self._supervisor_address, + ) + self._processor = self._last_processor = processor + self._running_aio_task = asyncio.create_task(processor.run()) + try: + result = yield self._running_aio_task + logger.info("Finished subtask: %s", subtask.subtask_id) + raise mo.Return(result) + finally: + self._processor = self._running_aio_task = None + + async def wait(self): + return self._processor.is_done.wait() + + async def result(self): + return self._last_processor.result + + async def cancel(self): + logger.info("Cancelling subtask: %s", self._processor.subtask_id) + + aio_task = self._running_aio_task + aio_task.cancel() + + async def waiter(): + try: + await aio_task + except asyncio.CancelledError: + pass + + # return asyncio task to not block current actor + return waiter() + + def get_running_subtask_id(self): + return self._processor.subtask_id + + def set_running_op_progress(self, op_key: str, progress: float): + self._processor.set_op_progress(op_key, progress) diff --git a/python/xorbits/_mars/services/subtask/worker/runner.py b/python/xorbits/_mars/services/subtask/worker/runner.py new file mode 100644 index 000000000..8fa3c1263 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/runner.py @@ -0,0 +1,143 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import importlib +import logging +from typing import Dict, Optional, Type + +from .... 
import oscar as mo +from ....lib.aio import alru_cache +from ....typing import BandType +from ...cluster import ClusterAPI +from ..core import Subtask, SubtaskResult +from ..errors import SlotOccupiedAlready +from .processor import SubtaskProcessor, SubtaskProcessorActor + +logger = logging.getLogger(__name__) + + +SubtaskRunnerRef = mo.ActorRefType["SubtaskRunnerActor"] + + +class SubtaskRunnerActor(mo.Actor): + _session_id_to_processors: Dict[str, mo.ActorRefType[SubtaskProcessorActor]] + _running_processor: Optional[mo.ActorRefType[SubtaskProcessorActor]] + _last_processor: Optional[mo.ActorRefType[SubtaskProcessorActor]] + + @classmethod + def gen_uid(cls, band_name: str, slot_id: int): + return f"slot_{band_name}_{slot_id}_subtask_runner" + + def __init__( + self, band: BandType, worker_address: str, subtask_processor_cls: Type = None + ): + self._band = band + self._worker_address = worker_address + self._subtask_processor_cls = self._get_subtask_processor_cls( + subtask_processor_cls + ) + + self._cluster_api = None + + self._session_id_to_processors = dict() + self._running_processor = None + self._last_processor = None + + async def __post_create__(self): + self._cluster_api = await ClusterAPI.create(address=self.address) + + async def __pre_destroy__(self): + try: + await asyncio.gather( + *[ + mo.destroy_actor(ref) + for ref in self._session_id_to_processors.values() + ] + ) + except mo.ActorNotExist: # pragma: no cover + # deleted, ignore + pass + + @classmethod + def _get_subtask_processor_cls(cls, subtask_processor_cls): + if subtask_processor_cls is None: + return SubtaskProcessor + else: + assert isinstance(subtask_processor_cls, str) + module, class_name = subtask_processor_cls.rsplit(".", 1) + return getattr(importlib.import_module(module), class_name) + + async def _run_subtask(self, subtask: Subtask): + processor = await self._init_subtask_processor(subtask) + self._subtask_info.processor = processor + return await processor.run() + + @alru_cache(cache_exceptions=False) + async def _get_supervisor_address(self, session_id: str): + [address] = await self._cluster_api.get_supervisors_by_keys([session_id]) + return address + + async def run_subtask(self, subtask: Subtask): + if self._running_processor is not None: # pragma: no cover + running_subtask_id = await self._running_processor.get_running_subtask_id() + # current subtask is still running + raise SlotOccupiedAlready( + f"There is subtask(id: {running_subtask_id}) running in {self.uid} " + f"at {self.address}, cannot run subtask {subtask.subtask_id}" + ) + + session_id = subtask.session_id + supervisor_address = await self._get_supervisor_address(session_id) + if session_id not in self._session_id_to_processors: + try: + self._session_id_to_processors[session_id] = await mo.create_actor( + SubtaskProcessorActor, + session_id, + self._band, + supervisor_address, + self._worker_address, + self._subtask_processor_cls, + uid=SubtaskProcessorActor.gen_uid(session_id), + address=self.address, + ) + except mo.ActorAlreadyExist: + # when recovering actor pools, the actor created in sub pools + # may be recovered already + self._session_id_to_processors[session_id] = await mo.actor_ref( + uid=SubtaskProcessorActor.gen_uid(session_id), + address=self.address, + ) + processor = self._session_id_to_processors[session_id] + try: + self._running_processor = self._last_processor = processor + result = yield self._running_processor.run(subtask) + finally: + self._running_processor = None + raise mo.Return(result) + + async def 
get_subtask_result(self) -> SubtaskResult: + return self._last_processor.result() + + def is_runner_free(self): + return self._running_processor is None + + async def cancel_subtask(self): + if self._running_processor is None: + return + running_subtask_id = await self._running_processor.get_running_subtask_id() + logger.info("Start to cancel subtask %s.", running_subtask_id) + await self._running_processor.cancel() + self._running_processor = None + logger.info("Canceled subtask %s.", running_subtask_id) diff --git a/python/xorbits/_mars/services/subtask/worker/service.py b/python/xorbits/_mars/services/subtask/worker/service.py new file mode 100644 index 000000000..2a45f9c7f --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/service.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .manager import SubtaskRunnerManagerActor + + +class SubtaskWorkerService(AbstractService): + """ + Subtask service on worker. + + Service Configuration + --------------------- + { + "subtask" : { + + } + } + """ + + async def start(self): + subtask_config = self._config.get("subtask", dict()) + subtask_processor_cls = subtask_config.get("subtask_processor_cls") + await mo.create_actor( + SubtaskRunnerManagerActor, + worker_address=self._address, + subtask_processor_cls=subtask_processor_cls, + address=self._address, + uid=SubtaskRunnerManagerActor.default_uid(), + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + uid=SubtaskRunnerManagerActor.default_uid(), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/subtask/worker/tests/__init__.py b/python/xorbits/_mars/services/subtask/worker/tests/__init__.py new file mode 100644 index 000000000..76a74ffc0 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/tests/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .subtask_processor import CheckedSubtaskProcessor diff --git a/python/xorbits/_mars/services/subtask/worker/tests/subtask_processor.py b/python/xorbits/_mars/services/subtask/worker/tests/subtask_processor.py new file mode 100644 index 000000000..823a69031 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/tests/subtask_processor.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. 
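# A minimal usage sketch of the slot model implemented by SubtaskRunnerActor and
# SubtaskRunnerManagerActor above: every (band, slot) pair owns one runner actor,
# addressed via gen_uid(band_name, slot_id), and a runner executes at most one
# subtask at a time (a second call raises SlotOccupiedAlready). This assumes a
# worker pool with the subtask service already started; `subtask` and
# `worker_address` are placeholders, and the import paths follow the package
# layout in this patch.
from xorbits._mars import oscar as mo
from xorbits._mars.services.subtask.worker.runner import (
    SubtaskRunnerActor,
    SubtaskRunnerRef,
)


async def run_on_slot(subtask, worker_address: str):
    # resolve the runner bound to slot 0 of band "numa-0" on this worker
    runner: SubtaskRunnerRef = await mo.actor_ref(
        SubtaskRunnerActor.gen_uid("numa-0", 0), address=worker_address
    )
    await runner.run_subtask(subtask)
    return await runner.get_subtask_result()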
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict + +from .....core import OperandType +from .....tests.core import ObjectCheckMixin, _check_args +from ...worker.processor import SubtaskProcessor + + +class CheckStorageAPI: + def __init__(self, storage_api): + self._storage_api = storage_api + self._put_data_keys = set() + + def __getattr__(self, item): + return getattr(self._storage_api, item) + + @property + def put(self): + owner = self + put = self._storage_api.put + + class _PutWrapper: + def delay(self, data_key: str, obj: object, level=None): + if data_key in owner._put_data_keys: + raise Exception(f"Duplicate data put: {data_key}, obj: {obj}") + else: + owner._put_data_keys.add(data_key) + return put.delay(data_key, obj, level) + + def __getattr__(self, item): + return getattr(put, item) + + return _PutWrapper() + + +class CheckedSubtaskProcessor(ObjectCheckMixin, SubtaskProcessor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + check_options = dict() + if self.subtask.extra_config: + kwargs = self.subtask.extra_config.copy() + else: + kwargs = dict() + self._operand_executors = operand_executors = kwargs.pop( + "operand_executors", dict() + ) + for op, executor in operand_executors.items(): + op.register_executor(executor) + for key in _check_args: + check_options[key] = kwargs.get(key, True) + self._check_options = check_options + self._check_keys = kwargs.get("check_keys") + self._storage_api = CheckStorageAPI(self._storage_api) + + def _execute_operand(self, ctx: Dict[str, Any], op: OperandType): + super()._execute_operand(ctx, op) + if self._check_options.get("check_all", True): + for out in op.outputs: + if out not in self._chunk_graph.result_chunks: + continue + if self._check_keys and out.key not in self._check_keys: + continue + # The first char of key is a letter. + assert out.key[0] in {"c", "d", "e", "f"}, out.key + if out.key not in ctx and any( + k[0] == out.key for k in ctx if isinstance(k, tuple) + ): + # both shuffle mapper and reducer + continue + self.assert_object_consistent(out, ctx[out.key]) + + async def done(self): + await super().done() + for op in self._operand_executors: + try: + op.unregister_executor() + except KeyError: + pass diff --git a/python/xorbits/_mars/services/subtask/worker/tests/test_subtask.py b/python/xorbits/_mars/services/subtask/worker/tests/test_subtask.py new file mode 100644 index 000000000..2f3addfe1 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/tests/test_subtask.py @@ -0,0 +1,311 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys +import time + +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from ..... import oscar as mo +from ..... import remote as mr +from ..... import tensor as mt +from .....core import ChunkGraph, ExecutionError +from .....core.context import get_context +from .....core.graph import ChunkGraphBuilder, TileableGraph, TileableGraphBuilder +from .....core.operand import OperandStage +from .....resource import Resource +from .....utils import Timer +from ....cluster import MockClusterAPI +from ....lifecycle import MockLifecycleAPI +from ....meta import MockMetaAPI, MockWorkerMetaAPI +from ....mutable import MockMutableAPI +from ....scheduling import MockSchedulingAPI +from ....session import MockSessionAPI +from ....storage import MockStorageAPI +from ....task import MapReduceInfo, new_task_id +from ....task.supervisor.manager import TaskConfigurationActor, TaskManagerActor +from ... import Subtask, SubtaskResult, SubtaskStatus +from ...worker.manager import SubtaskRunnerManagerActor +from ...worker.runner import SubtaskRunnerActor, SubtaskRunnerRef + + +class FakeTaskManager(TaskManagerActor): + def set_subtask_result(self, subtask_result: SubtaskResult): + return + + def get_map_reduce_info(self, task_id: str, map_reduce_id: int) -> MapReduceInfo: + return MapReduceInfo( + map_reduce_id=0, + reducer_indexes=[(0, 0)], + reducer_bands=[(self.address, "numa-0")], + ) + + +@pytest.fixture +async def actor_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=3, + labels=["main"] + ["numa-0"] * 2 + ["io"], + subprocess_start_method=start_method, + ) + + async with pool: + session_id = "test_session" + # create mock APIs + await MockClusterAPI.create( + pool.external_address, band_to_resource={"numa-0": Resource(num_cpus=2)} + ) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + meta_api = await MockMetaAPI.create(session_id, pool.external_address) + await MockWorkerMetaAPI.create(session_id, pool.external_address) + await MockLifecycleAPI.create(session_id, pool.external_address) + storage_api = await MockStorageAPI.create(session_id, pool.external_address) + await MockSchedulingAPI.create(session_id, pool.external_address) + await MockMutableAPI.create(session_id, pool.external_address) + + # create configuration + await mo.create_actor( + TaskConfigurationActor, + dict(), + dict(), + uid=TaskConfigurationActor.default_uid(), + address=pool.external_address, + ) + await mo.create_actor( + FakeTaskManager, + session_id, + uid=FakeTaskManager.gen_uid(session_id), + address=pool.external_address, + ) + manager = await mo.create_actor( + SubtaskRunnerManagerActor, + pool.external_address, + None, + uid=SubtaskRunnerManagerActor.default_uid(), + address=pool.external_address, + ) + try: + yield pool, session_id, meta_api, storage_api, manager + finally: + await MockStorageAPI.cleanup(pool.external_address) + await 
MockClusterAPI.cleanup(pool.external_address) + await MockMutableAPI.cleanup(session_id, pool.external_address) + + +def _gen_subtask(t, session_id): + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + + chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + subtask = Subtask(new_task_id(), session_id, new_task_id(), chunk_graph) + + return subtask + + +@pytest.mark.asyncio +async def test_subtask_success(actor_pool): + pool, session_id, meta_api, storage_api, manager = actor_pool + + a = mt.ones((10, 10), chunk_size=10) + b = a + 1 + + subtask = _gen_subtask(b, session_id) + subtask_runner: SubtaskRunnerRef = await mo.actor_ref( + SubtaskRunnerActor.gen_uid("numa-0", 0), address=pool.external_address + ) + await subtask_runner.run_subtask(subtask) + result = await subtask_runner.get_subtask_result() + assert result.status == SubtaskStatus.succeeded + + # check storage + expected = np.ones((10, 10)) + 1 + result_key = subtask.chunk_graph.results[0].key + result = await storage_api.get(result_key) + np.testing.assert_array_equal(expected, result) + + # check meta + chunk_meta = await meta_api.get_chunk_meta(result_key) + assert chunk_meta is not None + assert chunk_meta["bands"][0] == (pool.external_address, "numa-0") + assert await subtask_runner.is_runner_free() is True + + +@pytest.mark.asyncio +async def test_shuffle_subtask(actor_pool): + pool, session_id, meta_api, storage_api, manager = actor_pool + + pdf = pd.DataFrame({"f1": ["a", "b", "a"], "f2": [1, 2, 3]}) + df = md.DataFrame(pdf) + result = df.groupby("f1").sum(method="shuffle") + + graph = TileableGraph([result.data]) + next(TileableGraphBuilder(graph).build()) + chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + result_chunks = [] + new_chunk_graph = ChunkGraph(result_chunks) + chunk_graph_iter = chunk_graph.topological_iter() + curr = None + for _ in range(3): + prev = curr + curr = next(chunk_graph_iter) + new_chunk_graph.add_node(curr) + if prev is not None: + new_chunk_graph.add_edge(prev, curr) + assert curr.op.stage == OperandStage.map + curr.op.extra_params = {"analyzer_map_reduce_id": 0} + result_chunks.append(curr) + subtask = Subtask(new_task_id(), session_id, new_task_id(), new_chunk_graph) + subtask_runner: SubtaskRunnerRef = await mo.actor_ref( + SubtaskRunnerActor.gen_uid("numa-0", 0), address=pool.external_address + ) + await subtask_runner.run_subtask(subtask) + result = await subtask_runner.get_subtask_result() + assert result.status == SubtaskStatus.succeeded + + +@pytest.mark.asyncio +async def test_subtask_failure(actor_pool): + pool, session_id, meta_api, storage_api, manager = actor_pool + + # test execution error + with mt.errstate(divide="raise"): + a = mt.ones((10, 10), chunk_size=10) + c = a / 0 + + subtask = _gen_subtask(c, session_id) + subtask_runner: SubtaskRunnerRef = await mo.actor_ref( + SubtaskRunnerActor.gen_uid("numa-0", 0), address=pool.external_address + ) + with pytest.raises(ExecutionError) as ex_info: + await subtask_runner.run_subtask(subtask) + assert isinstance(ex_info.value.nested_error, FloatingPointError) + result = await subtask_runner.get_subtask_result() + assert result.status == SubtaskStatus.errored + assert isinstance(result.error, FloatingPointError) + assert await subtask_runner.is_runner_free() is True + + +@pytest.mark.asyncio +async def test_cancel_subtask(actor_pool): + pool, session_id, meta_api, storage_api, manager = actor_pool + subtask_runner: SubtaskRunnerRef = await mo.actor_ref( + 
SubtaskRunnerActor.gen_uid("numa-0", 0), address=pool.external_address + ) + + def sleep(timeout: int): + time.sleep(timeout) + return timeout + + b = mr.spawn(sleep, 100) + + subtask = _gen_subtask(b, session_id) + asyncio.create_task(subtask_runner.run_subtask(subtask)) + await asyncio.sleep(0.2) + with Timer() as timer: + # normal cancel by cancel asyncio Task + aio_task = asyncio.create_task( + asyncio.wait_for(asyncio.shield(subtask_runner.cancel_subtask()), timeout=1) + ) + assert await subtask_runner.is_runner_free() is False + with pytest.raises(asyncio.TimeoutError): + await aio_task + # need 1 sec to reach timeout, then killing actor and wait for auto recovering + # the time would not be over 5 sec + assert timer.duration < 5 + + async def wait_slot_restore(): + while True: + try: + assert await subtask_runner.is_runner_free() is True + except (mo.ServerClosed, ConnectionRefusedError, mo.ActorNotExist): + await asyncio.sleep(0.5) + else: + break + + await mo.kill_actor(subtask_runner) + await wait_slot_restore() + + a = mr.spawn(sleep, 2) + + subtask2 = _gen_subtask(a, session_id) + asyncio.create_task(subtask_runner.run_subtask(subtask2)) + await asyncio.sleep(0.2) + with Timer() as timer: + # normal cancel by cancel asyncio Task + await asyncio.wait_for(subtask_runner.cancel_subtask(), timeout=6) + # do not need to wait 10 sec + assert timer.duration < 10 + assert await subtask_runner.is_runner_free() is True + + +@pytest.mark.asyncio +async def test_subtask_op_progress(actor_pool): + pool, session_id, meta_api, storage_api, manager = actor_pool + subtask_runner: SubtaskRunnerRef = await mo.actor_ref( + SubtaskRunnerActor.gen_uid("numa-0", 0), address=pool.external_address + ) + + def progress_sleep(interval: float, count: int): + for idx in range(count): + time.sleep(interval) + get_context().set_progress((1 + idx) * 1.0 / count) + + b = mr.spawn(progress_sleep, args=(0.75, 2)) + + subtask = _gen_subtask(b, session_id) + aio_task = asyncio.create_task(subtask_runner.run_subtask(subtask)) + try: + await asyncio.sleep(0.5) + result = await subtask_runner.get_subtask_result() + assert result.progress == 0.0 + + await asyncio.sleep(0.75) + result = await subtask_runner.get_subtask_result() + assert result.progress == 0.5 + finally: + await aio_task + + result = await subtask_runner.get_subtask_result() + assert result.progress == 1.0 + + +def test_update_subtask_result(): + subtask_result = SubtaskResult( + subtask_id="test_subtask_abc", + status=SubtaskStatus.pending, + progress=0.0, + bands=[("127.0.0.1", "numa-0")], + ) + new_result = SubtaskResult( + subtask_id="test_subtask_abc", + status=SubtaskStatus.succeeded, + progress=1.0, + bands=[("127.0.0.1", "numa-0")], + execution_start_time=1646125099.622051, + execution_end_time=1646125104.448726, + ) + subtask_result.update(new_result) + assert subtask_result.execution_start_time == new_result.execution_start_time + assert subtask_result.execution_end_time == new_result.execution_end_time diff --git a/python/xorbits/_mars/services/task/__init__.py b/python/xorbits/_mars/services/task/__init__.py new file mode 100644 index 000000000..13030132e --- /dev/null +++ b/python/xorbits/_mars/services/task/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
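# The cancellation path in SubtaskProcessorActor.cancel() above does not await
# the cancelled work inline: it cancels the running asyncio task and returns a
# separate "waiter" coroutine, so the actor stays responsive while the subtask
# winds down. A standalone sketch of that pattern (plain asyncio, no Mars
# objects involved; names are illustrative):
import asyncio


async def long_running():
    await asyncio.sleep(100)


async def main():
    task = asyncio.create_task(long_running())
    await asyncio.sleep(0)  # let the task start before cancelling it

    def cancel():
        task.cancel()

        async def waiter():
            try:
                await task
            except asyncio.CancelledError:
                pass

        # hand back the coroutine instead of awaiting it here
        return waiter()

    pending = cancel()  # returns immediately
    await pending       # the caller decides when (or whether) to wait


asyncio.run(main())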
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import AbstractTaskAPI, TaskAPI, WebTaskAPI +from .config import task_options +from .core import MapReduceInfo, Task, TaskResult, TaskStatus, new_task_id +from .errors import TaskNotExist diff --git a/python/xorbits/_mars/services/task/analyzer/__init__.py b/python/xorbits/_mars/services/task/analyzer/__init__.py new file mode 100644 index 000000000..a2e0c4cfd --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .analyzer import GraphAnalyzer diff --git a/python/xorbits/_mars/services/task/analyzer/analyzer.py b/python/xorbits/_mars/services/task/analyzer/analyzer.py new file mode 100644 index 000000000..10d82f5fb --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/analyzer.py @@ -0,0 +1,546 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +from collections import defaultdict, deque +from typing import Dict, List, Tuple, Type, Union + +from ....config import Config +from ....core import ChunkGraph, ChunkType, enter_mode +from ....core.operand import ( + Fetch, + LogicKeyGenerator, + MapReduceOperand, + OperandStage, + ShuffleFetchType, + ShuffleProxy, + VirtualOperand, +) +from ....lib.ordered_set import OrderedSet +from ....resource import Resource +from ....typing import BandType, OperandType +from ....utils import build_fetch, build_fetch_shuffle, tokenize +from ...subtask import Subtask, SubtaskGraph +from ..core import MapReduceInfo, Task, new_task_id +from .assigner import AbstractGraphAssigner, GraphAssigner +from .fusion import Coloring + +logger = logging.getLogger(__name__) + + +def need_reassign_worker(op: OperandType) -> bool: + # NOTE(qinxuye): special process for reducer + # We'd better set reducer op's stage to reduce, however, + # in many case, we copy a reducer op from tileable op, + # then set stage as reducer one, + # it would be quite nasty to take over the __setattr__ and + # make reassign_worker True etc. 
+ return op.reassign_worker or ( + isinstance(op, MapReduceOperand) and op.stage == OperandStage.reduce + ) + + +class GraphAnalyzer: + """ + An subtask graph builder which build subtask graph for chunk graph based on passed band_resource. + + If push shuffle is used, this builder will validate predecessors orders consistency of ShuffleProxy between + chunk graph and generated subtask graph. + """ + + _map_reduce_id = itertools.count() + + def __init__( + self, + chunk_graph: ChunkGraph, + band_resource: Dict[BandType, Resource], + task: Task, + config: Config, + chunk_to_subtasks: Dict[ChunkType, Subtask], + graph_assigner_cls: Type[AbstractGraphAssigner] = None, + stage_id: str = None, + map_reduce_id_to_infos: Dict[int, MapReduceInfo] = None, + shuffle_fetch_type: ShuffleFetchType = ShuffleFetchType.FETCH_BY_KEY, + ): + self._chunk_graph = chunk_graph + self._final_result_chunks_set = set(self._chunk_graph.result_chunks) + self._band_resource = band_resource + self._task = task + self._stage_id = stage_id + self._config = config + self._shuffle_fetch_type = shuffle_fetch_type + self._has_shuffle = any( + isinstance(c.op, MapReduceOperand) for c in self._chunk_graph + ) + self._fuse_enabled = task.fuse_enabled + self._extra_config = task.extra_config + self._chunk_to_subtasks = chunk_to_subtasks + self._map_reduce_id_to_infos = map_reduce_id_to_infos + if graph_assigner_cls is None: + graph_assigner_cls = GraphAssigner + self._graph_assigner_cls = graph_assigner_cls + self._chunk_to_copied = dict() + self._logic_key_generator = LogicKeyGenerator() + + @classmethod + def next_map_reduce_id(cls) -> int: + return next(cls._map_reduce_id) + + @classmethod + def _iter_start_ops(cls, chunk_graph: ChunkGraph): + visited = set() + op_keys = set() + start_chunks = deque(chunk_graph.iter_indep()) + stack = deque([start_chunks.popleft()]) + + while stack: + chunk = stack.popleft() + if chunk not in visited: + inp_chunks = chunk_graph.predecessors(chunk) + if not inp_chunks or all( + inp_chunk in visited for inp_chunk in inp_chunks + ): + if len(inp_chunks) == 0: + op_key = chunk.op.key + if op_key not in op_keys: + op_keys.add(op_key) + yield chunk.op + visited.add(chunk) + stack.extend(c for c in chunk_graph[chunk] if c not in visited) + else: + stack.appendleft(chunk) + stack.extendleft( + reversed( + [ + c + for c in chunk_graph.predecessors(chunk) + if c not in visited + ] + ) + ) + if not stack and start_chunks: + stack.appendleft(start_chunks.popleft()) + + def _gen_input_chunks( + self, + inp_chunks: List[ChunkType], + chunk_to_fetch_chunk: Dict[ChunkType, ChunkType], + ) -> List[ChunkType]: + # gen fetch chunks for input chunks + inp_fetch_chunks = [] + for inp_chunk in inp_chunks: + if inp_chunk in chunk_to_fetch_chunk: + inp_fetch_chunks.append(chunk_to_fetch_chunk[inp_chunk]) + elif isinstance(inp_chunk.op, Fetch): + chunk_to_fetch_chunk[inp_chunk] = inp_chunk + inp_fetch_chunks.append(inp_chunk) + elif isinstance(inp_chunk.op, ShuffleProxy): + n_reducers = inp_chunk.op.n_reducers + fetch_chunk = build_fetch_shuffle( + inp_chunk, + n_reducers=n_reducers, + shuffle_fetch_type=self._shuffle_fetch_type, + ).data + chunk_to_fetch_chunk[inp_chunk] = fetch_chunk + inp_fetch_chunks.append(fetch_chunk) + else: + fetch_chunk = build_fetch(inp_chunk).data + chunk_to_fetch_chunk[inp_chunk] = fetch_chunk + inp_fetch_chunks.append(fetch_chunk) + + return inp_fetch_chunks + + @staticmethod + def _to_band(band_or_worker: Union[BandType, str]) -> BandType: + if isinstance(band_or_worker, tuple) and 
len(band_or_worker) == 2: + # band already + return band_or_worker + else: + return band_or_worker, "numa-0" + + @staticmethod + def _get_expect_band(op: OperandType): + if op.expect_band is not None: + return op.expect_band + elif op.expect_worker is not None: + return GraphAnalyzer._to_band(op.expect_worker) + + def _gen_subtask_info( + self, + chunks: List[ChunkType], + chunk_to_subtask: Dict[ChunkType, Subtask], + chunk_to_bands: Dict[ChunkType, BandType], + chunk_to_fetch_chunk: Dict[ChunkType, ChunkType], + ) -> Tuple[Subtask, List[Subtask], bool]: + # gen subtask and its input subtasks + chunks_set = set(chunks) + result_chunks = [] + result_chunks_set = set() + chunk_graph = ChunkGraph(result_chunks) + out_of_scope_chunks = [] + chunk_to_copied = self._chunk_to_copied + update_meta_chunks = [] + # subtask properties + band = None + is_virtual = None + retryable = True + chunk_priority = None + expect_band = None + bands_specified = None + processed = set() + for chunk in chunks: + if chunk in processed: + continue + if expect_band is None: + expect_band = self._get_expect_band(chunk.op) + bands_specified = expect_band is not None + else: # pragma: no cover + curr_expect_band = self._get_expect_band(chunk.op) + assert curr_expect_band is None or expect_band == curr_expect_band, ( + f"expect_band {curr_expect_band} conflicts with chunks that have same color: " + f"{expect_band}" + ) + # process band + chunk_band = chunk_to_bands.get(chunk) + if chunk_band is not None: + assert ( + band is None or band == chunk_band + ), "band conflicts with chunks that have same color" + band = chunk_band + # process is_virtual + if isinstance(chunk.op, VirtualOperand): + assert is_virtual is None, "only 1 virtual operand can exist" + is_virtual = True + else: + is_virtual = False + # process retryable + if not chunk.op.retryable: + retryable = False + # process priority + if chunk.op.priority is not None: + assert ( + chunk_priority is None or chunk_priority == chunk.op.priority + ), "priority conflicts with chunks that have same color" + chunk_priority = chunk.op.priority + # process input chunks + inp_chunks = [] + build_fetch_index_to_chunks = dict() + for i, inp_chunk in enumerate(chunk.inputs): + if inp_chunk in chunks_set: + inp_chunks.append(chunk_to_copied[inp_chunk]) + else: + build_fetch_index_to_chunks[i] = inp_chunk + inp_chunks.append(None) + if not isinstance(inp_chunk.op, Fetch): + out_of_scope_chunks.append(inp_chunk) + fetch_chunks = self._gen_input_chunks( + list(build_fetch_index_to_chunks.values()), chunk_to_fetch_chunk + ) + for i, fetch_chunk in zip(build_fetch_index_to_chunks, fetch_chunks): + inp_chunks[i] = fetch_chunk + copied_op = chunk.op.copy() + copied_op._key = chunk.op.key + out_chunks = [ + c.data + for c in copied_op.new_chunks( + inp_chunks, kws=[c.params.copy() for c in chunk.op.outputs] + ) + ] + for src_chunk, out_chunk in zip(chunk.op.outputs, out_chunks): + processed.add(src_chunk) + out_chunk._key = src_chunk.key + chunk_graph.add_node(out_chunk) + # cannot be copied twice + assert src_chunk not in chunk_to_copied + chunk_to_copied[src_chunk] = out_chunk + if src_chunk in self._final_result_chunks_set: + if out_chunk not in result_chunks_set: + # add to result chunks + result_chunks.append(out_chunk) + # chunk is in the result chunks of full chunk graph + # meta need to be updated + update_meta_chunks.append(out_chunk) + result_chunks_set.add(out_chunk) + if not is_virtual: + # skip adding fetch chunk to chunk graph when op is virtual operand + for c in 
inp_chunks: + if c not in chunk_graph: + chunk_graph.add_node(c) + chunk_graph.add_edge(c, out_chunk) + stage_n_outputs = len(result_chunks) + # add chunks with no successors into result chunks + result_chunks.extend( + c + for c in chunk_graph.iter_indep(reverse=True) + if c not in result_chunks_set + ) + expect_bands = ( + [expect_band] if bands_specified else ([band] if band is not None else None) + ) + # calculate priority + if out_of_scope_chunks: + inp_subtasks = [] + for out_of_scope_chunk in out_of_scope_chunks: + copied_out_of_scope_chunk = chunk_to_copied[out_of_scope_chunk] + inp_subtask = chunk_to_subtask[out_of_scope_chunk] + if ( + copied_out_of_scope_chunk + not in inp_subtask.chunk_graph.result_chunks + ): + # make sure the chunk that out of scope + # is in the input subtask's results, + # or the meta may be lost + inp_subtask.chunk_graph.result_chunks.append( + copied_out_of_scope_chunk + ) + inp_subtasks.append(inp_subtask) + depth = max(st.priority[0] for st in inp_subtasks) + 1 + else: + inp_subtasks = [] + depth = 0 + priority = (depth, chunk_priority or 0) + + subtask = Subtask( + subtask_id=new_task_id(), + stage_id=self._stage_id, + logic_key=self._gen_logic_key(chunks), + session_id=self._task.session_id, + task_id=self._task.task_id, + chunk_graph=chunk_graph, + expect_bands=expect_bands, + bands_specified=bands_specified, + virtual=is_virtual, + priority=priority, + retryable=retryable, + update_meta_chunks=update_meta_chunks, + extra_config=self._extra_config, + stage_n_outputs=stage_n_outputs, + ) + + is_shuffle_proxy = False + if self._has_shuffle: + proxy_chunks = [c for c in result_chunks if isinstance(c.op, ShuffleProxy)] + if proxy_chunks: + assert len(proxy_chunks) <= 1, proxy_chunks + is_shuffle_proxy = True + return subtask, inp_subtasks, is_shuffle_proxy + + def _gen_logic_key(self, chunks: List[ChunkType]): + return tokenize( + *[self._logic_key_generator.get_logic_key(chunk.op) for chunk in chunks] + ) + + def _gen_map_reduce_info( + self, chunk: ChunkType, assign_results: Dict[ChunkType, BandType] + ): + reducer_ops = OrderedSet( + [ + c.op + for c in self._chunk_graph.successors(chunk) + if c.op.stage == OperandStage.reduce + ] + ) + map_chunks = [ + c + for c in self._chunk_graph.predecessors(chunk) + if (c.op.stage == OperandStage.map) or c.is_mapper + ] + map_reduce_id = self.next_map_reduce_id() + for map_chunk in map_chunks: + # record analyzer map reduce id for mapper op + # copied chunk exists because map chunk must have + # been processed before shuffle proxy + copied_map_chunk = self._chunk_to_copied[map_chunk] + if not hasattr(copied_map_chunk, "extra_params"): # pragma: no cover + copied_map_chunk.extra_params = dict() + copied_map_chunk.extra_params["analyzer_map_reduce_id"] = map_reduce_id + reducer_bands = [assign_results[r.outputs[0]] for r in reducer_ops] + map_reduce_info = MapReduceInfo( + map_reduce_id=map_reduce_id, + reducer_indexes=[reducer_op.reducer_index for reducer_op in reducer_ops], + reducer_bands=reducer_bands, + ) + self._map_reduce_id_to_infos[map_reduce_id] = map_reduce_info + + @enter_mode(build=True) + def gen_subtask_graph( + self, op_to_bands: Dict[str, BandType] = None + ) -> SubtaskGraph: + """ + Analyze chunk graph and generate subtask graph. + + Returns + ------- + subtask_graph: SubtaskGraph + Subtask graph. 
+ """ + # reassign worker when specified reassign_worker = True + # or it's a reducer operands + reassign_worker_ops = [ + chunk.op for chunk in self._chunk_graph if need_reassign_worker(chunk.op) + ] + start_ops = ( + list(self._iter_start_ops(self._chunk_graph)) + if len(self._chunk_graph) > 0 + else [] + ) + + # assign start chunks + to_assign_ops = start_ops + reassign_worker_ops + assigner = self._graph_assigner_cls( + self._chunk_graph, to_assign_ops, self._band_resource + ) + # assign expect bands + cur_assigns = { + op.key: self._get_expect_band(op) + for op in start_ops + if op.expect_band is not None or op.expect_worker is not None + } + if op_to_bands: + cur_assigns.update(op_to_bands) + logger.debug( + "Start to assign %s start chunks for task %s", + len(start_ops), + self._task.task_id, + ) + chunk_to_bands = assigner.assign(cur_assigns=cur_assigns) + logger.debug( + "Assigned %s start chunks for task %s", len(start_ops), self._task.task_id + ) + # assign expect workers for those specified with `expect_worker` or `expect_band` + # skip `start_ops`, which have been assigned before + start_ops_set = set(start_ops) + for chunk in self._chunk_graph: + if chunk not in start_ops_set: + if chunk.op.expect_band is not None: + chunk_to_bands[chunk] = chunk.op.expect_band + elif chunk.op.expect_worker is not None: + chunk_to_bands[chunk] = self._to_band(chunk.op.expect_worker) + + # color nodes + if self._fuse_enabled: + logger.debug("Start to fuse chunks for task %s", self._task.task_id) + # sort start chunks in coloring as start_ops + op_key_to_chunks = defaultdict(list) + for chunk in self._chunk_graph: + op_key_to_chunks[chunk.op.key].append(chunk) + init_chunk_to_bands = dict() + for start_op in start_ops: + for start_chunk in op_key_to_chunks[start_op.key]: + init_chunk_to_bands[start_chunk] = chunk_to_bands[start_chunk] + if ( + self._has_shuffle + and self._shuffle_fetch_type == ShuffleFetchType.FETCH_BY_INDEX + ): + # ensure no shuffle mapper chunks fused into same subtask. + initial_same_color_num = 1 + else: + initial_same_color_num = getattr( + self._config, "initial_same_color_num", None + ) + coloring = Coloring( + self._chunk_graph, + list(self._band_resource), + init_chunk_to_bands, + initial_same_color_num=initial_same_color_num, + as_broadcaster_successor_num=getattr( + self._config, "as_broadcaster_successor_num", None + ), + ) + chunk_to_colors = coloring.color() + else: + # if not fuse enabled, color all chunks with different colors + op_to_colors = dict() + chunk_to_colors = dict() + color_gen = itertools.count() + for c in self._chunk_graph.topological_iter(): + if c.op not in op_to_colors: + chunk_to_colors[c] = op_to_colors[c.op] = next(color_gen) + else: + chunk_to_colors[c] = op_to_colors[c.op] + color_to_chunks = defaultdict(list) + for chunk, color in chunk_to_colors.items(): + if not isinstance(chunk.op, Fetch): + color_to_chunks[color].append(chunk) + + # gen subtask graph + subtask_graph = SubtaskGraph() + chunk_to_fetch_chunk = dict() + chunk_to_subtask = self._chunk_to_subtasks + # states + visited = set() + logic_key_to_subtasks = defaultdict(list) + if self._shuffle_fetch_type == ShuffleFetchType.FETCH_BY_INDEX: + for chunk in self._chunk_graph.topological_iter(): + if not isinstance(chunk.op, ShuffleProxy): + continue + # Can't use `OperandStage.map` to find mappers directly, since `stage` of some operand + # such as `DataFrameIndexAlign` are `OperandStage.map` but not a shuffle mapper sometimes. 
+ mapper_chunks = self._chunk_graph.predecessors(chunk) + for mapper_chunk in mapper_chunks: + chunk_color = chunk_to_colors[mapper_chunk] + same_color_chunks = color_to_chunks[chunk_color] + mappers = [ + c + for c in same_color_chunks + if c.op.stage == OperandStage.map + and any( + isinstance(succ.op, ShuffleProxy) + for succ in self._chunk_graph.iter_successors(c) + ) + ] + if len(mappers) > 1: + # ensure every subtask contains only at most one mapper + for mapper in mappers: + same_color_chunks.remove(mapper) + mapper_color = coloring.next_color() + chunk_to_colors[mapper] = mapper_color + color_to_chunks[mapper_color] = [mapper] + for chunk in self._chunk_graph.topological_iter(): + if chunk in visited or isinstance(chunk.op, Fetch): + # skip fetch chunk + continue + + color = chunk_to_colors[chunk] + same_color_chunks = color_to_chunks[color] + if all(isinstance(c.op, Fetch) for c in same_color_chunks): + # all fetch ops, no need to gen subtask + continue + subtask, inp_subtasks, is_shuffle_proxy = self._gen_subtask_info( + same_color_chunks, + chunk_to_subtask, + chunk_to_bands, + chunk_to_fetch_chunk, + ) + subtask_graph.add_node(subtask) + if is_shuffle_proxy: + subtask_graph.add_shuffle_proxy_subtask(subtask) + logic_key_to_subtasks[subtask.logic_key].append(subtask) + for inp_subtask in inp_subtasks: + subtask_graph.add_edge(inp_subtask, subtask) + + for c in same_color_chunks: + chunk_to_subtask[c] = subtask + if self._map_reduce_id_to_infos is not None and isinstance( + chunk.op, ShuffleProxy + ): + self._gen_map_reduce_info(chunk, chunk_to_bands) + visited.update(same_color_chunks) + + for subtasks in logic_key_to_subtasks.values(): + for logic_index, subtask in enumerate(subtasks): + subtask.logic_index = logic_index + subtask.logic_parallelism = len(subtasks) + return subtask_graph diff --git a/python/xorbits/_mars/services/task/analyzer/assigner.py b/python/xorbits/_mars/services/task/analyzer/assigner.py new file mode 100644 index 000000000..870dc4df1 --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/assigner.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from collections import defaultdict +from operator import itemgetter +from typing import Dict, List, Union + +import numpy as np + +from ....core import ChunkData, ChunkGraph +from ....core.operand import Fetch, Operand +from ....lib.ordered_set import OrderedSet +from ....resource import Resource +from ....typing import BandType +from ....utils import implements + + +class AbstractGraphAssigner(ABC): + """ + Assign start nodes. + """ + + def __init__( + self, + chunk_graph: ChunkGraph, + start_ops: List[Operand], + band_resource: Dict[BandType, Resource], + ): + self._chunk_graph = chunk_graph + self._start_ops = start_ops + self._band_resource = band_resource + + @abstractmethod + def assign(self, cur_assigns: Dict[str, str] = None) -> Dict[ChunkData, BandType]: + """ + Assign start nodes to bands. 
+ + cur_assigns : dict + op already assigned. + + Returns + ------- + node_to_bands : dict + From node to band. + """ + + def _is_gpu_band(self) -> bool: + gpu_ops = ( + [op for op in self._start_ops if not isinstance(op, Fetch)] + if self._start_ops + else [] + ) + if gpu_ops and all(op.gpu for op in gpu_ops): + return True + return False + + def get_device_band_slots(self) -> Dict[BandType, int]: + if self._is_gpu_band(): # pragma: no cover + band_prefix = "gpu" + else: + band_prefix = "numa" + return { + band: resource.num_cpus or resource.num_gpus + for band, resource in self._band_resource.items() + if band[1].startswith(band_prefix) + } + + +class GraphAssigner(AbstractGraphAssigner): + def __init__( + self, + chunk_graph: ChunkGraph, + start_ops: List[Operand], + band_resource: Dict[BandType, Resource], + ): + super().__init__(chunk_graph, start_ops, band_resource) + self._op_keys: OrderedSet = OrderedSet([start_op.key for start_op in start_ops]) + + def _calc_band_assign_limits( + self, initial_count: int, occupied: Dict[BandType, int] + ) -> Dict[BandType, int]: + """ + Calculate limitation of number of initial operands for bands. + + Parameters + ---------- + initial_count : int + Number of nodes that is ready for running. + occupied : dict + Band to those initials that already assigned. + + Returns + ------- + slot_assign_limits: dict + Slot to limitation of number of initial operands. + """ + actual_count: int = initial_count - sum(occupied.values()) + band_slots = sorted( + self.get_device_band_slots().items(), key=itemgetter(1), reverse=True + ) + bands: List[BandType] = [it[0] for it in band_slots] + slots = np.asarray([it[1] for it in band_slots], dtype=np.float32) + + # remove assigned nodes from limitations + counts = initial_count * slots / slots.sum() + for i, band in enumerate(bands): + counts[i] = max(0, counts[i] - occupied.get(band, 0)) + + # all assigned, nothing to do + if counts.sum() == 0: + return {band: 0 for band in bands} + + # assign remaining nodes + counts = (actual_count * counts / counts.sum()).astype(np.int32) + pos = 0 + rest = actual_count - counts.sum() + while rest > 0: + counts[pos] += 1 + rest -= 1 + pos = (pos + 1) % len(counts) + return dict(zip(bands, counts)) + + @classmethod + def _assign_by_bfs( + cls, + undirected_chunk_graph: ChunkGraph, + start: ChunkData, + band: BandType, + initial_sizes: Dict[BandType, int], + spread_limits: Dict[BandType, float], + key_to_assign: OrderedSet, + assigned_record: Dict[str, Union[str, BandType]], + ): + """ + Assign initial nodes using breath-first search given initial sizes and + limitations of spread range. + """ + if initial_sizes[band] <= 0: + return + + assigned = 0 + spread_range = 0 + for chunk in undirected_chunk_graph.bfs(start=start, visit_predicate="all"): + op_key = chunk.op.key + if op_key in assigned_record: + continue + spread_range += 1 + # `op_key` may not be in `key_to_assign`, + # but we need to record it to avoid iterate the node repeatedly. 
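# A simplified, self-contained rendering of the quota-driven spreading performed
# by _assign_by_bfs together with GraphAssigner.assign: the band with the largest
# remaining quota takes the next candidate chunk and spreads over nearby nodes of
# the undirected graph, so related chunks land on the same band. The adjacency
# map, candidate names and quotas below are made up for illustration.
from collections import deque


def spread(adjacency, candidates, quotas):
    assigned = {}
    remaining = list(candidates)
    while remaining and max(quotas.values()) > 0:
        band = max(quotas, key=quotas.get)
        start = remaining.pop()
        queue, seen = deque([start]), set()
        while queue and quotas[band] > 0:
            node = queue.popleft()
            if node in seen or node in assigned:
                continue
            seen.add(node)
            if node in candidates:
                assigned[node] = band
                quotas[band] -= 1
            queue.extend(adjacency.get(node, ()))
    return assigned


adjacency = {"a": ["b"], "b": ["a", "c"], "c": ["b"], "x": ["y"], "y": ["x"]}
print(spread(adjacency, ["a", "c", "x"], {"band0": 2, "band1": 1}))
# -> {'x': 'band0', 'c': 'band0', 'a': 'band1'}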
+ assigned_record[op_key] = band + if op_key not in key_to_assign: + continue + assigned += 1 + if spread_range >= spread_limits[band] or assigned >= initial_sizes[band]: + break + initial_sizes[band] -= assigned + + def _build_undirected_chunk_graph( + self, chunk_to_assign: List[ChunkData] + ) -> ChunkGraph: + chunk_graph = self._chunk_graph.copy() + # remove edges for all chunk_to_assign which may contain chunks + # that need be reassigned + for chunk in chunk_to_assign: + if chunk_graph.count_predecessors(chunk) > 0: + for pred in list(chunk_graph.predecessors(chunk)): + chunk_graph.remove_edge(pred, chunk) + return chunk_graph.build_undirected() + + @implements(AbstractGraphAssigner.assign) + def assign( + self, cur_assigns: Dict[str, BandType] = None + ) -> Dict[ChunkData, BandType]: + graph = self._chunk_graph + assign_result = dict() + cur_assigns = cur_assigns or dict() + # assigned by expect worker or band + initial_assigned_op_keys = set(cur_assigns) + + op_key_to_chunks = defaultdict(list) + for chunk in graph: + op_key_to_chunks[chunk.op.key].append(chunk) + + op_keys = OrderedSet(self._op_keys) + chunk_to_assign = [ + op_key_to_chunks[op_key][0] + for op_key in op_keys + if op_key not in cur_assigns + ] + assigned_counts = defaultdict(lambda: 0) + for band in cur_assigns.values(): + assigned_counts[band] += 1 + + # build undirected graph + undirected_chunk_graph = self._build_undirected_chunk_graph(chunk_to_assign) + + # calculate the number of chunks to be assigned to each band + # given number of bands and existing assignments + band_quotas = self._calc_band_assign_limits( + len(chunk_to_assign) + sum(assigned_counts.values()), assigned_counts + ) + + # calculate expected descendant count (spread range) of + # every band and subtract assigned number from it + average_spread_range = len(graph) * 1.0 / len(self.get_device_band_slots()) + spread_ranges = defaultdict(lambda: average_spread_range) + # assign from other chunks to be assigned + # TODO: sort by what? + sorted_candidates = chunk_to_assign.copy() + while max(band_quotas.values()): + band = max(band_quotas, key=lambda k: band_quotas[k]) + cur = sorted_candidates.pop() + while cur.op.key in cur_assigns: + cur = sorted_candidates.pop() + self._assign_by_bfs( + undirected_chunk_graph, + cur, + band, + band_quotas, + spread_ranges, + op_keys, + cur_assigns, + ) + + key_to_assign = {n.op.key for n in chunk_to_assign} | initial_assigned_op_keys + for op_key, band in cur_assigns.items(): + if op_key in key_to_assign: + for chunk in op_key_to_chunks[op_key]: + assign_result[chunk] = band + + return assign_result diff --git a/python/xorbits/_mars/services/task/analyzer/fusion.py b/python/xorbits/_mars/services/task/analyzer/fusion.py new file mode 100644 index 000000000..653a3d83a --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/fusion.py @@ -0,0 +1,194 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
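# A standalone rendering of the limit computation in _calc_band_assign_limits
# above: initial operands are split across bands in proportion to their slot
# counts, after subtracting assignments that already exist. Band addresses and
# the numbers below are made up for illustration.
import numpy as np


def calc_limits(initial_count, band_slots, occupied):
    bands = sorted(band_slots, key=band_slots.get, reverse=True)
    slots = np.asarray([band_slots[b] for b in bands], dtype=np.float32)
    counts = initial_count * slots / slots.sum()
    for i, band in enumerate(bands):
        counts[i] = max(0, counts[i] - occupied.get(band, 0))
    if counts.sum() == 0:
        return {band: 0 for band in bands}
    actual = initial_count - sum(occupied.values())
    counts = (actual * counts / counts.sum()).astype(np.int32)
    pos, rest = 0, actual - counts.sum()
    while rest > 0:
        counts[pos] += 1
        rest -= 1
        pos = (pos + 1) % len(counts)
    return dict(zip(bands, counts.tolist()))


# 10 initial chunks, two bands with 4 and 2 slots, one chunk already on w1:
# the remaining 9 are split roughly in the 2:1 slot ratio.
print(calc_limits(10, {("w1", "numa-0"): 4, ("w2", "numa-0"): 2}, {("w1", "numa-0"): 1}))
# -> {('w1', 'numa-0'): 6, ('w2', 'numa-0'): 3}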
+ +import itertools +from collections import defaultdict +from typing import Dict, List + +from ....config import options +from ....core import ChunkGraph +from ....core.operand import VirtualOperand +from ....typing import BandType, ChunkType, OperandType + + +class Coloring: + """ + Coloring a chunk graph according to an algorithm + described in https://github.com/mars-project/mars/issues/2435 + """ + + def __init__( + self, + chunk_graph: ChunkGraph, + all_bands: List[BandType], + chunk_to_bands: Dict[ChunkType, BandType], + initial_same_color_num: int = None, + as_broadcaster_successor_num: int = None, + ): + self.chunk_graph = chunk_graph + self.all_bands = all_bands + self.chunk_to_bands = chunk_to_bands + if initial_same_color_num is None: + has_gpu = any(c.op.gpu for c in chunk_graph) + if not has_gpu: + initial_same_color_num = max(options.combine_size // 2, 1) + else: + # if gpu exists, we try to fuse more node to reduce cost + initial_same_color_num = max(options.combine_size * 2, 1) + self.initial_same_color_num = initial_same_color_num + if as_broadcaster_successor_num is None: + as_broadcaster_successor_num = options.combine_size * 2 + self.successor_same_color_num = as_broadcaster_successor_num + + self._coloring_iter = itertools.count() + + def next_color(self) -> int: + return next(self._coloring_iter) + + @classmethod + def _can_color_same(cls, chunk: ChunkType, predecessors: List[ChunkType]) -> bool: + if ( + # VirtualOperand cannot be fused + any(isinstance(n.op, VirtualOperand) for n in [chunk] + predecessors) + # allocated on different bands + or len({n.op.gpu for n in [chunk] + predecessors}) > 1 + # expect worker changed + or len({n.op.expect_worker for n in [chunk] + predecessors}) > 1 + # scheduling hint tells that cannot be fused + or ( + chunk.op.scheduling_hint is not None + and not chunk.op.scheduling_hint.can_be_fused() + ) + ): + return False + return True + + def _color_init_nodes(self) -> Dict[OperandType, int]: + # for initial op with same band but different priority + # we color them w/ different colors, + # to prevent from wrong fusion. + # e.g. 
md.read_csv ensure incremental index by generating + # chunks with ascending priorities (smaller one has higher priority), + # chunk 0 has higher priority than chunk 1, + # so that when chunk 1 executing, it would know chunk 0's shape + # TODO: make it general instead handle priority as a special case + band_priority_to_colors = dict() + for chunk, band in self.chunk_to_bands.items(): + band_priority = (band, chunk.op.priority) + if band_priority not in band_priority_to_colors: + band_priority_to_colors[band_priority] = self.next_color() + + band_priority_to_color_list = defaultdict(list) + for (band, priority), color in band_priority_to_colors.items(): + band_priority_to_color_list[band, priority].append(color) + color_to_size = defaultdict(lambda: 0) + op_to_colors = dict() + for chunk, band in self.chunk_to_bands.items(): + priority = chunk.op.priority + color = band_priority_to_color_list[band, priority][-1] + size = color_to_size[color] + if size >= self.initial_same_color_num: + color = self.next_color() + band_priority_to_color_list[band, priority].append(color) + color_to_size[color] += 1 + op_to_colors[chunk.op] = color + return op_to_colors + + def color(self) -> Dict[ChunkType, int]: + chunk_to_colors = dict() + + # step 1: Coloring the initial nodes according to the bands that assigned by assigner + op_to_colors = self._color_init_nodes() + + # step2: Propagate color in the topological order, + # if the input nodes have same color, color it with the same color; + # otherwise, color with a new color. + broadcaster_chunk_set = set() + for chunk in self.chunk_graph.topological_iter(): + if self.chunk_graph.count_successors(chunk) > self.successor_same_color_num: + # is broadcaster + broadcaster_chunk_set.add(chunk) + + if chunk.op in op_to_colors: + # colored + chunk_to_colors[chunk] = op_to_colors[chunk.op] + continue + + predecessors = self.chunk_graph.predecessors(chunk) + pred_colors = {op_to_colors[pred.op] for pred in predecessors} + if len(predecessors) == 1 and predecessors[0] in broadcaster_chunk_set: + # TODO: handle situation that chunks which specify reassign_workers + # predecessor is broadcaster, just allocate a new color + color = self.next_color() + elif len(pred_colors) == 1: + if self._can_color_same(chunk, predecessors): + # predecessors have only 1 color, will color with same one + color = next(iter(pred_colors)) + else: + color = self.next_color() + else: + # has more than 1 color, color a new one + assert len(pred_colors) > 1 + color = self.next_color() + + op_to_colors[chunk.op] = chunk_to_colors[chunk] = color + + # step 3: Propagate with reversed topological order, + # check a node with its inputs, if all inputs have different color with itself, skip; + # otherwise, if some of inputs have the same color, but some others have different color, + # color the input nodes with same one with a new color, and propagate to its inputs and so on. 
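# A toy rendering of the forward propagation rule from step 2 of Coloring.color()
# above: a chunk keeps its predecessors' color only when they all share a single
# color (and fusion is allowed); otherwise it opens a fresh color. The little
# graph below is illustrative only.
import itertools

color_gen = itertools.count()
preds = {"c": ["a", "b"], "d": ["c"]}  # toy edges: a -> c, b -> c, c -> d
colors = {"a": next(color_gen), "b": next(color_gen)}  # initial chunks on two bands

for chunk in ("c", "d"):  # topological order
    pred_colors = {colors[p] for p in preds[chunk]}
    colors[chunk] = pred_colors.pop() if len(pred_colors) == 1 else next(color_gen)

print(colors)  # a and b differ, so c starts a new color; d then joins c's group
# -> {'a': 0, 'b': 1, 'c': 2, 'd': 2}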
+ for chunk in self.chunk_graph.topological_iter(reverse=True): + pred_colors = { + op_to_colors[pred.op] + for pred in self.chunk_graph.iter_successors(chunk) + } + chunk_color = chunk_to_colors[chunk] + if chunk_color in pred_colors and len(pred_colors) > 1: + # conflict + # color the successors with new colors + stack = [] + for succ in self.chunk_graph.iter_successors(chunk): + if chunk_to_colors[succ] == chunk_color: + new_color = op_to_colors[succ.op] = self.next_color() + for c in succ.op.outputs: + if c not in self.chunk_graph: # pragma: no cover + continue + chunk_to_colors[c] = new_color + stack.extend(self.chunk_graph.successors(c)) + # color the descendants with same color to the new one + # the descendants will not be visited more than 2 times + while len(stack) > 0: + node = stack.pop() + node_color = chunk_to_colors[node] + if node_color == chunk_color: + # same color, recolor to the new one + node_pred_colors = list( + { + op_to_colors[inp.op] + for inp in self.chunk_graph.iter_predecessors(node) + } + ) + node_input_same_color = len(node_pred_colors) == 1 + if node_input_same_color: + node_new_color = node_pred_colors[0] + else: + node_new_color = self.next_color() + op_to_colors[node.op] = node_new_color + for c in node.op.outputs: + if c not in self.chunk_graph: # pragma: no cover + continue + chunk_to_colors[c] = node_new_color + stack.extend(self.chunk_graph.successors(c)) + + return chunk_to_colors diff --git a/python/xorbits/_mars/services/task/analyzer/tests/__init__.py b/python/xorbits/_mars/services/task/analyzer/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/task/analyzer/tests/test_analyzer.py b/python/xorbits/_mars/services/task/analyzer/tests/test_analyzer.py new file mode 100644 index 000000000..e00690a31 --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/tests/test_analyzer.py @@ -0,0 +1,81 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ..... import dataframe as md +from ..... 
import tensor as mt +from .....config import Config +from .....core.operand.shuffle import ShuffleFetchType, ShuffleProxy +from .....resource import Resource +from ...core import Task +from ..analyzer import GraphAnalyzer + +t1 = mt.random.RandomState(0).rand(31, 27, chunk_size=10) +t2 = t1.reshape(27, 31) +t2.op.extra_params["_reshape_with_shuffle"] = True +df1 = md.DataFrame(t1, columns=[f"c{i}" for i in range(t1.shape[1])]) +df2 = df1.groupby(["c1"]).apply(lambda pdf: pdf.sum()) + + +@pytest.mark.parametrize("tileable", [df1.describe(), df2, t2]) +@pytest.mark.parametrize("fuse", [True, False]) +def test_shuffle_graph(tileable, fuse): + # can't test df.groupby and mt.bincount, those chunk graph build depend on ctx.get_chunks_meta/get_chunks_result + chunk_graph = tileable.build_graph(tile=True) + assert len(chunk_graph) > 0 + all_bands = [(f"address_{i}", "numa-0") for i in range(5)] + band_resource = dict((band, Resource(num_cpus=1)) for band in all_bands) + task = Task("mock_task", "mock_session", fuse_enabled=fuse) + analyzer = GraphAnalyzer( + chunk_graph, + band_resource, + task, + Config(), + dict(), + shuffle_fetch_type=ShuffleFetchType.FETCH_BY_INDEX, + ) + subtask_graph = analyzer.gen_subtask_graph() + proxy_subtasks = [] + for subtask in subtask_graph: + for c in subtask.chunk_graph.results: + if isinstance(c.op, ShuffleProxy): + assert len(subtask.chunk_graph.results) == 1 + proxy_subtasks.append(subtask) + proxy_chunks = [ + c + for subtask in proxy_subtasks + for c in chunk_graph + if subtask.chunk_graph.results[0].key == c.key + ] + assert len(proxy_subtasks) == len(proxy_chunks) + assert len(proxy_subtasks) > 0 + assert len(proxy_subtasks) == len(subtask_graph.get_shuffle_proxy_subtasks()) + for proxy_chunk, proxy_subtask in zip(proxy_chunks, proxy_subtasks): + reducer_subtasks = subtask_graph.successors(proxy_subtask) + for reducer_subtask in reducer_subtasks: + start_chunks = list(reducer_subtask.chunk_graph.iter_indep()) + assert len(start_chunks) == 1 + assert ( + start_chunks[0].op.shuffle_fetch_type == ShuffleFetchType.FETCH_BY_INDEX + ) + reducer_chunks = chunk_graph.successors(proxy_chunk) + # single reducer may have multiple output chunks, see `PSRSShuffle._execute_reduce + if len(reducer_subtasks) != len(reducer_chunks): + assert len(reducer_subtasks) == len(set(c.op for c in reducer_chunks)) + mapper_subtasks = subtask_graph.predecessors(proxy_subtask) + for mapper_subtask in mapper_subtasks: + assert len(mapper_subtask.chunk_graph.results) == 1 + mapper_chunks = chunk_graph.predecessors(proxy_chunk) + assert len(mapper_subtasks) == len(mapper_chunks) diff --git a/python/xorbits/_mars/services/task/analyzer/tests/test_assigner.py b/python/xorbits/_mars/services/task/analyzer/tests/test_assigner.py new file mode 100644 index 000000000..cc64f0cb8 --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/tests/test_assigner.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + +from ..... import dataframe as md +from .....config import Config +from .....core import ChunkGraph +from .....core.graph.builder.utils import build_graph +from .....core.operand import OperandStage +from .....resource import Resource +from .....tensor.arithmetic import TensorAdd +from .....tensor.fetch import TensorFetch +from .....tensor.random import TensorRand +from ...core import Task +from ..analyzer import GraphAnalyzer, need_reassign_worker +from ..assigner import GraphAssigner + + +def test_assigner_with_fetch_inputs(): + band_num = 8 + all_bands = [(f"address_{i}", "numa-0") for i in range(band_num)] + inputs = [ + TensorFetch(key=str(i), source_key=str(i), dtype=np.dtype(int)).new_chunk([]) + for i in range(band_num) + ] + no_fetch_inputs = [TensorRand(i).new_chunk([]) for i in range(4)] + results = [TensorAdd(lhs=inp, rhs=1).new_chunk([inp]) for inp in inputs] + cur_assigns = dict( + (fetch_chunk.op.key, band[0][0]) + for fetch_chunk, band in zip(reversed(inputs), all_bands) + ) + + chunk_graph = ChunkGraph() + for fetch_chunk, add_chunk in zip(inputs, results): + chunk_graph.add_node(fetch_chunk) + chunk_graph.add_node(add_chunk) + chunk_graph.add_edge(fetch_chunk, add_chunk) + for inp in no_fetch_inputs: + results.append(inp) + chunk_graph.add_node(inp) + chunk_graph.results = results + + band_resource = dict((band, Resource(num_cpus=1)) for band in all_bands) + + task = Task("mock_task", "mock_session") + analyzer = GraphAnalyzer(chunk_graph, band_resource, task, Config(), dict()) + subtask_graph = analyzer.gen_subtask_graph(cur_assigns) + + assigner = GraphAssigner( + chunk_graph, list(GraphAnalyzer._iter_start_ops(chunk_graph)), band_resource + ) + assigns = assigner.assign(cur_assigns) + key_to_assign = dict((c.key, band) for c, band in assigns.items()) + for subtask in subtask_graph: + input_chunks = list(subtask.chunk_graph.iter_indep()) + if all(isinstance(inp.op, TensorFetch) for inp in input_chunks): + # all inputs are fetch, expect band should be None + assert subtask.expect_band is None + else: + # if subtask has truly initial chunks, expect band should be + # same as assign results + for inp in input_chunks: + if not isinstance(inp.op, TensorFetch): + assert subtask.expect_band == key_to_assign[inp.key] + + +def test_shuffle_assign(): + band_num = 8 + all_bands = [(f"address_{i}", "numa-0") for i in range(band_num)] + + pdf = pd.DataFrame(np.random.rand(32, 4)) + df = md.DataFrame(pdf, chunk_size=4) + r = df.groupby(0).sum(method="shuffle") + chunk_graph = build_graph([r], tile=True) + + band_resource = dict((band, Resource(num_cpus=1)) for band in all_bands) + + reassign_worker_ops = [ + chunk.op for chunk in chunk_graph if need_reassign_worker(chunk.op) + ] + start_ops = list(GraphAnalyzer._iter_start_ops(chunk_graph)) + to_assign_ops = start_ops + reassign_worker_ops + + assigner = GraphAssigner(chunk_graph, to_assign_ops, band_resource) + assigns = assigner.assign() + assert len(assigns) == 16 + init_assigns = set() + reducer_assigns = set() + for chunk, assign in assigns.items(): + if chunk.op.stage == OperandStage.reduce: + reducer_assigns.add(assign) + else: + init_assigns.add(assign) + # init and reducers are assigned on all bands + assert len(init_assigns) == len(reducer_assigns) == 8 diff --git a/python/xorbits/_mars/services/task/analyzer/tests/test_fusion.py b/python/xorbits/_mars/services/task/analyzer/tests/test_fusion.py new file mode 100644 index 000000000..ece9c18ac --- /dev/null +++ 
b/python/xorbits/_mars/services/task/analyzer/tests/test_fusion.py @@ -0,0 +1,236 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .....core import ChunkGraph +from .....tensor.arithmetic import TensorTreeAdd +from ..fusion import Coloring + + +def test_simple_coloring(): + # graph: https://user-images.githubusercontent.com/357506/132340029-b595afcf-3cec-44cb-b1c3-aac379e2e607.png + chunks = [ + TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(8) + ] + graph = ChunkGraph([chunks[3], chunks[7]]) + for c in chunks: + graph.add_node(c) + chunks[2].op._inputs = [chunks[0], chunks[1]] + graph.add_edge(chunks[0], chunks[2]) + graph.add_edge(chunks[1], chunks[2]) + chunks[3].op._inputs = [chunks[2]] + graph.add_edge(chunks[2], chunks[3]) + chunks[6].op._inputs = [chunks[4], chunks[5]] + graph.add_edge(chunks[4], chunks[6]) + graph.add_edge(chunks[5], chunks[6]) + chunks[7].op._inputs = [chunks[6]] + graph.add_edge(chunks[6], chunks[7]) + + all_bands = [("127.0.0.1", "0"), ("127.0.0.1", "1")] + chunk_to_bands = { + chunks[0]: all_bands[0], + chunks[1]: all_bands[0], + chunks[4]: all_bands[1], + chunks[5]: all_bands[1], + } + + # allocate node 0, 1 with band 0, node 4, 5 with band 1 + coloring = Coloring(graph, all_bands, chunk_to_bands) + chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 2 + assert ( + chunk_to_colors[chunks[0]] + == chunk_to_colors[chunks[1]] + == chunk_to_colors[chunks[2]] + == chunk_to_colors[chunks[3]] + ) + assert ( + chunk_to_colors[chunks[4]] + == chunk_to_colors[chunks[5]] + == chunk_to_colors[chunks[6]] + == chunk_to_colors[chunks[7]] + ) + + # initial nodes all have different colors + coloring = Coloring(graph, all_bands, chunk_to_bands, initial_same_color_num=1) + chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 6 + assert ( + len( + { + chunk_to_colors[chunks[0]], + chunk_to_colors[chunks[1]], + chunk_to_colors[chunks[2]], + } + ) + == 3 + ) + assert chunk_to_colors[chunks[2]] == chunk_to_colors[chunks[3]] + + +def test_coloring_with_gpu_attr(): + chunks = [ + TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(8) + ] + graph = ChunkGraph([chunks[3], chunks[7]]) + for c in chunks: + graph.add_node(c) + + # two lines, one line can be fused as one task, + # the other cannot, because gpu attributes are different + chunks[0].op.gpu = True + chunks[1].op.gpu = True + chunks[1].op._inputs = [chunks[0]] + graph.add_edge(chunks[0], chunks[1]) + chunks[2].op._inputs = [chunks[1]] + graph.add_edge(chunks[1], chunks[2]) + chunks[3].op._inputs = [chunks[2]] + graph.add_edge(chunks[2], chunks[3]) + chunks[5].op._inputs = [chunks[4]] + graph.add_edge(chunks[4], chunks[5]) + chunks[6].op._inputs = [chunks[5]] + graph.add_edge(chunks[5], chunks[6]) + chunks[7].op._inputs = [chunks[6]] + graph.add_edge(chunks[6], chunks[7]) + + all_bands = [("127.0.0.1", "0"), ("127.0.0.1", "1")] + chunk_to_bands = { + chunks[0]: all_bands[0], 
+ chunks[4]: all_bands[1], + } + + coloring = Coloring(graph, all_bands, chunk_to_bands) + chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 3 + assert chunk_to_colors[chunks[0]] == chunk_to_colors[chunks[1]] + assert chunk_to_colors[chunks[2]] == chunk_to_colors[chunks[3]] + assert ( + chunk_to_colors[chunks[4]] + == chunk_to_colors[chunks[5]] + == chunk_to_colors[chunks[6]] + == chunk_to_colors[chunks[7]] + ) + + +def test_complex_coloring(): + # graph: https://user-images.githubusercontent.com/357506/132340055-f08106dd-b507-4e24-bc79-8364d6e1ef79.png + chunks = [ + TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data + for n in range(13) + ] + graph = ChunkGraph([chunks[7], chunks[12]]) + for c in chunks: + graph.add_node(c) + chunks[2].op._inputs = [chunks[0], chunks[1]] + graph.add_edge(chunks[0], chunks[2]) + graph.add_edge(chunks[1], chunks[2]) + chunks[10].op._inputs = [chunks[8], chunks[9]] + graph.add_edge(chunks[8], chunks[10]) + graph.add_edge(chunks[9], chunks[10]) + chunks[3].op._inputs = [chunks[2]] + graph.add_edge(chunks[2], chunks[3]) + chunks[4].op._inputs = [chunks[3]] + graph.add_edge(chunks[3], chunks[4]) + chunks[5].op._inputs = [chunks[2], chunks[10]] + graph.add_edge(chunks[2], chunks[5]) + graph.add_edge(chunks[10], chunks[5]) + chunks[6].op._inputs = [chunks[5]] + graph.add_edge(chunks[5], chunks[6]) + chunks[7].op._inputs = [chunks[4], chunks[6]] + graph.add_edge(chunks[4], chunks[7]) + graph.add_edge(chunks[6], chunks[7]) + chunks[11].op._inputs = [chunks[10]] + graph.add_edge(chunks[10], chunks[11]) + chunks[12].op._inputs = [chunks[6], chunks[11]] + graph.add_edge(chunks[6], chunks[12]) + graph.add_edge(chunks[11], chunks[12]) + + all_bands = [("127.0.0.1", "0"), ("127.0.0.1", "1")] + chunk_to_bands = { + chunks[0]: all_bands[0], + chunks[1]: all_bands[0], + chunks[8]: all_bands[1], + chunks[9]: all_bands[1], + } + # allocate node 0, 1 with band 0, node 8, 9 with band 1 + coloring = Coloring(graph, all_bands, chunk_to_bands) + chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 7 + assert ( + chunk_to_colors[chunks[0]] + == chunk_to_colors[chunks[1]] + == chunk_to_colors[chunks[2]] + ) + assert chunk_to_colors[chunks[3]] == chunk_to_colors[chunks[4]] + assert chunk_to_colors[chunks[5]] == chunk_to_colors[chunks[6]] + assert ( + chunk_to_colors[chunks[8]] + == chunk_to_colors[chunks[9]] + == chunk_to_colors[chunks[10]] + ) + assert ( + len( + { + chunk_to_colors[chunks[0]], + chunk_to_colors[chunks[3]], + chunk_to_colors[chunks[5]], + chunk_to_colors[chunks[7]], + chunk_to_colors[chunks[8]], + chunk_to_colors[chunks[11]], + chunk_to_colors[chunks[12]], + } + ) + == 7 + ) + + +def test_coloring_broadcaster(): + chunks = [ + TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(3) + ] + graph = ChunkGraph([chunks[2]]) + for c in chunks: + graph.add_node(c) + chunks[1].op._inputs = [chunks[0]] + graph.add_edge(chunks[0], chunks[1]) + chunks[2].op._inputs = [chunks[0]] + graph.add_edge(chunks[0], chunks[2]) + + all_bands = [("127.0.0.1", "0"), ("127.0.0.1", "1")] + chunk_to_bands = { + chunks[0]: all_bands[0], + } + + coloring = Coloring(graph, all_bands, chunk_to_bands) + chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 1 + assert ( + chunk_to_colors[chunks[0]] + == chunk_to_colors[chunks[1]] + == chunk_to_colors[chunks[2]] + ) + coloring = Coloring( + graph, all_bands, chunk_to_bands, as_broadcaster_successor_num=1 + ) + 
chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 3 + assert ( + len( + { + chunk_to_colors[chunks[0]], + chunk_to_colors[chunks[1]], + chunk_to_colors[chunks[2]], + } + ) + == 3 + ) diff --git a/python/xorbits/_mars/services/task/api/__init__.py b/python/xorbits/_mars/services/task/api/__init__.py new file mode 100644 index 000000000..d1e1792c9 --- /dev/null +++ b/python/xorbits/_mars/services/task/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractTaskAPI +from .oscar import TaskAPI +from .web import WebTaskAPI diff --git a/python/xorbits/_mars/services/task/api/core.py b/python/xorbits/_mars/services/task/api/core.py new file mode 100644 index 000000000..47c7330ac --- /dev/null +++ b/python/xorbits/_mars/services/task/api/core.py @@ -0,0 +1,147 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import List, Union + +from ....core import Tileable +from ..core import TaskResult, TileableGraph + + +class AbstractTaskAPI(ABC): + @abstractmethod + async def get_task_results(self, progress: bool = False) -> List[TaskResult]: + """ + Get results of all tasks in the session + + Parameters + ---------- + progress : bool + If True, will return task progress + + Returns + ------- + task_results: List[TaskResult] + List of task results + """ + + @abstractmethod + async def submit_tileable_graph( + self, + graph: TileableGraph, + fuse_enabled: bool = True, + extra_config: dict = None, + ) -> str: + """ + Submit a tileable graph + + Parameters + ---------- + graph : TileableGraph + Tileable graph. + task_name : str + Task name + fuse_enabled : bool + Enable fuse optimization + extra_config : dict + Extra config. + + Returns + ------- + task_id : str + Task ID. + """ + + @abstractmethod + async def wait_task(self, task_id: str, timeout: float = None): + """ + Wait for a task to finish. + + Parameters + ---------- + task_id : str + Task ID + timeout: float + Second to timeout + """ + + @abstractmethod + async def cancel_task(self, task_id: str): + """ + Cancel task. + + Parameters + ---------- + task_id : str + Task ID. + """ + + @abstractmethod + async def get_task_result(self, task_id: str) -> TaskResult: + """ + Get task status. + + Parameters + ---------- + task_id : str + Task ID. + + Returns + ------- + result : TaskResult + Task result. 
+ """ + + @abstractmethod + async def get_task_progress(self, task_id: str) -> float: + """ + Get task progress. + + Parameters + ---------- + task_id : str + Task ID. + + Returns + ------- + progress : float + Get task progress. + """ + + @abstractmethod + async def get_fetch_tileables(self, task_id: str) -> List[Tileable]: + """ + Get fetch tileable for a task. + + Parameters + ---------- + task_id : str + Task ID. + + Returns + ------- + fetch_tileable_list + Fetch tileable list. + """ + + @abstractmethod + async def get_last_idle_time(self) -> Union[float, None]: + """ + Get last idle time from task manager. + + Returns + ------- + last_idle_time: float + The last idle time if the task manager is idle else None. + """ diff --git a/python/xorbits/_mars/services/task/api/oscar.py b/python/xorbits/_mars/services/task/api/oscar.py new file mode 100644 index 000000000..9750f5590 --- /dev/null +++ b/python/xorbits/_mars/services/task/api/oscar.py @@ -0,0 +1,112 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union + +from .... import oscar as mo +from ....core import Tileable +from ....lib.aio import alru_cache +from ...subtask import SubtaskResult +from ..core import MapReduceInfo, TaskResult, TileableGraph +from ..supervisor.manager import TaskManagerActor +from .core import AbstractTaskAPI + + +class TaskAPI(AbstractTaskAPI): + def __init__( + self, session_id: str, task_manager_ref: mo.ActorRefType[TaskManagerActor] + ): + self._session_id = session_id + self._task_manager_ref = task_manager_ref + + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls, session_id: str, address: str) -> "TaskAPI": + """ + Create Task API. + + Parameters + ---------- + session_id : str + Session ID + address : str + Supervisor address. + + Returns + ------- + task_api + Task API. 
+ """ + task_manager_ref = await mo.actor_ref( + address, TaskManagerActor.gen_uid(session_id) + ) + return TaskAPI(session_id, task_manager_ref) + + async def get_task_results(self, progress: bool = False) -> List[TaskResult]: + return await self._task_manager_ref.get_task_results(progress) + + async def submit_tileable_graph( + self, + graph: TileableGraph, + fuse_enabled: bool = None, + extra_config: dict = None, + ) -> str: + try: + return await self._task_manager_ref.submit_tileable_graph( + graph, fuse_enabled=fuse_enabled, extra_config=extra_config + ) + except mo.ActorNotExist: + raise RuntimeError("Session closed already") + + async def get_tileable_graph_as_json(self, task_id: str): + return await self._task_manager_ref.get_tileable_graph_dict_by_task_id(task_id) + + async def get_tileable_details(self, task_id: str): + return await self._task_manager_ref.get_tileable_details(task_id) + + async def get_tileable_subtasks( + self, task_id: str, tileable_id: str, with_input_output: bool + ): + return await self._task_manager_ref.get_tileable_subtasks( + task_id, tileable_id, with_input_output + ) + + async def wait_task(self, task_id: str, timeout: float = None): + return await self._task_manager_ref.wait_task(task_id, timeout=timeout) + + async def get_task_result(self, task_id: str) -> TaskResult: + return await self._task_manager_ref.get_task_result(task_id) + + async def get_task_progress(self, task_id: str) -> float: + return await self._task_manager_ref.get_task_progress(task_id) + + async def cancel_task(self, task_id: str): + return await self._task_manager_ref.cancel_task(task_id) + + async def get_fetch_tileables(self, task_id: str) -> List[Tileable]: + return await self._task_manager_ref.get_task_result_tileables(task_id) + + async def set_subtask_result(self, subtask_result: SubtaskResult): + return await self._task_manager_ref.set_subtask_result.tell(subtask_result) + + async def get_last_idle_time(self) -> Union[float, None]: + return await self._task_manager_ref.get_last_idle_time() + + async def remove_tileables(self, tileable_keys: List[str]): + return await self._task_manager_ref.remove_tileables(tileable_keys) + + async def get_map_reduce_info( + self, task_id: str, map_reduce_id: int + ) -> MapReduceInfo: + return await self._task_manager_ref.get_map_reduce_info(task_id, map_reduce_id) diff --git a/python/xorbits/_mars/services/task/api/web.py b/python/xorbits/_mars/services/task/api/web.py new file mode 100644 index 000000000..86f10d01a --- /dev/null +++ b/python/xorbits/_mars/services/task/api/web.py @@ -0,0 +1,296 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import base64 +import json +from typing import Callable, List, Optional, Union + +from ....core import Tileable, TileableGraph +from ....lib.tbcode import dump_traceback_code, load_traceback_code +from ....utils import deserialize_serializable, serialize_serializable +from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api +from ..core import TaskResult, TaskStatus +from .core import AbstractTaskAPI + + +def _json_serial_task_result(result: Optional[TaskResult]): + if result is None: + return {} + res_json = { + "task_id": result.task_id, + "session_id": result.session_id, + "stage_id": result.stage_id, + "start_time": result.start_time, + "end_time": result.end_time, + "progress": result.progress, + "status": result.status.value, + "profiling": result.profiling, + } + if result.error is not None: + res_json["error"] = base64.b64encode( + serialize_serializable(result.error) + ).decode() + res_json["traceback"] = base64.b64encode( + serialize_serializable(result.traceback) + ).decode() + res_json["traceback_code"] = dump_traceback_code(result.traceback) + return res_json + + +def _json_deserial_task_result(d: dict) -> Optional[TaskResult]: + if not d: + return None + if "error" in d: + d["error"] = deserialize_serializable(base64.b64decode(d["error"])) + d["traceback"] = deserialize_serializable(base64.b64decode(d["traceback"])) + load_traceback_code(d.pop("traceback_code")) + d["status"] = TaskStatus(d["status"]) + return TaskResult(**d) + + +class TaskWebAPIHandler(MarsServiceWebAPIHandler): + _root_pattern = "/api/session/(?P[^/]+)/task" + + async def _get_oscar_task_api(self, session_id: str): + from .oscar import TaskAPI + + return await self._get_api_by_key(TaskAPI, session_id) + + @web_api("", method="post") + async def submit_tileable_graph(self, session_id: str): + body_args = ( + deserialize_serializable(self.request.body) if self.request.body else None + ) + + fuse_enabled = body_args.get("fuse") + + graph = body_args["graph"] + extra_config = body_args.get("extra_config", None) + if extra_config: + extra_config = deserialize_serializable(extra_config) + + oscar_api = await self._get_oscar_task_api(session_id) + task_id = await oscar_api.submit_tileable_graph( + graph, + fuse_enabled=fuse_enabled, + extra_config=extra_config, + ) + self.write(task_id) + + @web_api("", method="get", cache_blocking=True) + async def get_task_results(self, session_id: str): + progress = bool(int(self.get_argument("progress", "0"))) + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_task_results(progress=progress) + self.write(json.dumps({"tasks": [_json_serial_task_result(r) for r in res]})) + + @web_api( + "(?P[^/]+)", + method="get", + arg_filter={"action": "fetch_tileables"}, + cache_blocking=True, + ) + async def get_fetch_tileables(self, session_id: str, task_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_fetch_tileables(task_id) + self.write(serialize_serializable(res)) + + @web_api("(?P[^/]+)", method="get", cache_blocking=True) + async def get_task_result(self, session_id: str, task_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_task_result(task_id) + self.write(json.dumps(_json_serial_task_result(res))) + + @web_api( + "(?P[^/]+)/tileable_graph", + method="get", + arg_filter={"action": "get_tileable_graph_as_json"}, + cache_blocking=True, + ) + async def get_tileable_graph_as_json(self, session_id: str, task_id: str): + 
oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_tileable_graph_as_json(task_id) + self.write(json.dumps(res)) + + @web_api("(?P[^/]+)/tileable_detail", method="get", cache_blocking=True) + async def get_tileable_details(self, session_id: str, task_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_tileable_details(task_id) + self.write(json.dumps(res)) + + @web_api( + "(?P[^/]+)/(?P[^/]+)/subtask", + method="get", + cache_blocking=True, + ) + async def get_tileable_subtasks( + self, session_id: str, task_id: str, tileable_id: str + ): + with_input_output = self.get_argument("with_input_output", "false") == "true" + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_tileable_subtasks( + task_id, tileable_id, with_input_output + ) + self.write(json.dumps(res)) + + @web_api( + "(?P[^/]+)", + method="get", + arg_filter={"action": "progress"}, + cache_blocking=True, + ) + async def get_task_progress(self, session_id: str, task_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_task_progress(task_id) + self.write(str(res)) + + @web_api("", method="get", arg_filter={"action": "last_idle_time"}) + async def get_last_idle_time(self, session_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_last_idle_time() + if res: + self.write(str(res)) + + @web_api("(?P[^/]+)", method="get", arg_filter={"action": "wait"}) + async def wait_task(self, session_id: str, task_id: str): + timeout = self.get_argument("timeout", None) or None + timeout = float(timeout) if timeout is not None else None + oscar_api = await self._get_oscar_task_api(session_id) + if timeout: + try: + res = await asyncio.wait_for( + asyncio.shield(oscar_api.wait_task(task_id, timeout)), + timeout=timeout, + ) + self.write(json.dumps(_json_serial_task_result(res))) + except asyncio.TimeoutError: + self.write(json.dumps({})) + else: + res = await oscar_api.wait_task(task_id, timeout) + self.write(json.dumps(_json_serial_task_result(res))) + + @web_api("(?P[^/]+)", method="delete") + async def cancel_task(self, session_id: str, task_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + await oscar_api.cancel_task(task_id) + + +web_handlers = {TaskWebAPIHandler.get_root_pattern(): TaskWebAPIHandler} + + +class WebTaskAPI(AbstractTaskAPI, MarsWebAPIClientMixin): + def __init__( + self, session_id: str, address: str, request_rewriter: Callable = None + ): + self._session_id = session_id + self._address = address.rstrip("/") + self.request_rewriter = request_rewriter + + async def get_task_results(self, progress: bool = False) -> List[TaskResult]: + path = f"{self._address}/api/session/{self._session_id}/task" + params = {"progress": int(progress)} + res = await self._request_url("GET", path, params=params) + return [ + _json_deserial_task_result(d) + for d in json.loads(res.body.decode())["tasks"] + ] + + async def submit_tileable_graph( + self, + graph: TileableGraph, + fuse_enabled: bool = True, + extra_config: dict = None, + ) -> str: + path = f"{self._address}/api/session/{self._session_id}/task" + extra_config_ser = ( + serialize_serializable(extra_config) if extra_config else None + ) + body = serialize_serializable( + { + "fuse": fuse_enabled, + "graph": graph, + "extra_config": extra_config_ser, + } + ) + res = await self._request_url( + path=path, + method="POST", + headers={"Content-Type": "application/octet-stream"}, + 
data=body, + ) + return res.body.decode().strip() + + async def get_fetch_tileables(self, task_id: str) -> List[Tileable]: + path = ( + f"{self._address}/api/session/{self._session_id}/task/{task_id}" + f"?action=fetch_tileables" + ) + res = await self._request_url("GET", path) + return deserialize_serializable(res.body) + + async def get_task_result(self, task_id: str) -> TaskResult: + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}" + res = await self._request_url("GET", path) + return _json_deserial_task_result(json.loads(res.body.decode())) + + async def get_task_progress(self, task_id: str) -> float: + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}" + params = dict(action="progress") + res = await self._request_url("GET", path, params=params) + return float(res.body.decode()) + + async def get_last_idle_time(self) -> Union[float, None]: + path = f"{self._address}/api/session/{self._session_id}/task" + params = dict(action="last_idle_time") + res = await self._request_url("GET", path, params=params) + content = res.body.decode() + return float(content) if content else None + + async def wait_task(self, task_id: str, timeout: float = None): + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}" + # increase client timeout to handle network overhead during entire request + client_timeout = timeout + 3 if timeout else 0 + params = {"action": "wait", "timeout": "" if timeout is None else str(timeout)} + res = await self._request_url( + "GET", path, params=params, request_timeout=client_timeout + ) + return _json_deserial_task_result(json.loads(res.body.decode())) + + async def cancel_task(self, task_id: str): + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}" + await self._request_url(path=path, method="DELETE") + + async def get_tileable_graph_as_json(self, task_id: str): + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}/tileable_graph" + params = dict(action="get_tileable_graph_as_json") + res = await self._request_url(path=path, params=params, method="GET") + return json.loads(res.body.decode()) + + async def get_tileable_details(self, task_id: str): + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}/tileable_detail" + res = await self._request_url(path=path, method="GET") + return json.loads(res.body.decode()) + + async def get_tileable_subtasks( + self, task_id: str, tileable_id: str, with_input_output: bool + ): + with_input_output = "true" if with_input_output else "false" + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}/{tileable_id}/subtask" + params = { + "action": "fetch_graph", + "with_input_output": with_input_output, + } + res = await self._request_url(path=path, params=params, method="GET") + return json.loads(res.body.decode()) diff --git a/python/xorbits/_mars/services/task/config.py b/python/xorbits/_mars/services/task/config.py new file mode 100644 index 000000000..7fb76001f --- /dev/null +++ b/python/xorbits/_mars/services/task/config.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...config import Config, is_bool, is_integer, is_list + +task_options = Config() + +# supervisor +task_options.register_option("optimize_tileable_graph", True, validator=is_bool) +task_options.register_option("optimize_chunk_graph", True, validator=is_bool) +task_options.register_option("fuse_enabled", True, validator=is_bool) +task_options.register_option("reserved_finish_tasks", 25, validator=is_integer) + +# worker +task_options.register_option("runtime_engines", ["numexpr", "cupy"], validator=is_list) diff --git a/python/xorbits/_mars/services/task/core.py b/python/xorbits/_mars/services/task/core.py new file mode 100644 index 000000000..5c871128d --- /dev/null +++ b/python/xorbits/_mars/services/task/core.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +from enum import Enum +from string import ascii_letters, digits +from typing import Any, Dict, List, Optional, Tuple + +from ...core import TileableGraph +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FieldTypes, + Float64Field, + Int32Field, + ListField, + ReferenceField, + Serializable, + StringField, +) +from ...typing import BandType + + +class TaskStatus(Enum): + pending = 0 + running = 1 + terminated = 2 + + +class Task(Serializable): + task_id: str = StringField("task_id") + session_id: str = StringField("session_id") + tileable_graph: TileableGraph = ReferenceField("tileable_graph", TileableGraph) + fuse_enabled: bool = BoolField("fuse_enabled") + extra_config: dict = DictField("extra_config") + + def __init__( + self, + task_id: str = None, + session_id: str = None, + tileable_graph: TileableGraph = None, + fuse_enabled: bool = True, + extra_config: dict = None, + ): + super().__init__( + task_id=task_id, + session_id=session_id, + tileable_graph=tileable_graph, + fuse_enabled=fuse_enabled, + extra_config=extra_config, + ) + + +class TaskResult(Serializable): + task_id: str = StringField("task_id") + session_id: str = StringField("session_id") + stage_id: str = StringField("stage_id") + start_time: Optional[float] = Float64Field("start_time") + end_time: Optional[float] = Float64Field("end_time") + progress: Optional[float] = Float64Field("progress") + status: TaskStatus = ReferenceField("status", TaskStatus) + error = AnyField("error") + traceback = AnyField("traceback") + profiling: Dict = DictField("profiling") + + def __init__( + self, + task_id: str = None, + session_id: str = None, + stage_id: str = None, + start_time: Optional[float] = None, + end_time: Optional[float] = None, + progress: 
Optional[float] = None, + status: TaskStatus = None, + error: Any = None, + traceback: Any = None, + profiling: Dict = None, + ): + super().__init__( + task_id=task_id, + session_id=session_id, + stage_id=stage_id, + start_time=start_time, + end_time=end_time, + progress=progress, + status=status, + error=error, + traceback=traceback, + profiling=profiling, + ) + + +def new_task_id(): + return "".join(random.choice(ascii_letters + digits) for _ in range(24)) + + +class MapReduceInfo(Serializable): + # record map reduce info during analyzing + # record reducer indexes, and assigned bands + map_reduce_id: int = Int32Field("map_reduce_id") + reducer_indexes: List[Tuple[int]] = ListField( + "reducer_indexes", FieldTypes.tuple(FieldTypes.int64), default_factory=list + ) + reducer_bands: List[BandType] = ListField( + "reducer_bands", + FieldTypes.tuple(FieldTypes.string, FieldTypes.string), + default_factory=list, + ) diff --git a/python/xorbits/_mars/services/task/errors.py b/python/xorbits/_mars/services/task/errors.py new file mode 100644 index 000000000..e431bf07d --- /dev/null +++ b/python/xorbits/_mars/services/task/errors.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core.base import MarsError + + +class TaskNotExist(MarsError): + pass diff --git a/python/xorbits/_mars/services/task/execution/__init__.py b/python/xorbits/_mars/services/task/execution/__init__.py new file mode 100644 index 000000000..61f9a12fe --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .mars import * +from .ray import * diff --git a/python/xorbits/_mars/services/task/execution/api.py b/python/xorbits/_mars/services/task/execution/api.py new file mode 100644 index 000000000..718e0805a --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/api.py @@ -0,0 +1,256 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, Type, Union + +from ....core import Chunk, ChunkGraph, TileContext +from ....core.operand.shuffle import ShuffleFetchType +from ....resource import Resource +from ....typing import BandType +from ....utils import merge_dict +from ...subtask import SubtaskGraph, SubtaskResult + + +class ExecutionConfig: + """ + The config for execution backends. + + This class should ONLY provide the APIs for the parts other than + just the execution. Each backend may have a different implementation + of the API. + + If some configuration is for a specific backend, it should be in + the backend config, e.g. `get_mars_special_config()` should be in + the `MarsExecutionConfig`. + """ + + name = None + + def __init__(self, config: Dict): + """ + An example of config: + { + "backend": "mars", + "mars": { + "n_worker": 1, + "n_cpu": 2, + ... + }, + } + """ + self._config = config + + def merge_from(self, execution_config: "ExecutionConfig") -> "ExecutionConfig": + assert isinstance(execution_config, ExecutionConfig) + assert self.backend == execution_config.backend + merge_dict( + self._config, + execution_config.get_config_dict(), + ) + return self + + @property + def backend(self) -> str: + """The backend from config.""" + return self._config["backend"] + + def get_config_dict(self) -> Dict: + """Get the execution config dict.""" + return self._config + + @abstractmethod + def get_deploy_band_resources(self) -> List[Dict[str, Resource]]: + """Get the band resources for deployment.""" + + @abstractmethod + def get_shuffle_fetch_type(self) -> ShuffleFetchType: + """Get shuffle fetch type for shuffle execution.""" + + @classmethod + def from_config(cls, config: Dict, backend: str = None) -> "ExecutionConfig": + """Construct an execution config instance from config.""" + execution_config = config["task"]["execution_config"] + return cls.from_execution_config(execution_config, backend) + + @classmethod + def from_execution_config( + cls, execution_config: Union[Dict, "ExecutionConfig"], backend: str = None + ) -> "ExecutionConfig": + """Construct an execution config instance from execution config.""" + if isinstance(execution_config, ExecutionConfig): + assert backend is None + return execution_config + if backend is not None: + name = execution_config["backend"] = backend + else: + name = execution_config.setdefault("backend", "mars") + config_cls = _name_to_config_cls[name] + execution_config.setdefault(name, {}) + return config_cls(execution_config) + + @classmethod + def from_params( + cls, + backend: str, + n_worker: int, + n_cpu: int, + mem_bytes: int = 0, + cuda_devices: List[List[int]] = None, + **kwargs, + ) -> "ExecutionConfig": + """Construct an execution config instance from params.""" + execution_config = { + "backend": backend, + backend: dict( + { + "n_worker": n_worker, + "n_cpu": n_cpu, + "mem_bytes": mem_bytes, + "cuda_devices": cuda_devices, + }, + **kwargs, + ), + } + return cls.from_execution_config(execution_config) + + +_name_to_config_cls: Dict[str, Type[ExecutionConfig]] = {} + + +def register_config_cls(config_cls: Type[ExecutionConfig]): + _name_to_config_cls[config_cls.name] = config_cls + return config_cls + + +@dataclass +class ExecutionChunkResult: + meta: Dict # The chunk meta for iterative tiling. + context: Any # The context info, e.g. ray.ObjectRef.
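Editor's note: a minimal usage sketch for the `ExecutionConfig` above, assuming only the classmethods shown in this file (`from_params`, `from_execution_config`, `get_config_dict`) and that a backend config class named "mars" has been registered via `register_config_cls`; it is illustrative and not part of the original change.

    # Build a config for the "mars" backend from plain parameters (sketch only).
    config = ExecutionConfig.from_params(backend="mars", n_worker=1, n_cpu=2)
    assert config.backend == "mars"
    # The underlying dict keeps the documented shape: {"backend": "mars", "mars": {...}}.
    raw = config.get_config_dict()
    # A dict of that shape can be turned back into a config instance.
    config2 = ExecutionConfig.from_execution_config(raw)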
+ + +class TaskExecutor(ABC): + name = None + + @classmethod + @abstractmethod + async def create( + cls, + config: Union[Dict, ExecutionConfig], + *, + session_id: str, + address: str, + task, + tile_context: TileContext, + **kwargs, + ) -> "TaskExecutor": + backend_config = ExecutionConfig.from_execution_config(config) + executor_cls = _name_to_task_executor_cls[backend_config.backend] + if executor_cls.create.__func__ is TaskExecutor.create.__func__: + raise NotImplementedError( + f"The {executor_cls} should implement the abstract classmethod `create`." + ) + return await executor_cls.create( + backend_config, + session_id=session_id, + address=address, + task=task, + tile_context=tile_context, + **kwargs, + ) + + @abstractmethod + def get_execution_config(self) -> ExecutionConfig: + """Return execution config.""" + + def destroy(self): + """Destroy the executor.""" + + async def __aenter__(self): + """Called when begin to execute the task.""" + + @abstractmethod + async def execute_subtask_graph( + self, + stage_id: str, + subtask_graph: SubtaskGraph, + chunk_graph: ChunkGraph, + tile_context: TileContext, + context: Any = None, + ) -> Dict[Chunk, ExecutionChunkResult]: + """Execute a subtask graph and returns result.""" + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Called when finish the task.""" + + @abstractmethod + async def get_available_band_resources(self) -> Dict[BandType, Resource]: + """Get available band resources.""" + + @abstractmethod + async def get_progress(self) -> float: + """Get the execution progress.""" + + @abstractmethod + async def cancel(self): + """Cancel execution.""" + + # The following APIs are for compatible with mars backend, they + # will be removed as soon as possible. + async def set_subtask_result(self, subtask_result: SubtaskResult): + """Set the subtask result.""" + + def get_stage_processors(self): + """Get stage processors.""" + + +_name_to_task_executor_cls: Dict[str, Type[TaskExecutor]] = {} + + +def register_executor_cls(executor_cls: Type[TaskExecutor]): + _name_to_task_executor_cls[executor_cls.name] = executor_cls + return executor_cls + + +class Fetcher: + """The data fetcher for execution backends.""" + + name = None + required_meta_keys = () # The required meta keys. + + @abstractmethod + def __init__(self, **kwargs): + pass + + @abstractmethod + async def append(self, chunk_key: str, chunk_meta: Dict, conditions: List = None): + """Append chunk key and related infos.""" + + @abstractmethod + async def get(self): + """Get all the data of appended chunk keys.""" + + @classmethod + def create(cls, backend: str, **kwargs) -> "Fetcher": + fetcher_cls = _name_to_fetcher_cls[backend] + return fetcher_cls(**kwargs) + + +_name_to_fetcher_cls: Dict[str, Type[Fetcher]] = {} + + +def register_fetcher_cls(fetcher_cls: Type[Fetcher]): + _name_to_fetcher_cls[fetcher_cls.name] = fetcher_cls + return fetcher_cls diff --git a/python/xorbits/_mars/services/task/execution/mars/__init__.py b/python/xorbits/_mars/services/task/execution/mars/__init__.py new file mode 100644 index 000000000..b0df63861 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import MarsExecutionConfig +from .executor import MarsTaskExecutor +from .fetcher import MarsFetcher diff --git a/python/xorbits/_mars/services/task/execution/mars/config.py b/python/xorbits/_mars/services/task/execution/mars/config.py new file mode 100644 index 000000000..962b7d898 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/config.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List + +from .....core.operand.shuffle import ShuffleFetchType +from .....resource import Resource +from ..api import ExecutionConfig, register_config_cls +from ..utils import get_band_resources_from_config + + +@register_config_cls +class MarsExecutionConfig(ExecutionConfig): + name = "mars" + + def __init__(self, execution_config: Dict): + super().__init__(execution_config) + self._mars_execution_config = execution_config[self.backend] + + def get_deploy_band_resources(self) -> List[Dict[str, Resource]]: + return get_band_resources_from_config(self._mars_execution_config) + + def get_shuffle_fetch_type(self) -> ShuffleFetchType: + return ShuffleFetchType.FETCH_BY_KEY diff --git a/python/xorbits/_mars/services/task/execution/mars/executor.py b/python/xorbits/_mars/services/task/execution/mars/executor.py new file mode 100644 index 000000000..fb5b4d198 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/executor.py @@ -0,0 +1,461 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import sys +from collections import defaultdict +from typing import Dict, List, Optional, Set + +from ..... 
import oscar as mo +from .....core import ChunkGraph, TileContext +from .....core.operand import Fetch, MapReduceOperand, OperandStage, ShuffleProxy +from .....lib.aio import alru_cache +from .....oscar.profiling import ProfilingData +from .....resource import Resource +from .....typing import BandType, TileableType +from .....utils import Timer +from ....cluster.api import ClusterAPI +from ....context import ThreadedServiceContext +from ....lifecycle.api import LifecycleAPI +from ....meta.api import MetaAPI +from ....scheduling import SchedulingAPI +from ....subtask import Subtask, SubtaskGraph, SubtaskResult, SubtaskStatus +from ...core import Task +from ..api import TaskExecutor, register_executor_cls +from ..utils import ResultTileablesLifecycle +from .config import MarsExecutionConfig +from .resource import ResourceEvaluator +from .stage import TaskStageProcessor + +logger = logging.getLogger(__name__) + + +def _get_n_reducers(subtask: Subtask) -> int: + return len( + [ + r + for r in subtask.chunk_graph + if isinstance(r.op, MapReduceOperand) and r.op.stage == OperandStage.reduce + ] + ) + + +@register_executor_cls +class MarsTaskExecutor(TaskExecutor): + name = "mars" + _stage_processors: List[TaskStageProcessor] + _stage_tile_progresses: List[float] + _cur_stage_processor: Optional[TaskStageProcessor] + _meta_updated_tileables: Set[TileableType] + _ctx: ThreadedServiceContext + + def __init__( + self, + config: MarsExecutionConfig, + task: Task, + tile_context: TileContext, + cluster_api: ClusterAPI, + lifecycle_api: LifecycleAPI, + scheduling_api: SchedulingAPI, + meta_api: MetaAPI, + resource_evaluator: ResourceEvaluator, + ctx: ThreadedServiceContext, + ): + self._config = config + self._task = task + self._tileable_graph = task.tileable_graph + self._raw_tile_context = tile_context.copy() + self._tile_context = tile_context + self._session_id = task.session_id + + # api + self._cluster_api = cluster_api + self._lifecycle_api = lifecycle_api + self._scheduling_api = scheduling_api + self._meta_api = meta_api + + self._stage_processors = [] + self._stage_tile_progresses = [] + self._cur_stage_processor = None + self._result_tileables_lifecycle = None + self._subtask_decref_events = dict() + self._meta_updated_tileables = set() + + # Evaluate and initialize subtasks required resource. 
+ self._resource_evaluator = resource_evaluator + + # context + self._ctx = ctx + + @classmethod + async def create( + cls, + config: MarsExecutionConfig, + *, + session_id: str, + address: str, + task: Task, + tile_context: TileContext, + **kwargs, + ) -> "MarsTaskExecutor": + assert ( + len(kwargs) == 0 + ), f"Unexpected kwargs for {cls.__name__}.create: {kwargs}" + cluster_api, lifecycle_api, scheduling_api, meta_api = await cls._get_apis( + session_id, address + ) + resource_evaluator = await ResourceEvaluator.create( + config.get_config_dict(), + session_id=task.session_id, + task_id=task.task_id, + cluster_api=cluster_api, + ) + ctx = await cls._init_context(session_id, address) + return cls( + config, + task, + tile_context, + cluster_api, + lifecycle_api, + scheduling_api, + meta_api, + resource_evaluator, + ctx, + ) + + def get_execution_config(self): + return self._config + + @classmethod + @alru_cache(cache_exceptions=False) + async def _get_apis(cls, session_id: str, address: str): + return await asyncio.gather( + ClusterAPI.create(address), + LifecycleAPI.create(session_id, address), + SchedulingAPI.create(session_id, address), + MetaAPI.create(session_id, address), + ) + + @classmethod + async def _init_context( + cls, session_id: str, address: str + ) -> ThreadedServiceContext: + loop = asyncio.get_running_loop() + context = ThreadedServiceContext( + session_id, address, address, address, loop=loop + ) + await context.init() + return context + + async def __aenter__(self): + profiling = ProfilingData[self._task.task_id, "general"] + # incref fetch tileables to ensure fetch data not deleted + with Timer() as timer: + await self._incref_fetch_tileables() + profiling.set("incref_fetch_tileables", timer.duration) + self._result_tileables_lifecycle = ResultTileablesLifecycle( + self._tileable_graph, self._tile_context, self._lifecycle_api + ) + self._ctx.__enter__() + + async def execute_subtask_graph( + self, + stage_id: str, + subtask_graph: SubtaskGraph, + chunk_graph: ChunkGraph, + tile_context: TileContext, + context=None, + ): + available_bands = await self.get_available_band_resources() + await self._result_tileables_lifecycle.incref_tiled() + stage_processor = TaskStageProcessor( + stage_id, + self._task, + chunk_graph, + subtask_graph, + list(available_bands), + tile_context, + self._scheduling_api, + self._meta_api, + ) + await self._incref_stage(stage_processor) + await self._resource_evaluator.evaluate(stage_processor) + self._stage_processors.append(stage_processor) + self._cur_stage_processor = stage_processor + # get the tiled progress for current stage + prev_progress = sum(self._stage_tile_progresses) + curr_tile_progress = self._tile_context.get_all_progress() - prev_progress + self._stage_tile_progresses.append(curr_tile_progress) + return await stage_processor.run() + + async def __aexit__(self, exc_type, exc_val, exc_tb): + # clean-ups + decrefs = [] + error_or_cancelled = False + for stage_processor in self._stage_processors: + if stage_processor.error_or_cancelled(): + error_or_cancelled = True + decrefs.append(self._decref_stage.delay(stage_processor)) + await self._decref_stage.batch(*decrefs) + # revert fetch incref + await self._decref_fetch_tileables() + if error_or_cancelled: + # revert result incref if error or cancelled + await self._result_tileables_lifecycle.decref_tracked() + await self._resource_evaluator.report() + self._ctx.__exit__(exc_type, exc_val, exc_tb) + + async def get_available_band_resources(self) -> Dict[BandType, Resource]: + async 
for bands in self._cluster_api.watch_all_bands(): + if bands: + return bands + + async def get_progress(self) -> float: + # get progress of stages + executor_progress = 0.0 + assert len(self._stage_tile_progresses) == len(self._stage_processors) + for stage_processor, stage_tile_progress in zip( + self._stage_processors, self._stage_tile_progresses + ): + if stage_processor.subtask_graph is None: # pragma: no cover + # generating subtask + continue + n_subtask = len(stage_processor.subtask_graph) + if n_subtask == 0: # pragma: no cover + continue + progress = sum( + result.progress for result in stage_processor.subtask_results.values() + ) + progress += sum( + result.progress + for subtask_key, result in stage_processor.subtask_snapshots.items() + if subtask_key not in stage_processor.subtask_results + ) + subtask_progress = progress / n_subtask + executor_progress += subtask_progress * stage_tile_progress + return executor_progress + + async def cancel(self): + if self._cur_stage_processor is not None: + await self._cur_stage_processor.cancel() + + async def set_subtask_result(self, subtask_result: SubtaskResult): + if self._cur_stage_processor is None or ( + subtask_result.stage_id + and self._cur_stage_processor.stage_id != subtask_result.stage_id + ): + logger.warning( + "Stage %s for subtask %s does not exist, got stale subtask result %s which may be " + "speculative execution from previous stages, just ignore it.", + subtask_result.stage_id, + subtask_result.subtask_id, + subtask_result, + ) + return + stage_processor = self._cur_stage_processor + subtask = stage_processor.subtask_id_to_subtask[subtask_result.subtask_id] + + prev_result = stage_processor.subtask_results.get(subtask) + if prev_result and ( + prev_result.status == SubtaskStatus.succeeded + or prev_result.progress > subtask_result.progress + ): + logger.info( + "Skip set subtask %s with result %s, previous result is %s.", + subtask.subtask_id, + subtask_result, + prev_result, + ) + # For duplicate runs of subtasks, if the progress is smaller or the subtask has finished or been canceled + # in task speculation, just do nothing. + # TODO(chaokunyang) If a duplicate run of a subtask fails, it may be a fault in the worker node; + # print the exception, and if there are multiple failures on the same node, remove the node from the cluster. + return + if subtask_result.bands: + [band] = subtask_result.bands + else: + band = None + stage_processor.subtask_snapshots[subtask] = subtask_result.update( + stage_processor.subtask_snapshots.get(subtask) + ) + if subtask_result.status.is_done: + # update stage_processor.subtask_results to avoid concurrent set_subtask_result + # since we release lock when `_decref_input_subtasks`. + stage_processor.subtask_results[subtask] = subtask_result.update( + stage_processor.subtask_results.get(subtask) + ) + try: + # Since every worker will call supervisor to set subtask result, + # we need to release actor lock to make `decref_chunks` parallel to avoid blocking + # other `set_subtask_result` calls. + # If speculative execution is enabled, concurrent subtasks may raise errors since input chunks may + # have been deleted. But it's OK because the current subtask run has succeeded.
+ if subtask.subtask_id not in stage_processor.decref_subtask: + stage_processor.decref_subtask.add(subtask.subtask_id) + await self._decref_input_subtasks( + subtask, stage_processor.subtask_graph + ) + + except: # noqa: E722 # nosec # pylint: disable=bare-except # pragma: no cover + logger.debug( + "Decref input subtasks for subtask %s failed.", subtask.subtask_id + ) + _, err, tb = sys.exc_info() + if subtask_result.status not in ( + SubtaskStatus.errored, + SubtaskStatus.cancelled, + ): + subtask_result.status = SubtaskStatus.errored + subtask_result.error = err + subtask_result.traceback = tb + await stage_processor.set_subtask_result(subtask_result, band=band) + + def get_stage_processors(self): + return self._stage_processors + + async def _incref_fetch_tileables(self): + # incref fetch tileables in tileable graph to prevent them from deleting + to_incref_tileable_keys = [ + tileable.op.source_key + for tileable in self._tileable_graph + if isinstance(tileable.op, Fetch) and tileable in self._raw_tile_context + ] + await self._lifecycle_api.incref_tileables(to_incref_tileable_keys) + + async def _decref_fetch_tileables(self): + fetch_tileable_keys = [ + tileable.op.source_key + for tileable in self._tileable_graph + if isinstance(tileable.op, Fetch) and tileable in self._raw_tile_context + ] + await self._lifecycle_api.decref_tileables(fetch_tileable_keys) + + async def _incref_stage(self, stage_processor: "TaskStageProcessor"): + subtask_graph = stage_processor.subtask_graph + incref_chunk_key_to_counts = defaultdict(lambda: 0) + for subtask in subtask_graph: + # for subtask has successors, incref number of successors + n = subtask_graph.count_successors(subtask) + for c in subtask.chunk_graph.results: + incref_chunk_key_to_counts[c.key] += n + # process reducer, incref mapper chunks + for pre_graph in subtask_graph.iter_predecessors(subtask): + for chk in pre_graph.chunk_graph.results: + if isinstance(chk.op, ShuffleProxy): + n_reducers = _get_n_reducers(subtask) + for map_chunk in chk.inputs: + incref_chunk_key_to_counts[map_chunk.key] += n_reducers + result_chunks = stage_processor.chunk_graph.result_chunks + for c in result_chunks: + incref_chunk_key_to_counts[c.key] += 1 + logger.debug( + "Incref chunks for stage %s: %s", + stage_processor.stage_id, + incref_chunk_key_to_counts, + ) + await self._lifecycle_api.incref_chunks( + list(incref_chunk_key_to_counts), + counts=list(incref_chunk_key_to_counts.values()), + ) + + @classmethod + def _get_decref_stage_chunk_key_to_counts( + cls, stage_processor: "TaskStageProcessor" + ) -> Dict[str, int]: + decref_chunk_key_to_counts = defaultdict(lambda: 0) + error_or_cancelled = stage_processor.error_or_cancelled() + if stage_processor.subtask_graph: + subtask_graph = stage_processor.subtask_graph + if error_or_cancelled: + # error or cancel, rollback incref for subtask results + for subtask in subtask_graph: + if subtask.subtask_id in stage_processor.decref_subtask: + continue + stage_processor.decref_subtask.add(subtask.subtask_id) + # if subtask not executed, rollback incref of predecessors + for inp_subtask in subtask_graph.predecessors(subtask): + for c in inp_subtask.chunk_graph.results: + decref_chunk_key_to_counts[c.key] += 1 + # decref result of chunk graphs + for c in stage_processor.chunk_graph.results: + decref_chunk_key_to_counts[c.key] += 1 + return decref_chunk_key_to_counts + + @mo.extensible + async def _decref_stage(self, stage_processor: "TaskStageProcessor"): + decref_chunk_key_to_counts = 
self._get_decref_stage_chunk_key_to_counts( stage_processor ) + logger.debug( + "Decref chunks when stage %s finishes: %s", + stage_processor.stage_id, + decref_chunk_key_to_counts, + ) + await self._lifecycle_api.decref_chunks( + list(decref_chunk_key_to_counts), + counts=list(decref_chunk_key_to_counts.values()), + ) + + @_decref_stage.batch + async def _decref_stage(self, args_list, kwargs_list): + decref_chunk_key_to_counts = defaultdict(lambda: 0) + for args, kwargs in zip(args_list, kwargs_list): + chunk_key_to_counts = self._get_decref_stage_chunk_key_to_counts( + *args, **kwargs + ) + for k, c in chunk_key_to_counts.items(): + decref_chunk_key_to_counts[k] += c + logger.debug("Decref chunks when stages finish: %s", decref_chunk_key_to_counts) + await self._lifecycle_api.decref_chunks( + list(decref_chunk_key_to_counts), + counts=list(decref_chunk_key_to_counts.values()), + ) + + async def _decref_input_subtasks( + self, subtask: Subtask, subtask_graph: SubtaskGraph + ): + # make sure subtasks are decreffed only once + if subtask.subtask_id not in self._subtask_decref_events: + self._subtask_decref_events[subtask.subtask_id] = asyncio.Event() + else: # pragma: no cover + await self._subtask_decref_events[subtask.subtask_id].wait() + return + + decref_chunk_key_to_counts = defaultdict(lambda: 0) + for in_subtask in subtask_graph.iter_predecessors(subtask): + for result_chunk in in_subtask.chunk_graph.results: + # for a reducer chunk, decref the mapper chunks + if isinstance(result_chunk.op, ShuffleProxy): + n_reducers = _get_n_reducers(subtask) + for inp in result_chunk.inputs: + decref_chunk_key_to_counts[inp.key] += n_reducers + decref_chunk_key_to_counts[result_chunk.key] += 1 + logger.debug( + "Decref chunks %s when subtask %s finishes", + decref_chunk_key_to_counts, + subtask.subtask_id, + ) + await self._lifecycle_api.decref_chunks( + list(decref_chunk_key_to_counts), + counts=list(decref_chunk_key_to_counts.values()), + ) + + # `set_subtask_result` will be called when a subtask finishes, + # but progress reporting calls `set_subtask_result` too, + # so there is a risk of decrementing some subtask input object references twice, + # which would drive their reference counts below zero. + # TODO(Catch-Bull): Pop asyncio.Event when current subtask `set_subtask_result` + # will never be called + self._subtask_decref_events[subtask.subtask_id].set() diff --git a/python/xorbits/_mars/services/task/execution/mars/fetcher.py b/python/xorbits/_mars/services/task/execution/mars/fetcher.py new file mode 100644 index 000000000..957ea2558 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/fetcher.py @@ -0,0 +1,56 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
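Editor's note: a brief sketch of how a fetcher such as the `MarsFetcher` below is typically driven, based on the `Fetcher` interface in execution/api.py (`create`, `append`, `get`); `get_storage_api` and `chunk_key_to_meta` are hypothetical names used only for illustration, and the calls would run inside an async function.

    # Create the backend-specific fetcher, then batch up delayed gets and fetch them together.
    fetcher = Fetcher.create("mars", get_storage_api=get_storage_api)
    for chunk_key, chunk_meta in chunk_key_to_meta.items():
        # for the mars backend the chunk meta must carry "bands" (see required_meta_keys)
        await fetcher.append(chunk_key, chunk_meta)
    data = await fetcher.get()  # results come back in append order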
+ +import operator +from collections import defaultdict, namedtuple +from typing import Dict, List + +from ..api import Fetcher, register_fetcher_cls + +_GetWithIndex = namedtuple("GetWithIndex", ["get", "index"]) + + +@register_fetcher_cls +class MarsFetcher(Fetcher): + name = "mars" + required_meta_keys = ("bands",) + + def __init__(self, get_storage_api, **kwargs): + self._get_storage_api = get_storage_api + self._storage_api_to_gets = defaultdict(list) + self._counter = 0 + + async def append(self, chunk_key: str, chunk_meta: Dict, conditions: List = None): + band = None + if chunk_meta: + bands = chunk_meta.get("bands") + if bands: + band = bands[0] + storage_api = await self._get_storage_api(band) + get = _GetWithIndex( + storage_api.get.delay(chunk_key, conditions=conditions), self._counter + ) + self._storage_api_to_gets[storage_api].append(get) + self._counter += 1 + + async def get(self): + results = [None] * self._counter + for storage_api in self._storage_api_to_gets: + gets = self._storage_api_to_gets[storage_api] + fetched_data = await storage_api.get.batch( + *map(operator.itemgetter(0), gets) + ) + for get, data in zip(gets, fetched_data): + results[get.index] = data + return results diff --git a/python/xorbits/_mars/services/task/execution/mars/resource.py b/python/xorbits/_mars/services/task/execution/mars/resource.py new file mode 100644 index 000000000..e3e85c089 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/resource.py @@ -0,0 +1,95 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Any, Dict, Type + +from .....resource import Resource + +_name_to_resource_evaluator: Dict[str, Type["ResourceEvaluator"]] = {} + + +def register_resource_evaluator(evaluator_cls: Type["ResourceEvaluator"]): + _name_to_resource_evaluator[evaluator_cls.name] = evaluator_cls + return evaluator_cls + + +def init_default_resource_for_subtask(subtask_graph: "SubtaskGraph"): # noqa: F821 + for subtask in subtask_graph.iter_nodes(): + is_gpu = any(c.op.gpu for c in subtask.chunk_graph) + subtask.required_resource = ( + Resource(num_gpus=1) if is_gpu else Resource(num_cpus=1) + ) + + +class ResourceEvaluator(ABC): + """ + Resource evaluator is used to estimate and set resources required by + subtasks. It can be an internal service or an external service. If it + is an internal service, we can set default of adjustable resources for + subtasks. If it is an external service, we should report the running + result of the task to the external service, so that it can accurately + predict the required resources of subtasks based on the historical + running information, we call it HBO. + + Best practice + ---------- + You can follow the steps below to add a new resource evaluator: + * Inherit `ResourceEvaluator` and implement `create`, `evaluate` + and `report` methods. The `create` method is to create a new + resource evaluator instance. 
The `evaluate` method is to estimate + and set required resources for the subtasks of a task stage. And + this method must be implemented. The `report` method is to report + the running information and result of the task. And this method + does not have to be implemented. + + * Add default configs of the new evaluator needed in `base_config.xml` + or its descendant files. + + * Set the `resource_evaluator` to choose a resource evaluator in + `base_config.xml` when running a mars job. + """ + + name = None + + @classmethod + @abstractmethod + async def create(cls, config: Dict[str, Any], **kwargs) -> "ResourceEvaluator": + name = config.get("resource_evaluator", "default") + evaluator_config = config.get(name, {}) + evaluator_cls = _name_to_resource_evaluator[name] + return await evaluator_cls.create(evaluator_config, **kwargs) + + @abstractmethod + async def evaluate(self, stage_processor: "TaskStageProcessor"): # noqa: F821 + """Called before executing a task stage.""" + + @abstractmethod + async def report(self): + """Called after executing a task.""" + + +@register_resource_evaluator +class DefaultEvaluator(ResourceEvaluator): + name = "default" + + @classmethod + async def create(cls, config, **kwargs) -> "ResourceEvaluator": + return cls() + + async def evaluate(self, stage_processor: "TaskStageProcessor"): # noqa: F821 + init_default_resource_for_subtask(stage_processor.subtask_graph) + + async def report(self): + pass diff --git a/python/xorbits/_mars/services/task/execution/mars/stage.py b/python/xorbits/_mars/services/task/execution/mars/stage.py new file mode 100644 index 000000000..afa2ec9f9 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/stage.py @@ -0,0 +1,351 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import itertools +import logging +import time +from collections import defaultdict +from typing import Dict, List + +from ..... 
import oscar as mo +from .....core import Chunk, ChunkGraph +from .....core.operand import Fetch, Fuse +from .....metrics import Metrics +from .....typing import BandType, TileableType +from .....utils import get_chunk_params +from ....meta import MetaAPI, WorkerMetaAPI +from ....scheduling import SchedulingAPI +from ....subtask import Subtask, SubtaskGraph, SubtaskResult, SubtaskStatus +from ....task.core import Task, TaskResult, TaskStatus +from ..api import ExecutionChunkResult + +logger = logging.getLogger(__name__) + + +class TaskStageProcessor: + def __init__( + self, + stage_id: str, + task: Task, + chunk_graph: ChunkGraph, + subtask_graph: SubtaskGraph, + bands: List[BandType], + tile_context: Dict[TileableType, TileableType], + scheduling_api: SchedulingAPI, + meta_api: MetaAPI, + ): + self.stage_id = stage_id + self.task = task + self.chunk_graph = chunk_graph + self.subtask_graph = subtask_graph + self._bands = bands + self._tile_context = tile_context + + # APIs + self._scheduling_api = scheduling_api + self._meta_api = meta_api + + # gen subtask_id to subtask + self.subtask_id_to_subtask = { + subtask.subtask_id: subtask for subtask in subtask_graph + } + self._subtask_to_bands: Dict[Subtask, BandType] = dict() + self.subtask_snapshots: Dict[Subtask, SubtaskResult] = dict() + self.subtask_results: Dict[Subtask, SubtaskResult] = dict() + self._submitted_subtask_ids = set() + + # All subtask IDs whose input chunk reference count is reduced. + self.decref_subtask = set() + + self._band_manager: Dict[BandType, mo.ActorRef] = dict() + + # result + self.result = TaskResult( + task.task_id, + task.session_id, + self.stage_id, + status=TaskStatus.pending, + start_time=time.time(), + ) + # status + self._done = asyncio.Event() + self._cancelled = asyncio.Event() + + # add metrics + self._stage_execution_time = Metrics.gauge( + "mars.stage_execution_time_secs", + "Time consuming in seconds to execute a stage", + ("session_id", "task_id", "stage_id"), + ) + + def is_cancelled(self): + return self._cancelled.is_set() + + async def _schedule_subtasks(self, subtasks: List[Subtask]): + subtasks = [ + subtask + for subtask in subtasks + if subtask.subtask_id not in self._submitted_subtask_ids + ] + if not subtasks: + return + self._submitted_subtask_ids.update(subtask.subtask_id for subtask in subtasks) + return await self._scheduling_api.add_subtasks( + subtasks, [subtask.priority for subtask in subtasks] + ) + + async def _get_stage_result(self): + chunks = [] + get_meta = [] + results_chunks = self.chunk_graph.result_chunks + for chunk in results_chunks: + if isinstance(chunk.op, Fetch): + continue + chunks.append(chunk) + if isinstance(chunk.op, Fuse): + chunk = chunk.chunk + get_meta.append( + self._meta_api.get_chunk_meta.delay( + chunk.key, + # only fetch bands from supervisor meta + fields=["bands"], + ) + ) + metas = await self._meta_api.get_chunk_meta.batch(*get_meta) + execution_chunk_results = { + chunk: ExecutionChunkResult(meta=meta, context=None) + for chunk, meta in zip(chunks, metas) + } + await self._update_result_meta(execution_chunk_results) + return execution_chunk_results + + def _schedule_done(self): + self._done.set() + + async def set_subtask_result(self, result: SubtaskResult, band: BandType = None): + assert result.status.is_done + subtask = self.subtask_id_to_subtask[result.subtask_id] + # update subtask_results in `TaskProcessorActor.set_subtask_result` + self._submitted_subtask_ids.difference_update([result.subtask_id]) + + all_done = len(self.subtask_results) == 
len(self.subtask_graph) + error_or_cancelled = result.status in ( + SubtaskStatus.errored, + SubtaskStatus.cancelled, + ) + + if all_done or error_or_cancelled: + # tell scheduling to finish subtasks + await self._scheduling_api.finish_subtasks( + [result.subtask_id], bands=[band], schedule_next=not error_or_cancelled + ) + if self.result.status != TaskStatus.terminated: + self.result = TaskResult( + self.task.task_id, + self.task.session_id, + self.stage_id, + start_time=self.result.start_time, + end_time=time.time(), + status=TaskStatus.terminated, + error=result.error, + traceback=result.traceback, + ) + if not all_done and error_or_cancelled: + if result.status == SubtaskStatus.errored: + logger.exception( + "Subtask %s errored", + subtask.subtask_id, + exc_info=( + type(result.error), + result.error, + result.traceback, + ), + ) + if result.status == SubtaskStatus.cancelled: # pragma: no cover + logger.warning( + "Subtask %s from band %s canceled.", + subtask.subtask_id, + band, + ) + logger.info( + "Start to cancel stage %s of task %s.", self.stage_id, self.task + ) + # if error or cancel, cancel all submitted subtasks + await self._scheduling_api.cancel_subtasks( + list(self._submitted_subtask_ids) + ) + self._schedule_done() + cost_time_secs = self.result.end_time - self.result.start_time + logger.info( + "Time consuming to execute a stage is %ss with " + "session id %s, task id %s, stage id %s", + cost_time_secs, + self.result.session_id, + self.result.task_id, + self.result.stage_id, + ) + self._stage_execution_time.record( + cost_time_secs, + { + "session_id": self.result.session_id, + "task_id": self.result.task_id, + "stage_id": self.result.stage_id, + }, + ) + else: + # not terminated, push success subtasks to queue if they are ready + to_schedule_subtasks = [] + for succ_subtask in self.subtask_graph.successors(subtask): + if succ_subtask in self.subtask_results: # pragma: no cover + continue + pred_subtasks = self.subtask_graph.predecessors(succ_subtask) + if all( + pred_subtask in self.subtask_results + for pred_subtask in pred_subtasks + ): + # all predecessors finished + to_schedule_subtasks.append(succ_subtask) + await self._schedule_subtasks(to_schedule_subtasks) + await self._scheduling_api.finish_subtasks( + [result.subtask_id], bands=[band] + ) + + async def run(self): + try: + if self.subtask_graph.num_shuffles() > 0: + # disable scale-in when shuffle is executing so that we can skip + # store shuffle meta in supervisor. 
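Editor's aside: when a subtask finishes without error, `set_subtask_result` above only submits those successors whose predecessors have all produced results. A tiny sketch of that readiness check on a hypothetical three-node graph (not part of the patch):

```python
from collections import defaultdict

# a -> c, b -> c: "c" may only be scheduled after both "a" and "b" finish
successors = {"a": ["c"], "b": ["c"], "c": []}
predecessors = defaultdict(list)
for node, succs in successors.items():
    for succ in succs:
        predecessors[succ].append(node)

finished = set()


def on_subtask_finished(node):
    finished.add(node)
    # schedule a successor only when every one of its predecessors has finished
    return [
        succ
        for succ in successors[node]
        if all(pred in finished for pred in predecessors[succ])
    ]


assert on_subtask_finished("a") == []     # "c" still waits for "b"
assert on_subtask_finished("b") == ["c"]  # now all inputs of "c" are ready
```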
+ await self._scheduling_api.disable_autoscale_in() + return await self._run() + finally: + if self.subtask_graph.num_shuffles() > 0: + await self._scheduling_api.try_enable_autoscale_in() + + async def _run(self): + if len(self.subtask_graph) == 0: + # no subtask to schedule, set status to done + self._schedule_done() + self.result.status = TaskStatus.terminated + return {} + + # schedule independent subtasks + indep_subtasks = list(self.subtask_graph.iter_indep()) + await self._schedule_subtasks(indep_subtasks) + + # wait for completion + await self._done.wait() + if self.error_or_cancelled(): + if self.result.error is not None: + raise self.result.error.with_traceback(self.result.traceback) + else: + raise asyncio.CancelledError() + return await self._get_stage_result() + + async def cancel(self): + logger.info("Start to cancel stage %s of task %s.", self.stage_id, self.task) + if self._done.is_set(): # pragma: no cover + # already finished, ignore cancel + return + self._cancelled.set() + # cancel running subtasks + await self._scheduling_api.cancel_subtasks(list(self._submitted_subtask_ids)) + self._done.set() + + def error_or_cancelled(self) -> bool: + if self.result.error is not None: + return True + if self.is_cancelled(): + return True + return False + + async def _update_result_meta( + self, chunk_to_result: Dict[Chunk, ExecutionChunkResult] + ): + session_id = self.task.session_id + tile_context = self._tile_context + + update_meta_chunks = chunk_to_result.keys() - set( + itertools.chain.from_iterable( + (c.data for c in tiled_tileable.chunks) + for tiled_tileable in tile_context.values() + ) + ) + + worker_meta_api_to_chunk_delays = defaultdict(dict) + for c in update_meta_chunks: + address = chunk_to_result[c].meta["bands"][0][0] + meta_api = await WorkerMetaAPI.create(session_id, address) + call = meta_api.get_chunk_meta.delay( + c.key, fields=list(get_chunk_params(c).keys()) + ) + worker_meta_api_to_chunk_delays[meta_api][c] = call + for tileable in tile_context.values(): + chunks = [c.data for c in tileable.chunks] + for c, params_fields in zip(chunks, self._get_params_fields(tileable)): + address = chunk_to_result[c].meta["bands"][0][0] + meta_api = await WorkerMetaAPI.create(session_id, address) + call = meta_api.get_chunk_meta.delay(c.key, fields=params_fields) + worker_meta_api_to_chunk_delays[meta_api][c] = call + coros = [] + for worker_meta_api, chunk_delays in worker_meta_api_to_chunk_delays.items(): + coros.append(worker_meta_api.get_chunk_meta.batch(*chunk_delays.values())) + worker_metas = await asyncio.gather(*coros) + for chunk_delays, metas in zip( + worker_meta_api_to_chunk_delays.values(), worker_metas + ): + for c, meta in zip(chunk_delays, metas): + chunk_to_result[c].meta = meta + + @classmethod + def _get_params_fields(cls, tileable: TileableType): + from .....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE + from .....tensor.core import TENSOR_TYPE + + params_fields = [] + fields = get_chunk_params(tileable.chunks[0]) + if isinstance(tileable, DATAFRAME_TYPE): + for c in tileable.chunks: + cur_fields = set(fields) + if c.index[1] > 0: + # skip fetch index_value for i >= 1 on column axis + cur_fields.remove("index_value") + if c.index[0] > 0: + # skip fetch dtypes_value for i >= 1 on index axis + cur_fields.remove("dtypes_value") + if c.index[0] > 0 and c.index[1] > 0: + # fetch shape only for i == 0 on index or column axis + cur_fields.remove("shape") + params_fields.append(list(cur_fields)) + elif isinstance(tileable, SERIES_TYPE): + for c in 
tileable.chunks: + cur_fields = set(fields) + if c.index[0] > 0: + # skip fetch name and dtype for i >= 1 + cur_fields.remove("name") + cur_fields.remove("dtype") + params_fields.append(list(cur_fields)) + elif isinstance(tileable, TENSOR_TYPE): + for i, c in enumerate(tileable.chunks): + cur_fields = set(fields) + if c.ndim > 1 and all(j > 0 for j in c.index): + cur_fields.remove("shape") + if i > 0: + cur_fields.remove("dtype") + cur_fields.remove("order") + params_fields.append(list(cur_fields)) + else: + for _ in tileable.chunks: + params_fields.append(list(fields)) + return params_fields diff --git a/python/xorbits/_mars/services/task/execution/mars/tests/__init__.py b/python/xorbits/_mars/services/task/execution/mars/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/task/execution/mars/tests/test_resource.py b/python/xorbits/_mars/services/task/execution/mars/tests/test_resource.py new file mode 100644 index 000000000..9c893ffaf --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/tests/test_resource.py @@ -0,0 +1,100 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict + +import numpy as np +import pytest + +from ...... import dataframe as md +from ...... import tensor as mt +from ......config import Config +from ......core import ChunkGraphBuilder, Tileable, TileableGraph +from ......resource import Resource +from .... 
import Task +from ....analyzer import GraphAnalyzer +from ..resource import DefaultEvaluator, ResourceEvaluator, register_resource_evaluator +from ..stage import TaskStageProcessor + + +@register_resource_evaluator +class MockedEvaluator(ResourceEvaluator): + name = "mock" + + def __init__(self, config, **kwargs): + self._config = config + + @classmethod + async def create(cls, config: Dict[str, Any], **kwargs) -> "ResourceEvaluator": + return cls(config, **kwargs) + + async def evaluate(self, stage_processor: "TaskStageProcessor"): + pass + + async def report(self): + pass + + +def _build_chunk_graph(tileable_graph: TileableGraph): + return next(ChunkGraphBuilder(tileable_graph).build()) + + +async def _gen_stage_processor(t): + tileable_graph = t.build_graph(tile=False) + chunk_graph = _build_chunk_graph(tileable_graph) + bands = [(f"address_{i}", "numa-0") for i in range(4)] + band_resource = dict((band, Resource(num_cpus=1)) for band in bands) + task = Task("mock_task", "mock_session", tileable_graph) + analyzer = GraphAnalyzer(chunk_graph, band_resource, task, Config(), dict()) + subtask_graph = analyzer.gen_subtask_graph() + stage_processor = TaskStageProcessor( + "stage_id", task, chunk_graph, subtask_graph, bands, None, None, None + ) + return stage_processor + + +async def _test_default_evaluator(config: Dict[str, Any], t: Tileable): + resource_evaluator = await ResourceEvaluator.create(config) + assert resource_evaluator is not None + assert isinstance(resource_evaluator, DefaultEvaluator) + stage_processor = await _gen_stage_processor(t) + await resource_evaluator.evaluate(stage_processor) + for subtask in stage_processor.subtask_graph.iter_nodes(): + is_gpu = any(c.op.gpu for c in subtask.chunk_graph) + assert ( + subtask.required_resource == Resource(num_gpus=1) + if is_gpu + else Resource(num_cpus=1) + ) + assert await resource_evaluator.report() is None + + +@pytest.mark.asyncio +async def test_resource_evaluator(): + # test mocked resource evaluator + resource_evaluator = await ResourceEvaluator.create({"resource_evaluator": "mock"}) + assert resource_evaluator is not None + assert isinstance(resource_evaluator, MockedEvaluator) + + # test default resource evaluator + t = mt.ones((10, 10), chunk_size=5) + 1 + await _test_default_evaluator({}, t) + await _test_default_evaluator({"resource_evaluator": "default"}, t) + df = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), + columns=list("ABCD"), + chunk_size=1000, + ) + df = df[df["A"] > 50] + await _test_default_evaluator({}, df) diff --git a/python/xorbits/_mars/services/task/execution/ray/__init__.py b/python/xorbits/_mars/services/task/execution/ray/__init__.py new file mode 100644 index 000000000..84e0ac757 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
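Editor's aside: the `MockedEvaluator` in the test above works because `register_resource_evaluator` records each class under its `name`, and `ResourceEvaluator.create` dispatches on the `resource_evaluator` config key. A simplified, self-contained sketch of that dispatch (it skips the async factory and per-evaluator config sub-section of the real code):

```python
_registry = {}


def register(cls):
    _registry[cls.name] = cls
    return cls


@register
class DefaultEvaluator:
    name = "default"


@register
class HboEvaluator:  # hypothetical history-based (HBO) evaluator
    name = "hbo"


def create(config):
    name = config.get("resource_evaluator", "default")
    return _registry[name]()


assert isinstance(create({}), DefaultEvaluator)
assert isinstance(create({"resource_evaluator": "hbo"}), HboEvaluator)
```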
+ +from .config import RayExecutionConfig +from .executor import RayTaskExecutor +from .fetcher import RayFetcher diff --git a/python/xorbits/_mars/services/task/execution/ray/config.py b/python/xorbits/_mars/services/task/execution/ray/config.py new file mode 100644 index 000000000..ab1c374a3 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/config.py @@ -0,0 +1,99 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from typing import Dict, List, Union + +from .....core.operand import ShuffleFetchType +from .....resource import Resource +from ..api import ExecutionConfig, register_config_cls +from ..utils import get_band_resources_from_config + +logger = logging.getLogger(__name__) + +IN_RAY_CI = os.environ.get("MARS_CI_BACKEND", "mars") == "ray" +# The default interval seconds to update progress and collect garbage. +DEFAULT_MONITOR_INTERVAL_SECONDS = 0 if IN_RAY_CI else 1 +DEFAULT_LOG_INTERVAL_SECONDS = 60 +DEFAULT_CHECK_SLOW_SUBTASKS_INTERVAL_SECONDS = 120 + + +@register_config_cls +class RayExecutionConfig(ExecutionConfig): + name = "ray" + + def __init__(self, execution_config: Dict): + super().__init__(execution_config) + self._ray_execution_config = execution_config[self.backend] + + def get_band_resources(self): + """ + Get the band resources from config for generating ray virtual + resources. + """ + return get_band_resources_from_config(self._ray_execution_config) + + def get_deploy_band_resources(self) -> List[Dict[str, Resource]]: + return [] + + def get_subtask_max_retries(self): + return self._ray_execution_config["subtask_max_retries"] + + def get_subtask_num_cpus(self) -> Union[int, float]: + return self._ray_execution_config.get("subtask_num_cpus", 1) + + def get_subtask_memory(self) -> Union[int, float]: + return self._ray_execution_config.get("subtask_memory", None) + + def get_n_cpu(self): + return self._ray_execution_config["n_cpu"] + + def get_n_worker(self): + return self._ray_execution_config["n_worker"] + + def get_monitor_interval_seconds(self): + """ + The interval seconds for the monitor task to update progress and + collect garbage. 
+ """ + return self._ray_execution_config.get( + "monitor_interval_seconds", DEFAULT_MONITOR_INTERVAL_SECONDS + ) + + def get_log_interval_seconds(self): + return self._ray_execution_config.get( + "log_interval_seconds", DEFAULT_LOG_INTERVAL_SECONDS + ) + + def get_check_slow_subtasks_interval_seconds(self) -> float: + return self._ray_execution_config.get( + "check_slow_subtasks_interval_seconds", + DEFAULT_CHECK_SLOW_SUBTASKS_INTERVAL_SECONDS, + ) + + def get_check_slow_subtask_iqr_ratio(self) -> float: + # https://en.wikipedia.org/wiki/Box_plot + # iqr = q3 - q1 + # duration_threshold = q3 + check_slow_subtasks_iqr_ratio * (q3 - q1) + # So, the value == 3, extremely slow(probably hang); value == 1.5, slow + return self._ray_execution_config.get("check_slow_subtasks_iqr_ratio", 3) + + def get_shuffle_fetch_type(self) -> ShuffleFetchType: + return ShuffleFetchType.FETCH_BY_INDEX + + def get_gc_method(self): + method = self._ray_execution_config.get("gc_method", "submitted") + assert method in ["submitted", "completed"] + return method diff --git a/python/xorbits/_mars/services/task/execution/ray/context.py b/python/xorbits/_mars/services/task/execution/ray/context.py new file mode 100644 index 000000000..5b6f2f4b3 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/context.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from dataclasses import asdict +from typing import Callable, Dict, List + +from .....core.context import Context +from .....storage.base import StorageLevel +from .....typing import ChunkType +from .....utils import implements, lazy_import, sync_to_async +from ....context import ThreadedServiceContext +from .config import RayExecutionConfig + +ray = lazy_import("ray") +logger = logging.getLogger(__name__) + + +class RayRemoteObjectManager: + """The remote object manager in task state actor.""" + + def __init__(self): + self._named_remote_objects = {} + + def create_remote_object(self, name: str, object_cls, *args, **kwargs): + remote_object = object_cls(*args, **kwargs) + self._named_remote_objects[name] = remote_object + + def destroy_remote_object(self, name: str): + self._named_remote_objects.pop(name, None) + + async def call_remote_object(self, name: str, attr: str, *args, **kwargs): + remote_object = self._named_remote_objects[name] + meth = getattr(remote_object, attr) + async_meth = sync_to_async(meth) + return await async_meth(*args, **kwargs) + + +class _RayRemoteObjectWrapper: + def __init__(self, task_state_actor: "ray.actor.ActorHandle", name: str): + self._task_state_actor = task_state_actor + self._name = name + + def __getattr__(self, attr): + def wrap(*args, **kwargs): + r = self._task_state_actor.call_remote_object.remote( + self._name, attr, *args, **kwargs + ) + return ray.get(r) + + return wrap + + +class _RayRemoteObjectContext: + def __init__( + self, + get_or_create_actor: Callable[[], "ray.actor.ActorHandle"], + *args, + **kwargs + ): + super().__init__(*args, **kwargs) + self._get_or_create_actor = get_or_create_actor + self._task_state_actor = None + + def _get_task_state_actor(self) -> "ray.actor.ActorHandle": + # Get the RayTaskState actor, this is more clear and faster than wraps + # the `get_or_create_actor` by lru_cache in __init__ because this method + # is called as needed. + if self._task_state_actor is None: + self._task_state_actor = self._get_or_create_actor() + return self._task_state_actor + + @implements(Context.create_remote_object) + def create_remote_object(self, name: str, object_cls, *args, **kwargs): + task_state_actor = self._get_task_state_actor() + r = task_state_actor.create_remote_object.remote( + name, object_cls, *args, **kwargs + ) + # Make sure the actor is created. The remote object may not be created + # when get_remote_object from worker because the callers of + # create_remote_object and get_remote_object are not in the same worker. + # Use sync Ray actor requires this `ray.get`, too. + ray.get(r) + return _RayRemoteObjectWrapper(task_state_actor, name) + + @implements(Context.get_remote_object) + def get_remote_object(self, name: str): + task_state_actor = self._get_task_state_actor() + return _RayRemoteObjectWrapper(task_state_actor, name) + + @implements(Context.destroy_remote_object) + def destroy_remote_object(self, name: str): + task_state_actor = self._get_task_state_actor() + task_state_actor.destroy_remote_object.remote(name) + + +# TODO(fyrestone): Implement more APIs for Ray. 
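Editor's aside: `_RayRemoteObjectWrapper` above turns every attribute access into a call that is forwarded to the task state actor. The sketch below reproduces that `__getattr__` forwarding pattern without Ray, with a stub backend standing in for `call_remote_object.remote(...)` plus `ray.get`; all names are hypothetical.

```python
class StubBackend:
    def call_remote_object(self, name, attr, *args, **kwargs):
        # stands in for task_state_actor.call_remote_object.remote(...) + ray.get(...)
        return f"{name}.{attr}{args}"


class RemoteObjectProxy:
    def __init__(self, backend, name):
        self._backend, self._name = backend, name

    def __getattr__(self, attr):
        def wrap(*args, **kwargs):
            return self._backend.call_remote_object(self._name, attr, *args, **kwargs)

        return wrap


counter = RemoteObjectProxy(StubBackend(), "my_counter")
assert counter.increment(3) == "my_counter.increment(3,)"
```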
+class RayExecutionContext(_RayRemoteObjectContext, ThreadedServiceContext): + """The context for tiling.""" + + def __init__( + self, + config: RayExecutionConfig, + task_context: Dict, + task_chunks_meta: Dict, + worker_addresses: List[str], + *args, + **kwargs + ): + super().__init__(*args, **kwargs) + self._config = config + self._task_context = task_context + self._task_chunks_meta = task_chunks_meta + self._worker_addresses = worker_addresses + + @implements(Context.get_chunks_result) + def get_chunks_result(self, data_keys: List[str], fetch_only: bool = False) -> List: + logger.info("Getting %s chunks result.", len(data_keys)) + object_refs = [self._task_context[key] for key in data_keys] + result = ray.get(object_refs) + logger.info("Got %s chunks result.", len(result)) + return result if not fetch_only else None + + @implements(Context.get_chunks_meta) + def get_chunks_meta( + self, data_keys: List[str], fields: List[str] = None, error="raise" + ) -> List[Dict]: + if not self._task_chunks_meta: + result = self._call( + self._get_chunks_meta_from_service( + data_keys, fields=fields, error=error + ) + ) + else: + result = [{}] * len(data_keys) + missing_key_indexes = [] + missing_keys = [] + for idx, key in enumerate(data_keys): + try: + chunk_meta = self._task_chunks_meta[key] + except KeyError: + missing_key_indexes.append(idx) + missing_keys.append(key) + else: + meta = asdict(chunk_meta) + meta = {f: meta.get(f) for f in fields} + result[idx] = meta + if missing_keys: + missing_meta = self._call( + self._get_chunks_meta_from_service( + missing_keys, fields=fields, error=error + ) + ) + for idx, meta in zip(missing_key_indexes, missing_meta): + result[idx] = meta + return result + + async def _get_chunks_meta_from_service( + self, data_keys: List[str], fields: List[str] = None, error="raise" + ) -> List[Dict]: + get_metas = [ + self._meta_api.get_chunk_meta.delay(data_key, fields=fields, error=error) + for data_key in data_keys + ] + return await self._meta_api.get_chunk_meta.batch(*get_metas) + + @implements(Context.get_total_n_cpu) + def get_total_n_cpu(self) -> int: + # TODO(fyrestone): Support auto scaling. + return self._config.get_n_cpu() * self._config.get_n_worker() + + @implements(Context.get_worker_addresses) + def get_worker_addresses(self) -> List[str]: + # Returns virtual worker addresses. + return self._worker_addresses + + +# TODO(fyrestone): Implement more APIs for Ray. 
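Editor's aside: `get_chunks_meta` above answers from the local `task_chunks_meta` cache first and only asks the meta service for the keys it misses, splicing the fetched entries back by index so the result order matches the request. A minimal sketch of that lookup strategy with a dummy fetch function (not part of the patch):

```python
def lookup(keys, cache, fetch_missing):
    result = [None] * len(keys)
    missing_indexes, missing_keys = [], []
    for idx, key in enumerate(keys):
        if key in cache:
            result[idx] = cache[key]
        else:
            missing_indexes.append(idx)
            missing_keys.append(key)
    # one batched round trip for everything the cache could not answer
    for idx, value in zip(missing_indexes, fetch_missing(missing_keys)):
        result[idx] = value
    return result


cache = {"a": 1, "c": 3}
fetched = lookup(["a", "b", "c"], cache, lambda ks: [f"svc:{k}" for k in ks])
assert fetched == [1, "svc:b", 3]
```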
+class RayExecutionWorkerContext(_RayRemoteObjectContext, dict): + """The context for executing operands.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._current_chunk = None + + @classmethod + @implements(Context.new_custom_log_dir) + def new_custom_log_dir(cls): + logger.info( + "%s does not support register_custom_log_path / new_custom_log_dir", + cls.__name__, + ) + return None + + @staticmethod + @implements(Context.register_custom_log_path) + def register_custom_log_path( + session_id: str, + tileable_op_key: str, + chunk_op_key: str, + worker_address: str, + log_path: str, + ): + raise NotImplementedError + + @classmethod + @implements(Context.set_progress) + def set_progress(cls, progress: float): + logger.info( + "%s does not support set_running_operand_key / set_progress", cls.__name__ + ) + + @staticmethod + @implements(Context.set_running_operand_key) + def set_running_operand_key(session_id: str, op_key: str): + raise NotImplementedError + + @classmethod + @implements(Context.get_storage_info) + def get_storage_info( + cls, address: str = None, level: StorageLevel = StorageLevel.MEMORY + ): + logger.info("%s does not support get_storage_info", cls.__name__) + return {} + + def set_current_chunk(self, chunk: ChunkType): + """Set current executing chunk.""" + self._current_chunk = chunk + + def get_current_chunk(self) -> ChunkType: + """Set current executing chunk.""" + return self._current_chunk diff --git a/python/xorbits/_mars/services/task/execution/ray/executor.py b/python/xorbits/_mars/services/task/execution/ray/executor.py new file mode 100644 index 000000000..3be339a57 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/executor.py @@ -0,0 +1,1086 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
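Editor's aside: `RayExecutionWorkerContext` above is essentially a dict keyed by chunk key (or by `(mapper_index, reducer_index)` for shuffle blocks) plus a slot for the chunk currently being executed. A hypothetical usage sketch of that shape, outside of Ray:

```python
class WorkerContext(dict):
    """Chunk-key -> data mapping plus the chunk currently being executed."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._current_chunk = None

    def set_current_chunk(self, chunk):
        self._current_chunk = chunk

    def get_current_chunk(self):
        return self._current_chunk


ctx = WorkerContext({"input-chunk": [1, 2, 3]})
ctx.set_current_chunk("sum-chunk")
ctx["sum-chunk"] = sum(ctx["input-chunk"])  # an operand writes its output back
assert ctx.get_current_chunk() == "sum-chunk" and ctx["sum-chunk"] == 6
```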
+ +import asyncio +import collections +import enum +import functools +import itertools +import logging +import operator +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List + +import numpy as np + +from .....core import Chunk, ChunkGraph, TileContext +from .....core.context import set_context +from .....core.operand import Fetch, Fuse, VirtualOperand, execute +from .....core.operand.fetch import FetchShuffle +from .....lib.aio import alru_cache +from .....metrics.api import Metrics, init_metrics +from .....resource import Resource +from .....serialization import deserialize, serialize +from .....typing import BandType +from .....utils import ( + aiotask_wrapper, + calc_data_size, + classproperty, + get_chunk_params, + lazy_import, +) +from ....lifecycle.api import LifecycleAPI +from ....meta.api import MetaAPI +from ....subtask import Subtask, SubtaskGraph +from ....subtask.utils import iter_output_data +from ...core import Task +from ..api import ExecutionChunkResult, TaskExecutor, register_executor_cls +from ..utils import ResultTileablesLifecycle +from .config import IN_RAY_CI, RayExecutionConfig +from .context import ( + RayExecutionContext, + RayExecutionWorkerContext, + RayRemoteObjectManager, +) +from .shuffle import ShuffleManager + +ray = lazy_import("ray") +logger = logging.getLogger(__name__) + + +class RayMetrics: + _submitted_subtask_number = None + _started_subtask_number = None + _completed_subtask_number = None + + @classproperty + def submitted_subtask_number(self): + if RayMetrics._submitted_subtask_number is None: + RayMetrics._submitted_subtask_number = Metrics.counter( + "mars.ray_dag.submitted_subtask_number", + "The number of submitted subtask.", + ("session_id", "task_id", "stage_id"), + ) + return RayMetrics._submitted_subtask_number + + @classproperty + def started_subtask_number(self): + if RayMetrics._started_subtask_number is None: + RayMetrics._started_subtask_number = Metrics.counter( + "mars.ray_dag.started_subtask_number", + "The number of started subtask.", + ) + return RayMetrics._started_subtask_number + + @classproperty + def completed_subtask_number(self): + if RayMetrics._completed_subtask_number is None: + RayMetrics._completed_subtask_number = Metrics.counter( + "mars.ray_dag.completed_subtask_number", + "The number of completed subtask.", + ) + return RayMetrics._completed_subtask_number + + +class RayTaskState(RayRemoteObjectManager): + handle = None + + @classmethod + def get_handle(cls): + """Get the RayTaskState actor handle.""" + logger.info("Getting RayTaskState handle.") + return ray.get_actor(cls.__name__) + + @classmethod + def create(cls): + """Create a RayTaskState actor.""" + logger.info("Creating RayTaskState actor.") + name = cls.__name__ + try: + cls.handle = ray.get_actor(name) + except ValueError: + # Attempt to create it (may race with other attempts). + try: + cls.handle = ray.remote(cls).options(name=name).remote() + except ValueError: # pragma: no cover + # We lost the creation race, ignore. 
+ cls.handle = ray.get_actor(name) + return cls.handle + + +_optimize_physical = None + + +def _optimize_subtask_graph(subtask_graph): + global _optimize_physical + + if _optimize_physical is None: + from .....optimization.physical import optimize as _optimize_physical + return _optimize_physical(subtask_graph) + + +class _SubtaskGC: + """GC the inputs of subtask chunk.""" + + def __init__( + self, + subtask_chunk_graph: ChunkGraph, + context: RayExecutionWorkerContext, + ): + self._subtask_chunk_graph = subtask_chunk_graph + self._context = context + ref_counts = collections.defaultdict(lambda: 0) + # Set 1 for result chunks. + for result_chunk in subtask_chunk_graph.result_chunks: + ref_counts[result_chunk.key] += 1 + # Iter graph to set ref counts. + for chunk in subtask_chunk_graph: + ref_counts[chunk.key] += subtask_chunk_graph.count_successors(chunk) + self._chunk_key_ref_counts = ref_counts + + def gc_inputs(self, chunk: Chunk): + ref_counts = self._chunk_key_ref_counts + for inp in self._subtask_chunk_graph.iter_predecessors(chunk): + ref_counts[inp.key] -= 1 + if ref_counts[inp.key] == 0: + self._context.pop(inp.key, None) + + +def execute_subtask( + subtask_id: str, + subtask_chunk_graph: ChunkGraph, + output_meta_n_keys: int, + is_mapper, + *inputs, +): + """ + The function used for execute subtask in ray task. + + Parameters + ---------- + subtask_id: str + id of subtask + subtask_chunk_graph: ChunkGraph + chunk graph for subtask + output_meta_n_keys: int + will be 0 if subtask is a shuffle mapper. + is_mapper: bool + Whether current subtask is a shuffle mapper. Note that shuffle reducers such as `DataFrameDropDuplicates` + can be a mapper at the same time. + inputs: + inputs for current subtask + + Returns + ------- + subtask outputs and meta for outputs if `output_meta_keys` is provided. + """ + init_metrics("ray") + RayMetrics.started_subtask_number.record(1) + ray_task_id = ray.get_runtime_context().task_id + subtask_chunk_graph = deserialize(*subtask_chunk_graph) + logger.info("Start subtask: %s, ray task id: %s.", subtask_id, ray_task_id) + # Optimize chunk graph. + subtask_chunk_graph = _optimize_subtask_graph(subtask_chunk_graph) + fetch_chunks, shuffle_fetch_chunk = _get_fetch_chunks(subtask_chunk_graph) + context = RayExecutionWorkerContext(RayTaskState.get_handle) + if shuffle_fetch_chunk is not None: + # The subtask is a reducer subtask. + n_mappers = shuffle_fetch_chunk.op.n_mappers + # Some reducer may have multiple output chunks, see `PSRSshuffle._execute_reduce` and + # https://user-images.githubusercontent.com/12445254/168569524-f09e42a7-653a-4102-bdf0-cc1631b3168d.png + reducer_chunks = subtask_chunk_graph.successors(shuffle_fetch_chunk) + reducer_operands = set(c.op for c in reducer_chunks) + if len(reducer_operands) != 1: # pragma: no cover + raise ValueError( + f"Subtask {subtask_id} has more than 1 reduce operands: {subtask_chunk_graph.to_dot()}" + ) + reducer_operand = reducer_chunks[0].op + reducer_index = reducer_operand.reducer_index + # Virtual shuffle keys, keep this in sync with `MapReducerOperand#_iter_mapper_key_idx_pairs` + context.update( + {(i, reducer_index): block for i, block in enumerate(inputs[-n_mappers:])} + ) + inputs = inputs[:-n_mappers] + shuffle_input_key_count = len(context) + # Create a subtask GC object. + subtask_gc = _SubtaskGC(subtask_chunk_graph, context) + # Update non shuffle inputs to context. 
+ context.update(zip((start_chunk.key for start_chunk in fetch_chunks), inputs)) + + for chunk in subtask_chunk_graph.topological_iter(): + if chunk.key not in context: + try: + context.set_current_chunk(chunk) + execute(context, chunk.op) + except Exception: + logger.exception( + "Execute operand %s of graph %s failed.", + chunk.op, + subtask_chunk_graph.to_dot(), + ) + raise + subtask_gc.gc_inputs(chunk) + + # For non-mapper subtask, output context is chunk key to results. + # For mapper subtasks, output context is data key to results. + # `iter_output_data` must ensure values order since we only return values. + normal_output = {} + mapper_output = {} + for key, data, is_mapper_block in iter_output_data(subtask_chunk_graph, context): + if is_mapper_block: + mapper_output[key] = data + else: + normal_output[key] = data + + # The inputs are referenced by the Ray worker in _raylet.pyx, GC them in Mars is useless. + # So, subtask GC has skipped GC shuffle input keys in order to simplify the implementation. + expect_context_count = ( + len(normal_output) + len(mapper_output) + shuffle_input_key_count + ) + assert ( + len(context) == expect_context_count + ), f"The remaining context count mismatch: {len(context)}(actual) != {expect_context_count}(expected)." + + output_values = [] + # assert output keys order consistent + if is_mapper: + # mapper may produce outputs which isn't shuffle blocks, such as TensorUnique._execute_agg_reduce. + mapper_main_keys = set(k[0] for k in mapper_output.keys()) + assert len(mapper_main_keys) == 1, mapper_main_keys + # sorted reducer_index's consistency with reducer_ordinal is checked in + # `OperandTilesHandler._check_shuffle_reduce_chunks`. + # So sort keys by reducer_index to ensure mapper outputs consist with reducer_ordinal, + # then downstream can fetch shuffle blocks by reducer_ordinal. + mapper_output = dict(sorted(mapper_output.items(), key=lambda item: item[0][1])) + if output_meta_n_keys: + output_meta = {} + # for non-shuffle subtask, record meta in supervisor. + for chunk in subtask_chunk_graph.result_chunks[:output_meta_n_keys]: + chunk_key = chunk.key + if chunk_key not in output_meta: + if isinstance(chunk.op, Fuse): # pragma: no cover + # fuse op + chunk = chunk.chunk + data = context[chunk_key] + memory_size = calc_data_size(data) + output_meta[chunk_key] = get_chunk_params(chunk), memory_size + output_values.append(output_meta) + output_values.extend(normal_output.values()) + output_values.extend(mapper_output.values()) + logger.info("Complete subtask: %s, ray task id: %s.", subtask_id, ray_task_id) + RayMetrics.completed_subtask_number.record(1) + return output_values[0] if len(output_values) == 1 else output_values + + +def _get_fetch_chunks(chunk_graph): + fetch_chunks = [] + shuffle_fetch_chunk = None + for start_chunk in chunk_graph.iter_indep(): + if isinstance(start_chunk.op, FetchShuffle): + assert shuffle_fetch_chunk is None, shuffle_fetch_chunk + shuffle_fetch_chunk = start_chunk + elif isinstance(start_chunk.op, Fetch): + fetch_chunks.append(start_chunk) + return sorted(fetch_chunks, key=operator.attrgetter("key")), shuffle_fetch_chunk + + +def _get_subtask_out_info( + subtask_chunk_graph: ChunkGraph, is_mapper: bool, n_reducers: int = None +): + # output_keys might be duplicate in chunk graph, use dict to deduplicate. + # output_keys order should be consistent with remote `execute_subtask`, + # dict can preserve insert order. 
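Editor's aside: the `_SubtaskGC` helper used in the execution loop above drops an intermediate input from the worker context as soon as its last consumer inside the subtask chunk graph has executed, with result chunks keeping one extra reference. A standalone sketch of the same reference-counting idea on a made-up two-input graph:

```python
from collections import defaultdict

# a -> c, b -> c; "c" is the only result chunk of the subtask
consumers = {"a": ["c"], "b": ["c"], "c": []}
result_keys = {"c"}
data = {"a": "A", "b": "B"}  # materialized inputs

ref_counts = defaultdict(int)
for key in result_keys:
    ref_counts[key] += 1  # result chunks keep an extra reference
for key, succs in consumers.items():
    ref_counts[key] += len(succs)


def gc_inputs(executed_key):
    # decref every producer feeding the chunk we just executed; free at zero
    for key, succs in consumers.items():
        if executed_key in succs:
            ref_counts[key] -= 1
            if ref_counts[key] == 0:
                data.pop(key, None)


data["c"] = data["a"] + data["b"]  # "execute" chunk c
gc_inputs("c")
assert "a" not in data and "b" not in data and data["c"] == "AB"
```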
+ output_keys = {} + shuffle_chunk = None + if is_mapper: + assert n_reducers is not None + if len(subtask_chunk_graph.result_chunks) == 1: + return set(), n_reducers + for chunk in subtask_chunk_graph.result_chunks: + if not chunk.is_mapper: + output_keys[chunk.key] = 1 + # mapper may produce outputs which isn't shuffle blocks, such as TensorUnique._execute_agg_reduce + # which is mapper too, but some outputs are not mapper blocks: + # https://user-images.githubusercontent.com/12445254/184132642-a19259fd-43d6-4a27-a033-4aaa97d7586e.svg + else: + assert shuffle_chunk is None, (shuffle_chunk, chunk) + shuffle_chunk = chunk + return output_keys.keys(), len(output_keys) + n_reducers + for chunk in subtask_chunk_graph.result_chunks: + if isinstance( + chunk.op, VirtualOperand + ): # FIXME(chaokunyang) no need to check this? + continue + else: + output_keys[chunk.key] = 1 + return output_keys.keys(), len(output_keys) + + +class OrderedSet: + def __init__(self): + self._d = set() + self._l = list() + + def add(self, item): + self._d.add(item) + self._l.append(item) + assert len(self._d) == len(self._l) + + def update(self, items): + tmp = list(items) if isinstance(items, collections.Iterator) else items + self._l.extend(tmp) + self._d.update(tmp) + assert len(self._d) == len(self._l) + + def __contains__(self, item): + return item in self._d + + def __getitem__(self, item): + return self._l[item] + + def __len__(self): + return len(self._d) + + +class _RayExecutionStage(enum.Enum): + INIT = 0 + SUBMITTING = 1 + WAITING = 2 + + +@dataclass +class _RayChunkMeta: + memory_size: int + + +@dataclass +class _RayMonitorContext: + stage: _RayExecutionStage = _RayExecutionStage.INIT + submitted_subtasks: OrderedSet = field(default_factory=OrderedSet) + completed_subtasks: OrderedSet = field(default_factory=OrderedSet) + # The shuffle manager for monitor task to GC the object refs of shuffles. + shuffle_manager: ShuffleManager = None + # The first output object ref of a Subtask to the Subtask. + object_ref_to_subtask: Dict["ray.ObjectRef", Subtask] = field(default_factory=dict) + # Stage chunk keys may be duplicate. + # TODO(fyrestone): Remove this if Mars chunk keys are unique. 
+ chunk_key_ref_count: Dict[str, int] = field( + default_factory=lambda: collections.defaultdict(int) + ) + + +@dataclass +class _RaySubtaskRuntime: + start_time: float = 0.0 + + +class _RaySlowSubtaskChecker: + @dataclass + class _CheckInfo: + count: int + duration_threshold: float + + def __init__( + self, + total_subtask_count: int, + submitted_subtasks: OrderedSet, + completed_subtasks: OrderedSet, + interquartile_range_ratio: float = 3, + ): + self._total_subtask_count = total_subtask_count + self._submitted_subtasks = submitted_subtasks + self._completed_subtasks = completed_subtasks + self._logic_key_to_subtask_costs = collections.defaultdict(list) + self._logic_key_to_check_info = dict() + self._ratio = interquartile_range_ratio + + def update(self): + i = 0 + j = 0 + while i < self._total_subtask_count or j < self._total_subtask_count: + curr_time = time.time() + while i < len(self._submitted_subtasks): + subtask = self._submitted_subtasks[i] + subtask.runtime.start_time = curr_time + i += 1 + while j < len(self._completed_subtasks): + subtask = self._completed_subtasks[j] + self._logic_key_to_subtask_costs[subtask.logic_key].append( + curr_time - subtask.runtime.start_time + ) + j += 1 + yield + + def is_slow(self, subtask: Subtask): + logic_key = subtask.logic_key + if logic_key not in self._logic_key_to_subtask_costs: + # The subtask logic key has no costs. + return False + logic_parallelism = subtask.logic_parallelism + if not logic_parallelism: + # Invalid parallelism. + return False + subtask_costs = self._logic_key_to_subtask_costs[logic_key] + complete_count = len(subtask_costs) + if complete_count / logic_parallelism < 0.75: + # Too few complete subtasks. + return False + check_info = self._logic_key_to_check_info.get(logic_key) + if check_info is None or check_info.count != complete_count: + arr = np.array(subtask_costs) + # Please refer to: https://en.wikipedia.org/wiki/Box_plot + q1, q3 = np.quantile(arr, 0.25), np.quantile(arr, 0.75) + duration_threshold = q3 + self._ratio * (q3 - q1) + self._logic_key_to_check_info[ + logic_key + ] = _RaySlowSubtaskChecker._CheckInfo(complete_count, duration_threshold) + else: + duration_threshold = check_info.duration_threshold + assert subtask.runtime.start_time > 0 + return time.time() - subtask.runtime.start_time > duration_threshold + + +@register_executor_cls +class RayTaskExecutor(TaskExecutor): + name = "ray" + + def __init__( + self, + config: RayExecutionConfig, + task: Task, + tile_context: TileContext, + task_context: Dict[str, "ray.ObjectRef"], + task_chunks_meta: Dict[str, _RayChunkMeta], + lifecycle_api: LifecycleAPI, + meta_api: MetaAPI, + ): + logger.info( + "Start task %s with GC method %s.", + task.task_id, + config.get_gc_method(), + ) + self._config = config + self._task = task + self._tile_context = tile_context + self._task_context = task_context + self._task_chunks_meta = task_chunks_meta + self._ray_executor = self._get_ray_executor() + + # API + self._lifecycle_api = lifecycle_api + self._meta_api = meta_api + + self._available_band_resources = None + self._result_tileables_lifecycle = None + + # For progress and task cancel + self._stage_index = 0 + self._pre_all_stages_progress = 0.0 + self._pre_all_stages_tile_progress = 0.0 + self._cur_stage_progress = 0.0 + self._cur_stage_tile_progress = 0.0 + self._execute_subtask_graph_aiotask = None + self._cancelled = False + + @classmethod + async def create( + cls, + config: RayExecutionConfig, + *, + session_id: str, + address: str, + task: Task, + tile_context: 
TileContext, + **kwargs, + ) -> "RayTaskExecutor": + lifecycle_api, meta_api = await cls._get_apis(session_id, address) + task_context = {} + task_chunks_meta = {} + + executor = cls( + config, + task, + tile_context, + task_context, + task_chunks_meta, + lifecycle_api, + meta_api, + ) + available_band_resources = await executor.get_available_band_resources() + worker_addresses = list( + map(operator.itemgetter(0), available_band_resources.keys()) + ) + await cls._init_context( + config, + task_context, + task_chunks_meta, + RayTaskState.create, + worker_addresses, + session_id, + address, + ) + return executor + + def get_execution_config(self): + return self._config + + # noinspection DuplicatedCode + def destroy(self): + logger.info("Complete task %s.", self._task.task_id) + self._task = None + self._tile_context = None + self._task_context = {} + self._task_chunks_meta = {} + self._ray_executor = None + + # API + self._lifecycle_api = None + self._meta_api = None + + self._available_band_resources = None + self._result_tileables_lifecycle = None + + # For progress and task cancel + self._stage_index = 0 + self._pre_all_stages_progress = 1.0 + self._pre_all_stages_tile_progress = 1.0 + self._cur_stage_progress = 1.0 + self._cur_stage_tile_progress = 1.0 + self._execute_subtask_graph_aiotask = None + self._cancelled = None + self._config = None + + @classmethod + @alru_cache(cache_exceptions=False) + async def _get_apis(cls, session_id: str, address: str): + return await asyncio.gather( + LifecycleAPI.create(session_id, address), + MetaAPI.create(session_id, address), + ) + + @staticmethod + @functools.lru_cache(maxsize=None) # Specify maxsize=None to make it faster + def _get_ray_executor(): + # Export remote function once. + return ray.remote(execute_subtask) + + @classmethod + async def _init_context( + cls, + config: RayExecutionConfig, + task_context: Dict[str, "ray.ObjectRef"], + task_chunks_meta: Dict[str, _RayChunkMeta], + create_task_state_actor: Callable[[], "ray.actor.ActorHandle"], + worker_addresses: List[str], + session_id: str, + address: str, + ): + loop = asyncio.get_running_loop() + context = RayExecutionContext( + config, + task_context, + task_chunks_meta, + worker_addresses, + create_task_state_actor, + session_id, + address, + address, + address, + loop=loop, + ) + await context.init() + set_context(context) + + async def __aenter__(self): + self._result_tileables_lifecycle = ResultTileablesLifecycle( + self._task.tileable_graph, self._tile_context, self._lifecycle_api + ) + + async def execute_subtask_graph( + self, + stage_id: str, + subtask_graph: SubtaskGraph, + chunk_graph: ChunkGraph, + tile_context: TileContext, + context: Any = None, + ) -> Dict[Chunk, ExecutionChunkResult]: + if self._cancelled is True: # pragma: no cover + raise asyncio.CancelledError() + self._stage_index += 1 + stage_id = f"{self._stage_index}:{stage_id}" + logger.info("Start stage %s.", stage_id) + self._execute_subtask_graph_aiotask = asyncio.current_task() + + monitor_context = _RayMonitorContext() + monitor_aiotask = asyncio.create_task( + self._update_progress_and_collect_garbage( + stage_id, + subtask_graph, + chunk_graph, + monitor_context, + self._config.get_monitor_interval_seconds(), + self._config.get_gc_method(), + ) + ) + try: + # Previous execution may have duplicate tileable ids, the tileable may be decref + # during execution, so we should track and incref the result tileables before execute. 
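Editor's aside: `execute_subtask_graph` above runs a background monitor coroutine (progress updates and garbage collection) alongside the stage and cancels it in the `finally` block so it never outlives the stage. A minimal asyncio sketch of that pattern with made-up intervals:

```python
import asyncio


async def monitor(interval_seconds):
    while True:  # periodically update progress / collect garbage
        await asyncio.sleep(interval_seconds)


async def run_stage():
    monitor_task = asyncio.create_task(monitor(0.01))
    try:
        await asyncio.sleep(0.05)  # stands in for waiting on subtask results
        return "done"
    finally:
        monitor_task.cancel()  # the monitor is always torn down with its stage


assert asyncio.run(run_stage()) == "done"
```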
+ await self._result_tileables_lifecycle.incref_tiled() + return await self._execute_subtask_graph( + stage_id, subtask_graph, chunk_graph, monitor_context + ) + except asyncio.CancelledError: + logger.info( + "Cancel %s ray tasks of stage %s.", + len(monitor_context.object_ref_to_subtask), + stage_id, + ) + for object_ref in monitor_context.object_ref_to_subtask.keys(): + ray.cancel(object_ref, force=True) + raise + finally: + logger.info("Clear stage %s.", stage_id) + monitor_aiotask.cancel() + for subtask in subtask_graph: + subtask.runtime = None + for key in self._task_context.keys() - self._task_chunks_meta.keys(): + self._task_context.pop(key) + + async def _execute_subtask_graph( + self, + stage_id: str, + subtask_graph: SubtaskGraph, + chunk_graph: ChunkGraph, + monitor_context: _RayMonitorContext, + ) -> Dict[Chunk, ExecutionChunkResult]: + task_context = self._task_context + self._pre_all_stages_tile_progress = ( + self._pre_all_stages_tile_progress + self._cur_stage_tile_progress + ) + self._cur_stage_tile_progress = ( + self._tile_context.get_all_progress() - self._pre_all_stages_tile_progress + ) + shuffle_manager = ShuffleManager(subtask_graph) + monitor_context.stage = _RayExecutionStage.SUBMITTING + monitor_context.shuffle_manager = shuffle_manager + logger.info( + "Submitting %s subtasks of stage %s which contains shuffles: %s", + len(subtask_graph), + stage_id, + shuffle_manager.info(), + ) + subtask_max_retries = self._config.get_subtask_max_retries() + subtask_num_cpus = self._config.get_subtask_num_cpus() + subtask_memory = self._config.get_subtask_memory() + metrics_tags = { + "session_id": self._task.session_id, + "task_id": self._task.task_id, + "stage_id": stage_id, + } + output_meta_object_refs = [] + for subtask in subtask_graph.topological_iter(): + if subtask.virtual: + continue + subtask_chunk_graph = subtask.chunk_graph + input_object_refs = await self._load_subtask_inputs( + stage_id, subtask, task_context, shuffle_manager + ) + # Can't use `subtask_graph.count_successors(subtask) == 0` to check output meta, because a subtask + # may have some outputs which are dependent by downstream, but other outputs are not. see + # https://user-images.githubusercontent.com/12445254/168484663-a4caa3f4-0ccc-4cd7-bf20-092356815073.png + is_mapper, n_reducers = shuffle_manager.is_mapper(subtask), None + if is_mapper: + n_reducers = shuffle_manager.get_n_reducers(subtask) + output_keys, out_count = _get_subtask_out_info( + subtask_chunk_graph, is_mapper, n_reducers + ) + if is_mapper: + # shuffle meta won't be recorded in meta service. 
+ output_count = out_count + else: + output_count = out_count + bool(subtask.stage_n_outputs) + assert output_count != 0 + subtask_max_retries = subtask_max_retries if subtask.retryable else 0 + output_object_refs = self._ray_executor.options( + num_cpus=subtask_num_cpus, + num_returns=output_count, + max_retries=subtask_max_retries, + memory=subtask_memory, + scheduling_strategy="DEFAULT" if len(input_object_refs) else "SPREAD", + ).remote( + subtask.subtask_id, + serialize(subtask_chunk_graph, context={"serializer": "ray"}), + subtask.stage_n_outputs, + is_mapper, + *input_object_refs, + ) + await asyncio.sleep(0) + if output_count == 1: + output_object_refs = [output_object_refs] + RayMetrics.submitted_subtask_number.record(1, metrics_tags) + monitor_context.submitted_subtasks.add(subtask) + monitor_context.object_ref_to_subtask[output_object_refs[0]] = subtask + subtask.runtime = _RaySubtaskRuntime() + if subtask.stage_n_outputs: + meta_object_ref, *output_object_refs = output_object_refs + # TODO(fyrestone): Fetch(not get) meta object here. + output_meta_object_refs.append(meta_object_ref) + if is_mapper: + shuffle_manager.add_mapper_output_refs( + subtask, output_object_refs[-n_reducers:] + ) + output_object_refs = output_object_refs[:-n_reducers] + # Mars chunk keys may be duplicate, so we should track the ref count. + for chunk_key, object_ref in zip(output_keys, output_object_refs): + if chunk_key in task_context: + monitor_context.chunk_key_ref_count[chunk_key] += 1 + task_context[chunk_key] = object_ref + logger.info("Submitted %s subtasks of stage %s.", len(subtask_graph), stage_id) + + monitor_context.stage = _RayExecutionStage.WAITING + key_to_meta = {} + if len(output_meta_object_refs) > 0: + # TODO(fyrestone): Optimize update meta by fetching partial meta. + meta_count = len(output_meta_object_refs) + logger.info("Getting %s metas of stage %s.", meta_count, stage_id) + meta_list = await asyncio.gather(*output_meta_object_refs) + for meta in meta_list: + for key, (params, memory_size) in meta.items(): + key_to_meta[key] = params + self._task_chunks_meta[key] = _RayChunkMeta(memory_size=memory_size) + logger.info("Got %s metas of stage %s.", meta_count, stage_id) + + chunk_to_meta = {} + # ray.wait requires the object ref list is unique. + output_object_refs = set() + for chunk in chunk_graph.result_chunks: + chunk_key = chunk.key + # The result chunk may be in previous stage result, + # then the chunk does not have to be processed. + if chunk_key in task_context: + object_ref = task_context[chunk_key] + output_object_refs.add(object_ref) + chunk_params = key_to_meta.get(chunk_key) + if chunk_params is not None: + chunk_to_meta[chunk] = ExecutionChunkResult( + chunk_params, object_ref + ) + + logger.info("Waiting for stage %s complete.", stage_id) + # Patched the asyncio.to_thread for Python < 3.9 at mars/lib/aio/__init__.py + await asyncio.to_thread(ray.wait, list(output_object_refs), fetch_local=False) + + logger.info("Complete stage %s.", stage_id) + return chunk_to_meta + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + await self._result_tileables_lifecycle.decref_tracked() + try: + await self.cancel() + except BaseException: # noqa: E722 # nosec # pylint: disable=bare-except + pass + return + + # Update info if no exception occurs. 
+ update_metas = [] + for tileable in self._task.tileable_graph.result_tileables: + tileable = tileable.data if hasattr(tileable, "data") else tileable + chunk_keys = [] + for chunk in self._tile_context[tileable].chunks: + chunk_key = chunk.key + chunk_keys.append(chunk_key) + if ( + chunk_key in self._task_context + and chunk_key in self._task_chunks_meta + ): + # Some tileable graph may have result chunks that not be executed, + # for example: + # r, b = cut(series, bins, retbins=True) + # r_result = r.execute().fetch() + # b_result = b.execute().fetch() <- This is the case + object_ref = self._task_context[chunk_key] + chunk_meta = self._task_chunks_meta[chunk_key] + update_metas.append( + self._meta_api.set_chunk_meta.delay( + chunk, + bands=[], + object_ref=object_ref, + memory_size=chunk_meta.memory_size, + ) + ) + if update_metas: + await self._meta_api.set_chunk_meta.batch(*update_metas) + + async def get_available_band_resources(self) -> Dict[BandType, Resource]: + if self._available_band_resources is None: + band_resources = self._config.get_band_resources() + virtual_band_resources = {} + idx = 0 + for band_resource in band_resources: + for band, resource in band_resource.items(): + virtual_band_resources[ + (f"ray_virtual_address_{idx}:0", band) + ] = resource + idx += 1 + self._available_band_resources = virtual_band_resources + + return self._available_band_resources + + async def get_progress(self) -> float: + """Get the execution progress.""" + return self._cur_stage_progress + + async def cancel(self): + """Cancel the task execution.""" + logger.info("Start to cancel task %s.", self._task) + if self._task is None or self._cancelled is True: + return + self._cancelled = True + if self._execute_subtask_graph_aiotask is not None: + self._execute_subtask_graph_aiotask.cancel() + + async def _load_subtask_inputs( + self, + stage_id: str, + subtask: Subtask, + context: Dict, + shuffle_manager: ShuffleManager, + ): + """ + Load input object refs of subtask from context. + + It updates the context if the input object refs are fetched from + the meta service. + """ + normal_object_refs = [] + shuffle_object_refs = [] + key_to_get_meta = {} + # for non-shuffle chunks, chunk key will be used for indexing object refs. + # for shuffle chunks, mapper subtasks will have only one mapper chunk, and all outputs for mapper + # subtask will be shuffle blocks, the downstream reducers will receive inputs in the mappers order. + fetch_chunks, shuffle_fetch_chunk = _get_fetch_chunks(subtask.chunk_graph) + for index, fetch_chunk in enumerate(fetch_chunks): + chunk_key = fetch_chunk.key + # pure_depend data is not used, skip it. + if chunk_key in subtask.pure_depend_keys: + normal_object_refs.append(None) + elif chunk_key in context: + normal_object_refs.append(context[chunk_key]) + else: + normal_object_refs.append(None) + key_to_get_meta[index] = self._meta_api.get_chunk_meta.delay( + chunk_key, fields=["object_refs"] + ) + if shuffle_fetch_chunk is not None: + # shuffle meta won't be recorded in meta service, query it from shuffle manager. 
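+ # `get_reducer_input_refs` returns one ObjectRef per mapper (a column of the
+ # mapper-output matrix) in mapper order, which is the order the reducer
+ # expects its shuffle blocks in.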
+ shuffle_object_refs = list(shuffle_manager.get_reducer_input_refs(subtask)) + + if key_to_get_meta: + logger.debug( + "Fetch %s metas and update context of stage %s.", + len(key_to_get_meta), + stage_id, + ) + meta_list = await self._meta_api.get_chunk_meta.batch( + *key_to_get_meta.values() + ) + for index, meta in zip(key_to_get_meta.keys(), meta_list): + object_ref = meta["object_refs"][0] + normal_object_refs[index] = object_ref + context[fetch_chunks[index].key] = object_ref + return normal_object_refs + shuffle_object_refs + + @aiotask_wrapper(exit_if_exception=IN_RAY_CI) + async def _update_progress_and_collect_garbage( + self, + stage_id: str, + subtask_graph: SubtaskGraph, + chunk_graph: ChunkGraph, + monitor_context: _RayMonitorContext, + interval_seconds: float, + method: str, + ): + total = sum(not subtask.virtual for subtask in subtask_graph) + completed_subtasks = monitor_context.completed_subtasks + submitted_subtasks = monitor_context.submitted_subtasks + result_chunk_keys = {chunk.key for chunk in chunk_graph.result_chunks} + chunk_key_ref_count = monitor_context.chunk_key_ref_count + object_ref_to_subtask = monitor_context.object_ref_to_subtask + slow_subtask_checker = _RaySlowSubtaskChecker( + total, + submitted_subtasks, + completed_subtasks, + self._config.get_check_slow_subtask_iqr_ratio(), + ) + + def gc(): + """ + Consume the completed subtasks and collect garbage. + + GC the output object refs of the subtask which successors are submitted + (not completed as above) can reduce the memory peaks, but we can't cancel + and rerun slow subtasks because the input object refs of running subtasks + may be deleted. + """ + i = 0 + gc_subtasks = set() + gc_targets = ( + submitted_subtasks if method == "submitted" else completed_subtasks + ) + + while i < total: + while i >= len(gc_targets): + yield + # Iterate the completed subtasks once. + subtask = gc_targets[i] + i += 1 + logger.debug("GC[stage=%s] subtask: %s", stage_id, subtask) + + # Note: There may be a scenario in which delayed gc occurs. + # When a subtask has more than one predecessor, like A, B, + # and in the `for ... in ...` loop we get A firstly while + # B's successors are completed, A's not. Then we cannot remove + # B's results chunks before A's. + for pred in subtask_graph.iter_predecessors(subtask): + if pred in gc_subtasks: + continue + for succ in subtask_graph.iter_successors(pred): + while succ not in gc_targets: + yield + if pred.virtual: + # For virtual subtask, remove all the predecessors if it is + # completed. + ppreds = subtask_graph.predecessors(pred) + gc_subtasks.update(ppreds) + gc_chunks = itertools.chain( + *(p.chunk_graph.results for p in ppreds) + ) + # Remove object refs from shuffle manager. + for p in ppreds: + logger.debug("GC[stage=%s] shuffle: %s", stage_id, p) + monitor_context.shuffle_manager.remove_object_refs(p) + else: + gc_subtasks.add(pred) + gc_chunks = pred.chunk_graph.results + # We use ref count to handle duplicate chunk keys, so here decref + # should be the same as incref, use deduped chunk keys of a subtask. + pred_result_keys = set() + for chunk in gc_chunks: + chunk_key = chunk.key + if chunk_key in pred_result_keys: + continue + pred_result_keys.add(chunk_key) + # We need to check the GC chunk key is not in the + # result meta keys, because there are some special + # cases that the result meta keys are not the leaves. 
+ # + # example: test_cut_execution + if chunk_key not in result_chunk_keys: + logger.debug("GC[stage=%s] chunk: %s", stage_id, chunk) + ref_count = chunk_key_ref_count.get(chunk_key, 0) + if ref_count == 0: + self._task_context.pop(chunk_key, None) + else: + chunk_key_ref_count[chunk_key] = ref_count - 1 + + # TODO(fyrestone): Check the remaining self._task_context.keys() + # in the result subtasks + + collect_garbage = gc() + update_subtask_cost = slow_subtask_checker.update() + last_log_time = last_check_slow_time = time.time() + log_interval_seconds = self._config.get_log_interval_seconds() + check_slow_subtasks_interval_seconds = ( + self._config.get_check_slow_subtasks_interval_seconds() + ) + stage_to_log_func = { + _RayExecutionStage.SUBMITTING: lambda: logger.info( + "Submitted [%s/%s] subtasks of stage %s.", + len(submitted_subtasks), + total, + stage_id, + ), + _RayExecutionStage.WAITING: lambda: logger.info( + "Completed [%s/%s] subtasks of stage %s, one of waiting ray tasks: %s", + len(completed_subtasks), + total, + stage_id, + next(iter(object_ref_to_subtask)).task_id() + if object_ref_to_subtask + else None, + ), + } + + while len(completed_subtasks) < total: + curr_time = time.time() + if monitor_context.stage != _RayExecutionStage.INIT: + if curr_time - last_log_time > log_interval_seconds: # pragma: no cover + stage_to_log_func[monitor_context.stage]() + last_log_time = curr_time + + if len(object_ref_to_subtask) <= 0: # pragma: no cover + await asyncio.sleep(interval_seconds) + # We should run ray.wait after at least one Ray task is submitted. + # Please refer to: https://github.com/mars-project/mars/issues/3274 + continue + + # Only wait for unready subtask object refs. + ready_objects, unready_objects = await asyncio.to_thread( + ray.wait, + list(object_ref_to_subtask.keys()), + num_returns=len(object_ref_to_subtask), + timeout=0, + fetch_local=False, + ) + + # Pop the completed subtasks from object_ref_to_subtask. + completed_subtasks.update(map(object_ref_to_subtask.pop, ready_objects)) + # Update progress. + stage_progress = ( + len(completed_subtasks) / total * self._cur_stage_tile_progress + ) + self._cur_stage_progress = self._pre_all_stages_progress + stage_progress + # Update subtask cost group by the logic key to logic_key_to_subtask_costs. + for _ in update_subtask_cost: + break + # Collect garbage, use `for ... in ...` to avoid raising StopIteration. + for _ in collect_garbage: + break + # Check slow subtasks, after update_subtask_cost. + if monitor_context.stage == _RayExecutionStage.WAITING: + if len(completed_subtasks) > 0 and ( + curr_time - last_check_slow_time + > check_slow_subtasks_interval_seconds + ): + slow_objects = [] + for obj in unready_objects: + maybe_slow_subtask = object_ref_to_subtask[obj] + slow = slow_subtask_checker.is_slow(maybe_slow_subtask) + if slow: + slow_objects.append(obj) + if len(slow_objects) > 0: + logger.info( + "Slow tasks(%s): %s", + len(slow_objects), + [o.task_id() for o in slow_objects[:5]], + ) + else: + logger.debug( + "No slow tasks in %s unready tasks.", len(unready_objects) + ) + last_check_slow_time = curr_time + # Fast to next loop and give it a chance to update object_ref_to_subtask. 
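+ # Sleep 0 when something completed so the next iteration runs immediately;
+ # otherwise back off by `interval_seconds` to avoid busy-waiting on ray.wait.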
+ await asyncio.sleep(interval_seconds if len(ready_objects) == 0 else 0) diff --git a/python/xorbits/_mars/services/task/execution/ray/fetcher.py b/python/xorbits/_mars/services/task/execution/ray/fetcher.py new file mode 100644 index 000000000..3636c2968 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/fetcher.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import functools +from collections import namedtuple +from typing import Dict, List + +from .....utils import lazy_import +from ..api import Fetcher, register_fetcher_cls + +ray = lazy_import("ray") +_FetchInfo = namedtuple("FetchInfo", ["key", "object_ref", "conditions"]) + + +def _query_object_with_condition(o, conditions): + try: + return o.iloc[conditions] + except AttributeError: + return o[conditions] + + +@register_fetcher_cls +class RayFetcher(Fetcher): + name = "ray" + required_meta_keys = ("object_refs",) + + def __init__(self, **kwargs): + self._fetch_info_list = [] + self._no_conditions = True + + @staticmethod + @functools.lru_cache(maxsize=None) # Specify maxsize=None to make it faster + def _remote_query_object_with_condition(): + # Export remote function once. + return ray.remote(_query_object_with_condition) + + async def append(self, chunk_key: str, chunk_meta: Dict, conditions: List = None): + if conditions is not None: + self._no_conditions = False + self._fetch_info_list.append( + _FetchInfo(chunk_key, chunk_meta["object_refs"][0], conditions) + ) + + async def get(self): + if self._no_conditions: + return await asyncio.gather( + *(info.object_ref for info in self._fetch_info_list) + ) + refs = [None] * len(self._fetch_info_list) + for index, fetch_info in enumerate(self._fetch_info_list): + if fetch_info.conditions is None: + refs[index] = fetch_info.object_ref + else: + refs[index] = self._remote_query_object_with_condition().remote( + fetch_info.object_ref, tuple(fetch_info.conditions) + ) + return await asyncio.gather(*refs) diff --git a/python/xorbits/_mars/services/task/execution/ray/shuffle.py b/python/xorbits/_mars/services/task/execution/ray/shuffle.py new file mode 100644 index 000000000..81d1b4c26 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/shuffle.py @@ -0,0 +1,176 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
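+
+# How the Ray executor is expected to drive this module (an illustrative sketch
+# only; `subtask_graph`, `subtask`, `output_refs` and `reducer_subtask` stand for
+# local variables of the executor, see executor.py for the real call sites):
+#
+#     shuffle_manager = ShuffleManager(subtask_graph)
+#     if shuffle_manager.is_mapper(subtask):
+#         n_reducers = shuffle_manager.get_n_reducers(subtask)
+#         # the last n_reducers returned refs of a mapper task are its shuffle blocks
+#         shuffle_manager.add_mapper_output_refs(subtask, output_refs[-n_reducers:])
+#     ...
+#     input_refs = shuffle_manager.get_reducer_input_refs(reducer_subtask)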
+ +from typing import Iterable, List + +import numpy as np + +from .....core.operand import MapReduceOperand, OperandStage +from .....utils import lazy_import +from ....subtask import Subtask, SubtaskGraph + +ray = lazy_import("ray") + + +class ShuffleManager: + """Manage shuffle execution for ray by resolve dependencies between mappers outputs and reducers inputs based on + mapper and reducer index. + """ + + def __init__(self, subtask_graph: SubtaskGraph): + self._subtask_graph = subtask_graph + self._proxy_subtasks = subtask_graph.get_shuffle_proxy_subtasks() + self._num_shuffles = subtask_graph.num_shuffles() + self._mapper_output_refs = [] + self._mapper_indices = {} + self._reducer_indices = {} + for shuffle_index, proxy_subtask in enumerate(self._proxy_subtasks): + # Note that the reducers can also be mappers such as `DuplicateOperand`. + mapper_subtasks = subtask_graph.predecessors(proxy_subtask) + reducer_subtasks = subtask_graph.successors(proxy_subtask) + n_mappers = len(mapper_subtasks) + n_reducers = proxy_subtask.chunk_graph.results[0].op.n_reducers + mapper_output_arr = np.empty((n_mappers, n_reducers), dtype=object) + self._mapper_output_refs.append(mapper_output_arr) + self._mapper_indices.update( + { + subtask: (shuffle_index, mapper_index) + for mapper_index, subtask in enumerate(mapper_subtasks) + } + ) + # reducers subtask should be sorted by reducer_index and MapReduceOperand.map should insert shuffle block + # in reducers order, otherwise shuffle blocks will be sent to wrong reducers. + sorted_filled_reducer_subtasks = self._get_sorted_filled_reducers( + reducer_subtasks, n_reducers + ) + self._reducer_indices.update( + { + subtask: (shuffle_index, reducer_ordinal) + for reducer_ordinal, subtask in enumerate( + sorted_filled_reducer_subtasks + ) + } + ) + + @staticmethod + def _get_sorted_filled_reducers( + reducer_subtasks: Iterable[Subtask], n_reducers: int + ): + # For operands such as `PSRSAlign`, sometimes `reducer_subtasks` might be less than `n_reducers`. + # fill missing reducers with `None`. + filled_reducers = [None] * n_reducers + for subtask in reducer_subtasks: + reducer_ordinal = _get_reducer_operand(subtask.chunk_graph).reducer_ordinal + filled_reducers[reducer_ordinal] = subtask + return filled_reducers + + def has_shuffle(self): + """ + Whether current subtask graph has shuffles to execute. + """ + return self._num_shuffles > 0 + + def add_mapper_output_refs( + self, subtask: Subtask, output_object_refs: List["ray.ObjectRef"] + ): + """ + Record mapper output ObjectRefs which will be used by reducers later. + + Parameters + ---------- + subtask + output_object_refs : List["ray.ObjectRef"] + Mapper output ObjectRefs. + """ + shuffle_index, mapper_index = self._mapper_indices[subtask] + self._mapper_output_refs[shuffle_index][mapper_index] = np.array( + output_object_refs + ) + + def get_reducer_input_refs(self, subtask: Subtask) -> List["ray.ObjectRef"]: + """ + Get the reducer inputs ObjectRefs output by mappers. + + Parameters + ---------- + subtask : Subtask + A reducer subtask. + Returns + ------- + input_refs : List["ray.ObjectRef"] + The reducer inputs ObjectRefs output by mappers. + """ + shuffle_index, reducer_ordinal = self._reducer_indices[subtask] + return self._mapper_output_refs[shuffle_index][:, reducer_ordinal] + + def get_n_reducers(self, subtask: Subtask): + """ + Get the number of shuffle blocks that a mapper operand outputs, + which is also the number of the reducers when tiling shuffle operands. 
+ Note that this might be greater than actual number of the reducers in the subtask graph, + because some reducers may not be added to chunk graph. + + Parameters + ---------- + subtask : Subtask + A mapper or reducer subtask. + Returns + ------- + n_reducers : int + The number of shuffle blocks that a mapper operand outputs. + """ + index = self._mapper_indices.get(subtask) or self._reducer_indices.get(subtask) + if index is None: + raise ValueError(f"The {subtask} should be a mapper or a reducer.") + else: + shuffle_index, _ = index + return self._mapper_output_refs[shuffle_index].shape[1] + + def is_mapper(self, subtask: Subtask): + """ + Check whether a subtask is a mapper subtask. Note the even this a mapper subtask, it can be a reducer subtask + at the same time such as `DuplicateOperand`, see + https://user-images.githubusercontent.com/12445254/174305282-f7c682a9-0346-47fe-a34c-1e384e6a1775.svg + """ + return subtask in self._mapper_indices + + def info(self): + """ + A list of (mapper count, reducer count). + """ + return [shuffle_mapper.shape for shuffle_mapper in self._mapper_output_refs] + + def remove_object_refs(self, subtask: Subtask): + """ + Set the object refs to None by subtask. + """ + index = self._mapper_indices.get(subtask) + if index is not None: + shuffle_index, mapper_index = index + self._mapper_output_refs[shuffle_index][mapper_index].fill(None) + return + index = self._reducer_indices.get(subtask) + if index is not None: + shuffle_index, reducer_ordinal = index + self._mapper_output_refs[shuffle_index][:, reducer_ordinal].fill(None) + return + raise ValueError(f"The {subtask} should be a mapper or a reducer.") + + +def _get_reducer_operand(subtask_chunk_graph): + return next( + c.op + for c in subtask_chunk_graph + if isinstance(c.op, MapReduceOperand) and c.op.stage == OperandStage.reduce + ) diff --git a/python/xorbits/_mars/services/task/execution/ray/tests/__init__.py b/python/xorbits/_mars/services/task/execution/ray/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/task/execution/ray/tests/test_ray_execution_backend.py b/python/xorbits/_mars/services/task/execution/ray/tests/test_ray_execution_backend.py new file mode 100644 index 000000000..97bf6e3fd --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/tests/test_ray_execution_backend.py @@ -0,0 +1,723 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +import time +from collections import Counter + +import numpy as np +import pandas as pd +import pytest + +from ...... import dataframe as md +from ...... import tensor as mt +from ......config import Config +from ......core import TileContext +from ......core.context import get_context +from ......core.graph import ChunkGraphBuilder, TileableGraph, TileableGraphBuilder +from ......core.operand import ShuffleFetchType +from ......lib.aio.isolation import new_isolation, stop_isolation +from ......resource import Resource +from ......serialization import serialize +from ......tests.core import mock, require_ray +from ......utils import get_chunk_params, lazy_import +from .....context import ThreadedServiceContext +from .....subtask import Subtask +from ....analyzer import GraphAnalyzer +from ....core import Task, new_task_id +from ..config import RayExecutionConfig +from ..context import ( + RayExecutionContext, + RayExecutionWorkerContext, + RayRemoteObjectManager, + _RayRemoteObjectContext, +) +from ..executor import ( + OrderedSet, + RayTaskExecutor, + RayTaskState, + _RayChunkMeta, + _RaySlowSubtaskChecker, + _RaySubtaskRuntime, + execute_subtask, +) +from ..fetcher import RayFetcher +from ..shuffle import ShuffleManager + +ray = lazy_import("ray") + + +def _gen_subtask_chunk_graph(t): + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + return next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + + +def _gen_subtask_graph(t): + tileable_graph = t.build_graph(tile=False) + chunk_graph = next(ChunkGraphBuilder(tileable_graph).build()) + bands = [(f"address_{i}", "numa-0") for i in range(4)] + band_resource = dict((band, Resource(num_cpus=1)) for band in bands) + task = Task("mock_task", "mock_session", tileable_graph) + analyzer = GraphAnalyzer( + chunk_graph, + band_resource, + task, + Config(), + dict(), + shuffle_fetch_type=ShuffleFetchType.FETCH_BY_INDEX, + ) + subtask_graph = analyzer.gen_subtask_graph() + return chunk_graph, subtask_graph + + +class MockRayTaskExecutor(RayTaskExecutor): + def __init__(self, *args, **kwargs): + self._set_attrs = Counter() + self._monitor_tasks = [] + super().__init__(*args, **kwargs) + + @classmethod + async def _get_apis(cls, session_id: str, address: str): + return None, None + + @staticmethod + def _get_ray_executor(): + # Export remote function once. + return None + + async def get_available_band_resources(self): + return {} + + async def execute_subtask_graph(self, *args, **kwargs): + self._monitor_tasks.clear() + return await super().execute_subtask_graph(*args, **kwargs) + + async def _update_progress_and_collect_garbage(self, *args, **kwargs): + # Infinite loop to test monitor task cancel. 
+ self._monitor_tasks.append(asyncio.current_task()) + return await super()._update_progress_and_collect_garbage(*args, **kwargs) + + def monitor_tasks(self): + return self._monitor_tasks + + def set_attr_counter(self): + return self._set_attrs + + def __setattr__(self, key, value): + super().__setattr__(key, value) + self._set_attrs[key] += 1 + + +class MockTileContext(TileContext): + def get_all_progress(self) -> float: + return 1.0 + + +@require_ray +@pytest.mark.asyncio +@mock.patch("mars.services.task.execution.ray.executor.RayTaskState.create") +@mock.patch("mars.services.task.execution.ray.context.RayExecutionContext.init") +@mock.patch("ray.get") +async def test_ray_executor_create( + mock_ray_get, mock_execution_context_init, mock_task_state_actor_create +): + task = Task("mock_task", "mock_session", TileableGraph([])) + + # Create RayTaskState actor as needed by default. + mock_config = RayExecutionConfig.from_execution_config({"backend": "ray"}) + executor = await MockRayTaskExecutor.create( + mock_config, + session_id="mock_session_id", + address="mock_address", + task=task, + tile_context=TileContext(), + ) + assert isinstance(executor, MockRayTaskExecutor) + assert mock_task_state_actor_create.call_count == 0 + ctx = get_context() + assert isinstance(ctx, RayExecutionContext) + ctx.create_remote_object("abc", lambda: None) + assert mock_ray_get.call_count == 1 + assert mock_task_state_actor_create.call_count == 1 + + +@require_ray +@pytest.mark.asyncio +async def test_ray_executor_destroy(): + task = Task("mock_task", "mock_session", TileableGraph([])) + mock_config = RayExecutionConfig.from_execution_config({"backend": "ray"}) + executor = MockRayTaskExecutor( + config=mock_config, + task=task, + tile_context=TileContext(), + task_context={}, + task_chunks_meta={}, + lifecycle_api=None, + meta_api=None, + ) + counter = executor.set_attr_counter() + assert len(counter) > 0 + keys = executor.__dict__.keys() + assert counter.keys() >= keys + counter.clear() + executor.destroy() + keys = set(keys) - {"_set_attrs", "_monitor_tasks"} + assert counter.keys() == keys, "Some keys are not reset in destroy()." + for k, v in counter.items(): + assert v == 1 + assert await executor.get_progress() == 1.0 + + +@require_ray +@mock.patch("ray.get_runtime_context") +def test_ray_execute_subtask_basic(_): + raw = np.ones((10, 10)) + raw_expect = raw + 1 + a = mt.ones((10, 10), chunk_size=10) + b = a + 1 + + subtask_id = new_task_id() + subtask_chunk_graph = _gen_subtask_chunk_graph(b) + r = execute_subtask(subtask_id, serialize(subtask_chunk_graph), 0, False) + np.testing.assert_array_equal(r, raw_expect) + test_get_meta_chunk = subtask_chunk_graph.result_chunks[0] + r = execute_subtask(subtask_id, serialize(subtask_chunk_graph), 1, False) + assert len(r) == 2 + meta_dict, r = r + assert len(meta_dict) == 1 + assert meta_dict[test_get_meta_chunk.key][0] == get_chunk_params( + test_get_meta_chunk + ) + np.testing.assert_array_equal(r, raw_expect) + + +@require_ray +@pytest.mark.asyncio +async def test_ray_fetcher(ray_start_regular_shared2): + pd_value = pd.DataFrame( + { + "col1": [str(i) for i in range(10)], + "col2": np.random.randint(0, 100, (10,)), + } + ) + pd_object_ref = ray.put(pd_value) + np_value = np.asarray([1, 3, 6, 2, 4]) + np_object_ref = ray.put(np_value) + # Test RayFetcher to fetch mixed values. 
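+ # Record the monitor task so the tests below can await it or assert it is done.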
+ fetcher = RayFetcher() + await fetcher.append("pd_key", {"object_refs": [pd_object_ref]}) + await fetcher.append("np_key", {"object_refs": [np_object_ref]}) + await fetcher.append("pd_key", {"object_refs": [pd_object_ref]}, [slice(1, 3, 1)]) + await fetcher.append("np_key", {"object_refs": [np_object_ref]}, [slice(1, 3, 1)]) + results = await fetcher.get() + pd.testing.assert_frame_equal(results[0], pd_value) + np.testing.assert_array_equal(results[1], np_value) + pd.testing.assert_frame_equal(results[2], pd_value.iloc[1:3]) + np.testing.assert_array_equal(results[3], np_value[1:3]) + + +@require_ray +@pytest.mark.asyncio +async def test_ray_remote_object(ray_start_regular_shared2): + class _TestRemoteObject: + def __init__(self, i): + self._i = i + + def value(self): + return self._i + + def foo(self, a, b): + return self._i + a + b + + async def bar(self, a, b): + return self._i * a * b + + # Test RayTaskState reference + state = RayTaskState.create() + await state.create_remote_object.remote("aaa", _TestRemoteObject, 123) + assert await state.call_remote_object.remote("aaa", "value") == 123 + state = RayTaskState.create() + assert await state.call_remote_object.remote("aaa", "value") == 123 + + # Test RayRemoteObjectManager + name = "abc" + manager = RayRemoteObjectManager() + manager.create_remote_object(name, _TestRemoteObject, 2) + r = await manager.call_remote_object(name, "foo", 3, 4) + assert r == 9 + r = await manager.call_remote_object(name, "bar", 3, 4) + assert r == 24 + manager.destroy_remote_object(name) + with pytest.raises(KeyError): + await manager.call_remote_object(name, "foo", 3, 4) + + # Test _RayRemoteObjectContext + context = _RayRemoteObjectContext(lambda: RayTaskState.create()) + context.create_remote_object(name, _TestRemoteObject, 2) + remote_object = context.get_remote_object(name) + r = remote_object.foo(3, 4) + assert r == 9 + r = remote_object.bar(3, 4) + assert r == 24 + context.destroy_remote_object(name) + with pytest.raises(KeyError): + remote_object.foo(3, 4) + + class MyException(Exception): + pass + + class _ErrorRemoteObject: + def __init__(self): + raise MyException() + + with pytest.raises(MyException): + context.create_remote_object(name, _ErrorRemoteObject) + + handle = RayTaskState.get_handle() + assert handle is not None + + +@require_ray +def test_ray_execution_context(ray_start_regular_shared2): + value = 123 + o = ray.put(value) + + def fake_init(self): + pass + + async def fake_get_chunks_meta_from_service( + self, data_keys, fields=None, error="raise" + ): + mock_meta = {"meta_1": {fields[0]: 1}, "meta_3": {fields[0]: 3}} + return [mock_meta[k] for k in data_keys] + + with mock.patch.object( + ThreadedServiceContext, "__init__", new=fake_init + ), mock.patch.object( + RayExecutionContext, + "_get_chunks_meta_from_service", + new=fake_get_chunks_meta_from_service, + ): + mock_config = RayExecutionConfig.from_execution_config({"backend": "ray"}) + mock_worker_addresses = ["mock_worker_address"] + isolation = new_isolation("test", threaded=True) + try: + context = RayExecutionContext( + mock_config, {"abc": o}, {}, mock_worker_addresses, lambda: None + ) + context._loop = isolation.loop + r = context.get_chunks_result(["abc"]) + assert r == [value] + + r = context.get_worker_addresses() + assert r == mock_worker_addresses + + r = context.get_chunks_meta(["meta_1"], fields=["memory_size"]) + assert r == [{"memory_size": 1}] + + context._task_chunks_meta["meta_1"] = _RayChunkMeta(memory_size=2) + r = context.get_chunks_meta(["meta_1", 
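+ # Appends without conditions are fetched as-is; appends with conditions are routed
+ # through the remote `_query_object_with_condition` task, which slices the object
+ # (`.iloc` for DataFrames, plain indexing otherwise) before returning it.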
"meta_3"], fields=["memory_size"]) + assert r == [{"memory_size": 2}, {"memory_size": 3}] + finally: + stop_isolation("test") + + +def test_ray_execution_worker_context(): + context = RayExecutionWorkerContext(lambda: None) + with pytest.raises(NotImplementedError): + context.set_running_operand_key("mock_session_id", "mock_op_key") + with pytest.raises(NotImplementedError): + context.register_custom_log_path( + "mock_session_id", + "mock_tileable_op_key", + "mock_chunk_op_key", + "mock_worker_address", + "mock_log_path", + ) + + assert context.set_progress(0.1) is None + assert context.new_custom_log_dir() is None + assert context.get_storage_info("mock_address") == {} + + +@require_ray +@pytest.mark.asyncio +async def test_ray_execution_config(ray_start_regular_shared2): + t1 = mt.random.randint(10, size=(100, 10), chunk_size=100) + chunk_graph, subtask_graph = _gen_subtask_graph(t1) + + real_executor = RayTaskExecutor._get_ray_executor() + + class MockExecutor: + opt = {} + + @classmethod + def options(cls, *args, **kwargs): + cls.opt = kwargs + return real_executor.options(*args, **kwargs) + + task = Task("mock_task", "mock_session", TileableGraph([])) + mock_config = RayExecutionConfig.from_execution_config( + { + "backend": "ray", + "ray": { + "monitor_interval_seconds": 0, + "subtask_max_retries": 4, + "subtask_num_cpus": 0.8, + "subtask_memory": 1001, + "n_cpu": 1, + "n_worker": 1, + }, + } + ) + tile_context = MockTileContext() + executor = MockRayTaskExecutor( + config=mock_config, + task=task, + tile_context=tile_context, + task_context={}, + task_chunks_meta={}, + lifecycle_api=None, + meta_api=None, + ) + executor._ray_executor = MockExecutor + async with executor: + await executor.execute_subtask_graph( + "mock_stage", subtask_graph, chunk_graph, tile_context + ) + + assert MockExecutor.opt["num_cpus"] == 0.8 + assert MockExecutor.opt["max_retries"] == 4 + assert MockExecutor.opt["memory"] == 1001 + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.parametrize("gc_method", ["submitted", "completed"]) +async def test_executor_context_gc(ray_start_regular_shared2, gc_method): + popped_seq = [] + + class MockTaskContext(dict): + def pop(self, k, d=None): + popped_seq.append(k) + return super().pop(k, d) + + t1 = mt.random.randint(10, size=(100, 10), chunk_size=100) + t2 = mt.random.randint(10, size=(100, 10), chunk_size=50) + t3 = t2 + t1 + t4 = t3.sum(0) + chunk_graph, subtask_graph = _gen_subtask_graph(t4) + task = Task("mock_task", "mock_session", TileableGraph([]), fuse_enabled=True) + mock_config = RayExecutionConfig.from_execution_config( + { + "backend": "ray", + "ray": { + "monitor_interval_seconds": 0, + "log_interval_seconds": 0, + "subtask_max_retries": 0, + "n_cpu": 1, + "n_worker": 1, + "gc_method": gc_method, + }, + } + ) + tile_context = MockTileContext() + task_context = MockTaskContext() + executor = MockRayTaskExecutor( + config=mock_config, + task=task, + tile_context=tile_context, + task_context=task_context, + task_chunks_meta={}, + lifecycle_api=None, + meta_api=None, + ) + executor._ray_executor = RayTaskExecutor._get_ray_executor() + + original_execute_subtask_graph = executor._execute_subtask_graph + + async def _wait_gc_execute_subtask_graph(*args, **kwargs): + # Mock _execute_subtask_graph to wait the monitor task done. 
+ await original_execute_subtask_graph(*args, **kwargs) + await executor.monitor_tasks()[0] + + with mock.patch.object( + executor, "_execute_subtask_graph", _wait_gc_execute_subtask_graph + ): + async with executor: + await executor.execute_subtask_graph( + "mock_stage", subtask_graph, chunk_graph, tile_context + ) + await asyncio.sleep(0) + assert len(executor.monitor_tasks()) == 1 + assert executor.monitor_tasks()[0].done() + + assert len(task_context) == 1 + assert len(popped_seq) == 6 + subtasks = list(subtask_graph.topological_iter()) + chunk_keys1 = set( + map( + lambda c: c.key, + ( + subtasks[0].chunk_graph.results + + subtasks[1].chunk_graph.results + + subtasks[3].chunk_graph.results + ), + ) + ) + chunk_keys2 = set( + map( + lambda c: c.key, + (subtasks[2].chunk_graph.results + subtasks[4].chunk_graph.results), + ) + ) + assert chunk_keys1 == set(popped_seq[0:4]) + assert chunk_keys2 == set(popped_seq[4:]) + + task_context.clear() + + original_update_progress_and_collect_garbage = ( + executor._update_progress_and_collect_garbage + ) + + async def infinite_update_progress_and_collect_garbage(*args, **kwargs): + # Mock _update_progress_and_collect_garbage that never done. + await original_update_progress_and_collect_garbage(*args, **kwargs) + while True: + await asyncio.sleep(0) + + with mock.patch("logging.Logger.info") as log_patch, mock.patch.object( + executor, + "_update_progress_and_collect_garbage", + infinite_update_progress_and_collect_garbage, + ): + async with executor: + await executor.execute_subtask_graph( + "mock_stage2", subtask_graph, chunk_graph, tile_context + ) + await asyncio.sleep(0) + assert len(executor.monitor_tasks()) == 1 + assert executor.monitor_tasks()[0].done() + assert log_patch.call_count > 0 + args = [c.args[0] for c in log_patch.call_args_list] + assert any("Submitted [%s/%s]" in a for a in args) + assert any("Completed [%s/%s]" in a for a in args) + + assert len(task_context) == 1 + + task_context.clear() + + # Test the monitor aiotask is done even an exception is raised. + async def _raise_load_subtask_inputs(*args, **kwargs): + # Mock _load_subtask_inputs to raise an exception. 
+ await asyncio.sleep(0) + 1 / 0 + + with mock.patch.object( + executor, "_load_subtask_inputs", _raise_load_subtask_inputs + ): + async with executor: + with pytest.raises(ZeroDivisionError): + await executor.execute_subtask_graph( + "mock_stage3", subtask_graph, chunk_graph, tile_context + ) + await asyncio.sleep(0) + assert len(executor.monitor_tasks()) == 1 + assert executor.monitor_tasks()[0].done() + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.parametrize("gc_method", ["submitted", "completed"]) +async def test_execute_shuffle(ray_start_regular_shared2, gc_method): + chunk_size, n_rows = 10, 50 + df = md.DataFrame( + pd.DataFrame(np.random.rand(n_rows, 3), columns=list("abc")), + chunk_size=chunk_size, + ) + df2 = df.groupby(["a"]).apply(lambda x: x) + chunk_graph, subtask_graph = _gen_subtask_graph(df2) + task = Task("mock_task", "mock_session", TileableGraph([]), fuse_enabled=True) + + class MockRayExecutor: + @staticmethod + def options(**kwargs): + num_returns = kwargs["num_returns"] + + class _Wrapper: + @staticmethod + def remote(*args): + args = [ + ray.get(a) if isinstance(a, ray.ObjectRef) else a for a in args + ] + r = execute_subtask(*args) + assert len(r) == num_returns + return [ray.put(i) for i in r] + + return _Wrapper + + mock_config = RayExecutionConfig.from_execution_config( + { + "backend": "ray", + "ray": { + "monitor_interval_seconds": 0, + "subtask_max_retries": 0, + "n_cpu": 1, + "n_worker": 1, + "gc_method": gc_method, + }, + } + ) + tile_context = MockTileContext() + task_context = {} + executor = MockRayTaskExecutor( + config=mock_config, + task=task, + tile_context=tile_context, + task_context=task_context, + task_chunks_meta={}, + lifecycle_api=None, + meta_api=None, + ) + executor._ray_executor = MockRayExecutor + + # Test ShuffleManager.remove_object_refs + sm = ShuffleManager(subtask_graph) + sm._mapper_output_refs[0].fill(1) + sm.remove_object_refs(next(iter(sm._reducer_indices.keys()))) + assert pd.isnull(sm._mapper_output_refs[0][:, 0]).all() + sm._mapper_output_refs[0].fill(1) + sm.remove_object_refs(next(iter(sm._mapper_indices.keys()))) + assert pd.isnull(sm._mapper_output_refs[0][0]).all() + with pytest.raises(ValueError): + sm.remove_object_refs(None) + + original_execute_subtask_graph = executor._execute_subtask_graph + + async def _wait_gc_execute_subtask_graph( + stage_id, subtask_graph, chunk_graph, monitor_context + ): + # Mock _execute_subtask_graph to wait the monitor task done. 
+ await original_execute_subtask_graph( + stage_id, subtask_graph, chunk_graph, monitor_context + ) + await executor.monitor_tasks()[0] + assert pd.isnull(monitor_context.shuffle_manager._mapper_output_refs[0]).all() + + with mock.patch.object( + executor, "_execute_subtask_graph", _wait_gc_execute_subtask_graph + ), mock.patch("ray.get_runtime_context"): + async with executor: + await executor.execute_subtask_graph( + "mock_stage", subtask_graph, chunk_graph, tile_context + ) + await asyncio.sleep(0) + assert len(executor.monitor_tasks()) == 1 + assert executor.monitor_tasks()[0].done() + + assert len(task_context) == len(chunk_graph.results) + + +@require_ray +@pytest.mark.asyncio +async def test_slow_subtask_checker(): + subtasks = [ + Subtask(str(i), logic_key=f"logic_key1", logic_parallelism=5) for i in range(5) + ] + for s in subtasks: + s.runtime = _RaySubtaskRuntime() + submitted = OrderedSet() + completed = OrderedSet() + now = time.time() + checker = _RaySlowSubtaskChecker(5, submitted, completed) + updater = checker.update() + for s in subtasks: + submitted.add(s) + for _ in updater: + break + assert all(s.runtime.start_time >= now for s in subtasks) + await asyncio.sleep(0.01) + assert not any(checker.is_slow(s) for s in subtasks) + completed.add(subtasks[0]) + completed.add(subtasks[1]) + for _ in updater: + break + await asyncio.sleep(0.01) + completed.add(subtasks[2]) + assert not any(checker.is_slow(s) for s in subtasks[3:]) + completed.add(subtasks[3]) + for _ in updater: + break + assert not checker.is_slow(subtasks[4]) + await asyncio.sleep(0.1) + assert checker.is_slow(subtasks[4]) + + +@require_ray +@pytest.mark.asyncio +async def test_execute_slow_task(ray_start_regular_shared2): + t1 = mt.random.randint(10, size=(100, 10), chunk_size=10) + t2 = mt.random.randint(10, size=(100, 10), chunk_size=30) + t3 = t2 + t1 + t4 = t3.sum(0) + chunk_graph, subtask_graph = _gen_subtask_graph(t4) + task = Task("mock_task", "mock_session", TileableGraph([]), fuse_enabled=True) + mock_config = RayExecutionConfig.from_execution_config( + { + "backend": "ray", + "ray": { + "monitor_interval_seconds": 0, + "log_interval_seconds": 0, + "check_slow_subtasks_interval_seconds": 0, + "subtask_max_retries": 0, + "n_cpu": 1, + "n_worker": 1, + }, + } + ) + tile_context = MockTileContext() + executor = MockRayTaskExecutor( + config=mock_config, + task=task, + tile_context=tile_context, + task_context={}, + task_chunks_meta={}, + lifecycle_api=None, + meta_api=None, + ) + slow_subtask_id = list(subtask_graph)[-1].subtask_id + + def mock_execute_subtask(subtask_id, *args): + if subtask_id == slow_subtask_id: + time.sleep(1) + return execute_subtask(subtask_id, *args) + + executor._ray_executor = ray.remote(mock_execute_subtask) + + with mock.patch("logging.Logger.info") as log_patch: + async with executor: + await executor.execute_subtask_graph( + "mock_stage2", subtask_graph, chunk_graph, tile_context + ) + await asyncio.sleep(0) + assert len(executor.monitor_tasks()) == 1 + assert executor.monitor_tasks()[0].done() + assert log_patch.call_count > 0 + slow_ray_object_refs = set() + for c in log_patch.call_args_list: + if c.args[0] == "Slow tasks(%s): %s": + count, object_refs = c.args[1:] + assert count >= 1 + slow_ray_object_refs.update(object_refs) + assert len(slow_ray_object_refs) >= 1 diff --git a/python/xorbits/_mars/services/task/execution/utils.py b/python/xorbits/_mars/services/task/execution/utils.py new file mode 100644 index 000000000..854539d9e --- /dev/null +++ 
b/python/xorbits/_mars/services/task/execution/utils.py @@ -0,0 +1,96 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Dict, List + +from ....core import TileableGraph, TileContext +from ....resource import Resource +from ...lifecycle.api import LifecycleAPI + + +def get_band_resources_from_config( + backend_execution_config: Dict, +) -> List[Dict[str, Resource]]: + config = backend_execution_config + n_worker: int = config["n_worker"] + n_cpu: int = config["n_cpu"] + mem_bytes: int = config["mem_bytes"] + cuda_devices: List[List[int]] = config.get("cuda_devices") + + bands_to_resource = [] + worker_cpus = n_cpu // n_worker + cuda_devices = cuda_devices or ([[]] * n_worker) + if sum(len(devices) for devices in cuda_devices) == 0: + assert worker_cpus > 0, ( + f"{n_cpu} cpus are not enough " f"for {n_worker}, try to decrease workers." + ) + mem_bytes = mem_bytes // n_worker + for _, devices in zip(range(n_worker), cuda_devices): + worker_band_to_resource = dict() + if worker_cpus > 0: + worker_band_to_resource["numa-0"] = Resource( + num_cpus=worker_cpus, mem_bytes=mem_bytes + ) + for i in devices: + worker_band_to_resource[f"gpu-{i}"] = Resource(num_gpus=1) + bands_to_resource.append(worker_band_to_resource) + return bands_to_resource + + +class ResultTileablesLifecycle: + def __init__( + self, + tileable_graph: TileableGraph, + tile_context: TileContext, + lifecycle_api: LifecycleAPI, + ): + self._tileable_graph = tileable_graph + self._tile_context = tile_context + self._lifecycle_api = lifecycle_api + self._lifecycle_tracked_tileables = set() + self._lifecycle_untracked_tileables = set(self._tileable_graph.result_tileables) + + async def incref_tiled(self): + # track and incref result tileables if tiled + tracks = [], [] + new_track_tileables = set() + for tileable in self._lifecycle_untracked_tileables: + try: + tiled_tileable = self._tile_context[tileable] + except KeyError: + # not tiled, skip + pass + else: + tileable_key = tileable.key + tracks[0].append(tileable_key) + tracks[1].append( + self._lifecycle_api.track.delay( + tileable_key, [c.key for c in tiled_tileable.chunks] + ) + ) + new_track_tileables.add(tileable) + + if any(tracks): + # TODO(fyrestone): make the decref cancellation safe or + # make all the tileable ids unique. + self._lifecycle_untracked_tileables -= new_track_tileables + self._lifecycle_tracked_tileables |= new_track_tileables + await self._lifecycle_api.track.batch(*tracks[1]) + await self._lifecycle_api.incref_tileables(tracks[0]) + + async def decref_tracked(self): + await self._lifecycle_api.decref_tileables( + [t.key for t in self._lifecycle_tracked_tileables] + ) diff --git a/python/xorbits/_mars/services/task/supervisor/__init__.py b/python/xorbits/_mars/services/task/supervisor/__init__.py new file mode 100644 index 000000000..bd174cbc7 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import TaskSupervisorService diff --git a/python/xorbits/_mars/services/task/supervisor/graph_visualizer.py b/python/xorbits/_mars/services/task/supervisor/graph_visualizer.py new file mode 100644 index 000000000..0d09da9ad --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/graph_visualizer.py @@ -0,0 +1,148 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from io import StringIO +from typing import Dict, List + +from ....core.operand import Fetch, FetchShuffle +from ...subtask import Subtask, SubtaskGraph + + +class GraphVisualizer: + @classmethod + def to_dot(cls, subtask_graphs: List[SubtaskGraph]): + sio = StringIO() + sio.write("digraph {\n") + sio.write("splines=curved\n") + sio.write("rankdir=BT\n") + sio.write("graph [compound=true];\n") + subgraph_index = 0 + current_stage = 0 + result_chunk_to_subtask = dict() + line_colors = dict() + color_iter = iter(itertools.cycle(range(1, 9))) + for stage_line in itertools.combinations(range(len(subtask_graphs))[::-1], 2): + line_colors[stage_line] = f'"/spectral9/{next(color_iter)}"' + + for subtask_graph in subtask_graphs: + for subtask in subtask_graph.topological_iter(): + current_cluster = f"cluster_{subgraph_index}" + sio.write( + cls._export_subtask_to_dot( + subtask, + current_cluster, + current_stage, + line_colors, + result_chunk_to_subtask, + ) + ) + for c in subtask.chunk_graph.results: + result_chunk_to_subtask[c.key] = [current_stage, current_cluster] + subgraph_index += 1 + current_stage += 1 + sio.write("}") + return sio.getvalue() + + @classmethod + def _gen_chunk_key(cls, chunk, trunc_key): + if "_" in chunk.key: + key, index = chunk.key.split("_", 1) + return "_".join([key[:trunc_key], index]) + else: # pragma: no cover + return chunk.key[:trunc_key] + + @classmethod + def _export_subtask_to_dot( + cls, + subtask: Subtask, + subgraph_name: str, + current_stage: int, + line_colors: Dict, + chunk_key_to_subtask: Dict[str, List], + trunc_key: int = 5, + ): + chunk_graph = subtask.chunk_graph + sio = StringIO() + chunk_style = "[shape=box]" + operand_style = "[shape=circle]" + + visited = set() + all_nodes = [] + for node in chunk_graph.iter_nodes(): + op = node.op + if isinstance(node.op, (Fetch, FetchShuffle)): + continue + op_name = type(op).__name__ + if op.stage is not None: + op_name = f"{op_name}:{op.stage.name}" + if op.key in visited: + continue + for input_chunk in op.inputs or []: + if 
input_chunk.key not in visited and not isinstance( + input_chunk.op, (Fetch, FetchShuffle) + ): # pragma: no cover + node_name = f'"Chunk:{cls._gen_chunk_key(input_chunk, trunc_key)}"' + sio.write(f"{node_name} {chunk_style}\n") + all_nodes.append(node_name) + visited.add(input_chunk.key) + if op.key not in visited: + node_name = f'"{op_name}:{op.key[:trunc_key]}"' + sio.write(f"{node_name} {operand_style}\n") + all_nodes.append(node_name) + visited.add(op.key) + if ( + isinstance(input_chunk.op, (Fetch, FetchShuffle)) + and input_chunk.key in chunk_key_to_subtask + ): + stage, tail_cluster = chunk_key_to_subtask[input_chunk.key] + if stage == current_stage: + line_style = "style=bold" + else: + line_style = ( + f"style=bold color={line_colors[(current_stage, stage)]}" + ) + sio.write( + f'"Chunk:{cls._gen_chunk_key(input_chunk, trunc_key)}" ->' + f' "{op_name}:{op.key[:trunc_key]}" ' + f"[lhead={subgraph_name} ltail={tail_cluster} {line_style}];\n" + ) + else: + sio.write( + f'"Chunk:{cls._gen_chunk_key(input_chunk, trunc_key)}" -> ' + f'"{op_name}:{op.key[:trunc_key]}"\n' + ) + + for output_chunk in op.outputs or []: + if output_chunk.key not in visited: + node_name = f'"Chunk:{cls._gen_chunk_key(output_chunk, trunc_key)}"' + sio.write(f"{node_name} {chunk_style}\n") + all_nodes.append(node_name) + visited.add(output_chunk.key) + if op.key not in visited: + node_name = f'"{op_name}:{op.key[:trunc_key]}"' + sio.write(f"{node_name} {operand_style}\n") + all_nodes.append(node_name) + visited.add(op.key) + sio.write( + f'"{op_name}:{op.key[:trunc_key]}" -> ' + f'"Chunk:{cls._gen_chunk_key(output_chunk, trunc_key)}"\n' + ) + # write subgraph info + sio.write(f"subgraph {subgraph_name} {{\n") + nodes_str = " ".join(all_nodes) + sio.write(f"{nodes_str};\n") + sio.write(f'label="{subtask.subtask_id}";\n}}') + sio.write("\n") + return sio.getvalue() diff --git a/python/xorbits/_mars/services/task/supervisor/manager.py b/python/xorbits/_mars/services/task/supervisor/manager.py new file mode 100644 index 000000000..e92626b50 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/manager.py @@ -0,0 +1,389 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import contextlib +import importlib +import logging +import time +import weakref +from collections import defaultdict, deque +from dataclasses import dataclass +from typing import Any, Dict, List, Type + +from .... 
import oscar as mo +from ....core import TileableGraph, TileableType, TileContext, enter_mode +from ....core.operand import Fetch +from ....oscar.errors import ActorNotExist, ServerClosed +from ....utils import _is_ci, aiotask_wrapper +from ...subtask import SubtaskGraph, SubtaskResult +from ..config import task_options +from ..core import MapReduceInfo, Task, TaskStatus, new_task_id +from ..errors import TaskNotExist +from .preprocessor import TaskPreprocessor +from .processor import TaskProcessor +from .task import TaskProcessorActor + +logger = logging.getLogger(__name__) + + +class TaskConfigurationActor(mo.Actor): + def __init__( + self, + task_conf: Dict[str, Any], + execution_config: Dict[str, Any], + task_processor_cls: Type[TaskProcessor] = None, + task_preprocessor_cls: Type[TaskPreprocessor] = None, + ): + for name, value in task_conf.items(): + setattr(task_options, name, value) + self._execution_config = execution_config + self._task_processor_cls = task_processor_cls + self._task_preprocessor_cls = task_preprocessor_cls + + def get_config(self): + return { + "task_options": task_options, + "execution_config": self._execution_config, + "task_processor_cls": self._task_processor_cls, + "task_preprocessor_cls": self._task_preprocessor_cls, + } + + +class _RefHolder: + pass + + +@dataclass +class ResultTileableInfo: + tileable: TileableType + processor_ref: mo.ActorRefType[TaskProcessorActor] + ref_holder: _RefHolder + + +class TaskManagerActor(mo.Actor): + _task_id_to_processor_ref: Dict[str, mo.ActorRefType[TaskProcessorActor]] + _result_tileable_key_to_info: Dict[str, List[ResultTileableInfo]] + + def __init__(self, session_id: str): + self._session_id = session_id + + self._config = None + self._execution_config = None + self._task_processor_cls = None + self._task_preprocessor_cls = None + self._last_idle_time = None + + self._task_id_to_processor_ref = dict() + self._result_tileable_key_to_info = defaultdict(list) + + async def __post_create__(self): + # get config + configuration_ref = await mo.actor_ref( + TaskConfigurationActor.default_uid(), address=self.address + ) + task_conf = await configuration_ref.get_config() + ( + self._config, + self._execution_config, + self._task_processor_cls, + self._task_preprocessor_cls, + ) = ( + task_conf["task_options"], + task_conf["execution_config"], + task_conf["task_processor_cls"], + task_conf["task_preprocessor_cls"], + ) + self._task_preprocessor_cls = self._get_task_preprocessor_cls() + reserved_finish_tasks = task_conf["task_options"].reserved_finish_tasks + logger.info("Task manager reserves %s finish tasks.", reserved_finish_tasks) + self._reserved_finish_tasks = deque(maxlen=reserved_finish_tasks) + + async def __pre_destroy__(self): + # Avoid RuntimeError: dictionary changed size during iteration. 
+ coros = [ + processor_ref.destroy() + for processor_ref in self._task_id_to_processor_ref.values() + ] + await asyncio.gather(*coros) + + @staticmethod + def gen_uid(session_id): + return f"{session_id}_task_manager" + + @enter_mode(kernel=True) + async def submit_tileable_graph( + self, + graph: TileableGraph, + fuse_enabled: bool = None, + extra_config: dict = None, + ) -> str: + self._last_idle_time = None + # new task with task_name + task_id = new_task_id() + + uid = TaskProcessorActor.gen_uid(self._session_id, task_id) + # gen main task which mean each submission from user + processor_ref = await mo.create_actor( + TaskProcessorActor, + self._session_id, + task_id, + task_processor_cls=self._task_processor_cls, + address=self.address, + uid=uid, + ) + self._task_id_to_processor_ref[task_id] = processor_ref + + if fuse_enabled is None: + fuse_enabled = self._config.fuse_enabled + # gen task + task = Task( + task_id, + self._session_id, + graph, + fuse_enabled=fuse_enabled, + extra_config=extra_config, + ) + # gen task processor + tiled_context = await self._gen_tiled_context(graph) + await processor_ref.add_task( + task, + tiled_context, + self._config, + self._execution_config, + self._task_preprocessor_cls, + ) + + def _on_finalize(): + # The loop may be closed before the weakref is dead. + if loop.is_running(): + loop.create_task( + self._move_task_to_reserved(loop, task_id, processor_ref) + ) + + loop = asyncio.get_running_loop() + task_ref = _RefHolder() + weakref.finalize(task_ref, _on_finalize) + for tileable in graph.result_tileables: + info = ResultTileableInfo( + tileable=tileable, processor_ref=processor_ref, ref_holder=task_ref + ) + logger.debug( + "Add tileable info, task id: %s, tileable key: %s", + task_id, + tileable.key, + ) + self._result_tileable_key_to_info[tileable.key].append(info) + + return task_id + + @aiotask_wrapper(exit_if_exception=_is_ci) + async def _move_task_to_reserved(self, loop, task_id, processor_ref): + # TODO(fyrestone): Find a better way to wait and destroy the processor actor. 
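+ # Runs once no ResultTileableInfo references this task any more (scheduled by the
+ # weakref finalizer in submit_tileable_graph): wait for the task to finish, then
+ # park a fresh holder in the bounded `_reserved_finish_tasks` deque; when that
+ # holder is evicted, its own finalizer drops the task entry and destroys the
+ # processor actor.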
+ with contextlib.suppress(ActorNotExist, ServerClosed, ConnectionRefusedError): + await processor_ref.wait() + + logger.debug("Move task %s to reserved.", task_id) + ref_holder = _RefHolder() + self._reserved_finish_tasks.append(ref_holder) + + @aiotask_wrapper(exit_if_exception=_is_ci) + async def _destroy_actor(): + with contextlib.suppress( + ActorNotExist, ServerClosed, ConnectionRefusedError + ): + await processor_ref.destroy() + + def _remove_task(): + logger.debug("Remove task %s.", task_id) + self._task_id_to_processor_ref.pop(task_id, None) + if loop.is_running(): + loop.create_task(_destroy_actor()) + + weakref.finalize(ref_holder, _remove_task) + + async def get_subtask_graphs(self, task_id: str) -> List[SubtaskGraph]: + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return processor_ref.get_subtask_graphs(task_id) + + async def get_tileable_graph_dict_by_task_id(self, task_id: str): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: + raise TaskNotExist(f"Task {task_id} does not exist") + + res = await processor_ref.get_tileable_graph_as_dict() + return res + + async def get_tileable_details(self, task_id): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.get_tileable_details() + + async def get_tileable_subtasks(self, task_id, tileable_id, with_input_output): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.get_tileable_subtasks(tileable_id, with_input_output) + + async def _gen_tiled_context(self, graph: TileableGraph) -> TileContext: + # process graph, add fetch node to tiled context + tiled_context = TileContext() + for tileable in graph: + if isinstance(tileable.op, Fetch) and tileable.is_coarse(): + info_list = self._result_tileable_key_to_info[tileable.key] + assert info_list, f"The tileable {tileable.key} has no info." 
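+ # Resolve the fetch node against the most recent submission of this tileable key.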
+ info = info_list[-1] + tiled_context[tileable] = await info.processor_ref.get_result_tileable( + tileable.key + ) + return tiled_context + + def _get_task_preprocessor_cls(self): + if self._task_preprocessor_cls is not None: + assert isinstance(self._task_preprocessor_cls, str) + module, name = self._task_preprocessor_cls.rsplit(".", 1) + return getattr(importlib.import_module(module), name) + else: + return TaskPreprocessor + + async def wait_task(self, task_id: str, timeout: int = None): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return processor_ref.wait(timeout) + + async def cancel_task(self, task_id: str): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + yield processor_ref.cancel() + + async def get_task_results(self, progress: bool = False): + if not self._task_id_to_processor_ref: + raise mo.Return([]) + + results = yield asyncio.gather( + *[ref.result() for ref in self._task_id_to_processor_ref.values()] + ) + + if progress: + task_to_result = {res.task_id: res for res in results} + + progress_task_ids = [] + for res in results: + if res.status != TaskStatus.terminated: + progress_task_ids.append(res.task_id) + else: + res.progress = 1.0 + + progresses = yield asyncio.gather( + *[ + self._task_id_to_processor_ref[task_id].progress() + for task_id in progress_task_ids + ] + ) + for task_id, progress in zip(progress_task_ids, progresses): + task_to_result[task_id].progress = progress + + raise mo.Return(results) + + async def get_task_result(self, task_id: str): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.result() + + async def get_task_result_tileables(self, task_id: str): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.get_result_tileables() + + async def set_subtask_result(self, subtask_result: SubtaskResult): + task_id = subtask_result.task_id + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + # raise TaskNotExist(f'Task {task_id} does not exist') + logger.warning( + "Current task is finished, got stale result %s for subtask %s " + "which may be speculative execution from previous tasks, just ignore it.", + subtask_result.subtask_id, + subtask_result, + ) + return + + yield processor_ref.set_subtask_result(subtask_result) + + @mo.extensible + async def get_task_progress(self, task_id: str) -> float: + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.progress() + + async def get_last_idle_time(self): + if self._last_idle_time is None: + for processor_ref in self._task_id_to_processor_ref.values(): + if not await processor_ref.is_done(): + break + else: + self._last_idle_time = time.time() + return self._last_idle_time + + async def remove_tileables(self, tileable_keys: List[str]): + # TODO(fyrestone) yield if needed. 
+ logger.debug("Remove tileable info: %s", tileable_keys) + for key in tileable_keys: + info_list = self._result_tileable_key_to_info.pop(key, []) + if info_list: + processor_is_done = await asyncio.gather( + *(info.processor_ref.is_done() for info in info_list) + ) + not_done_info = [ + info + for info, is_done in zip(info_list, processor_is_done) + if not is_done + ] + self._result_tileable_key_to_info[key] = not_done_info + + async def get_map_reduce_info( + self, task_id: str, map_reduce_id: int + ) -> MapReduceInfo: + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.get_map_reduce_info(map_reduce_id) diff --git a/python/xorbits/_mars/services/task/supervisor/preprocessor.py b/python/xorbits/_mars/services/task/supervisor/preprocessor.py new file mode 100644 index 000000000..3cfdd566e --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/preprocessor.py @@ -0,0 +1,264 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import asyncio +import logging +from functools import partial +from typing import Callable, Dict, Iterable, List, Set + +from ....config import Config +from ....core import ChunkGraph, ChunkGraphBuilder, TileableGraph, TileContext +from ....core.graph.builder.chunk import Tiler, _TileableHandler +from ....core.operand import Fetch, ShuffleFetchType +from ....resource import Resource +from ....typing import BandType, ChunkType, TileableType +from ...subtask import Subtask, SubtaskGraph +from ..analyzer import GraphAnalyzer +from ..core import MapReduceInfo, Task + +logger = logging.getLogger(__name__) + + +class CancellableTiler(Tiler): + def __init__( + self, + tileable_graph: TileableGraph, + tile_context: TileContext, + processed_chunks: Set[str], + chunk_to_fetch: Dict[ChunkType, ChunkType], + add_nodes: Callable, + cancelled: asyncio.Event = None, + check_duplicated_submission: bool = False, + ): + super().__init__( + tileable_graph, tile_context, processed_chunks, chunk_to_fetch, add_nodes + ) + self._cancelled = cancelled + self._check_duplicated_submission = check_duplicated_submission + + @property + def cancelled(self): + return self._cancelled.is_set() + + def _gen_tileable_handlers(self, next_tileable_handlers: List[_TileableHandler]): + for tile_handler in super()._gen_tileable_handlers(next_tileable_handlers): + if not self.cancelled: + yield tile_handler + else: + break + + def _gen_result_chunks( + self, + chunk_graph: ChunkGraph, + next_tileable_handlers: List[_TileableHandler], + ): + if not self.cancelled: + return super()._gen_result_chunks(chunk_graph, next_tileable_handlers) + else: + return + + def _iter_without_check(self): + while self._tileable_handlers: + to_update_tileables = self._iter() + if not self.cancelled: + yield self._cur_chunk_graph + if not self.cancelled: + for t in to_update_tileables: + t.refresh_params() + else: + break + + def 
_iter_with_check(self): + chunk_set = set() + chunk_graphs = [] + for chunk_graph in self._iter_without_check(): + chunk_graphs.append(chunk_graph) + chunks = [] + for chunk in chunk_graph: + if isinstance(chunk.op, Fetch): + continue + if chunk in chunk_set: + raise RuntimeError(f"chunk {chunk} submitted repeatedly") + chunks.append(chunk) + chunk_set.update(chunks) + yield chunk_graph + + def __iter__(self): + if not self._check_duplicated_submission: + return self._iter_without_check() + else: + return self._iter_with_check() + + +class TaskPreprocessor: + __slots__ = ( + "_task", + "tileable_graph", + "tile_context", + "_config", + "tileable_optimization_records", + "chunk_optimization_records_list", + "_cancelled", + "_done", + "map_reduce_id_to_infos", + ) + + tile_context: TileContext + map_reduce_id_to_infos: Dict[int, MapReduceInfo] + + def __init__( + self, + task: Task, + tiled_context: TileContext = None, + config: Config = None, + ): + self._task = task + self.tileable_graph = task.tileable_graph + self._config = config + + self.tile_context = tiled_context + self.tileable_optimization_records = None + self.chunk_optimization_records_list = [] + self.map_reduce_id_to_infos = dict() + + self._cancelled = asyncio.Event() + self._done = asyncio.Event() + + def optimize(self) -> TileableGraph: + """ + Optimize tileable graph. + + Returns + ------- + optimized_graph: TileableGraph + + """ + from ....optimization.logical.tileable import ( + optimize as optimize_tileable_graph, + ) + + if self._config.optimize_tileable_graph: + # enable optimization + self.tileable_optimization_records = optimize_tileable_graph( + self.tileable_graph + ) + return self.tileable_graph + + def _fill_fetch_tileable_with_chunks(self, tileable_graph: TileableGraph): + for t in tileable_graph: + if isinstance(t.op, Fetch) and t in self.tile_context: + tiled = self.tile_context[t] + t._chunks = tiled.chunks + t._nsplits = tiled.nsplits + + def _get_tiler_cls(self) -> Callable: + extra_config = self._task.extra_config or dict() + check_duplicated_submission = extra_config.get( + "check_duplicated_submission", False + ) + return partial( + CancellableTiler, + cancelled=self._cancelled, + check_duplicated_submission=check_duplicated_submission, + ) + + def tile(self, tileable_graph: TileableGraph) -> Iterable[ChunkGraph]: + """ + Generate chunk graphs + + Returns + ------- + chunk_graph_generator: Generator + Chunk graphs. 
+ """ + from ....optimization.logical.chunk import optimize as optimize_chunk_graph + + self._fill_fetch_tileable_with_chunks(tileable_graph) + # iterative chunk graph builder + chunk_graph_builder = ChunkGraphBuilder( + tileable_graph, + fuse_enabled=self._task.fuse_enabled, + tile_context=self.tile_context, + tiler_cls=self._get_tiler_cls(), + ) + optimize = self._config.optimize_chunk_graph + for t in tileable_graph: + if hasattr(t.op, "logic_key") and t.op.logic_key is None: + t.op.logic_key = t.op.get_logic_key() + for chunk_graph in chunk_graph_builder.build(): + if len(chunk_graph) == 0: + continue + # optimize chunk graph + if optimize: + self.chunk_optimization_records_list.append( + optimize_chunk_graph(chunk_graph) + ) + yield chunk_graph + + def post_chunk_graph_execution(self): # pylint: disable=no-self-use + """Post calling after execution of current chunk graph""" + + def analyze( + self, + chunk_graph: ChunkGraph, + chunk_to_subtasks: Dict[ChunkType, Subtask], + available_bands: Dict[BandType, Resource], + stage_id: str = None, + op_to_bands: Dict[str, BandType] = None, + shuffle_fetch_type: ShuffleFetchType = None, + ) -> SubtaskGraph: + logger.debug("Start to gen subtask graph for task %s", self._task.task_id) + task = self._task + analyzer = GraphAnalyzer( + chunk_graph, + available_bands, + task, + self._config, + chunk_to_subtasks, + stage_id=stage_id, + shuffle_fetch_type=shuffle_fetch_type, + map_reduce_id_to_infos=self.map_reduce_id_to_infos, + ) + graph = analyzer.gen_subtask_graph(op_to_bands) + logger.debug( + "Generated subtask graph of %s subtasks for task %s", + len(graph), + self._task.task_id, + ) + return graph + + def _get_done(self): + return self._done.is_set() + + def _set_done(self, is_done: bool): + if is_done: + self._done.set() + else: # pragma: no cover + self._done.clear() + + done = property(_get_done, _set_done) + + def cancel(self): + self._cancelled.set() + + def get_tiled(self, tileable: TileableType): + tileable = tileable.data if hasattr(tileable, "data") else tileable + return self.tile_context[tileable] + + def get_map_reduce_info(self, map_reduce_id: int) -> MapReduceInfo: + return self.map_reduce_id_to_infos[map_reduce_id] + + def __await__(self): + return self._done.wait().__await__() diff --git a/python/xorbits/_mars/services/task/supervisor/processor.py b/python/xorbits/_mars/services/task/supervisor/processor.py new file mode 100644 index 000000000..90910c436 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/processor.py @@ -0,0 +1,471 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
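+# TaskProcessor drives a single task end to end: it optimizes the tileable graph,
+# tiles it into chunk graphs stage by stage, turns each chunk graph into a
+# subtask graph via the preprocessor's analyzer, submits that graph to the
+# TaskExecutor, and finally refreshes chunk/tileable metadata while recording
+# timing metrics and optional profiling data.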
+ +import asyncio +import logging +import os +import tempfile +import time +from typing import Dict, Iterator, List, Optional, Set + +from ....core import Chunk, ChunkGraph, TileableGraph, TileContext +from ....core.operand import Fetch +from ....metrics import Metrics +from ....optimization.logical import OptimizationRecords +from ....oscar.profiling import MARS_ENABLE_PROFILING, ProfilingData +from ....typing import ChunkType, TileableType +from ....utils import Timer +from ...subtask import Subtask, SubtaskResult +from ..core import MapReduceInfo, Task, TaskResult, TaskStatus, new_task_id +from ..execution.api import ExecutionChunkResult, TaskExecutor +from .preprocessor import TaskPreprocessor + +logger = logging.getLogger(__name__) + +MARS_ENABLE_DUMPING_SUBTASK_GRAPH = int(os.environ.get("MARS_DUMP_SUBTASK_GRAPH", 0)) + + +class TaskProcessor: + _tileable_to_subtasks: Dict[TileableType, List[Subtask]] + _tileable_id_to_tileable: Dict[str, TileableType] + _chunk_to_subtasks: Dict[ChunkType, Subtask] + _stage_tileables: Set[TileableType] + + def __init__( + self, + task: Task, + preprocessor: TaskPreprocessor, + executor: TaskExecutor, + ): + self._task = task + self._preprocessor = preprocessor + self._executor = executor + + self._tileable_id_to_tileable = dict() + self._chunk_to_subtasks = dict() + self._stage_tileables = set() + + if MARS_ENABLE_PROFILING: + ProfilingData.init(task.task_id) + elif task.extra_config and task.extra_config.get("enable_profiling"): + ProfilingData.init(task.task_id, task.extra_config["enable_profiling"]) + + self._dump_subtask_graph = False + self._subtask_graphs = [] + if MARS_ENABLE_DUMPING_SUBTASK_GRAPH or ( + task.extra_config and task.extra_config.get("dump_subtask_graph") + ): + self._dump_subtask_graph = True + + self.result = TaskResult( + task_id=task.task_id, + session_id=task.session_id, + start_time=time.time(), + status=TaskStatus.pending, + ) + self.done = asyncio.Event() + + # add metrics + self._chunk_graph_gen_time = Metrics.gauge( + "mars.chunk_graph_gen_time_secs", + "Time consuming in seconds to generate a chunk graph", + ("session_id", "task_id"), + ) + self._subtask_graph_gen_time = Metrics.gauge( + "mars.subtask_graph_gen_time_secs", + "Time consuming in seconds to generate a subtask graph", + ("session_id", "task_id", "stage_id"), + ) + self._task_execution_time = Metrics.gauge( + "mars.task_execution_time_secs", + "Time consuming in seconds to execute a task", + ("session_id", "task_id"), + ) + + @property + def task_id(self): + return self._task.task_id + + @property + def tileable_graph(self): + return self._preprocessor.tileable_graph + + @property + def tileable_id_to_tileable(self): + return self._tileable_id_to_tileable + + @property + def tile_context(self) -> TileContext: + return self._preprocessor.tile_context + + @property + def stage_processors(self): + # TODO(fyrestone): Remove it. 
+ return self._executor.get_stage_processors() + + def get_tiled(self, tileable: TileableType): + return self._preprocessor.get_tiled(tileable) + + def get_subtasks(self, chunks: List[ChunkType]) -> List[Subtask]: + return [self._chunk_to_subtasks[chunk] for chunk in chunks] + + def get_tileable_to_subtasks(self) -> Dict[TileableType, List[Subtask]]: + tile_context = self.tile_context + result = dict() + for tileable, infos in tile_context.get_tileable_tile_infos().items(): + subtasks = [] + for info in infos: + chunks = [ + c for c in info.generated_chunks if not isinstance(c.op, Fetch) + ] + subtasks.extend(self.get_subtasks(chunks)) + result[tileable] = subtasks + return result + + @staticmethod + async def _get_next_chunk_graph( + chunk_graph_iter: Iterator[ChunkGraph], + ) -> Optional[ChunkGraph]: + def next_chunk_graph(): + try: + return next(chunk_graph_iter) + except StopIteration: + return + + fut = asyncio.to_thread(next_chunk_graph) + chunk_graph = await fut + return chunk_graph + + async def _iter_stage_chunk_graph(self): + tileable_graph = self._preprocessor.tileable_graph + chunk_graph_iter = iter(self._preprocessor.tile(tileable_graph)) + while True: + with Timer() as stage_timer: + with Timer() as timer: + chunk_graph = await self._get_next_chunk_graph(chunk_graph_iter) + if chunk_graph is None: + # tile finished + self._preprocessor.done = True + return + stage_id = new_task_id() + stage_profiler = ProfilingData[self._task.task_id, "general"].nest( + f"stage_{stage_id}" + ) + stage_profiler.set(f"tile({len(chunk_graph)})", timer.duration) + logger.info( + "Time consuming to gen a chunk graph is %ss with session id %s, task id %s", + timer.duration, + self._task.session_id, + self._task.task_id, + ) + self._chunk_graph_gen_time.record( + timer.duration, + { + "session_id": self._task.session_id, + "task_id": self._task.task_id, + }, + ) + yield stage_id, stage_profiler, chunk_graph + + stage_profiler.set("total", stage_timer.duration) + + async def _process_stage_chunk_graph( + self, + stage_id: str, + stage_profiler, + chunk_graph: ChunkGraph, + ): + available_bands = await self._executor.get_available_band_resources() + meta_api = self._executor._meta_api + get_meta_tasks = [] + fetch_op_keys = [] + for c in chunk_graph.iter_indep(): + if isinstance(c.op, Fetch): + get_meta_tasks.append( + meta_api.get_chunk_meta.delay(c.key, fields=["bands"]) + ) + fetch_op_keys.append(c.op.key) + # TODO(fyrestone): A more general way to get the key to bands + # for all execution backends. 
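+        # Resolve which band currently holds the data of each Fetch chunk so the
+        # analyzer can take data locality into account; if the metadata is
+        # missing, fall back to an empty mapping below.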
+ try: + key_to_bands = await meta_api.get_chunk_meta.batch(*get_meta_tasks) + fetch_op_to_bands = dict( + (key, meta["bands"][0]) + for key, meta in zip(fetch_op_keys, key_to_bands) + ) + except (KeyError, IndexError): + fetch_op_to_bands = {} + shuffle_fetch_type = ( + self._executor.get_execution_config().get_shuffle_fetch_type() + ) + with Timer() as timer: + subtask_graph = await asyncio.to_thread( + self._preprocessor.analyze, + chunk_graph, + self._chunk_to_subtasks, + available_bands, + stage_id=stage_id, + op_to_bands=fetch_op_to_bands, + shuffle_fetch_type=shuffle_fetch_type, + ) + if self._dump_subtask_graph: + self._subtask_graphs.append(subtask_graph) + stage_profiler.set(f"gen_subtask_graph({len(subtask_graph)})", timer.duration) + logger.info( + "Time consuming to gen a subtask graph is %ss with session id %s, task id %s, stage id %s", + timer.duration, + self._task.session_id, + self._task.task_id, + stage_id, + ) + self._subtask_graph_gen_time.record( + timer.duration, + { + "session_id": self._task.session_id, + "task_id": self._task.task_id, + "stage_id": stage_id, + }, + ) + + tile_context = await asyncio.to_thread( + self._get_stage_tile_context, + {c for c in chunk_graph.result_chunks if not isinstance(c.op, Fetch)}, + ) + + with Timer() as timer: + chunk_to_result = await self._executor.execute_subtask_graph( + stage_id, subtask_graph, chunk_graph, tile_context + ) + stage_profiler.set("run", timer.duration) + + self._preprocessor.post_chunk_graph_execution() + if self._preprocessor.chunk_optimization_records_list: + optimization_records = self._preprocessor.chunk_optimization_records_list[ + -1 + ] + else: + optimization_records = None + self._update_stage_meta(chunk_to_result, tile_context, optimization_records) + + def _get_stage_tile_context(self, result_chunks: Set[Chunk]) -> TileContext: + collected = self._stage_tileables + tile_context = TileContext() + for tileable in self.tileable_graph: + if tileable in collected: + continue + tiled_tileable = self._preprocessor.tile_context.get(tileable) + if tiled_tileable is not None: + tileable_chunks = [c.data for c in tiled_tileable.chunks] + if any(c not in result_chunks for c in tileable_chunks): + continue + tile_context[tileable] = tiled_tileable + collected.add(tileable) + return tile_context + + @classmethod + def _update_stage_meta( + cls, + chunk_to_result: Dict[Chunk, ExecutionChunkResult], + tile_context: TileContext, + optimization_records: OptimizationRecords, + ): + for tiled_tileable in tile_context.values(): + cls._update_result_meta(chunk_to_result, tiled_tileable) + + for c, r in chunk_to_result.items(): + c.params = r.meta + original_chunk = ( + optimization_records and optimization_records.get_original_entity(c) + ) + if original_chunk is not None: + original_chunk.params = r.meta + + for tileable, tiled_tileable in tile_context.items(): + tiled_tileable.refresh_params() + tileable.params = tiled_tileable.params + + @classmethod + def _update_result_meta( + cls, chunk_to_result: Dict[Chunk, ExecutionChunkResult], tileable: TileableType + ): + from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE + from ....tensor.core import TENSOR_TYPE + + chunks = [c.data for c in tileable.chunks] + if isinstance(tileable, DATAFRAME_TYPE): + for c in chunks: + i, j = c.index + meta = chunk_to_result[c].meta + shape = meta.get("shape") + update_shape = shape is None + shape = shape if not update_shape else [None, None] + if i > 0: + # update dtypes_value + c0j = chunk_to_result[tileable.cix[0, j].data].meta + 
meta["dtypes_value"] = c0j["dtypes_value"] + if update_shape: + shape[1] = c0j["shape"][1] + if j > 0: + # update index_value + ci0 = chunk_to_result[tileable.cix[i, 0].data].meta + meta["index_value"] = ci0["index_value"] + if update_shape: + shape[0] = ci0["shape"][0] + if update_shape: + meta["shape"] = tuple(shape) + elif isinstance(tileable, SERIES_TYPE): + first_meta = chunk_to_result[chunks[0]].meta + for c in chunks: + i = c.index[0] + meta = chunk_to_result[c].meta + if i > 0: + meta["name"] = first_meta["name"] + meta["dtype"] = first_meta["dtype"] + elif isinstance(tileable, TENSOR_TYPE): + ndim = tileable.ndim + for i, c in enumerate(chunks): + meta = chunk_to_result[c].meta + if "shape" not in meta: + shape = [] + for i, ind in enumerate(c.index): + ind0 = [0] * ndim + ind0[i] = ind + c0 = tileable.cix[tuple(ind0)].data + shape.append(chunk_to_result[c0].meta["shape"][i]) + meta["shape"] = tuple(shape) + if i > 0: + first = chunk_to_result[chunks[0]].meta + meta["dtype"] = first["dtype"] + meta["order"] = first["order"] + + async def run(self): + try: + profiling = ProfilingData[self.task_id, "general"] + self.result.status = TaskStatus.running + # optimization + with Timer() as timer: + # optimization, run it in executor, + # since optimization may be a CPU intensive operation + await asyncio.to_thread(self._preprocessor.optimize) + + profiling.set("optimize", timer.duration) + + self._tileable_id_to_tileable = await asyncio.to_thread( + self._get_tileable_id_to_tileable, self._preprocessor.tileable_graph + ) + + async with self._executor: + async for stage_args in self._iter_stage_chunk_graph(): + await self._process_stage_chunk_graph(*stage_args) + except Exception as ex: + self.result.error = ex + self.result.traceback = ex.__traceback__ + finally: + self._gen_result() + self._finish() + + async def get_progress(self) -> float: + # get tileable proportion that is tiled + return await self._executor.get_progress() + + async def cancel(self): + self._preprocessor.cancel() + await self._executor.cancel() + + async def set_subtask_result(self, subtask_result: SubtaskResult): + await self._executor.set_subtask_result(subtask_result) + + @staticmethod + def _get_tileable_id_to_tileable( + tileable_graph: TileableGraph, + ) -> Dict[str, TileableType]: + tileable_id_to_tileable = dict() + + for tileable in tileable_graph: + tileable_id_to_tileable[str(tileable.key)] = tileable + + return tileable_id_to_tileable + + def _gen_result(self): + self.result.status = TaskStatus.terminated + self.result.end_time = time.time() + cost_time_secs = self.result.end_time - self.result.start_time + logger.info( + "Time consuming to execute a task is %ss with session id %s, task id %s", + cost_time_secs, + self._task.session_id, + self._task.task_id, + ) + self._task_execution_time.record( + cost_time_secs, + {"session_id": self._task.session_id, "task_id": self._task.task_id}, + ) + + def get_map_reduce_info(self, map_reduce_id: int) -> MapReduceInfo: + return self._preprocessor.get_map_reduce_info(map_reduce_id) + + def dump_subtask_graph(self): + from .graph_visualizer import GraphVisualizer + + try: # pragma: no cover + import graphviz + except ImportError: + graphviz = None + + dot = GraphVisualizer.to_dot(self._subtask_graphs) + directory = os.environ.get("MARS_DUMP_SUBTASK_GRAPH_DIR") + if directory is None: + directory = tempfile.gettempdir() + os.makedirs(directory, exist_ok=True) + file_name = f"mars-{self.task_id}" + logger.info( + "Subtask graph of task %s is stored in %s", + 
self._task.task_id, + os.path.join(directory, file_name), + ) + if graphviz is not None: # pragma: no cover + try: + g = graphviz.Source(dot) + g.view(file_name, directory=directory) + return + except graphviz.ExecutableNotFound: # pragma: no cover + logger.info("dot executable is not found, dump dot file instead.") + + with open(os.path.join(directory, file_name), "w") as f: + f.write(dot) + + def _finish(self): + self._executor.destroy() + self.done.set() + if self._dump_subtask_graph: + self.dump_subtask_graph() + if MARS_ENABLE_PROFILING or ( + self._task.extra_config and self._task.extra_config.get("enable_profiling") + ): + ProfilingData[self._task.task_id, "general"].set( + "total", time.time() - self.result.start_time + ) + serialization = ProfilingData[self._task.task_id, "serialization"] + if not serialization.empty(): + serialization.set( + "total", + sum(serialization.values()), + ) + data = ProfilingData.pop(self._task.task_id) + self.result.profiling = { + "supervisor": data, + } + + def is_done(self) -> bool: + return self.done.is_set() diff --git a/python/xorbits/_mars/services/task/supervisor/service.py b/python/xorbits/_mars/services/task/supervisor/service.py new file mode 100644 index 000000000..e42738f8a --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/service.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .manager import TaskConfigurationActor, TaskManagerActor + + +class TaskSupervisorService(AbstractService): + """ + Task service on supervisor. 
+ + Service Configuration + --------------------- + { + "task": { + "default_config": { + "optimize_tileable_graph": True, + "optimize_chunk_graph": True, + "fuse_enabled": True, + "reserved_finish_tasks": 10 + }, + "execution_config": { + "backend": "mars", + "mars": {}, + } + } + } + """ + + async def start(self): + task_config = self._config.get("task", dict()) + options = task_config.get("default_config", dict()) + execution_config = task_config.get("execution_config", dict()) + task_processor_cls = task_config.get("task_processor_cls") + task_preprocessor_cls = task_config.get("task_preprocessor_cls") + await mo.create_actor( + TaskConfigurationActor, + options, + execution_config=execution_config, + task_processor_cls=task_processor_cls, + task_preprocessor_cls=task_preprocessor_cls, + address=self._address, + uid=TaskConfigurationActor.default_uid(), + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + uid=TaskConfigurationActor.default_uid(), address=self._address + ) + ) + + async def create_session(self, session_id: str): + await mo.create_actor( + TaskManagerActor, + session_id, + address=self._address, + uid=TaskManagerActor.gen_uid(session_id), + ) + + async def destroy_session(self, session_id: str): + task_manager_ref = await mo.actor_ref( + self._address, TaskManagerActor.gen_uid(session_id) + ) + return await mo.destroy_actor(task_manager_ref) diff --git a/python/xorbits/_mars/services/task/supervisor/task.py b/python/xorbits/_mars/services/task/supervisor/task.py new file mode 100644 index 000000000..11dad0d3f --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/task.py @@ -0,0 +1,428 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import dataclasses +import importlib +import logging +from typing import Any, Dict, List, Optional, Set, Type + +from .... 
import oscar as mo +from ....config import Config +from ....core import TileContext +from ....core.operand import Fetch +from ....typing import TileableType +from ....utils import build_fetch +from ...subtask import SubtaskGraph, SubtaskResult, SubtaskStatus +from ..core import MapReduceInfo, Task, TaskStatus +from ..execution.api import TaskExecutor +from .preprocessor import TaskPreprocessor +from .processor import TaskProcessor + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class _TileableStageInfo: + progress: float + subtask_ids: Set[str] + + +@dataclasses.dataclass +class _TileableDetailInfo: + progress: float + subtask_count: int + status: int + properties: Dict[str, Any] + + +class _TaskInfoProcessorMixin: + _task_id_to_processor: Dict[str, TaskProcessor] + _tileable_to_details_cache: Dict[TileableType, _TileableDetailInfo] + + def _init_cache(self): + try: + return self._tileable_to_details_cache + except AttributeError: + cache = self._tileable_to_details_cache = dict() + return cache + + def _get_all_subtask_results(self) -> Dict[str, SubtaskResult]: + subtask_results = dict() + for processor in self._task_id_to_processor.values(): + for stage in processor.stage_processors: + for subtask, result in stage.subtask_results.items(): + subtask_results[subtask.subtask_id] = result + for subtask, result in stage.subtask_snapshots.items(): + if subtask.subtask_id in subtask_results: + continue + subtask_results[subtask.subtask_id] = result + return subtask_results + + def _get_tileable_infos(self) -> Dict[TileableType, _TileableDetailInfo]: + cache = self._init_cache() + + tileable_to_stage_infos: Dict[TileableType, List[_TileableStageInfo]] = dict() + for processor in self._task_id_to_processor.values(): + tile_context = processor.tile_context + for tileable, infos in tile_context.get_tileable_tile_infos().items(): + tileable_to_stage_infos[tileable] = [] + if tileable in cache: + # cached + continue + for info in infos: + chunks = [ + c for c in info.generated_chunks if not isinstance(c.op, Fetch) + ] + try: + subtask_ids = { + st.subtask_id for st in processor.get_subtasks(chunks) + } + except KeyError: # pragma: no cover + subtask_ids = None + stage_info = _TileableStageInfo( + progress=info.tile_progress, subtask_ids=subtask_ids + ) + tileable_to_stage_infos[tileable].append(stage_info) + + tileable_to_defails = dict() + subtask_id_to_results = self._get_all_subtask_results() + for tileable, infos in tileable_to_stage_infos.items(): + if tileable in cache: + # cached + tileable_to_defails[tileable] = cache[tileable] + continue + + statuses = set() + progress = 0.0 if not isinstance(tileable.op, Fetch) else 1.0 + n_subtask = 0 + for stage_info in infos: + tile_progress = stage_info.progress + stage_progress = 0.0 + if stage_info.subtask_ids is None: + continue + for subtask_id in stage_info.subtask_ids: + try: + result = subtask_id_to_results[subtask_id] + stage_progress += result.progress * tile_progress + statuses.add(result.status) + except KeyError: + # pending + statuses.add(SubtaskStatus.pending) + n_subtask += len(stage_info.subtask_ids) + if stage_info.subtask_ids: + progress += stage_progress / len(stage_info.subtask_ids) + else: + progress += tile_progress + + # calc status + if (not statuses or statuses == {SubtaskStatus.succeeded}) and abs( + progress - 1.0 + ) < 1e-3: + status = SubtaskStatus.succeeded + elif statuses == {SubtaskStatus.cancelled}: + status = SubtaskStatus.cancelled + elif statuses == {SubtaskStatus.pending}: + status = 
SubtaskStatus.pending + elif SubtaskStatus.errored in statuses: + status = SubtaskStatus.errored + else: + status = SubtaskStatus.running + + props = tileable.op.to_kv( + exclude_fields=("_key", "_id"), accept_value_types=(int, float, str) + ) + info = _TileableDetailInfo( + progress=progress, + subtask_count=n_subtask, + status=status.value, + properties=props, + ) + tileable_to_defails[tileable] = info + if status.is_done and tileable not in cache: + cache[tileable] = info + + return tileable_to_defails + + async def get_tileable_details(self): + tileable_to_details = yield asyncio.to_thread(self._get_tileable_infos) + raise mo.Return( + { + t.key: { + "progress": info.progress, + "subtaskCount": info.subtask_count, + "status": info.status, + "properties": info.properties, + } + for t, info in tileable_to_details.items() + } + ) + + def _get_tileable_graph_as_dict(self): + processor = list(self._task_id_to_processor.values())[-1] + tileable_graph = processor.tileable_graph + + node_list = [] + edge_list = [] + + visited = set() + + for chunk in tileable_graph: + if chunk.key in visited: # pragma: no cover + continue + visited.add(chunk.key) + + node_name = str(chunk.op) + + node_list.append({"tileableId": chunk.key, "tileableName": node_name}) + for inp, is_pure_dep in zip(chunk.inputs, chunk.op.pure_depends): + if inp not in tileable_graph: # pragma: no cover + continue + edge_list.append( + { + "fromTileableId": inp.key, + "toTileableId": chunk.key, + "linkType": 1 if is_pure_dep else 0, + } + ) + + graph_dict = {"tileables": node_list, "dependencies": edge_list} + return graph_dict + + async def get_tileable_graph_as_dict(self): + return await asyncio.to_thread(self._get_tileable_graph_as_dict) + + def _get_tileable_subtasks(self, tileable_id: str, with_input_output: bool): + returned_subtasks = dict() + subtask_id_to_types = dict() + + subtask_details = dict() + subtask_graph = subtask_results = subtask_snapshots = None + for processor in self._task_id_to_processor.values(): + tileable_to_subtasks = processor.get_tileable_to_subtasks() + tileable_id_to_tileable = processor.tileable_id_to_tileable + for stage in processor.stage_processors: + if tileable_id in tileable_id_to_tileable: + tileable = tileable_id_to_tileable[tileable_id] + returned_subtasks = { + subtask.subtask_id: subtask + for subtask in tileable_to_subtasks[tileable] + } + subtask_graph = stage.subtask_graph + subtask_results = stage.subtask_results + subtask_snapshots = stage.subtask_snapshots + break + if returned_subtasks: + break + + if subtask_graph is None: # pragma: no cover + return {} + + if with_input_output: + for subtask in list(returned_subtasks.values()): + for pred in subtask_graph.iter_predecessors(subtask): + if pred.subtask_id in returned_subtasks: # pragma: no cover + continue + returned_subtasks[pred.subtask_id] = pred + subtask_id_to_types[pred.subtask_id] = "Input" + for succ in subtask_graph.iter_successors(subtask): + if succ.subtask_id in returned_subtasks: # pragma: no cover + continue + returned_subtasks[succ.subtask_id] = succ + subtask_id_to_types[succ.subtask_id] = "Output" + + for subtask in returned_subtasks.values(): + subtask_result = subtask_results.get( + subtask, + subtask_snapshots.get( + subtask, + SubtaskResult( + progress=0.0, + status=SubtaskStatus.pending, + stage_id=subtask.stage_id, + ), + ), + ) + subtask_details[subtask.subtask_id] = { + "name": subtask.subtask_name, + "status": subtask_result.status.value, + "progress": subtask_result.progress, + "nodeType": 
subtask_id_to_types.get(subtask.subtask_id, "Calculation"), + } + + for subtask in returned_subtasks.values(): + pred_ids = [] + for pred in subtask_graph.iter_predecessors(subtask): + if pred.subtask_id in returned_subtasks: + pred_ids.append(pred.subtask_id) + subtask_details[subtask.subtask_id]["fromSubtaskIds"] = pred_ids + return subtask_details + + async def get_tileable_subtasks(self, tileable_id: str, with_input_output: bool): + return await asyncio.to_thread( + self._get_tileable_subtasks, tileable_id, with_input_output + ) + + +class TaskProcessorActor(mo.Actor, _TaskInfoProcessorMixin): + _task_id_to_processor: Dict[str, TaskProcessor] + _cur_processor: Optional[TaskProcessor] + + def __init__( + self, + session_id: str, + task_id: str, + task_processor_cls: Type[TaskPreprocessor] = None, + ): + self.session_id = session_id + self.task_id = task_id + + self._task_processor_cls = self._get_task_processor_cls(task_processor_cls) + self._task_id_to_processor = dict() + self._cur_processor = None + + @classmethod + def gen_uid(cls, session_id: str, task_id: str): + return f"task_processor_{session_id}_{task_id}" + + async def add_task( + self, + task: Task, + tiled_context: TileContext, + config: Config, + execution_config: Dict, + task_preprocessor_cls: Type[TaskPreprocessor], + ): + task_preprocessor = task_preprocessor_cls( + task, tiled_context=tiled_context, config=config + ) + task_executor = await TaskExecutor.create( + execution_config, + task=task, + session_id=self.session_id, + address=self.address, + tile_context=task_preprocessor.tile_context, + ) + processor = self._task_processor_cls( + task, + task_preprocessor, + task_executor, + ) + self._task_id_to_processor[task.task_id] = processor + + # tell self to start running + await self.ref().start.tell() + + @classmethod + def _get_task_processor_cls(cls, task_processor_cls): + if task_processor_cls is not None: # pragma: no cover + if isinstance(task_processor_cls, type): + return task_processor_cls + assert isinstance(task_processor_cls, str) + module, name = task_processor_cls.rsplit(".", 1) + return getattr(importlib.import_module(module), name) + else: + return TaskProcessor + + def _get_unprocessed_task_processor(self): + for processor in self._task_id_to_processor.values(): + if processor.result.status == TaskStatus.pending: + return processor + + async def start(self): + if self._cur_processor is not None: # pragma: no cover + # some processor is running + return + + processor = self._get_unprocessed_task_processor() + if processor is None: # pragma: no cover + return + self._cur_processor = processor + try: + yield processor.run() + finally: + self._cur_processor = None + + async def wait(self, timeout: int = None): + fs = [ + asyncio.ensure_future(processor.done.wait()) + for processor in self._task_id_to_processor.values() + ] + + _, pending = yield asyncio.wait(fs, timeout=timeout) + if not pending: + raise mo.Return(self.result()) + else: + _ = [fut.cancel() for fut in pending] + + async def cancel(self): + if self._cur_processor: + await self._cur_processor.cancel() + + def result(self): + terminated_result = None + for processor in self._task_id_to_processor.values(): + if processor.result.status != TaskStatus.terminated: + return processor.result + else: + terminated_result = processor.result + return terminated_result + + async def progress(self): + processor_progresses = [ + await processor.get_progress() + for processor in self._task_id_to_processor.values() + ] + return sum(processor_progresses) / 
len(processor_progresses) + + def get_result_tileables(self): + processor = list(self._task_id_to_processor.values())[-1] + tileable_graph = processor.tileable_graph + result = [] + for result_tileable in tileable_graph.result_tileables: + tiled = processor.get_tiled(result_tileable) + result.append(build_fetch(tiled)) + return result + + def get_subtask_graphs(self, task_id: str) -> List[SubtaskGraph]: + return [ + stage_processor.subtask_graph + for stage_processor in self._task_id_to_processor[task_id].stage_processors + ] + + def get_result_tileable(self, tileable_key: str): + processor = list(self._task_id_to_processor.values())[-1] + tileable_graph = processor.tileable_graph + for result_tileable in tileable_graph.result_tileables: + if result_tileable.key == tileable_key: + tiled = processor.get_tiled(result_tileable) + return build_fetch(tiled) + raise KeyError(f"Tileable {tileable_key} does not exist") # pragma: no cover + + async def set_subtask_result(self, subtask_result: SubtaskResult): + logger.debug( + "Set subtask %s with result %s.", subtask_result.subtask_id, subtask_result + ) + if self._cur_processor is not None: + yield self._cur_processor.set_subtask_result(subtask_result) + + async def get_map_reduce_info(self, map_reduce_id: int) -> MapReduceInfo: + for processor in self._task_id_to_processor.values(): + return processor.get_map_reduce_info(map_reduce_id) + + def is_done(self) -> bool: + for processor in self._task_id_to_processor.values(): + if not processor.is_done(): + return False + return True diff --git a/python/xorbits/_mars/services/task/supervisor/tests/__init__.py b/python/xorbits/_mars/services/task/supervisor/tests/__init__.py new file mode 100644 index 000000000..a0122bfa4 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/tests/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .task_preprocessor import CheckedTaskPreprocessor diff --git a/python/xorbits/_mars/services/task/supervisor/tests/task_preprocessor.py b/python/xorbits/_mars/services/task/supervisor/tests/task_preprocessor.py new file mode 100644 index 000000000..c98f73f10 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/tests/task_preprocessor.py @@ -0,0 +1,259 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
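+# CheckedTaskPreprocessor adds test-only consistency checks on top of
+# TaskPreprocessor: it validates that nsplits agree with chunk shapes after
+# tiling, optionally rejects duplicated operand/chunk submissions, and verifies
+# that shuffle reduce chunks keep reducer_index ordering consistent with
+# reducer_ordinal.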
+ +import itertools +from collections import defaultdict +from functools import partial +from typing import Callable, Dict, List + +import numpy as np + +from .....core import ( + OBJECT_TYPE, + ChunkGraph, + TileableType, + enter_mode, + register, + unregister, +) +from .....core.operand import Fetch, ShuffleProxy +from .....core.operand.shuffle import ShuffleFetchType +from .....resource import Resource +from .....tests.core import ObjectCheckMixin, _check_args +from .....typing import BandType, ChunkType +from ....subtask import Subtask, SubtaskGraph +from ...analyzer import GraphAnalyzer +from ..preprocessor import CancellableTiler, TaskPreprocessor + + +class CheckedTaskPreprocessor(ObjectCheckMixin, TaskPreprocessor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._raw_chunk_shapes = dict() + self._tileable_checked = dict() + + check_options = dict() + kwargs = self._task.extra_config or dict() + self._operand_tile_handlers = operand_tile_handlers = kwargs.pop( + "operand_tile_handlers", dict() + ) + for op, tile_handler in operand_tile_handlers.items(): + register(op, tile_handler) + for key in _check_args: + check_options[key] = kwargs.get(key, True) + self._check_options = check_options + self._check_duplicated_operand_keys = bool( + kwargs.get("check_duplicated_operand_keys") + ) + + def _get_done(self): + return super()._get_done() + + def _set_done(self, is_done: bool): + super()._set_done(is_done) + for op in self._operand_tile_handlers: + unregister(op) + + done = property(_get_done, _set_done) + + def _check_nsplits(self, tiled: TileableType): + if tiled.nsplits is None or (tiled.nsplits == () and len(tiled.chunks) == 1): + return + + nsplit_chunk_shape = tuple(len(s) for s in tiled.nsplits) + if nsplit_chunk_shape != tiled.chunk_shape: + raise AssertionError( + "Operand %r: shape of nsplits %r not consistent with chunk shape %r" + % (tiled.op, nsplit_chunk_shape, tiled.chunk_shape) + ) from None + + nsplit_shape = tuple(np.sum(s) for s in tiled.nsplits) + try: + self.assert_shape_consistent(nsplit_shape, tiled.shape) + except AssertionError: + raise AssertionError( + "Operand %r: shape computed from nsplits %r -> %r not consistent with real shape %r" + % (tiled.op, tiled.nsplits, nsplit_shape, tiled.shape) + ) from None + + for c in tiled.chunks: + try: + tiled_c = tiled.cix[c.index] + except ValueError as ex: + raise AssertionError( + "Operand %r: Malformed index %r, nsplits is %r. 
Raw error is %r" + % (c.op, c.index, tiled.nsplits, ex) + ) from None + + if tiled_c is not c: + raise AssertionError( + "Operand %r: Cannot spot chunk via index %r, nsplits is %r" + % (c.op, c.index, tiled.nsplits) + ) + for cid, shape in enumerate(itertools.product(*tiled.nsplits)): + chunk_shape = ( + self._raw_chunk_shapes.get(tiled.chunks[cid].key) + or tiled.chunks[cid].shape + ) + if len(shape) != len(chunk_shape): + raise AssertionError( + "Operand %r: Shape in nsplits %r does not meet shape in chunk %r" + % (tiled.chunks[cid].op, shape, chunk_shape) + ) + for s1, s2 in zip(shape, chunk_shape): + if (not (np.isnan(s1) and np.isnan(s2))) and s1 != s2: + raise AssertionError( + "Operand %r: Shape in nsplits %r does not meet shape in chunk %r" + % (tiled.chunks[cid].op, shape, chunk_shape) + ) + + def post_chunk_graph_execution(self): + for tileable in self.tileable_graph: + tiled_tileable = self.tile_context.get(tileable) + if ( + tiled_tileable is not None + and self._check_options["check_nsplits"] + and tileable.key not in self._tileable_checked + and not isinstance(tileable, OBJECT_TYPE) + ): + self._check_nsplits(tiled_tileable) + self._tileable_checked[tileable.key] = True + + def _get_tiler_cls(self) -> Callable: + extra_config = self._task.extra_config or dict() + check_duplicated_submission = extra_config.get( + "check_duplicated_submission", True + ) + return partial( + CancellableTiler, + cancelled=self._cancelled, + check_duplicated_submission=check_duplicated_submission, + ) + + @enter_mode(build=True) + def analyze( + self, + chunk_graph: ChunkGraph, + chunk_to_subtasks: Dict[ChunkType, Subtask], + available_bands: Dict[BandType, Resource], + stage_id: str, + op_to_bands: Dict[str, BandType] = None, + shuffle_fetch_type: ShuffleFetchType = None, + ) -> SubtaskGraph: + checked_chunks = set() + for tileable in self.tileable_graph: + try: + tiled = self.get_tiled(tileable) + self._check_shuffle_reduce_chunks(tiled.chunks, checked_chunks) + except KeyError: + pass + + # check if duplicated operand keys exist + if self._check_duplicated_operand_keys and len( + {c.key for c in chunk_graph} + ) < len( + chunk_graph + ): # pragma: no cover + raise AssertionError("Duplicated operands exist") + # record shapes generated in tile + for n in chunk_graph: + self._raw_chunk_shapes[n.key] = getattr(n, "shape", None) + task = self._task + analyzer = GraphAnalyzer( + chunk_graph, + available_bands, + task, + self._config, + chunk_to_subtasks, + shuffle_fetch_type=shuffle_fetch_type, + map_reduce_id_to_infos=self.map_reduce_id_to_infos, + ) + subtask_graph = analyzer.gen_subtask_graph() + results = set( + analyzer._chunk_to_copied[c] + for c in chunk_graph.results + if not isinstance(c.op, Fetch) + ) + for subtask in subtask_graph: + if subtask.extra_config is None: + subtask.extra_config = dict() + if all(c not in results for c in subtask.chunk_graph.results): + subtask.extra_config["check_all"] = False + else: + subtask.extra_config["check_keys"] = [ + c.key for c in subtask.chunk_graph.results if c in results + ] + proxy_chunks = [ + c for c in subtask.chunk_graph if isinstance(c.op, ShuffleProxy) + ] + if proxy_chunks: + assert len(proxy_chunks) == 1, proxy_chunks + proxy_chunk_key = proxy_chunks[0].key + proxy_chunk = next(c for c in chunk_graph if c.key == proxy_chunk_key) + reducer_chunks = chunk_graph.successors(proxy_chunk) + n_reducers_list = [c.op.n_reducers for c in reducer_chunks] + n_reducers = n_reducers_list[0] + reducer_ordinals = [c.op.reducer_ordinal for c in 
reducer_chunks] + assert set(reducer_ordinals).issubset(list(range(n_reducers))), ( + reducer_ordinals, + n_reducers, + ) + assert len(set(n_reducers_list)) == 1, n_reducers_list + mapper_chunks = chunk_graph.predecessors(proxy_chunk) + assert proxy_chunk.op.n_mappers == len(mapper_chunks), ( + proxy_chunk.op.n_mappers, + mapper_chunks, + ) + # If some reducer data are not used by downstream, then it won't be included in the chunk graph. + assert proxy_chunk.op.n_reducers >= n_reducers, ( + proxy_chunk.op.n_reducers, + n_reducers, + ) + return subtask_graph + + @classmethod + def _check_shuffle_reduce_chunks(cls, chunks: List, checked_chunks): + """Check shuffle reduce chunks sorted reducer_index consistent with reducer_ordinal. So shuffle mapper blocks + can be sorted by reducer_index, and the reducer can fetch mapper data by reducer_ordinal. + """ + chunks = [c for c in chunks or [] if c not in checked_chunks] + if not chunks: + return + from .....core.operand import MapReduceOperand, OperandStage, ShuffleProxy + + reduce_chunks = defaultdict(list) + for c in chunks: + checked_chunks.add(c) + if isinstance(c.op, MapReduceOperand) and c.op.stage == OperandStage.reduce: + shuffle_proxies = [ + c for c in c.inputs if isinstance(c.op, ShuffleProxy) + ] + assert len(shuffle_proxies) == 1, (c.inputs, shuffle_proxies) + reduce_chunks[shuffle_proxies[0]].append(c) + else: + cls._check_shuffle_reduce_chunks(c.inputs, checked_chunks) + for _, reduce_chunks in reduce_chunks.items(): + sorted_chunks_by_indices = sorted( + reduce_chunks, key=lambda c: c.op.reducer_index + ) + sorted_chunks_by_ordinals = sorted( + reduce_chunks, key=lambda c: c.op.reducer_ordinal + ) + for c1, c2 in zip(sorted_chunks_by_indices, sorted_chunks_by_ordinals): + assert c1.op.reducer_index == c2.op.reducer_index, ( + sorted_chunks_by_indices, + sorted_chunks_by_ordinals, + ) + for c in reduce_chunks: + cls._check_shuffle_reduce_chunks(c.inputs, checked_chunks) diff --git a/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager.py b/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager.py new file mode 100644 index 000000000..caff6ab21 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager.py @@ -0,0 +1,706 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import gc +import os +import sys +import tempfile +import time + +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from ..... import oscar as mo +from ..... import remote as mr +from ..... 
import tensor as mt +from .....conftest import MARS_CI_BACKEND +from .....core import Tileable, TileableGraph, TileableGraphBuilder +from .....core.operand import Fetch +from .....oscar.backends.allocate_strategy import MainPool +from .....resource import Resource +from .....storage import StorageLevel +from .....utils import Timer, merge_chunks +from ....cluster import MockClusterAPI +from ....lifecycle import MockLifecycleAPI +from ....meta import MetaAPI, MockMetaAPI, MockWorkerMetaAPI +from ....mutable import MockMutableAPI +from ....scheduling import MockSchedulingAPI +from ....session import MockSessionAPI +from ....storage import MockStorageAPI, StorageAPI +from ....subtask import MockSubtaskAPI +from ...core import TaskResult, TaskStatus +from ...execution.api import ExecutionConfig, Fetcher +from ..manager import TaskConfigurationActor, TaskManagerActor + + +@pytest.fixture +async def actor_pool(): + backend = MARS_CI_BACKEND + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=3, + labels=["main"] + ["numa-0"] * 2 + ["io"], + subprocess_start_method=start_method, + ) + + async with pool: + session_id = "test_session" + # create mock APIs + await MockClusterAPI.create( + pool.external_address, band_to_resource={"numa-0": Resource(num_cpus=2)} + ) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + meta_api = await MockMetaAPI.create(session_id, pool.external_address) + await MockWorkerMetaAPI.create(session_id, pool.external_address) + lifecycle_api = await MockLifecycleAPI.create(session_id, pool.external_address) + storage_api = await MockStorageAPI.create(session_id, pool.external_address) + await MockSchedulingAPI.create(session_id, pool.external_address) + await MockSubtaskAPI.create(pool.external_address) + await MockMutableAPI.create(session_id, pool.external_address) + + # create configuration + config = ExecutionConfig.from_params( + backend=backend, + n_worker=1, + n_cpu=2, + subtask_max_retries=3, + ) + await mo.create_actor( + TaskConfigurationActor, + dict(), + config.get_config_dict(), + uid=TaskConfigurationActor.default_uid(), + address=pool.external_address, + ) + # create task manager + manager = await mo.create_actor( + TaskManagerActor, + session_id, + uid=TaskManagerActor.gen_uid(session_id), + address=pool.external_address, + allocate_strategy=MainPool(), + ) + try: + yield backend, pool, session_id, meta_api, lifecycle_api, storage_api, manager + finally: + await MockStorageAPI.cleanup(pool.external_address) + await MockClusterAPI.cleanup(pool.external_address) + await MockMutableAPI.cleanup(session_id, pool.external_address) + + +async def _merge_data( + execution_backend: str, + fetch_tileable: Tileable, + meta_api: MetaAPI, + storage_api: StorageAPI, +): + async def _get_storage_api(band): + return storage_api + + fetcher = Fetcher.create(execution_backend, get_storage_api=_get_storage_api) + get_metas = [] + for chunk in fetch_tileable.chunks: + get_metas.append( + meta_api.get_chunk_meta.delay(chunk.key, fields=fetcher.required_meta_keys) + ) + metas = await meta_api.get_chunk_meta.batch(*get_metas) + for chunk, meta in zip(fetch_tileable.chunks, metas): + await fetcher.append(chunk.key, meta) + data = await fetcher.get() + index_data = [(c.index, d) for c, d in zip(fetch_tileable.chunks, data)] + return merge_chunks(index_data) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def 
test_run_task(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + graph = TileableGraph([b.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + np.testing.assert_array_equal(result, raw + 1) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts([b.key]))[0] == 1 + assert ( + await lifecycle_api.get_chunk_ref_counts( + [c.key for c in result_tileable.chunks] + ) + ) == [1] * len(result_tileable.chunks) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_run_tasks_with_same_name(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + c = a * 2 + + for t, e in zip([b, c], [raw + 1, raw * 2]): + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + np.testing.assert_array_equal(result, e) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_error_task(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + with mt.errstate(divide="raise"): + a = mt.ones((10, 10), chunk_size=5) + c = a / 0 + + graph = TileableGraph([c.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + assert task_result.error is not None + assert isinstance(task_result.error, FloatingPointError) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts([c.key]))[0] == 0 + assert len(await lifecycle_api.get_all_chunk_ref_counts()) == 0 + + +@pytest.mark.asyncio +async def test_cancel_task(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + def func(): + time.sleep(200) + + rs = [mr.spawn(func) for _ in range(10)] + + graph = TileableGraph([r.data for r in rs]) + 
next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await asyncio.sleep(0.5) + + with Timer() as timer: + await manager.cancel_task(task_id) + await manager.wait_task(task_id) + result = await manager.get_task_result(task_id) + assert result.status == TaskStatus.terminated + + assert timer.duration < 25 + + keys = [r.key for r in rs] + del rs + gc.collect() + await asyncio.sleep(0.5) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts(keys)) == [0] * len(keys) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_iterative_tiling(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + rs = np.random.RandomState(0) + raw_a = rs.rand(10, 10) + raw_b = rs.rand(10, 10) + a = mt.tensor(raw_a, chunk_size=5) + b = mt.tensor(raw_b, chunk_size=5) + + d = a[a[:, 0] < 3] + b[b[:, 0] < 3] + graph = TileableGraph([d.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + expect = raw_a[raw_a[:, 0] < 3] + raw_b[raw_b[:, 0] < 3] + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + np.testing.assert_array_equal(result, expect) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts([d.key]))[0] == 1 + assert ( + await lifecycle_api.get_chunk_ref_counts( + [c.key for c in result_tileable.chunks] + ) + ) == [1] * len(result_tileable.chunks) + + +@pytest.mark.asyncio +async def test_prune_in_iterative_tiling(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + raw = pd.DataFrame(np.random.RandomState(0).rand(1000, 10)) + df = md.DataFrame(raw, chunk_size=100) + df2 = df.groupby(0).agg("sum") + + graph = TileableGraph([df2.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=True) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + expect = raw.groupby(0).agg("sum") + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + pd.testing.assert_frame_equal(expect, result) + + subtask_graphs = await manager.get_subtask_graphs(task_id) + assert len(subtask_graphs) == 2 + + # the first subtask graph should have only 2 subtasks after pruning + assert len(subtask_graphs[0]) == 2 + nodes = [ + n + for st in subtask_graphs[0] + for n in st.chunk_graph + if not isinstance(n.op, Fetch) + ] + assert len(nodes) == 8 + result_nodes = [n for st in subtask_graphs[0] for n in st.chunk_graph.results] + 
assert len(result_nodes) == 4 + assert all("GroupByAgg" in str(n.op) for n in result_nodes) + + # second subtask graph + assert len(subtask_graphs[1]) == 6 + all_nodes = nodes + [ + n + for st in subtask_graphs[1] + for n in st.chunk_graph + if not isinstance(n.op, Fetch) + ] + assert len(all_nodes) == 28 + assert len({n.key for n in all_nodes}) == 28 + + df3 = df[df[0] < 1].rechunk(200) + + graph = TileableGraph([df3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=True) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + pd.testing.assert_frame_equal(raw, result) + + subtask_graphs = await manager.get_subtask_graphs(task_id) + assert len(subtask_graphs) == 2 + + # the first subtask graph + assert len(subtask_graphs[0]) == 5 + nodes = [ + n + for st in subtask_graphs[0] + for n in st.chunk_graph + if not isinstance(n.op, Fetch) + ] + assert len(nodes) == 40 + result_nodes = [n for st in subtask_graphs[0] for n in st.chunk_graph.results] + assert len(result_nodes) == 10 + + # second subtask graph + assert len(subtask_graphs[1]) == 5 + all_nodes = nodes + [ + n + for st in subtask_graphs[1] + for n in st.chunk_graph + if not isinstance(n.op, Fetch) + ] + assert len(all_nodes) == 45 + assert len({n.key for n in all_nodes}) == 45 + + +@pytest.mark.asyncio +async def test_shuffle(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + rs = np.random.RandomState(0) + raw = rs.rand(10, 10) + raw2 = rs.randint(10, size=(10,)) + a = mt.tensor(raw, chunk_size=5) + b = mt.tensor(raw2, chunk_size=5) + c = a[b] + + graph = TileableGraph([c.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + expect = raw[raw2] + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + np.testing.assert_array_equal(result, expect) + + # test generating map reduce info + subtask_graphs = (await manager.get_subtask_graphs(task_id))[0] + map_reduce_ids = [] + for subtask in subtask_graphs: + for chunk in subtask.chunk_graph.result_chunks: + map_reduce_id = getattr(chunk, "extra_params", dict()).get( + "analyzer_map_reduce_id" + ) + if map_reduce_id is not None: + map_reduce_ids.append(map_reduce_id) + assert len(map_reduce_ids) > 0 + map_reduce_info = await manager.get_map_reduce_info(task_id, map_reduce_ids[0]) + assert ( + len(set(map_reduce_info.reducer_indexes)) + == len(map_reduce_info.reducer_indexes) + == len(map_reduce_info.reducer_bands) + > 0 + ) + + # test ref counts + 
assert (await lifecycle_api.get_tileable_ref_counts([c.key]))[0] == 1 + assert ( + await lifecycle_api.get_chunk_ref_counts( + [c.key for c in result_tileable.chunks] + ) + ) == [1] * len(result_tileable.chunks) + await lifecycle_api.decref_tileables([c.key]) + ref_counts = await lifecycle_api.get_all_chunk_ref_counts() + assert len(ref_counts) == 0 + + # test if exists in storage + assert len(await storage_api.list(level=StorageLevel.MEMORY)) == 0 + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_numexpr(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + raw = np.random.rand(10, 10) + t = mt.tensor(raw, chunk_size=5) + t2 = (t + 1) * 2 - 0.3 + + graph = TileableGraph([t2.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=True) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + expect = (raw + 1) * 2 - 0.3 + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + np.testing.assert_array_equal(result, expect) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts([t2.key]))[0] == 1 + assert ( + await lifecycle_api.get_chunk_ref_counts( + [c.key for c in result_tileable.chunks] + ) + ) == [1] * len(result_tileable.chunks) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_optimization(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.csv") + + pdf = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + "d": list("abaaaddce"), + } + ) + pdf.to_csv(file_path, index=False) + + df = md.read_csv(file_path, incremental_index=True) + df2 = df.groupby("c").agg({"a": "sum"}) + df3 = df[["b", "a"]] + + graph = TileableGraph([df2.data, df3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + expect = pdf.groupby("c").agg({"a": "sum"}) + result_tileables = await manager.get_task_result_tileables(task_id) + result1 = result_tileables[0] + result = await _merge_data(execution_backend, result1, meta_api, storage_api) + np.testing.assert_array_equal(result, expect) + + expect = pdf[["b", "a"]] + result2 = result_tileables[1] + result = await _merge_data(execution_backend, result2, meta_api, storage_api) + np.testing.assert_array_equal(result, expect) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts([df3.key]))[0] == 1 + assert ( + await lifecycle_api.get_chunk_ref_counts( + [c.key for c in result_tileables[1].chunks] + ) + ) == [1] * 
len(result_tileables[1].chunks) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_dump_subtask_graph(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=20) + # groupby will generate multiple tasks + r = mdf.groupby("c2").agg("sum") + graph = TileableGraph([r.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph( + graph, + fuse_enabled=True, + extra_config={"dump_subtask_graph": True}, + ) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + pd.testing.assert_frame_equal(result.sort_index(), raw.groupby("c2").agg("sum")) + + # read dot file + file_path = os.path.join(tempfile.gettempdir(), f"mars-{task_id}") + with open(file_path) as f: + text = f.read() + assert "style=bold" in text + assert 'color="/spectral9/' in text + for c in result_tileable.chunks: + assert c.key[:5] in text + os.remove(file_path) + + pdf_path = os.path.join(tempfile.gettempdir(), f"mars-{task_id}.pdf") + if os.path.exists(pdf_path): + os.remove(pdf_path) diff --git a/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager_on_ray.py b/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager_on_ray.py new file mode 100644 index 000000000..b58fb2b6a --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager_on_ray.py @@ -0,0 +1,52 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ..... import oscar as mo +from .....oscar.backends.ray.utils import placement_group_info_to_addresses +from .....tests.core import require_ray +from .....utils import lazy_import +from ..manager import TaskConfigurationActor + +ray = lazy_import("ray") + + +@require_ray +@pytest.mark.asyncio +async def test_task_manager_creation(ray_start_regular): + mo.setup_cluster( + address_to_resources=placement_group_info_to_addresses( + "test_cluster", [{"CPU": 2}] + ) + ) + # the pool is an ActorHandle; it does not have an async context.
+ pool = await mo.create_actor_pool( + "ray://test_cluster/0/0", n_process=2, labels=[None] + ["numa-0"] * 2 + ) + assert pool + + # create configuration + await mo.create_actor( + TaskConfigurationActor, + dict(), + dict(), + uid=TaskConfigurationActor.default_uid(), + address="ray://test_cluster/0/0", + ) + + configuration_ref = await mo.actor_ref( + TaskConfigurationActor.default_uid(), address="ray://test_cluster/0/0" + ) + await configuration_ref.get_config() diff --git a/python/xorbits/_mars/services/task/tests/__init__.py b/python/xorbits/_mars/services/task/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/task/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/task/tests/test_service.py b/python/xorbits/_mars/services/task/tests/test_service.py new file mode 100644 index 000000000..fdf981f8a --- /dev/null +++ b/python/xorbits/_mars/services/task/tests/test_service.py @@ -0,0 +1,633 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import time + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... import oscar as mo +from .... import remote as mr +from .... import tensor as mt +from ....core import TileableGraph, TileableGraphBuilder, TileStatus, recursive_tile +from ....core.context import get_context +from ....resource import Resource +from ....tensor.core import TensorOrder +from ....tensor.operands import TensorOperand, TensorOperandMixin +from ....utils import Timer, build_fetch +from ... import NodeRole, start_services, stop_services +from ...meta import MetaAPI +from ...session import SessionAPI +from ...storage import MockStorageAPI +from ...subtask import SubtaskStatus +from ...web import WebActor +from .. 
import TaskAPI, TaskStatus, WebTaskAPI +from ..errors import TaskNotExist +from ..supervisor.processor import TaskProcessor + + +@pytest.fixture +async def actor_pools(): + async def start_pool(is_worker: bool): + if is_worker: + kw = dict( + n_process=3, + labels=["main"] + ["numa-0"] * 2 + ["io"], + subprocess_start_method="spawn", + ) + else: + kw = dict(n_process=1, subprocess_start_method="spawn") + pool = await mo.create_actor_pool("127.0.0.1", **kw) + await pool.start() + return pool + + sv_pool, worker_pool = await asyncio.gather(start_pool(False), start_pool(True)) + try: + yield sv_pool, worker_pool + finally: + await asyncio.gather(sv_pool.stop(), worker_pool.stop()) + + +async def _start_services( + supervisor_pool, worker_pool, request, task_processor_cls=None +): + config = { + "services": [ + "cluster", + "session", + "meta", + "lifecycle", + "scheduling", + "subtask", + "task", + "mutable", + ], + "cluster": { + "backend": "fixed", + "lookup_address": supervisor_pool.external_address, + "resource": {"numa-0": Resource(num_cpus=2)}, + }, + "meta": {"store": "dict"}, + "scheduling": {}, + "task": {}, + } + if task_processor_cls: + config["task"]["task_processor_cls"] = task_processor_cls + if request: + config["services"].append("web") + await start_services( + NodeRole.SUPERVISOR, config, address=supervisor_pool.external_address + ) + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + session_id = "test_session" + session_api = await SessionAPI.create(supervisor_pool.external_address) + await session_api.create_session(session_id) + + if not request.param: + task_api = await TaskAPI.create(session_id, supervisor_pool.external_address) + else: + web_actor = await mo.actor_ref( + WebActor.default_uid(), address=supervisor_pool.external_address + ) + web_address = await web_actor.get_web_address() + task_api = WebTaskAPI(session_id, web_address) + + assert await task_api.get_task_results() == [] + + # create mock meta and storage APIs + _ = await MetaAPI.create(session_id, supervisor_pool.external_address) + storage_api = await MockStorageAPI.create(session_id, worker_pool.external_address) + return task_api, storage_api, config + + +@pytest.mark.parametrize(indirect=True) +@pytest.fixture(params=[False, True]) +async def start_test_service(actor_pools, request): + sv_pool, worker_pool = actor_pools + + task_api, storage_api, config = await _start_services(sv_pool, worker_pool, request) + + try: + yield sv_pool.external_address, task_api, storage_api + finally: + await MockStorageAPI.cleanup(worker_pool.external_address) + await stop_services(NodeRole.WORKER, config, worker_pool.external_address) + await stop_services(NodeRole.SUPERVISOR, config, sv_pool.external_address) + + +class MockTaskProcessor(TaskProcessor): + @classmethod + def _get_decref_stage_chunk_keys(cls, stage_processor): + import time + + # time.sleep to block async thread + time.sleep(5) + return super()._get_decref_stage_chunk_keys(stage_processor) + + +@pytest.mark.parametrize(indirect=True) +@pytest.fixture(params=[True]) +async def start_test_service_with_mock(actor_pools, request): + sv_pool, worker_pool = actor_pools + + task_api, storage_api, config = await _start_services( + sv_pool, + worker_pool, + request, + task_processor_cls="xorbits._mars.services.task.tests.test_service.MockTaskProcessor", + ) + + try: + yield sv_pool.external_address, task_api, storage_api + finally: + await MockStorageAPI.cleanup(worker_pool.external_address) + await stop_services(NodeRole.WORKER,
config, worker_pool.external_address) + await stop_services(NodeRole.SUPERVISOR, config, sv_pool.external_address) + + +@pytest.mark.asyncio +async def test_task_timeout_execution(start_test_service_with_mock): + _sv_pool_address, task_api, storage_api = start_test_service_with_mock + + def f1(): + return np.arange(5) + + def f2(): + return np.arange(5, 10) + + def f3(f1r, f2r): + return np.concatenate([f1r, f2r]).sum() + + r1 = mr.spawn(f1) + r2 = mr.spawn(f2) + r3 = mr.spawn(f3, args=(r1, r2)) + + graph = TileableGraph([r3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + assert await task_api.get_last_idle_time() is None + assert isinstance(task_id, str) + + await task_api.wait_task(task_id, timeout=2) + task_result = await task_api.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + + +@pytest.mark.asyncio +async def test_task_execution(start_test_service): + _sv_pool_address, task_api, storage_api = start_test_service + + def f1(): + return np.arange(5) + + def f2(): + return np.arange(5, 10) + + def f3(f1r, f2r): + return np.concatenate([f1r, f2r]).sum() + + r1 = mr.spawn(f1) + r2 = mr.spawn(f2) + r3 = mr.spawn(f3, args=(r1, r2)) + + graph = TileableGraph([r3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + assert await task_api.get_last_idle_time() is None + assert isinstance(task_id, str) + + await task_api.wait_task(task_id) + task_result = await task_api.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + assert await task_api.get_last_idle_time() is not None + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + + result_tileable = (await task_api.get_fetch_tileables(task_id))[0] + data_key = result_tileable.chunks[0].key + assert await storage_api.get(data_key) == 45 + + +@pytest.mark.asyncio +async def test_task_error(start_test_service): + _sv_pool_address, task_api, storage_api = start_test_service + + # test job cancel + def f1(): + raise SystemError + + rs = [mr.spawn(f1) for _ in range(10)] + + graph = TileableGraph([r.data for r in rs]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + + await task_api.wait_task(task_id, timeout=10) + results = await task_api.get_task_results(progress=True) + assert isinstance(results[0].error, SystemError) + + +@pytest.mark.asyncio +async def test_task_cancel(start_test_service): + _sv_pool_address, task_api, storage_api = start_test_service + + # test job cancel + def f1(): + time.sleep(100) + + rs = [mr.spawn(f1) for _ in range(10)] + + graph = TileableGraph([r.data for r in rs]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + await asyncio.sleep(1) + with Timer() as timer: + await task_api.cancel_task(task_id) + await asyncio.sleep(1) + result = await task_api.get_task_result(task_id) + assert result.status == TaskStatus.terminated + assert timer.duration < 20 + await asyncio.sleep(0.1) + assert await task_api.get_last_idle_time() is not None + + results = await task_api.get_task_results(progress=True) + assert all(result.status == TaskStatus.terminated for result in results) + + +class _ProgressController: + def __init__(self): + self._step_event = asyncio.Event() + + async def wait(self): + await self._step_event.wait() 
+ self._step_event.clear() + + def set(self): + self._step_event.set() + + +@pytest.mark.asyncio +async def test_task_progress(start_test_service): + sv_pool_address, task_api, storage_api = start_test_service + + session_api = await SessionAPI.create(address=sv_pool_address) + ref = await session_api.create_remote_object( + task_api._session_id, "progress_controller", _ProgressController + ) + + def f1(count: int): + progress_controller = get_context().get_remote_object("progress_controller") + for idx in range(count): + progress_controller.wait() + get_context().set_progress((1 + idx) * 1.0 / count) + + r = mr.spawn(f1, args=(2,)) + + graph = TileableGraph([r.data]) + next(TileableGraphBuilder(graph).build()) + + await task_api.submit_tileable_graph(graph, fuse_enabled=False) + + await asyncio.sleep(0.2) + results = await task_api.get_task_results(progress=True) + assert results[0].progress == 0.0 + + await ref.set() + await asyncio.sleep(1) + results = await task_api.get_task_results(progress=True) + assert results[0].progress == 0.5 + + await ref.set() + await asyncio.sleep(1) + results = await task_api.get_task_results(progress=True) + assert results[0].progress == 1.0 + + +class _TileProgressOperand(TensorOperand, TensorOperandMixin): + @classmethod + def tile(cls, op: "_TileProgressOperand"): + progress_controller = get_context().get_remote_object("progress_controller") + + t = yield from recursive_tile(mt.random.rand(10, 10, chunk_size=5)) + yield TileStatus(t.chunks, progress=0.25) + progress_controller.wait() + + new_op = op.copy() + params = op.outputs[0].params.copy() + params["chunks"] = t.chunks + params["nsplits"] = t.nsplits + return new_op.new_tileables(t.inputs, kws=[params]) + + +@pytest.mark.asyncio +async def test_task_tile_progress(start_test_service): + sv_pool_address, task_api, storage_api = start_test_service + + session_api = await SessionAPI.create(address=sv_pool_address) + ref = await session_api.create_remote_object( + task_api._session_id, "progress_controller", _ProgressController + ) + + t = _TileProgressOperand(dtype=np.dtype(np.float64)).new_tensor( + None, (10, 10), order=TensorOrder.C_ORDER + ) + + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + + await task_api.submit_tileable_graph(graph, fuse_enabled=False) + + await asyncio.sleep(1) + results = await task_api.get_task_results(progress=True) + assert results[0].progress == 0.25 + + await ref.set() + await asyncio.sleep(1) + results = await task_api.get_task_results(progress=True) + assert results[0].progress == 1.0 + + +@pytest.mark.asyncio +async def test_get_tileable_graph(start_test_service): + _sv_pool_address, task_api, storage_api = start_test_service + + def f1(): + return np.arange(5) + + def f2(): + return np.arange(5, 10) + + def f3(f1r, f2r): + return np.concatenate([f1r, f2r]).sum() + + r1 = mr.spawn(f1) + r2 = mr.spawn(f2) + r3 = mr.spawn(f3, args=(r1, r2)) + + graph = TileableGraph([r3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + try: + with pytest.raises(TaskNotExist): + await task_api.get_tileable_graph_as_json("non_exist") + + tileable_detail = await task_api.get_tileable_graph_as_json(task_id) + + num_tileable = len(tileable_detail.get("tileables")) + num_dependencies = len(tileable_detail.get("dependencies")) + assert num_tileable > 0 + assert num_dependencies <= (num_tileable / 2) * (num_tileable / 2) + + assert (num_tileable == 1 and num_dependencies == 0) or ( + 
num_tileable > 1 and num_dependencies > 0 + ) + + graph_nodes = [] + graph_dependencies = [] + for node in graph.iter_nodes(): + graph_nodes.append(node.key) + + for node_successor in graph.iter_successors(node): + graph_dependencies.append( + { + "fromTileableId": node.key, + "toTileableId": node_successor.key, + "linkType": 0, + } + ) + + for tileable in tileable_detail.get("tileables"): + graph_nodes.remove(tileable.get("tileableId")) + + assert len(graph_nodes) == 0 + + for i in range(num_dependencies): + dependency = tileable_detail.get("dependencies")[i] + assert graph_dependencies[i] == dependency + finally: + await task_api.wait_task(task_id, timeout=120) + + +@pytest.mark.asyncio +async def test_get_tileable_details(start_test_service): + sv_pool_address, task_api, storage_api = start_test_service + + session_api = await SessionAPI.create(address=sv_pool_address) + ref = await session_api.create_remote_object( + task_api._session_id, "progress_controller", _ProgressController + ) + + with pytest.raises(TaskNotExist): + await task_api.get_tileable_details("non_exist") + + def f(*_args, raises=False): + get_context().set_progress(0.5) + if raises: + raise ValueError + progress_controller = get_context().get_remote_object("progress_controller") + progress_controller.wait() + get_context().set_progress(1.0) + + # test non-fused DAGs + r1 = mr.spawn(f) + r2 = mr.spawn(f, args=(r1, 0)) + r3 = mr.spawn(f, args=(r1, 1)) + + graph = TileableGraph([r2.data, r3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + + def _get_fields(details, field, wrapper=None): + rs = [r1, r2, r3] + ret = [details[r.key][field] for r in rs] + if wrapper: + ret = [wrapper(v) for v in ret] + return ret + + await asyncio.sleep(1) + details = await task_api.get_tileable_details(task_id) + assert _get_fields(details, "progress") == [0.5, 0.0, 0.0] + assert ( + _get_fields(details, "status", SubtaskStatus) + == [SubtaskStatus.running] + [SubtaskStatus.pending] * 2 + ) + + await ref.set() + await asyncio.sleep(1) + details = await task_api.get_tileable_details(task_id) + assert _get_fields(details, "progress") == [1.0, 0.5, 0.5] + assert ( + _get_fields(details, "status", SubtaskStatus) + == [SubtaskStatus.succeeded] + [SubtaskStatus.running] * 2 + ) + + await ref.set() + await task_api.wait_task(task_id) + + # test fused DAGs + r5 = mr.spawn(f, args=(0,)) + r6 = mr.spawn(f, args=(r5,)) + + graph = TileableGraph([r6.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=True) + + await asyncio.sleep(1) + details = await task_api.get_tileable_details(task_id) + assert details[r5.key]["progress"] == details[r6.key]["progress"] == 0.25 + + await ref.set() + await asyncio.sleep(0.1) + await ref.set() + await task_api.wait_task(task_id) + + # test raises + r7 = mr.spawn(f, kwargs={"raises": 1}) + + graph = TileableGraph([r7.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=True) + await task_api.wait_task(task_id) + details = await task_api.get_tileable_details(task_id) + assert details[r7.key]["status"] == SubtaskStatus.errored.value + + for tileable in details.keys(): + for property_key, property_value in ( + details.get(tileable).get("properties").items() + ): + assert property_key != "key" + assert property_key != "id" + assert isinstance(property_value, (int, float, str)) + + # test merge + d1 = 
pd.DataFrame({"a": np.random.rand(100), "b": np.random.randint(3, size=100)}) + d2 = pd.DataFrame({"c": np.random.rand(100), "b": np.random.randint(3, size=100)}) + df1 = md.DataFrame(d1, chunk_size=10) + df2 = md.DataFrame(d2, chunk_size=10) + + graph = TileableGraph([df1.data, df2.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=True) + await task_api.wait_task(task_id) + details = await task_api.get_tileable_details(task_id) + assert details[df1.key]["progress"] == details[df2.key]["progress"] == 1.0 + + f1 = build_fetch(df1) + f2 = build_fetch(df2) + df3 = f1.merge(f2, auto_merge="none", bloom_filter=False) + graph = TileableGraph([df3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=True) + await task_api.wait_task(task_id) + for _ in range(2): + # get twice to ensure cache work + details = await task_api.get_tileable_details(task_id) + assert ( + details[df3.key]["progress"] + == details[f1.key]["progress"] + == details[f2.key]["progress"] + == 1.0 + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("with_input_output", [False, True]) +async def test_get_tileable_subtasks(start_test_service, with_input_output): + sv_pool_address, task_api, storage_api = start_test_service + + def a(): + return md.DataFrame([[1, 2], [3, 4]]) + + def b(): + return md.DataFrame([[1, 2, 3, 4], [4, 3, 2, 1]]) + + def c(a, b): + return ( + a.sum() + * a.product() + * b.sum() + * a.sum() + / a.sum() + * b.product() + / a.product() + ) + + ra = mr.spawn(a) + rb = mr.spawn(b) + rc = mr.spawn(c, args=(ra, rb)) + + graph = TileableGraph([rc.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + + await asyncio.sleep(1) + + try: + tileable_graph_json = await task_api.get_tileable_graph_as_json(task_id) + for tileable_json in tileable_graph_json["tileables"]: + tileable_id = tileable_json["tileableId"] + subtask_details = await task_api.get_tileable_subtasks( + task_id, tileable_id, True + ) + + subtask_deps = [] + for subtask_id, subtask_detail in subtask_details.items(): + for from_subtask_id in subtask_detail.get("fromSubtaskIds", ()): + subtask_deps.append((from_subtask_id, subtask_id)) + assert len(subtask_details) > 0 + + for from_id, to_id in subtask_deps: + assert from_id in subtask_details + assert to_id in subtask_details + + if with_input_output: + tileable_inputs = [ + dep["fromTileableId"] + for dep in tileable_graph_json["dependencies"] + if dep["toTileableId"] == tileable_id + ] + tileable_outputs = [ + dep["toTileableId"] + for dep in tileable_graph_json["dependencies"] + if dep["fromTileableId"] == tileable_id + ] + if tileable_inputs: + assert any( + detail["nodeType"] == "Input" + for detail in subtask_details.values() + ) + if tileable_outputs: + assert any( + detail["nodeType"] == "Output" + for detail in subtask_details.values() + ) + finally: + await task_api.wait_task(task_id, timeout=120) diff --git a/python/xorbits/_mars/services/task/worker/__init__.py b/python/xorbits/_mars/services/task/worker/__init__.py new file mode 100644 index 000000000..55b7ebca7 --- /dev/null +++ b/python/xorbits/_mars/services/task/worker/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import EmptyService + + +class TaskWorkerService(EmptyService): + pass diff --git a/python/xorbits/_mars/services/tests/__init__.py b/python/xorbits/_mars/services/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/tests/fault_injection_manager.py b/python/xorbits/_mars/services/tests/fault_injection_manager.py new file mode 100644 index 000000000..e6e6497f6 --- /dev/null +++ b/python/xorbits/_mars/services/tests/fault_injection_manager.py @@ -0,0 +1,98 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum +import os +import uuid +from abc import ABC, abstractmethod + +from ...core.base import MarsError +from ..session import SessionAPI + + +class ExtraConfigKey: + FAULT_INJECTION_MANAGER_NAME = "fault_injection_manager_name" + + +class FaultPosition(enum.Enum): + ON_EXECUTE_OPERAND = 0 + ON_RUN_SUBTASK = 1 + + +class FaultType(enum.Enum): + NoFault = 0 + Exception = 1 + UnhandledException = 2 + ProcessExit = 3 + + +class FaultInjectionError(MarsError): + pass + + +class FaultInjectionUnhandledError(Exception): + pass + + +def handle_fault(fault): + if fault == FaultType.Exception: + raise FaultInjectionError("Fault Injection") + elif fault == FaultType.UnhandledException: + raise FaultInjectionUnhandledError("Fault Injection Unhandled") + elif fault == FaultType.ProcessExit: + # used to simulate process crash, no cleanup. + os._exit(-1) + assert fault == FaultType.NoFault, f"Got unexpected fault: {fault}" + + +class AbstractFaultInjectionManager(ABC): + """ + The abstract base of fault injection manager for test. + """ + + name = str(uuid.uuid4()) + + @abstractmethod + def get_fault(self, pos: FaultPosition, ctx=None) -> FaultType: + """ + Get fault at position. + + Parameters + ---------- + pos + The fault position. + ctx + The fault context. + + Returns + ------- + The fault type. 
+ """ + pass + + @classmethod + async def create(cls, session_id, supervisor_address): + """ + Create the fault injection manager on supervisor. + + Parameters + ---------- + session_id + The session id. + supervisor_address + The supervisor address. + ------- + """ + session_api = await SessionAPI.create(supervisor_address) + await session_api.create_remote_object(session_id, cls.name, cls) diff --git a/python/xorbits/_mars/services/tests/fault_injection_patch.py b/python/xorbits/_mars/services/tests/fault_injection_patch.py new file mode 100644 index 000000000..8d09f0dc1 --- /dev/null +++ b/python/xorbits/_mars/services/tests/fault_injection_patch.py @@ -0,0 +1,96 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict + +from ... import oscar as mo +from ...core import OperandType +from ...lib.aio import alru_cache +from ...tests.core import patch_cls +from ...tests.core import patch_super as super +from ..scheduling.worker.execution import SubtaskExecutionActor +from ..session import SessionAPI +from ..subtask import Subtask +from ..subtask.worker.processor import SubtaskProcessor +from ..tests.fault_injection_manager import ( + AbstractFaultInjectionManager, + ExtraConfigKey, + FaultPosition, + handle_fault, +) + + +@patch_cls(SubtaskExecutionActor) +class FaultInjectedSubtaskExecutionActor(SubtaskExecutionActor): + @alru_cache(cache_exceptions=False) + async def _get_fault_injection_manager_ref( + self, supervisor_address: str, session_id: str, name: str + ) -> mo.ActorRefType[AbstractFaultInjectionManager]: + session_api = await self._get_session_api(supervisor_address) + return await session_api.get_remote_object(session_id, name) + + @staticmethod + @alru_cache(cache_exceptions=False) + async def _get_session_api(supervisor_address: str): + return await SessionAPI.create(supervisor_address) + + async def internal_run_subtask(self, subtask: Subtask, band_name: str): + # fault injection + if subtask.extra_config: + fault_injection_manager_name = subtask.extra_config.get( + ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME + ) + if fault_injection_manager_name is not None: + subtask_info = self._subtask_info[subtask.subtask_id] + fault_injection_manager = await self._get_fault_injection_manager_ref( + subtask_info.supervisor_address, + subtask.session_id, + fault_injection_manager_name, + ) + fault = await fault_injection_manager.get_fault( + FaultPosition.ON_RUN_SUBTASK, {"subtask": subtask} + ) + handle_fault(fault) + return super().internal_run_subtask(subtask, band_name) + + +@patch_cls(SubtaskProcessor) +class FaultInjectionSubtaskProcessor(SubtaskProcessor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._fault_injection_manager_ref: mo.ActorRefType[ + AbstractFaultInjectionManager + ] = None + + async def run(self): + if self.subtask.extra_config: + fault_injection_manager_name = self.subtask.extra_config.get( + ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME + ) + if 
fault_injection_manager_name is not None: + self._fault_injection_manager_ref = ( + await self._session_api.get_remote_object( + self._session_id, fault_injection_manager_name + ) + ) + return await super().run() + + async def _async_execute_operand(self, ctx: Dict[str, Any], op: OperandType): + if self._fault_injection_manager_ref is not None: + fault = await self._fault_injection_manager_ref.get_fault( + FaultPosition.ON_EXECUTE_OPERAND, + {"subtask": self.subtask, "operand": op}, + ) + handle_fault(fault) + return await super()._async_execute_operand(ctx, op) diff --git a/python/xorbits/_mars/services/tests/test_core.py b/python/xorbits/_mars/services/tests/test_core.py new file mode 100644 index 000000000..8e5c8857a --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_core.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from tornado import httpclient + +from ... import oscar as mo +from ...utils import get_next_port +from .. import ( + NodeRole, + create_service_session, + destroy_service_session, + start_services, + stop_services, +) + + +@pytest.fixture +async def actor_pool_context(): + pool = await mo.create_actor_pool(f"127.0.0.1:{get_next_port()}", n_process=0) + await pool.start() + try: + yield pool + finally: + await pool.stop() + + +@pytest.mark.asyncio +async def test_start_service(actor_pool_context): + from .test_svcs.test_svc1.supervisor import SvcSessionActor1 + + pool = actor_pool_context + web_port = get_next_port() + config = { + "services": [["test_svc1"], "test_svc2", "test_warn_svc", "web"], + "modules": "xorbits._mars.services.tests.test_svcs", + "test_svc1": {"uid": "TestActor1", "arg1": "val1"}, + "test_svc2": {"uid": "TestActor2", "arg2": "val2", "ref": "TestActor1"}, + "web": {"port": web_port}, + } + with pytest.warns(RuntimeWarning) as record: + await start_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + assert "test_warn_svc" in str(record[-1].message) + + ref1 = await mo.actor_ref("TestActor1", address=pool.external_address) + ref2 = await mo.actor_ref("TestActor2", address=pool.external_address) + assert await ref1.get_arg() == "val1" + assert await ref2.get_arg() == "val1:val2" + + with pytest.raises(ImportError): + await start_services( + NodeRole.SUPERVISOR, + {"services": ["non-exist-svc"]}, + address=pool.external_address, + ) + + session_id = "test_session" + await create_service_session( + NodeRole.SUPERVISOR, + config, + session_id=session_id, + address=pool.external_address, + ) + assert await mo.has_actor( + mo.create_actor_ref( + uid=SvcSessionActor1.gen_uid(session_id), address=pool.external_address + ) + ) + await destroy_service_session( + NodeRole.SUPERVISOR, + config, + session_id=session_id, + address=pool.external_address, + ) + assert not await mo.has_actor( + mo.create_actor_ref( + uid=SvcSessionActor1.gen_uid(session_id), address=pool.external_address + ) + ) + + client = httpclient.AsyncHTTPClient() + resp = await
client.fetch(f"http://127.0.0.1:{web_port}/test_actor1/test_api") + assert resp.body.decode() == "val1" + + await stop_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + assert not await mo.has_actor( + mo.create_actor_ref("TestActor1", address=pool.external_address) + ) + assert not await mo.has_actor( + mo.create_actor_ref("TestActor2", address=pool.external_address) + ) diff --git a/python/xorbits/_mars/services/tests/test_patch.py b/python/xorbits/_mars/services/tests/test_patch.py new file mode 100644 index 000000000..ad2552e37 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_patch.py @@ -0,0 +1,110 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +class A: + def __init__(self): + self.value = ["A"] + + def test_method(self): + return ["A"] + + @classmethod + def test_classmethod(cls): + return ["A"] + + +class B(A): + def __init__(self): + super().__init__() + self.value += ["B"] + + def test_method(self): + return super().test_method() + ["B"] + + def test_method2(self): + return super().test_method() + ["BB"] + + @classmethod + def test_classmethod(cls): + return super().test_classmethod() + ["B"] + + @classmethod + def test_classmethod2(cls): + return super().test_classmethod() + ["BB"] + + +class C(B): + def __init__(self): + super().__init__() + self.value += ["C"] + + def test_method(self): + return super().test_method() + ["C"] + + @classmethod + def test_classmethod(cls): + return super().test_classmethod() + ["C"] + + +class Dummy: + pass + + +def test_patch_super(): + from ...tests.core import patch_cls + from ...tests.core import patch_super as super + + @patch_cls(B) + class D(B): + def __init__(self): + super().__init__() + self.value += ["D"] + + def test_method(self): + return super().test_method() + super().test_method2() + ["D"] + + @classmethod + def test_classmethod(cls): + return super().test_classmethod() + super().test_classmethod2() + ["D"] + + b = B() + assert B.test_classmethod() == ["A", "B", "A", "BB", "D"] + assert b.test_method() == ["A", "B", "A", "BB", "D"] + assert b.value == ["A", "B", "D"] + + c = C() + assert C.test_classmethod() == ["A", "B", "A", "BB", "D", "C"] + assert c.test_method() == ["A", "B", "A", "BB", "D", "C"] + assert c.value == ["A", "B", "D", "C"] + + @patch_cls(Dummy) + class E: + def __init__(self): + super().__init__() + + def test_method(self): + return super().test_method() + ["D"] + + @classmethod + def test_classmethod(cls): + return super().test_classmethod() + ["D"] + + dummy = Dummy() + with pytest.raises(AttributeError): + dummy.test_method() + with pytest.raises(AttributeError): + Dummy.test_classmethod() diff --git a/python/xorbits/_mars/services/tests/test_svcs/__init__.py b/python/xorbits/_mars/services/tests/test_svcs/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc1/__init__.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/__init__.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/web.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/web.py new file mode 100644 index 000000000..50f38c99b --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/web.py @@ -0,0 +1,27 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...... 
import oscar as mo +from .....web import MarsRequestHandler + + +class TestWebHandler(MarsRequestHandler): + async def get(self): + ref = await mo.actor_ref("TestActor1", address=self._supervisor_addr) + self.write(str(await ref.get_arg())) + + +web_handlers = { + "/test_actor1/test_api": TestWebHandler, +} diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc1/supervisor.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/supervisor.py new file mode 100644 index 000000000..a7a4beb25 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/supervisor.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..... import oscar as mo +from ....core import AbstractService + + +class SvcActor1(mo.Actor): + def __init__(self, arg): + super().__init__() + self._arg = arg + + def get_arg(self): + return self._arg + + +class SvcSessionActor1(mo.Actor): + @classmethod + def gen_uid(cls, session_id: str): + return f"{session_id}_svc_session_actor1" + + +class TestService1(AbstractService): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + async def start(self): + svc_config = self._config["test_svc1"] + await mo.create_actor( + SvcActor1, + uid=svc_config["uid"], + arg=svc_config["arg1"], + address=self._address, + ) + + async def stop(self): + svc_config = self._config["test_svc1"] + await mo.destroy_actor( + mo.create_actor_ref(uid=svc_config["uid"], address=self._address) + ) + + async def create_session(self, session_id: str): + await mo.create_actor( + SvcSessionActor1, + uid=SvcSessionActor1.gen_uid(session_id), + address=self._address, + ) + + async def destroy_session(self, session_id: str): + await mo.destroy_actor( + mo.create_actor_ref( + uid=SvcSessionActor1.gen_uid(session_id), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc2/__init__.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc2/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc2/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc2/supervisor.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc2/supervisor.py new file mode 100644 index 000000000..efe5b8480 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc2/supervisor.py @@ -0,0 +1,50 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..... import oscar as mo +from ....core import AbstractService + + +class SvcActor2(mo.Actor): + def __init__(self, arg, ref_uid): + super().__init__() + self._arg = arg + self._ref_uid = ref_uid + self._ref = None + + async def __post_create__(self): + self._ref = await mo.actor_ref(self._ref_uid, address=self.address) + + async def get_arg(self): + return await self._ref.get_arg() + ":" + self._arg + + +class TestService2(AbstractService): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._ref = None + + async def start(self): + svc_config = self._config["test_svc2"] + self._ref = await mo.create_actor( + SvcActor2, + uid=svc_config["uid"], + arg=svc_config["arg2"], + ref_uid=svc_config["ref"], + address=self._address, + ) + + async def stop(self): + assert self._ref is not None + await self._ref.destroy() diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/__init__.py b/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/supervisor.py b/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/supervisor.py new file mode 100644 index 000000000..6566ff30e --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/supervisor.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ....core import AbstractService + + +class AbsDeriveService(AbstractService): + pass diff --git a/python/xorbits/_mars/services/web/__init__.py b/python/xorbits/_mars/services/web/__init__.py new file mode 100644 index 000000000..4021fe30a --- /dev/null +++ b/python/xorbits/_mars/services/web/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import OscarWebAPI +from .core import ( + MarsRequestHandler, + MarsServiceWebAPIHandler, + MarsWebAPIClientMixin, + web_api, +) + +try: + from .supervisor import WebActor +except ImportError: # pragma: no cover + pass diff --git a/python/xorbits/_mars/services/web/api/__init__.py b/python/xorbits/_mars/services/web/api/__init__.py new file mode 100644 index 000000000..807c8487e --- /dev/null +++ b/python/xorbits/_mars/services/web/api/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .oscar import OscarWebAPI diff --git a/python/xorbits/_mars/services/web/api/oscar.py b/python/xorbits/_mars/services/web/api/oscar.py new file mode 100644 index 000000000..53df33fca --- /dev/null +++ b/python/xorbits/_mars/services/web/api/oscar.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Type + +from .... 
import oscar as mo +from ....lib.aio import alru_cache + + +class OscarWebAPI: + def __init__(self, address: str, web_ref: mo.ActorRef): + self._address = address + self._web_ref = web_ref + + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls: Type["OscarWebAPI"], address: str) -> "OscarWebAPI": + from ..supervisor import WebActor + + ref = await mo.actor_ref(WebActor.default_uid(), address=address) + return cls(address, ref) + + async def get_web_address(self) -> str: + return await self._web_ref.get_web_address() diff --git a/python/xorbits/_mars/services/web/api/web.py b/python/xorbits/_mars/services/web/api/web.py new file mode 100644 index 000000000..e8905e23a --- /dev/null +++ b/python/xorbits/_mars/services/web/api/web.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json + +from ..core import MarsRequestHandler + + +class MarsApiEntryHandler(MarsRequestHandler): + def get(self): + import mars + + version = mars.__version__ + self.write(json.dumps({"mars_version": version})) + + +web_handlers = {"/api": MarsApiEntryHandler} diff --git a/python/xorbits/_mars/services/web/core.py b/python/xorbits/_mars/services/web/core.py new file mode 100644 index 000000000..bdbbf7c76 --- /dev/null +++ b/python/xorbits/_mars/services/web/core.py @@ -0,0 +1,273 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
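OscarWebAPI above is the programmatic way to discover the web frontend from a supervisor address: it resolves the WebActor by its default uid and proxies get_web_address(). A minimal usage sketch, assuming a supervisor with the web service already running at a placeholder address:

    import asyncio

    from xorbits._mars.services.web import OscarWebAPI

    async def show_web_address(supervisor_address: str) -> str:
        # resolves the WebActor on the supervisor and asks for its HTTP endpoint
        api = await OscarWebAPI.create(supervisor_address)
        return await api.get_web_address()

    # e.g. asyncio.run(show_web_address("127.0.0.1:12345"))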
+ +import asyncio +import functools +import inspect +import logging +import re +import sys +import urllib.parse +from collections import defaultdict +from typing import Callable, Dict, List, NamedTuple, Optional, Type, Union + +from tornado import httpclient, web +from tornado.simple_httpclient import HTTPRequest, HTTPTimeoutError + +from ...lib.aio import alru_cache +from ...utils import deserialize_serializable, serialize_serializable + +if sys.version_info[:2] == (3, 6): + # make sure typing works + re.Pattern = type(re.compile(r".*")) + +logger = logging.getLogger(__name__) +_ROOT_PLACEHOLDER = "ROOT_PLACEHOLDER" + + +class MarsRequestHandler(web.RequestHandler): # pragma: no cover + def initialize(self, supervisor_addr: str = None): + self._supervisor_addr = supervisor_addr + + +class _WebApiDef(NamedTuple): + sub_pattern: str + sub_pattern_compiled: re.Pattern + method: str + arg_filter: Optional[Dict] = None + + +def web_api( + sub_pattern: str, + method: Union[str, List[str]], + arg_filter: Optional[Dict] = None, + cache_blocking: bool = False, +): + if not sub_pattern.endswith("$"): # pragma: no branch + sub_pattern += "$" + methods = method if isinstance(method, list) else [method] + + def wrapper(func): + @functools.wraps(func) + async def wrapped(self: "MarsServiceWebAPIHandler", *args, **kwargs): + try: + if not inspect.iscoroutinefunction(func): + return func(self, *args, **kwargs) + elif not cache_blocking or self.request.method.lower() != "get": + res = await func(self, *args, **kwargs) + else: + res = await self._create_or_get_url_future( + func, self, *args, **kwargs + ) + return res + except GeneratorExit: + raise + except: # noqa: E722 # nosec # pylint: disable=bare-except + exc_type, exc, tb = sys.exc_info() + err_msg = ( + f"{exc_type.__name__} when handling request with " + f"{type(self).__name__}.{func.__name__}" + ) + logger.exception(err_msg) + self.write(serialize_serializable((exc, tb))) + self.set_status(500, err_msg) + + wrapped._web_api_defs = [ + _WebApiDef(sub_pattern, re.compile(sub_pattern), m, arg_filter) + for m in methods + ] + return wrapped + + return wrapper + + +@alru_cache(cache_exceptions=False) +async def _get_cluster_api(address: str): + from ..cluster import ClusterAPI + + return await ClusterAPI.create(address) + + +@alru_cache(cache_exceptions=False) +async def _get_api_by_key( + api_cls: Type, session_id: str, address: str, with_key_arg: bool = True +): + cluster_api = await _get_cluster_api(address) + [address] = await cluster_api.get_supervisors_by_keys([session_id]) + if with_key_arg: + return await api_cls.create(session_id, address) + else: + return await api_cls.create(address) + + +class MarsServiceWebAPIHandler(MarsRequestHandler): + _root_pattern: str = None + _method_to_handlers: Dict[str, Dict[Callable, _WebApiDef]] = None + _uri_to_futures: Dict[str, asyncio.Task] = None + + def __init__(self, *args, **kwargs): + self._collect_services() + super().__init__(*args, **kwargs) + + def _get_api_by_key( + self, api_cls: Type, session_id: str, with_key_arg: bool = True + ): + return _get_api_by_key( + api_cls, + session_id, + address=self._supervisor_addr, + with_key_arg=with_key_arg, + ) + + def _create_or_get_url_future(self, func, *args, **kw): + if self._uri_to_futures is None: + type(self)._uri_to_futures = dict() + + uri = self.request.uri + if uri in self._uri_to_futures: + return self._uri_to_futures[uri] + + def _future_remover(_fut): + self._uri_to_futures.pop(uri, None) + + task = self._uri_to_futures[uri] = 
asyncio.create_task(func(*args, **kw)) + task.add_done_callback(_future_remover) + return task + + @classmethod + def _collect_services(cls): + if cls._method_to_handlers is not None: + return + + cls._method_to_handlers = defaultdict(dict) + for attr in dir(cls): + handle_func = getattr(cls, attr, None) + if not hasattr(handle_func, "_web_api_defs"): + continue + web_api_defs = getattr( + handle_func, "_web_api_defs" + ) # type: List[_WebApiDef] + for api_def in web_api_defs: + cls._method_to_handlers[api_def.method.lower()][handle_func] = api_def + + def prepare(self): + self.set_header("Content-Type", "application/octet-stream") + + @classmethod + def get_root_pattern(cls): + return cls._root_pattern + "(?:/(?P.*)$|$)" + + @functools.lru_cache(100) + def _route_sub_path(self, http_method: str, sub_path: str): + handlers = self._method_to_handlers[http_method.lower()] + method, kwargs = None, None + for handler_method, web_api_def in handlers.items(): + match = web_api_def.sub_pattern_compiled.match(sub_path) + if match is not None: + if web_api_def.arg_filter is not None: + if not all( + self.get_argument(k, None) == v + for k, v in web_api_def.arg_filter.items() + ): + continue + method, kwargs = handler_method, dict(match.groupdict()) + elif method is None: + # method matched with arg_filter shall not be overwritten + method, kwargs = handler_method, dict(match.groupdict()) + if method is not None: + return method, kwargs + else: + raise web.HTTPError( + 404, + f"{sub_path} does not match any defined APIs " + f"with method {http_method.upper()}", + ) + + def _make_handle_http_method(http_method: str): + async def _handle_http_method(self: "MarsServiceWebAPIHandler", **kwargs): + # make sure results from APIs is not stored + self.add_header("Cache-Control", "no-store") + + sub_path = kwargs.pop("sub_path", None) or "" + method, kw = self._route_sub_path(http_method, sub_path) + kw.update(kwargs) + res = method(self, **kw) + if inspect.isawaitable(res): + await res + + _handle_http_method.__name__ = http_method.lower() + return _handle_http_method + + get = _make_handle_http_method("get") + put = _make_handle_http_method("put") + post = _make_handle_http_method("post") + patch = _make_handle_http_method("patch") + delete = _make_handle_http_method("delete") + + del _make_handle_http_method + + +class MarsWebAPIClientMixin: + @property + def _client(self): + try: + return self._client_obj + except AttributeError: + self._client_obj = httpclient.AsyncHTTPClient() + return self._client_obj + + @property + def request_rewriter(self) -> Callable: + return getattr(self, "_request_rewriter", None) + + @request_rewriter.setter + def request_rewriter(self, value: Callable): + self._request_rewriter = value + + async def _request_url(self, method, path, **kwargs): + self._running_loop = asyncio.get_running_loop() + + if "data" in kwargs: + kwargs["body"] = kwargs.pop("data") + + if "params" in kwargs: + params = kwargs.pop("params") + for k, v in params.items(): + if isinstance(v, (list, tuple, set)): + params[k] = ",".join(str(i) for i in v) + url_params = urllib.parse.urlencode(params) + path_connector = "?" if "?" 
not in path else "&" + path += path_connector + url_params + + try: + request = HTTPRequest(path, method=method, **kwargs) + if self.request_rewriter: + request = self.request_rewriter(request) + res = await self._client.fetch(request, raise_error=False) + except HTTPTimeoutError as ex: + raise TimeoutError(str(ex)) from None + + if res.code < 400: + return res + else: + exc, tb = None, None + try: + exc, tb = deserialize_serializable(res.body) + except: # noqa: E722 # nosec # pylint: disable=bare-except + pass + + if exc is None: + raise res.error + else: + raise exc.with_traceback(tb) diff --git a/python/xorbits/_mars/services/web/handlers.py b/python/xorbits/_mars/services/web/handlers.py new file mode 100644 index 000000000..cc1fc8c05 --- /dev/null +++ b/python/xorbits/_mars/services/web/handlers.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from tornado import web + +from .core import MarsRequestHandler + + +class IndexHandler(MarsRequestHandler): + def _get_index_page(self): + try: + return self._index_page + except AttributeError: + index_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "index.html" + ) + with open(index_file, "r") as file_obj: + self._index_page = file_obj.read() + return self._index_page + + def get(self): + self.write(self._get_index_page()) + + +handlers = {"/": IndexHandler} + +static_handlers = { + r"[^\?\&]*/static/(.*)": ( + web.StaticFileHandler, + {"path": os.path.join(os.path.dirname(__file__), "static")}, + ) +} diff --git a/python/xorbits/_mars/services/web/index.html b/python/xorbits/_mars/services/web/index.html new file mode 100644 index 000000000..29c1af7cb --- /dev/null +++ b/python/xorbits/_mars/services/web/index.html @@ -0,0 +1,18 @@ + + + + Mars UI + + + + + + +
+ + diff --git a/python/xorbits/_mars/services/web/supervisor.py b/python/xorbits/_mars/services/web/supervisor.py new file mode 100644 index 000000000..3d5405032 --- /dev/null +++ b/python/xorbits/_mars/services/web/supervisor.py @@ -0,0 +1,118 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import logging +import os + +from tornado import web + +from ... import oscar as mo +from ...utils import get_next_port +from ..core import AbstractService + +logger = logging.getLogger(__name__) + + +class WebActor(mo.Actor): + def __init__(self, config): + super().__init__() + self._config = config + self._web_server = None + self._web_app = None + + extra_mod_names = self._config.get("extra_discovery_modules") or [] + web_handlers = self._config.get("web_handlers", {}) + for mod_name in extra_mod_names: + try: + web_mod = importlib.import_module(mod_name) + web_handlers.update(getattr(web_mod, "web_handlers", {})) + except ImportError: # pragma: no cover + pass + + async def __post_create__(self): + from .handlers import handlers, static_handlers + + supervisor_addr = self.address + + host = self._config.get("host") or "0.0.0.0" + port = self._config.get("port") or get_next_port() + self._web_address = f"http://{host}:{port}" + handlers.update(self._config.get("web_handlers", {})) + web_handlers = [] + for p, h in handlers.items(): + web_handlers.append((p, h, {"supervisor_addr": supervisor_addr})) + web_handlers.extend([(*[p], *v) for p, v in static_handlers.items()]) + + retrial = 5 + while retrial: + try: + if port is None: + port = get_next_port() + + # For debugging tornado, use debug=True to enable hot deploy + self._web_app = web.Application(web_handlers) + self._web_server = self._web_app.listen(port, host) + logger.info("Mars Web started at %s:%d", host, port) + break + except OSError: # pragma: no cover + if port is not None: + raise + retrial -= 1 + if retrial == 0: + raise + + async def __pre_destroy__(self): + if self._web_server is not None: + self._web_server.stop() + + def get_web_address(self): + web_address = self._web_address + if os.name == "nt": + web_address = web_address.replace("0.0.0.0", "127.0.0.1") + return web_address + + +class WebSupervisorService(AbstractService): + """ + Web service on supervisor. 
+ + Service Configuration + --------------------- + { + "web": { + "host": "", + "port": "", + "web_handlers": [ + , + ], + "extra_discovery_modules": [ + "path.to.modules", + ] + } + } + """ + + async def start(self): + await mo.create_actor( + WebActor, + config=self._config.get("web", {}), + uid=WebActor.default_uid(), + address=self._address, + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref(uid=WebActor.default_uid(), address=self._address) + ) diff --git a/python/xorbits/_mars/services/web/tests/__init__.py b/python/xorbits/_mars/services/web/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/web/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/web/tests/extra_handler.py b/python/xorbits/_mars/services/web/tests/extra_handler.py new file mode 100644 index 000000000..09450983e --- /dev/null +++ b/python/xorbits/_mars/services/web/tests/extra_handler.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..core import MarsRequestHandler + + +class ExtraTestHandler(MarsRequestHandler): + def get(self): + self.write("Test") + + +web_handlers = {"/api/extra_test": ExtraTestHandler} diff --git a/python/xorbits/_mars/services/web/tests/test_core.py b/python/xorbits/_mars/services/web/tests/test_core.py new file mode 100644 index 000000000..dfc029a13 --- /dev/null +++ b/python/xorbits/_mars/services/web/tests/test_core.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys + +import pytest +from tornado import httpclient + +from .... import oscar as mo +from ....utils import get_next_port +from .. 
import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, WebActor, web_api +from ..api.web import MarsApiEntryHandler + + +class TestAPIHandler(MarsServiceWebAPIHandler): + __test__ = False + _root_pattern = "/api/test/(?P[^/]+)" + _call_counter = 0 + + @web_api("", method="get") + def get_method_root(self, test_id): + self.write(f"get_root_value_{test_id}") + + @web_api("", method="post") + def post_method_root(self, test_id): + self.write(f"post_root_value_{test_id}") + + @web_api("subtest/(?P[^/]+)", method="get") + def get_method_sub_patt(self, test_id, subtest_id): + self.write(f"get_sub_value_{test_id}_{subtest_id}") + + @web_api("subtest/(?P[^/]+)", method="get", arg_filter={"action": "a1"}) + async def get_method_sub_patt_match_arg1(self, test_id, subtest_id): + self.write(f"get_sub_value_{test_id}_{subtest_id}_action1") + + @web_api("subtest/(?P[^/]+)", method="get", arg_filter={"action": "a2"}) + async def get_method_sub_patt_match_arg2(self, test_id, subtest_id): + self.write(f"get_sub_value_{test_id}_{subtest_id}_action2") + + @web_api("subtest_error", method="get") + def get_with_error(self, test_id): + raise ValueError + + @web_api("subtest_delay", method="get") + async def get_with_timeout(self, test_id): + await asyncio.sleep(100) + raise ValueError(test_id) + + @web_api("subtest_delay_cache", method="get", cache_blocking=True) + async def get_with_blocking_cache(self, test_id): + await asyncio.sleep(1) + type(self)._call_counter += 1 + self.write(test_id) + + +@pytest.fixture +async def actor_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + pool = await mo.create_actor_pool( + "127.0.0.1", n_process=0, subprocess_start_method=start_method + ) + async with pool: + web_config = { + "host": "127.0.0.1", + "port": get_next_port(), + "web_handlers": { + "/api": MarsApiEntryHandler, + TestAPIHandler.get_root_pattern(): TestAPIHandler, + }, + "extra_discovery_modules": ["mars.services.web.tests.extra_handler"], + } + await mo.create_actor(WebActor, web_config, address=pool.external_address) + yield pool, web_config["port"] + + +class SimpleWebClient(MarsWebAPIClientMixin): + async def fetch(self, path, method="GET", **kwargs): + return await self._request_url(method, path, **kwargs) + + +@pytest.mark.asyncio +async def test_web_api(actor_pool): + _pool, web_port = actor_pool + recorded_urls = [] + + def url_recorder(request): + recorded_urls.append(request.url) + return request + + client = SimpleWebClient() + client.request_rewriter = url_recorder + + res = await client.fetch(f"http://localhost:{web_port}/") + assert res.body.decode() + + res = await client.fetch(f"http://localhost:{web_port}/api") + assert res.body.decode() + + res = await client.fetch(f"http://localhost:{web_port}/api/test/test_id") + assert res.body.decode() == "get_root_value_test_id" + + res = await client.fetch( + f"http://localhost:{web_port}/api/test/test_id", method="POST", data=b"" + ) + assert res.body.decode() == "post_root_value_test_id" + + res = await client.fetch( + f"http://localhost:{web_port}/api/test/test_id/subtest/sub_tid" + ) + assert res.body.decode() == "get_sub_value_test_id_sub_tid" + + res = await client.fetch( + f"http://localhost:{web_port}/api/test/test_id/subtest/sub_tid?action=a1" + ) + assert res.body.decode() == "get_sub_value_test_id_sub_tid_action1" + + res = await client.fetch( + f"http://localhost:{web_port}/api/test/test_id/subtest/sub_tid?action=a2" + ) + assert res.body.decode() == 
"get_sub_value_test_id_sub_tid_action2" + + with pytest.raises(httpclient.HTTPError) as excinfo: + await client.fetch(f"http://localhost:{web_port}/api/test/test_id/non_exist") + assert excinfo.value.code == 404 + + with pytest.raises(ValueError): + await client.fetch( + f"http://localhost:{web_port}/api/test/test_id/subtest_error" + ) + + # test multiple request into long immutable requests + req_uri = f"http://localhost:{web_port}/api/test/test_id/subtest_delay_cache" + tasks = [asyncio.create_task(client.fetch(req_uri)) for _ in range(2)] + await asyncio.sleep(0.5) + assert TestAPIHandler._call_counter == 0 + assert len(TestAPIHandler._uri_to_futures) == 1 + + await asyncio.gather(*tasks) + assert TestAPIHandler._call_counter == 1 + assert len(TestAPIHandler._uri_to_futures) == 0 + + with pytest.raises(TimeoutError): + await client.fetch( + f"http://localhost:{web_port}/api/test/test_id/subtest_delay", + request_timeout=0.5, + ) + + res = await client.fetch(f"http://localhost:{web_port}/api/extra_test") + assert "Test" in res.body.decode() + + assert len(recorded_urls) > 0 diff --git a/python/xorbits/_mars/services/web/worker.py b/python/xorbits/_mars/services/web/worker.py new file mode 100644 index 000000000..77ff70458 --- /dev/null +++ b/python/xorbits/_mars/services/web/worker.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..core import EmptyService + + +class WebWorkerService(EmptyService): + pass diff --git a/python/xorbits/_mars/session.py b/python/xorbits/_mars/session.py new file mode 100644 index 000000000..f9b28b7f9 --- /dev/null +++ b/python/xorbits/_mars/session.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .deploy.oscar.session import execute, fetch, fetch_log, new_session, stop_server + +__all__ = [ + "new_session", + "execute", + "fetch", + "fetch_log", + "stop_server", +] diff --git a/python/xorbits/_mars/storage/__init__.py b/python/xorbits/_mars/storage/__init__.py new file mode 100644 index 000000000..db448c859 --- /dev/null +++ b/python/xorbits/_mars/storage/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import StorageLevel, get_storage_backend +from .cuda import CudaStorage +from .filesystem import FileSystemStorage +from .ray import RayStorage +from .shared_memory import SharedMemoryStorage + +try: + # require vineyard, pyarrow + from .vineyard import VineyardStorage +except ImportError: + pass +try: + # require pyarrow + from .plasma import PlasmaStorage +except ImportError: + pass diff --git a/python/xorbits/_mars/storage/base.py b/python/xorbits/_mars/storage/base.py new file mode 100644 index 000000000..e4fb75889 --- /dev/null +++ b/python/xorbits/_mars/storage/base.py @@ -0,0 +1,293 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import operator +from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, List, Tuple, Type, Union + +from ..utils import dataslots +from .core import StorageFileObject + +_storage_backends = dict() + + +def register_storage_backend(backend: Type["StorageBackend"]): + _storage_backends[backend.name] = backend + return backend + + +def get_storage_backend(backend_name) -> Type["StorageBackend"]: + return _storage_backends[backend_name] + + +_ComparableLevel = Union[int, "StorageLevel"] + + +class StorageLevel(Enum): + GPU = 1 << 0 + MEMORY = 1 << 1 + DISK = 1 << 2 + REMOTE = 1 << 3 + + def __and__(self, other: _ComparableLevel): + other_value = getattr(other, "value", other) + return self.value & other_value + + __rand__ = __and__ + + def __or__(self, other: _ComparableLevel): + other_value = getattr(other, "value", other) + return self.value | other_value + + __ror__ = __or__ + + def __lt__(self, other: _ComparableLevel): + other_value = getattr(other, "value", other) + return self.value < other_value + + def __gt__(self, other: _ComparableLevel): + other_value = getattr(other, "value", other) + return self.value > other_value + + def spill_level(self): + if self == StorageLevel.GPU: + return StorageLevel.MEMORY + elif self == StorageLevel.MEMORY: + return StorageLevel.DISK + else: # pragma: no cover + raise ValueError(f"Level {self} doesn't have spill level") + + @staticmethod + def from_str(s: str): + level_mapping = StorageLevel.__members__ + level_strings = [ss.strip() for ss in s.upper().split("|")] + levels = [] + for ls in level_strings: + if ls not in level_mapping: # pragma: no cover + raise ValueError(f"Unknown level {ls}") + levels.append(level_mapping[ls]) + return functools.reduce(operator.or_, levels) + + +@dataslots +@dataclass +class ObjectInfo: + size: int = None + device: int = None + object_id: 
Any = None + + +class StorageBackend(ABC): + name = None + is_seekable = True + + @classmethod + @abstractmethod + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + """ + Setup environments, for example, start plasma store for plasma backend. + + Parameters + ---------- + kwargs : kwargs + Kwargs for setup. + + Returns + ------- + Tuple of two dicts + Dicts for initialization and teardown. + """ + + @staticmethod + async def teardown(**kwargs): + """ + Clean up the environments. + + Parameters + ---------- + kwargs : kwargs + Parameters for clean up. + """ + + @property + def size(self) -> Union[int, None]: + """ + The total size of storage. + + Returns + ------- + Size: int + Total size of storage. + """ + return None + + @property + @abstractmethod + def level(self) -> StorageLevel: + """ + Level of current storage backend. + + Returns + ------- + Level: StorageLevel + storage level. + """ + + @property + def backend_info(self) -> dict: + """ + Get the customized backend info of this storage backend. + + Returns + ------- + info: dict + Customized storage backend info dict. + """ + return {"name": self.name} + + @abstractmethod + async def get(self, object_id, **kwargs) -> object: + """ + Get object by key. For some backends, `columns` or `slice` can pass to get part of data. + + Parameters + ---------- + object_id : object id + Object id to get. + + kwargs: + Additional keyword arguments + + Returns + ------- + Python object + """ + + @abstractmethod + async def put(self, obj, importance: int = 0) -> ObjectInfo: + """ + Put object into storage with object_id. + + Parameters + ---------- + obj : python object + Object to put. + + importance: int + The priority to spill when storage is full + + Returns + ------- + ObjectInfo + object information including size, raw_size, device + """ + + @abstractmethod + async def delete(self, object_id): + """ + Delete object from storage by object_id. + + Parameters + ---------- + object_id + object id + """ + + @abstractmethod + async def object_info(self, object_id) -> ObjectInfo: + """ + Get information about stored object. + + Parameters + ---------- + object_id + object id + + Returns + ------- + ObjectInfo + Object info including size, device and etc. + """ + + @abstractmethod + async def open_writer(self, size=None) -> StorageFileObject: + """ + Return a file-like object for writing. + + Parameters + ---------- + size: int + Maximum size in bytes + + Returns + ------- + fileobj: StorageFileObject + """ + + @abstractmethod + async def open_reader(self, object_id) -> StorageFileObject: + """ + Return a file-like object for reading. + + Parameters + ---------- + object_id + Object id + + Returns + ------- + fileobj: StorageFileObject + """ + + async def list(self) -> List: + """ + List all stored objects in storage. + + Returns + ------- + List of objects + """ + + async def fetch(self, object_id): + """ + Fetch object to current worker. + + Parameters + ---------- + object_id + Object id. + """ + + async def pin(self, object_id): + """ + Pin the data to prevent the data being released or spilled. + + Parameters + ---------- + object_id + object id + """ + + async def unpin(self, object_id): + """ + Unpin the data, allow storage to release the data. 
+ + Parameters + ---------- + object_id + object id + """ diff --git a/python/xorbits/_mars/storage/core.py b/python/xorbits/_mars/storage/core.py new file mode 100644 index 000000000..89055c4ce --- /dev/null +++ b/python/xorbits/_mars/storage/core.py @@ -0,0 +1,155 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +from abc import ABC, abstractmethod +from concurrent.futures import Executor +from typing import Any, Optional, Union + +from ..lib.aio import AioFileObject + + +class StorageFileObject(AioFileObject): + def __init__( + self, + file: Any, + object_id: Any, + loop: asyncio.BaseEventLoop = None, + executor: Executor = None, + ): + self._object_id = object_id + super().__init__(file, loop=loop, executor=executor) + + @property + def object_id(self): + return self._object_id + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await super().__aexit__(exc_type, exc_val, exc_tb) + if self._executor: + self._executor.shutdown(wait=False) + + +class BufferWrappedFileObject(ABC): + def __init__(self, object_id: Any, mode: str, size: Optional[int] = None): + # check arguments + assert mode in ("w", "r"), 'mode must be "w" or "r"' + if mode == "w" and size is None: # pragma: no cover + raise ValueError("size must be provided to write") + + self._object_id = object_id + self._size = size + self._mode = mode + + self._offset = 0 + self._initialized = False + self._closed = False + + self._mv = None + self._buffer = None + + @abstractmethod + def _read_init(self): + """ + Initialization for read purpose. + """ + + @abstractmethod + def _write_init(self): + """ + Initialization for write purpose. 
+ """ + + @property + def object_id(self): + return self._object_id + + @property + def buffer(self): + return self._buffer + + @property + def mode(self): + return self._mode + + def read(self, size=-1): + if not self._initialized: + self._read_init() + self._initialized = True + + offset = self._offset + size = self._size if size < 0 else size + end = min(self._size, offset + size) + result = self._mv[offset:end] + self._offset = end + return result + + def write(self, content: Union[bytes, memoryview]): + if not self._initialized: + self._write_init() + self._initialized = True + + offset = self._offset + content_length = getattr(content, "nbytes", len(content)) + new_offset = offset + content_length + self._mv[offset:new_offset] = content + self._offset = new_offset + + def seek(self, offset: int, whence: int = os.SEEK_SET): + if not self._initialized: + self._read_init() + self._initialized = True + + if whence == os.SEEK_END: + new_offset = self._size + offset + elif whence == os.SEEK_CUR: + new_offset = self._offset + offset + else: + assert whence == os.SEEK_SET + new_offset = offset + if new_offset < 0 or new_offset >= self._size: + raise ValueError( + f"File offset should be limited to (0, {self._size}), " + f"now is {new_offset}" + ) + self._offset = new_offset + return self._offset + + def tell(self): + return self._offset + + @abstractmethod + def _read_close(self): + """ + Close for read. + """ + + @abstractmethod + def _write_close(self): + """ + Close for write. + """ + + def close(self): + if self._closed: + return + + self._closed = True + if self._mode == "w": + self._write_close() + else: + self._read_close() + self._mv = None + self._buffer = None diff --git a/python/xorbits/_mars/storage/cuda.py b/python/xorbits/_mars/storage/cuda.py new file mode 100644 index 000000000..274aed44d --- /dev/null +++ b/python/xorbits/_mars/storage/cuda.py @@ -0,0 +1,309 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import ctypes +import pickle +import uuid +from io import BytesIO +from typing import Dict, List, Tuple, Union + +import numpy as np +import pandas as pd + +from ..serialization import deserialize, serialize +from ..utils import implements, lazy_import +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import StorageFileObject + +cupy = lazy_import("cupy") +cudf = lazy_import("cudf") + + +_id_to_buffers = dict() + + +class CudaFileObject: + def __init__(self, mode: str, object_id: str, size: int = None): + self._mode = mode + self._object_id = object_id + self._size = size + self._closed = False + self._buffers = None + self._headers = None + self._offset = None + # for read + self._has_read_headers = None + # for write + self._has_write_headers = None + self._cur_buffer_index = None + if "r" in mode: + assert object_id is not None + self._initialize_read() + elif "w" in mode: + self._initialize_write() + + @property + def object_id(self): + return self._object_id + + @property + def mode(self): + return self._mode + + def _initialize_read(self): + from cudf.core.buffer import Buffer + from cupy.cuda.memory import UnownedMemory + + self._offset = 0 + self._has_read_headers = False + self._buffers = [] + (metas, serialized), buffers = _id_to_buffers[self._object_id] + self._headers = headers = (metas.copy(), serialized) + buffer_types = [] + for buf in buffers: + if isinstance(buf, cupy.ndarray): + ptr, size = buf.data.ptr, buf.size + self._buffers.append(UnownedMemory(ptr, size, Buffer(ptr, size=size))) + buffer_types.append(["cuda", size]) + elif isinstance(buf, Buffer): + ptr, size = buf.ptr, buf.size + if size == 0: + # empty buffer cannot construct a UnownedMemory + self._buffers.append(None) + else: + self._buffers.append(UnownedMemory(ptr, size, Buffer(ptr, size))) + buffer_types.append(["cuda", size]) + else: + size = getattr(buf, "size", len(buf)) + self._buffers.append(buf) + buffer_types.append(["memory", size]) + headers[0]["buffer_types"] = buffer_types + + def _initialize_write(self): + self._had_write_headers = False + self._cur_buffer_index = 0 + self._buffers = [] + self._offset = 0 + + def read(self, size: int): + # we read cuda_header first and then read cuda buffers one by one, + # the return value's size is not exactly the specified size. 
+ from cudf.core.buffer import Buffer + from cupy.cuda import MemoryPointer + from cupy.cuda.memory import UnownedMemory + + if not self._has_read_headers: + self._has_read_headers = True + return pickle.dumps(self._headers) + if len(self._buffers) == 0: + return "" + cur_buf = self._buffers[0] + # current buf read to end + if cur_buf is None: + # empty cuda buffer + content = Buffer.empty(0) + self._offset = 0 + self._buffers.pop(0) + return content + elif size >= cur_buf.size - self._offset: + if isinstance(cur_buf, UnownedMemory): + cupy_pointer = MemoryPointer(cur_buf, self._offset) + content = Buffer(cupy_pointer.ptr, size=cur_buf.size - self._offset) + else: + content = cur_buf[self._offset : self._offset + size] + self._offset = 0 + self._buffers.pop(0) + return content + else: + if isinstance(cur_buf, UnownedMemory): + cupy_pointer = MemoryPointer(cur_buf, self._offset) + self._offset += size + return Buffer(cupy_pointer.ptr, size=size) + else: + self._offset += size + return cur_buf[self._offset, self._offset + size] + + def write(self, content): + from cupy.cuda import MemoryPointer + from cupy.cuda.memory import UnownedMemory + from rmm import DeviceBuffer + + if not self._has_write_headers: + self._headers = headers = pickle.loads(content) + buffer_types = headers[0]["buffer_types"] + for buffer_type, size in buffer_types: + if buffer_type == "cuda": + self._buffers.append(DeviceBuffer(size=size)) + else: + self._buffers.append(BytesIO()) + self._has_write_headers = True + return + + cur_buf = self._buffers[self._cur_buffer_index] + cur_buf_size = self._headers[0]["buffer_types"][self._cur_buffer_index][1] + if isinstance(cur_buf, DeviceBuffer): + cur_cupy_memory = UnownedMemory(cur_buf.ptr, cur_buf.size, cur_buf) + cupy_pointer = MemoryPointer(cur_cupy_memory, self._offset) + + if isinstance(content, bytes): + content_length = len(content) + source_mem = np.frombuffer(content, dtype="uint8").ctypes.data_as( + ctypes.c_void_p + ) + else: + source_mem = MemoryPointer( + UnownedMemory(content.ptr, content.size, content), 0 + ) + content_length = source_mem.mem.size + cupy_pointer.copy_from(source_mem, content_length) + else: + content_length = len(content) + cur_buf.write(content) + if content_length + self._offset >= cur_buf_size: + if isinstance(cur_buf, BytesIO): + self._buffers[self._cur_buffer_index] = cur_buf.getvalue() + self._cur_buffer_index += 1 + self._offset = 0 + else: + self._offset += content_length + + def _read_close(self): + self._offset = None + self._cuda_buffers = None + self._cuda_header = None + self._has_read_headers = None + + def _write_close(self): + headers = self._headers + headers[0].pop("buffer_types") + # hold cuda buffers + + _id_to_buffers[self._object_id] = headers, self._buffers + + self._has_write_headers = None + self._cur_buffer_index = None + self._cuda_buffers = None + self._cuda_header = None + self._offset = None + + def close(self): + if self._closed: + return + self._closed = True + if self._mode == "w": + self._write_close() + else: + self._read_close() + + +@register_storage_backend +class CudaStorage(StorageBackend): + name = "cuda" + is_seekable = False + + def __init__(self, size=None): + self._size = size + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + size = kwargs.pop("size", None) + if kwargs: # pragma: no cover + raise TypeError(f'CudaStorage got unexpected config: {",".join(kwargs)}') + + return dict(size=size), dict() + + @staticmethod + 
@implements(StorageBackend.teardown) + async def teardown(**kwargs): + pass + + @property + @implements(StorageBackend.level) + def level(self): + return StorageLevel.GPU + + @property + @implements(StorageBackend.size) + def size(self) -> Union[int, None]: + return self._size + + @staticmethod + def _to_cuda(obj): # pragma: no cover + if isinstance(obj, np.ndarray): + return cupy.asarray(obj) + elif isinstance(obj, pd.DataFrame): + return cudf.DataFrame.from_pandas(obj) + elif isinstance(obj, pd.Series): + return cudf.Series.from_pandas(obj) + return obj + + @implements(StorageBackend.get) + async def get(self, object_id: str, **kwargs) -> object: + from cudf.core.buffer import Buffer as CPBuffer + from rmm import DeviceBuffer + + headers, buffers = _id_to_buffers[object_id] + new_buffers = [] + for buf in buffers: + if isinstance(buf, cupy.ndarray): + new_buffers.append(DeviceBuffer(ptr=buf.data.ptr, size=buf.size)) + elif isinstance(buf, CPBuffer): + new_buffers.append(DeviceBuffer(ptr=buf.ptr, size=buf.size)) + else: + new_buffers.append(buf) + return deserialize(headers, new_buffers) + + @implements(StorageBackend.put) + async def put(self, obj, importance=0) -> ObjectInfo: + from cudf.core.buffer import Buffer as CPBuffer + + string_id = str(uuid.uuid4()) + headers, buffers = serialize(obj) + size = sum( + buf.size for buf in buffers if isinstance(buf, (cupy.ndarray, CPBuffer)) + ) + _id_to_buffers[string_id] = headers, buffers + return ObjectInfo(size=size, object_id=string_id) + + @implements(StorageBackend.delete) + async def delete(self, object_id: str): + if object_id in _id_to_buffers: + del _id_to_buffers[object_id] + + @implements(StorageBackend.object_info) + async def object_info(self, object_id: str) -> ObjectInfo: + from cudf.core.buffer import Buffer as CPBuffer + + size = sum( + buf.size + for buf in _id_to_buffers[object_id][1] + if isinstance(buf, (cupy.ndarray, CPBuffer)) + ) + return ObjectInfo(size=size, object_id=object_id) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + object_id = str(uuid.uuid4()) + cuda_writer = CudaFileObject(object_id=object_id, mode="w", size=size) + return StorageFileObject(cuda_writer, object_id=object_id) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + cuda_reader = CudaFileObject(mode="r", object_id=object_id) + return StorageFileObject(cuda_reader, object_id=object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: # pragma: no cover + raise NotImplementedError("Cuda storage doesn't support `list` method.") diff --git a/python/xorbits/_mars/storage/errors.py b/python/xorbits/_mars/storage/errors.py new file mode 100644 index 000000000..2fc160d7b --- /dev/null +++ b/python/xorbits/_mars/storage/errors.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
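A side note on the StorageLevel enum from storage/base.py above: levels are bit flags, so they can be parsed from strings, combined, and tested with the bitwise helpers defined there. A small illustration (assuming the package imports cleanly in your environment):

    from xorbits._mars.storage import StorageLevel

    combined = StorageLevel.from_str("memory | disk")   # bitwise OR of the two flags
    assert combined & StorageLevel.MEMORY               # membership test via __and__/__rand__
    assert combined & StorageLevel.DISK
    assert not combined & StorageLevel.GPU
    assert StorageLevel.MEMORY.spill_level() is StorageLevel.DISK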
+ +from ..core.base import MarsError + + +class DataNotExist(MarsError): + pass diff --git a/python/xorbits/_mars/storage/filesystem.py b/python/xorbits/_mars/storage/filesystem.py new file mode 100644 index 000000000..1b801ce7f --- /dev/null +++ b/python/xorbits/_mars/storage/filesystem.py @@ -0,0 +1,149 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid +from typing import Dict, List, Optional, Tuple + +from ..lib.aio import AioFilesystem +from ..lib.filesystem import FileSystem, get_fs +from ..serialization import AioDeserializer, AioSerializer +from ..utils import implements, mod_hash +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import StorageFileObject + + +@register_storage_backend +class FileSystemStorage(StorageBackend): + name = "filesystem" + + def __init__( + self, fs: FileSystem, root_dirs: List[str], level: StorageLevel, size: int + ): + self._fs = AioFilesystem(fs) + self._root_dirs = root_dirs + self._level = level + self._size = size + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + root_dirs = kwargs.pop("root_dirs") + level = kwargs.pop("level") + size = kwargs.pop("size", None) + fs = kwargs.pop("fs", None) + if kwargs: # pragma: no cover + raise TypeError( + f'FileSystemStorage got unexpected config: {",".join(kwargs)}' + ) + + if isinstance(root_dirs, str): + root_dirs = root_dirs.split(":") + if isinstance(level, str): + level = StorageLevel.from_str(level) + + if fs is None: + fs = get_fs(root_dirs[0]) + + for d in root_dirs: + if not fs.exists(d): + fs.mkdir(d) + params = dict(fs=fs, root_dirs=root_dirs, level=level, size=size) + return params, params + + @staticmethod + @implements(StorageBackend.teardown) + async def teardown(**kwargs): + fs = kwargs.get("fs") + root_dirs = kwargs.get("root_dirs") + for d in root_dirs: + fs.delete(d, recursive=True) + + @property + @implements(StorageBackend.level) + def level(self) -> StorageLevel: + return self._level + + @property + @implements(StorageBackend.size) + def size(self) -> Optional[int]: + return self._size + + def _generate_path(self): + file_name = str(uuid.uuid4()) + selected_index = mod_hash(file_name, len(self._root_dirs)) + selected_dir = self._root_dirs[selected_index] + return os.path.join(selected_dir, file_name) + + @implements(StorageBackend.get) + async def get(self, object_id, **kwargs) -> object: + if kwargs: # pragma: no cover + raise NotImplementedError(f'Got unsupported args: {",".join(kwargs)}') + + file = await self._fs.open(object_id, "rb") + async with file as f: + deserializer = AioDeserializer(f) + return await deserializer.run() + + @implements(StorageBackend.put) + async def put(self, obj, importance: int = 0) -> ObjectInfo: + serializer = AioSerializer(obj) + buffers = await serializer.run() + buffer_size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + + path = self._generate_path() + file = await 
self._fs.open(path, "wb") + async with file as f: + for buffer in buffers: + await f.write(buffer) + + return ObjectInfo(size=buffer_size, object_id=path) + + @implements(StorageBackend.delete) + async def delete(self, object_id): + await self._fs.delete(object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: + file_list = [] + for d in self._root_dirs: + file_list.extend(list(await self._fs.ls(d))) + return file_list + + @implements(StorageBackend.object_info) + async def object_info(self, object_id) -> ObjectInfo: + stat = await self._fs.stat(object_id) + return ObjectInfo(size=stat["size"], object_id=object_id) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + path = self._generate_path() + file = await self._fs.open(path, "wb") + return StorageFileObject(file, file.name) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + file = await self._fs.open(object_id, "rb") + return StorageFileObject(file, file.name) + + +@register_storage_backend +class DiskStorage(FileSystemStorage): + name = "disk" + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + kwargs["level"] = StorageLevel.DISK + return await super().setup(**kwargs) diff --git a/python/xorbits/_mars/storage/plasma.py b/python/xorbits/_mars/storage/plasma.py new file mode 100644 index 000000000..bd1667361 --- /dev/null +++ b/python/xorbits/_mars/storage/plasma.py @@ -0,0 +1,287 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
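FileSystemStorage above follows the setup/put/get/teardown life cycle shared by all backends registered via register_storage_backend. A hedged end-to-end sketch using a temporary local directory; names and values other than the backend interface itself are illustrative:

    import asyncio
    import tempfile

    from xorbits._mars.storage import StorageLevel, get_storage_backend

    async def filesystem_roundtrip(obj):
        backend_cls = get_storage_backend("filesystem")
        root_dir = tempfile.mkdtemp(prefix="mars-storage-demo-")
        init_params, teardown_params = await backend_cls.setup(
            root_dirs=root_dir, level=StorageLevel.DISK
        )
        storage = backend_cls(**init_params)
        try:
            info = await storage.put(obj)        # serialize and write to a generated path
            return await storage.get(info.object_id)
        finally:
            await backend_cls.teardown(**teardown_params)

    # e.g. asyncio.run(filesystem_roundtrip({"answer": 42}))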
+ +import asyncio +import sys +from dataclasses import dataclass +from functools import lru_cache +from typing import Any, Dict, List, Optional, Tuple + +import psutil +import pyarrow as pa + +from ..resource import virtual_memory +from ..serialization import AioDeserializer, AioSerializer +from ..utils import calc_size_by_str, dataslots, implements, lazy_import +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import BufferWrappedFileObject, StorageFileObject +from .errors import DataNotExist + +plasma = lazy_import("pyarrow.plasma", rename="plasma") +if sys.platform.startswith("win"): + plasma = None + +PAGE_SIZE = 64 * 1024 + + +class PlasmaFileObject(BufferWrappedFileObject): + def __init__( + self, + plasma_client: "plasma.PlasmaClient", + object_id: Any, + mode: str, + size: Optional[int] = None, + ): + self._plasma_client = plasma_client + self._file = None + super().__init__(object_id, mode, size=size) + + @property + def buffer(self): + return getattr(self, "_buffer", None) + + def _write_init(self): + self._buffer = buf = self._plasma_client.create(self._object_id, self._size) + file = self._file = pa.FixedSizeBufferWriter(buf) + file.set_memcopy_threads(6) + + def _read_init(self): + self._buffer = buf = self._plasma_client.get_buffers([self._object_id])[0] + self._mv = memoryview(buf) + self._size = len(buf) + + def write(self, content: bytes): + if not self._initialized: + self._write_init() + self._initialized = True + + return self._file.write(content) + + def _write_close(self): + try: + self._plasma_client.seal(self._object_id) + except plasma.PlasmaObjectNotFound: + pass + self._file = None + + def _read_close(self): + pass + + +class PlasmaStorageFileObject(StorageFileObject): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._buffer = None + + async def close(self): + self._buffer = self._file.buffer + await super().close() + + +@dataslots +@dataclass +class PlasmaObjectInfo(ObjectInfo): + buffer: memoryview = None + plasma_socket: str = None + + @classmethod + @lru_cache(5) + def _get_plasma_client(cls, socket): + return plasma.connect(socket) + + def __getstate__(self): + return self.size, self.device, self.object_id, self.plasma_socket + + def __setstate__(self, state): + self.size, self.device, self.object_id, self.plasma_socket = state + client = self._get_plasma_client(self.plasma_socket) + [self.buffer] = client.get_buffers([self.object_id]) + + +def get_actual_capacity(plasma_client: "plasma.PlasmaClient") -> int: + """ + Get actual capacity of plasma store + + Parameters + ---------- + plasma_client: PlasmaClient + Plasma client. 
+ + Returns + ------- + size: int + Actual storage size in bytes + """ + store_limit = plasma_client.store_capacity() + + left_size = store_limit + alloc_fraction = 1 + while True: + allocate_size = int(left_size * alloc_fraction / PAGE_SIZE) * PAGE_SIZE + try: + obj_id = plasma.ObjectID.from_random() + buf = [plasma_client.create(obj_id, allocate_size)] + plasma_client.seal(obj_id) + del buf[:] + break + except plasma.PlasmaStoreFull: # pragma: no cover + alloc_fraction *= 0.99 + finally: + plasma_client.evict(allocate_size) + return allocate_size + + +@register_storage_backend +class PlasmaStorage(StorageBackend): + name = "plasma" + + def __init__( + self, + plasma_socket: str = None, + plasma_directory: str = None, + capacity: int = None, + check_dir_size: bool = True, + ): + self._plasma_socket = plasma_socket + self._client = plasma.connect(plasma_socket) + self._plasma_directory = plasma_directory + self._capacity = capacity + self._check_dir_size = check_dir_size + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + loop = asyncio.get_running_loop() + store_memory = kwargs.pop("store_memory") + plasma_directory = kwargs.pop("plasma_directory", None) + check_dir_size = kwargs.pop("check_dir_size", True) + + if kwargs: + raise TypeError(f'PlasmaStorage got unexpected config: {",".join(kwargs)}') + + store_memory = int( + calc_size_by_str(store_memory, virtual_memory().total) * 0.95 + ) + plasma_store = plasma.start_plasma_store( + store_memory, plasma_directory=plasma_directory + ) + plasma_socket = (await loop.run_in_executor(None, plasma_store.__enter__))[0] + init_params = dict( + plasma_socket=plasma_socket, + plasma_directory=plasma_directory, + check_dir_size=check_dir_size, + ) + client = plasma.connect(plasma_socket) + actual_capacity = await loop.run_in_executor(None, get_actual_capacity, client) + init_params["capacity"] = actual_capacity + teardown_params = dict(plasma_store=plasma_store) + return init_params, teardown_params + + @staticmethod + @implements(StorageBackend.teardown) + async def teardown(**kwargs): + plasma_store = kwargs.get("plasma_store") + plasma_store.__exit__(None, None, None) + + @property + @implements(StorageBackend.level) + def level(self) -> StorageLevel: + return StorageLevel.MEMORY + + @property + @implements(StorageBackend.size) + def size(self) -> Optional[int]: + return self._capacity + + def _check_plasma_limit(self, size: int): + used_size = psutil.disk_usage(self._plasma_directory).used + total = psutil.disk_usage(self._plasma_directory).total + if used_size + size > total * 0.95: # pragma: no cover + raise plasma.PlasmaStoreFull + + def _generate_object_id(self): + while True: + new_id = plasma.ObjectID.from_random() + if not self._client.contains(new_id): + return new_id + + @implements(StorageBackend.get) + async def get(self, object_id, **kwargs) -> object: + if kwargs: # pragma: no cover + raise NotImplementedError(f'Got unsupported args: {",".join(kwargs)}') + + if not self._client.contains(object_id): # pragma: no cover + raise DataNotExist(f"Data {object_id} not exists") + + plasma_file = PlasmaFileObject(self._client, object_id, mode="r") + + async with StorageFileObject(plasma_file, object_id) as f: + deserializer = AioDeserializer(f) + return await deserializer.run() + + @implements(StorageBackend.put) + async def put(self, obj, importance=0) -> ObjectInfo: + object_id = self._generate_object_id() + + serializer = AioSerializer(obj) + buffers = await serializer.run() + 
buffer_size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + + plasma_file = PlasmaFileObject( + self._client, object_id, mode="w", size=buffer_size + ) + async with StorageFileObject(plasma_file, object_id) as f: + for buffer in buffers: + await f.write(buffer) + + return PlasmaObjectInfo( + size=buffer_size, + object_id=object_id, + buffer=plasma_file.buffer, + plasma_socket=self._plasma_socket, + ) + + @implements(StorageBackend.delete) + async def delete(self, object_id): + self._client.delete([object_id]) + + @implements(StorageBackend.object_info) + async def object_info(self, object_id) -> ObjectInfo: + buf = self._client.get_buffers([object_id])[0] + return PlasmaObjectInfo( + size=buf.size, + object_id=object_id, + buffer=buf, + plasma_socket=self._plasma_socket, + ) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + if size is None: # pragma: no cover + raise ValueError("size must be provided for plasma backend") + + new_id = self._generate_object_id() + plasma_writer = PlasmaFileObject(self._client, new_id, size=size, mode="w") + return PlasmaStorageFileObject(plasma_writer, object_id=new_id) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + if not self._client.contains(object_id): # pragma: no cover + raise DataNotExist(f"Data {object_id} not exists") + plasma_reader = PlasmaFileObject(self._client, object_id, mode="r") + return PlasmaStorageFileObject(plasma_reader, object_id=object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: + return list(self._client.list()) diff --git a/python/xorbits/_mars/storage/ray.py b/python/xorbits/_mars/storage/ray.py new file mode 100644 index 000000000..46b4c4db4 --- /dev/null +++ b/python/xorbits/_mars/storage/ray.py @@ -0,0 +1,248 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Dict, List, Tuple + +from ..lib import sparse +from ..metrics import Metrics, Percentile, record_time_cost_percentile +from ..oscar.debug import debug_async_timeout +from ..utils import implements, lazy_import, register_ray_serializer +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import BufferWrappedFileObject, StorageFileObject + +ray = lazy_import("ray") + + +# TODO(fyrestone): make the SparseMatrix pickleable. 
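+# As the helper pair below suggests, a SparseMatrix travels through Ray as the
+# pair [shape, spmatrix] and is rebuilt from those two pieces on the receiving
+# side, since the object itself cannot be pickled directly yet.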
+ + +def _mars_sparse_matrix_serializer(value): + return [value.shape, value.spmatrix] + + +def _mars_sparse_matrix_deserializer(obj) -> sparse.SparseNDArray: + shape, spmatrix = obj + return sparse.matrix.SparseMatrix(spmatrix, shape=shape) + + +def _register_sparse_matrix_serializer(): + # register a custom serializer for Mars SparseMatrix + register_ray_serializer( + sparse.matrix.SparseMatrix, + serializer=_mars_sparse_matrix_serializer, + deserializer=_mars_sparse_matrix_deserializer, + ) + + +class RayFileLikeObject: + def __init__(self): + self._buffers = [] + self._size = 0 + + def write(self, content: bytes): + self._buffers.append(content) + self._size += len(content) + + def readinto(self, buffer): + read_bytes = 0 + for b in self._buffers: + read_pos = read_bytes + len(b) + buffer[read_bytes:read_pos] = b + read_bytes = read_pos + return read_bytes + + def close(self): + self._buffers.clear() + self._size = 0 + + def tell(self): + return self._size + + +class RayFileObject(BufferWrappedFileObject): + def __init__(self, object_id: Any, mode: str): + super().__init__(object_id, mode, size=0) + + def _write_init(self): + self._buffer = RayFileLikeObject() + + def _read_init(self): + self._buffer = ray.get(self._object_id) + self._mv = memoryview(self._buffer) + self._size = len(self._buffer) + + def write(self, content: bytes): + if not self._initialized: + self._write_init() + self._initialized = True + + return self._buffer.write(content) + + def _write_close(self): + worker = ray.worker.global_worker + metadata = ray.ray_constants.OBJECT_METADATA_TYPE_RAW + args = [metadata, self._buffer.tell(), self._buffer, self._object_id] + try: + worker.core_worker.put_file_like_object(*args) + except TypeError: + args.append(None) # owner_address for ray >= 1.3.0 + worker.core_worker.put_file_like_object(*args) + + def _read_close(self): + pass + + +_support_specify_owner = None + + +def support_specify_owner(): + global _support_specify_owner + if _support_specify_owner is None: + sig = inspect.signature(ray.put) + _support_specify_owner = "_owner" in sig.parameters + return _support_specify_owner + + +@register_storage_backend +class RayStorage(StorageBackend): + name = "ray" + is_seekable = False + + def __init__(self, *args, **kwargs): + self._owner_address = kwargs.get("owner") + self._owner = None # A ray actor which will own the objects put by workers. 
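+        # Each entry below is a (percentile type, gauge callback, report count)
+        # triple consumed by record_time_cost_percentile(): get/put latencies
+        # are accumulated and the P99/P95/P90 values are reported once every
+        # 1000 recorded calls.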
+ self._storage_get_metrics = [ + ( + Percentile.PercentileType.P99, + Metrics.gauge( + "mars.storage.ray.get_cost_time_p99_seconds", + "P99 time consuming in seconds to get object, every 1000 times report once.", + ).record, + 1000, + ), + ( + Percentile.PercentileType.P95, + Metrics.gauge( + "mars.storage.ray.get_cost_time_p95_seconds", + "P95 time consuming in seconds to get object, every 1000 times report once.", + ).record, + 1000, + ), + ( + Percentile.PercentileType.P90, + Metrics.gauge( + "mars.storage.ray.get_cost_time_p90_seconds", + "P90 time consuming in seconds to get object, every 1000 times report once.", + ).record, + 1000, + ), + ] + + self._storage_put_metrics = [ + ( + Percentile.PercentileType.P99, + Metrics.gauge( + "mars.storage.ray.put_cost_time_p99_seconds", + "P99 time consuming in seconds to put object, every 1000 times report once.", + ).record, + 1000, + ), + ( + Percentile.PercentileType.P95, + Metrics.gauge( + "mars.storage.ray.put_cost_time_p95_seconds", + "P95 time consuming in seconds to put object, every 1000 times report once.", + ).record, + 1000, + ), + ( + Percentile.PercentileType.P90, + Metrics.gauge( + "mars.storage.ray.put_cost_time_p90_seconds", + "P90 time consuming in seconds to put object, every 1000 times report once.", + ).record, + 1000, + ), + ] + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + _register_sparse_matrix_serializer() + return kwargs, dict() + + @staticmethod + @implements(StorageBackend.teardown) + async def teardown(**kwargs): + pass + + @property + @implements(StorageBackend.level) + def level(self) -> StorageLevel: + # TODO(fyrestone): return StorageLevel.MEMORY & StorageLevel.DISK + # if object spilling is available. + return StorageLevel.MEMORY | StorageLevel.REMOTE + + @implements(StorageBackend.get) + async def get(self, object_id, **kwargs) -> object: + if kwargs: # pragma: no cover + raise NotImplementedError(f'Got unsupported args: {",".join(kwargs)}') + with debug_async_timeout( + "ray_object_retrieval_timeout", + "Storage get object timeout, ObjectRef: %s", + object_id, + ): + with record_time_cost_percentile(self._storage_get_metrics): + return await object_id + + @implements(StorageBackend.put) + async def put(self, obj, importance=0) -> ObjectInfo: + with record_time_cost_percentile(self._storage_put_metrics): + if support_specify_owner() and self._owner_address: + if not self._owner: + self._owner = ray.get_actor(self._owner_address) + object_id = ray.put(obj, _owner=self._owner) + else: + object_id = ray.put(obj) + # We can't get the serialized bytes length from ray.put + return ObjectInfo(object_id=object_id) + + @implements(StorageBackend.delete) + async def delete(self, object_id): + ray.internal.free(object_id) + + @implements(StorageBackend.object_info) + async def object_info(self, object_id) -> ObjectInfo: + # The performance of obtaining the object size is poor. 
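+        # So, as in put(), the returned ObjectInfo deliberately leaves the
+        # size unset.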
+ return ObjectInfo(object_id=object_id) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + new_id = ray.ObjectRef.from_random() + ray_writer = RayFileObject(new_id, mode="w") + return StorageFileObject(ray_writer, object_id=new_id) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + ray_reader = RayFileObject(object_id, mode="r") + return StorageFileObject(ray_reader, object_id=object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: + raise NotImplementedError("Ray storage does not support list") + + @implements(StorageBackend.fetch) + async def fetch(self, object_id): + pass diff --git a/python/xorbits/_mars/storage/shared_memory.py b/python/xorbits/_mars/storage/shared_memory.py new file mode 100644 index 000000000..24bf8fe0e --- /dev/null +++ b/python/xorbits/_mars/storage/shared_memory.py @@ -0,0 +1,224 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import random +import struct +import sys +from dataclasses import dataclass +from string import ascii_letters, digits +from typing import Any, Dict, List, Optional, Tuple + +try: + if sys.version_info[:2] >= (3, 8): + # builtin package for Python 3.8+ + from multiprocessing.shared_memory import SharedMemory + else: + # backport package for Python 3.7- + from shared_memory import SharedMemory + + class SharedMemoryForRead(SharedMemory): + def __del__(self): + # close fd only + fd = self._fd + if os.name != "nt" and fd >= 0: + os.close(fd) + +except ImportError: # pragma: no cover + # allow shared_memory package to be absent + SharedMemory = SharedMemoryForRead = None + +from ..serialization import AioDeserializer, AioSerializer +from ..utils import dataslots, implements +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import BufferWrappedFileObject, StorageFileObject + +_is_windows: bool = sys.platform.startswith("win") +_qword_pack = struct.Struct(" Tuple[Dict, Dict]: + if kwargs: # pragma: no cover + raise TypeError( + f'SharedMemoryStorage got unexpected config: {",".join(kwargs)}' + ) + + return dict(), dict() + + @staticmethod + @implements(StorageBackend.teardown) + async def teardown(**kwargs): + object_ids = kwargs.get("object_ids") or () + for object_id in object_ids: + try: + shm = SharedMemory(name=object_id) + shm.unlink() + await asyncio.sleep(0) + except FileNotFoundError: + pass + + @property + @implements(StorageBackend.level) + def level(self) -> StorageLevel: + return StorageLevel.MEMORY + + @classmethod + def _generate_object_id(cls): + return "".join(random.choice(ascii_letters + digits) for _ in range(30)) + + @implements(StorageBackend.get) + async def get(self, object_id, **kwargs) -> object: + if kwargs: # pragma: no cover + raise NotImplementedError(f'Got unsupported args: {",".join(kwargs)}') + + shm_file = SharedMemoryFileObject(object_id, mode="r") + + async with 
StorageFileObject(shm_file, object_id) as f: + deserializer = AioDeserializer(f) + return await deserializer.run() + + @implements(StorageBackend.put) + async def put(self, obj, importance=0) -> ObjectInfo: + object_id = self._generate_object_id() + + serializer = AioSerializer(obj) + buffers = await serializer.run() + buffer_size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + + shm_file = SharedMemoryFileObject(object_id, mode="w", size=buffer_size) + async with StorageFileObject(shm_file, object_id) as f: + for buffer in buffers: + await f.write(buffer) + + self._object_ids.add(object_id) + if _is_windows: + return WinShmObjectInfo( + size=buffer_size, object_id=object_id, shm=shm_file.shm + ) + else: + return ObjectInfo(size=buffer_size, object_id=object_id) + + @implements(StorageBackend.delete) + async def delete(self, object_id): + try: + shm = SharedMemory(name=object_id) + shm.unlink() + shm.close() + except FileNotFoundError: + if sys.platform == "win32": + # skip file not found error for windows + pass + else: # pragma: no cover + raise + try: + self._object_ids.remove(object_id) + except KeyError: # pragma: no cover + return + + @implements(StorageBackend.object_info) + async def object_info(self, object_id) -> ObjectInfo: + shm_file = SharedMemoryFileObject(object_id, mode="r") + + async with ShmStorageFileObject(shm_file, object_id) as f: + deserializer = AioDeserializer(f) + size = await deserializer.get_size() + if not _is_windows: + return ObjectInfo(size=size, object_id=object_id) + else: + return WinShmObjectInfo(size=size, object_id=object_id, shm=shm_file) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + if size is None: # pragma: no cover + raise ValueError("size must be provided for shared memory backend") + + new_id = self._generate_object_id() + shm_file = SharedMemoryFileObject(new_id, size=size, mode="w") + return ShmStorageFileObject(shm_file, object_id=new_id) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + shm_file = SharedMemoryFileObject(object_id, mode="r") + return ShmStorageFileObject(shm_file, object_id=object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: # pragma: no cover + raise NotImplementedError("Shared memory storage does not support list") diff --git a/python/xorbits/_mars/storage/tests/__init__.py b/python/xorbits/_mars/storage/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/storage/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/storage/tests/test_base.py b/python/xorbits/_mars/storage/tests/test_base.py new file mode 100644 index 000000000..b6fca7dcd --- /dev/null +++ b/python/xorbits/_mars/storage/tests/test_base.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .. import StorageLevel + + +def test_storage_level(): + s = "memory" + assert StorageLevel.MEMORY == StorageLevel.from_str(s) + + s = "disk | memory" + assert StorageLevel.DISK | StorageLevel.MEMORY == StorageLevel.from_str(s) + + s = " MEMORY|REMOTE " + assert StorageLevel.MEMORY | StorageLevel.REMOTE == StorageLevel.from_str(s) diff --git a/python/xorbits/_mars/storage/tests/test_libs.py b/python/xorbits/_mars/storage/tests/test_libs.py new file mode 100644 index 000000000..047cd3397 --- /dev/null +++ b/python/xorbits/_mars/storage/tests/test_libs.py @@ -0,0 +1,330 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pkgutil +import sys +import tempfile + +import numpy as np +import pandas as pd +import pytest +import scipy.sparse as sps + +from ...lib.filesystem import LocalFileSystem +from ...lib.sparse import SparseMatrix, SparseNDArray +from ...serialization import AioDeserializer, AioSerializer +from ...tests.core import require_cudf, require_cupy, require_ray +from ..base import StorageLevel +from ..cuda import CudaStorage +from ..filesystem import DiskStorage +from ..plasma import PlasmaStorage +from ..ray import RayStorage +from ..shared_memory import SharedMemoryStorage +from ..vineyard import VineyardStorage + +try: + import vineyard +except ImportError: + vineyard = None +try: + import ray +except ImportError: + ray = None + +require_lib = lambda x: x +params = [ + "filesystem", + "shared_memory", +] +if ( + not sys.platform.startswith("win") + and pkgutil.find_loader("pyarrow.plasma") is not None +): + params.append("plasma") +if vineyard is not None: + params.append("vineyard") +if ray is not None: + params.append("ray") + require_lib = require_ray + + +@pytest.mark.parametrize( + "ray_start_regular", [{"enable": ray is not None}], indirect=True +) +@pytest.fixture(params=params) +async def storage_context(ray_start_regular, request): + if request.param == "filesystem": + tempdir = tempfile.mkdtemp() + params, teardown_params = await DiskStorage.setup( + fs=LocalFileSystem(), root_dirs=[tempdir] + ) + storage = DiskStorage(**params) + assert storage.level == StorageLevel.DISK + + yield storage + + await storage.teardown(**teardown_params) + elif request.param == "plasma": + plasma_storage_size = 10 * 1024 * 1024 + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + params, teardown_params = await PlasmaStorage.setup( + 
store_memory=plasma_storage_size, + plasma_directory=plasma_dir, + check_dir_size=False, + ) + storage = PlasmaStorage(**params) + assert storage.level == StorageLevel.MEMORY + + yield storage + + await PlasmaStorage.teardown(**teardown_params) + elif request.param == "vineyard": + vineyard_size = "256M" + params, teardown_params = await VineyardStorage.setup( + vineyard_size=vineyard_size + ) + storage = VineyardStorage(**params) + assert storage.level == StorageLevel.MEMORY + + yield storage + + await VineyardStorage.teardown(**teardown_params) + elif request.param == "shared_memory": + params, teardown_params = await SharedMemoryStorage.setup() + storage = SharedMemoryStorage(**params) + assert storage.level == StorageLevel.MEMORY + + yield storage + + teardown_params["object_ids"] = storage._object_ids + await SharedMemoryStorage.teardown(**teardown_params) + elif request.param == "ray": + params, teardown_params = await RayStorage.setup() + storage = RayStorage(**params) + assert storage.level == StorageLevel.MEMORY | StorageLevel.REMOTE + + yield storage + + await RayStorage.teardown(**teardown_params) + + +def test_storage_level(): + level = StorageLevel.DISK | StorageLevel.MEMORY + assert level == StorageLevel.DISK.value | StorageLevel.MEMORY.value + + assert (StorageLevel.DISK | StorageLevel.MEMORY) & StorageLevel.DISK + assert not (StorageLevel.DISK | StorageLevel.MEMORY) & StorageLevel.GPU + + assert StorageLevel.GPU < StorageLevel.MEMORY < StorageLevel.DISK + assert StorageLevel.DISK > StorageLevel.MEMORY > StorageLevel.GPU + + +@pytest.mark.asyncio +@require_lib +@pytest.mark.parametrize( + "ray_start_regular", [{"enable": ray is not None}], indirect=True +) +async def test_base_operations(ray_start_regular, storage_context): + storage = storage_context + + data1 = np.random.rand(10, 10) + put_info1 = await storage.put(data1) + get_data1 = await storage.get(put_info1.object_id) + np.testing.assert_array_equal(data1, get_data1) + + info1 = await storage.object_info(put_info1.object_id) + # FIXME: remove os check when size issue fixed + assert info1.size == put_info1.size + + data2 = pd.DataFrame( + { + "col1": np.arange(10), + "col2": [f"str{i}" for i in range(10)], + "col3": np.random.rand(10), + }, + ) + put_info2 = await storage.put(data2) + get_data2 = await storage.get(put_info2.object_id) + pd.testing.assert_frame_equal(data2, get_data2) + + info2 = await storage.object_info(put_info2.object_id) + # FIXME: remove os check when size issue fixed + assert info2.size == put_info2.size + + # FIXME: remove when list functionality is ready for vineyard. 
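+    # Backends whose list() is not implemented (vineyard, shared memory, Ray)
+    # skip the listing/deletion checks below.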
+ if not isinstance(storage, (VineyardStorage, SharedMemoryStorage, RayStorage)): + num = len(await storage.list()) + assert num == 2 + await storage.delete(info2.object_id) + + # test SparseMatrix + s1 = sps.csr_matrix([[1, 0, 1], [0, 0, 1]]) + s = SparseNDArray(s1) + put_info3 = await storage.put(s) + get_data3 = await storage.get(put_info3.object_id) + assert isinstance(get_data3, SparseMatrix) + np.testing.assert_array_equal(get_data3.toarray(), s1.A) + np.testing.assert_array_equal(get_data3.todense(), s1.A) + + +@pytest.mark.asyncio +@require_lib +@pytest.mark.parametrize( + "ray_start_regular", [{"enable": ray is not None}], indirect=True +) +async def test_reader_and_writer(ray_start_regular, storage_context): + storage = storage_context + + if isinstance(storage, VineyardStorage): + pytest.skip( + "open_{reader,writer} in vineyard doesn't use the DEFAULT_SERIALIZATION" + ) + + # test writer and reader + t = np.random.random(10) + buffers = await AioSerializer(t).run() + size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + async with await storage.open_writer(size=size) as writer: + for buf in buffers: + await writer.write(buf) + + async with await storage.open_reader(writer.object_id) as reader: + r = await AioDeserializer(reader).run() + + np.testing.assert_array_equal(t, r) + + # test writer and reader with seek offset + t = np.random.random(10) + buffers = await AioSerializer(t).run() + size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + async with await storage.open_writer(size=20 + size) as writer: + await writer.write(b" " * 10) + for buf in buffers: + await writer.write(buf) + await writer.write(b" " * 10) + + async with await storage.open_reader(writer.object_id) as reader: + with pytest.raises((OSError, ValueError)): + await reader.seek(-1) + + assert 5 == await reader.seek(5) + assert 10 == await reader.seek(5, os.SEEK_CUR) + assert 10 == await reader.seek(-10 - size, os.SEEK_END) + assert 10 == await reader.tell() + r = await AioDeserializer(reader).run() + + np.testing.assert_array_equal(t, r) + + +@pytest.mark.asyncio +@require_lib +@pytest.mark.parametrize( + "ray_start_regular", [{"enable": ray is not None}], indirect=True +) +async def test_reader_and_writer_vineyard(ray_start_regular, storage_context): + storage = storage_context + + if not isinstance(storage, VineyardStorage): + pytest.skip( + "open_{reader,writer} in vineyard doesn't use the DEFAULT_SERIALIZATION" + ) + + # test writer and reader + t = np.random.random(10) + tinfo = await storage.put(t) + + # testing the roundtrip of `open_{reader,writer}`. + + buffers = [] + async with await storage.open_reader(tinfo.object_id) as reader: + while True: + buf = await reader.read() + if buf: + buffers.append(buf) + else: + break + + writer_object_id = None + async with await storage.open_writer() as writer: + for buf in buffers: + await writer.write(buf) + + # The `object_id` of `StorageFileObject` returned by `open_writer` in vineyard + # storage only available after `close` and before `__exit__` of `AioFileObject`. + # + # As `StorageFileObject.object_id` is only used for testing here, I think its + # fine to have such a hack. 
+ await writer.close() + writer_object_id = writer._file._object_id + + t2 = await storage.get(writer_object_id) + np.testing.assert_array_equal(t, t2) + + +@require_cupy +@require_cudf +@pytest.mark.asyncio +async def test_cuda_backend(): + import cudf + import cupy + + params, teardown_params = await CudaStorage.setup() + storage = CudaStorage(**params) + assert storage.level == StorageLevel.GPU + + data1 = cupy.asarray(np.random.rand(10, 10)) + put_info1 = await storage.put(data1) + get_data1 = await storage.get(put_info1.object_id) + cupy.testing.assert_array_equal(data1, get_data1) + + info1 = await storage.object_info(put_info1.object_id) + assert info1.size == put_info1.size + + data2 = cudf.DataFrame( + pd.DataFrame( + { + "col1": np.arange(10), + "col2": [f"str{i}" for i in range(10)], + "col3": np.random.rand(10), + }, + ) + ) + put_info2 = await storage.put(data2) + get_data2 = await storage.get(put_info2.object_id) + cudf.testing.assert_frame_equal(data2, get_data2) + + info2 = await storage.object_info(put_info2.object_id) + assert info2.size == put_info2.size + + await CudaStorage.teardown(**teardown_params) + + # test writer and reader + read_chunk = 100 + writer = await storage.open_writer(put_info1.size) + async with await storage.open_reader(put_info1.object_id) as reader: + while True: + content = await reader.read(read_chunk) + if content: + await writer.write(content) + else: + break + writer._file._write_close() + write_data = await storage.get(writer._file._object_id) + cupy.testing.assert_array_equal(write_data, get_data1) + + await storage.delete(put_info1.object_id) diff --git a/python/xorbits/_mars/storage/vineyard.py b/python/xorbits/_mars/storage/vineyard.py new file mode 100644 index 000000000..e90eafb8c --- /dev/null +++ b/python/xorbits/_mars/storage/vineyard.py @@ -0,0 +1,220 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
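+# Vineyard-backed in-memory storage: setup() connects to an existing vineyardd
+# through the given socket or starts a local instance, put()/get() go directly
+# through the vineyard client, and open_reader()/open_writer() stream data via
+# vineyard's pickled reader/writer (see VineyardFileObject below). A custom
+# builder/resolver pair is registered so Mars sparse matrices are stored
+# natively.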
+ +import asyncio +import logging +import sys +from io import UnsupportedOperation +from typing import Dict, List, Optional, Tuple + +from ..lib import sparse +from ..resource import virtual_memory +from ..utils import calc_size_by_str, implements, lazy_import +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import BufferWrappedFileObject, StorageFileObject + +vineyard = lazy_import("vineyard") +vy_data_pickle = lazy_import("vineyard.data.pickle", rename="vy_data_pickle") +vy_data_utils = lazy_import("vineyard.data.utils", rename="vy_data_utils") +pyarrow = lazy_import("pyarrow") + +if sys.platform.startswith("win"): + vineyard = vy_data_pickle = vy_data_utils = None + +logger = logging.getLogger(__name__) + +# Setup support for mars datatypes on vineyard + + +def mars_sparse_matrix_builder(client, value, builder, **kw): + meta = vineyard.ObjectMeta() + meta["typename"] = "vineyard::SparseMatrix<%s>" % value.dtype.name + meta["shape_"] = vy_data_utils.to_json(value.shape) + meta.add_member("spmatrix", builder.run(client, value.spmatrix, **kw)) + return client.create_metadata(meta) + + +def mars_sparse_matrix_resolver(obj, resolver) -> sparse.SparseNDArray: + meta = obj.meta + shape = vy_data_utils.from_json(meta["shape_"]) + spmatrix = resolver.run(obj.member("spmatrix")) + return sparse.matrix.SparseMatrix(spmatrix, shape=shape) + + +def _register_vineyard_matrices(): + vineyard.core.default_builder_context.register( + sparse.matrix.SparseMatrix, mars_sparse_matrix_builder + ) + vineyard.core.default_resolver_context.register( + "vineyard::SparseMatrix", mars_sparse_matrix_resolver + ) + + +class VineyardFileObject(BufferWrappedFileObject): + def __init__( + self, vineyard_client, object_id, mode: str, size: Optional[int] = None + ): + self._client = vineyard_client + self._file = None + + self._reader = None + self._writer = None + + if size is None: + size = -1 # unknown estimated size. 
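+            # A negative size only marks it as unknown; for reads the actual
+            # size is filled in from the reader's store_size in _read_init().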
+ + super().__init__(object_id, mode, size=size) + + def _read_init(self): + self._reader = vy_data_pickle.PickledReader(self._client.get(self._object_id)) + self._size = self._reader.store_size + + def _write_init(self): + self._writer = vy_data_pickle.PickledWriter(self._size) + + @property + def buffer(self): + raise UnsupportedOperation( + "VineyardFileObject doesn't support the direct 'buffer' property" + ) + + def read(self, size=-1): + if not self._initialized: + self._read_init() + self._initialized = True + return self._reader.read(size) + + def write(self, content: bytes): + if not self._initialized: + self._write_init() + self._initialized = True + return self._writer.write(content) + + def _read_close(self): + self._reader = None + + def _write_close(self): + self._writer.close() + self._object_id = self._client.put(self._writer.value) + self._writer = None + + +@register_storage_backend +class VineyardStorage(StorageBackend): + name = "vineyard" + is_seekable = False + + def __init__(self, vineyard_size: int, vineyard_socket: str = None): + _register_vineyard_matrices() + + self._size = vineyard_size + self._vineyard_socket = vineyard_socket + self._client = vineyard.connect(vineyard_socket) + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + loop = asyncio.get_running_loop() + etcd_endpoints = kwargs.pop("etcd_endpoints", "127.0.0.1:2379") + etcd_prefix = kwargs.pop("etcd_prefix", "vineyard") + vineyard_size = kwargs.pop("vineyard_size", "1Gi") + vineyard_socket = kwargs.pop("vineyard_socket", None) + vineyardd_path = kwargs.pop("vineyardd_path", None) + + if kwargs: + raise TypeError( + f'VineyardStorage got unexpected config: {",".join(kwargs)}' + ) + + vineyard_size = calc_size_by_str(vineyard_size, virtual_memory().total) + if vineyard_socket is not None: # pragma: no cover + vineyard_store = None + else: + vineyard_store = vineyard.deploy.local.start_vineyardd( + etcd_endpoints, + etcd_prefix, + vineyardd_path, + vineyard_size, + vineyard_socket, + rpc=False, + ) + vineyard_socket = ( + await loop.run_in_executor(None, vineyard_store.__enter__) + )[1] + init_params = dict(vineyard_size=vineyard_size, vineyard_socket=vineyard_socket) + teardown_params = dict(vineyard_store=vineyard_store) + return init_params, teardown_params + + @staticmethod + @implements(StorageBackend.teardown) + async def teardown(**kwargs): + vineyard_store = kwargs.get("vineyard_store") + if vineyard_store is not None: + vineyard_store.__exit__(None, None, None) + + @property + @implements(StorageBackend.level) + def level(self) -> StorageLevel: + return StorageLevel.MEMORY + + @property + @implements(StorageBackend.size) + def size(self) -> Optional[int]: + return self._size + + @property + @implements(StorageBackend.backend_info) + def backend_info(self): + return { + "name": self.name, + "socket": self._vineyard_socket, + "instance_id": self._client.instance_id, + } + + @implements(StorageBackend.get) + async def get(self, object_id, **kwargs) -> object: + if kwargs: # pragma: no cover + raise NotImplementedError(f'Got unsupported args: {",".join(kwargs)}') + + return self._client.get(object_id) + + @implements(StorageBackend.put) + async def put(self, obj, importance: int = 0) -> ObjectInfo: + object_id = self._client.put(obj) + size = self._client.get_meta(object_id).nbytes + return ObjectInfo(size=size, object_id=object_id) + + @implements(StorageBackend.delete) + async def delete(self, object_id): + self._client.delete([object_id], 
deep=True) + + @implements(StorageBackend.object_info) + async def object_info(self, object_id) -> ObjectInfo: + size = self._client.get_meta(object_id).nbytes + return ObjectInfo(size=size, object_id=object_id) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + vineyard_writer = VineyardFileObject(self._client, None, size=size, mode="w") + return StorageFileObject(vineyard_writer, object_id=None) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + vineyard_reader = VineyardFileObject(self._client, object_id, mode="r") + return StorageFileObject(vineyard_reader, object_id=object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: + # FIXME: vineyard's list_objects not equal to plasma + raise NotImplementedError diff --git a/python/xorbits/_mars/supervisor.py b/python/xorbits/_mars/supervisor.py new file mode 100644 index 000000000..183516a6c --- /dev/null +++ b/python/xorbits/_mars/supervisor.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# shortcut to support +# python -m mars.supervisor + +from .deploy.oscar.supervisor import main +from .utils import ensure_coverage + +if __name__ == "__main__": + ensure_coverage() + main() diff --git a/python/xorbits/_mars/tensor/__init__.py b/python/xorbits/_mars/tensor/__init__.py new file mode 100644 index 000000000..64c5e0539 --- /dev/null +++ b/python/xorbits/_mars/tensor/__init__.py @@ -0,0 +1,380 @@ +# isort: skip_file +# Copyright 1999-2021 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
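+# This module assembles a NumPy-like namespace for tensors: creation routines,
+# element-wise arithmetic, reductions, linalg, random, fft and the NumPy scalar
+# types are all re-exported here. Typical usage, as in the docstrings
+# throughout this package, looks roughly like:
+#
+#     import mars.tensor as mt
+#
+#     t = mt.ones((3, 3)) + mt.arange(3)
+#     mt.sum(t).execute()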
+ + +from .datasource import ( + tensor, + array, + asarray, + ascontiguousarray, + asfortranarray, + scalar, + empty, + empty_like, + ones, + ones_like, + zeros, + zeros_like, + full, + full_like, + arange, + diag, + diagflat, + eye, + identity, + linspace, + meshgrid, + indices, + tril, + triu, + fromtiledb, + fromtiledb as from_tiledb, + from_dataframe, + fromhdf5, + fromhdf5 as from_hdf5, + fromzarr, + fromzarr as from_zarr, + fromvineyard, + fromvineyard as from_vineyard, +) +from .datastore import ( + totiledb, + totiledb as to_tiledb, + tohdf5, + tohdf5 as to_hdf5, + tozarr, + tozarr as to_zarr, + tovineyard, + tovineyard as to_vineyard, +) # pylint: disable=reimported +from .base import ( + result_type, + ndim, + copyto, + transpose, + where, + broadcast_to, + broadcast_arrays, + expand_dims, + rollaxis, + swapaxes, + moveaxis, + ravel, + atleast_1d, + atleast_2d, + atleast_3d, + argwhere, + array_split, + split, + hsplit, + vsplit, + dsplit, + roll, + squeeze, + diff, + ediff1d, + flip, + flipud, + fliplr, + repeat, + tile, + isin, + searchsorted, + unique, + sort, + argsort, + partition, + argpartition, + topk, + argtopk, + copy, + trapz, + shape, + insert, + delete, + in1d, + setdiff1d, +) +from .arithmetic import ( + add, + subtract, + multiply, + divide, + truediv as true_divide, + floordiv as floor_divide, + mod, + power, + float_power, + fmod, + sqrt, + around, + round_, + round_ as round, + logaddexp, + logaddexp2, + negative, + positive, + absolute, + fabs, + absolute as abs, + rint, + sign, + degrees, + radians, + conj, + conjugate, + exp, + exp2, + log, + log2, + log10, + expm1, + log1p, + square, + cbrt, + reciprocal, + equal, + not_equal, + less, + less_equal, + greater, + greater_equal, + sin, + cos, + tan, + arcsin, + arccos, + arctan, + arctan2, + hypot, + sinh, + cosh, + tanh, + arcsinh, + arccosh, + arctanh, + deg2rad, + rad2deg, + bitand as bitwise_and, + bitor as bitwise_or, + bitxor as bitwise_xor, + invert, + invert as bitwise_not, + lshift as left_shift, + rshift as right_shift, + logical_and, + logical_or, + logical_xor, + logical_not, + maximum, + minimum, + floor, + ceil, + trunc, + remainder, + fmax, + fmin, + isfinite, + isinf, + isnan, + signbit, + copysign, + nextafter, + spacing, + clip, + isclose, + ldexp, + frexp, + modf, + angle, + isreal, + iscomplex, + real, + imag, + fix, + i0, + sinc, + nan_to_num, + tree_add, + tree_multiply, +) +from .statistics import ( + average, + bincount, + cov, + corrcoef, + digitize, + ptp, + histogram_bin_edges, + histogram, + median, + quantile, + percentile, +) +from .linalg.tensordot import tensordot +from .linalg.dot import dot +from .linalg.inner import inner, innerproduct +from .linalg.vdot import vdot +from .linalg.matmul import matmul +from .reduction import ( + sum, + nansum, + prod, + prod as product, + nanprod, + max, + max as amax, + nanmax, + min, + min as amin, + nanmin, + all, + any, + mean, + nanmean, + argmax, + nanargmax, + argmin, + nanargmin, + cumsum, + cumprod, + var, + std, + nanvar, + nanstd, + nancumsum, + nancumprod, + count_nonzero, + allclose, + array_equal, +) +from .reshape import reshape +from .merge import ( + concatenate, + stack, + hstack, + vstack, + dstack, + column_stack, + union1d, + block, + append, +) +from .indexing import ( + take, + compress, + extract, + choose, + unravel_index, + nonzero, + flatnonzero, + fill_diagonal, +) +from .rechunk import rechunk +from .einsum import einsum +from .images import imread + +# noinspection PyUnresolvedReferences +from .lib.index_tricks 
import mgrid, ogrid, ndindex, r_, c_ + +from . import random +from . import fft +from . import linalg +from . import lib +from . import special +from . import stats + +# types +from .core import Tensor + +# noinspection PyUnresolvedReferences +from ..core import ExecutableTuple + +# noinspection PyUnresolvedReferences +from numpy import ( + newaxis, + AxisError, + inf, + Inf, + NINF, + nan, + NAN, + NaN, + pi, + e, + errstate, + geterr, + seterr, +) + +# import numpy types +# noinspection PyUnresolvedReferences +from numpy import ( + dtype, + number, + inexact, + floating, + complexfloating, + integer, + signedinteger, + unsignedinteger, + character, + generic, + flexible, + int_ as int, + bool_ as bool, + float_ as float, + cfloat, + bytes_, + unicode_, + void, + object_ as object, + intc, + intp, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, + uint, + float16, + float32, + float64, + double, + complex64, + complex128, + datetime64, + timedelta64, +) + +# noinspection PyUnresolvedReferences +from numpy import finfo + +# register fuse op and fetch op +from .fuse import TensorFuseChunk, TensorCpFuseChunk, TensorNeFuseChunk +from .fetch import TensorFetch, TensorFetchShuffle +from . import ufunc + +del ( + TensorFuseChunk, + TensorCpFuseChunk, + TensorNeFuseChunk, + TensorFetch, + TensorFetchShuffle, + ufunc, +) diff --git a/python/xorbits/_mars/tensor/arithmetic/__init__.py b/python/xorbits/_mars/tensor/arithmetic/__init__.py new file mode 100644 index 000000000..5f05cc287 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/__init__.py @@ -0,0 +1,313 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
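+# Importing this package re-exports every element-wise operator and, via the
+# _install() call further down, wires the Python operator protocol (__add__,
+# __mul__, comparisons, bitwise ops, ...) onto the tensor types so that
+# expressions like `a + b` dispatch to the corresponding tensor ops.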
+ +from ...core import is_build_mode +from .abs import TensorAbs, abs +from .absolute import TensorAbsolute, absolute +from .add import TensorAdd, TensorTreeAdd, add, tree_add +from .angle import TensorAngle, angle +from .arccos import TensorArccos, arccos +from .arccosh import TensorArccosh, arccosh +from .arcsin import TensorArcsin, arcsin +from .arcsinh import TensorArcsinh, arcsinh +from .arctan import TensorArctan, arctan +from .arctan2 import TensorArctan2, arctan2 +from .arctanh import TensorArctanh, arctanh +from .around import TensorAround +from .around import around +from .around import around as round_ +from .bitand import TensorBitand, bitand +from .bitor import TensorBitor, bitor +from .bitxor import TensorBitxor, bitxor +from .cbrt import TensorCbrt, cbrt +from .ceil import TensorCeil, ceil +from .clip import TensorClip, clip +from .conj import TensorConj +from .conj import conj +from .conj import conj as conjugate +from .copysign import TensorCopysign, copysign +from .cos import TensorCos, cos +from .cosh import TensorCosh, cosh +from .deg2rad import TensorDeg2rad, deg2rad +from .degrees import TensorDegrees, degrees +from .divide import TensorDivide, divide +from .equal import TensorEqual, equal +from .exp import TensorExp, exp +from .exp2 import TensorExp2, exp2 +from .expm1 import TensorExpm1, expm1 +from .fabs import TensorFabs, fabs +from .fix import TensorFix, fix +from .float_power import TensorFloatPower, float_power +from .floor import TensorFloor, floor +from .floordiv import TensorFloorDiv, floordiv +from .fmax import TensorFMax, fmax +from .fmin import TensorFMin, fmin +from .fmod import TensorFMod, fmod +from .frexp import TensorFrexp, frexp +from .greater import TensorGreaterThan, greater +from .greater_equal import TensorGreaterEqual, greater_equal +from .hypot import TensorHypot, hypot +from .i0 import TensorI0, i0 +from .imag import TensorImag, imag +from .invert import TensorInvert, invert +from .isclose import TensorIsclose, isclose +from .iscomplex import TensorIsComplex, iscomplex +from .isfinite import TensorIsFinite, isfinite +from .isinf import TensorIsInf, isinf +from .isnan import TensorIsNan, isnan +from .isreal import TensorIsReal, isreal +from .ldexp import TensorLdexp, ldexp +from .less import TensorLessThan, less +from .less_equal import TensorLessEqual, less_equal +from .log import TensorLog, log +from .log1p import TensorLog1p, log1p +from .log2 import TensorLog2, log2 +from .log10 import TensorLog10, log10 +from .logaddexp import TensorLogAddExp, logaddexp +from .logaddexp2 import TensorLogAddExp2, logaddexp2 +from .logical_and import TensorAnd, logical_and +from .logical_not import TensorNot, logical_not +from .logical_or import TensorOr, logical_or +from .logical_xor import TensorXor, logical_xor +from .lshift import TensorLshift, lshift +from .maximum import TensorMaximum, maximum +from .minimum import TensorMinimum, minimum +from .mod import TensorMod +from .mod import mod +from .mod import mod as remainder +from .modf import TensorModf, modf +from .multiply import TensorMultiply, TensorTreeMultiply, multiply, tree_multiply +from .nan_to_num import TensorNanToNum, nan_to_num +from .negative import TensorNegative, negative +from .nextafter import TensorNextafter, nextafter +from .not_equal import TensorNotEqual, not_equal +from .positive import TensorPositive, positive +from .power import TensorPower, power +from .rad2deg import TensorRad2deg, rad2deg +from .radians import TensorRadians, radians +from .real import TensorReal, real +from 
.reciprocal import TensorReciprocal, reciprocal +from .rint import TensorRint, rint +from .rshift import TensorRshift, rshift +from .setimag import TensorSetImag +from .setreal import TensorSetReal +from .sign import TensorSign, sign +from .signbit import TensorSignbit, signbit +from .sin import TensorSin, sin +from .sinc import TensorSinc, sinc +from .sinh import TensorSinh, sinh +from .spacing import TensorSpacing, spacing +from .sqrt import TensorSqrt, sqrt +from .square import TensorSquare, square +from .subtract import TensorSubtract, subtract +from .tan import TensorTan, tan +from .tanh import TensorTanh, tanh +from .truediv import TensorTrueDiv, truediv +from .trunc import TensorTrunc, trunc + + +def _wrap_iop(func): + def inner(self, *args, **kwargs): + kwargs["out"] = self + return func(self, *args, **kwargs) + + return inner + + +def _install(): + from ..core import TENSOR_TYPE, Tensor, TensorData + from ..datasource import tensor as astensor + from .add import add, radd + from .bitand import bitand, rbitand + from .bitor import bitor, rbitor + from .bitxor import bitxor, rbitxor + from .divide import divide, rdivide + from .floordiv import floordiv, rfloordiv + from .lshift import lshift, rlshift + from .mod import mod, rmod + from .multiply import multiply, rmultiply + from .power import power, rpower + from .rshift import rrshift, rshift + from .subtract import rsubtract, subtract + from .truediv import rtruediv, truediv + + def _wrap_equal(func): + def eq(x1, x2, **kwargs): + if is_build_mode(): + return astensor(x1)._equals(x2) + return func(x1, x2, **kwargs) + + return eq + + for cls in TENSOR_TYPE: + setattr(cls, "__add__", add) + setattr(cls, "__iadd__", _wrap_iop(add)) + setattr(cls, "__radd__", radd) + setattr(cls, "__sub__", subtract) + setattr(cls, "__isub__", _wrap_iop(subtract)) + setattr(cls, "__rsub__", rsubtract) + setattr(cls, "__mul__", multiply) + setattr(cls, "__imul__", _wrap_iop(multiply)) + setattr(cls, "__rmul__", rmultiply) + setattr(cls, "__div__", divide) + setattr(cls, "__idiv__", _wrap_iop(divide)) + setattr(cls, "__rdiv__", rdivide) + setattr(cls, "__truediv__", truediv) + setattr(cls, "__itruediv__", _wrap_iop(truediv)) + setattr(cls, "__rtruediv__", rtruediv) + setattr(cls, "__floordiv__", floordiv) + setattr(cls, "__ifloordiv__", _wrap_iop(floordiv)) + setattr(cls, "__rfloordiv__", rfloordiv) + setattr(cls, "__pow__", power) + setattr(cls, "__ipow__", _wrap_iop(power)) + setattr(cls, "__rpow__", rpower) + setattr(cls, "__mod__", mod) + setattr(cls, "__imod__", _wrap_iop(mod)) + setattr(cls, "__rmod__", rmod) + setattr(cls, "__lshift__", lshift) + setattr(cls, "__ilshift__", _wrap_iop(lshift)) + setattr(cls, "__rlshift__", rlshift) + setattr(cls, "__rshift__", rshift) + setattr(cls, "__irshift__", _wrap_iop(rshift)) + setattr(cls, "__rrshift__", rrshift) + + setattr(cls, "__eq__", _wrap_equal(equal)) + setattr(cls, "__ne__", not_equal) + setattr(cls, "__lt__", less) + setattr(cls, "__le__", less_equal) + setattr(cls, "__gt__", greater) + setattr(cls, "__ge__", greater_equal) + setattr(cls, "__and__", bitand) + setattr(cls, "__iand__", _wrap_iop(bitand)) + setattr(cls, "__rand__", rbitand) + setattr(cls, "__or__", bitor) + setattr(cls, "__ior__", _wrap_iop(bitor)) + setattr(cls, "__ror__", rbitor) + setattr(cls, "__xor__", bitxor) + setattr(cls, "__ixor__", _wrap_iop(bitxor)) + setattr(cls, "__rxor__", rbitxor) + + setattr(cls, "__neg__", negative) + setattr(cls, "__pos__", positive) + setattr(cls, "__abs__", abs) + setattr(cls, "__invert__", 
invert) + + setattr(Tensor, "round", round_) + setattr(Tensor, "conj", conj) + setattr(Tensor, "conjugate", conjugate) + setattr(TensorData, "round", round_) + setattr(TensorData, "conj", conj) + setattr(TensorData, "conjugate", conjugate) + + +_install() +del _install + + +BIN_UFUNC = { + add, + subtract, + multiply, + divide, + truediv, + floordiv, + power, + mod, + fmod, + logaddexp, + logaddexp2, + equal, + not_equal, + less, + less_equal, + greater, + greater_equal, + arctan2, + hypot, + bitand, + bitor, + bitxor, + lshift, + rshift, + logical_and, + logical_or, + logical_xor, + maximum, + minimum, + float_power, + remainder, + fmax, + fmin, + copysign, + nextafter, + ldexp, +} + +UNARY_UFUNC = { + square, + arcsinh, + rint, + sign, + conj, + tan, + absolute, + deg2rad, + log, + fabs, + exp2, + invert, + negative, + sqrt, + arctan, + positive, + cbrt, + log10, + sin, + rad2deg, + log2, + arcsin, + expm1, + arctanh, + cosh, + sinh, + cos, + reciprocal, + tanh, + log1p, + exp, + arccos, + arccosh, + around, + logical_not, + conjugate, + isfinite, + isinf, + isnan, + signbit, + spacing, + floor, + ceil, + trunc, + degrees, + radians, + angle, + isreal, + iscomplex, + real, + imag, + fix, + i0, + sinc, + nan_to_num, +} diff --git a/python/xorbits/_mars/tensor/arithmetic/abs.py b/python/xorbits/_mars/tensor/arithmetic/abs.py new file mode 100644 index 000000000..616ff3c29 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/abs.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorAbs(TensorUnaryOp): + _op_type_ = OperandDef.ABS + _func_name = "abs" + + +@infer_dtype(np.abs) +def abs(x, out=None, where=None, **kwargs): + r""" + Calculate the absolute value element-wise. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + absolute : Tensor + An tensor containing the absolute value of + each element in `x`. For complex input, ``a + ib``, the + absolute value is :math:`\sqrt{ a^2 + b^2 }`. 
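+
+    See Also
+    --------
+    absolute
+        An equivalent function computing the same element-wise result.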
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([-1.2, 1.2]) + >>> mt.absolute(x).execute() + array([ 1.2, 1.2]) + >>> mt.absolute(1.2 + 1j).execute() + 1.5620499351813308 + """ + op = TensorAbs(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/absolute.py b/python/xorbits/_mars/tensor/arithmetic/absolute.py new file mode 100644 index 000000000..6c72132a7 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/absolute.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorAbsolute(TensorUnaryOp): + _op_type_ = OperandDef.ABSOLUTE + _func_name = "absolute" + + +@infer_dtype(np.absolute) +def absolute(x, out=None, where=None, **kwargs): + r""" + Calculate the absolute value element-wise. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + absolute : Tensor + An tensor containing the absolute value of + each element in `x`. For complex input, ``a + ib``, the + absolute value is :math:`\sqrt{ a^2 + b^2 }`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([-1.2, 1.2]) + >>> mt.absolute(x).execute() + array([ 1.2, 1.2]) + >>> mt.absolute(1.2 + 1j).execute() + 1.5620499351813308 + """ + op = TensorAbsolute(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/add.py b/python/xorbits/_mars/tensor/arithmetic/add.py new file mode 100644 index 000000000..9c6665532 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/add.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import reduce + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import BoolField +from ..array_utils import as_same_device, device +from ..datasource import scalar +from ..utils import infer_dtype +from .core import TensorBinOp, TensorMultiOp +from .utils import TreeReductionBuilder, arithmetic_operand, tree_op_estimate_size + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorAdd(TensorBinOp): + _op_type_ = OperandDef.ADD + _func_name = "add" + + +@infer_dtype(np.add) +def add(x1, x2, out=None, where=None, **kwargs): + """ + Add arguments element-wise. + + Parameters + ---------- + x1, x2 : array_like + The tensors to be added. If ``x1.shape != x2.shape``, they must be + broadcastable to a common shape (which may be the shape of one or + the other). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + add : Tensor or scalar + The sum of `x1` and `x2`, element-wise. Returns a scalar if + both `x1` and `x2` are scalars. + + Notes + ----- + Equivalent to `x1` + `x2` in terms of tensor broadcasting. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.add(1.0, 4.0).execute() + 5.0 + >>> x1 = mt.arange(9.0).reshape((3, 3)) + >>> x2 = mt.arange(3.0) + >>> mt.add(x1, x2).execute() + array([[ 0., 2., 4.], + [ 3., 5., 7.], + [ 6., 8., 10.]]) + """ + op = TensorAdd(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.add, reverse=True) +def radd(x1, x2, **kwargs): + op = TensorAdd(**kwargs) + return op.rcall(x1, x2) + + +class TensorTreeAdd(TensorMultiOp): + _op_type_ = OperandDef.TREE_ADD + _func_name = "add" + + ignore_empty_input = BoolField("ignore_empty_input", default=False) + + @classmethod + def _is_sparse(cls, *args): + if args and all(hasattr(x, "issparse") and x.issparse() for x in args): + return True + return False + + @classmethod + def execute(cls, ctx, op: "TensorTreeAdd"): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + if op.ignore_empty_input: + inputs = [inp for inp in inputs if not hasattr(inp, "size") or inp.size > 0] + + with device(device_id): + ctx[op.outputs[0].key] = reduce(xp.add, inputs) + + @classmethod + def estimate_size(cls, ctx, op): + tree_op_estimate_size(ctx, op) + + +@infer_dtype(lambda *args: reduce(np.add, args)) +def tree_add(*args, combine_size=None, **kwargs): + class MultiplyBuilder(TreeReductionBuilder): + def _build_reduction(self, inputs, final=False): + op = TensorTreeAdd(args=inputs, **kwargs) + return op(*inputs) + + args = [scalar(a) if np.isscalar(a) else a for a in args] + return MultiplyBuilder(combine_size).build(args) diff --git a/python/xorbits/_mars/tensor/arithmetic/angle.py b/python/xorbits/_mars/tensor/arithmetic/angle.py new file mode 100644 index 000000000..d8dc729d4 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/angle.py @@ -0,0 +1,88 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import BoolField +from ..array_utils import as_same_device, device +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(init=False, sparse_mode="unary") +class TensorAngle(TensorUnaryOp): + _op_type_ = OperandDef.ANGLE + _func_name = "angle" + + _deg = BoolField("deg") + + @property + def deg(self): + return self._deg + + def __init__( + self, deg=None, casting="same_kind", err=None, dtype=None, sparse=False, **kw + ): + err = err if err is not None else np.geterr() + super().__init__( + _deg=deg, _casting=casting, _err=err, dtype=dtype, sparse=sparse, **kw + ) + + @classmethod + def execute(cls, ctx, op): + (z,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.angle(z, deg=op.deg) + + +@infer_dtype(np.angle) +def angle(z, deg=False, **kwargs): + """ + Return the angle of the complex argument. + + Parameters + ---------- + z : array_like + A complex number or sequence of complex numbers. + deg : bool, optional + Return angle in degrees if True, radians if False (default). + + Returns + ------- + angle : Tensor or scalar + The counterclockwise angle from the positive real axis on + the complex plane, with dtype as numpy.float64. + + See Also + -------- + arctan2 + absolute + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.angle([1.0, 1.0j, 1+1j]).execute() # in radians + array([ 0. , 1.57079633, 0.78539816]) + >>> mt.angle(1+1j, deg=True).execute() # in degrees + 45.0 + + """ + op = TensorAngle(deg=deg, **kwargs) + return op(z) diff --git a/python/xorbits/_mars/tensor/arithmetic/arccos.py b/python/xorbits/_mars/tensor/arithmetic/arccos.py new file mode 100644 index 000000000..cd3ad8527 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arccos.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArccos(TensorUnaryOp): + _op_type_ = OperandDef.ARCCOS + + _func_name = "arccos" + + +@infer_dtype(np.arccos) +def arccos(x, out=None, where=None, **kwargs): + """ + Trigonometric inverse cosine, element-wise. + + The inverse of `cos` so that, if ``y = cos(x)``, then ``x = arccos(y)``. 
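TensorAngle above forwards the deg flag straight to xp.angle at execution time, so its behaviour matches the NumPy ufunc it names. A quick NumPy check of those semantics (plain NumPy only, no Mars session involved):

    import numpy as np

    z = np.array([1.0, 1.0j, 1 + 1j])
    print(np.angle(z))                 # radians: [0.        1.57079633 0.78539816]
    print(np.angle(1 + 1j, deg=True))  # degrees: 45.0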
+ + Parameters + ---------- + x : array_like + `x`-coordinate on the unit circle. + For real arguments, the domain is [-1, 1]. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + angle : Tensor + The angle of the ray intersecting the unit circle at the given + `x`-coordinate in radians [0, pi]. If `x` is a scalar then a + scalar is returned, otherwise an array of the same shape as `x` + is returned. + + See Also + -------- + cos, arctan, arcsin + + Notes + ----- + `arccos` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that `cos(z) = x`. The convention is to return + the angle `z` whose real part lies in `[0, pi]`. + + For real-valued input data types, `arccos` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arccos` is a complex analytic function that + has branch cuts `[-inf, -1]` and `[1, inf]` and is continuous from + above on the former and from below on the latter. + + The inverse `cos` is also known as `acos` or cos^-1. + + References + ---------- + M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 79. http://www.math.sfu.ca/~cbm/aands/ + + Examples + -------- + We expect the arccos of 1 to be 0, and of -1 to be pi: + >>> import mars.tensor as mt + + >>> mt.arccos([1, -1]).execute() + array([ 0. , 3.14159265]) + + Plot arccos: + + >>> import matplotlib.pyplot as plt + >>> x = mt.linspace(-1, 1, num=100) + >>> plt.plot(x.execute(), mt.arccos(x).execute()) + >>> plt.axis('tight') + >>> plt.show() + """ + op = TensorArccos(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arccosh.py b/python/xorbits/_mars/tensor/arithmetic/arccosh.py new file mode 100644 index 000000000..572c89992 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arccosh.py @@ -0,0 +1,89 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArccosh(TensorUnaryOp): + _op_type_ = OperandDef.ARCCOSH + _func_name = "arccosh" + + +@infer_dtype(np.arccosh) +def arccosh(x, out=None, where=None, **kwargs): + """ + Inverse hyperbolic cosine, element-wise. 
+ + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + arccosh : Tensor + Array of the same shape as `x`. + + See Also + -------- + + cosh, arcsinh, sinh, arctanh, tanh + + Notes + ----- + `arccosh` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that `cosh(z) = x`. The convention is to return the + `z` whose imaginary part lies in `[-pi, pi]` and the real part in + ``[0, inf]``. + + For real-valued input data types, `arccosh` always returns real output. + For each value that cannot be expressed as a real number or infinity, it + yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arccosh` is a complex analytical function that + has a branch cut `[-inf, 1]` and is continuous from above on it. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Inverse hyperbolic function", + http://en.wikipedia.org/wiki/Arccosh + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.arccosh([mt.e, 10.0]).execute() + array([ 1.65745445, 2.99322285]) + >>> mt.arccosh(1).execute() + 0.0 + """ + op = TensorArccosh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arcsin.py b/python/xorbits/_mars/tensor/arithmetic/arcsin.py new file mode 100644 index 000000000..8b05fd304 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arcsin.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArcsin(TensorUnaryOp): + _op_type_ = OperandDef.ARCSIN + _func_name = "arcsin" + + +@infer_dtype(np.arcsin) +def arcsin(x, out=None, where=None, **kwargs): + """ + Inverse sine, element-wise. + + Parameters + ---------- + x : array_like + `y`-coordinate on the unit circle. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. 
+ where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + angle : Tensor + The inverse sine of each element in `x`, in radians and in the + closed interval ``[-pi/2, pi/2]``. If `x` is a scalar, a scalar + is returned, otherwise a tensor. + + See Also + -------- + sin, cos, arccos, tan, arctan, arctan2, emath.arcsin + + Notes + ----- + `arcsin` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that :math:`sin(z) = x`. The convention is to + return the angle `z` whose real part lies in [-pi/2, pi/2]. + + For real-valued input data types, *arcsin* always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arcsin` is a complex analytic function that + has, by convention, the branch cuts [-inf, -1] and [1, inf] and is + continuous from above on the former and from below on the latter. + + The inverse sine is also known as `asin` or sin^{-1}. + + References + ---------- + Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*, + 10th printing, New York: Dover, 1964, pp. 79ff. + http://www.math.sfu.ca/~cbm/aands/ + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.arcsin(1).execute() # pi/2 + 1.5707963267948966 + >>> mt.arcsin(-1).execute() # -pi/2 + -1.5707963267948966 + >>> mt.arcsin(0).execute() + 0.0 + """ + op = TensorArcsin(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arcsinh.py b/python/xorbits/_mars/tensor/arithmetic/arcsinh.py new file mode 100644 index 000000000..1472c37f4 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arcsinh.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArcsinh(TensorUnaryOp): + _op_type_ = OperandDef.ARCSINH + _func_name = "arcsinh" + + +@infer_dtype(np.arcsinh) +def arcsinh(x, out=None, where=None, **kwargs): + """ + Inverse hyperbolic sine element-wise. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. 
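As the arccos and arcsin notes above state, both functions are real-valued only on [-1, 1]; outside that interval the underlying NumPy ufuncs (and hence these operands) yield nan and raise the invalid floating-point flag. A small NumPy illustration of that domain behaviour:

    import numpy as np

    x = np.array([-1.0, 0.0, 1.0])
    print(np.arcsin(x))        # [-1.57079633  0.          1.57079633]
    print(np.arccos(x))        # [ 3.14159265  1.57079633  0.        ]

    with np.errstate(invalid="ignore"):
        print(np.arcsin(1.5))  # nan: 1.5 lies outside the real domain [-1, 1]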
+ **kwargs + + Returns + ------- + out : Tensor + Tensor of of the same shape as `x`. + + Notes + ----- + `arcsinh` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that `sinh(z) = x`. The convention is to return the + `z` whose imaginary part lies in `[-pi/2, pi/2]`. + + For real-valued input data types, `arcsinh` always returns real output. + For each value that cannot be expressed as a real number or infinity, it + returns ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arccos` is a complex analytical function that + has branch cuts `[1j, infj]` and `[-1j, -infj]` and is continuous from + the right on the former and from the left on the latter. + + The inverse hyperbolic sine is also known as `asinh` or ``sinh^-1``. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Inverse hyperbolic function", + http://en.wikipedia.org/wiki/Arcsinh + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.arcsinh(mt.array([mt.e, 10.0])).execute() + array([ 1.72538256, 2.99822295]) + """ + op = TensorArcsinh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arctan.py b/python/xorbits/_mars/tensor/arithmetic/arctan.py new file mode 100644 index 000000000..1610a3880 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arctan.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArctan(TensorUnaryOp): + _op_type_ = OperandDef.ARCTAN + _func_name = "arctan" + + +@infer_dtype(np.arctan) +def arctan(x, out=None, where=None, **kwargs): + """ + Trigonometric inverse tangent, element-wise. + + The inverse of tan, so that if ``y = tan(x)`` then ``x = arctan(y)``. + + Parameters + ---------- + x : array_like + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Out has the same shape as `x`. Its real part is in + ``[-pi/2, pi/2]`` (``arctan(+/-inf)`` returns ``+/-pi/2``). + It is a scalar if `x` is a scalar. + + See Also + -------- + arctan2 : The "four quadrant" arctan of the angle formed by (`x`, `y`) + and the positive `x`-axis. + angle : Argument of complex values. 
+ + Notes + ----- + `arctan` is a multi-valued function: for each `x` there are infinitely + many numbers `z` such that tan(`z`) = `x`. The convention is to return + the angle `z` whose real part lies in [-pi/2, pi/2]. + + For real-valued input data types, `arctan` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arctan` is a complex analytic function that + has [`1j, infj`] and [`-1j, -infj`] as branch cuts, and is continuous + from the left on the former and from the right on the latter. + + The inverse tangent is also known as `atan` or tan^{-1}. + + References + ---------- + Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*, + 10th printing, New York: Dover, 1964, pp. 79. + http://www.math.sfu.ca/~cbm/aands/ + + Examples + -------- + We expect the arctan of 0 to be 0, and of 1 to be pi/4: + >>> import mars.tensor as mt + + >>> mt.arctan([0, 1]).execute() + array([ 0. , 0.78539816]) + + >>> mt.pi/4 + 0.78539816339744828 + + Plot arctan: + + >>> import matplotlib.pyplot as plt + >>> x = mt.linspace(-10, 10) + >>> plt.plot(x.execute(), mt.arctan(x).execute()) + >>> plt.axis('tight') + >>> plt.show() + """ + op = TensorArctan(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arctan2.py b/python/xorbits/_mars/tensor/arithmetic/arctan2.py new file mode 100644 index 000000000..089e0cf3d --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arctan2.py @@ -0,0 +1,126 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorArctan2(TensorBinOp): + _op_type_ = OperandDef.ARCTAN2 + _func_name = "arctan2" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + # if x1 is sparse, will be sparse always + return True + elif np.isscalar(x1) and x1 == 0: + # x1 == 0, return sparse if x2 is + return x2.issparse() if hasattr(x2, "issparse") else False + return False + + +@infer_dtype(np.arctan2) +def arctan2(x1, x2, out=None, where=None, **kwargs): + """ + Element-wise arc tangent of ``x1/x2`` choosing the quadrant correctly. + + The quadrant (i.e., branch) is chosen so that ``arctan2(x1, x2)`` is + the signed angle in radians between the ray ending at the origin and + passing through the point (1,0), and the ray ending at the origin and + passing through the point (`x2`, `x1`). (Note the role reversal: the + "`y`-coordinate" is the first function parameter, the "`x`-coordinate" + is the second.) By IEEE convention, this function is defined for + `x2` = +/-0 and for either or both of `x1` and `x2` = +/-inf (see + Notes for specific values). 
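The quadrant remark above is the practical reason arctan2 exists: plain arctan(y/x) cannot tell (1, 1) apart from (-1, -1), while arctan2 keeps the quadrant. A NumPy sketch of the difference, with the (y, x) argument order noted in the docstring:

    import numpy as np

    # Both points have y/x == 1, so plain arctan collapses them ...
    print(np.arctan(1 / 1), np.arctan(-1 / -1))  # 0.7853981... 0.7853981...
    # ... while arctan2 keeps the quadrant information.
    print(np.arctan2(1, 1))                      # pi/4   (first quadrant)
    print(np.arctan2(-1, -1))                    # -3*pi/4 (third quadrant)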
+ + This function is not defined for complex-valued arguments; for the + so-called argument of complex values, use `angle`. + + Parameters + ---------- + x1 : array_like, real-valued + `y`-coordinates. + x2 : array_like, real-valued + `x`-coordinates. `x2` must be broadcastable to match the shape of + `x1` or vice versa. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + angle : Tensor + Array of angles in radians, in the range ``[-pi, pi]``. + + See Also + -------- + arctan, tan, angle + + Notes + ----- + *arctan2* is identical to the `atan2` function of the underlying + C library. The following special values are defined in the C + standard: [1]_ + + ====== ====== ================ + `x1` `x2` `arctan2(x1,x2)` + ====== ====== ================ + +/- 0 +0 +/- 0 + +/- 0 -0 +/- pi + > 0 +/-inf +0 / +pi + < 0 +/-inf -0 / -pi + +/-inf +inf +/- (pi/4) + +/-inf -inf +/- (3*pi/4) + ====== ====== ================ + + Note that +0 and -0 are distinct floating point numbers, as are +inf + and -inf. + + References + ---------- + .. [1] ISO/IEC standard 9899:1999, "Programming language C." + + Examples + -------- + Consider four points in different quadrants: + >>> import mars.tensor as mt + + >>> x = mt.array([-1, +1, +1, -1]) + >>> y = mt.array([-1, -1, +1, +1]) + >>> (mt.arctan2(y, x) * 180 / mt.pi).execute() + array([-135., -45., 45., 135.]) + + Note the order of the parameters. `arctan2` is defined also when `x2` = 0 + and at several other special points, obtaining values in + the range ``[-pi, pi]``: + + >>> mt.arctan2([1., -1.], [0., 0.]).execute() + array([ 1.57079633, -1.57079633]) + >>> mt.arctan2([0., 0., mt.inf], [+0., -0., mt.inf]).execute() + array([ 0. , 3.14159265, 0.78539816]) + """ + op = TensorArctan2(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arctanh.py b/python/xorbits/_mars/tensor/arithmetic/arctanh.py new file mode 100644 index 000000000..90aefb8ab --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arctanh.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArctanh(TensorUnaryOp): + _op_type_ = OperandDef.ARCTANH + _func_name = "arctanh" + + +@infer_dtype(np.arctanh) +def arctanh(x, out=None, where=None, **kwargs): + """ + Inverse hyperbolic tangent element-wise. 
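A short NumPy illustration of the summary line above: arctanh inverts tanh on the open interval (-1, 1) and grows without bound toward the endpoints. This is only a sanity check of the ufunc the operand delegates to.

    import numpy as np

    x = np.array([-2.0, 0.0, 2.0])
    print(np.arctanh(np.tanh(x)))  # [-2.  0.  2.]  (round trip on (-1, 1))
    print(np.arctanh(0.999999))    # ~7.25, diverging as the argument approaches 1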
+ + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Array of the same shape as `x`. + + Notes + ----- + `arctanh` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that `tanh(z) = x`. The convention is to return + the `z` whose imaginary part lies in `[-pi/2, pi/2]`. + + For real-valued input data types, `arctanh` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arctanh` is a complex analytical function + that has branch cuts `[-1, -inf]` and `[1, inf]` and is continuous from + above on the former and from below on the latter. + + The inverse hyperbolic tangent is also known as `atanh` or ``tanh^-1``. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Inverse hyperbolic function", + http://en.wikipedia.org/wiki/Arctanh + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.arctanh([0, -0.5]).execute() + array([ 0. , -0.54930614]) + """ + op = TensorArctanh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/around.py b/python/xorbits/_mars/tensor/arithmetic/around.py new file mode 100644 index 000000000..58c896cb9 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/around.py @@ -0,0 +1,141 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import Int32Field +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(init=False, sparse_mode="unary") +class TensorAround(TensorUnaryOp): + _op_type_ = OperandDef.AROUND + + _decimals = Int32Field("decimals") + _func_name = "around" + + @property + def decimals(self): + return self._decimals + + def __init__( + self, + decimals=None, + casting="same_kind", + err=None, + dtype=None, + sparse=False, + **kw + ): + err = err if err is not None else np.geterr() + super().__init__( + _decimals=decimals, + _casting=casting, + _err=err, + dtype=dtype, + sparse=sparse, + **kw + ) + + @property + def ufunc_extra_params(self): + return {"decimals": self._decimals} + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.around(a, decimals=op.decimals) + + +def around(a, decimals=0, out=None): + """ + Evenly round to the given number of decimals. + + Parameters + ---------- + a : array_like + Input data. + decimals : int, optional + Number of decimal places to round to (default: 0). If + decimals is negative, it specifies the number of positions to + the left of the decimal point. + out : Tensor, optional + Alternative output tensor in which to place the result. It must have + the same shape as the expected output, but the type of the output + values will be cast if necessary. + + Returns + ------- + rounded_array : Tensor + An tensor of the same type as `a`, containing the rounded values. + Unless `out` was specified, a new tensor is created. A reference to + the result is returned. + + The real and imaginary parts of complex numbers are rounded + separately. The result of rounding a float is a float. + + See Also + -------- + Tensor.round : equivalent method + + ceil, fix, floor, rint, trunc + + + Notes + ----- + For values exactly halfway between rounded decimal values, NumPy + rounds to the nearest even value. Thus 1.5 and 2.5 round to 2.0, + -0.5 and 0.5 round to 0.0, etc. Results may also be surprising due + to the inexact representation of decimal fractions in the IEEE + floating point standard [1]_ and errors introduced when scaling + by powers of ten. + + References + ---------- + .. [1] "Lecture Notes on the Status of IEEE 754", William Kahan, + http://www.cs.berkeley.edu/~wkahan/ieee754status/IEEE754.PDF + .. 
[2] "How Futile are Mindless Assessments of + Roundoff in Floating-Point Computation?", William Kahan, + http://www.cs.berkeley.edu/~wkahan/Mindless.pdf + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.around([0.37, 1.64]).execute() + array([ 0., 2.]) + >>> mt.around([0.37, 1.64], decimals=1).execute() + array([ 0.4, 1.6]) + >>> mt.around([.5, 1.5, 2.5, 3.5, 4.5]).execute() # rounds to nearest even value + array([ 0., 2., 2., 4., 4.]) + >>> mt.around([1,2,3,11], decimals=1).execute() # tensor of ints is returned + array([ 1, 2, 3, 11]) + >>> mt.around([1,2,3,11], decimals=-1).execute() + array([ 0, 0, 0, 10]) + + """ + dtype = astensor(a).dtype + op = TensorAround(decimals=decimals, dtype=dtype) + return op(a, out=out) + + +round_ = around diff --git a/python/xorbits/_mars/tensor/arithmetic/bitand.py b/python/xorbits/_mars/tensor/arithmetic/bitand.py new file mode 100644 index 000000000..6ada55559 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/bitand.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorBitand(TensorBinOp): + _op_type_ = OperandDef.BITAND + _func_name = "bitwise_and" + + +@infer_dtype(np.bitwise_and) +def bitand(x1, x2, out=None, where=None, **kwargs): + """ + Compute the bit-wise AND of two tensors element-wise. + + Computes the bit-wise AND of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``&``. + + Parameters + ---------- + x1, x2 : array_like + Only integer and boolean types are handled. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + Result. + + See Also + -------- + logical_and + bitwise_or + bitwise_xor + + Examples + -------- + The number 13 is represented by ``00001101``. Likewise, 17 is + represented by ``00010001``. 
The bit-wise AND of 13 and 17 is + therefore ``000000001``, or 1: + + >>> import mars.tensor as mt + + >>> mt.bitwise_and(13, 17).execute() + 1 + + >>> mt.bitwise_and(14, 13).execute() + 12 + >>> mt.bitwise_and([14,3], 13).execute() + array([12, 1]) + + >>> mt.bitwise_and([11,7], [4,25]).execute() + array([0, 1]) + >>> mt.bitwise_and(mt.array([2,5,255]), mt.array([3,14,16])).execute() + array([ 2, 4, 16]) + >>> mt.bitwise_and([True, True], [False, True]).execute() + array([False, True]) + """ + op = TensorBitand(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.bitwise_and, reverse=True) +def rbitand(x1, x2, **kwargs): + op = TensorBitand(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/bitor.py b/python/xorbits/_mars/tensor/arithmetic/bitor.py new file mode 100644 index 000000000..aeacb7f65 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/bitor.py @@ -0,0 +1,100 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorBitor(TensorBinOp): + _op_type_ = OperandDef.BITOR + _func_name = "bitwise_or" + + +@infer_dtype(np.bitwise_or) +def bitor(x1, x2, out=None, where=None, **kwargs): + """ + Compute the bit-wise OR of two tensors element-wise. + + Computes the bit-wise OR of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``|``. + + Parameters + ---------- + x1, x2 : array_like + Only integer and boolean types are handled. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + Result. + + See Also + -------- + logical_or + bitwise_and + bitwise_xor + binary_repr : + Return the binary representation of the input number as a string. + + Examples + -------- + The number 13 has the binaray representation ``00001101``. Likewise, + 16 is represented by ``00010000``. 
The bit-wise OR of 13 and 16 is + then ``000111011``, or 29: + + >>> import mars.tensor as mt + + >>> mt.bitwise_or(13, 16).execute() + 29 + + >>> mt.bitwise_or(32, 2).execute() + 34 + >>> mt.bitwise_or([33, 4], 1).execute() + array([33, 5]) + >>> mt.bitwise_or([33, 4], [1, 2]).execute() + array([33, 6]) + + >>> mt.bitwise_or(mt.array([2, 5, 255]), mt.array([4, 4, 4])).execute() + array([ 6, 5, 255]) + >>> (mt.array([2, 5, 255]) | mt.array([4, 4, 4])).execute() + array([ 6, 5, 255]) + >>> mt.bitwise_or(mt.array([2, 5, 255, 2147483647], dtype=mt.int32), + ... mt.array([4, 4, 4, 2147483647], dtype=mt.int32)).execute() + array([ 6, 5, 255, 2147483647]) + >>> mt.bitwise_or([True, True], [False, True]).execute() + array([ True, True]) + """ + op = TensorBitor(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.bitwise_or, reverse=True) +def rbitor(x1, x2, **kwargs): + op = TensorBitor(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/bitxor.py b/python/xorbits/_mars/tensor/arithmetic/bitxor.py new file mode 100644 index 000000000..f840890bf --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/bitxor.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorBitxor(TensorBinOp): + _op_type_ = OperandDef.BITXOR + _func_name = "bitwise_xor" + + +@infer_dtype(np.bitwise_xor) +def bitxor(x1, x2, out=None, where=None, **kwargs): + """ + Compute the bit-wise XOR of two arrays element-wise. + + Computes the bit-wise XOR of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``^``. + + Parameters + ---------- + x1, x2 : array_like + Only integer and boolean types are handled. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + Result. + + See Also + -------- + logical_xor + bitwise_and + bitwise_or + binary_repr : + Return the binary representation of the input number as a string. + + Examples + -------- + The number 13 is represented by ``00001101``. Likewise, 17 is + represented by ``00010001``. 
The bit-wise XOR of 13 and 17 is + therefore ``00011100``, or 28: + + >>> import mars.tensor as mt + + >>> mt.bitwise_xor(13, 17).execute() + 28 + + >>> mt.bitwise_xor(31, 5).execute() + 26 + >>> mt.bitwise_xor([31,3], 5).execute() + array([26, 6]) + + >>> mt.bitwise_xor([31,3], [5,6]).execute() + array([26, 5]) + >>> mt.bitwise_xor([True, True], [False, True]).execute() + array([ True, False]) + """ + op = TensorBitxor(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.bitwise_xor, reverse=True) +def rbitxor(x1, x2, **kwargs): + op = TensorBitxor(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/cbrt.py b/python/xorbits/_mars/tensor/arithmetic/cbrt.py new file mode 100644 index 000000000..a865bfc47 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/cbrt.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorCbrt(TensorUnaryOp): + _op_type_ = OperandDef.CBRT + _func_name = "cbrt" + + +@infer_dtype(np.cbrt) +def cbrt(x, out=None, where=None, **kwargs): + """ + Return the cube-root of an tensor, element-wise. + + Parameters + ---------- + x : array_like + The values whose cube-roots are required. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + An tensor of the same shape as `x`, containing the cube + cube-root of each element in `x`. + If `out` was provided, `y` is a reference to it. + + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.cbrt([1,8,27]).execute() + array([ 1., 2., 3.]) + """ + op = TensorCbrt(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/ceil.py b/python/xorbits/_mars/tensor/arithmetic/ceil.py new file mode 100644 index 000000000..218300103 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/ceil.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
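The three bitwise operands above (bitand, bitor, bitxor) correspond to the &, | and ^ operators, exactly as the NumPy ufuncs they name do. A compact NumPy check of that equivalence:

    import numpy as np

    a = np.array([13, 14, 31], dtype=np.int32)
    b = np.array([17, 13, 5], dtype=np.int32)

    assert np.array_equal(np.bitwise_and(a, b), a & b)
    assert np.array_equal(np.bitwise_or(a, b), a | b)
    assert np.array_equal(np.bitwise_xor(a, b), a ^ b)
    print(a & b, a | b, a ^ b)  # [ 1 12  5] [29 15 31] [28  3 26]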
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorCeil(TensorUnaryOp): + _op_type_ = OperandDef.CEIL + _func_name = "ceil" + + +@infer_dtype(np.ceil) +def ceil(x, out=None, where=None, **kwargs): + r""" + Return the ceiling of the input, element-wise. + + The ceil of the scalar `x` is the smallest integer `i`, such that + `i >= x`. It is often denoted as :math:`\lceil x \rceil`. + + Parameters + ---------- + x : array_like + Input data. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The ceiling of each element in `x`, with `float` dtype. + + See Also + -------- + floor, trunc, rint + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) + >>> mt.ceil(a).execute() + array([-1., -1., -0., 1., 2., 2., 2.]) + """ + op = TensorCeil(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/clip.py b/python/xorbits/_mars/tensor/arithmetic/clip.py new file mode 100644 index 000000000..4d5f6a75b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/clip.py @@ -0,0 +1,205 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Number + +import numpy as np + +from ... 
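ceil above is one of four closely related rounding ufuncs; floor, trunc and rint (listed in its See Also) differ mainly in how negative fractions and ties are handled. A NumPy comparison on a sample similar to the one in the ceil docstring:

    import numpy as np

    a = np.array([-1.7, -0.2, 0.2, 1.5, 2.0])
    print(np.ceil(a))   # [-1. -0.  1.  2.  2.]
    print(np.floor(a))  # [-2. -1.  0.  1.  2.]
    print(np.trunc(a))  # [-1. -0.  0.  1.  2.]
    print(np.rint(a))   # [-2. -0.  0.  2.  2.]  (ties round to even)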
import opcodes as OperandDef +from ...core import ENTITY_TYPE +from ...serialization.serializables import AnyField, KeyField +from ..array_utils import as_same_device, device +from ..core import Tensor +from ..datasource import tensor as astensor +from ..utils import broadcast_shape +from .core import TensorElementWise, TensorOperand, filter_inputs + + +class TensorClip(TensorOperand, TensorElementWise): + _op_type_ = OperandDef.CLIP + + _a = KeyField("a") + _a_min = AnyField("a_min") + _a_max = AnyField("a_max") + _out = KeyField("out") + + def __init__(self, a=None, a_min=None, a_max=None, out=None, **kw): + super().__init__(_a=a, _a_min=a_min, _a_max=a_max, _out=out, **kw) + + @property + def a(self): + return self._a + + @property + def a_min(self): + return self._a_min + + @property + def a_max(self): + return self._a_max + + @property + def out(self): + return getattr(self, "_out", None) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._a = next(inputs_iter) + if isinstance(self._a_min, ENTITY_TYPE): + self._a_min = next(inputs_iter) + if isinstance(self._a_max, ENTITY_TYPE): + self._a_max = next(inputs_iter) + if getattr(self, "_out", None) is not None: + self._out = next(inputs_iter) + + def __call__(self, a, a_min, a_max, out=None): + a = astensor(a) + tensors = [a] + sparse = a.issparse() + + if isinstance(a_min, Number): + if a_min > 0: + sparse = False + a_min_dtype = np.array(a_min).dtype + elif a_min is not None: + a_min = astensor(a_min) + tensors.append(a_min) + if not a_min.issparse(): + sparse = False + a_min_dtype = a_min.dtype + else: + a_min_dtype = None + self._a_min = a_min + + if isinstance(a_max, Number): + if a_max < 0: + sparse = False + a_max_dtype = np.array(a_max).dtype + elif a_max is not None: + a_max = astensor(a_max) + tensors.append(a_max) + if not a_max.issparse(): + sparse = False + a_max_dtype = a_max.dtype + else: + a_max_dtype = None + self._a_max = a_max + + if out is not None: + if isinstance(out, Tensor): + self._out = out + else: + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + + dtypes = [dt for dt in [a.dtype, a_min_dtype, a_max_dtype] if dt is not None] + dtype = np.result_type(*dtypes) + # check broadcast + shape = broadcast_shape(*[t.shape for t in tensors]) + + setattr(self, "sparse", sparse) + inputs = filter_inputs([a, a_min, a_max, out]) + t = self.new_tensor(inputs, shape) + + if out is None: + setattr(self, "dtype", dtype) + return t + + # if `out` is specified, use out's dtype and shape + out_shape, out_dtype = out.shape, out.dtype + + if t.shape != out_shape: + t = self.new_tensor(inputs, out_shape) + setattr(self, "dtype", out_dtype) + + out.data = t.data + return out + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + inputs_iter = iter(inputs) + a = next(inputs_iter) + a_min = ( + next(inputs_iter) if isinstance(op.a_min, type(op.outputs[0])) else op.a_min + ) + a_max = ( + next(inputs_iter) if isinstance(op.a_max, type(op.outputs[0])) else op.a_max + ) + out = next(inputs_iter).copy() if op.out is not None else None + + with device(device_id): + kw = {} + if out is not None: + kw["out"] = out + ctx[op.outputs[0].key] = xp.clip(a, a_min, a_max, **kw) + + +def clip(a, a_min, a_max, out=None): + """ + Clip (limit) the values in a tensor. + + Given an interval, values outside the interval are clipped to + the interval edges. 
For example, if an interval of ``[0, 1]`` + is specified, values smaller than 0 become 0, and values larger + than 1 become 1. + + Parameters + ---------- + a : array_like + Tensor containing elements to clip. + a_min : scalar or array_like or `None` + Minimum value. If `None`, clipping is not performed on lower + interval edge. Not more than one of `a_min` and `a_max` may be + `None`. + a_max : scalar or array_like or `None` + Maximum value. If `None`, clipping is not performed on upper + interval edge. Not more than one of `a_min` and `a_max` may be + `None`. If `a_min` or `a_max` are array_like, then the three + arrays will be broadcasted to match their shapes. + out : Tensor, optional + The results will be placed in this tensor. It may be the input + array for in-place clipping. `out` must be of the right shape + to hold the output. Its type is preserved. + + Returns + ------- + clipped_array : Tensor + An tensor with the elements of `a`, but where values + < `a_min` are replaced with `a_min`, and those > `a_max` + with `a_max`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.arange(10) + >>> mt.clip(a, 1, 8).execute() + array([1, 1, 2, 3, 4, 5, 6, 7, 8, 8]) + >>> a.execute() + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> mt.clip(a, 3, 6, out=a).execute() + array([3, 3, 3, 3, 4, 5, 6, 6, 6, 6]) + >>> a = mt.arange(10) + >>> a.execute() + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> mt.clip(a, [3, 4, 1, 1, 1, 4, 4, 4, 4, 4], 8).execute() + array([3, 4, 2, 3, 4, 5, 6, 7, 8, 8]) + + """ + op = TensorClip(a=a, a_min=a_min, a_max=a_max, out=out) + return op(a, a_min, a_max, out=out) diff --git a/python/xorbits/_mars/tensor/arithmetic/conj.py b/python/xorbits/_mars/tensor/arithmetic/conj.py new file mode 100644 index 000000000..4fd183a6b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/conj.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorConj(TensorUnaryOp): + _op_type_ = OperandDef.CONJ + _func_name = "conj" + + +@infer_dtype(np.conj) +def conj(x, out=None, where=None, **kwargs): + """ + Return the complex conjugate, element-wise. + + The complex conjugate of a complex number is obtained by changing the + sign of its imaginary part. + + Parameters + ---------- + x : array_like + Input value. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. 
+ **kwargs + + Returns + ------- + y : Tensor + The complex conjugate of `x`, with same dtype as `y`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.conjugate(1+2j).execute() + (1-2j) + + >>> x = mt.eye(2) + 1j * mt.eye(2) + >>> mt.conjugate(x).execute() + array([[ 1.-1.j, 0.-0.j], + [ 0.-0.j, 1.-1.j]]) + """ + op = TensorConj(**kwargs) + return op(x, out=out, where=where) + + +conjugate = conj diff --git a/python/xorbits/_mars/tensor/arithmetic/copysign.py b/python/xorbits/_mars/tensor/arithmetic/copysign.py new file mode 100644 index 000000000..db369ba2d --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/copysign.py @@ -0,0 +1,76 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="always_false") +class TensorCopysign(TensorBinOp): + _op_type_ = OperandDef.COPYSIGN + _func_name = "copysign" + + +@infer_dtype(np.copysign) +def copysign(x1, x2, out=None, where=None, **kwargs): + """ + Change the sign of x1 to that of x2, element-wise. + + If both arguments are arrays or sequences, they have to be of the same + length. If `x2` is a scalar, its sign will be copied to all elements of + `x1`. + + Parameters + ---------- + x1 : array_like + Values to change the sign of. + x2 : array_like + The sign of `x2` is copied to `x1`. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + The values of `x1` with the sign of `x2`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.copysign(1.3, -1).execute() + -1.3 + >>> (1/mt.copysign(0, 1)).execute() + inf + >>> (1/mt.copysign(0, -1)).execute() + -inf + + >>> mt.copysign([-1, 0, 1], -1.1).execute() + array([-1., -0., -1.]) + >>> mt.copysign([-1, 0, 1], mt.arange(3)-1).execute() + array([-1., 0., 1.]) + """ + op = TensorCopysign(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/core.py b/python/xorbits/_mars/tensor/arithmetic/core.py new file mode 100644 index 000000000..a5180316a --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/core.py @@ -0,0 +1,788 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
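copysign above propagates the sign bit itself, which matters for signed zeros and is what distinguishes it from multiplying by a sign. A small NumPy illustration of the underlying ufunc's behaviour:

    import numpy as np

    print(np.copysign(3.0, -0.0))               # -3.0: the sign bit of -0.0 is copied
    print(np.copysign([-1.0, 0.0, 1.0], -1.1))  # [-1. -0. -1.]
    print(np.copysign(np.arange(3.0) - 1, 1.0)) # [1. 0. 1.]: magnitudes kept, sign replaced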
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ...core import ExecutableTuple +from ...serialization.serializables import ( + AnyField, + DictField, + FieldTypes, + KeyField, + ListField, + StringField, +) +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, convert_order, device +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import ( + broadcast_shape, + check_order, + check_out_param, + filter_inputs, + unify_chunks, +) + + +class TensorElementWise(TensorOperandMixin): + __slots__ = () + + @classmethod + def tile(cls, op): + if len(op.inputs) > 1: + if has_unknown_shape(*op.inputs): + yield + inputs = yield from unify_chunks( + *[(input, list(range(input.ndim))[::-1]) for input in op.inputs] + ) + + chunk_shapes = [t.chunk_shape for t in inputs] + out_chunk_shape = broadcast_shape(*chunk_shapes) + + out_chunks = [list() for _ in op.outputs] + nsplits = [[np.nan] * shape for shape in out_chunk_shape] + get_index = lambda idx, t: tuple( + 0 if t.nsplits[i] == (1,) else ix for i, ix in enumerate(idx) + ) + for out_index in itertools.product(*(map(range, out_chunk_shape))): + in_chunks = [ + t.cix[get_index(out_index[-t.ndim :], t)] + if t.ndim != 0 + else t.chunks[0] + for t in inputs + ] + chunk_op = op.copy().reset_key() + chunk_shape = broadcast_shape(*(c.shape for c in in_chunks)) + chunks = chunk_op.new_chunks( + in_chunks, + shape=chunk_shape, + index=out_index, + kws=[ + {"side": str(i), "order": o.order, "dtype": o.dtype} + for i, o in enumerate(op.outputs) + ], + ) + for i, out_chunk in enumerate(chunks): + out_chunks[i].append(out_chunk) + for i, idx, s in zip(itertools.count(0), out_index, chunks[0].shape): + nsplits[i][idx] = s + + new_op = op.copy().reset_key() + kws = [] + for out_chunk, o in zip(out_chunks, op.outputs): + params = o.params.copy() + params["chunks"] = out_chunk + params["nsplits"] = nsplits + kws.append(params) + return new_op.new_tensors(list(inputs), kws=kws, output_limit=len(op.outputs)) + + +class TensorElementWiseWithInputs(TensorElementWise): + def _set_sparse(self, inputs): + raise NotImplementedError + + def _new_tileables(self, inputs, kws=None, **kw): + self._set_sparse(inputs) + return super()._new_tileables(inputs, kws=kws, **kw) + + def _new_chunks(self, inputs, kws=None, **kw): + self._set_sparse(inputs) + return super()._new_chunks(inputs, kws=kws, **kw) + + +def _handle_out_dtype(val, dtype): + if val.dtype != dtype: + return val.astype(dtype) + return val + + +class TensorBinOpMixin(TensorElementWiseWithInputs): + __slots__ = () + + def check_inputs(self, inputs): + if len(inputs) > 4: + raise ValueError( + f"Binary operand's inputs should less than or equal 4, got {len(inputs)}" + ) + + @classmethod + def _get_func(cls, xp): + func_name = getattr(cls, "_func_name") + return getattr(xp, func_name) + + @classmethod + def _execute_gpu(cls, op, xp, lhs, rhs, **kw): + if kw.get("out") is not None: + kw["out"] = xp.asarray(kw["out"]) + r = cls._get_func(xp)(lhs, rhs, **kw) + return convert_order(r, 
op.outputs[0].order.value) + + @classmethod + def _execute_cpu(cls, op, xp, lhs, rhs, **kw): + kw["order"] = op.order + if kw.get("out") is not None: + kw["out"] = np.asarray(kw["out"]) + return cls._get_func(xp)(lhs, rhs, **kw) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = {"casting": op.casting} if op.out is not None else {} + + inputs_iter = iter(inputs) + lhs = op.lhs if np.isscalar(op.lhs) else next(inputs_iter) + rhs = op.rhs if np.isscalar(op.rhs) else next(inputs_iter) + if op.out is not None: + kw["out"] = next(inputs_iter).copy() + if op.where is not None: + kw["where"] = next(inputs_iter) + + with np.errstate(**op.err): + if op.is_gpu(): + ret = cls._execute_gpu(op, xp, lhs, rhs, **kw) + else: + ret = cls._execute_cpu(op, xp, lhs, rhs, **kw) + ctx[op.outputs[0].key] = _handle_out_dtype(ret, op.dtype) + + +class TensorBinOp(TensorOperand, TensorBinOpMixin): + _lhs = AnyField("lhs") + _rhs = AnyField("rhs") + _out = KeyField("out") + _where = KeyField("where") + _casting = StringField("casting") + _order = StringField("order") + _err = DictField("err", FieldTypes.string, FieldTypes.string) + + def __init__(self, lhs=None, rhs=None, out=None, where=None, order=None, **kwargs): + super().__init__( + _lhs=lhs, _rhs=rhs, _out=out, _where=where, _order=order, **kwargs + ) + if self._order is None: + self._order = "K" + check_order(self._order) + + @property + def lhs(self): + return self._lhs + + @property + def rhs(self): + return self._rhs + + @property + def out(self): + return getattr(self, "_out", None) + + @property + def where(self): + return getattr(self, "_where", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def casting(self): + return getattr(self, "_casting", None) + + @property + def err(self): + return getattr(self, "_err", dict()) + + @classmethod + def _is_sparse(cls, x1, x2): + return False + + def _set_sparse(self, inputs): + inputs_iter = iter(inputs) + x1 = self._lhs if np.isscalar(self._lhs) else next(inputs_iter) + x2 = self._rhs if np.isscalar(self._rhs) else next(inputs_iter) + setattr(self, "sparse", self._is_sparse(x1, x2)) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + + self._lhs = self._lhs if np.isscalar(self._lhs) else next(inputs_iter) + self._rhs = self._rhs if np.isscalar(self._rhs) else next(inputs_iter) + if getattr(self, "_out", None) is not None: + self._out = next(inputs_iter) + if getattr(self, "_where", None) is not None: + self._where = next(inputs_iter) + + def _process_inputs(self, x1, x2, out, where): + x1 = x1 if np.isscalar(x1) else astensor(x1) + x2 = x2 if np.isscalar(x2) else astensor(x2) + self._lhs = x1 + self._rhs = x2 + + if out is not None: + if isinstance(out, Tensor): + self._out = out + else: + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + if where is True: + where = None + if where is not None: + where = astensor(where) + self._where = where + + return x1, x2, out, where + + def _calc_order(self, x1, x2, out): + if out is not None: + return out.order + + if self._order in "KA": + orders = [] + if not np.isscalar(x1): + orders.append(x1.order) + if not np.isscalar(x2): + orders.append(x2.order) + + if len(orders) == 0: + return TensorOrder.C_ORDER + elif any(order == TensorOrder.C_ORDER for order in orders): + return TensorOrder.C_ORDER + else: + return 
TensorOrder.F_ORDER + elif self._order == "C": + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + + @property + def ufunc_extra_params(self): + return dict() + + def _call_tensor_ufunc(self, x1, x2, out=None, where=None): + if hasattr(x1, "__tensor_ufunc__") or hasattr(x2, "__tensor_ufunc__"): + ufunc = ( + x1.__tensor_ufunc__ + if hasattr(x1, "__tensor_ufunc__") + else x2.__tensor_ufunc__ + ) + ret = ufunc(type(self), [x1, x2], out, where, **self.ufunc_extra_params) + if ret is NotImplemented: + return + return ret + + def _call(self, x1, x2, out=None, where=None): + # check tensor ufunc, if x1 or x2 is not a tensor, e.g. Mars DataFrame + # which implements tensor ufunc, will delegate the computation + # to it if possible + ret = self._call_tensor_ufunc(x1, x2, out=out, where=where) + if ret is not None: + return ret + + x1, x2, out, where = self._process_inputs(x1, x2, out, where) + # check broadcast + x1_shape = () if np.isscalar(x1) else x1.shape + x2_shape = () if np.isscalar(x2) else x2.shape + shape = broadcast_shape(x1_shape, x2_shape) + order = self._calc_order(x1, x2, out) + + inputs = filter_inputs([x1, x2, out, where]) + t = self.new_tensor(inputs, shape, order=order) + + if out is None: + return t + + check_out_param(out, t, getattr(self, "_casting")) + out_shape, out_dtype = out.shape, out.dtype + + # if `out` is specified, use out's dtype and shape + if t.shape != out_shape: + t = self.new_tensor(inputs, out_shape, order=order) + setattr(self, "dtype", out_dtype) + + out.data = t.data + return out + + def __call__(self, x1, x2, out=None, where=None): + return self._call(x1, x2, out=out, where=where) + + def rcall(self, x1, x2, out=None, where=None): + return self._call(x2, x1, out=out, where=where) + + +class TensorUnaryOpMixin(TensorElementWiseWithInputs): + __slots__ = () + + def check_inputs(self, inputs): + if len(inputs) > 3: + raise ValueError( + f"Binary operand's inputs should less than or equal 3, got {len(inputs)}" + ) + + @classmethod + def _get_func(cls, xp): + func_name = getattr(cls, "_func_name") + return getattr(xp, func_name) + + @classmethod + def _execute_gpu(cls, op, xp, inp, **kw): + r = cls._get_func(xp)(inp, **kw) + return convert_order(r, op.outputs[0].order.value) + + @classmethod + def _execute_cpu(cls, op, xp, inp, **kw): + if op.order != "K": + kw["order"] = op.order + return cls._get_func(xp)(inp, **kw) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = {"casting": op.casting} if op.out else {} + + if op.out and op.where: + inputs, kw["out"], kw["where"] = ( + inputs[:-2], + inputs[-2].copy(), + inputs[-1], + ) + elif op.out: + inputs, kw["out"] = inputs[:-1], inputs[-1].copy() + elif op.where: + inputs, kw["where"] = inputs[:-1], inputs[-1] + + with np.errstate(**op.err): + if op.is_gpu(): + ret = cls._execute_gpu(op, xp, inputs[0], **kw) + else: + ret = cls._execute_cpu(op, xp, inputs[0], **kw) + ctx[op.outputs[0].key] = _handle_out_dtype(ret, op.dtype) + + +class TensorUnaryOp(TensorOperand, TensorUnaryOpMixin): + _input = KeyField("input") + _out = KeyField("out") + _where = KeyField("where") + _casting = StringField("casting") + _order = StringField("order") + _err = DictField("err", FieldTypes.string, FieldTypes.string) + + def __init__(self, out=None, where=None, order=None, **kwargs): + super().__init__(_out=out, _where=where, _order=order, **kwargs) + if self._order is None: + 
self._order = "K" + check_order(self._order) + + @property + def input(self): + return self._input + + @property + def out(self): + return getattr(self, "_out", None) + + @property + def where(self): + return getattr(self, "_where", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def casting(self): + return getattr(self, "_casting", None) + + @property + def err(self): + return getattr(self, "_err", dict()) + + @classmethod + def _is_sparse(cls, x): + if hasattr(x, "issparse") and x.issparse(): + return True + else: + return False + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + + self._input = next(inputs_iter) + if getattr(self, "_out", None) is not None: + self._out = next(inputs_iter) + if getattr(self, "_where", None) is not None: + self._where = next(inputs_iter) + + def _process_inputs(self, x, out, where): + x = astensor(x) + + if out is not None: + if isinstance(out, Tensor): + self._out = out + else: + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + if where is True: + where = None + if where is not None: + where = astensor(where) + self._where = where + + return x, out, where + + def _set_sparse(self, inputs): + setattr(self, "sparse", self._is_sparse(inputs[0])) + + def _calc_order(self, x, out): + if out is not None: + return out.order + + if self._order in "KA": + return x.order + elif self._order == "C": + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + + @property + def ufunc_extra_params(self): + return dict() + + def _call_tensor_ufunc(self, x, out=None, where=None): + if hasattr(x, "__tensor_ufunc__"): + ret = x.__tensor_ufunc__( + type(self), [x], out, where, **self.ufunc_extra_params + ) + if ret is NotImplemented: + return + return ret + + def _call(self, x, out=None, where=None): + # check tensor ufunc, if x is not a tensor, e.g. 
Mars DataFrame + # which implements tensor ufunc, will delegate the computation + # to it if possible + ret = self._call_tensor_ufunc(x, out=out, where=where) + if ret is not None: + return ret + + x, out, where = self._process_inputs(x, out, where) + shape = x.shape + order = self._calc_order(x, out) + + inputs = filter_inputs([x, out, where]) + t = self.new_tensor(inputs, shape, order=order) + + if out is None: + return t + + check_out_param(out, t, getattr(self, "_casting")) + out_shape, out_dtype = out.shape, out.dtype + + # if `out` is specified, use out's dtype and shape + if t.shape != out_shape: + t = self.new_tensor(inputs, out_shape, order=order) + setattr(self, "dtype", out_dtype) + + out.data = t.data + return out + + def __call__(self, x, out=None, where=None): + return self._call(x, out=out, where=where) + + +class TensorOutBinOp(TensorOperand, TensorElementWiseWithInputs): + _input = KeyField("input") + _out1 = KeyField("out1") + _out2 = KeyField("out2") + _where = KeyField("where") + _order = StringField("order") + _casting = StringField("casting") + + def __init__(self, out1=None, out2=None, where=None, order=None, **kwargs): + super().__init__(_out1=out1, _out2=out2, _where=where, _order=order, **kwargs) + if self._order is None: + self._order = "K" + check_order(self._order) + + @property + def output_limit(self): + return 2 + + @property + def input(self): + return self._input + + @property + def out1(self): + return getattr(self, "_out1", None) + + @property + def out2(self): + return getattr(self, "_out2", None) + + @property + def where(self): + return getattr(self, "_where", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def casting(self): + return getattr(self, "_casting", None) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + + self._input = next(inputs_iter) + if getattr(self, "_out1", None) is not None: + self._out1 = next(inputs_iter) + if getattr(self, "_out2", None) is not None: + self._out2 = next(inputs_iter) + if getattr(self, "_where", None) is not None: + self._where = next(inputs_iter) + + def _process_inputs(self, x, out1, out2, where): + x = astensor(x) + + if out1 is not None: + if isinstance(out1, Tensor): + self._out1 = out1 + else: + raise TypeError( + f"out1 should be Tensor object, got {type(out1)} instead" + ) + if out2 is not None: + if isinstance(out2, Tensor): + self._out2 = out2 + else: + raise TypeError( + f"out2 should be Tensor object, got {type(out2)} instead" + ) + if where is True: + where = None + if where is not None: + where = astensor(where) + self._where = where + + return x, out1, out2, where + + @classmethod + def _is_sparse(cls, x): + return False + + def _set_sparse(self, inputs): + setattr(self, "sparse", self._is_sparse(inputs[0])) + + @property + def _fun(self): + raise NotImplementedError + + def _calc_order(self, x, out): + if out is not None: + return out.order + + if self._order in "KA": + return x.order + elif self._order == "C": + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + + def _call(self, x, out1=None, out2=None, out=None, where=None): + dtype = [r.dtype for r in self._fun(np.empty(1, dtype=x.dtype))] + + out = out or (None, None) + out1 = out1 or out[0] + out2 = out2 or out[1] + x, out1, out2, where = self._process_inputs(x, out1, out2, where) + shape = x.shape + order1 = self._calc_order(x, out1) + order2 = self._calc_order(x, out2) + + inputs = filter_inputs([x, out1, out2, where]) + t1, t2 
= self.new_tensors( + inputs, + shape, + kws=[ + {"order": order1, "dtype": dtype[0], "side": "left"}, + {"order": order2, "dtype": dtype[1], "side": "right"}, + ], + ) + + if out1 is None and out2 is None: + return ExecutableTuple([t1, t2]) + + if out1 is not None: + check_out_param(out1, t1, getattr(self, "_casting")) + out1_shape, out1_dtype = out1.shape, out1.dtype + else: + out1_shape, out1_dtype = t1.shape, t1.dtype + if out2 is not None: + check_out_param(out2, t2, getattr(self, "_casting")) + out2_shape, out2_dtype = out2.shape, out2.dtype + else: + out2_shape, out2_dtype = t2.shape, t2.dtype + # if `out` is specified, use out's dtype and shape + if t1.shape != out1_shape or t2.shape != out2_shape: + t1, t2 = self.new_tensor( + inputs, + [out1_shape, out2_shape], + kws=[ + {"order": order1, "dtype": out1_dtype}, + {"order": order2, "dtype": out2_dtype}, + ], + ) + + if out1 is not None: + out1.data = t1.data + else: + out1 = t1 + if out2 is not None: + out2.data = t2.data + else: + out2 = t2 + return ExecutableTuple([out1, out2]) + + def __call__(self, x, out1=None, out2=None, out=None, where=None): + return self._call(x, out1=out1, out2=out2, out=out, where=where) + + +class TensorMultiOp(TensorElementWiseWithInputs, TensorOperand): + _args = ListField("args") + _out = KeyField("out") + _where = KeyField("where") + _casting = StringField("casting") + _order = StringField("order") + _err = DictField("err", FieldTypes.string, FieldTypes.string) + + def __init__( + self, + args=None, + out=None, + where=None, + casting=None, + order=None, + err=None, + **kwargs, + ): + super().__init__( + _args=args, + _out=out, + _where=where, + _order=order, + _casting=casting, + _er=err, + **kwargs, + ) + if self._casting is None: + self._casting = "same_kind" + if self._order is None: + self._order = "K" + check_order(self._order) + + @property + def args(self): + return getattr(self, "_args", None) + + @property + def out(self): + return getattr(self, "_out", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def casting(self): + return getattr(self, "_casting", None) + + @property + def err(self): + return getattr(self, "_err", dict()) + + @classmethod + def _is_sparse(cls, *args): + return False + + def _set_sparse(self, inputs): + inputs_iter = iter(inputs or ()) + args = list(self._args) + for idx in range(len(self._args)): + if not np.isscalar(self._args[idx]): + args[idx] = next(inputs_iter) + setattr(self, "sparse", self._is_sparse(*args)) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(inputs or ()) + + args = list(self._args) + for idx in range(len(args)): + if not np.isscalar(args[idx]): + args[idx] = next(inputs_iter) + self._args = args + + if getattr(self, "_out", None) is not None: + self._out = next(inputs_iter) + if getattr(self, "_where", None) is not None: + self._where = next(inputs_iter) + + def _process_inputs(self, *args, out=None): + self._args = [a if np.isscalar(a) else astensor(a) for a in args] + + if out is not None: + if isinstance(out, Tensor): + self._out = out + else: + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + + return args + (out,) + + def __call__(self, *args, out=None): + proc_inputs_results = self._process_inputs(*args, out=out) + args = proc_inputs_results[:-1] + (out,) = proc_inputs_results[-1:] + # check broadcast + shapes = [() if np.isscalar(a) else a.shape for a in self._args] + shape = broadcast_shape(*shapes) + order = out.order if out is 
not None else None + + inputs = filter_inputs(list(args) + [out]) + t = self.new_tensor(inputs, shape, order=order) + + if out is None: + return t + + check_out_param(out, t, getattr(self, "_casting")) + out_shape, out_dtype = out.shape, out.dtype + + # if `out` is specified, use out's dtype and shape + if t.shape != out_shape: + t = self.new_tensor(inputs, out_shape, order=order) + setattr(self, "dtype", out_dtype) + + out.data = t.data + return out diff --git a/python/xorbits/_mars/tensor/arithmetic/cos.py b/python/xorbits/_mars/tensor/arithmetic/cos.py new file mode 100644 index 000000000..afc6f60f2 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/cos.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorCos(TensorUnaryOp): + _op_type_ = OperandDef.COS + _func_name = "cos" + + +@infer_dtype(np.cos) +def cos(x, out=None, where=None, **kwargs): + """ + Cosine element-wise. + + Parameters + ---------- + x : array_like + Input tensor in radians. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding cosine values. + + Notes + ----- + If `out` is provided, the function writes the result into it, + and returns a reference to `out`. (See Examples) + + References + ---------- + M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions. + New York, NY: Dover, 1972. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.cos(mt.array([0, mt.pi/2, mt.pi])).execute() + array([ 1.00000000e+00, 6.12303177e-17, -1.00000000e+00]) + >>> + >>> # Example of providing the optional output parameter + >>> out1 = mt.empty(1) + >>> out2 = mt.cos([0.1], out1) + >>> out2 is out1 + True + >>> + >>> # Example of ValueError due to provision of shape mis-matched `out` + >>> mt.cos(mt.zeros((3,3)),mt.zeros((2,2))) + Traceback (most recent call last): + File "", line 1, in + ValueError: operands could not be broadcast together with shapes (3,3) (2,2) + """ + op = TensorCos(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/cosh.py b/python/xorbits/_mars/tensor/arithmetic/cosh.py new file mode 100644 index 000000000..d34d40150 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/cosh.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. 
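+# The TensorBinOp / TensorUnaryOp machinery in core.py above resolves the output
+# shape with broadcast_shape() and, when out= / where= are supplied, forwards
+# them to the underlying NumPy ufunc chunk by chunk (see TensorBinOpMixin.execute
+# and TensorBinOp._call).  A rough sketch of the user-visible behaviour, using
+# only functions that appear in this package:
+#
+#     import mars.tensor as mt
+#     a = mt.ones((3, 1))
+#     b = mt.ones(4)
+#     mt.copysign(a, b).shape       # (3, 4) -- NumPy-style broadcasting
+#     out = mt.zeros((3, 4))
+#     mt.copysign(a, b, out=out)    # result is written into `out`, which is returned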
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorCosh(TensorUnaryOp): + _op_type_ = OperandDef.COSH + _func_name = "cosh" + + +@infer_dtype(np.cosh) +def cosh(x, out=None, where=None, **kwargs): + """ + Hyperbolic cosine, element-wise. + + Equivalent to ``1/2 * (mt.exp(x) + mt.exp(-x))`` and ``mt.cos(1j*x)``. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Output array of same shape as `x`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.cosh(0).execute() + 1.0 + + The hyperbolic cosine describes the shape of a hanging cable: + + >>> import matplotlib.pyplot as plt + >>> x = mt.linspace(-4, 4, 1000) + >>> plt.plot(x.execute(), mt.cosh(x).execute()) + >>> plt.show() + """ + op = TensorCosh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/deg2rad.py b/python/xorbits/_mars/tensor/arithmetic/deg2rad.py new file mode 100644 index 000000000..90ffd8136 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/deg2rad.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorDeg2rad(TensorUnaryOp): + _op_type_ = OperandDef.DEG2RAD + _func_name = "deg2rad" + + +@infer_dtype(np.deg2rad) +def deg2rad(x, out=None, where=None, **kwargs): + """ + Convert angles from degrees to radians. + + Parameters + ---------- + x : array_like + Angles in degrees. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. 
If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding angle in radians. + + See Also + -------- + rad2deg : Convert angles from radians to degrees. + unwrap : Remove large jumps in angle by wrapping. + + Notes + ----- + ``deg2rad(x)`` is ``x * pi / 180``. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.deg2rad(180).execute() + 3.1415926535897931 + """ + op = TensorDeg2rad(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/degrees.py b/python/xorbits/_mars/tensor/arithmetic/degrees.py new file mode 100644 index 000000000..6b022fb1f --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/degrees.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorDegrees(TensorUnaryOp): + _op_type_ = OperandDef.DEGREES + _func_name = "degrees" + + +@infer_dtype(np.degrees) +def degrees(x, out=None, where=None, **kwargs): + """ + Convert angles from radians to degrees. + + Parameters + ---------- + x : array_like + Input tensor in radians. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor of floats + The corresponding degree values; if `out` was supplied this is a + reference to it. 
+ + See Also + -------- + rad2deg : equivalent function + + Examples + -------- + Convert a radian array to degrees + + >>> import mars.tensor as mt + + >>> rad = mt.arange(12.)*mt.pi/6 + >>> mt.degrees(rad).execute() + array([ 0., 30., 60., 90., 120., 150., 180., 210., 240., + 270., 300., 330.]) + + >>> out = mt.zeros((rad.shape)) + >>> r = mt.degrees(out) + >>> mt.all(r == out).execute() + True + """ + op = TensorDegrees(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/divide.py b/python/xorbits/_mars/tensor/arithmetic/divide.py new file mode 100644 index 000000000..d166d13bb --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/divide.py @@ -0,0 +1,112 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorDivide(TensorBinOp): + _op_type_ = OperandDef.DIV + _func_name = "divide" + + @classmethod + def _is_sparse(cls, x1, x2): + if not np.isscalar(x1) and not np.isscalar(x2): + return False + if hasattr(x1, "issparse") and x1.issparse(): + if x2 != 0: + return True + else: + raise ZeroDivisionError("float division by zero") + + +@infer_dtype(np.divide) +def divide(x1, x2, out=None, where=None, **kwargs): + """ + Divide arguments element-wise. + + Parameters + ---------- + x1 : array_like + Dividend tensor. + x2 : array_like + Divisor tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + The quotient `x1/x2`, element-wise. Returns a scalar if both `x1` and `x2` are scalars. + + Notes + ----- + Equivalent to `x1` / `x2` in terms of array-broadcasting. + + Behavior on division by zero can be changed using `seterr`. + + In Python 2, when both `x1` and `x2` are of an integer type, `divide` will behave like `floor_divide`. + In Python 3, it behaves like `true_divide`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.divide(2.0, 4.0).execute() + 0.5 + >>> x1 = mt.arange(9.0).reshape((3, 3)) + >>> x2 = mt.arange(3.0) + >>> mt.divide(x1, x2).execute() + array([[ NaN, 1. , 1. ], + [ Inf, 4. , 2.5], + [ Inf, 7. , 4. 
]]) + Note the behavior with integer types (Python 2 only): + >>> mt.divide(2, 4).execute() + 0 + >>> mt.divide(2, 4.).execute() + 0.5 + Division by zero always yields zero in integer arithmetic (again, Python 2 only), + and does not raise an exception or a warning: + >>> mt.divide(mt.array([0, 1], dtype=int), mt.array([0, 0], dtype=int)).execute() + array([0, 0]) + Division by zero can, however, be caught using seterr: + >>> old_err_state = mt.seterr(divide='raise') + >>> mt.divide(1, 0).execute() + Traceback (most recent call last): + ... + FloatingPointError: divide by zero encountered in divide + >>> ignored_states = mt.seterr(**old_err_state) + >>> mt.divide(1, 0).execute() + 0 + """ + op = TensorDivide(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.divide, reverse=True) +def rdivide(x1, x2, **kwargs): + op = TensorDivide(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/equal.py b/python/xorbits/_mars/tensor/arithmetic/equal.py new file mode 100644 index 000000000..a77fcb37f --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/equal.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorEqual(TensorBinOp): + _op_type_ = OperandDef.EQ + _func_name = "equal" + + +@inject_dtype(np.bool_) +def equal(x1, x2, out=None, where=None, **kwargs): + """ + Return (x1 == x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors of the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs `. + + Returns + ------- + out : Tensor or bool + Output tensor of bools, or a single bool if x1 and x2 are scalars. + + See Also + -------- + not_equal, greater_equal, less_equal, greater, less + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.equal([0, 1, 3], mt.arange(3)).execute() + array([ True, True, False]) + + What is compared are values, not types. 
So an int (1) and a tensor of + length one can evaluate as True: + + >>> mt.equal(1, mt.ones(1)) + array([ True]) + """ + + op = TensorEqual(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/exp.py b/python/xorbits/_mars/tensor/arithmetic/exp.py new file mode 100644 index 000000000..e722339e2 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/exp.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorExp(TensorUnaryOp): + _op_type_ = OperandDef.EXP + _func_name = "exp" + + +@infer_dtype(np.exp) +def exp(x, out=None, where=None, **kwargs): + r""" + Calculate the exponential of all elements in the input tensor. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs `. + + Returns + ------- + out : Tensor + Output tensor, element-wise exponential of `x`. + + See Also + -------- + expm1 : Calculate ``exp(x) - 1`` for all elements in the array. + exp2 : Calculate ``2**x`` for all elements in the array. + + Notes + ----- + The irrational number ``e`` is also known as Euler's number. It is + approximately 2.718281, and is the base of the natural logarithm, + ``ln`` (this means that, if :math:`x = \ln y = \log_e y`, + then :math:`e^x = y`. For real input, ``exp(x)`` is always positive. + + For complex arguments, ``x = a + ib``, we can write + :math:`e^x = e^a e^{ib}`. The first term, :math:`e^a`, is already + known (it is the real argument, described above). The second term, + :math:`e^{ib}`, is :math:`\cos b + i \sin b`, a function with + magnitude 1 and a periodic phase. + + References + ---------- + .. [1] Wikipedia, "Exponential function", + http://en.wikipedia.org/wiki/Exponential_function + .. [2] M. Abramovitz and I. A. Stegun, "Handbook of Mathematical Functions + with Formulas, Graphs, and Mathematical Tables," Dover, 1964, p. 
69, + http://www.math.sfu.ca/~cbm/aands/page_69.htm + + Examples + -------- + Plot the magnitude and phase of ``exp(x)`` in the complex plane: + + >>> import mars.tensor as mt + >>> import matplotlib.pyplot as plt + + >>> x = mt.linspace(-2*mt.pi, 2*mt.pi, 100) + >>> xx = x + 1j * x[:, mt.newaxis] # a + ib over complex plane + >>> out = mt.exp(xx) + + >>> plt.subplot(121) + >>> plt.imshow(mt.abs(out).execute(), + ... extent=[-2*mt.pi, 2*mt.pi, -2*mt.pi, 2*mt.pi], cmap='gray') + >>> plt.title('Magnitude of exp(x)') + + >>> plt.subplot(122) + >>> plt.imshow(mt.angle(out).execute(), + ... extent=[-2*mt.pi, 2*mt.pi, -2*mt.pi, 2*mt.pi], cmap='hsv') + >>> plt.title('Phase (angle) of exp(x)') + >>> plt.show() + """ + op = TensorExp(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/exp2.py b/python/xorbits/_mars/tensor/arithmetic/exp2.py new file mode 100644 index 000000000..75a4bb5e3 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/exp2.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorExp2(TensorUnaryOp): + _op_type_ = OperandDef.EXP2 + _func_name = "exp2" + + +@infer_dtype(np.exp2) +def exp2(x, out=None, where=None, **kwargs): + """ + Calculate `2**p` for all `p` in the input tensor. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Element-wise 2 to the power `x`. + + See Also + -------- + power + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.exp2([2, 3]).execute() + array([ 4., 8.]) + """ + op = TensorExp2(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/expm1.py b/python/xorbits/_mars/tensor/arithmetic/expm1.py new file mode 100644 index 000000000..fa8594c70 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/expm1.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
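+# Note on output dtypes: most wrappers in this package derive the result dtype
+# from the corresponding NumPy ufunc via @infer_dtype (np.exp, np.exp2, ... as
+# above), roughly by evaluating the ufunc on a tiny dummy input, while a few
+# force a fixed dtype via @inject_dtype (e.g. `equal` always yields bool).
+# Illustration in plain NumPy terms:
+#
+#     import numpy as np
+#     np.exp(np.int64(1)).dtype    # float64 -- what @infer_dtype(np.exp) picks up
+#     np.equal(1, 1).dtype         # bool    -- what @inject_dtype(np.bool_) enforces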
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorExpm1(TensorUnaryOp): + _op_type_ = OperandDef.EXPM1 + _func_name = "expm1" + + +@infer_dtype(np.expm1) +def expm1(x, out=None, where=None, **kwargs): + """ + Calculate ``exp(x) - 1`` for all elements in the tensor. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Element-wise exponential minus one: ``out = exp(x) - 1``. + + See Also + -------- + log1p : ``log(1 + x)``, the inverse of expm1. + + + Notes + ----- + This function provides greater precision than ``exp(x) - 1`` + for small values of ``x``. + + Examples + -------- + The true value of ``exp(1e-10) - 1`` is ``1.00000000005e-10`` to + about 32 significant digits. This example shows the superiority of + expm1 in this case. + + >>> import mars.tensor as mt + + >>> mt.expm1(1e-10).execute() + 1.00000000005e-10 + >>> (mt.exp(1e-10) - 1).execute() + 1.000000082740371e-10 + """ + op = TensorExpm1(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/fabs.py b/python/xorbits/_mars/tensor/arithmetic/fabs.py new file mode 100644 index 000000000..5e2f22018 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/fabs.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorFabs(TensorUnaryOp): + _op_type_ = OperandDef.FABS + _func_name = "fabs" + + +@infer_dtype(np.fabs) +def fabs(x, out=None, where=None, **kwargs): + """ + Compute the absolute values element-wise. + + This function returns the absolute values (positive magnitude) of the + data in `x`. Complex values are not handled, use `absolute` to find the + absolute values of complex data. 
+ + Parameters + ---------- + x : array_like + The tensor of numbers for which the absolute values are required. If + `x` is a scalar, the result `y` will also be a scalar. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The absolute values of `x`, the returned values are always floats. + + See Also + -------- + absolute : Absolute values including `complex` types. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fabs(-1).execute() + 1.0 + >>> mt.fabs([-1.2, 1.2]).execute() + array([ 1.2, 1.2]) + """ + op = TensorFabs(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/fix.py b/python/xorbits/_mars/tensor/arithmetic/fix.py new file mode 100644 index 000000000..c702b012b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/fix.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorFix(TensorUnaryOp): + _op_type_ = OperandDef.FIX + _func_name = "fix" + + +@infer_dtype(np.fix) +def fix(x, out=None, **kwargs): + """ + Round to nearest integer towards zero. + + Round a tensor of floats element-wise to nearest integer towards zero. + The rounded values are returned as floats. + + Parameters + ---------- + x : array_like + An tensor of floats to be rounded + out : Tensor, optional + Output tensor + + Returns + ------- + out : Tensor of floats + The array of rounded numbers + + See Also + -------- + trunc, floor, ceil + around : Round to given number of decimals + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fix(3.14).execute() + 3.0 + >>> mt.fix(3).execute() + 3.0 + >>> mt.fix([2.1, 2.9, -2.1, -2.9]).execute() + array([ 2., 2., -2., -2.]) + + """ + op = TensorFix(**kwargs) + return op(x, out=out) diff --git a/python/xorbits/_mars/tensor/arithmetic/float_power.py b/python/xorbits/_mars/tensor/arithmetic/float_power.py new file mode 100644 index 000000000..02f8c4655 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/float_power.py @@ -0,0 +1,101 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
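+# As the fabs docstring above notes, fabs always returns floats and does not
+# accept complex input; `absolute` is the complex-capable variant.  A short
+# illustration in plain NumPy terms:
+#
+#     import numpy as np
+#     np.fabs([-3, 4])      # array([3., 4.])  -- integer input comes back as floats
+#     np.abs(3 - 4j)        # 5.0              -- absolute handles complex values
+#     # np.fabs(3 - 4j)     # would raise TypeError: fabs is real-only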
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorFloatPower(TensorBinOp): + _op_type_ = OperandDef.FLOAT_POWER + _func_name = "float_power" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@inject_dtype(np.float64) +def float_power(x1, x2, out=None, where=None, **kwargs): + """ + First tensor elements raised to powers from second array, element-wise. + + Raise each base in `x1` to the positionally-corresponding power in `x2`. + `x1` and `x2` must be broadcastable to the same shape. This differs from + the power function in that integers, float16, and float32 are promoted to + floats with a minimum precision of float64 so that the result is always + inexact. The intent is that the function will return a usable result for + negative powers and seldom overflow for positive powers. + + Parameters + ---------- + x1 : array_like + The bases. + x2 : array_like + The exponents. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The bases in `x1` raised to the exponents in `x2`. + + See Also + -------- + power : power function that preserves type + + Examples + -------- + Cube each element in a list. + + >>> import mars.tensor as mt + + >>> x1 = range(6) + >>> x1 + [0, 1, 2, 3, 4, 5] + >>> mt.float_power(x1, 3).execute() + array([ 0., 1., 8., 27., 64., 125.]) + + Raise the bases to different exponents. + + >>> x2 = [1.0, 2.0, 3.0, 3.0, 2.0, 1.0] + >>> mt.float_power(x1, x2).execute() + array([ 0., 1., 8., 27., 16., 5.]) + + The effect of broadcasting. + + >>> x2 = mt.array([[1, 2, 3, 3, 2, 1], [1, 2, 3, 3, 2, 1]]) + >>> x2.execute() + array([[1, 2, 3, 3, 2, 1], + [1, 2, 3, 3, 2, 1]]) + >>> mt.float_power(x1, x2).execute() + array([[ 0., 1., 8., 27., 16., 5.], + [ 0., 1., 8., 27., 16., 5.]]) + """ + op = TensorFloatPower(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/floor.py b/python/xorbits/_mars/tensor/arithmetic/floor.py new file mode 100644 index 000000000..3b402ab98 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/floor.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
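+# float_power above differs from `power` in that its inputs are promoted to at
+# least float64, so integer bases with negative integer exponents still produce
+# a usable result.  Roughly, in NumPy terms:
+#
+#     import numpy as np
+#     np.float_power(2, -2)    # 0.25, dtype float64
+#     # np.power(2, -2)        # raises: integers to negative integer powers are not allowed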
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorFloor(TensorUnaryOp): + _op_type_ = OperandDef.FLOOR + _func_name = "floor" + + +@infer_dtype(np.floor) +def floor(x, out=None, where=None, **kwargs): + r""" + Return the floor of the input, element-wise. + + The floor of the scalar `x` is the largest integer `i`, such that + `i <= x`. It is often denoted as :math:`\lfloor x \rfloor`. + + Parameters + ---------- + x : array_like + Input data. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The floor of each element in `x`. + + See Also + -------- + ceil, trunc, rint + + Notes + ----- + Some spreadsheet programs calculate the "floor-towards-zero", in other + words ``floor(-2.5) == -2``. NumPy instead uses the definition of + `floor` where `floor(-2.5) == -3`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) + >>> mt.floor(a).execute() + array([-2., -2., -1., 0., 1., 1., 2.]) + """ + op = TensorFloor(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/floordiv.py b/python/xorbits/_mars/tensor/arithmetic/floordiv.py new file mode 100644 index 000000000..6b680ea5a --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/floordiv.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorFloorDiv(TensorBinOp): + _op_type_ = OperandDef.FLOORDIV + _func_name = "floor_divide" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + if x2 != 0: + return True + else: + raise ZeroDivisionError("float division by zero") + return False + + +@infer_dtype(np.floor_divide) +def floordiv(x1, x2, out=None, where=None, **kwargs): + """ + Return the largest integer smaller or equal to the division of the inputs. + It is equivalent to the Python ``//`` operator and pairs with the + Python ``%`` (`remainder`), function so that ``b = a % b + b * (a // b)`` + up to roundoff. + + Parameters + ---------- + x1 : array_like + Numerator. + x2 : array_like + Denominator. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + y = floor(`x1`/`x2`) + + + See Also + -------- + remainder : Remainder complementary to floor_divide. + divmod : Simultaneous floor division and remainder. + divide : Standard division. + floor : Round a number to the nearest integer toward minus infinity. + ceil : Round a number to the nearest integer toward infinity. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.floor_divide(7,3).execute() + 2 + >>> mt.floor_divide([1., 2., 3., 4.], 2.5).execute() + array([ 0., 0., 1., 1.]) + """ + op = TensorFloorDiv(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.floor_divide, reverse=True) +def rfloordiv(x1, x2, **kwargs): + op = TensorFloorDiv(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/fmax.py b/python/xorbits/_mars/tensor/arithmetic/fmax.py new file mode 100644 index 000000000..1e41af8d6 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/fmax.py @@ -0,0 +1,103 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorFMax(TensorBinOp): + _op_type_ = OperandDef.FMAX + _func_name = "fmax" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse() and np.isscalar(x2) and x2 <= 0: + return True + if hasattr(x2, "issparse") and x2.issparse() and np.isscalar(x1) and x1 <= 0: + return True + return False + + +@infer_dtype(np.fmax) +def fmax(x1, x2, out=None, where=None, **kwargs): + """ + Element-wise maximum of array elements. + + Compare two tensors and returns a new tensor containing the element-wise + maxima. If one of the elements being compared is a NaN, then the + non-nan element is returned. If both elements are NaNs then the first + is returned. The latter distinction is important for complex NaNs, + which are defined as at least one of the real or imaginary parts being + a NaN. The net effect is that NaNs are ignored when possible. + + Parameters + ---------- + x1, x2 : array_like + The tensors holding the elements to be compared. They must have + the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The maximum of `x1` and `x2`, element-wise. Returns scalar if + both `x1` and `x2` are scalars. + + See Also + -------- + fmin : + Element-wise minimum of two tensors, ignores NaNs. + maximum : + Element-wise maximum of two tensors, propagates NaNs. + amax : + The maximum value of an tensor along a given axis, propagates NaNs. + nanmax : + The maximum value of an tensor along a given axis, ignores NaNs. + + minimum, amin, nanmin + + Notes + ----- + The fmax is equivalent to ``mt.where(x1 >= x2, x1, x2)`` when neither + x1 nor x2 are NaNs, but it is faster and does proper broadcasting. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fmax([2, 3, 4], [1, 5, 2]).execute() + array([ 2., 5., 4.]) + + >>> mt.fmax(mt.eye(2), [0.5, 2]).execute() + array([[ 1. , 2. ], + [ 0.5, 2. ]]) + + >>> mt.fmax([mt.nan, 0, mt.nan],[0, mt.nan, mt.nan]).execute() + array([ 0., 0., NaN]) + """ + op = TensorFMax(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/fmin.py b/python/xorbits/_mars/tensor/arithmetic/fmin.py new file mode 100644 index 000000000..715124cf5 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/fmin.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
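+# floor_divide above pairs with remainder so that, up to roundoff,
+# a == (a % b) + b * (a // b); the r* helpers (rfloordiv, rdivide) are reflected
+# forms that simply swap the operand order via TensorBinOp.rcall.  A small
+# numeric check of the identity in plain NumPy terms:
+#
+#     import numpy as np
+#     a, b = 7.0, 2.5
+#     np.floor_divide(a, b) * b + np.remainder(a, b)   # 7.0, i.e. equal to a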
+ +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorFMin(TensorBinOp): + _op_type_ = OperandDef.FMIN + _func_name = "fmin" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse() and np.isscalar(x2) and x2 >= 0: + return True + if hasattr(x2, "issparse") and x2.issparse() and np.isscalar(x1) and x1 >= 0: + return True + return False + + +@infer_dtype(np.fmin) +def fmin(x1, x2, out=None, where=None, **kwargs): + """ + Element-wise minimum of array elements. + + Compare two tensors and returns a new tensor containing the element-wise + minima. If one of the elements being compared is a NaN, then the + non-nan element is returned. If both elements are NaNs then the first + is returned. The latter distinction is important for complex NaNs, + which are defined as at least one of the real or imaginary parts being + a NaN. The net effect is that NaNs are ignored when possible. + + Parameters + ---------- + x1, x2 : array_like + The tensors holding the elements to be compared. They must have + the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The minimum of `x1` and `x2`, element-wise. Returns scalar if + both `x1` and `x2` are scalars. + + See Also + -------- + fmax : + Element-wise maximum of two tensors, ignores NaNs. + minimum : + Element-wise minimum of two tensors, propagates NaNs. + amin : + The minimum value of a tensor along a given axis, propagates NaNs. + nanmin : + The minimum value of a tensor along a given axis, ignores NaNs. + + maximum, amax, nanmax + + Notes + ----- + + The fmin is equivalent to ``mt.where(x1 <= x2, x1, x2)`` when neither + x1 nor x2 are NaNs, but it is faster and does proper broadcasting. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fmin([2, 3, 4], [1, 5, 2]).execute() + array([1, 3, 2]) + + >>> mt.fmin(mt.eye(2), [0.5, 2]).execute() + array([[ 0.5, 0. ], + [ 0. , 1. ]]) + + >>> mt.fmin([mt.nan, 0, mt.nan],[0, mt.nan, mt.nan]).execute() + array([ 0., 0., NaN]) + """ + op = TensorFMin(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/fmod.py b/python/xorbits/_mars/tensor/arithmetic/fmod.py new file mode 100644 index 000000000..b06494115 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/fmod.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorFMod(TensorBinOp): + _op_type_ = OperandDef.FMOD + _func_name = "fmod" + + +@infer_dtype(np.fmod) +def fmod(x1, x2, out=None, where=None, **kwargs): + """ + Return the element-wise remainder of division. + + This is the NumPy implementation of the C library function fmod; the + remainder has the same sign as the dividend `x1`. It is equivalent to + the Matlab(TM) ``rem`` function and should not be confused with the + Python modulus operator ``x1 % x2``. + + Parameters + ---------- + x1 : array_like + Dividend. + x2 : array_like + Divisor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs <ufuncs.kwargs>`. + + Returns + ------- + y : Tensor_like + The remainder of the division of `x1` by `x2`. + + See Also + -------- + remainder : Equivalent to the Python ``%`` operator. + divide + + Notes + ----- + The result of the modulo operation for negative dividend and divisors + is bound by conventions. For `fmod`, the sign of the result is the sign of + the dividend, while for `remainder` the sign of the result is the sign + of the divisor. The `fmod` function is equivalent to the Matlab(TM) + ``rem`` function. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fmod([-3, -2, -1, 1, 2, 3], 2).execute() + array([-1, 0, -1, 1, 0, 1]) + >>> mt.remainder([-3, -2, -1, 1, 2, 3], 2).execute() + array([1, 0, 1, 1, 0, 1]) + + >>> mt.fmod([5, 3], [2, 2.]).execute() + array([ 1., 1.]) + >>> a = mt.arange(-3, 3).reshape(3, 2) + >>> a.execute() + array([[-3, -2], + [-1, 0], + [ 1, 2]]) + >>> mt.fmod(a, [2,2]).execute() + array([[-1, 0], + [-1, 0], + [ 1, 0]]) + """ + op = TensorFMod(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/frexp.py b/python/xorbits/_mars/tensor/arithmetic/frexp.py new file mode 100644 index 000000000..5aa07ae1a --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/frexp.py @@ -0,0 +1,128 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...
import opcodes as OperandDef +from ..array_utils import as_same_device, device +from .core import TensorOutBinOp + + +class TensorFrexp(TensorOutBinOp): + _op_type_ = OperandDef.FREXP + _func_name = "frexp" + + def __init__(self, casting="same_kind", dtype=None, sparse=False, **kw): + super().__init__(_casting=casting, dtype=dtype, sparse=sparse, **kw) + + @property + def _fun(self): + return np.frexp + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = {"casting": op.casting} + + inputs_iter = iter(inputs) + input = next(inputs_iter) + if op.out1 is not None: + out1 = next(inputs_iter) + else: + out1 = None + if op.out2 is not None: + out2 = next(inputs_iter) + else: + out2 = None + if op.where is not None: + where = kw["where"] = next(inputs_iter) + else: + where = None + kw["order"] = op.order + + # out1 and out2 are immutable because they come from + # the shared memory. + mantissa, exponent = xp.frexp(input) + if where is not None: + mantissa, exponent = ( + xp.where(where, mantissa, out1), + xp.where(where, exponent, out2), + ) + + for c, res in zip(op.outputs, (mantissa, exponent)): + ctx[c.key] = res + + +def frexp(x, out1=None, out2=None, out=None, where=None, **kwargs): + """ + Decompose the elements of x into mantissa and twos exponent. + + Returns (`mantissa`, `exponent`), where ``x = mantissa * 2**exponent``. + The mantissa lies in the open interval (-1, 1), while the twos + exponent is a signed integer. + + Parameters + ---------- + x : array_like + Tensor of numbers to be decomposed. + out1 : Tensor, optional + Output tensor for the mantissa. Must have the same shape as `x`. + out2 : Tensor, optional + Output tensor for the exponent. Must have the same shape as `x`. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + (mantissa, exponent) : tuple of tensors, (float, int) + `mantissa` is a float array with values between -1 and 1. + `exponent` is an int array which represents the exponent of 2. + + See Also + -------- + ldexp : Compute ``y = x1 * 2**x2``, the inverse of `frexp`. + + Notes + ----- + Complex dtypes are not supported, they will raise a TypeError. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(9) + >>> y1, y2 = mt.frexp(x) + + >>> y1_result, y2_result = mt.ExecutableTuple([y1, y2]).execute() + >>> y1_result + array([ 0. , 0.5 , 0.5 , 0.75 , 0.5 , 0.625, 0.75 , 0.875, + 0.5 ]) + >>> y2_result + array([0, 1, 2, 2, 3, 3, 3, 3, 4]) + >>> (y1 * 2**y2).execute() + array([ 0., 1., 2., 3., 4., 5., 6., 7., 8.]) + """ + op = TensorFrexp(**kwargs) + return op(x, out1=out1, out2=out2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/greater.py b/python/xorbits/_mars/tensor/arithmetic/greater.py new file mode 100644 index 000000000..6f448e3a4 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/greater.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorGreaterThan(TensorBinOp): + _op_type_ = OperandDef.GT + _func_name = "greater" + + +@inject_dtype(np.bool_) +def greater(x1, x2, out=None, where=None, **kwargs): + """ + Return the truth value of (x1 > x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. If ``x1.shape != x2.shape``, they must be + broadcastable to a common shape (which may be the shape of one or + the other). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : bool or Tensor of bool + Array of bools, or a single bool if `x1` and `x2` are scalars. + + + See Also + -------- + greater_equal, less, less_equal, equal, not_equal + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.greater([4,2],[2,2]).execute() + array([ True, False]) + + If the inputs are ndarrays, then np.greater is equivalent to '>'. + + >>> a = mt.array([4,2]) + >>> b = mt.array([2,2]) + >>> (a > b).execute() + array([ True, False]) + """ + op = TensorGreaterThan(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/greater_equal.py b/python/xorbits/_mars/tensor/arithmetic/greater_equal.py new file mode 100644 index 000000000..17b7c60af --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/greater_equal.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorGreaterEqual(TensorBinOp): + _op_type_ = OperandDef.GE + _func_name = "greater_equal" + + +@inject_dtype(np.bool_) +def greater_equal(x1, x2, out=None, where=None, **kwargs): + """ + Return the truth value of (x1 >= x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. If ``x1.shape != x2.shape``, they must be + broadcastable to a common shape (which may be the shape of one or + the other). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : bool or Tensor of bool + Array of bools, or a single bool if `x1` and `x2` are scalars. + + See Also + -------- + greater, less, less_equal, equal, not_equal + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.greater_equal([4, 2, 1], [2, 2, 2]).execute() + array([ True, True, False]) + """ + op = TensorGreaterEqual(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/hypot.py b/python/xorbits/_mars/tensor/arithmetic/hypot.py new file mode 100644 index 000000000..7ee1ccec0 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/hypot.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorHypot(TensorBinOp): + _op_type_ = OperandDef.HYPOT + _func_name = "hypot" + + +@infer_dtype(np.hypot) +def hypot(x1, x2, out=None, where=None, **kwargs): + """ + Given the "legs" of a right triangle, return its hypotenuse. + + Equivalent to ``sqrt(x1**2 + x2**2)``, element-wise. If `x1` or + `x2` is scalar_like (i.e., unambiguously cast-able to a scalar type), + it is broadcast for use with each element of the other argument. + (See Examples) + + Parameters + ---------- + x1, x2 : array_like + Leg of the triangle(s). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. 
+ where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + z : Tensor + The hypotenuse of the triangle(s). + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.hypot(3*mt.ones((3, 3)), 4*mt.ones((3, 3))).execute() + array([[ 5., 5., 5.], + [ 5., 5., 5.], + [ 5., 5., 5.]]) + + Example showing broadcast of scalar_like argument: + + >>> mt.hypot(3*mt.ones((3, 3)), [4]).execute() + array([[ 5., 5., 5.], + [ 5., 5., 5.], + [ 5., 5., 5.]]) + """ + op = TensorHypot(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/i0.py b/python/xorbits/_mars/tensor/arithmetic/i0.py new file mode 100644 index 000000000..27064e45b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/i0.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..array_utils import get_array_module, is_sparse_module +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorI0(TensorUnaryOp): + _op_type_ = OperandDef.I0 + _func_name = "i0" + + @classmethod + def execute(cls, ctx, op): + x = ctx[op.inputs[0].key] + xp = get_array_module(x) + res = xp.i0(x) + if not is_sparse_module(xp): + res = res.reshape(op.outputs[0].shape) + ctx[op.outputs[0].key] = res + + +@infer_dtype(np.i0) +def i0(x, **kwargs): + """ + Modified Bessel function of the first kind, order 0. + + Usually denoted :math:`I_0`. This function does broadcast, but will *not* + "up-cast" int dtype arguments unless accompanied by at least one float or + complex dtype argument (see Raises below). + + Parameters + ---------- + x : array_like, dtype float or complex + Argument of the Bessel function. + + Returns + ------- + out : Tensor, shape = x.shape, dtype = x.dtype + The modified Bessel function evaluated at each of the elements of `x`. + + Raises + ------ + TypeError: array cannot be safely cast to required type + If argument consists exclusively of int dtypes. + + See Also + -------- + scipy.special.iv, scipy.special.ive + + Notes + ----- + We use the algorithm published by Clenshaw [1]_ and referenced by + Abramowitz and Stegun [2]_, for which the function domain is + partitioned into the two intervals [0,8] and (8,inf), and Chebyshev + polynomial expansions are employed in each interval. Relative error on + the domain [0,30] using IEEE arithmetic is documented [3]_ as having a + peak of 5.8e-16 with an rms of 1.4e-16 (n = 30000). + + References + ---------- + .. [1] C. W. Clenshaw, "Chebyshev series for mathematical functions", in + *National Physical Laboratory Mathematical Tables*, vol. 5, London: + Her Majesty's Stationery Office, 1962. + .. [2] M. Abramowitz and I. A. 
Stegun, *Handbook of Mathematical + Functions*, 10th printing, New York: Dover, 1964, pp. 379. + http://www.math.sfu.ca/~cbm/aands/page_379.htm + .. [3] http://kobesearch.cpan.org/htdocs/Math-Cephes/Math/Cephes.html + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.i0([0.]).execute() + array([1.]) + >>> mt.i0([0., 1. + 2j]).execute() + array([ 1.00000000+0.j , 0.18785373+0.64616944j]) + + """ + op = TensorI0(**kwargs) + return op(x) diff --git a/python/xorbits/_mars/tensor/arithmetic/imag.py b/python/xorbits/_mars/tensor/arithmetic/imag.py new file mode 100644 index 000000000..5fd30e3d7 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/imag.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorImag(TensorUnaryOp): + _op_type_ = OperandDef.IMAG + _func_name = "imag" + + +@infer_dtype(np.imag) +def imag(val, **kwargs): + """ + Return the imaginary part of the complex argument. + + Parameters + ---------- + val : array_like + Input tensor. + + Returns + ------- + out : Tensor or scalar + The imaginary component of the complex argument. If `val` is real, + the type of `val` is used for the output. If `val` has complex + elements, the returned type is float. + + See Also + -------- + real, angle, real_if_close + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([1+2j, 3+4j, 5+6j]) + >>> a.imag.execute() + array([ 2., 4., 6.]) + >>> a.imag = mt.array([8, 10, 12]) + >>> a.execute() + array([ 1. +8.j, 3.+10.j, 5.+12.j]) + >>> mt.imag(1 + 1j).execute() + 1.0 + + """ + op = TensorImag(**kwargs) + return op(val) diff --git a/python/xorbits/_mars/tensor/arithmetic/invert.py b/python/xorbits/_mars/tensor/arithmetic/invert.py new file mode 100644 index 000000000..33c7fca18 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/invert.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorInvert(TensorUnaryOp): + _op_type_ = OperandDef.INVERT + _func_name = "invert" + + +@infer_dtype(np.invert) +def invert(x, out=None, where=None, **kwargs): + """ + Compute bit-wise inversion, or bit-wise NOT, element-wise. + + Computes the bit-wise NOT of the underlying binary representation of + the integers in the input tensors. This ufunc implements the C/Python + operator ``~``. + + For signed integer inputs, the two's complement is returned. In a + two's-complement system negative numbers are represented by the two's + complement of the absolute value. This is the most common method of + representing signed integers on computers [1]_. A N-bit + two's-complement system can represent every integer in the range + :math:`-2^{N-1}` to :math:`+2^{N-1}-1`. + + Parameters + ---------- + x : array_like + Only integer and boolean types are handled. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + Result. + + See Also + -------- + bitwise_and, bitwise_or, bitwise_xor + logical_not + + Notes + ----- + `bitwise_not` is an alias for `invert`: + + >>> import mars.tensor as mt + + >>> mt.bitwise_not is mt.invert + True + + References + ---------- + .. [1] Wikipedia, "Two's complement", + http://en.wikipedia.org/wiki/Two's_complement + + Examples + -------- + We've seen that 13 is represented by ``00001101``. + The invert or bit-wise NOT of 13 is then: + + >>> mt.invert(mt.array([13], dtype=mt.uint8)).execute() + array([242], dtype=uint8) + + The result depends on the bit-width: + + >>> mt.invert(mt.array([13], dtype=mt.uint16)).execute() + array([65522], dtype=uint16) + + When using signed integer types the result is the two's complement of + the result for the unsigned type: + + >>> mt.invert(mt.array([13], dtype=mt.int8)).execute() + array([-14], dtype=int8) + + Booleans are accepted as well: + + >>> mt.invert(mt.array([True, False])).execute() + array([False, True]) + """ + op = TensorInvert(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/isclose.py b/python/xorbits/_mars/tensor/arithmetic/isclose.py new file mode 100644 index 000000000..c20446214 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/isclose.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import BoolField, Float64Field +from ..array_utils import as_same_device, device +from .core import TensorBinOp + + +class TensorIsclose(TensorBinOp): + _op_type_ = OperandDef.ISCLOSE + + _rtol = Float64Field("rtol") + _atol = Float64Field("atol") + _equal_nan = BoolField("equal_nan") + + def __init__( + self, + rtol=None, + atol=None, + equal_nan=None, + casting="same_kind", + err=None, + sparse=False, + **kw + ): + err = err if err is not None else np.geterr() + super().__init__( + _rtol=rtol, + _atol=atol, + _equal_nan=equal_nan, + _casting=casting, + _err=err, + sparse=sparse, + **kw + ) + + @property + def rtol(self): + return self._rtol + + @property + def atol(self): + return self._atol + + @property + def equal_nan(self): + return self._equal_nan + + @classmethod + def _is_sparse(cls, x1, x2): + if ( + hasattr(x1, "issparse") + and x1.issparse() + and np.isscalar(x2) + and not np.isclose(x2, 0) + ): + return True + if ( + hasattr(x2, "issparse") + and x2.issparse() + and np.isscalar(x1) + and not np.isclose(x1, 0) + ): + return True + return False + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + a = op.lhs if np.isscalar(op.lhs) else inputs[0] + b = op.rhs if np.isscalar(op.rhs) else inputs[-1] + + ctx[op.outputs[0].key] = xp.isclose( + a, b, atol=op.atol, rtol=op.rtol, equal_nan=op.equal_nan + ) + + +def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): + """ + Returns a boolean tensor where two tensors are element-wise equal within a + tolerance. + + The tolerance values are positive, typically very small numbers. The + relative difference (`rtol` * abs(`b`)) and the absolute difference + `atol` are added together to compare against the absolute difference + between `a` and `b`. + + Parameters + ---------- + a, b : array_like + Input tensors to compare. + rtol : float + The relative tolerance parameter (see Notes). + atol : float + The absolute tolerance parameter (see Notes). + equal_nan : bool + Whether to compare NaN's as equal. If True, NaN's in `a` will be + considered equal to NaN's in `b` in the output tensor. + + Returns + ------- + y : array_like + Returns a boolean tensor of where `a` and `b` are equal within the + given tolerance. If both `a` and `b` are scalars, returns a single + boolean value. + + See Also + -------- + allclose + + Notes + ----- + + For finite values, isclose uses the following equation to test whether + two floating point values are equivalent. + + absolute(`a` - `b`) <= (`atol` + `rtol` * absolute(`b`)) + + The above equation is not symmetric in `a` and `b`, so that + `isclose(a, b)` might be different from `isclose(b, a)` in + some rare cases. 
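As an illustration of this asymmetry (an editorial addition, not part of the original patch): with ``atol=0`` the tolerance is ``rtol`` times the magnitude of the *second* argument only, so swapping the arguments can flip the result. The numbers below are chosen by the editor so that the difference falls between ``rtol * |a|`` and ``rtol * |b|``; plain NumPy is used so the check does not depend on a running mars session (``mt.isclose`` mirrors ``np.isclose``).

import numpy as np

# |10 - 10.51| = 0.51 lies between rtol*|10| = 0.5 and rtol*|10.51| = 0.5255,
# so the outcome depends on which argument plays the role of the reference `b`.
print(np.isclose(10.0, 10.51, rtol=0.05, atol=0.0))   # True  (0.51 <= 0.05 * 10.51)
print(np.isclose(10.51, 10.0, rtol=0.05, atol=0.0))   # False (0.51 >  0.05 * 10.0)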
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.isclose([1e10,1e-7], [1.00001e10,1e-8]).execute() + array([True, False]) + >>> mt.isclose([1e10,1e-8], [1.00001e10,1e-9]).execute() + array([True, True]) + >>> mt.isclose([1e10,1e-8], [1.0001e10,1e-9]).execute() + array([False, True]) + >>> mt.isclose([1.0, mt.nan], [1.0, mt.nan]).execute() + array([True, False]) + >>> mt.isclose([1.0, mt.nan], [1.0, mt.nan], equal_nan=True).execute() + array([True, True]) + """ + op = TensorIsclose(rtol=rtol, atol=atol, equal_nan=equal_nan, dtype=np.dtype(bool)) + return op(a, b) diff --git a/python/xorbits/_mars/tensor/arithmetic/iscomplex.py b/python/xorbits/_mars/tensor/arithmetic/iscomplex.py new file mode 100644 index 000000000..79c21c4ab --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/iscomplex.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorIsComplex(TensorUnaryOp): + _op_type_ = OperandDef.ISCOMPLEX + _func_name = "iscomplex" + + +@inject_dtype(np.bool_) +def iscomplex(x, **kwargs): + """ + Returns a bool tensor, where True if input element is complex. + + What is tested is whether the input has a non-zero imaginary part, not if + the input type is complex. + + Parameters + ---------- + x : array_like + Input tensor. + + Returns + ------- + out : Tensor of bools + Output tensor. + + See Also + -------- + isreal + iscomplexobj : Return True if x is a complex type or an array of complex + numbers. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.iscomplex([1+1j, 1+0j, 4.5, 3, 2, 2j]).execute() + array([ True, False, False, False, False, True]) + + """ + op = TensorIsComplex(**kwargs) + return op(x) diff --git a/python/xorbits/_mars/tensor/arithmetic/isfinite.py b/python/xorbits/_mars/tensor/arithmetic/isfinite.py new file mode 100644 index 000000000..c1a6372b5 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/isfinite.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorIsFinite(TensorUnaryOp): + _op_type_ = OperandDef.ISFINITE + _func_name = "isfinite" + + +@inject_dtype(np.bool_) +def isfinite(x, out=None, where=None, **kwargs): + """ + Test element-wise for finiteness (not infinity or not Not a Number). + + The result is returned as a boolean tensor. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor, bool + For scalar input, the result is a new boolean with value True + if the input is finite; otherwise the value is False (input is + either positive infinity, negative infinity or Not a Number). + + For array input, the result is a boolean array with the same + dimensions as the input and the values are True if the + corresponding element of the input is finite; otherwise the values + are False (element is either positive infinity, negative infinity + or Not a Number). + + See Also + -------- + isinf, isneginf, isposinf, isnan + + Notes + ----- + Not a Number, positive infinity and negative infinity are considered + to be non-finite. + + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + Also that positive infinity is not equivalent to negative infinity. But + infinity is equivalent to positive infinity. Errors result if the + second argument is also supplied when `x` is a scalar input, or if + first and second arguments have different shapes. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.isfinite(1).execute() + True + >>> mt.isfinite(0).execute() + True + >>> mt.isfinite(mt.nan).execute() + False + >>> mt.isfinite(mt.inf).execute() + False + >>> mt.isfinite(mt.NINF).execute() + False + >>> mt.isfinite([mt.log(-1.).execute(),1.,mt.log(0).execute()]).execute() + array([False, True, False]) + + >>> x = mt.array([-mt.inf, 0., mt.inf]) + >>> y = mt.array([2, 2, 2]) + >>> mt.isfinite(x, y).execute() + array([0, 1, 0]) + >>> y.execute() + array([0, 1, 0]) + """ + op = TensorIsFinite(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/isinf.py b/python/xorbits/_mars/tensor/arithmetic/isinf.py new file mode 100644 index 000000000..05da391b9 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/isinf.py @@ -0,0 +1,101 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorIsInf(TensorUnaryOp): + _op_type_ = OperandDef.ISINF + _func_name = "isinf" + + +@inject_dtype(np.bool_) +def isinf(x, out=None, where=None, **kwargs): + """ + Test element-wise for positive or negative infinity. + + Returns a boolean array of the same shape as `x`, True where ``x == + +/-inf``, otherwise False. + + Parameters + ---------- + x : array_like + Input values + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : bool (scalar) or boolean Tensor + For scalar input, the result is a new boolean with value True if + the input is positive or negative infinity; otherwise the value is + False. + + For tensor input, the result is a boolean tensor with the same shape + as the input and the values are True where the corresponding + element of the input is positive or negative infinity; elsewhere + the values are False. If a second argument was supplied the result + is stored there. If the type of that array is a numeric type the + result is represented as zeros and ones, if the type is boolean + then as False and True, respectively. The return value `y` is then + a reference to that tensor. + + See Also + -------- + isneginf, isposinf, isnan, isfinite + + Notes + ----- + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). + + Errors result if the second argument is supplied when the first + argument is a scalar, or if the first and second arguments have + different shapes. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.isinf(mt.inf).execute() + True + >>> mt.isinf(mt.nan).execute() + False + >>> mt.isinf(mt.NINF).execute() + True + >>> mt.isinf([mt.inf, -mt.inf, 1.0, mt.nan]).execute() + array([ True, True, False, False]) + + >>> x = mt.array([-mt.inf, 0., mt.inf]) + >>> y = mt.array([2, 2, 2]) + >>> mt.isinf(x, y).execute() + array([1, 0, 1]) + >>> y.execute() + array([1, 0, 1]) + """ + op = TensorIsInf(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/isnan.py b/python/xorbits/_mars/tensor/arithmetic/isnan.py new file mode 100644 index 000000000..595f372e3 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/isnan.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorIsNan(TensorUnaryOp): + _op_type_ = OperandDef.ISNAN + _func_name = "isnan" + + +@inject_dtype(np.bool_) +def isnan(x, out=None, where=None, **kwargs): + """ + Test element-wise for NaN and return result as a boolean tensor. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or bool + For scalar input, the result is a new boolean with value True if + the input is NaN; otherwise the value is False. + + For array input, the result is a boolean tensor of the same + dimensions as the input and the values are True if the + corresponding element of the input is NaN; otherwise the values are + False. + + See Also + -------- + isinf, isneginf, isposinf, isfinite, isnat + + Notes + ----- + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.isnan(mt.nan).execute() + True + >>> mt.isnan(mt.inf).execute() + False + >>> mt.isnan([mt.log(-1.).execute(),1.,mt.log(0).execute()]).execute() + array([ True, False, False]) + """ + op = TensorIsNan(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/isreal.py b/python/xorbits/_mars/tensor/arithmetic/isreal.py new file mode 100644 index 000000000..03abe9f1d --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/isreal.py @@ -0,0 +1,61 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorIsReal(TensorUnaryOp): + _op_type_ = OperandDef.ISREAL + _func_name = "isreal" + + +@inject_dtype(np.bool_) +def isreal(x, **kwargs): + """ + Returns a bool tensor, where True if input element is real. + + If element has complex type with zero complex part, the return value + for that element is True. + + Parameters + ---------- + x : array_like + Input tensor. + + Returns + ------- + out : Tensor, bool + Boolean tensor of same shape as `x`. 
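As an editorial aside (not part of the original patch): NumPy realises ``isreal`` as a test that the imaginary component is zero, which is a convenient way to read the description above; plain NumPy is used here rather than the mars API so the check stands on its own.

import numpy as np

x = np.array([1 + 1j, 1 + 0j, 4.5, 3, 2, 2j])
# isreal marks the elements whose imaginary part is exactly zero,
# so it agrees with an explicit comparison against imag(x).
print(np.isreal(x))        # [False  True  True  True  True False]
print(np.imag(x) == 0)     # identical result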
+ + See Also + -------- + iscomplex + isrealobj : Return True if x is not a complex type. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.isreal([1+1j, 1+0j, 4.5, 3, 2, 2j]).execute() + array([False, True, True, True, True, False]) + + """ + op = TensorIsReal(**kwargs) + return op(x) diff --git a/python/xorbits/_mars/tensor/arithmetic/ldexp.py b/python/xorbits/_mars/tensor/arithmetic/ldexp.py new file mode 100644 index 000000000..213f92a6d --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/ldexp.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorLdexp(TensorBinOp): + _op_type_ = OperandDef.LDEXP + _func_name = "ldexp" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@infer_dtype(np.ldexp) +def ldexp(x1, x2, out=None, where=None, **kwargs): + """ + Returns x1 * 2**x2, element-wise. + + The mantissas `x1` and twos exponents `x2` are used to construct + floating point numbers ``x1 * 2**x2``. + + Parameters + ---------- + x1 : array_like + Tensor of multipliers. + x2 : array_like, int + Tensor of twos exponents. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The result of ``x1 * 2**x2``. + + See Also + -------- + frexp : Return (y1, y2) from ``x = y1 * 2**y2``, inverse to `ldexp`. + + Notes + ----- + Complex dtypes are not supported, they will raise a TypeError. + + `ldexp` is useful as the inverse of `frexp`, if used by itself it is + more clear to simply use the expression ``x1 * 2**x2``. 
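A small editorial check, not part of the original patch, of the integer-exponent requirement mentioned in the Notes above and enforced by the wrapper code further below through ``np.can_cast``; plain NumPy is used so the behaviour can be reproduced without a mars runtime.

import numpy as np

# A float exponent cannot be safely cast to an integer, so ldexp rejects it.
print(np.can_cast(np.float64, np.int64, casting="safe"))  # False
print(np.ldexp(5, np.arange(4)))                          # [ 5. 10. 20. 40.]
try:
    np.ldexp(5.0, 1.5)
except TypeError as exc:
    print("TypeError:", exc)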
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.ldexp(5, mt.arange(4)).execute() + array([ 5., 10., 20., 40.], dtype=float32) + + >>> x = mt.arange(6) + >>> mt.ldexp(*mt.frexp(x)).execute() + array([ 0., 1., 2., 3., 4., 5.]) + """ + x2_dtype = astensor(x2).dtype + casting = kwargs.get("casting", "safe") + if not np.can_cast(x2_dtype, np.int64, casting=casting): + raise TypeError( + "ufunc 'ldexp' not supported for the input types, " + "and the inputs could not be safely coerced to any supported types " + f"according to the casting rule ''{casting}''" + ) + + op = TensorLdexp(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/less.py b/python/xorbits/_mars/tensor/arithmetic/less.py new file mode 100644 index 000000000..9f4e89cfb --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/less.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorLessThan(TensorBinOp): + _op_type_ = OperandDef.LT + _func_name = "less" + + +@inject_dtype(np.bool_) +def less(x1, x2, out=None, where=None, **kwargs): + """ + Return the truth value of (x1 < x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. If ``x1.shape != x2.shape``, they must be + broadcastable to a common shape (which may be the shape of one or + the other). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : bool or Tensor of bool + Array of bools, or a single bool if `x1` and `x2` are scalars. + + See Also + -------- + greater, less_equal, greater_equal, equal, not_equal + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.less([1, 2], [2, 2]).execute() + array([ True, False]) + """ + op = TensorLessThan(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/less_equal.py b/python/xorbits/_mars/tensor/arithmetic/less_equal.py new file mode 100644 index 000000000..5de0f1052 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/less_equal.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorLessEqual(TensorBinOp): + _op_type_ = OperandDef.LE + _func_name = "less_equal" + + +@inject_dtype(np.bool_) +def less_equal(x1, x2, out=None, where=None, **kwargs): + """ + Return the truth value of (x1 <= x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. If ``x1.shape != x2.shape``, they must be + broadcastable to a common shape (which may be the shape of one or + the other). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : bool or tensor of bool + Array of bools, or a single bool if `x1` and `x2` are scalars. + + See Also + -------- + greater, less, greater_equal, equal, not_equal + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.less_equal([4, 2, 1], [2, 2, 2]).execute() + array([False, True, True]) + """ + op = TensorLessEqual(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/log.py b/python/xorbits/_mars/tensor/arithmetic/log.py new file mode 100644 index 000000000..ccedd2e5e --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/log.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorLog(TensorUnaryOp): + _op_type_ = OperandDef.LOG + _func_name = "log" + + +@infer_dtype(np.log) +def log(x, out=None, where=None, **kwargs): + """ + Natural logarithm, element-wise. + + The natural logarithm `log` is the inverse of the exponential function, + so that `log(exp(x)) = x`. The natural logarithm is logarithm in base + `e`. + + Parameters + ---------- + x : array_like + Input value. + out : Tensor, None, or tuple of tensor and None, optional + A location into which the result is stored.
If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The natural logarithm of `x`, element-wise. + + See Also + -------- + log10, log2, log1p + + Notes + ----- + Logarithm is a multivalued function: for each `x` there is an infinite + number of `z` such that `exp(z) = x`. The convention is to return the + `z` whose imaginary part lies in `[-pi, pi]`. + + For real-valued input data types, `log` always returns real output. For + each value that cannot be expressed as a real number or infinity, it + yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `log` is a complex analytical function that + has a branch cut `[-inf, 0]` and is continuous from above on it. `log` + handles the floating-point negative zero as an infinitesimal negative + number, conforming to the C99 standard. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Logarithm". http://en.wikipedia.org/wiki/Logarithm + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.log([1, mt.e, mt.e**2, 0]).execute() + array([ 0., 1., 2., -Inf]) + """ + op = TensorLog(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/log10.py b/python/xorbits/_mars/tensor/arithmetic/log10.py new file mode 100644 index 000000000..87d7efbba --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/log10.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorLog10(TensorUnaryOp): + _op_type_ = OperandDef.LOG10 + _func_name = "log10" + + +@infer_dtype(np.log10) +def log10(x, out=None, where=None, **kwargs): + """ + Return the base 10 logarithm of the input tensor, element-wise. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. 
+ **kwargs + + Returns + ------- + y : Tensor + The logarithm to the base 10 of `x`, element-wise. NaNs are + returned where x is negative. + + Notes + ----- + Logarithm is a multivalued function: for each `x` there is an infinite + number of `z` such that `10**z = x`. The convention is to return the + `z` whose imaginary part lies in `[-pi, pi]`. + + For real-valued input data types, `log10` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `log10` is a complex analytical function that + has a branch cut `[-inf, 0]` and is continuous from above on it. + `log10` handles the floating-point negative zero as an infinitesimal + negative number, conforming to the C99 standard. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Logarithm". http://en.wikipedia.org/wiki/Logarithm + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.log10([1e-15, -3.]).execute() + array([-15., NaN]) + """ + op = TensorLog10(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/log1p.py b/python/xorbits/_mars/tensor/arithmetic/log1p.py new file mode 100644 index 000000000..8850a9ea3 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/log1p.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorLog1p(TensorUnaryOp): + _op_type_ = OperandDef.LOG1P + _func_name = "log1p" + + +@infer_dtype(np.log1p) +def log1p(x, out=None, where=None, **kwargs): + """ + Return the natural logarithm of one plus the input tensor, element-wise. + + Calculates ``log(1 + x)``. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + Natural logarithm of `1 + x`, element-wise. + + See Also + -------- + expm1 : ``exp(x) - 1``, the inverse of `log1p`. + + Notes + ----- + For real-valued input, `log1p` is accurate also for `x` so small + that `1 + x == 1` in floating-point accuracy. 
+ + Logarithm is a multivalued function: for each `x` there is an infinite + number of `z` such that `exp(z) = 1 + x`. The convention is to return + the `z` whose imaginary part lies in `[-pi, pi]`. + + For real-valued input data types, `log1p` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `log1p` is a complex analytical function that + has a branch cut `[-inf, -1]` and is continuous from above on it. + `log1p` handles the floating-point negative zero as an infinitesimal + negative number, conforming to the C99 standard. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Logarithm". http://en.wikipedia.org/wiki/Logarithm + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.log1p(1e-99).execute() + 1e-99 + >>> mt.log(1 + 1e-99).execute() + 0.0 + """ + op = TensorLog1p(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/log2.py b/python/xorbits/_mars/tensor/arithmetic/log2.py new file mode 100644 index 000000000..5c8ae4dbd --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/log2.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorLog2(TensorUnaryOp): + _op_type_ = OperandDef.LOG2 + _func_name = "log2" + + +@infer_dtype(np.log2) +def log2(x, out=None, where=None, **kwargs): + """ + Base-2 logarithm of `x`. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + Base-2 logarithm of `x`. + + See Also + -------- + log, log10, log1p + + Logarithm is a multivalued function: for each `x` there is an infinite + number of `z` such that `2**z = x`. The convention is to return the `z` + whose imaginary part lies in `[-pi, pi]`. + + For real-valued input data types, `log2` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. 
+ + For complex-valued input, `log2` is a complex analytical function that + has a branch cut `[-inf, 0]` and is continuous from above on it. `log2` + handles the floating-point negative zero as an infinitesimal negative + number, conforming to the C99 standard. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([0, 1, 2, 2**4]) + >>> mt.log2(x).execute() + array([-Inf, 0., 1., 4.]) + + >>> xi = mt.array([0+1.j, 1, 2+0.j, 4.j]) + >>> mt.log2(xi).execute() + array([ 0.+2.26618007j, 0.+0.j , 1.+0.j , 2.+2.26618007j]) + """ + op = TensorLog2(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/logaddexp.py b/python/xorbits/_mars/tensor/arithmetic/logaddexp.py new file mode 100644 index 000000000..9b894b4c6 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logaddexp.py @@ -0,0 +1,78 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="always_false") +class TensorLogAddExp(TensorBinOp): + _op_type_ = OperandDef.LOGADDEXP + _func_name = "logaddexp" + + +@infer_dtype(np.logaddexp) +def logaddexp(x1, x2, out=None, where=None, **kwargs): + """ + Logarithm of the sum of exponentiations of the inputs. + + Calculates ``log(exp(x1) + exp(x2))``. This function is useful in + statistics where the calculated probabilities of events may be so small + as to exceed the range of normal floating point numbers. In such cases + the logarithm of the calculated probability is stored. This function + allows adding probabilities stored in such a fashion. + + Parameters + ---------- + x1, x2 : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs `. + + Returns + ------- + result : Tensor + Logarithm of ``exp(x1) + exp(x2)``. + + See Also + -------- + logaddexp2: Logarithm of the sum of exponentiations of inputs in base 2. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> prob1 = mt.log(1e-50) + >>> prob2 = mt.log(2.5e-50) + >>> prob12 = mt.logaddexp(prob1, prob2) + >>> prob12.execute() + -113.87649168120691 + >>> mt.exp(prob12).execute() + 3.5000000000000057e-50 + """ + op = TensorLogAddExp(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/logaddexp2.py b/python/xorbits/_mars/tensor/arithmetic/logaddexp2.py new file mode 100644 index 000000000..af41dd8b7 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logaddexp2.py @@ -0,0 +1,76 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="always_false") +class TensorLogAddExp2(TensorBinOp): + _op_type_ = OperandDef.LOGADDEXP2 + _func_name = "logaddexp2" + + +@infer_dtype(np.logaddexp2) +def logaddexp2(x1, x2, out=None, where=None, **kwargs): + """ + Logarithm of the sum of exponentiations of the inputs in base-2. + + Calculates ``log2(2**x1 + 2**x2)``. This function is useful in machine + learning when the calculated probabilities of events may be so small as + to exceed the range of normal floating point numbers. In such cases + the base-2 logarithm of the calculated probability can be used instead. + This function allows adding probabilities stored in such a fashion. + + Parameters + ---------- + x1, x2 : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + result : Tensor + Base-2 logarithm of ``2**x1 + 2**x2``. + + See Also + -------- + logaddexp: Logarithm of the sum of exponentiations of the inputs. + + Examples + -------- + >>> import mars.tensor as mt + + >>> prob1 = mt.log2(1e-50) + >>> prob2 = mt.log2(2.5e-50) + >>> prob12 = mt.logaddexp2(prob1, prob2) + >>> prob1.execute(), prob2.execute(), prob12.execute() + (-166.09640474436813, -164.77447664948076, -164.28904982231052) + >>> (2**prob12).execute() + 3.4999999999999914e-50 + """ + op = TensorLogAddExp2(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/logical_and.py b/python/xorbits/_mars/tensor/arithmetic/logical_and.py new file mode 100644 index 000000000..9fe64d5b3 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logical_and.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorAnd(TensorBinOp): + _op_type_ = OperandDef.AND + _func_name = "logical_and" + + +@infer_dtype(np.logical_and) +def logical_and(x1, x2, out=None, where=None, **kwargs): + """ + Compute the truth value of x1 AND x2 element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. `x1` and `x2` must be of the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or bool + Boolean result with the same shape as `x1` and `x2` of the logical + AND operation on corresponding elements of `x1` and `x2`. + + See Also + -------- + logical_or, logical_not, logical_xor + bitwise_and + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.logical_and(True, False).execute() + False + >>> mt.logical_and([True, False], [False, False]).execute() + array([False, False]) + + >>> x = mt.arange(5) + >>> mt.logical_and(x>1, x<4).execute() + array([False, False, True, True, False]) + """ + op = TensorAnd(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.logical_and, reverse=True) +def rlogical_and(x1, x2, **kwargs): + op = TensorAnd(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/logical_not.py b/python/xorbits/_mars/tensor/arithmetic/logical_not.py new file mode 100644 index 000000000..4df067cb4 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logical_not.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorNot(TensorUnaryOp): + _op_type_ = OperandDef.NOT + _func_name = "logical_not" + + +@infer_dtype(np.logical_not) +def logical_not(x, out=None, where=None, **kwargs): + """ + Compute the truth value of NOT x element-wise. + + Parameters + ---------- + x : array_like + Logical NOT is applied to the elements of `x`. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : bool or Tensor of bool + Boolean result with the same shape as `x` of the NOT operation + on elements of `x`. + + See Also + -------- + logical_and, logical_or, logical_xor + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.logical_not(3).execute() + False + >>> mt.logical_not([True, False, 0, 1]).execute() + array([False, True, True, False]) + + >>> x = mt.arange(5) + >>> mt.logical_not(x<3).execute() + array([False, False, False, True, True]) + """ + op = TensorNot(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/logical_or.py b/python/xorbits/_mars/tensor/arithmetic/logical_or.py new file mode 100644 index 000000000..f7a31dd02 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logical_or.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorOr(TensorBinOp): + _op_type_ = OperandDef.OR + _func_name = "logical_or" + + +@infer_dtype(np.logical_or) +def logical_or(x1, x2, out=None, where=None, **kwargs): + """ + Compute the truth value of x1 OR x2 element-wise. + + Parameters + ---------- + x1, x2 : array_like + Logical OR is applied to the elements of `x1` and `x2`. + They have to be of the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. 
+ **kwargs + + Returns + ------- + y : Tensor or bool + Boolean result with the same shape as `x1` and `x2` of the logical + OR operation on elements of `x1` and `x2`. + + See Also + -------- + logical_and, logical_not, logical_xor + bitwise_or + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.logical_or(True, False).execute() + True + >>> mt.logical_or([True, False], [False, False]).execute() + array([ True, False]) + + >>> x = mt.arange(5) + >>> mt.logical_or(x < 1, x > 3).execute() + array([ True, False, False, False, True]) + """ + op = TensorOr(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.logical_or, reverse=True) +def rlogical_or(x1, x2, **kwargs): + op = TensorOr(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/logical_xor.py b/python/xorbits/_mars/tensor/arithmetic/logical_xor.py new file mode 100644 index 000000000..f3327570f --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logical_xor.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorXor(TensorBinOp): + _op_type_ = OperandDef.XOR + _func_name = "logical_xor" + + +@infer_dtype(np.logical_xor) +def logical_xor(x1, x2, out=None, where=None, **kwargs): + """ + Compute the truth value of x1 XOR x2, element-wise. + + Parameters + ---------- + x1, x2 : array_like + Logical XOR is applied to the elements of `x1` and `x2`. They must + be broadcastable to the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : bool or Tensor of bool + Boolean result of the logical XOR operation applied to the elements + of `x1` and `x2`; the shape is determined by whether or not + broadcasting of one or both arrays was required. 
+ + See Also + -------- + logical_and, logical_or, logical_not, bitwise_xor + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.logical_xor(True, False).execute() + True + >>> mt.logical_xor([True, True, False, False], [True, False, True, False]).execute() + array([False, True, True, False]) + + >>> x = mt.arange(5) + >>> mt.logical_xor(x < 1, x > 3).execute() + array([ True, False, False, False, True]) + + Simple example showing support of broadcasting + + >>> mt.logical_xor(0, mt.eye(2)).execute() + array([[ True, False], + [False, True]]) + """ + op = TensorXor(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.logical_xor, reverse=True) +def rlogical_xor(x1, x2, **kwargs): + op = TensorXor(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/lshift.py b/python/xorbits/_mars/tensor/arithmetic/lshift.py new file mode 100644 index 000000000..f9c29f1f2 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/lshift.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorLshift(TensorBinOp): + _op_type_ = OperandDef.LSHIFT + _func_name = "left_shift" + + +@infer_dtype(np.left_shift) +def lshift(x1, x2, out=None, where=None, **kwargs): + """ + Shift the bits of an integer to the left. + + Bits are shifted to the left by appending `x2` 0s at the right of `x1`. + Since the internal representation of numbers is in binary format, this + operation is equivalent to multiplying `x1` by ``2**x2``. + + Parameters + ---------- + x1 : array_like of integer type + Input values. + x2 : array_like of integer type + Number of zeros to append to `x1`. Has to be non-negative. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : tensor of integer type + Return `x1` with bits shifted `x2` times to the left. + + See Also + -------- + right_shift : Shift the bits of an integer to the right. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.left_shift(5, 2).execute() + 20 + + >>> mt.left_shift(5, [1,2,3]).execute() + array([10, 20, 40]) + """ + op = TensorLshift(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.left_shift, reverse=True) +def rlshift(x1, x2, **kwargs): + op = TensorLshift(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/maximum.py b/python/xorbits/_mars/tensor/arithmetic/maximum.py new file mode 100644 index 000000000..af3a02721 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/maximum.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorMaximum(TensorBinOp): + _op_type_ = OperandDef.MAXIMUM + _func_name = "maximum" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse() and np.isscalar(x2) and x2 <= 0: + return True + if hasattr(x2, "issparse") and x2.issparse() and np.isscalar(x1) and x1 <= 0: + return True + return False + + +@infer_dtype(np.maximum) +def maximum(x1, x2, out=None, where=None, **kwargs): + """ + Element-wise maximum of tensor elements. + + Compare two tensors and returns a new array containing the element-wise + maxima. If one of the elements being compared is a NaN, then that + element is returned. If both elements are NaNs then the first is + returned. The latter distinction is important for complex NaNs, which + are defined as at least one of the real or imaginary parts being a NaN. + The net effect is that NaNs are propagated. + + Parameters + ---------- + x1, x2 : array_like + The tensors holding the elements to be compared. They must have + the same shape, or shapes that can be broadcast to a single shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : ndarray or scalar + The maximum of `x1` and `x2`, element-wise. Returns scalar if + both `x1` and `x2` are scalars. + + See Also + -------- + minimum : + Element-wise minimum of two tensors, propagates NaNs. + fmax : + Element-wise maximum of two tensors, ignores NaNs. + amax : + The maximum value of a tensor along a given axis, propagates NaNs. + nanmax : + The maximum value of a tensor along a given axis, ignores NaNs. 
+ + fmin, amin, nanmin + + Notes + ----- + The maximum is equivalent to ``mt.where(x1 >= x2, x1, x2)`` when + neither x1 nor x2 are nans, but it is faster and does proper + broadcasting. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.maximum([2, 3, 4], [1, 5, 2]).execute() + array([2, 5, 4]) + + >>> mt.maximum(mt.eye(2), [0.5, 2]).execute() # broadcasting + array([[ 1. , 2. ], + [ 0.5, 2. ]]) + + >>> mt.maximum([mt.nan, 0, mt.nan], [0, mt.nan, mt.nan]).execute() + array([ NaN, NaN, NaN]) + >>> mt.maximum(mt.Inf, 1).execute() + inf + """ + op = TensorMaximum(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/minimum.py b/python/xorbits/_mars/tensor/arithmetic/minimum.py new file mode 100644 index 000000000..9db2b5c32 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/minimum.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorMinimum(TensorBinOp): + _op_type_ = OperandDef.MINIMUM + _func_name = "minimum" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse() and np.isscalar(x2) and x2 >= 0: + return True + if hasattr(x2, "issparse") and x2.issparse() and np.isscalar(x1) and x1 >= 0: + return True + return False + + +@infer_dtype(np.minimum) +def minimum(x1, x2, out=None, where=None, **kwargs): + """ + Element-wise minimum of tensor elements. + + Compare two tensors and returns a new tensor containing the element-wise + minima. If one of the elements being compared is a NaN, then that + element is returned. If both elements are NaNs then the first is + returned. The latter distinction is important for complex NaNs, which + are defined as at least one of the real or imaginary parts being a NaN. + The net effect is that NaNs are propagated. + + Parameters + ---------- + x1, x2 : array_like + The tensors holding the elements to be compared. They must have + the same shape, or shapes that can be broadcast to a single shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The minimum of `x1` and `x2`, element-wise. Returns scalar if + both `x1` and `x2` are scalars. + + See Also + -------- + maximum : + Element-wise maximum of two tensors, propagates NaNs. + fmin : + Element-wise minimum of two tensors, ignores NaNs. 
+ amin : + The minimum value of a tensor along a given axis, propagates NaNs. + nanmin : + The minimum value of a tensor along a given axis, ignores NaNs. + + fmax, amax, nanmax + + Notes + ----- + The minimum is equivalent to ``mt.where(x1 <= x2, x1, x2)`` when + neither x1 nor x2 are NaNs, but it is faster and does proper + broadcasting. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.minimum([2, 3, 4], [1, 5, 2]).execute() + array([1, 3, 2]) + + >>> mt.minimum(mt.eye(2), [0.5, 2]).execute() # broadcasting + array([[ 0.5, 0. ], + [ 0. , 1. ]]) + + >>> mt.minimum([mt.nan, 0, mt.nan],[0, mt.nan, mt.nan]).execute() + array([ NaN, NaN, NaN]) + >>> mt.minimum(-mt.Inf, 1).execute() + -inf + """ + op = TensorMinimum(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/mod.py b/python/xorbits/_mars/tensor/arithmetic/mod.py new file mode 100644 index 000000000..e598084ea --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/mod.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorMod(TensorBinOp): + _op_type_ = OperandDef.MOD + _func_name = "mod" + + +@infer_dtype(np.mod) +def mod(x1, x2, out=None, where=None, **kwargs): + """ + Return element-wise remainder of division. + + Computes the remainder complementary to the `floor_divide` function. It is + equivalent to the Python modulus operator ``x1 % x2`` and has the same sign + as the divisor `x2`. The MATLAB function equivalent to ``np.remainder`` + is ``mod``. + + .. warning:: + + This should not be confused with: + + * Python 3.7's `math.remainder` and C's ``remainder``, which + compute the IEEE remainder, which is the complement to + ``round(x1 / x2)``. + * The MATLAB ``rem`` function and/or the C ``%`` operator, which is the + complement to ``int(x1 / x2)``. + + Parameters + ---------- + x1 : array_like + Dividend array. + x2 : array_like + Divisor array. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The element-wise remainder of the quotient ``floor_divide(x1, x2)``. + Returns a scalar if both `x1` and `x2` are scalars. + + See Also + -------- + floor_divide : Equivalent of Python ``//`` operator. + divmod : Simultaneous floor division and remainder. + fmod : Equivalent of the MATLAB ``rem`` function.
+ divide, floor + + Notes + ----- + Returns 0 when `x2` is 0 and both `x1` and `x2` are (tensors of) + integers. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.remainder([4, 7], [2, 3]).execute() + array([0, 1]) + >>> mt.remainder(mt.arange(7), 5).execute() + array([0, 1, 2, 3, 4, 0, 1]) + """ + op = TensorMod(**kwargs) + return op(x1, x2, out=out, where=where) + + +remainder = mod + + +@infer_dtype(np.mod, reverse=True) +def rmod(x1, x2, **kwargs): + op = TensorMod(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/modf.py b/python/xorbits/_mars/tensor/arithmetic/modf.py new file mode 100644 index 000000000..c2ca31b63 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/modf.py @@ -0,0 +1,123 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from .core import TensorOutBinOp + + +class TensorModf(TensorOutBinOp): + _op_type_ = OperandDef.MODF + + def __init__(self, casting="same_kind", dtype=None, sparse=False, **kw): + super().__init__(_casting=casting, dtype=dtype, sparse=sparse, **kw) + + @property + def _fun(self): + return np.modf + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = {"casting": op.casting} + + inputs_iter = iter(inputs) + input = next(inputs_iter) + if op.out1 is not None: + out1 = next(inputs_iter) + else: + out1 = None + if op.out2 is not None: + out2 = next(inputs_iter) + else: + out2 = None + if op.where is not None: + where = kw["where"] = next(inputs_iter) + else: + where = None + kw["order"] = op.order + + try: + args = [input] + if out1 is not None: + args.append(out1.copy()) + if out2 is not None: + args.append(out2.copy()) + y1, y2 = xp.modf(*args, **kw) + except TypeError: + if where is None: + raise + y1, y2 = xp.modf(input) + y1, y2 = xp.where(where, y1, out1), xp.where(where, y2, out2) + + for c, res in zip(op.outputs, (y1, y2)): + ctx[c.key] = res + + +def modf(x, out1=None, out2=None, out=None, where=None, **kwargs): + """ + Return the fractional and integral parts of a tensor, element-wise. + + The fractional and integral parts are negative if the given number is + negative. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. 
+ **kwargs + + Returns + ------- + y1 : Tensor + Fractional part of `x`. + y2 : Tensor + Integral part of `x`. + + Notes + ----- + For integer input the return values are floats. + + See Also + -------- + divmod : ``divmod(x, 1)`` is equivalent to ``modf`` with the return values + switched, except it always has a positive remainder. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.modf([0, 3.5]).execute() + (array([ 0. , 0.5]), array([ 0., 3.])) + >>> mt.modf(-0.5).execute() + (-0.5, -0) + """ + x = astensor(x) + op = TensorModf(**kwargs) + return op(x, out1=out1, out2=out2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/multiply.py b/python/xorbits/_mars/tensor/arithmetic/multiply.py new file mode 100644 index 000000000..cc214f556 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/multiply.py @@ -0,0 +1,130 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import reduce + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import BoolField +from ..array_utils import as_same_device, device +from ..datasource import scalar +from ..utils import infer_dtype +from .core import TensorBinOp, TensorMultiOp +from .utils import TreeReductionBuilder, arithmetic_operand, tree_op_estimate_size + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorMultiply(TensorBinOp): + _op_type_ = OperandDef.MUL + _func_name = "multiply" + + +@infer_dtype(np.multiply) +def multiply(x1, x2, out=None, where=None, **kwargs): + """ + Multiply arguments element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input arrays to be multiplied. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The product of `x1` and `x2`, element-wise. Returns a scalar if + both `x1` and `x2` are scalars. + + Notes + ----- + Equivalent to `x1` * `x2` in terms of array broadcasting. 
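The ``TensorModf.execute`` method above handles ``where=`` with a mask-and-blend fallback: if the backend's ``modf`` rejects the extra output arguments, it computes the full element-wise result and then merges it into the provided buffers via ``xp.where``. A minimal NumPy-only sketch of that fallback; the buffer and mask names below are illustrative, not taken from the diff.

import numpy as np

# Sketch of the masking fallback: compute modf for every element, then keep
# the old buffer values wherever the `where` mask is False. Names below are
# illustrative only.
x = np.array([0.0, 3.5, -0.5, 2.25])
out_frac = np.full_like(x, -1.0)             # pre-existing output buffers
out_int = np.full_like(x, -1.0)
mask = np.array([True, True, False, True])   # the `where` mask

frac, integral = np.modf(x)                  # full element-wise result
out_frac = np.where(mask, frac, out_frac)    # masked-off slots keep old values
out_int = np.where(mask, integral, out_int)

print(out_frac)  # [ 0.    0.5  -1.    0.25]
print(out_int)   # [ 0.  3. -1.  2.]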
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.multiply(2.0, 4.0).execute() + 8.0 + + >>> x1 = mt.arange(9.0).reshape((3, 3)) + >>> x2 = mt.arange(3.0) + >>> mt.multiply(x1, x2).execute() + array([[ 0., 1., 4.], + [ 0., 4., 10.], + [ 0., 7., 16.]]) + """ + op = TensorMultiply(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.multiply, reverse=True) +def rmultiply(x1, x2, **kwargs): + op = TensorMultiply(**kwargs) + return op.rcall(x1, x2) + + +class TensorTreeMultiply(TensorMultiOp): + _op_type_ = OperandDef.TREE_MULTIPLY + _func_name = "multiply" + + ignore_empty_input = BoolField("ignore_empty_input", default=False) + + def __init__(self, sparse=False, **kw): + super().__init__(sparse=sparse, **kw) + + @classmethod + def _is_sparse(cls, *args): + if not args or all(np.isscalar(x) for x in args): + return False + if all( + np.isscalar(x) or (hasattr(x, "issparse") and x.issparse()) for x in args + ): + return True + return False + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + if op.ignore_empty_input: + inputs = [inp for inp in inputs if not hasattr(inp, "size") or inp.size > 0] + + with device(device_id): + ctx[op.outputs[0].key] = reduce(xp.multiply, inputs) + + @classmethod + def estimate_size(cls, ctx, op): + tree_op_estimate_size(ctx, op) + + +@infer_dtype(lambda *args: reduce(np.multiply, args)) +def tree_multiply(*args, combine_size=None, **kwargs): + class MultiplyBuilder(TreeReductionBuilder): + def _build_reduction(self, inputs, final=False): + op = TensorTreeMultiply(args=inputs, **kwargs) + return op(*inputs) + + args = [scalar(a) if np.isscalar(a) else a for a in args] + return MultiplyBuilder(combine_size).build(args) diff --git a/python/xorbits/_mars/tensor/arithmetic/nan_to_num.py b/python/xorbits/_mars/tensor/arithmetic/nan_to_num.py new file mode 100644 index 000000000..67d844f7b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/nan_to_num.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..core import Tensor +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorNanToNum(TensorUnaryOp): + _op_type_ = OperandDef.NAN_TO_NUM + _func_name = "nan_to_num" + + +@infer_dtype(np.nan_to_num) +def nan_to_num(x, copy=True, **kwargs): + """ + Replace nan with zero and inf with large finite numbers. + + If `x` is inexact, NaN is replaced by zero, and infinity and -infinity + replaced by the respectively largest and most negative finite floating + point values representable by ``x.dtype``. + + For complex dtypes, the above is applied to each of the real and + imaginary components of `x` separately. + + If `x` is not inexact, then no replacements are made. 
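``tree_multiply`` above carries no docstring; it multiplies many operands through ``TreeReductionBuilder`` in rounds of ``combine_size`` inputs rather than as one long sequential chain, which keeps the generated operand graph shallow. A rough, framework-free sketch of that reduction pattern; the ``tree_reduce`` helper is hypothetical and shown only for illustration.

from functools import reduce
import operator

def tree_reduce(values, combine_size=4, combine=operator.mul):
    # Fold groups of `combine_size` values per round until a single value
    # remains, mirroring the shallow multiply tree built by tree_multiply.
    while len(values) > 1:
        values = [
            reduce(combine, values[i : i + combine_size])
            for i in range(0, len(values), combine_size)
        ]
    return values[0]

# 1 * 2 * ... * 8 == 40320, reduced here in three rounds of pairwise products.
assert tree_reduce(list(range(1, 9)), combine_size=2) == 40320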
+ + Parameters + ---------- + x : array_like + Input data. + copy : bool, optional + Whether to create a copy of `x` (True) or to replace values + in-place (False). The in-place operation only occurs if + casting to an array does not require a copy. + Default is True. + + Returns + ------- + out : Tensor + `x`, with the non-finite values replaced. If `copy` is False, this may + be `x` itself. + + See Also + -------- + isinf : Shows which elements are positive or negative infinity. + isneginf : Shows which elements are negative infinity. + isposinf : Shows which elements are positive infinity. + isnan : Shows which elements are Not a Number (NaN). + isfinite : Shows which elements are finite (not NaN, not infinity) + + Notes + ----- + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([mt.inf, -mt.inf, mt.nan, -128, 128]) + >>> mt.nan_to_num(x).execute() + array([ 1.79769313e+308, -1.79769313e+308, 0.00000000e+000, + -1.28000000e+002, 1.28000000e+002]) + >>> y = mt.array([complex(mt.inf, mt.nan), mt.nan, complex(mt.nan, mt.inf)]) + >>> mt.nan_to_num(y).execute() + array([ 1.79769313e+308 +0.00000000e+000j, + 0.00000000e+000 +0.00000000e+000j, + 0.00000000e+000 +1.79769313e+308j]) + """ + op = TensorNanToNum(**kwargs) + ret = op(x) + + if copy: + return ret + + # set back, make sure x is a Tensor + if not isinstance(x, Tensor): + raise ValueError(f"`x` must be a Tensor, got {type(x)} instead") + x.data = ret.data + return x diff --git a/python/xorbits/_mars/tensor/arithmetic/negative.py b/python/xorbits/_mars/tensor/arithmetic/negative.py new file mode 100644 index 000000000..e7d495fa7 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/negative.py @@ -0,0 +1,63 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorNegative(TensorUnaryOp): + _op_type_ = OperandDef.NEGATIVE + _func_name = "negative" + + +@infer_dtype(np.negative) +def negative(x, out=None, where=None, **kwargs): + """ + Numerical negative, element-wise. + + Parameters + ---------- + x : array_like or scalar + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs `. 
+ + Returns + ------- + y : Tensor or scalar + Returned array or scalar: `y = -x`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.negative([1.,-1.]).execute() + array([-1., 1.]) + """ + op = TensorNegative(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/nextafter.py b/python/xorbits/_mars/tensor/arithmetic/nextafter.py new file mode 100644 index 000000000..2b4ffbfcf --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/nextafter.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorNextafter(TensorBinOp): + _op_type_ = OperandDef.NEXTAFTER + _func_name = "nextafter" + + +@infer_dtype(np.nextafter) +def nextafter(x1, x2, out=None, where=None, **kwargs): + """ + Return the next floating-point value after x1 towards x2, element-wise. + + Parameters + ---------- + x1 : array_like + Values to find the next representable value of. + x2 : array_like + The direction where to look for the next representable value of `x1`. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + The next representable values of `x1` in the direction of `x2`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> eps = mt.finfo(mt.float64).eps + >>> (mt.nextafter(1, 2) == eps + 1).execute() + True + >>> (mt.nextafter([1, 2], [2, 1]) == [eps + 1, 2 - eps]).execute() + array([ True, True]) + """ + op = TensorNextafter(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/not_equal.py b/python/xorbits/_mars/tensor/arithmetic/not_equal.py new file mode 100644 index 000000000..0c2150bf3 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/not_equal.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorNotEqual(TensorBinOp): + _op_type_ = OperandDef.NE + _func_name = "not_equal" + + +@inject_dtype(np.bool_) +def not_equal(x1, x2, out=None, where=None, **kwargs): + """ + Return (x1 != x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + not_equal : tensor bool, scalar bool + For each element in `x1, x2`, return True if `x1` is not equal + to `x2` and False otherwise. + + + See Also + -------- + equal, greater, greater_equal, less, less_equal + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.not_equal([1.,2.], [1., 3.]).execute() + array([False, True]) + >>> mt.not_equal([1, 2], [[1, 3],[1, 4]]).execute() + array([[False, True], + [False, True]]) + """ + op = TensorNotEqual(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/positive.py b/python/xorbits/_mars/tensor/arithmetic/positive.py new file mode 100644 index 000000000..39c077337 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/positive.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorPositive(TensorUnaryOp): + _op_type_ = OperandDef.POSITIVE + _func_name = "positive" + + +@infer_dtype(np.positive) +def positive(x, out=None, where=None, **kwargs): + """ + Numerical positive, element-wise. + + Parameters + ---------- + x : array_like or scalar + Input tensor. + + Returns + ------- + y : Tensor or scalar + Returned array or scalar: `y = +x`. + """ + op = TensorPositive(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/power.py b/python/xorbits/_mars/tensor/arithmetic/power.py new file mode 100644 index 000000000..0a611c801 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/power.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorPower(TensorBinOp): + _op_type_ = OperandDef.POW + _func_name = "power" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@infer_dtype(np.power) +def power(x1, x2, out=None, where=None, **kwargs): + r""" + First tensor elements raised to powers from second tensor, element-wise. + + Raise each base in `x1` to the positionally-corresponding power in + `x2`. `x1` and `x2` must be broadcastable to the same shape. Note that an + integer type raised to a negative integer power will raise a ValueError. + + Parameters + ---------- + x1 : array_like + The bases. + x2 : array_like + The exponents. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The bases in `x1` raised to the exponents in `x2`. + + See Also + -------- + float_power : power function that promotes integers to float + + Examples + -------- + Cube each element in a list. + + >>> import mars.tensor as mt + + >>> x1 = range(6) + >>> x1 + [0, 1, 2, 3, 4, 5] + >>> mt.power(x1, 3).execute() + array([ 0, 1, 8, 27, 64, 125]) + + Raise the bases to different exponents. + + >>> x2 = [1.0, 2.0, 3.0, 3.0, 2.0, 1.0] + >>> mt.power(x1, x2).execute() + array([ 0., 1., 8., 27., 16., 5.]) + + The effect of broadcasting. + + >>> x2 = mt.array([[1, 2, 3, 3, 2, 1], [1, 2, 3, 3, 2, 1]]) + >>> x2.execute() + array([[1, 2, 3, 3, 2, 1], + [1, 2, 3, 3, 2, 1]]) + >>> mt.power(x1, x2).execute() + array([[ 0, 1, 8, 27, 16, 5], + [ 0, 1, 8, 27, 16, 5]]) + """ + op = TensorPower(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.power, reverse=True) +def rpower(x1, x2, **kwargs): + op = TensorPower(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/rad2deg.py b/python/xorbits/_mars/tensor/arithmetic/rad2deg.py new file mode 100644 index 000000000..34b5067a2 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/rad2deg.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorRad2deg(TensorUnaryOp): + _op_type_ = OperandDef.RAD2DEG + _func_name = "rad2deg" + + +@infer_dtype(np.rad2deg) +def rad2deg(x, out=None, where=None, **kwargs): + """ + Convert angles from radians to degrees. + + Parameters + ---------- + x : array_like + Angle in radians. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding angle in degrees. + + See Also + -------- + deg2rad : Convert angles from degrees to radians. + + Notes + ----- + rad2deg(x) is ``180 * x / pi``. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.rad2deg(mt.pi/2).execute() + 90.0 + """ + op = TensorRad2deg(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/radians.py b/python/xorbits/_mars/tensor/arithmetic/radians.py new file mode 100644 index 000000000..2aa943f1f --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/radians.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorRadians(TensorUnaryOp): + _op_type_ = OperandDef.RADIANS + _func_name = "radians" + + +@infer_dtype(np.radians) +def radians(x, out=None, where=None, **kwargs): + """ + Convert angles from degrees to radians. + + Parameters + ---------- + x : array_like + Input tensor in degrees. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. 
+ where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding radian values. + + See Also + -------- + deg2rad : equivalent function + + Examples + -------- + Convert a degree array to radians + + >>> import mars.tensor as mt + + >>> deg = mt.arange(12.) * 30. + >>> mt.radians(deg).execute() + array([ 0. , 0.52359878, 1.04719755, 1.57079633, 2.0943951 , + 2.61799388, 3.14159265, 3.66519143, 4.1887902 , 4.71238898, + 5.23598776, 5.75958653]) + + >>> out = mt.zeros((deg.shape)) + >>> ret = mt.radians(deg, out) + >>> ret is out + True + """ + op = TensorRadians(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/real.py b/python/xorbits/_mars/tensor/arithmetic/real.py new file mode 100644 index 000000000..32b651d77 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/real.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorReal(TensorUnaryOp): + _op_type_ = OperandDef.REAL + _func_name = "real" + + +@infer_dtype(np.real) +def real(val, **kwargs): + """ + Return the real part of the complex argument. + + Parameters + ---------- + val : array_like + Input tensor. + + Returns + ------- + out : Tensor or scalar + The real component of the complex argument. If `val` is real, the type + of `val` is used for the output. If `val` has complex elements, the + returned type is float. + + See Also + -------- + real_if_close, imag, angle + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([1+2j, 3+4j, 5+6j]) + >>> a.real.execute() + array([ 1., 3., 5.]) + >>> a.real = 9 + >>> a.execute() + array([ 9.+2.j, 9.+4.j, 9.+6.j]) + >>> a.real = mt.array([9, 8, 7]) + >>> a.execute() + array([ 9.+2.j, 8.+4.j, 7.+6.j]) + >>> mt.real(1 + 1j).execute() + 1.0 + + """ + op = TensorReal(**kwargs) + return op(val) diff --git a/python/xorbits/_mars/tensor/arithmetic/reciprocal.py b/python/xorbits/_mars/tensor/arithmetic/reciprocal.py new file mode 100644 index 000000000..cb3bb901b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/reciprocal.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorReciprocal(TensorUnaryOp): + _op_type_ = OperandDef.RECIPROCAL + _func_name = "reciprocal" + + +@infer_dtype(np.reciprocal) +def reciprocal(x, out=None, where=None, **kwargs): + """ + Return the reciprocal of the argument, element-wise. + + Calculates ``1/x``. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + Return tensor. + + Notes + ----- + .. note:: + This function is not designed to work with integers. + + For integer arguments with absolute value larger than 1 the result is + always zero because of the way Python handles integer division. For + integer zero the result is an overflow. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.reciprocal(2.).execute() + 0.5 + >>> mt.reciprocal([1, 2., 3.33]).execute() + array([ 1. , 0.5 , 0.3003003]) + """ + op = TensorReciprocal(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/rint.py b/python/xorbits/_mars/tensor/arithmetic/rint.py new file mode 100644 index 000000000..c15408e2b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/rint.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorRint(TensorUnaryOp): + _op_type_ = OperandDef.RINT + _func_name = "rint" + + +@infer_dtype(np.rint) +def rint(x, out=None, where=None, **kwargs): + """ + Round elements of the tensor to the nearest integer. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. 
+ where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor or scalar + Output array is same shape and type as `x`. + + See Also + -------- + ceil, floor, trunc + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) + >>> mt.rint(a).execute() + array([-2., -2., -0., 0., 2., 2., 2.]) + """ + op = TensorRint(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/rshift.py b/python/xorbits/_mars/tensor/arithmetic/rshift.py new file mode 100644 index 000000000..93aef63e2 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/rshift.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorRshift(TensorBinOp): + _op_type_ = OperandDef.RSHIFT + _func_name = "right_shift" + + +@infer_dtype(np.right_shift) +def rshift(x1, x2, out=None, where=None, **kwargs): + """ + Shift the bits of an integer to the right. + + Bits are shifted to the right `x2`. Because the internal + representation of numbers is in binary format, this operation is + equivalent to dividing `x1` by ``2**x2``. + + Parameters + ---------- + x1 : array_like, int + Input values. + x2 : array_like, int + Number of bits to remove at the right of `x1`. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor, int + Return `x1` with bits shifted `x2` times to the right. + + See Also + -------- + left_shift : Shift the bits of an integer to the left. + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.right_shift(10, 1).execute() + 5 + + >>> mt.right_shift(10, [1,2,3]).execute() + array([5, 2, 1]) + """ + op = TensorRshift(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.right_shift, reverse=True) +def rrshift(x1, x2, **kwargs): + op = TensorRshift(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/setimag.py b/python/xorbits/_mars/tensor/arithmetic/setimag.py new file mode 100644 index 000000000..f3aad4bf9 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/setimag.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..array_utils import as_same_device, device +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorSetImag(TensorBinOp): + _op_type_ = OperandDef.SET_IMAG + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + if len(inputs) == 1: + val, imag = inputs[0], op.rhs + else: + assert len(inputs) == 2 + val, imag = inputs + + with device(device_id): + val = val.copy() + val.imag = imag + + ctx[op.outputs[0].key] = val + + +def set_imag(val, imag): + op = TensorSetImag(dtype=val.dtype) + return op(val, imag) diff --git a/python/xorbits/_mars/tensor/arithmetic/setreal.py b/python/xorbits/_mars/tensor/arithmetic/setreal.py new file mode 100644 index 000000000..76cbe52c8 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/setreal.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..array_utils import as_same_device, device +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorSetReal(TensorBinOp): + _op_type_ = OperandDef.SET_REAL + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + if len(inputs) == 1: + val, real = inputs[0], op.rhs + else: + assert len(inputs) == 2 + val, real = inputs + + with device(device_id): + val = val.copy() + val.real = real + + ctx[op.outputs[0].key] = val + + +def set_real(val, real): + op = TensorSetReal(dtype=val.dtype) + return op(val, real) diff --git a/python/xorbits/_mars/tensor/arithmetic/sign.py b/python/xorbits/_mars/tensor/arithmetic/sign.py new file mode 100644 index 000000000..d988337ba --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/sign.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorSign(TensorUnaryOp): + _op_type_ = OperandDef.SIGN + _func_name = "sign" + + +@infer_dtype(np.sign) +def sign(x, out=None, where=None, **kwargs): + r""" + Returns an element-wise indication of the sign of a number. + + The `sign` function returns ``-1 if x < 0, 0 if x==0, 1 if x > 0``. nan + is returned for nan inputs. + + For complex inputs, the `sign` function returns + ``sign(x.real) + 0j if x.real != 0 else sign(x.imag) + 0j``. + + complex(nan, 0) is returned for complex nan inputs. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The sign of `x`. + + Notes + ----- + There is more than one definition of sign in common use for complex + numbers. The definition used here is equivalent to :math:`x/\sqrt{x*x}` + which is different from a common alternative, :math:`x/|x|`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.sign([-5., 4.5]).execute() + array([-1., 1.]) + >>> mt.sign(0).execute() + 0 + >>> mt.sign(5-2j).execute() + (1+0j) + """ + op = TensorSign(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/signbit.py b/python/xorbits/_mars/tensor/arithmetic/signbit.py new file mode 100644 index 000000000..8e576f614 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/signbit.py @@ -0,0 +1,63 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorSignbit(TensorUnaryOp): + _op_type_ = OperandDef.SIGNBIT + _func_name = "signbit" + + +@inject_dtype(np.bool_) +def signbit(x, out=None, where=None, **kwargs): + """ + Returns element-wise True where signbit is set (less than zero). 
+ + Parameters + ---------- + x : array_like + The input value(s). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + result : Tensor of bool + Output tensor, or reference to `out` if that was supplied. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.signbit(-1.2).execute() + True + >>> mt.signbit(mt.array([1, -2.3, 2.1])).execute() + array([False, True, False]) + """ + op = TensorSignbit(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/sin.py b/python/xorbits/_mars/tensor/arithmetic/sin.py new file mode 100644 index 000000000..b022af5ed --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/sin.py @@ -0,0 +1,96 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorSin(TensorUnaryOp): + _op_type_ = OperandDef.SIN + _func_name = "sin" + + +@infer_dtype(np.sin) +def sin(x, out=None, where=None, **kwargs): + r""" + Trigonometric sine, element-wise. + + Parameters + ---------- + x : array_like + Angle, in radians (:math:`2 \pi` rad equals 360 degrees). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : array_like + The sine of each element of x. + + See Also + -------- + arcsin, sinh, cos + + Notes + ----- + The sine is one of the fundamental functions of trigonometry (the + mathematical study of triangles). Consider a circle of radius 1 + centered on the origin. A ray comes in from the :math:`+x` axis, makes + an angle at the origin (measured counter-clockwise from that axis), and + departs from the origin. The :math:`y` coordinate of the outgoing + ray's intersection with the unit circle is the sine of that angle. It + ranges from -1 for :math:`x=3\pi / 2` to +1 for :math:`\pi / 2.` The + function has zeroes where the angle is a multiple of :math:`\pi`. 
+    Sines of angles between :math:`\pi` and :math:`2\pi` are negative.
+    The numerous properties of the sine and related functions are included
+    in any standard trigonometry text.
+
+    Examples
+    --------
+    Print sine of one angle:
+
+    >>> import mars.tensor as mt
+
+    >>> mt.sin(mt.pi/2.).execute()
+    1.0
+
+    Print sines of an array of angles given in degrees:
+
+    >>> mt.sin(mt.array((0., 30., 45., 60., 90.)) * mt.pi / 180. ).execute()
+    array([ 0. , 0.5 , 0.70710678, 0.8660254 , 1. ])
+
+    Plot the sine function:
+
+    >>> import matplotlib.pylab as plt
+    >>> x = mt.linspace(-mt.pi, mt.pi, 201)
+    >>> plt.plot(x.execute(), mt.sin(x).execute())
+    >>> plt.xlabel('Angle [rad]')
+    >>> plt.ylabel('sin(x)')
+    >>> plt.axis('tight')
+    >>> plt.show()
+    """
+    op = TensorSin(**kwargs)
+    return op(x, out=out, where=where)
diff --git a/python/xorbits/_mars/tensor/arithmetic/sinc.py b/python/xorbits/_mars/tensor/arithmetic/sinc.py
new file mode 100644
index 000000000..a1e12b510
--- /dev/null
+++ b/python/xorbits/_mars/tensor/arithmetic/sinc.py
@@ -0,0 +1,100 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ... import opcodes as OperandDef
+from ..utils import infer_dtype
+from .core import TensorUnaryOp
+from .utils import arithmetic_operand
+
+
+@arithmetic_operand(sparse_mode="unary")
+class TensorSinc(TensorUnaryOp):
+    _op_type_ = OperandDef.SINC
+    _func_name = "sinc"
+
+
+@infer_dtype(np.sinc)
+def sinc(x, **kwargs):
+    r"""
+    Return the sinc function.
+
+    The sinc function is :math:`\\sin(\\pi x)/(\\pi x)`.
+
+    Parameters
+    ----------
+    x : Tensor
+        Tensor (possibly multi-dimensional) of values for which to
+        calculate ``sinc(x)``.
+
+    Returns
+    -------
+    out : Tensor
+        ``sinc(x)``, which has the same shape as the input.
+
+    Notes
+    -----
+    ``sinc(0)`` is the limit value 1.
+
+    The name sinc is short for "sine cardinal" or "sinus cardinalis".
+
+    The sinc function is used in various signal processing applications,
+    including in anti-aliasing, in the construction of a Lanczos resampling
+    filter, and in interpolation.
+
+    For bandlimited interpolation of discrete-time signals, the ideal
+    interpolation kernel is proportional to the sinc function.
+
+    References
+    ----------
+    .. [1] Weisstein, Eric W. "Sinc Function." From MathWorld--A Wolfram Web
+           Resource. http://mathworld.wolfram.com/SincFunction.html
+    ..
[2] Wikipedia, "Sinc function", + http://en.wikipedia.org/wiki/Sinc_function + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.linspace(-4, 4, 41) + >>> mt.sinc(x).execute() + array([ -3.89804309e-17, -4.92362781e-02, -8.40918587e-02, + -8.90384387e-02, -5.84680802e-02, 3.89804309e-17, + 6.68206631e-02, 1.16434881e-01, 1.26137788e-01, + 8.50444803e-02, -3.89804309e-17, -1.03943254e-01, + -1.89206682e-01, -2.16236208e-01, -1.55914881e-01, + 3.89804309e-17, 2.33872321e-01, 5.04551152e-01, + 7.56826729e-01, 9.35489284e-01, 1.00000000e+00, + 9.35489284e-01, 7.56826729e-01, 5.04551152e-01, + 2.33872321e-01, 3.89804309e-17, -1.55914881e-01, + -2.16236208e-01, -1.89206682e-01, -1.03943254e-01, + -3.89804309e-17, 8.50444803e-02, 1.26137788e-01, + 1.16434881e-01, 6.68206631e-02, 3.89804309e-17, + -5.84680802e-02, -8.90384387e-02, -8.40918587e-02, + -4.92362781e-02, -3.89804309e-17]) + + >>> import matplotlib.pyplot as plt + >>> plt.plot(x.execute(), np.sinc(x).execute()) + [] + >>> plt.title("Sinc Function") + + >>> plt.ylabel("Amplitude") + + >>> plt.xlabel("X") + + >>> plt.show() + """ + op = TensorSinc(**kwargs) + return op(x) diff --git a/python/xorbits/_mars/tensor/arithmetic/sinh.py b/python/xorbits/_mars/tensor/arithmetic/sinh.py new file mode 100644 index 000000000..f11f2a542 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/sinh.py @@ -0,0 +1,91 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorSinh(TensorUnaryOp): + _op_type_ = OperandDef.SINH + _func_name = "sinh" + + +@infer_dtype(np.sinh) +def sinh(x, out=None, where=None, **kwargs): + """ + Hyperbolic sine, element-wise. + + Equivalent to ``1/2 * (mt.exp(x) - mt.exp(-x))`` or + ``-1j * mt.sin(1j*x)``. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding hyperbolic sine values. + + Notes + ----- + If `out` is provided, the function writes the result into it, + and returns a reference to `out`. (See Examples) + + References + ---------- + M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions. + New York, NY: Dover, 1972, pg. 83. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.sinh(0).execute() + 0.0 + >>> mt.sinh(mt.pi*1j/2).execute() + 1j + >>> mt.sinh(mt.pi*1j).execute() # (exact value is 0) + 1.2246063538223773e-016j + >>> # Discrepancy due to vagaries of floating point arithmetic. + + >>> # Example of providing the optional output parameter + >>> out1 = mt.zeros(1) + >>> out2 = mt.sinh([0.1], out1) + >>> out2 is out1 + True + + >>> # Example of ValueError due to provision of shape mis-matched `out` + >>> mt.sinh(mt.zeros((3,3)),mt.zeros((2,2))).execute() + Traceback (most recent call last): + ... + ValueError: operands could not be broadcast together with shapes (3,3) (2,2) + """ + op = TensorSinh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/spacing.py b/python/xorbits/_mars/tensor/arithmetic/spacing.py new file mode 100644 index 000000000..a3c1c97ab --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/spacing.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="always_false") +class TensorSpacing(TensorUnaryOp): + _op_type_ = OperandDef.SPACING + _func_name = "spacing" + + +@infer_dtype(np.spacing) +def spacing(x, out=None, where=None, **kwargs): + """ + Return the distance between x and the nearest adjacent number. + + Parameters + ---------- + x : array_like + Values to find the spacing of. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + The spacing of values of `x1`. + + Notes + ----- + It can be considered as a generalization of EPS: + ``spacing(mt.float64(1)) == mt.finfo(mt.float64).eps``, and there + should not be any representable number between ``x + spacing(x)`` and + x for any finite x. + + Spacing of +- inf and NaN is NaN. + + Examples + -------- + >>> import mars.tensor as mt + + >>> (mt.spacing(1) == mt.finfo(mt.float64).eps).execute() + True + """ + op = TensorSpacing(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/sqrt.py b/python/xorbits/_mars/tensor/arithmetic/sqrt.py new file mode 100644 index 000000000..b5b9ee785 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/sqrt.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ... import opcodes as OperandDef
+from ..utils import infer_dtype
+from .core import TensorUnaryOp
+from .utils import arithmetic_operand
+
+
+@arithmetic_operand(sparse_mode="unary")
+class TensorSqrt(TensorUnaryOp):
+    _op_type_ = OperandDef.SQRT
+    _func_name = "sqrt"
+
+
+@infer_dtype(np.sqrt)
+def sqrt(x, out=None, where=None, **kwargs):
+    """
+    Return the positive square-root of a tensor, element-wise.
+
+    Parameters
+    ----------
+    x : array_like
+        The values whose square-roots are required.
+    out : Tensor, None, or tuple of Tensor and None, optional
+        A location into which the result is stored. If provided, it must have
+        a shape that the inputs broadcast to. If not provided or `None`,
+        a freshly-allocated tensor is returned. A tuple (possible only as a
+        keyword argument) must have length equal to the number of outputs.
+    where : array_like, optional
+        Values of True indicate to calculate the ufunc at that position, values
+        of False indicate to leave the value in the output alone.
+    **kwargs
+
+    Returns
+    -------
+    y : Tensor
+        A tensor of the same shape as `x`, containing the positive
+        square-root of each element in `x`. If any element in `x` is
+        complex, a complex tensor is returned (and the square-roots of
+        negative reals are calculated). If all of the elements in `x`
+        are real, so is `y`, with negative elements returning ``nan``.
+        If `out` was provided, `y` is a reference to it.
+
+    Notes
+    -----
+    *sqrt* has--consistent with common convention--as its branch cut the
+    real "interval" [`-inf`, 0), and is continuous from above on it.
+    A branch cut is a curve in the complex plane across which a given
+    complex function fails to be continuous.
+
+    Examples
+    --------
+    >>> import mars.tensor as mt
+
+    >>> mt.sqrt([1,4,9]).execute()
+    array([ 1., 2., 3.])
+
+    >>> mt.sqrt([4, -1, -3+4J]).execute()
+    array([ 2.+0.j, 0.+1.j, 1.+2.j])
+
+    >>> mt.sqrt([4, -1, mt.inf]).execute()
+    array([ 2., NaN, Inf])
+    """
+    op = TensorSqrt(**kwargs)
+    return op(x, out=out, where=where)
diff --git a/python/xorbits/_mars/tensor/arithmetic/square.py b/python/xorbits/_mars/tensor/arithmetic/square.py
new file mode 100644
index 000000000..37c6418b0
--- /dev/null
+++ b/python/xorbits/_mars/tensor/arithmetic/square.py
@@ -0,0 +1,67 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ...
import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorSquare(TensorUnaryOp): + _op_type_ = OperandDef.SQUARE + _func_name = "square" + + +@infer_dtype(np.square) +def square(x, out=None, where=None, **kwargs): + """ + Return the element-wise square of the input. + + Parameters + ---------- + x : array_like + Input data. + out : Tensor, None, or tuple of tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Element-wise `x*x`, of the same shape and dtype as `x`. + Returns scalar if `x` is a scalar. + + See Also + -------- + sqrt + power + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.square([-1j, 1]).execute() + array([-1.-0.j, 1.+0.j]) + """ + op = TensorSquare(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/subtract.py b/python/xorbits/_mars/tensor/arithmetic/subtract.py new file mode 100644 index 000000000..ebc39eeb6 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/subtract.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorSubtract(TensorBinOp): + _op_type_ = OperandDef.SUB + _func_name = "subtract" + + +@infer_dtype(np.subtract) +def subtract(x1, x2, out=None, where=None, **kwargs): + """ + Subtract arguments, element-wise. + + Parameters + ---------- + x1, x2 : array_like + The tensors to be subtracted from each other. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The difference of `x1` and `x2`, element-wise. Returns a scalar if + both `x1` and `x2` are scalars. + + Notes + ----- + Equivalent to ``x1 - x2`` in terms of tensor broadcasting. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.subtract(1.0, 4.0).execute() + -3.0 + + >>> x1 = mt.arange(9.0).reshape((3, 3)) + >>> x2 = mt.arange(3.0) + >>> mt.subtract(x1, x2).execute() + array([[ 0., 0., 0.], + [ 3., 3., 3.], + [ 6., 6., 6.]]) + """ + op = TensorSubtract(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.subtract, reverse=True) +def rsubtract(x1, x2, **kwargs): + op = TensorSubtract(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/tan.py b/python/xorbits/_mars/tensor/arithmetic/tan.py new file mode 100644 index 000000000..ef9495ba7 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/tan.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorTan(TensorUnaryOp): + _op_type_ = OperandDef.TAN + _func_name = "tan" + + +@infer_dtype(np.tan) +def tan(x, out=None, where=None, **kwargs): + """ + Compute tangent element-wise. + + Equivalent to ``mt.sin(x)/mt.cos(x)`` element-wise. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding tangent values. + + Notes + ----- + If `out` is provided, the function writes the result into it, + and returns a reference to `out`. (See Examples) + + References + ---------- + M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions. + New York, NY: Dover, 1972. 
+
+    Examples
+    --------
+    >>> from math import pi
+    >>> import mars.tensor as mt
+    >>> mt.tan(mt.array([-pi,pi/2,pi])).execute()
+    array([ 1.22460635e-16, 1.63317787e+16, -1.22460635e-16])
+    >>>
+    >>> # Example of providing the optional output parameter illustrating
+    >>> # that what is returned is a reference to said parameter
+    >>> out1 = mt.zeros(1)
+    >>> out2 = mt.tan([0.1], out1)
+    >>> out2 is out1
+    True
+    >>>
+    >>> # Example of ValueError due to provision of shape mis-matched `out`
+    >>> mt.tan(mt.zeros((3,3)),mt.zeros((2,2)))
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    ValueError: invalid return array shape
+    """
+    op = TensorTan(**kwargs)
+    return op(x, out=out, where=where)
diff --git a/python/xorbits/_mars/tensor/arithmetic/tanh.py b/python/xorbits/_mars/tensor/arithmetic/tanh.py
new file mode 100644
index 000000000..ad21f6b94
--- /dev/null
+++ b/python/xorbits/_mars/tensor/arithmetic/tanh.py
@@ -0,0 +1,90 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ... import opcodes as OperandDef
+from ..utils import infer_dtype
+from .core import TensorUnaryOp
+from .utils import arithmetic_operand
+
+
+@arithmetic_operand(sparse_mode="unary")
+class TensorTanh(TensorUnaryOp):
+    _op_type_ = OperandDef.TANH
+    _func_name = "tanh"
+
+
+@infer_dtype(np.tanh)
+def tanh(x, out=None, where=None, **kwargs):
+    """
+    Compute hyperbolic tangent element-wise.
+
+    Equivalent to ``mt.sinh(x)/mt.cosh(x)`` or ``-1j * mt.tan(1j*x)``.
+
+    Parameters
+    ----------
+    x : array_like
+        Input tensor.
+    out : Tensor, None, or tuple of Tensor and None, optional
+        A location into which the result is stored. If provided, it must have
+        a shape that the inputs broadcast to. If not provided or `None`,
+        a freshly-allocated tensor is returned. A tuple (possible only as a
+        keyword argument) must have length equal to the number of outputs.
+    where : array_like, optional
+        Values of True indicate to calculate the ufunc at that position, values
+        of False indicate to leave the value in the output alone.
+    **kwargs
+
+    Returns
+    -------
+    y : Tensor
+        The corresponding hyperbolic tangent values.
+
+    Notes
+    -----
+    If `out` is provided, the function writes the result into it,
+    and returns a reference to `out`. (See Examples)
+
+    References
+    ----------
+    .. [1] M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions.
+           New York, NY: Dover, 1972, pg. 83.
+           http://www.math.sfu.ca/~cbm/aands/
+
+    .. [2] Wikipedia, "Hyperbolic function",
+           http://en.wikipedia.org/wiki/Hyperbolic_function
+
+    Examples
+    --------
+    >>> import mars.tensor as mt
+
+    >>> mt.tanh((0, mt.pi*1j, mt.pi*1j/2)).execute()
+    array([ 0. +0.00000000e+00j, 0. -1.22460635e-16j, 0.
+1.63317787e+16j]) + + >>> # Example of providing the optional output parameter illustrating + >>> # that what is returned is a reference to said parameter + >>> out1 = mt.zeros(1) + >>> out2 = mt.tanh([0.1], out1) + >>> out2 is out1 + True + + >>> # Example of ValueError due to provision of shape mis-matched `out` + >>> mt.tanh(mt.zeros((3,3)),mt.zeros((2,2))) + Traceback (most recent call last): + ... + ValueError: operands could not be broadcast together with shapes (3,3) (2,2) + """ + op = TensorTanh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/tests/__init__.py b/python/xorbits/_mars/tensor/arithmetic/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic.py b/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic.py new file mode 100644 index 000000000..7ebaf45a8 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic.py @@ -0,0 +1,640 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from ....core import enter_mode, tile +from ...core import SparseTensor, Tensor +from ...datasource import array, empty, ones, tensor +from ...fetch import TensorFetch +from ...linalg import matmul +from .. 
import ( + TensorAdd, + TensorGreaterThan, + TensorIsclose, + TensorLog, + TensorSubtract, + TensorTreeAdd, + TensorTreeMultiply, + add, + around, + cos, + frexp, + isclose, + isfinite, + log, + negative, + subtract, + tree_add, + tree_multiply, + truediv, +) + + +def test_add(): + t1 = ones((3, 4), chunk_size=2) + t2 = ones(4, chunk_size=2) + t3 = t1 + t2 + k1 = t3.key + assert t3.op.gpu is None + t1, t2, t3 = tile(t1, t2, t3) + assert t3.key != k1 + assert t3.shape == (3, 4) + assert len(t3.chunks) == 4 + assert t3.chunks[0].inputs == [t1.chunks[0].data, t2.chunks[0].data] + assert t3.chunks[1].inputs == [t1.chunks[1].data, t2.chunks[1].data] + assert t3.chunks[2].inputs == [t1.chunks[2].data, t2.chunks[0].data] + assert t3.chunks[3].inputs == [t1.chunks[3].data, t2.chunks[1].data] + assert t3.op.dtype == np.dtype("f8") + assert t3.chunks[0].op.dtype == np.dtype("f8") + + t1 = ones((3, 4), chunk_size=2) + t4 = t1 + 1 + t1, t4 = tile(t1, t4) + assert t4.shape == (3, 4) + assert len(t3.chunks) == 4 + assert t4.chunks[0].inputs == [t1.chunks[0].data] + assert t4.chunks[0].op.rhs == 1 + assert t4.chunks[1].inputs == [t1.chunks[1].data] + assert t4.chunks[1].op.rhs == 1 + assert t4.chunks[2].inputs == [t1.chunks[2].data] + assert t4.chunks[2].op.rhs == 1 + assert t4.chunks[3].inputs == [t1.chunks[3].data] + assert t4.chunks[3].op.rhs == 1 + + t5 = add([1, 2, 3, 4], 1) + tile(t5) + assert t4.chunks[0].inputs == [t1.chunks[0].data] + + t2 = ones(4, chunk_size=2) + t6 = ones((3, 4), chunk_size=2, gpu=True) + t7 = ones(4, chunk_size=2, gpu=True) + t8 = t6 + t7 + t9 = t6 + t2 + assert t8.op.gpu is True + t8, t9 = tile(t8, t9) + assert t8.chunks[0].op.gpu is True + assert t9.op.gpu is None + assert t9.chunks[0].op.gpu is None + + # sparse tests + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + + t = t1 + 1 + assert t.op.gpu is None + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + t = t1 + 0 + assert t.issparse() is True + assert type(t) is SparseTensor + + t2 = tensor([[1, 0, 0]], chunk_size=2).tosparse() + + t = t1 + t2 + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + t3 = tensor([1, 1, 1], chunk_size=2) + t = t1 + t3 + assert t.issparse() is False + assert type(t) is Tensor + + t = tile(t) + assert t.chunks[0].op.sparse is False + + +def test_add_order(): + raw_a = np.random.rand(4, 2) + raw_b = np.asfortranarray(np.random.rand(4, 2)) + t1 = tensor(raw_a) + t2 = tensor(raw_b) + out = tensor(raw_b) + + # C + scalar + assert (t1 + 1).flags["C_CONTIGUOUS"] == (raw_a + 1).flags["C_CONTIGUOUS"] + assert (t1 + 1).flags["F_CONTIGUOUS"] == (raw_a + 1).flags["F_CONTIGUOUS"] + # C + C + assert (t1 + t1).flags["C_CONTIGUOUS"] == (raw_a + raw_a).flags["C_CONTIGUOUS"] + assert (t1 + t1).flags["F_CONTIGUOUS"] == (raw_a + raw_a).flags["F_CONTIGUOUS"] + # F + scalar + assert (t2 + 1).flags["C_CONTIGUOUS"] == (raw_b + 1).flags["C_CONTIGUOUS"] + assert (t2 + 1).flags["F_CONTIGUOUS"] == (raw_b + 1).flags["F_CONTIGUOUS"] + # F + F + assert (t2 + t2).flags["C_CONTIGUOUS"] == (raw_b + raw_b).flags["C_CONTIGUOUS"] + assert (t2 + t2).flags["F_CONTIGUOUS"] == (raw_b + raw_b).flags["F_CONTIGUOUS"] + # C + F + assert (t1 + t2).flags["C_CONTIGUOUS"] == (raw_a + raw_b).flags["C_CONTIGUOUS"] + assert (t1 + t2).flags["F_CONTIGUOUS"] == (raw_a + raw_b).flags["F_CONTIGUOUS"] + # C + C + out + assert ( + add(t1, t1, out=out).flags["C_CONTIGUOUS"] + == np.add(raw_a, raw_a, 
out=np.empty((4, 2), order="F")).flags["C_CONTIGUOUS"] + ) + assert ( + add(t1, t1, out=out).flags["F_CONTIGUOUS"] + == np.add(raw_a, raw_a, out=np.empty((4, 2), order="F")).flags["F_CONTIGUOUS"] + ) + + with pytest.raises(TypeError): + add(t1, 1, order="B") + + +def test_multiply(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + + t = t1 * 10 + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + t2 = tensor([[1, 0, 0]], chunk_size=2).tosparse() + + t = t1 * t2 + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + t3 = tensor([1, 1, 1], chunk_size=2) + t = t1 * t3 + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + +def test_divide(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + + t = t1 / 10 + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + t2 = tensor([[1, 0, 0]], chunk_size=2).tosparse() + + t = t1 / t2 + assert t.issparse() is False + assert type(t) is Tensor + + t = tile(t) + assert t.chunks[0].op.sparse is False + + t3 = tensor([1, 1, 1], chunk_size=2) + t = t1 / t3 + assert t.issparse() is False + assert type(t) is Tensor + + t = tile(t) + assert t.chunks[0].op.sparse is False + + t = t3 / t1 + assert t.issparse() is False + assert type(t) is Tensor + + t = tile(t) + assert t.chunks[0].op.sparse is False + + +def test_datatime_arith(): + t1 = array([np.datetime64("2005-02-02"), np.datetime64("2005-02-03")]) + t2 = t1 + np.timedelta64(1) + + assert isinstance(t2.op, TensorAdd) + + t3 = t1 - np.datetime64("2005-02-02") + + assert isinstance(t3.op, TensorSubtract) + assert ( + t3.dtype + == ( + np.array(["2005-02-02", "2005-02-03"], dtype=np.datetime64) + - np.datetime64("2005-02-02") + ).dtype + ) + + t1 = array([np.datetime64("2005-02-02"), np.datetime64("2005-02-03")]) + subtract(t1, np.datetime64("2005-02-02"), out=empty(t1.shape, dtype=t3.dtype)) + + t1 = array([np.datetime64("2005-02-02"), np.datetime64("2005-02-03")]) + add(t1, np.timedelta64(1, "D"), out=t1) + + +def test_add_with_out(): + t1 = ones((3, 4), chunk_size=2) + t2 = ones(4, chunk_size=2) + + t3 = add(t1, t2, out=t1) + + assert isinstance(t1.op, TensorAdd) + assert t1.op.out.key == t1.op.lhs.key + assert t3 is t1 + assert t3.shape == (3, 4) + assert t3.op.lhs.extra_params.raw_chunk_size == 2 + assert t3.op.rhs is t2.data + assert t3.key != t3.op.lhs.key + + t1, t3 = tile(t1, t3) + + assert isinstance(t1.chunks[0].op, TensorAdd) + assert t1.chunks[0].op.out.key == t1.chunks[0].op.lhs.key + + with pytest.raises(TypeError): + add(t1, t2, out=1) + + with pytest.raises(ValueError): + add(t1, t2, out=t2) + + with pytest.raises(TypeError): + truediv(t1, t2, out=t1.astype("i8")) + + t1 = ones((3, 4), chunk_size=2, dtype=float) + t2 = ones(4, chunk_size=2, dtype=int) + + t3 = add(t2, 1, out=t1) + assert t3.shape == (3, 4) + assert t3.dtype == np.float64 + + +def test_dtype_from_out(): + x = array([-np.inf, 0.0, np.inf]) + y = array([2, 2, 2]) + + t3 = isfinite(x, y) + assert t3.dtype == y.dtype + + +def test_log_without_where(): + t1 = ones((3, 4), chunk_size=2) + + t2 = log(t1, out=t1) + + assert isinstance(t2.op, TensorLog) + assert t1.op.out.key == t1.op.input.key + assert t2 is t1 + assert t2.op.input.extra_params.raw_chunk_size == 2 + assert t2.key != t2.op.input.key + + t3 = empty((3, 4), chunk_size=2) + t4 = 
log(t1, out=t3, where=t1 > 0) + assert isinstance(t4.op, TensorLog) + assert t4 is t3 + assert t2.op.input.extra_params.raw_chunk_size == 2 + assert t2.key != t2.op.input.key + + +def test_copy_add(): + t1 = ones((3, 4), chunk_size=2) + t2 = ones(4, chunk_size=2) + t3 = t1 + t2 + t3 = tile(t3) + + c = t3.chunks[0] + inputs = ( + c.op.lhs, + TensorFetch().new_chunk( + c.op.rhs.inputs, + shape=c.op.rhs.shape, + index=c.op.rhs.index, + _key=c.op.rhs.key, + ), + ) + new_c = c.op.copy().reset_key().new_chunk(inputs, shape=c.shape, _key="new_key") + assert new_c.key == "new_key" + assert new_c.inputs[1] is new_c.op.rhs + assert isinstance(new_c.inputs[1].op, TensorFetch) + + +def test_compare(): + t1 = ones(4, chunk_size=2) * 2 + t2 = ones(4, chunk_size=2) + t3 = t1 > t2 + t3 = tile(t3) + assert len(t3.chunks) == 2 + assert isinstance(t3.op, TensorGreaterThan) + + +def test_unify_chunk_add(): + t1 = ones(4, chunk_size=2) + t2 = ones(1, chunk_size=1) + + t3 = t1 + t2 + t1, t2, t3 = tile(t1, t2, t3) + + assert len(t3.chunks) == 2 + assert t3.chunks[0].inputs[0] == t1.chunks[0].data + assert t3.chunks[0].inputs[1] == t2.chunks[0].data + assert t3.chunks[1].inputs[0] == t1.chunks[1].data + assert t3.chunks[1].inputs[1] == t2.chunks[0].data + + +def test_frexp(): + t1 = ones((3, 4, 5), chunk_size=2) + t2 = empty((3, 4, 5), dtype=np.float_, chunk_size=2) + op_type = type(t1.op) + + o1, o2 = frexp(t1) + + assert o1.op is o2.op + assert o1.dtype != o2.dtype + + o1, o2 = frexp(t1, t1) + + assert o1 is t1 + assert o1.inputs[0] is not t1 + assert isinstance(o1.inputs[0].op, op_type) + assert o2.inputs[0] is not t1 + + o1, o2 = frexp(t1, t2, where=t1 > 0) + + op_type = type(t2.op) + assert o1 is t2 + assert o1.inputs[0] is not t1 + assert isinstance(o1.inputs[0].op, op_type) + assert o2.inputs[0] is not t1 + + +def test_frexp_order(): + raw1 = np.asfortranarray(np.random.rand(2, 4)) + t = tensor(raw1) + o1 = tensor(np.random.rand(2, 4)) + + o1, o2 = frexp(t, out1=o1) + + assert ( + o1.flags["C_CONTIGUOUS"] + == np.frexp(raw1, np.empty((2, 4)))[0].flags["C_CONTIGUOUS"] + ) + assert ( + o1.flags["F_CONTIGUOUS"] + == np.frexp(raw1, np.empty((2, 4)))[0].flags["F_CONTIGUOUS"] + ) + assert o2.flags["C_CONTIGUOUS"] == np.frexp(raw1)[1].flags["C_CONTIGUOUS"] + assert o2.flags["F_CONTIGUOUS"] == np.frexp(raw1)[1].flags["F_CONTIGUOUS"] + + +def test_dtype(): + t1 = ones((2, 3), dtype="f4", chunk_size=2) + + t = truediv(t1, 2, dtype="f8") + + assert t.dtype == np.float64 + + with pytest.raises(TypeError): + truediv(t1, 2, dtype="i4") + + +def test_negative(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + + t = negative(t1) + assert t.op.gpu is None + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + +def test_negative_order(): + raw1 = np.random.rand(4, 2) + raw2 = np.asfortranarray(np.random.rand(4, 2)) + t1 = tensor(raw1) + t2 = tensor(raw2) + t3 = tensor(raw1) + t4 = tensor(raw2) + + # C + assert negative(t1).flags["C_CONTIGUOUS"] == np.negative(raw1).flags["C_CONTIGUOUS"] + assert negative(t1).flags["F_CONTIGUOUS"] == np.negative(raw1).flags["F_CONTIGUOUS"] + # F + assert negative(t2).flags["C_CONTIGUOUS"] == np.negative(raw2).flags["C_CONTIGUOUS"] + assert negative(t2).flags["F_CONTIGUOUS"] == np.negative(raw2).flags["F_CONTIGUOUS"] + # C + out + assert ( + negative(t1, out=t4).flags["C_CONTIGUOUS"] + == np.negative(raw1, out=np.empty((4, 2), order="F")).flags["C_CONTIGUOUS"] + ) + assert ( + negative(t1, 
out=t4).flags["F_CONTIGUOUS"] + == np.negative(raw1, out=np.empty((4, 2), order="F")).flags["F_CONTIGUOUS"] + ) + # F + out + assert ( + negative(t2, out=t3).flags["C_CONTIGUOUS"] + == np.negative(raw1, out=np.empty((4, 2), order="C")).flags["C_CONTIGUOUS"] + ) + assert ( + negative(t2, out=t3).flags["F_CONTIGUOUS"] + == np.negative(raw1, out=np.empty((4, 2), order="C")).flags["F_CONTIGUOUS"] + ) + + with pytest.raises(TypeError): + negative(t1, order="B") + + +def test_cos(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + + t = cos(t1) + assert t.issparse() is True + assert type(t) is SparseTensor + + +def test_around(): + t1 = ones((2, 3), dtype="f4", chunk_size=2) + + t = around(t1, decimals=3) + + assert t.issparse() is False + assert t.op.decimals == 3 + + t = tile(t) + + assert t.chunks[0].op.decimals == 3 + + +def test_isclose(): + t1 = ones((2, 3), dtype="f4", chunk_size=2) + + atol = 1e-4 + rtol = 1e-5 + equal_nan = True + + t = isclose(t1, 2, atol=atol, rtol=rtol, equal_nan=equal_nan) + + assert isinstance(t.op, TensorIsclose) + assert t.op.atol == atol + assert t.op.rtol == rtol + assert t.op.equal_nan == equal_nan + + t = tile(t) + + assert isinstance(t.chunks[0].op, TensorIsclose) + assert t.chunks[0].op.atol == atol + assert t.chunks[0].op.rtol == rtol + assert t.chunks[0].op.equal_nan == equal_nan + + t1 = ones((2, 3), dtype="f4", chunk_size=2) + t2 = ones((2, 3), dtype="f4", chunk_size=2) + + atol = 1e-4 + rtol = 1e-5 + equal_nan = True + + t = isclose(t1, t2, atol=atol, rtol=rtol, equal_nan=equal_nan) + + assert isinstance(t.op, TensorIsclose) + assert t.op.atol == atol + assert t.op.rtol == rtol + assert t.op.equal_nan == equal_nan + + t = tile(t) + + assert isinstance(t.chunks[0].op, TensorIsclose) + assert t.chunks[0].op.atol == atol + assert t.chunks[0].op.rtol == rtol + assert t.chunks[0].op.equal_nan == equal_nan + + +def test_matmul(): + a_data = [[1, 0], [0, 1]] + b_data = [[4, 1], [2, 2]] + + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=1) + + t = matmul(a, b) + + assert t.shape == (2, 2) + t = tile(t) + assert t.shape == tuple(sum(s) for s in t.nsplits) + + b_data = [1, 2] + b = tensor(b_data, chunk_size=1) + + t = matmul(a, b) + + assert t.shape == (2,) + t = tile(t) + assert t.shape == tuple(sum(s) for s in t.nsplits) + + t = matmul(b, a) + + assert t.shape == (2,) + t = tile(t) + assert t.shape == tuple(sum(s) for s in t.nsplits) + + a_data = np.arange(2 * 2 * 4).reshape((2, 2, 4)) + b_data = np.arange(2 * 2 * 4).reshape((2, 4, 2)) + + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=1) + + t = matmul(a, b) + + assert t.shape == (2, 2, 2) + t = tile(t) + assert t.shape == tuple(sum(s) for s in t.nsplits) + + t = matmul(tensor([2j, 3j], chunk_size=1), tensor([2j, 3j], chunk_size=1)) + + assert t.shape == () + t = tile(t) + assert t.shape == tuple(sum(s) for s in t.nsplits) + + with pytest.raises(ValueError): + matmul([1, 2], 3) + + with pytest.raises(ValueError): + matmul(np.random.randn(2, 3, 4), np.random.randn(3, 4, 3)) + + t = matmul( + tensor(np.random.randn(2, 3, 4), chunk_size=2), + tensor(np.random.randn(3, 1, 4, 3), chunk_size=3), + ) + assert t.shape == (3, 2, 3, 3) + + v = ones((100, 100), chunk_size=10) + tv = matmul(v, v) + assert tv.shape == (100, 100) + tv = tile(tv) + assert tv.shape == tuple(sum(s) for s in tv.nsplits) + + +def test_tree_arithmetic(): + raws = [np.random.rand(10, 10) for _ in range(10)] + tensors = [tensor(a, chunk_size=3) for a in raws] + + t = tree_add(*tensors, 
combine_size=4) + assert isinstance(t.op, TensorTreeAdd) + assert t.issparse() is False + assert len(t.inputs) == 3 + assert len(t.inputs[0].inputs) == 4 + assert len(t.inputs[-1].inputs) == 2 + + t = tree_multiply(*tensors, combine_size=4) + assert isinstance(t.op, TensorTreeMultiply) + assert t.issparse() is False + assert len(t.inputs) == 3 + assert len(t.inputs[0].inputs) == 4 + assert len(t.inputs[-1].inputs) == 2 + + raws = [sps.random(5, 9, density=0.1) for _ in range(10)] + tensors = [tensor(a, chunk_size=3) for a in raws] + + t = tree_add(*tensors, combine_size=4) + assert isinstance(t.op, TensorTreeAdd) + assert t.issparse() is True + assert len(t.inputs) == 3 + assert len(t.inputs[0].inputs) == 4 + assert len(t.inputs[-1].inputs) == 2 + + t = tree_multiply(*tensors, combine_size=4) + assert isinstance(t.op, TensorTreeMultiply) + assert t.issparse() is True + assert len(t.inputs) == 3 + assert len(t.inputs[0].inputs) == 4 + assert len(t.inputs[-1].inputs) == 2 + + +def test_get_set_real(): + a_data = np.array([1 + 2j, 3 + 4j, 5 + 6j]) + a = tensor(a_data, chunk_size=2) + + with pytest.raises(ValueError): + a.real = [2, 4] + + +def test_build_mode(): + t1 = ones((2, 3), chunk_size=2) + assert t1 == 2 + + with enter_mode(build=True): + assert t1 != 2 diff --git a/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic_execution.py b/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic_execution.py new file mode 100644 index 000000000..a9d6580d5 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic_execution.py @@ -0,0 +1,795 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import operator + +import numpy as np +import pytest +import scipy.sparse as sps + +from ....config import option_context +from ....session import execute, fetch +from ....tests.core import require_cupy +from ....utils import ignore_warning +from ...datasource import arange, ones, tensor, zeros +from .. 
import ( + add, + arctan2, + clip, + cos, + frexp, + isclose, + modf, + tree_add, + tree_multiply, + truediv, +) + + +def _nan_equal(a, b): + try: + np.testing.assert_equal(a, b) + except AssertionError: + return False + return True + + +def _get_func(op): + if isinstance(op, str): + return getattr(np, op) + return op + + +def _get_sparse_func(op): + from ....lib.sparse.core import issparse + + if isinstance(op, str): + op = getattr(np, op) + + def func(*args): + new_args = [] + for arg in args: + if issparse(arg): + new_args.append(arg.toarray()) + else: + new_args.append(arg) + + return op(*new_args) + + return func + + +def toarray(x): + if hasattr(x, "toarray"): + return x.toarray() + return x + + +def test_base_execution(setup): + arr = ones((10, 8), chunk_size=2) + arr2 = arr + 1 + + res = arr2.execute().fetch() + + np.testing.assert_array_equal(res, np.ones((10, 8)) + 1) + + data = np.random.random((10, 8, 3)) + arr = tensor(data, chunk_size=2) + arr2 = arr + 1 + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, data + 1) + + +def test_base_order_execution(setup): + raw = np.asfortranarray(np.random.rand(5, 6)) + arr = tensor(raw, chunk_size=3) + + res = (arr + 1).execute().fetch() + np.testing.assert_array_equal(res, raw + 1) + assert res.flags["C_CONTIGUOUS"] is False + assert res.flags["F_CONTIGUOUS"] is True + + res2 = add(arr, 1, order="C").execute().fetch() + np.testing.assert_array_equal(res2, np.add(raw, 1, order="C")) + assert res2.flags["C_CONTIGUOUS"] is True + assert res2.flags["F_CONTIGUOUS"] is False + + +def test_ufunc_execution(setup): + from .. import ( + BIN_UFUNC, + UNARY_UFUNC, + arccosh, + bitand, + bitor, + bitxor, + fmod, + invert, + ldexp, + lshift, + mod, + rshift, + ) + + _sp_unary_ufunc = {arccosh, invert} + _sp_bin_ufunc = {mod, fmod, bitand, bitor, bitxor, lshift, rshift, ldexp} + + data1 = np.random.random((5, 6, 2)) + data2 = np.random.random((5, 6, 2)) + rand = np.random.random() + arr1 = tensor(data1, chunk_size=3) + arr2 = tensor(data2, chunk_size=3) + + _new_unary_ufunc = UNARY_UFUNC - _sp_unary_ufunc + for func in _new_unary_ufunc: + res_tensor = func(arr1) + assert res_tensor.dtype is not None + res = res_tensor.execute().fetch() + expected = _get_func(res_tensor.op._func_name)(data1) + np.testing.assert_array_almost_equal(res, expected) + + _new_bin_ufunc = BIN_UFUNC - _sp_bin_ufunc + for func in _new_bin_ufunc: + res_tensor1 = func(arr1, arr2) + assert res_tensor1.dtype is not None + res_tensor2 = func(arr1, rand) + assert res_tensor2.dtype is not None + res_tensor3 = func(rand, arr1) + assert res_tensor3.dtype is not None + + res1 = res_tensor1.execute().fetch() + res2 = res_tensor2.execute().fetch() + res3 = res_tensor3.execute().fetch() + + expected1 = _get_func(res_tensor1.op._func_name)(data1, data2) + expected2 = _get_func(res_tensor1.op._func_name)(data1, rand) + expected3 = _get_func(res_tensor1.op._func_name)(rand, data1) + + np.testing.assert_array_almost_equal(res1, expected1) + np.testing.assert_array_almost_equal(res2, expected2) + np.testing.assert_array_almost_equal(res3, expected3) + + data1 = np.random.randint(2, 10, size=(10, 10, 10)) + data2 = np.random.randint(2, 10, size=(10, 10, 10)) + rand = np.random.randint(1, 10) + arr1 = tensor(data1, chunk_size=6) + arr2 = tensor(data2, chunk_size=6) + + for func in _sp_unary_ufunc: + res_tensor = func(arr1) + assert res_tensor.dtype is not None + res = res_tensor.execute().fetch() + expected = _get_func(res_tensor.op._func_name)(data1) + 
np.testing.assert_array_almost_equal(res, expected) + + for func in _sp_bin_ufunc: + res_tensor1 = func(arr1, arr2) + assert res_tensor1.dtype is not None + res_tensor2 = func(arr1, rand) + assert res_tensor2.dtype is not None + res_tensor3 = func(rand, arr1) + assert res_tensor3.dtype is not None + + res1 = res_tensor1.execute().fetch() + res2 = res_tensor2.execute().fetch() + res3 = res_tensor3.execute().fetch() + + expected1 = _get_func(res_tensor1.op._func_name)(data1, data2) + expected2 = _get_func(res_tensor1.op._func_name)(data1, rand) + expected3 = _get_func(res_tensor1.op._func_name)(rand, data1) + + np.testing.assert_array_almost_equal(res1, expected1) + np.testing.assert_array_almost_equal(res2, expected2) + np.testing.assert_array_almost_equal(res3, expected3) + + +def test_sparse_ufunc_execution(setup): + from .. import add, arccosh, mod, square + + _normal_unary_ufunc = [square] + _normal_bin_ufunc = [add] + _sp_unary_ufunc = [arccosh] + _sp_bin_ufunc = [mod] + + data1 = sps.random(5, 9, density=0.1) + data2 = sps.random(5, 9, density=0.2) + rand = np.random.random() + arr1 = tensor(data1, chunk_size=3) + arr2 = tensor(data2, chunk_size=3) + + for func in _normal_unary_ufunc: + res_tensor = func(arr1) + res = res_tensor.execute().fetch() + expected = _get_sparse_func(res_tensor.op._func_name)(data1) + _nan_equal(toarray(res[0]), expected) + + for func in _normal_bin_ufunc: + res_tensor1 = func(arr1, arr2) + res_tensor2 = func(arr1, rand) + res_tensor3 = func(rand, arr1) + + res1 = res_tensor1.execute().fetch() + res2 = res_tensor2.execute().fetch() + res3 = res_tensor3.execute().fetch() + + expected1 = _get_sparse_func(res_tensor1.op._func_name)(data1, data2) + expected2 = _get_sparse_func(res_tensor1.op._func_name)(data1, rand) + expected3 = _get_sparse_func(res_tensor1.op._func_name)(rand, data1) + + _nan_equal(toarray(res1[0]), expected1) + _nan_equal(toarray(res2[0]), expected2) + _nan_equal(toarray(res3[0]), expected3) + + data1 = np.random.randint(2, 10, size=(10, 10)) + data2 = np.random.randint(2, 10, size=(10, 10)) + rand = np.random.randint(1, 10) + arr1 = tensor(data1, chunk_size=3).tosparse() + arr2 = tensor(data2, chunk_size=3).tosparse() + + for func in _sp_unary_ufunc: + res_tensor = func(arr1) + res = res_tensor.execute().fetch() + expected = _get_sparse_func(res_tensor.op._func_name)(data1) + _nan_equal(toarray(res[0]), expected) + + for func in _sp_bin_ufunc: + res_tensor1 = func(arr1, arr2) + res_tensor2 = func(arr1, rand) + res_tensor3 = func(rand, arr1) + + res1 = res_tensor1.execute().fetch() + res2 = res_tensor2.execute().fetch() + res3 = res_tensor3.execute().fetch() + expected1 = _get_sparse_func(res_tensor1.op._func_name)(data1, data2) + expected2 = _get_sparse_func(res_tensor1.op._func_name)(data1, rand) + expected3 = _get_sparse_func(res_tensor1.op._func_name)(rand, data1) + + _nan_equal(toarray(res1[0]), expected1) + _nan_equal(toarray(res2[0]), expected2) + _nan_equal(toarray(res3[0]), expected3) + + +def test_add_with_out_execution(setup): + data1 = np.random.random((5, 9, 4)) + data2 = np.random.random((9, 4)) + + arr1 = tensor(data1.copy(), chunk_size=3) + arr2 = tensor(data2.copy(), chunk_size=3) + + add(arr1, arr2, out=arr1) + res = arr1.execute().fetch() + np.testing.assert_array_equal(res, data1 + data2) + + arr1 = tensor(data1.copy(), chunk_size=3) + arr2 = tensor(data2.copy(), chunk_size=3) + + arr3 = add(arr1, arr2, out=arr1.astype("i4"), casting="unsafe") + res = arr3.execute().fetch() + np.testing.assert_array_equal(res, (data1 + 
data2).astype("i4")) + + arr1 = tensor(data1.copy(), chunk_size=3) + arr2 = tensor(data2.copy(), chunk_size=3) + + arr3 = truediv(arr1, arr2, out=arr1, where=arr2 > 0.5) + res = arr3.execute().fetch() + np.testing.assert_array_equal( + res, np.true_divide(data1, data2, out=data1.copy(), where=data2 > 0.5) + ) + + arr1 = tensor(data1.copy(), chunk_size=4) + arr2 = tensor(data2.copy(), chunk_size=4) + + arr3 = add(arr1, arr2, where=arr1 > 0.5) + res = arr3.execute().fetch() + expected = np.add(data1, data2, where=data1 > 0.5) + np.testing.assert_array_equal(res[data1 > 0.5], expected[data1 > 0.5]) + + arr1 = tensor(data1.copy(), chunk_size=4) + + arr3 = add(arr1, 1, where=arr1 > 0.5) + res = arr3.execute().fetch() + expected = np.add(data1, 1, where=data1 > 0.5) + np.testing.assert_array_equal(res[data1 > 0.5], expected[data1 > 0.5]) + + arr1 = tensor(data2.copy(), chunk_size=3) + + arr3 = add(arr1[:5, :], 1, out=arr1[-5:, :]) + res = arr3.execute().fetch() + expected = np.add(data2[:5, :], 1) + np.testing.assert_array_equal(res, expected) + + +def test_arctan2_execution(setup): + x = tensor(1) # scalar + y = arctan2(x, x) + + assert y.issparse() is False + result = y.execute().fetch() + np.testing.assert_equal(result, np.arctan2(1, 1)) + + y = arctan2(0, x) + + assert y.issparse() is False + result = y.execute().fetch() + np.testing.assert_equal(result, np.arctan2(0, 1)) + + raw1 = np.array([[0, 1, 2]]) + raw2 = sps.csr_matrix([[0, 1, 0]]) + y = arctan2(raw1, raw2) + + assert y.issparse() is False + result = y.execute().fetch() + np.testing.assert_equal(result, np.arctan2(raw1, raw2.A)) + + y = arctan2(raw2, raw2) + + assert y.issparse() is True + result = y.execute().fetch() + np.testing.assert_equal(result, np.arctan2(raw2.A, raw2.A)) + + y = arctan2(0, raw2) + + assert y.issparse() is True + result = y.execute().fetch() + np.testing.assert_equal(result, np.arctan2(0, raw2.A)) + + +@pytest.mark.ray_dag +def test_frexp_execution(setup): + data1 = np.random.RandomState(0).randint(0, 100, (5, 9, 6)) + + arr1 = tensor(data1.copy(), chunk_size=4) + + o1, o2 = frexp(arr1) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.frexp(data1)) + np.testing.assert_array_almost_equal(res, expected) + + arr1 = tensor(data1.copy(), chunk_size=4) + o1 = zeros(data1.shape, chunk_size=4) + o2 = zeros(data1.shape, dtype="i8", chunk_size=4) + frexp(arr1, o1, o2) + res1, res2 = fetch(*execute(o1, o2)) + + res = res1 * 2**res2 + np.testing.assert_array_almost_equal(res, data1, decimal=3) + + data1 = sps.random(5, 9, density=0.1) + + arr1 = tensor(data1.copy(), chunk_size=4) + + o1, o2 = frexp(arr1) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.frexp(data1.toarray())) + np.testing.assert_equal(res.toarray(), expected) + + x = np.arange(9) + a = np.zeros(9) + b = np.zeros(9) + mx = arange(9) + ma = zeros(9) + mb = zeros(9) + res = frexp(mx, ma, mb, where=mx > 5).execute() + expected = np.frexp(x, a, b, where=x > 5) + np.testing.assert_equal(res[0], expected[0]) + np.testing.assert_equal(res[1], expected[1]) + + +def test_frexp_order_execution(setup): + data1 = np.random.RandomState(0).random((5, 9)) + t = tensor(data1, chunk_size=3) + + o1, o2 = frexp(t, order="F") + res1, res2 = execute(o1, o2) + expected1, expected2 = np.frexp(data1, order="F") + np.testing.assert_allclose(res1, expected1) + assert res1.flags["F_CONTIGUOUS"] is True + assert res1.flags["C_CONTIGUOUS"] is False + np.testing.assert_allclose(res2, expected2) + assert res2.flags["F_CONTIGUOUS"] is True + assert 
res2.flags["C_CONTIGUOUS"] is False + + +def test_modf_execution(setup): + data1 = np.random.random((5, 9)) + + arr1 = tensor(data1.copy(), chunk_size=3) + + o1, o2 = modf(arr1) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.modf(data1)) + np.testing.assert_array_almost_equal(res, expected) + + o1, o2 = modf([0, 3.5]) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.modf([0, 3.5])) + np.testing.assert_array_almost_equal(res, expected) + + arr1 = tensor(data1.copy(), chunk_size=3) + o1 = zeros(data1.shape, chunk_size=3) + o2 = zeros(data1.shape, chunk_size=3) + modf(arr1, o1, o2) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.modf(data1)) + np.testing.assert_array_almost_equal(res, expected) + + data1 = sps.random(5, 9, density=0.1) + + arr1 = tensor(data1.copy(), chunk_size=3) + + o1, o2 = modf(arr1) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.modf(data1.toarray())) + np.testing.assert_equal(res.toarray(), expected) + + +def test_modf_order_execution(setup): + data1 = np.random.random((5, 9)) + t = tensor(data1, chunk_size=3) + + o1, o2 = modf(t, order="F") + res1, res2 = execute(o1, o2) + expected1, expected2 = np.modf(data1, order="F") + np.testing.assert_allclose(res1, expected1) + assert res1.flags["F_CONTIGUOUS"] is True + assert res1.flags["C_CONTIGUOUS"] is False + np.testing.assert_allclose(res2, expected2) + assert res2.flags["F_CONTIGUOUS"] is True + assert res2.flags["C_CONTIGUOUS"] is False + + +def test_clip_execution(setup): + a_data = np.arange(10) + + a = tensor(a_data.copy(), chunk_size=3) + + b = clip(a, 1, 8) + + res = b.execute().fetch() + expected = np.clip(a_data, 1, 8) + np.testing.assert_array_equal(res, expected) + + a = tensor(a_data.copy(), chunk_size=3) + clip(a, 3, 6, out=a) + + res = a.execute().fetch() + expected = np.clip(a_data, 3, 6) + np.testing.assert_array_equal(res, expected) + + a = tensor(a_data.copy(), chunk_size=3) + a_min_data = np.random.randint(1, 10, size=(10,)) + a_max_data = np.random.randint(1, 10, size=(10,)) + a_min = tensor(a_min_data) + a_max = tensor(a_max_data) + clip(a, a_min, a_max, out=a) + + res = a.execute().fetch() + expected = np.clip(a_data, a_min_data, a_max_data) + np.testing.assert_array_equal(res, expected) + + with option_context() as options: + options.chunk_size = 3 + + a = tensor(a_data.copy(), chunk_size=3) + b = clip(a, [3, 4, 1, 1, 1, 4, 4, 4, 4, 4], 8) + + res = b.execute().fetch() + expected = np.clip(a_data, [3, 4, 1, 1, 1, 4, 4, 4, 4, 4], 8) + np.testing.assert_array_equal(res, expected) + + # test sparse clip + a_data = sps.csr_matrix([[0, 2, 8], [0, 0, -1]]) + a = tensor(a_data, chunk_size=3) + b_data = sps.csr_matrix([[0, 3, 0], [1, 0, -2]]) + + c = clip(a, b_data, 4) + + res = c.execute().fetch() + expected = np.clip(a_data.toarray(), b_data.toarray(), 4) + np.testing.assert_array_equal(res, expected) + + +def test_clip_order_execution(setup): + a_data = np.asfortranarray(np.random.rand(4, 8)) + + a = tensor(a_data, chunk_size=3) + + b = clip(a, 0.2, 0.8) + + res = b.execute().fetch() + expected = np.clip(a_data, 0.2, 0.8) + + np.testing.assert_allclose(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + +def test_around_execution(setup): + data = np.random.randn(10, 20) + x = tensor(data, chunk_size=3) + + t = x.round(2) + + res = t.execute().fetch() + expected = np.around(data, decimals=2) + + np.testing.assert_allclose(res, expected) + + data = sps.random(10, 20, density=0.2) + x = 
tensor(data, chunk_size=3) + + t = x.round(2) + + res = t.execute().fetch() + expected = np.around(data.toarray(), decimals=2) + + np.testing.assert_allclose(res.toarray(), expected) + + +def test_around_order_execution(setup): + data = np.asfortranarray(np.random.rand(10, 20)) + x = tensor(data, chunk_size=3) + + t = x.round(2) + + res = t.execute().fetch() + expected = np.around(data, decimals=2) + + np.testing.assert_allclose(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + +def test_cos_order_execution(setup): + data = np.asfortranarray(np.random.rand(3, 5)) + x = tensor(data, chunk_size=2) + + t = cos(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, np.cos(data)) + assert res.flags["C_CONTIGUOUS"] is False + assert res.flags["F_CONTIGUOUS"] is True + + t2 = cos(x, order="C") + + res2 = t2.execute().fetch() + np.testing.assert_allclose(res2, np.cos(data, order="C")) + assert res2.flags["C_CONTIGUOUS"] is True + assert res2.flags["F_CONTIGUOUS"] is False + + +def test_is_close_execution(setup): + data = np.array([1.05, 1.0, 1.01, np.nan]) + data2 = np.array([1.04, 1.0, 1.03, np.nan]) + + x = tensor(data, chunk_size=2) + y = tensor(data2, chunk_size=3) + + z = isclose(x, y, atol=0.01) + + res = z.execute().fetch() + expected = np.isclose(data, data2, atol=0.01) + np.testing.assert_equal(res, expected) + + z = isclose(x, y, atol=0.01, equal_nan=True) + + res = z.execute().fetch() + expected = np.isclose(data, data2, atol=0.01, equal_nan=True) + np.testing.assert_equal(res, expected) + + # test tensor with scalar + z = isclose(x, 1.0, atol=0.01) + res = z.execute().fetch() + expected = np.isclose(data, 1.0, atol=0.01) + np.testing.assert_equal(res, expected) + z = isclose(1.0, y, atol=0.01) + res = z.execute().fetch() + expected = np.isclose(1.0, data2, atol=0.01) + np.testing.assert_equal(res, expected) + z = isclose(1.0, 2.0, atol=0.01) + res = z.execute().fetch() + expected = np.isclose(1.0, 2.0, atol=0.01) + np.testing.assert_equal(res, expected) + + # test sparse + data = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan])) + data2 = sps.csr_matrix(np.array([0, 1.0, 1.03, np.nan])) + + x = tensor(data, chunk_size=2) + y = tensor(data2, chunk_size=3) + + z = isclose(x, y, atol=0.01) + + res = z.execute().fetch() + expected = np.isclose(data.toarray(), data2.toarray(), atol=0.01) + np.testing.assert_equal(res, expected) + + z = isclose(x, y, atol=0.01, equal_nan=True) + + res = z.execute().fetch() + expected = np.isclose(data.toarray(), data2.toarray(), atol=0.01, equal_nan=True) + np.testing.assert_equal(res, expected) + + +@ignore_warning +def test_dtype_execution(setup): + a = ones((10, 20), dtype="f4", chunk_size=5) + + c = truediv(a, 2, dtype="f8") + + res = c.execute().fetch() + assert res.dtype == np.float64 + + c = truediv(a, 0, dtype="f8") + res = c.execute().fetch() + assert np.isinf(res[0, 0]) + + with pytest.raises(FloatingPointError): + with np.errstate(divide="raise"): + c = truediv(a, 0, dtype="f8") + _ = c.execute().fetch() # noqa: F841 + + +def test_set_get_real_execution(setup): + a_data = np.array([1 + 2j, 3 + 4j, 5 + 6j]) + a = tensor(a_data, chunk_size=2) + + res = a.real.execute().fetch() + expected = a_data.real + + np.testing.assert_equal(res, expected) + + a.real = 9 + + res = a.execute().fetch() + expected = a_data.copy() + expected.real = 9 + + np.testing.assert_equal(res, expected) + + a.real = np.array([9, 8, 7]) + + res = a.execute().fetch() + expected = a_data.copy() + expected.real = 
np.array([9, 8, 7]) + + np.testing.assert_equal(res, expected) + + # test sparse + a_data = np.array([[1 + 2j, 3 + 4j, 0], [0, 0, 0]]) + a = tensor(sps.csr_matrix(a_data)) + + res = a.real.execute().fetch().toarray() + expected = a_data.real + + np.testing.assert_equal(res, expected) + + a.real = 9 + + res = a.execute().fetch().toarray() + expected = a_data.copy() + expected.real = 9 + + np.testing.assert_equal(res, expected) + + a.real = np.array([9, 8, 7]) + + res = a.execute().fetch().toarray() + expected = a_data.copy() + expected.real = np.array([9, 8, 7]) + + np.testing.assert_equal(res, expected) + + +def test_set_get_imag_execution(setup): + a_data = np.array([1 + 2j, 3 + 4j, 5 + 6j]) + a = tensor(a_data, chunk_size=2) + + res = a.imag.execute().fetch() + expected = a_data.imag + + np.testing.assert_equal(res, expected) + + a.imag = 9 + + res = a.execute().fetch() + expected = a_data.copy() + expected.imag = 9 + + np.testing.assert_equal(res, expected) + + a.imag = np.array([9, 8, 7]) + + res = a.execute().fetch() + expected = a_data.copy() + expected.imag = np.array([9, 8, 7]) + + np.testing.assert_equal(res, expected) + + # test sparse + a_data = np.array([[1 + 2j, 3 + 4j, 0], [0, 0, 0]]) + a = tensor(sps.csr_matrix(a_data)) + + res = a.imag.execute().fetch().toarray() + expected = a_data.imag + + np.testing.assert_equal(res, expected) + + a.imag = 9 + + res = a.execute().fetch().toarray() + expected = a_data.copy() + expected.imag = 9 + + np.testing.assert_equal(res, expected) + + a.imag = np.array([9, 8, 7]) + + res = a.execute().fetch().toarray() + expected = a_data.copy() + expected.imag = np.array([9, 8, 7]) + + np.testing.assert_equal(res, expected) + + +def test_tree_arithmetic_execution(setup): + raws = [np.random.rand(10, 10) for _ in range(10)] + tensors = [tensor(a, chunk_size=3) for a in raws] + + res = tree_add(*tensors, 1.0).execute().fetch() + np.testing.assert_array_almost_equal( + res, 1.0 + functools.reduce(operator.add, raws) + ) + + res = tree_multiply(*tensors, 2.0).execute().fetch() + np.testing.assert_array_almost_equal( + res, 2.0 * functools.reduce(operator.mul, raws) + ) + + raws = [sps.random(5, 9, density=0.1) for _ in range(10)] + tensors = [tensor(a, chunk_size=3) for a in raws] + + res = tree_add(*tensors).execute().fetch() + np.testing.assert_array_almost_equal( + res.toarray(), functools.reduce(operator.add, raws).toarray() + ) + + +@require_cupy +def test_cupy_execution(setup_gpu): + a_data = np.random.rand(10, 10) + b_data = np.random.rand(10, 10) + + a = tensor(a_data, gpu=True, chunk_size=3) + b = tensor(b_data, gpu=True, chunk_size=3) + res_binary = (a + b).execute().fetch() + np.testing.assert_array_equal(res_binary.get(), (a_data + b_data)) + + res_unary = cos(a).execute().fetch() + np.testing.assert_array_almost_equal(res_unary.get(), np.cos(a_data)) diff --git a/python/xorbits/_mars/tensor/arithmetic/truediv.py b/python/xorbits/_mars/tensor/arithmetic/truediv.py new file mode 100644 index 000000000..ff7358a73 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/truediv.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorTrueDiv(TensorBinOp): + _op_type_ = OperandDef.TRUEDIV + _func_name = "true_divide" + + @classmethod + def _is_sparse(cls, x1, x2): + if not np.isscalar(x1) and not np.isscalar(x2): + return False + if hasattr(x1, "issparse") and x1.issparse(): + if x2 != 0: + return True + else: + raise ZeroDivisionError("float division by zero") + return False + + +@infer_dtype(np.true_divide) +def truediv(x1, x2, out=None, where=None, **kwargs): + """ + Returns a true division of the inputs, element-wise. + + Instead of the Python traditional 'floor division', this returns a true + division. True division adjusts the output type to present the best + answer, regardless of input types. + + Parameters + ---------- + x1 : array_like + Dividend tensor. + x2 : array_like + Divisor tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Result is scalar if both inputs are scalar, tensor otherwise. + + Notes + ----- + The floor division operator ``//`` was added in Python 2.2 making + ``//`` and ``/`` equivalent operators. The default floor division + operation of ``/`` can be replaced by true division with ``from + __future__ import division``. + + In Python 3.0, ``//`` is the floor division operator and ``/`` the + true division operator. The ``true_divide(x1, x2)`` function is + equivalent to true division in Python. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(5) + >>> mt.true_divide(x, 4).execute() + array([ 0. , 0.25, 0.5 , 0.75, 1. ]) + + # for python 2 + >>> (x/4).execute() + array([0, 0, 0, 0, 1]) + >>> (x//4).execute() + array([0, 0, 0, 0, 1]) + """ + op = TensorTrueDiv(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.true_divide, reverse=True) +def rtruediv(x1, x2, **kwargs): + op = TensorTrueDiv(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/trunc.py b/python/xorbits/_mars/tensor/arithmetic/trunc.py new file mode 100644 index 000000000..bf2aae77d --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/trunc.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
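[Editor's aside on the truediv.py hunk above, not part of the patch.] The sparsity rule in `TensorTrueDiv._is_sparse` (and the expectations in `test_divide` earlier in the test file) follow from plain scalar arithmetic: dividing a sparse tensor by a non-zero scalar keeps implicit zeros at zero, while dividing by another tensor would turn them into `0/x` or `x/0` entries, so the result densifies. A minimal NumPy/SciPy sketch of the same behaviour, for illustration only:

import numpy as np
import scipy.sparse as sps

s = sps.csr_matrix([[0, 1, 0], [1, 0, 0]])

# dividing by a non-zero scalar preserves sparsity: 0 / 10 == 0
assert sps.issparse(s / 10)

# dividing by a dense operand densifies the result (implicit zeros become 0/x)
dense = s / np.array([1.0, 1.0, 1.0])
assert not sps.issparse(dense)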
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorTrunc(TensorUnaryOp): + _op_type_ = OperandDef.TRUNC + _func_name = "trunc" + + +@infer_dtype(np.trunc) +def trunc(x, out=None, where=None, **kwargs): + """ + Return the truncated value of the input, element-wise. + + The truncated value of the scalar `x` is the nearest integer `i` which + is closer to zero than `x` is. In short, the fractional part of the + signed number `x` is discarded. + + Parameters + ---------- + x : array_like + Input data. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The truncated value of each element in `x`. + + See Also + -------- + ceil, floor, rint + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) + >>> mt.trunc(a).execute() + array([-1., -1., -0., 0., 1., 1., 2.]) + """ + op = TensorTrunc(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/utils.py b/python/xorbits/_mars/tensor/arithmetic/utils.py new file mode 100644 index 000000000..0c53e5f52 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/utils.py @@ -0,0 +1,125 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
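[Editor's aside, not part of the patch.] The `out`/`where` semantics described in the trunc and truediv docstrings above (and exercised by the `where=` tests earlier) mirror NumPy ufunc behaviour: positions where the mask is True are computed, positions where it is False keep whatever value `out` already held. A plain-NumPy illustration:

import numpy as np

x = np.array([-1.7, -0.2, 1.5, 2.9])
out = np.full_like(x, 99.0)

# masked-out positions keep out's prior value (99.0)
np.trunc(x, out=out, where=x > 0)
# out is now [99., 99., 1., 2.]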
+ + +import numpy as np + +from ...utils import TreeReductionBuilder + + +def arithmetic_operand(cls=None, init=True, sparse_mode=None): + def _decorator(cls): + def __init__(self, casting="same_kind", err=None, **kw): + err = err if err is not None else np.geterr() + super(cls, self).__init__(_casting=casting, _err=err, **kw) + + def _is_sparse_binary_and_const(x1, x2): + if all(np.isscalar(x) for x in [x1, x2]): + return False + if all( + np.isscalar(x) or (hasattr(x, "issparse") and x.issparse()) + for x in [x1, x2] + ): + return True + return False + + def _is_sparse_binary_or_const(x1, x2): + if (hasattr(x1, "issparse") and x1.issparse()) or ( + hasattr(x2, "issparse") and x2.issparse() + ): + return True + return False + + _is_sparse_dict = dict( + always_false=lambda *_: False, + unary=lambda x: x.issparse(), + binary_and=_is_sparse_binary_and_const, + binary_or=_is_sparse_binary_or_const, + ) + for v in _is_sparse_dict.values(): + v.__name__ = "_is_sparse" + + if init: + cls.__init__ = __init__ + + if sparse_mode in _is_sparse_dict: + cls._is_sparse = staticmethod(_is_sparse_dict[sparse_mode]) + elif sparse_mode is not None: # pragma: no cover + raise ValueError(f"Unsupported sparse mode: {sparse_mode}") + + return cls + + if cls is not None: + return _decorator(cls) + else: + return _decorator + + +def chunk_tree_add(dtype, chunks, idx, shape, sparse=False, combine_size=None): + """ + Generate tree add plan. + + Assume combine size as 4, given a input chunks with size 8, + we will generate tree add plan like: + + op op op op op op op op + | | | | + -------- -------- + tree_add tree_add + | | + ------------- + tree_add + + :param dtype: data type for tree added chunk + :param chunks: input chunks + :param idx: index of result chunk + :param shape: shape of result chunk + :param sparse: return value is sparse or dense + :param combine_size: combine size + :return: result chunk + """ + + class ChunkAddBuilder(TreeReductionBuilder): + def _build_reduction(self, inputs, final=False): + from .add import TensorTreeAdd + + op = TensorTreeAdd(args=inputs, dtype=dtype, sparse=sparse) + if not final: + return op.new_chunk(inputs, shape=shape) + else: + return op.new_chunk( + inputs, shape=shape, index=idx, order=chunks[0].order + ) + + return ChunkAddBuilder(combine_size).build(chunks) + + +def tree_op_estimate_size(ctx, op): + chunk = op.outputs[0] + if not chunk.is_sparse(): + max_inputs = max(ctx[inp.key][0] for inp in op.inputs) + calc_size = chunk_size = chunk.nbytes + if np.isnan(calc_size): + chunk_size = calc_size = max_inputs + else: + sum_inputs = sum(ctx[inp.key][0] for inp in op.inputs) + calc_size = sum_inputs + chunk_size = min( + sum_inputs, + chunk.nbytes + + np.dtype(np.int64).itemsize * np.prod(chunk.shape) * chunk.ndim, + ) + if np.isnan(chunk_size): + chunk_size = sum_inputs + ctx[chunk.key] = (chunk_size, calc_size) diff --git a/python/xorbits/_mars/tensor/array_utils.py b/python/xorbits/_mars/tensor/array_utils.py new file mode 100644 index 000000000..2b08368b8 --- /dev/null +++ b/python/xorbits/_mars/tensor/array_utils.py @@ -0,0 +1,186 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
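[Editor's aside on the utils.py hunk above, not part of the patch.] The reduction plan that `chunk_tree_add` describes (group at most `combine_size` chunks per `TensorTreeAdd`, then combine the partial results level by level until one chunk remains) can be mimicked in a few lines of plain Python. This is an illustrative sketch only; `tree_reduce` is a hypothetical helper, not part of Mars or of this patch:

import functools
import operator

def tree_reduce(items, combine_size, combine=operator.add):
    """Reduce `items` level by level, combining at most `combine_size` per step."""
    while len(items) > 1:
        items = [
            functools.reduce(combine, items[i : i + combine_size])
            for i in range(0, len(items), combine_size)
        ]
    return items[0]

# 8 inputs with combine_size=4 -> two partial sums, then one final sum,
# mirroring the plan drawn in the chunk_tree_add docstring above.
assert tree_reduce(list(range(8)), combine_size=4) == sum(range(8))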
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from contextlib import contextmanager + +import numpy as np + +from ..lib import sparse +from ..lib.sparse.core import get_dense_module, issparse +from ..utils import lazy_import + +cp = lazy_import("cupy", rename="cp") + + +def is_array(x): + if isinstance(x, np.ndarray): + return True + elif isinstance(x, (sparse.SparseMatrix, sparse.SparseVector)): + return True + elif cp: # pragma: no cover + return isinstance(x, cp.ndarray) + else: + return False + + +def is_cupy(x): + if cp and isinstance(x, cp.ndarray): # pragma: no cover + return True + else: + return False + + +def get_array_module(x, nosparse=False): + if issparse(x): + if nosparse: + return get_dense_module(x) + return sparse + if cp: + return cp.get_array_module(x) + return np + + +def array_module(gpu): + if gpu: + if cp is None: + raise ImportError("Execute on GPU requires for `cupy` library") + return cp + + return np + + +def _get(x): + m = get_array_module(x) + + if m is np: + return x + if m is sparse: + return x if not hasattr(x, "get") else x.get() + return x.get() + + +def move_to_device(x, device_id): + if hasattr(x, "device") and x.device.id == device_id: + return x + + assert device_id >= 0 + + if issparse(x) and device_id > 0: + raise NotImplementedError + + # for dense array, we currently copy from gpu to memory and then copy back to destination device + # to avoid kernel panic + with cp.cuda.Device(device_id): + return cp.asarray(cp.asnumpy(x)) # remove `cp.asnumpy` call to do directly copy + + +def convert_order(x, order): + xp = get_array_module(x) + if xp.isfortran(x) != (order == "F"): + x = xp.array(x, order=order) + return x + + +def _most_nbytes_device(device_nbytes): + device_to_nbytes = defaultdict(lambda: 0) + for device, nbytes in device_nbytes: + device_to_nbytes[device] += nbytes + return max(device_to_nbytes, key=lambda i: device_to_nbytes[i]) + + +def _is_array_writeable(a): + if hasattr(a, "flags") and hasattr(a.flags, "writeable"): + return a.flags.writeable + # writeable as default + return True + + +def as_same_device(inputs, device=None, ret_extra=False, copy_if_not_writeable=False): + input_tensors = [ + i for i in inputs if hasattr(i, "ndim") and i.ndim > 0 + ] # filter scalar + has_sparse = any(issparse(i) for i in inputs) + + if device is None: + try: + device = _most_nbytes_device( + (i.device.id if hasattr(i, "device") else -1, i.nbytes) + for i in input_tensors + ) + except ValueError: + device = -1 + + if device == -1: + outputs = [_get(i) for i in inputs] + else: + outputs = [move_to_device(i, device) for i in inputs] + + if copy_if_not_writeable: + new_outputs = [] + for out in outputs: + if not _is_array_writeable(out): + new_outputs.append(out.copy()) + elif isinstance(out, (sparse.SparseMatrix, sparse.SparseVector)): + if ( + not _is_array_writeable(out.data) + or not _is_array_writeable(out.indices) + or not _is_array_writeable(out.indptr) + ): + new_outputs.append(type(out)(out.spmatrix.copy(), shape=out.shape)) + else: + new_outputs.append(out) + else: + new_outputs.append(out) + outputs = new_outputs + + if not ret_extra: + 
return outputs + + if has_sparse: + m = sparse + else: + if len(input_tensors) > 0: + m = get_array_module(input_tensors[0]) + else: + m = np + return outputs, device, m + + +def as_np_array(x): + xp = get_array_module(x) + return x if xp == np else x.get() + + +def is_sparse_module(xp): + return xp is sparse + + +@contextmanager +def device(device_id): + if device_id is None or device_id < 0: + yield + else: # pragma: no cover + with cp.cuda.Device(device_id) as dev: + yield dev + + +def create_array(op): + xp = array_module(op.gpu) + + def inner(func, *args, **kwargs): + with device(op.device): + return getattr(xp, func)(*args, **kwargs) + + return inner diff --git a/python/xorbits/_mars/tensor/base/__init__.py b/python/xorbits/_mars/tensor/base/__init__.py new file mode 100644 index 000000000..9358a7c67 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/__init__.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .argpartition import argpartition +from .argsort import argsort +from .argtopk import argtopk +from .argwhere import TensorArgwhere, argwhere +from .array_split import array_split +from .astype import TensorAstype +from .atleast_1d import atleast_1d +from .atleast_2d import atleast_2d +from .atleast_3d import atleast_3d +from .broadcast_arrays import broadcast_arrays +from .broadcast_to import TensorBroadcastTo, broadcast_to +from .copy import copy +from .copyto import TensorCopyTo, copyto +from .delete import delete +from .diff import diff +from .dsplit import dsplit +from .ediff1d import ediff1d +from .expand_dims import expand_dims +from .flatten import flatten +from .flip import flip +from .fliplr import fliplr +from .flipud import flipud +from .hsplit import hsplit +from .in1d import in1d +from .insert import insert +from .isin import TensorIsIn, isin +from .map_chunk import TensorMapChunk, map_chunk +from .moveaxis import moveaxis +from .ndim import ndim +from .partition import partition +from .ravel import ravel +from .rebalance import rebalance +from .repeat import TensorRepeat, repeat +from .result_type import result_type +from .roll import roll +from .rollaxis import rollaxis +from .searchsorted import TensorSearchsorted, searchsorted +from .setdiff1d import setdiff1d +from .shape import shape +from .sort import sort +from .split import TensorSplit, split +from .squeeze import TensorSqueeze, squeeze +from .swapaxes import TensorSwapAxes, swapaxes +from .tile import tile +from .to_cpu import to_cpu +from .to_gpu import to_gpu +from .topk import topk +from .transpose import TensorTranspose, transpose +from .trapz import trapz +from .unique import unique +from .vsplit import vsplit +from .where import TensorWhere, where + + +def _install(): + from ..core import Tensor, TensorData + from .astype import _astype + + for cls in (Tensor, TensorData): + setattr(cls, "astype", _astype) + setattr(cls, "swapaxes", swapaxes) + setattr(cls, "squeeze", squeeze) + setattr(cls, "repeat", repeat) + setattr(cls, "ravel", ravel) + 
setattr(cls, "flatten", flatten) + setattr(cls, "to_gpu", to_gpu) + setattr(cls, "to_cpu", to_cpu) + setattr(cls, "rebalance", rebalance) + setattr(cls, "map_chunk", map_chunk) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/base/argpartition.py b/python/xorbits/_mars/tensor/base/argpartition.py new file mode 100644 index 000000000..fe7424a05 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/argpartition.py @@ -0,0 +1,98 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .partition import TensorPartition, _validate_partition_arguments + + +def argpartition(a, kth, axis=-1, kind="introselect", order=None, **kw): + """ + Perform an indirect partition along the given axis using the + algorithm specified by the `kind` keyword. It returns an array of + indices of the same shape as `a` that index data along the given + axis in partitioned order. + + .. versionadded:: 1.8.0 + + Parameters + ---------- + a : array_like + Tensor to sort. + kth : int or sequence of ints + Element index to partition by. The k-th element will be in its + final sorted position and all smaller elements will be moved + before it and all larger elements behind it. The order all + elements in the partitions is undefined. If provided with a + sequence of k-th it will partition all of them into their sorted + position at once. + axis : int or None, optional + Axis along which to sort. The default is -1 (the last axis). If + None, the flattened tensor is used. + kind : {'introselect'}, optional + Selection algorithm. Default is 'introselect' + order : str or list of str, optional + When `a` is a tensor with fields defined, this argument + specifies which fields to compare first, second, etc. A single + field can be specified as a string, and not all fields need be + specified, but unspecified fields will still be used, in the + order in which they come up in the dtype, to break ties. + + Returns + ------- + index_tensor : Tensor, int + Tensor of indices that partition `a` along the specified axis. + If `a` is one-dimensional, ``a[index_tensor]`` yields a partitioned `a`. + More generally, ``np.take_along_axis(a, index_tensor, axis=a)`` always + yields the partitioned `a`, irrespective of dimensionality. + + See Also + -------- + partition : Describes partition algorithms used. + Tensor.partition : Inplace partition. + argsort : Full indirect sort + + Notes + ----- + See `partition` for notes on the different selection algorithms. 
+ + Examples + -------- + One dimensional tensor: + + >>> import mars.tensor as mt + >>> x = mt.array([3, 4, 2, 1]) + >>> x[mt.argpartition(x, 3)].execute() + array([2, 1, 3, 4]) + >>> x[mt.argpartition(x, (1, 3))].execute() + array([1, 2, 3, 4]) + + >>> x = [3, 4, 2, 1] + >>> mt.array(x)[mt.argpartition(x, 3)].execute() + array([2, 1, 3, 4]) + + """ + a, kth, axis, kind, order, need_align = _validate_partition_arguments( + a, kth, axis, kind, order, kw + ) + op = TensorPartition( + kth=kth, + axis=axis, + kind=kind, + order=order, + need_align=need_align, + return_value=False, + return_indices=True, + dtype=a.dtype, + gpu=a.op.gpu, + ) + return op(a, kth) diff --git a/python/xorbits/_mars/tensor/base/argsort.py b/python/xorbits/_mars/tensor/base/argsort.py new file mode 100644 index 000000000..e83850b42 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/argsort.py @@ -0,0 +1,136 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .sort import TensorSort, _validate_sort_arguments + + +def argsort(a, axis=-1, kind=None, parallel_kind=None, psrs_kinds=None, order=None): + """ + Returns the indices that would sort a tensor. + + Perform an indirect sort along the given axis using the algorithm specified + by the `kind` keyword. It returns a tensor of indices of the same shape as + `a` that index data along the given axis in sorted order. + + Parameters + ---------- + a : array_like + Tensor to sort. + axis : int or None, optional + Axis along which to sort. The default is -1 (the last axis). If None, + the flattened tensor is used. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Sorting algorithm. The default is 'quicksort'. Note that both 'stable' + and 'mergesort' use timsort under the covers and, in general, the + actual implementation will vary with data type. The 'mergesort' option + is retained for backwards compatibility. + + .. versionchanged:: 1.15.0. + The 'stable' option was added. + order : str or list of str, optional + When `a` is a tensor with fields defined, this argument specifies + which fields to compare first, second, etc. A single field can + be specified as a string, and not all fields need be specified, + but unspecified fields will still be used, in the order in which + they come up in the dtype, to break ties. + + Returns + ------- + index_tensor : Tensor, int + Tensor of indices that sort `a` along the specified `axis`. + If `a` is one-dimensional, ``a[index_tensor]`` yields a sorted `a`. + More generally, ``np.take_along_axis(a, index_tensor, axis=axis)`` + always yields the sorted `a`, irrespective of dimensionality. + + See Also + -------- + sort : Describes sorting algorithms used. + lexsort : Indirect stable sort with multiple keys. + Tensor.sort : Inplace sort. + argpartition : Indirect partial sort. + + Notes + ----- + See `sort` for notes on the different sorting algorithms. 
+
+    Examples
+    --------
+    One dimensional tensor:
+
+    >>> import mars.tensor as mt
+    >>> x = mt.array([3, 1, 2])
+    >>> mt.argsort(x).execute()
+    array([1, 2, 0])
+
+    Two-dimensional tensor:
+
+    >>> x = mt.array([[0, 3], [2, 2]])
+    >>> x.execute()
+    array([[0, 3],
+           [2, 2]])
+
+    >>> ind = mt.argsort(x, axis=0)  # sorts along first axis (down)
+    >>> ind.execute()
+    array([[0, 1],
+           [1, 0]])
+    #>>> mt.take_along_axis(x, ind, axis=0).execute()  # same as np.sort(x, axis=0)
+    #array([[0, 2],
+    #       [2, 3]])
+
+    >>> ind = mt.argsort(x, axis=1)  # sorts along last axis (across)
+    >>> ind.execute()
+    array([[0, 1],
+           [0, 1]])
+    #>>> mt.take_along_axis(x, ind, axis=1).execute()  # same as np.sort(x, axis=1)
+    #array([[0, 3],
+    #       [2, 2]])
+
+    Indices of the sorted elements of a N-dimensional array:
+
+    >>> ind = mt.unravel_index(mt.argsort(x, axis=None), x.shape)
+    >>> ind.execute()
+    (array([0, 1, 1, 0]), array([0, 0, 1, 1]))
+    >>> x[ind].execute()  # same as np.sort(x, axis=None)
+    array([0, 2, 2, 3])
+
+    Sorting with keys:
+
+    >>> x = mt.array([(1, 0), (0, 1)], dtype=[('x', '<i4'), ('y', '<i4')])
+    >>> x.execute()
+    array([(1, 0), (0, 1)],
+          dtype=[('x', '<i4'), ('y', '<i4')])
+
+    >>> mt.argsort(x, order=('x','y')).execute()
+    array([1, 0])
+
+    >>> mt.argsort(x, order=('y','x')).execute()
+    array([0, 1])
+
+    """
+    a, axis, kind, parallel_kind, psrs_kinds, order = _validate_sort_arguments(
+        a, axis, kind, parallel_kind, psrs_kinds, order
+    )
+
+    op = TensorSort(
+        axis=axis,
+        kind=kind,
+        parallel_kind=parallel_kind,
+        order=order,
+        psrs_kinds=psrs_kinds,
+        return_value=False,
+        return_indices=True,
+        dtype=a.dtype,
+        gpu=a.op.gpu,
+    )
+    return op(a)
diff --git a/python/xorbits/_mars/tensor/base/argtopk.py b/python/xorbits/_mars/tensor/base/argtopk.py
new file mode 100644
index 000000000..1ef0af2ac
--- /dev/null
+++ b/python/xorbits/_mars/tensor/base/argtopk.py
@@ -0,0 +1,53 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...core.operand import OperandStage
+from .topk import TensorTopk, _validate_topk_arguments
+
+
+def argtopk(
+    a,
+    k,
+    axis=-1,
+    largest=True,
+    sorted=True,
+    order=None,
+    parallel_kind="auto",
+    psrs_kinds=None,
+):
+    (
+        a,
+        k,
+        axis,
+        largest,
+        sorted,
+        order,
+        parallel_kind,
+        psrs_kinds,
+    ) = _validate_topk_arguments(
+        a, k, axis, largest, sorted, order, parallel_kind, psrs_kinds
+    )
+    op = TensorTopk(
+        k=k,
+        axis=axis,
+        largest=largest,
+        sorted=sorted,
+        parallel_kind=parallel_kind,
+        psrs_kinds=psrs_kinds,
+        dtype=a.dtype,
+        return_value=False,
+        return_indices=True,
+        stage=OperandStage.agg,
+    )
+    return op(a)
diff --git a/python/xorbits/_mars/tensor/base/argwhere.py b/python/xorbits/_mars/tensor/base/argwhere.py
new file mode 100644
index 000000000..ce9b03474
--- /dev/null
+++ b/python/xorbits/_mars/tensor/base/argwhere.py
@@ -0,0 +1,126 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import KeyField +from ...utils import has_unknown_shape +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from .ravel import ravel + + +class TensorArgwhere(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.ARGWHERE + + _input = KeyField("input") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + shape = (np.nan, a.ndim) + return self.new_tensor([a], shape) + + @classmethod + def tile(cls, op): + from ..datasource import arange + from ..indexing import unravel_index + from ..reshape.reshape import TensorReshape + + in_tensor = op.input + out_tensor = op.outputs[0] + + if has_unknown_shape(in_tensor): + yield + + flattened = yield from recursive_tile(ravel(in_tensor)) + indices = arange(flattened.size, dtype=np.intp, chunks=flattened.nsplits) + indices = indices[flattened] + dim_indices = unravel_index(indices, in_tensor.shape) + dim_indices = yield from recursive_tile(*dim_indices) + + out_chunk_shape = dim_indices[0].chunk_shape + (in_tensor.ndim,) + nsplits = dim_indices[0].nsplits + ((1,) * in_tensor.ndim,) + out_chunks = [] + for out_index in itertools.product(*(map(range, out_chunk_shape))): + dim_ind_chunk = dim_indices[out_index[1]].chunks[out_index[0]] + chunk_shape = dim_ind_chunk.shape + (1,) + chunk_op = TensorReshape(newshape=(-1, 1), dtype=dim_ind_chunk.dtype) + out_chunk = chunk_op.new_chunk( + [dim_ind_chunk], + shape=chunk_shape, + index=out_index, + order=out_tensor.order, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + out_tensor.shape, + order=out_tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + +def argwhere(a): + """ + Find the indices of tensor elements that are non-zero, grouped by element. + + Parameters + ---------- + a : array_like + Input data. + + Returns + ------- + index_tensor : Tensor + Indices of elements that are non-zero. Indices are grouped by element. + + See Also + -------- + where, nonzero + + Notes + ----- + ``mt.argwhere(a)`` is the same as ``mt.transpose(mt.nonzero(a))``. + + The output of ``argwhere`` is not suitable for indexing tensors. + For this purpose use ``nonzero(a)`` instead. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(6).reshape(2,3) + >>> x.execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.argwhere(x>1).execute() + array([[0, 2], + [1, 0], + [1, 1], + [1, 2]]) + + """ + a = astensor(a).astype(bool, order="A") + op = TensorArgwhere(dtype=np.dtype(np.intp)) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/array_split.py b/python/xorbits/_mars/tensor/base/array_split.py new file mode 100644 index 000000000..2f1095eb4 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/array_split.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .split import _split + + +def array_split(a, indices_or_sections, axis=0): + """ + Split a tensor into multiple sub-tensors. + + Please refer to the ``split`` documentation. The only difference + between these functions is that ``array_split`` allows + `indices_or_sections` to be an integer that does *not* equally + divide the axis. For a tensor of length l that should be split + into n sections, it returns l % n sub-arrays of size l//n + 1 + and the rest of size l//n. + + See Also + -------- + split : Split tensor into multiple sub-tensors of equal size. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(8.0) + >>> mt.array_split(x, 3).execute() + [array([ 0., 1., 2.]), array([ 3., 4., 5.]), array([ 6., 7.])] + + >>> x = mt.arange(7.0) + >>> mt.array_split(x, 3).execute() + [array([ 0., 1., 2.]), array([ 3., 4.]), array([ 5., 6.])] + + """ + return _split(a, indices_or_sections, axis=axis) diff --git a/python/xorbits/_mars/tensor/base/astype.py b/python/xorbits/_mars/tensor/base/astype.py new file mode 100644 index 000000000..0f1b32faf --- /dev/null +++ b/python/xorbits/_mars/tensor/base/astype.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import KeyField, StringField +from ...utils import get_dtype +from ..array_utils import as_same_device, device +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import get_order + + +class TensorAstype(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.ASTYPE + + _input = KeyField("input") + _order = StringField("order") + _casting = StringField("casting") + + def __init__(self, dtype=None, order=None, casting=None, sparse=False, **kw): + super().__init__( + dtype=dtype, _order=order, _casting=casting, sparse=sparse, **kw + ) + + @property + def order(self): + return self._order + + @property + def casting(self): + return self._casting + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, tensor, order=None): + return self.new_tensor([tensor], tensor.shape, order=order) + + @classmethod + def tile(cls, op): + in_tensor = op.input + out_tensor = op.outputs[0] + + out_chunks = [] + for c in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk = chunk_op.new_chunk( + [c], shape=c.shape, index=c.index, order=out_tensor.order + ) + out_chunks.append(chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + nsplits=in_tensor.nsplits, + chunks=out_chunks, + kws=[out_tensor.params], + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if op.sparse: + ctx[chunk.key] = x.astype(op.dtype) + else: + if xp is np: + ctx[chunk.key] = x.astype( + op.dtype, order=op.order, casting=op.casting + ) + else: # pragma: no cover + # cupy does not support casting + ctx[chunk.key] = x.astype(op.dtype, order=op.order) + + +def _astype(tensor, dtype, order="K", casting="unsafe", copy=True): + """ + Copy of the tensor, cast to a specified type. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur. Defaults to 'unsafe' + for backwards compatibility. + * 'no' means the data types should not be cast at all. + * 'equiv' means only byte-order changes are allowed. + * 'safe' means only casts which can preserve values are allowed. + * 'same_kind' means only safe casts or casts within a kind, + like float64 to float32, are allowed. + * 'unsafe' means any data conversions may be done. + order : {'C', 'F', 'A', 'K'}, optional + Controls the memory layout order of the result. + 'C' means C order, 'F' means Fortran order, 'A' + means 'F' order if all the arrays are Fortran contiguous, + 'C' order otherwise, and 'K' means as close to the + order the array elements appear in memory as possible. + Default is 'K'. + copy : bool, optional + By default, astype always returns a newly allocated array. If this + is set to false, and the `dtype`, `order`, and `subok` + requirements are satisfied, the input array is returned instead + of a copy. + + Returns + ------- + arr_t : Tensor + Unless `copy` is False and the other conditions for returning the input + array are satisfied (see description for `copy` input parameter), `arr_t` + is a new tensor of the same shape as the input array, with dtype, order + given by `dtype`, `order`. 
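The `casting` modes described above follow NumPy's rules; a small NumPy-only sketch of what 'safe' versus 'unsafe' permits (the dtypes are chosen only for illustration, nothing here is specific to this operand):

import numpy as np

assert np.can_cast(np.int32, np.float64, casting="safe")      # value-preserving widening is allowed
assert not np.can_cast(np.float64, np.int32, casting="safe")  # would lose information, so rejected
assert np.can_cast(np.float64, np.int32, casting="unsafe")    # 'unsafe' permits any conversion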
+ + Notes + ----- + astype method returns an error if the string + dtype to cast to is not long enough in 'safe' casting mode to hold the max + value of integer/float array that is being casted. Previously the casting + was allowed even if the result was truncated. + + Raises + ------ + ComplexWarning + When casting from complex to float or int. To avoid this, + one should use ``a.real.astype(t)``. + + Examples + -------- + >>> import mars.tensor as mt + >>> x = mt.array([1, 2, 2.5]) + >>> x.execute() + array([ 1. , 2. , 2.5]) + + >>> x.astype(int).execute() + array([1, 2, 2]) + """ + dtype = get_dtype(dtype) + tensor_order = get_order(order, tensor.order) + + if tensor.dtype == dtype and tensor.order == tensor_order: + return tensor if not copy else tensor.copy(order=order) + elif not np.can_cast(tensor.dtype, dtype, casting=casting): + raise TypeError( + f"Cannot cast array from {tensor.dtype!r} to {dtype!r} " + f"according to the rule {casting}" + ) + + op = TensorAstype( + dtype=dtype, order=order, casting=casting, sparse=tensor.issparse() + ) + return op(tensor, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/base/atleast_1d.py b/python/xorbits/_mars/tensor/base/atleast_1d.py new file mode 100644 index 000000000..8e26feace --- /dev/null +++ b/python/xorbits/_mars/tensor/base/atleast_1d.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...core import ExecutableTuple +from ..datasource import tensor as astensor + + +def atleast_1d(*tensors): + """ + Convert inputs to tensors with at least one dimension. + + Scalar inputs are converted to 1-dimensional tensors, whilst + higher-dimensional inputs are preserved. + + Parameters + ---------- + tensors1, tensors2, ... : array_like + One or more input tensors. + + Returns + ------- + ret : Tensor + An tensor, or list of tensors, each with ``a.ndim >= 1``. + Copies are made only if necessary. + + See Also + -------- + atleast_2d, atleast_3d + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.atleast_1d(1.0).execute() + array([ 1.]) + + >>> x = mt.arange(9.0).reshape(3,3) + >>> mt.atleast_1d(x).execute() + array([[ 0., 1., 2.], + [ 3., 4., 5.], + [ 6., 7., 8.]]) + >>> mt.atleast_1d(x) is x + True + + >>> mt.atleast_1d(1, [3, 4]).execute() + [array([1]), array([3, 4])] + + """ + new_tensors = [] + for x in tensors: + x = astensor(x) + if x.ndim == 0: + x = x[np.newaxis] + + new_tensors.append(x) + + if len(new_tensors) == 1: + return new_tensors[0] + return ExecutableTuple(new_tensors) diff --git a/python/xorbits/_mars/tensor/base/atleast_2d.py b/python/xorbits/_mars/tensor/base/atleast_2d.py new file mode 100644 index 000000000..f9eac5622 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/atleast_2d.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...core import ExecutableTuple +from ..datasource import tensor as astensor + + +def atleast_2d(*tensors): + """ + View inputs as tensors with at least two dimensions. + + Parameters + ---------- + tensors1, tensors2, ... : array_like + One or more array-like sequences. Non-tensor inputs are converted + to tensors. Tensors that already have two or more dimensions are + preserved. + + Returns + ------- + res, res2, ... : Tensor + A tensor, or list of tensors, each with ``a.ndim >= 2``. + Copies are avoided where possible, and views with two or more + dimensions are returned. + + See Also + -------- + atleast_1d, atleast_3d + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.atleast_2d(3.0).execute() + array([[ 3.]]) + + >>> x = mt.arange(3.0) + >>> mt.atleast_2d(x).execute() + array([[ 0., 1., 2.]]) + + >>> mt.atleast_2d(1, [1, 2], [[1, 2]]).execute() + [array([[1]]), array([[1, 2]]), array([[1, 2]])] + + """ + new_tensors = [] + for x in tensors: + x = astensor(x) + if x.ndim == 0: + x = x[np.newaxis, np.newaxis] + elif x.ndim == 1: + x = x[np.newaxis, :] + + new_tensors.append(x) + + if len(new_tensors) == 1: + return new_tensors[0] + return ExecutableTuple(new_tensors) diff --git a/python/xorbits/_mars/tensor/base/atleast_3d.py b/python/xorbits/_mars/tensor/base/atleast_3d.py new file mode 100644 index 000000000..540236a38 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/atleast_3d.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...core import ExecutableTuple +from ..datasource import tensor as astensor + + +def atleast_3d(*tensors): + """ + View inputs as tensors with at least three dimensions. + + Parameters + ---------- + tensors1, tensors2, ... : array_like + One or more tensor-like sequences. Non-tensor inputs are converted to + tensors. Tensors that already have three or more dimensions are + preserved. + + Returns + ------- + res1, res2, ... : Tensor + A tensor, or list of tensors, each with ``a.ndim >= 3``. Copies are + avoided where possible, and views with three or more dimensions are + returned. For example, a 1-D tensor of shape ``(N,)`` becomes a view + of shape ``(1, N, 1)``, and a 2-D tensor of shape ``(M, N)`` becomes a + view of shape ``(M, N, 1)``. 
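The shape promotion described above mirrors NumPy's `atleast_3d`; a minimal NumPy sketch (shapes chosen only for illustration):

import numpy as np

assert np.atleast_3d(5.0).shape == (1, 1, 1)               # a scalar gains three axes
assert np.atleast_3d(np.ones(4)).shape == (1, 4, 1)        # (N,)   -> (1, N, 1)
assert np.atleast_3d(np.ones((2, 3))).shape == (2, 3, 1)   # (M, N) -> (M, N, 1)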
+ + See Also + -------- + atleast_1d, atleast_2d + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.atleast_3d(3.0).execute() + array([[[ 3.]]]) + + >>> x = mt.arange(3.0) + >>> mt.atleast_3d(x).shape + (1, 3, 1) + + >>> x = mt.arange(12.0).reshape(4,3) + >>> mt.atleast_3d(x).shape + (4, 3, 1) + + >>> for arr in mt.atleast_3d([1, 2], [[1, 2]], [[[1, 2]]]).execute(): + ... print(arr, arr.shape) + ... + [[[1] + [2]]] (1, 2, 1) + [[[1] + [2]]] (1, 2, 1) + [[[1 2]]] (1, 1, 2) + + """ + new_tensors = [] + for x in tensors: + x = astensor(x) + if x.ndim == 0: + x = x[np.newaxis, np.newaxis, np.newaxis] + elif x.ndim == 1: + x = x[np.newaxis, :, np.newaxis] + elif x.ndim == 2: + x = x[:, :, None] + + new_tensors.append(x) + + if len(new_tensors) == 1: + return new_tensors[0] + return ExecutableTuple(new_tensors) diff --git a/python/xorbits/_mars/tensor/base/broadcast_arrays.py b/python/xorbits/_mars/tensor/base/broadcast_arrays.py new file mode 100644 index 000000000..3d96d14d7 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/broadcast_arrays.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import ExecutableTuple +from ..datasource import tensor as astensor +from ..utils import broadcast_shape +from .broadcast_to import broadcast_to + + +def broadcast_arrays(*args, **kwargs): + """ + Broadcast any number of arrays against each other. + + Parameters + ---------- + `*args` : array_likes + The tensors to broadcast. + + Returns + ------- + broadcasted : list of tensors + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([[1,2,3]]) + >>> y = mt.array([[1],[2],[3]]) + >>> mt.broadcast_arrays(x, y).execute() + [array([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]), array([[1, 1, 1], + [2, 2, 2], + [3, 3, 3]])] + + """ + if kwargs: + raise TypeError( + "broadcast_arrays() got an unexpected keyword " + f"argument {next(iter(kwargs.keys()))!r}" + ) + + args = [astensor(arg) for arg in args] + + shape = broadcast_shape(*[arg.shape for arg in args]) + return ExecutableTuple([broadcast_to(a, shape) for a in args]) diff --git a/python/xorbits/_mars/tensor/base/broadcast_to.py b/python/xorbits/_mars/tensor/base/broadcast_to.py new file mode 100644 index 000000000..12f892f01 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/broadcast_to.py @@ -0,0 +1,151 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import KeyField, TupleField +from ..array_utils import device, get_array_module +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorBroadcastTo(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.BROADCAST_TO + + _input = KeyField("input") + _shape = TupleField("shape") + + def __init__(self, shape=None, **kw): + super().__init__(_shape=shape, **kw) + + @property + def shape(self): + return self._shape + + def __call__(self, tensor, shape): + return self.new_tensor([tensor], shape) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + in_tensor = op.inputs[0] + shape = op.shape + new_dim = tensor.ndim - in_tensor.ndim + + out_chunks = [] + for c in in_tensor.chunks: + chunk_shape = shape[:new_dim] + tuple( + s if in_tensor.shape[idx] != 1 else shape[new_dim + idx] + for idx, s in enumerate(c.shape) + ) + chunk_idx = (0,) * new_dim + c.index + chunk_op = op.copy().reset_key() + chunk_op._shape = chunk_shape + out_chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=chunk_idx, order=tensor.order + ) + out_chunks.append(out_chunk) + + nsplits = [ + tuple( + c.shape[i] + for c in out_chunks + if all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + for i in range(len(out_chunks[0].shape)) + ] + new_op = op.copy() + return new_op.new_tensors( + [in_tensor], + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + xp = get_array_module(ctx[op.input.key]) + input_data = ctx[op.input.key] + device_id = input_data.device.id if hasattr(input_data, "device") else -1 + + with device(device_id): + shape = op.shape + if any(np.isnan(s) for s in shape): + shape = list(shape) + new_dim = len(shape) - input_data.ndim + for i in range(input_data.ndim): + if np.isnan(shape[i + new_dim]): + shape[i + new_dim] = input_data.shape[i] + ctx[op.outputs[0].key] = xp.broadcast_to(input_data, shape) + + +def broadcast_to(tensor, shape): + """Broadcast an tensor to a new shape. + + Parameters + ---------- + tensor : array_like + The tensor to broadcast. + shape : tuple + The shape of the desired array. + + Returns + ------- + broadcast : Tensor + + Raises + ------ + ValueError + If the tensor is not compatible with the new shape according to Mars's + broadcasting rules. 
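The broadcasting rule referred to above is the usual trailing-dimension rule: size-1 axes may stretch and new leading axes may be prepended, while any other mismatch is rejected. A NumPy-only sketch (the example shapes are arbitrary):

import numpy as np

assert np.broadcast_to(np.ones((3, 1)), (2, 3, 4)).shape == (2, 3, 4)  # size-1 axis stretches, leading axis prepends
try:
    np.broadcast_to(np.ones((3, 2)), (3, 4))  # trailing dims must match or be 1
except ValueError as err:
    print("rejected:", err)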
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([1, 2, 3]) + >>> mt.broadcast_to(x, (3, 3)).execute() + array([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) + """ + from ..core import Tensor + + tensor = tensor if isinstance(tensor, Tensor) else astensor(tensor) + shape = tuple(shape) if isinstance(shape, (list, tuple)) else (shape,) + + if any(np.isnan(s) for s in tensor.shape): + raise ValueError( + "input tensor has unknown shape, need to call `.execute()` first" + ) + + if tensor.shape == shape: + return tensor + + new_ndim = len(shape) - tensor.ndim + if new_ndim < 0: + raise ValueError( + "input operand has more dimensions than allowed by the axis remapping" + ) + if any(o != n for o, n in zip(tensor.shape, shape[new_ndim:]) if o != 1): + raise ValueError( + "operands could not be broadcast together " + f"with remapped shapes [original->remapped]: {tensor.shape} " + f"and requested shape {shape}" + ) + + op = TensorBroadcastTo(shape, dtype=tensor.dtype, sparse=tensor.issparse()) + return op(tensor, shape) diff --git a/python/xorbits/_mars/tensor/base/copy.py b/python/xorbits/_mars/tensor/base/copy.py new file mode 100644 index 000000000..3ae544c7a --- /dev/null +++ b/python/xorbits/_mars/tensor/base/copy.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def copy(a, order="K"): + """ + Return a tensor copy of the given object. + + Parameters + ---------- + a : array_like + Input data. + order : {'C', 'F', 'A', 'K'}, optional + Controls the memory layout of the copy. 'C' means C-order, + 'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous, + 'C' otherwise. 'K' means match the layout of `a` as closely + as possible. (Note that this function and :meth:`ndarray.copy` are very + similar, but have different default values for their order= + arguments.) + + Returns + ------- + arr : Tensor + Tensor interpretation of `a`. + + Notes + ----- + This is equivalent to: + + >>> import mars.tensor as mt + + >>> mt.array(a, copy=True) #doctest: +SKIP + + Examples + -------- + Create an array x, with a reference y and a copy z: + + >>> x = mt.array([1, 2, 3]) + >>> y = x + >>> z = mt.copy(x) + + Note that, when we modify x, y changes, but not z: + + >>> x[0] = 10 + >>> (x[0] == y[0]).execute() + True + >>> (x[0] == z[0]).execute() + False + + """ + from ..datasource import array + + return array(a, order=order, copy=True) diff --git a/python/xorbits/_mars/tensor/base/copyto.py b/python/xorbits/_mars/tensor/base/copyto.py new file mode 100644 index 000000000..4b7b88dcf --- /dev/null +++ b/python/xorbits/_mars/tensor/base/copyto.py @@ -0,0 +1,211 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import KeyField, StringField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import broadcast_shape, unify_chunks +from .broadcast_to import broadcast_to + + +class TensorCopyTo(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.COPYTO + + _src = KeyField("src") + _dst = KeyField("dest") + _casting = StringField("casting") + _where = KeyField("where") + + def __init__(self, casting=None, **kw): + super().__init__(_casting=casting, **kw) + + @property + def src(self): + return self._src + + @property + def dst(self): + return self._dst + + @property + def casting(self): + return self._casting + + @property + def where(self): + return self._where + + def check_inputs(self, inputs): + if not 2 <= len(inputs) <= 3: + raise ValueError("inputs' length must be 2 or 3") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + + self._src = self._inputs[0] + self._dst = self._inputs[1] + if len(self._inputs) > 2: + self._where = self._inputs[2] + + @staticmethod + def _extract_inputs(inputs): + if len(inputs) == 2: + (src, dst), where = inputs, None + else: + src, dst, where = inputs + if where is True: + where = None + else: + where = astensor(where) + + return src, dst, where + + def __call__(self, *inputs): + from ..core import Tensor + + src, dst, where = self._extract_inputs(inputs) + + if not isinstance(dst, Tensor): + raise TypeError("dst has to be a Tensor") + + self.dtype = dst.dtype + self.gpu = dst.op.gpu + self.sparse = dst.issparse() + + if not np.can_cast(src.dtype, dst.dtype, casting=self.casting): + raise TypeError( + f"Cannot cast array from {src.dtype!r} to {dst.dtype!r} " + f"according to the rule {self.casting!s}" + ) + + try: + broadcast_to(src, dst.shape) + except ValueError: + raise ValueError( + "could not broadcast input array " + f"from shape {src.shape!r} into shape {dst.shape!r}" + ) + if where: + try: + broadcast_to(where, dst.shape) + except ValueError: + raise ValueError( + "could not broadcast where mask " + f"from shape {src.shape!r} into shape {dst.shape!r}" + ) + + inps = [src, dst] + if where is not None: + inps.append(where) + ret = self.new_tensor(inps, dst.shape, order=dst.order) + dst.data = ret.data + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + inputs = yield from unify_chunks( + *[(input, list(range(input.ndim))[::-1]) for input in op.inputs] + ) + output = op.outputs[0] + + chunk_shapes = [ + t.chunk_shape if hasattr(t, "chunk_shape") else t for t in inputs + ] + out_chunk_shape = broadcast_shape(*chunk_shapes) + + out_chunks = [] + nsplits = [[np.nan] * shape for shape in out_chunk_shape] + get_index = lambda idx, t: tuple( + 0 if t.nsplits[i] == (1,) else ix for i, ix in enumerate(idx) + ) + for out_idx in itertools.product(*(map(range, out_chunk_shape))): + in_chunks = [ + t.cix[get_index(out_idx[-t.ndim :], t)] if 
t.ndim != 0 else t.chunks[0] + for t in inputs + ] + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + in_chunks, + shape=in_chunks[1].shape, + order=output.order, + index=out_idx, + ) + ) + out_chunks.append(out_chunk) + for i, idx, s in zip(itertools.count(0), out_idx, out_chunk.shape): + nsplits[i][idx] = s + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + output.shape, + order=output.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + dst = inputs[1].copy() + src = inputs[0] + where = inputs[2] if len(inputs) > 2 else True + + xp.copyto(dst, src, casting=op.casting, where=where) + ctx[op.outputs[0].key] = dst + + +def copyto(dst, src, casting="same_kind", where=True): + """ + Copies values from one array to another, broadcasting as necessary. + + Raises a TypeError if the `casting` rule is violated, and if + `where` is provided, it selects which elements to copy. + + Parameters + ---------- + dst : Tensor + The tensor into which values are copied. + src : array_like + The tensor from which values are copied. + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur when copying. + + * 'no' means the data types should not be cast at all. + * 'equiv' means only byte-order changes are allowed. + * 'safe' means only casts which can preserve values are allowed. + * 'same_kind' means only safe casts or casts within a kind, + like float64 to float32, are allowed. + * 'unsafe' means any data conversions may be done. + where : array_like of bool, optional + A boolean tensor which is broadcasted to match the dimensions + of `dst`, and selects elements to copy from `src` to `dst` + wherever it contains the value True. + """ + op = TensorCopyTo(casting=casting) + return op(src, dst, where) diff --git a/python/xorbits/_mars/tensor/base/core.py b/python/xorbits/_mars/tensor/base/core.py new file mode 100644 index 000000000..b44dc8dde --- /dev/null +++ b/python/xorbits/_mars/tensor/base/core.py @@ -0,0 +1,47 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
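At chunk level the execute method above delegates to the array module's ``copyto`` (NumPy or CuPy); its masked-copy behaviour, exposed through the ``where`` argument, looks like this (a NumPy-only sketch with made-up values):

import numpy as np

dst = np.zeros(4)
src = np.array([1.0, 2.0, 3.0, 4.0])
np.copyto(dst, src, where=np.array([True, False, True, False]))
print(dst)  # [1. 0. 3. 0.] -- only positions where the mask is True are overwritten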
+ +from ...serialization.serializables import KeyField +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorDeviceConversionBase(TensorOperand, TensorOperandMixin): + _input = KeyField("input") + + @property + def input(self): + return self._input + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = inputs[0] + + def __call__(self, tensor): + return self.new_tensor( + [tensor], shape=tensor.shape, dtype=tensor.dtype, order=tensor.order + ) + + @classmethod + def tile(cls, op): + out_chunks = [] + for c in op.input.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk([c], **c.params) + out_chunks.append(out_chunk) + + new_op = op.copy().reset_key() + out = op.outputs[0] + return new_op.new_tensors( + op.inputs, nsplits=op.input.nsplits, chunks=out_chunks, **out.params + ) diff --git a/python/xorbits/_mars/tensor/base/delete.py b/python/xorbits/_mars/tensor/base/delete.py new file mode 100644 index 000000000..87db6b8dd --- /dev/null +++ b/python/xorbits/_mars/tensor/base/delete.py @@ -0,0 +1,236 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField, Int32Field, Int64Field, KeyField +from ...utils import has_unknown_shape +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import calc_object_length, filter_inputs, slice_split, validate_axis + + +class TensorDelete(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.DELETE + + _index_obj = AnyField("index_obj") + _axis = Int32Field("axis") + _input = KeyField("input") + + # for chunk + _offset_on_axis = Int64Field("offset_on_axis") + + def __init__(self, index_obj=None, axis=None, offset_on_axis=None, **kw): + super().__init__( + _index_obj=index_obj, _axis=axis, _offset_on_axis=offset_on_axis, **kw + ) + + @property + def index_obj(self): + return self._index_obj + + @property + def axis(self): + return self._axis + + @property + def offset_on_axis(self): + return self._offset_on_axis + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if len(self._inputs) > 1: + self._index_obj = self._inputs[1] + + @classmethod + def tile(cls, op: "TensorDelete"): + inp = op.input + index_obj = op.index_obj + axis = op.axis + if axis is None: + inp = yield from recursive_tile(inp.flatten()) + axis = 0 + if has_unknown_shape(inp): + yield + + if isinstance(index_obj, int): + index_obj = [index_obj] + + if isinstance(index_obj, ENTITY_TYPE): + index_obj = yield from recursive_tile(index_obj.rechunk(index_obj.shape)) + offsets = np.cumsum([0] + list(inp.nsplits[axis])) + out_chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op._index_obj = index_obj.chunks[0] + chunk_op._offset_on_axis = int(offsets[c.index[axis]]) + shape = tuple(np.nan if j 
== axis else s for j, s in enumerate(c.shape)) + out_chunks.append( + chunk_op.new_chunk( + [c, index_obj.chunks[0]], shape=shape, index=c.index + ) + ) + nsplits_on_axis = (np.nan,) * len(inp.nsplits[axis]) + else: + nsplits_on_axis = [None for _ in inp.nsplits[axis]] + out_chunks = [] + # index_obj is list, tuple, slice or array like + if isinstance(index_obj, slice): + slc_splits = slice_split(index_obj, inp.nsplits[axis]) + for c in inp.chunks: + if c.index[axis] in slc_splits: + chunk_op = op.copy().reset_key() + chunk_slc = slc_splits[c.index[axis]] + shape = tuple( + s - calc_object_length(chunk_slc, s) if j == axis else s + for j, s in enumerate(c.shape) + ) + chunk_op._index_obj = chunk_slc + out_chunks.append( + chunk_op.new_chunk([c], shape=shape, index=c.index) + ) + nsplits_on_axis[c.index[axis]] = shape[axis] + else: + out_chunks.append(c) + nsplits_on_axis[c.index[axis]] = c.shape[axis] + else: + index_obj = np.array(index_obj) + cum_splits = np.cumsum([0] + list(inp.nsplits[axis])) + chunk_indexes = defaultdict(list) + for int_idx in index_obj: + in_idx = cum_splits.searchsorted(int_idx, side="right") - 1 + chunk_indexes[in_idx].append(int_idx - cum_splits[in_idx]) + + for c in inp.chunks: + idx_on_axis = c.index[axis] + if idx_on_axis in chunk_indexes: + chunk_op = op.copy().reset_key() + chunk_op._index_obj = chunk_indexes[idx_on_axis] + shape = tuple( + s - len(chunk_indexes[idx_on_axis]) if j == axis else s + for j, s in enumerate(c.shape) + ) + out_chunks.append( + chunk_op.new_chunk([c], shape=shape, index=c.index) + ) + nsplits_on_axis[c.index[axis]] = shape[axis] + else: + out_chunks.append(c) + nsplits_on_axis[c.index[axis]] = c.shape[axis] + + nsplits = tuple( + s if i != axis else tuple(nsplits_on_axis) + for i, s in enumerate(inp.nsplits) + ) + out = op.outputs[0] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.input.key] + index_obj = ( + ctx[op.index_obj.key] if hasattr(op.index_obj, "key") else op.index_obj + ) + if op.offset_on_axis is None: + ctx[op.outputs[0].key] = np.delete(inp, index_obj, axis=op.axis) + else: + index_obj = np.array(index_obj) + part_index = [ + idx - op.offset_on_axis + for idx in index_obj + if ( + (idx >= op.offset_on_axis) + and idx < (op.offset_on_axis + inp.shape[op.axis or 0]) + ) + ] + + ctx[op.outputs[0].key] = np.delete(inp, part_index, axis=op.axis) + + def __call__(self, arr, obj, shape): + return self.new_tensor(filter_inputs([arr, obj]), shape=shape, order=arr.order) + + +def delete(arr, obj, axis=None): + """ + Return a new array with sub-arrays along an axis deleted. For a one + dimensional array, this returns those entries not returned by + `arr[obj]`. + + Parameters + ---------- + arr : array_like + Input array. + obj : slice, int or array of ints + Indicate indices of sub-arrays to remove along the specified axis. + axis : int, optional + The axis along which to delete the subarray defined by `obj`. + If `axis` is None, `obj` is applied to the flattened array. + + Returns + ------- + out : mars.tensor + A copy of `arr` with the elements specified by `obj` removed. Note + that `delete` does not occur in-place. If `axis` is None, `out` is + a flattened array. 
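The index-to-chunk mapping used in the tile method above reduces to a cumulative-offset lookup with ``searchsorted``; a NumPy sketch of the idea (``nsplits`` and the indices below are hypothetical, not taken from the operand):

import numpy as np

nsplits = (3, 3, 4)                     # assumed chunk sizes along the delete axis
offsets = np.cumsum((0,) + nsplits)     # array([0, 3, 6, 10])
for global_idx in (1, 5, 7):
    chunk = int(offsets.searchsorted(global_idx, side="right")) - 1  # owning chunk
    local = global_idx - offsets[chunk]                              # index within that chunk
    print(chunk, local)                 # -> 0 1, then 1 2, then 2 1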
+ + Examples + -------- + >>> import mars.tensor as mt + >>> arr = mt.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]]) + >>> arr.execute() + array([[ 1, 2, 3, 4], + [ 5, 6, 7, 8], + [ 9, 10, 11, 12]]) + >>> mt.delete(arr, 1, 0).execute() + array([[ 1, 2, 3, 4], + [ 9, 10, 11, 12]]) + >>> mt.delete(arr, np.s_[::2], 1).execute() + array([[ 2, 4], + [ 6, 8], + [10, 12]]) + >>> mt.delete(arr, [1,3,5], None).execute() + array([ 1, 3, 5, 7, 8, 9, 10, 11, 12]) + """ + arr = astensor(arr) + if getattr(obj, "ndim", 0) > 1: # pragma: no cover + raise ValueError( + "index array argument obj to insert must be one dimensional or scalar" + ) + + if axis is None: + # if axis is None, array will be flattened + arr_size = arr.size + idx_length = calc_object_length(obj, size=arr_size) + shape = (arr_size - idx_length,) + else: + validate_axis(arr.ndim, axis) + idx_length = calc_object_length(obj, size=arr.shape[axis]) + shape = tuple( + s - idx_length if i == axis else s for i, s in enumerate(arr.shape) + ) + + op = TensorDelete(index_obj=obj, axis=axis, dtype=arr.dtype) + return op(arr, obj, shape) diff --git a/python/xorbits/_mars/tensor/base/diff.py b/python/xorbits/_mars/tensor/base/diff.py new file mode 100644 index 000000000..e8626cb6b --- /dev/null +++ b/python/xorbits/_mars/tensor/base/diff.py @@ -0,0 +1,132 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import recursive_tile +from ...serialization.serializables import Int32Field, Int64Field +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import validate_axis + + +class TensorDiff(TensorOperand, TensorOperandMixin): + n = Int64Field("n") + axis = Int32Field("axis") + + def __call__(self, a): + shape = list(a.shape) + shape[self.axis] -= self.n + shape = tuple(shape) + return self.new_tensor([a], shape, dtype=a.dtype, order=a.order) + + @classmethod + def tile(cls, op: "TensorDiff"): + axis = op.axis + n = op.n + a = astensor(op.inputs[0]) + + slc1 = (slice(None),) * axis + (slice(1, None),) + slc2 = (slice(None),) * axis + (slice(-1),) + + for _ in range(n): + l = yield from recursive_tile(a[slc1]) + r = (yield from recursive_tile(a[slc2])).rechunk(l.nsplits) + a = yield from recursive_tile(l - r) + + return [a] + + +def diff(a, n=1, axis=-1): + """ + Calculate the n-th discrete difference along the given axis. + + The first difference is given by ``out[n] = a[n+1] - a[n]`` along + the given axis, higher differences are calculated by using `diff` + recursively. + + Parameters + ---------- + a : array_like + Input tensor + n : int, optional + The number of times values are differenced. If zero, the input + is returned as-is. + axis : int, optional + The axis along which the difference is taken, default is the + last axis. + + Returns + ------- + diff : Tensor + The n-th differences. The shape of the output is the same as `a` + except along `axis` where the dimension is smaller by `n`.
The + type of the output is the same as the type of the difference + between any two elements of `a`. This is the same as the type of + `a` in most cases. A notable exception is `datetime64`, which + results in a `timedelta64` output tensor. + + See Also + -------- + gradient, ediff1d, cumsum + + Notes + ----- + Type is preserved for boolean tensors, so the result will contain + `False` when consecutive elements are the same and `True` when they + differ. + + For unsigned integer tensors, the results will also be unsigned. This + should not be surprising, as the result is consistent with + calculating the difference directly: + + >>> import mars.tensor as mt + + >>> u8_arr = mt.array([1, 0], dtype=mt.uint8) + >>> mt.diff(u8_arr).execute() + array([255], dtype=uint8) + >>> (u8_arr[1,...] - u8_arr[0,...]).execute() + 255 + + If this is not desirable, then the array should be cast to a larger + integer type first: + + >>> i16_arr = u8_arr.astype(mt.int16) + >>> mt.diff(i16_arr).execute() + array([-1], dtype=int16) + + Examples + -------- + >>> x = mt.array([1, 2, 4, 7, 0]) + >>> mt.diff(x).execute() + array([ 1, 2, 3, -7]) + >>> mt.diff(x, n=2).execute() + array([ 1, 1, -10]) + + >>> x = mt.array([[1, 3, 6, 10], [0, 5, 6, 8]]) + >>> mt.diff(x).execute() + array([[2, 3, 4], + [5, 1, 2]]) + >>> mt.diff(x, axis=0).execute() + array([[-1, 2, 0, -2]]) + + >>> x = mt.arange('1066-10-13', '1066-10-16', dtype=mt.datetime64) + >>> mt.diff(x).execute() + array([1, 1], dtype='timedelta64[D]') + + """ + a = astensor(a) + n = int(n) + + axis = validate_axis(a.ndim, axis) + op = TensorDiff(axis=axis, n=n) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/dsplit.py b/python/xorbits/_mars/tensor/base/dsplit.py new file mode 100644 index 000000000..90ebca324 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/dsplit.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .split import split + + +def dsplit(a, indices_or_sections): + """ + Split tensor into multiple sub-tensors along the 3rd axis (depth). + + Please refer to the `split` documentation. `dsplit` is equivalent + to `split` with ``axis=2``, the array is always split along the third + axis provided the tensor dimension is greater than or equal to 3. + + See Also + -------- + split : Split a tensor into multiple sub-arrays of equal size. 
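Since ``dsplit`` is simply ``split`` along ``axis=2``, the equivalence is easy to check with NumPy (a minimal sketch; the array values are illustrative):

import numpy as np

x = np.arange(16.0).reshape(2, 2, 4)
a, b = np.dsplit(x, 2)                           # same result as np.split(x, 2, axis=2)
print(a.shape, b.shape)                          # (2, 2, 2) (2, 2, 2)
assert np.array_equal(a, np.split(x, 2, axis=2)[0])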
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(16.0).reshape(2, 2, 4) + >>> x.execute() + array([[[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.]], + [[ 8., 9., 10., 11.], + [ 12., 13., 14., 15.]]]) + >>> mt.dsplit(x, 2).execute() + [array([[[ 0., 1.], + [ 4., 5.]], + [[ 8., 9.], + [ 12., 13.]]]), + array([[[ 2., 3.], + [ 6., 7.]], + [[ 10., 11.], + [ 14., 15.]]])] + >>> mt.dsplit(x, mt.array([3, 6])).execute() + [array([[[ 0., 1., 2.], + [ 4., 5., 6.]], + [[ 8., 9., 10.], + [ 12., 13., 14.]]]), + array([[[ 3.], + [ 7.]], + [[ 11.], + [ 15.]]]), + array([], dtype=float64)] + + """ + ary = a + a = astensor(a) + + if a.ndim < 3: + raise ValueError("dsplit only works on tensors of 3 or more dimensions") + return split(ary, indices_or_sections, 2) diff --git a/python/xorbits/_mars/tensor/base/ediff1d.py b/python/xorbits/_mars/tensor/base/ediff1d.py new file mode 100644 index 000000000..53742668c --- /dev/null +++ b/python/xorbits/_mars/tensor/base/ediff1d.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .ravel import ravel + + +def ediff1d(a, to_end=None, to_begin=None): + """ + The differences between consecutive elements of a tensor. + + Parameters + ---------- + a : array_like + If necessary, will be flattened before the differences are taken. + to_end : array_like, optional + Number(s) to append at the end of the returned differences. + to_begin : array_like, optional + Number(s) to prepend at the beginning of the returned differences. + + Returns + ------- + ediff1d : Tensor + The differences. Loosely, this is ``a.flat[1:] - a.flat[:-1]``. + + See Also + -------- + diff, gradient + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([1, 2, 4, 7, 0]) + >>> mt.ediff1d(x).execute() + array([ 1, 2, 3, -7]) + + >>> mt.ediff1d(x, to_begin=-99, to_end=mt.array([88, 99])).execute() + array([-99, 1, 2, 3, -7, 88, 99]) + + The returned tensor is always 1D. + + >>> y = [[1, 2, 4], [1, 6, 24]] + >>> mt.ediff1d(y).execute() + array([ 1, 2, -3, 5, 18]) + + """ + from ..merge import concatenate + + a = astensor(a) + a = ravel(a) + + t = a[1:] - a[:-1] + if to_begin is None and to_end is None: + return t + + to_concat = [t] + if to_begin is not None: + to_concat.insert(0, ravel(astensor(to_begin))) + if to_end is not None: + to_concat.append(ravel(astensor(to_end))) + + return concatenate(to_concat) diff --git a/python/xorbits/_mars/tensor/base/expand_dims.py b/python/xorbits/_mars/tensor/base/expand_dims.py new file mode 100644 index 000000000..ff3450b86 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/expand_dims.py @@ -0,0 +1,85 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..datasource import tensor as astensor + + +def expand_dims(a, axis): + """ + Expand the shape of a tensor. + + Insert a new axis that will appear at the `axis` position in the expanded + array shape. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int + Position in the expanded axes where the new axis is placed. + + Returns + ------- + res : Tensor + Output tensor. The number of dimensions is one greater than that of + the input tensor. + + See Also + -------- + squeeze : The inverse operation, removing singleton dimensions + reshape : Insert, remove, and combine dimensions, and resize existing ones + doc.indexing, atleast_1d, atleast_2d, atleast_3d + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([1,2]) + >>> x.shape + (2,) + + The following is equivalent to ``x[mt.newaxis,:]`` or ``x[mt.newaxis]``: + + >>> y = mt.expand_dims(x, axis=0) + >>> y.execute() + array([[1, 2]]) + >>> y.shape + (1, 2) + + >>> y = mt.expand_dims(x, axis=1) # Equivalent to x[:,mt.newaxis] + >>> y.execute() + array([[1], + [2]]) + >>> y.shape + (2, 1) + + Note that some examples may use ``None`` instead of ``np.newaxis``. These + are the same objects: + + >>> mt.newaxis is None + True + + """ + a = astensor(a) + + if axis > a.ndim or axis < -a.ndim - 1: + raise np.AxisError( + f"Axis must be between -{a.ndim + 1} and {a.ndim}, got {axis}" + ) + + axis = axis if axis >= 0 else axis + a.ndim + 1 + indexes = (slice(None),) * axis + (np.newaxis,) + (slice(None),) * (a.ndim - axis) + return a[indexes] diff --git a/python/xorbits/_mars/tensor/base/flatten.py b/python/xorbits/_mars/tensor/base/flatten.py new file mode 100644 index 000000000..857f32732 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/flatten.py @@ -0,0 +1,63 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..utils import get_order + + +def flatten(a, order="C"): + """ + Return a copy of the tensor collapsed into one dimension. + + Parameters + ---------- + order : {'C', 'F', 'A', 'K'}, optional + 'C' means to flatten in row-major (C-style) order. + 'F' means to flatten in column-major (Fortran- + style) order. 'A' means to flatten in column-major + order if `a` is Fortran *contiguous* in memory, + row-major order otherwise. 'K' means to flatten + `a` in the order the elements occur in memory. + The default is 'C'. + + Returns + ------- + y : Tensor + A copy of the input tensor, flattened to one dimension. + + See Also + -------- + ravel : Return a flattened tensor. 
+ flat : A 1-D flat iterator over the tensor. + + Examples + -------- + + >>> import mars.tensor as mt + + >>> a = mt.array([[1,2], [3,4]]) + >>> a.flatten().execute() + array([1, 2, 3, 4]) + """ + + from ..reshape.reshape import TensorReshape, calc_shape + + if a.ndim == 1: + return a + + new_shape = np.nan if any(np.isnan(s) for s in a.shape) else calc_shape(a.size, -1) + tensor_order = get_order(order, a.order) + op = TensorReshape(new_shape, dtype=a.dtype, create_view=False) + return op(a, order=tensor_order, out_shape=new_shape) diff --git a/python/xorbits/_mars/tensor/base/flip.py b/python/xorbits/_mars/tensor/base/flip.py new file mode 100644 index 000000000..a77d2949c --- /dev/null +++ b/python/xorbits/_mars/tensor/base/flip.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ..datasource import tensor as astensor + + +def flip(m, axis): + """ + Reverse the order of elements in a tensor along the given axis. + + The shape of the array is preserved, but the elements are reordered. + + Parameters + ---------- + m : array_like + Input tensor. + axis : integer + Axis in tensor, which entries are reversed. + + + Returns + ------- + out : array_like + A view of `m` with the entries of axis reversed. Since a view is + returned, this operation is done in constant time. + + See Also + -------- + flipud : Flip a tensor vertically (axis=0). + fliplr : Flip a tensor horizontally (axis=1). + + Notes + ----- + flip(m, 0) is equivalent to flipud(m). + flip(m, 1) is equivalent to fliplr(m). + flip(m, n) corresponds to ``m[...,::-1,...]`` with ``::-1`` at position n. + + Examples + -------- + >>> import mars.tensor as mt + + >>> A = mt.arange(8).reshape((2,2,2)) + >>> A.execute() + array([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + + >>> mt.flip(A, 0).execute() + array([[[4, 5], + [6, 7]], + + [[0, 1], + [2, 3]]]) + + >>> mt.flip(A, 1).execute() + array([[[2, 3], + [0, 1]], + + [[6, 7], + [4, 5]]]) + + >>> A = mt.random.randn(3,4,5) + >>> mt.all(mt.flip(A,2) == A[:,:,::-1,...]).execute() + True + """ + m = astensor(m) + + sl = [slice(None)] * m.ndim + try: + sl[axis] = slice(None, None, -1) + except IndexError: + raise ValueError( + "axis=%i is invalid for the %i-dimensional input tensor" % (axis, m.ndim) + ) + + return m[tuple(sl)] diff --git a/python/xorbits/_mars/tensor/base/fliplr.py b/python/xorbits/_mars/tensor/base/fliplr.py new file mode 100644 index 000000000..825a38d17 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/fliplr.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .flip import flip + + +def fliplr(m): + """ + Flip tensor in the left/right direction. + + Flip the entries in each row in the left/right direction. + Columns are preserved, but appear in a different order than before. + + Parameters + ---------- + m : array_like + Input tensor, must be at least 2-D. + + Returns + ------- + f : Tensor + A view of `m` with the columns reversed. Since a view + is returned, this operation is :math:`\\mathcal O(1)`. + + See Also + -------- + flipud : Flip array in the up/down direction. + rot90 : Rotate array counterclockwise. + + Notes + ----- + Equivalent to m[:,::-1]. Requires the tensor to be at least 2-D. + + Examples + -------- + >>> import mars.tensor as mt + + >>> A = mt.diag([1.,2.,3.]) + >>> A.execute() + array([[ 1., 0., 0.], + [ 0., 2., 0.], + [ 0., 0., 3.]]) + >>> mt.fliplr(A).execute() + array([[ 0., 0., 1.], + [ 0., 2., 0.], + [ 3., 0., 0.]]) + + >>> A = mt.random.randn(2,3,5) + >>> mt.all(mt.fliplr(A) == A[:,::-1,...]).execute() + True + + """ + return flip(m, 1) diff --git a/python/xorbits/_mars/tensor/base/flipud.py b/python/xorbits/_mars/tensor/base/flipud.py new file mode 100644 index 000000000..4b8bea2ee --- /dev/null +++ b/python/xorbits/_mars/tensor/base/flipud.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .flip import flip + + +def flipud(m): + """ + Flip tensor in the up/down direction. + + Flip the entries in each column in the up/down direction. + Rows are preserved, but appear in a different order than before. + + Parameters + ---------- + m : array_like + Input tensor. + + Returns + ------- + out : array_like + A view of `m` with the rows reversed. Since a view is + returned, this operation is :math:`\\mathcal O(1)`. + + See Also + -------- + fliplr : Flip tensor in the left/right direction. + rot90 : Rotate tensor counterclockwise. + + Notes + ----- + Equivalent to ``m[::-1,...]``. + Does not require the tensor to be two-dimensional. + + Examples + -------- + >>> import mars.tensor as mt + + >>> A = mt.diag([1.0, 2, 3]) + >>> A.execute() + array([[ 1., 0., 0.], + [ 0., 2., 0.], + [ 0., 0., 3.]]) + >>> mt.flipud(A).execute() + array([[ 0., 0., 3.], + [ 0., 2., 0.], + [ 1., 0., 0.]]) + + >>> A = mt.random.randn(2,3,5) + >>> mt.all(mt.flipud(A) == A[::-1,...]).execute() + True + + >>> mt.flipud([1,2]).execute() + array([2, 1]) + + """ + return flip(m, 0) diff --git a/python/xorbits/_mars/tensor/base/hsplit.py b/python/xorbits/_mars/tensor/base/hsplit.py new file mode 100644 index 000000000..bfffbc0c6 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/hsplit.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .split import split + + +def hsplit(a, indices_or_sections): + """ + Split a tensor into multiple sub-tensors horizontally (column-wise). + + Please refer to the `split` documentation. `hsplit` is equivalent + to `split` with ``axis=1``, the tensor is always split along the second + axis regardless of the tensor dimension. + + See Also + -------- + split : Split an array into multiple sub-arrays of equal size. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(16.0).reshape(4, 4) + >>> x.execute() + array([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [ 12., 13., 14., 15.]]) + >>> mt.hsplit(x, 2).execute() + [array([[ 0., 1.], + [ 4., 5.], + [ 8., 9.], + [ 12., 13.]]), + array([[ 2., 3.], + [ 6., 7.], + [ 10., 11.], + [ 14., 15.]])] + >>> mt.hsplit(x, mt.array([3, 6])).execute() + [array([[ 0., 1., 2.], + [ 4., 5., 6.], + [ 8., 9., 10.], + [ 12., 13., 14.]]), + array([[ 3.], + [ 7.], + [ 11.], + [ 15.]]), + array([], dtype=float64)] + + With a higher dimensional array the split is still along the second axis. + + >>> x = mt.arange(8.0).reshape(2, 2, 2) + >>> x.execute() + array([[[ 0., 1.], + [ 2., 3.]], + [[ 4., 5.], + [ 6., 7.]]]) + >>> mt.hsplit(x, 2) + [array([[[ 0., 1.]], + [[ 4., 5.]]]), + array([[[ 2., 3.]], + [[ 6., 7.]]])] + + """ + ary = a + a = astensor(a) + + if a.ndim == 0: + raise ValueError("hsplit only works on tensors of 1 or more dimensions") + if a.ndim > 1: + return split(ary, indices_or_sections, 1) + else: + return split(ary, indices_or_sections, 0) diff --git a/python/xorbits/_mars/tensor/base/in1d.py b/python/xorbits/_mars/tensor/base/in1d.py new file mode 100644 index 000000000..cf01961be --- /dev/null +++ b/python/xorbits/_mars/tensor/base/in1d.py @@ -0,0 +1,94 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + +import numpy as np + +from ...typing import TileableType +from .. import asarray + + +def in1d( + ar1: Union[TileableType, np.ndarray], + ar2: Union[TileableType, np.ndarray, list], + assume_unique: bool = False, + invert: bool = False, +): + """ + Test whether each element of a 1-D tensor is also present in a second tensor. + + Returns a boolean tensor the same length as `ar1` that is True + where an element of `ar1` is in `ar2` and False otherwise. + + We recommend using :func:`isin` instead of `in1d` for new code. + + Parameters + ---------- + ar1 : (M,) Tensor + Input tensor. + ar2 : array_like + The values against which to test each value of `ar1`. 
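As the code above shows, `hsplit` simply forwards to `split` with ``axis=1`` (or ``axis=0`` for 1-D input). A small NumPy sketch of the same dispatch rule, kept separate from the Mars operands:

```python
import numpy as np

def hsplit_like(a, indices_or_sections):
    """Split column-wise: axis 1 for ndim > 1, axis 0 for 1-D input."""
    a = np.asarray(a)
    if a.ndim == 0:
        raise ValueError("hsplit only works on tensors of 1 or more dimensions")
    axis = 1 if a.ndim > 1 else 0
    return np.split(a, indices_or_sections, axis=axis)

x = np.arange(16.0).reshape(4, 4)
left, right = hsplit_like(x, 2)
assert left.shape == (4, 2) and right.shape == (4, 2)
```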
+ assume_unique : bool, optional + If True, the input tensors are both assumed to be unique, which + can speed up the calculation. Default is False. + invert : bool, optional + If True, the values in the returned tensor are inverted (that is, + False where an element of `ar1` is in `ar2` and True otherwise). + Default is False. ``np.in1d(a, b, invert=True)`` is equivalent + to (but is faster than) ``np.invert(in1d(a, b))``. + + Returns + ------- + in1d : (M,) Tensor, bool + The values `ar1[in1d]` are in `ar2`. + + See Also + -------- + isin : Version of this function that preserves the + shape of ar1. + numpy.lib.arraysetops : Module with a number of other functions for + performing set operations on arrays. + + Notes + ----- + `in1d` can be considered as an element-wise function version of the + python keyword `in`, for 1-D sequences. ``in1d(a, b)`` is roughly + equivalent to ``mt.array([item in b for item in a])``. + However, this idea fails if `ar2` is a set, or similar (non-sequence) + container: As ``ar2`` is converted to a tensor, in those cases + ``asarray(ar2)`` is an object tensor rather than the expected tensor of + contained values. + + Examples + -------- + >>> import mars.tensor as mt + >>> test = mt.array([0, 1, 2, 5, 0]) + >>> states = [0, 2] + >>> mask = mt.in1d(test, states) + >>> mask.execute() + array([ True, False, True, False, True]) + >>> test[mask].execute() + array([0, 2, 0]) + >>> mask = mt.in1d(test, states, invert=True) + >>> mask.execute() + array([False, True, False, True, False]) + >>> test[mask].execute() + array([1, 5]) + """ + from .isin import isin + + ar1 = asarray(ar1).ravel() + ar2 = asarray(ar2).ravel() + return isin(ar1, ar2, assume_unique=assume_unique, invert=invert) diff --git a/python/xorbits/_mars/tensor/base/insert.py b/python/xorbits/_mars/tensor/base/insert.py new file mode 100644 index 000000000..57dc75c31 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/insert.py @@ -0,0 +1,377 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
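`in1d` above is a thin wrapper that ravels both inputs and delegates to `isin`. The NumPy equivalent of that reduction, matching the docstring example:

```python
import numpy as np

test = np.array([0, 1, 2, 5, 0])
states = [0, 2]

# in1d(a, b) is isin applied to the flattened inputs
mask = np.isin(test.ravel(), np.asarray(states).ravel())
assert mask.tolist() == [True, False, True, False, True]

# invert=True flips the membership test
assert np.isin(test, states, invert=True).tolist() == [False, True, False, True, False]
```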
import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField, Int32Field, KeyField, TupleField +from ...utils import has_unknown_shape +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import calc_object_length, filter_inputs, validate_axis + + +class TensorInsert(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.INSERT + + _index_obj = AnyField("index_obj") + _values = AnyField("values") + _axis = Int32Field("axis") + _input = KeyField("input") + + # for chunk + _range_on_axis = TupleField("range_on_axis") + + def __init__( + self, index_obj=None, values=None, axis=None, range_on_axis=None, **kw + ): + super().__init__( + _index_obj=index_obj, + _values=values, + _axis=axis, + _range_on_axis=range_on_axis, + **kw + ) + + @property + def index_obj(self): + return self._index_obj + + @property + def values(self): + return self._values + + @property + def axis(self): + return self._axis + + @property + def range_on_axis(self): + return self._range_on_axis + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs[1:]) + if isinstance(self._index_obj, ENTITY_TYPE): + self._index_obj = next(inputs_iter) + if isinstance(self._values, ENTITY_TYPE): + self._values = next(inputs_iter) + + @classmethod + def tile(cls, op: "TensorInsert"): + inp = op.inputs[0] + axis = op.axis + if axis is None: + inp = yield from recursive_tile(inp.flatten()) + axis = 0 + else: + new_splits = [s if i == axis else sum(s) for i, s in enumerate(inp.nsplits)] + inp = yield from recursive_tile(inp.rechunk(new_splits)) + + if has_unknown_shape(inp): + yield + + index_obj = op.index_obj + values = op.values + if isinstance(values, ENTITY_TYPE): + # if values is Mars type, we rechunk it into one chunk and + # all insert chunks depend on it + values = yield from recursive_tile(values.rechunk(values.shape)) + + nsplits_on_axis = [] + if isinstance(index_obj, int): + splits = inp.nsplits[axis] + cum_splits = np.cumsum([0] + list(splits)) + # add 1 for last split + cum_splits[-1] = cum_splits[-1] + 1 + in_idx = cum_splits.searchsorted(index_obj, side="right") - 1 + out_chunks = [] + for chunk in inp.chunks: + if chunk.index[axis] == in_idx: + chunk_op = op.copy().reset_key() + chunk_op._index_obj = index_obj - cum_splits[in_idx] + if isinstance(values, ENTITY_TYPE): + chunk_values = values.chunks[0] + else: + chunk_values = values + inputs = filter_inputs([chunk, chunk_values]) + shape = tuple( + s + calc_object_length(index_obj) if i == axis else s + for i, s in enumerate(chunk.shape) + ) + out_chunks.append( + chunk_op.new_chunk(inputs, shape=shape, index=chunk.index) + ) + nsplits_on_axis.append(shape[axis]) + else: + out_chunks.append(chunk) + nsplits_on_axis.append(chunk.shape[axis]) + elif isinstance(index_obj, ENTITY_TYPE): + index_obj = yield from recursive_tile(index_obj.rechunk(index_obj.shape)) + offset = 0 + out_chunks = [] + for chunk in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op._index_obj = index_obj.chunks[0] + if isinstance(values, ENTITY_TYPE): + chunk_values = values.chunks[0] + else: + chunk_values = values + chunk_op._values = chunk_values + if chunk.index[axis] + 1 == len(inp.nsplits[axis]): + # the last chunk on axis + chunk_op._range_on_axis = (offset, offset + chunk.shape[axis] + 1) + else: + chunk_op._range_on_axis = (offset, offset + chunk.shape[axis]) + shape = tuple( + np.nan if j == axis else s 
for j, s in enumerate(chunk.shape) + ) + inputs = filter_inputs([chunk, index_obj.chunks[0], chunk_values]) + out_chunks.append( + chunk_op.new_chunk(inputs, shape=shape, index=chunk.index) + ) + offset += chunk.shape[axis] + nsplits_on_axis.append(np.nan) + else: + # index object is slice or sequence of ints + if isinstance(index_obj, slice): + index_obj = range( + index_obj.start or 0, index_obj.stop, index_obj.step or 1 + ) + splits = inp.nsplits[axis] + cum_splits = np.cumsum([0] + list(splits)) + # add 1 for last split + cum_splits[-1] = cum_splits[-1] + 1 + chunk_idx_params = [[[], []] for _ in splits] + for i, int_idx in enumerate(index_obj): + in_idx = cum_splits.searchsorted(int_idx, side="right") - 1 + chunk_idx_params[in_idx][0].append(int_idx - cum_splits[in_idx]) + chunk_idx_params[in_idx][1].append(i) + + out_chunks = [] + offset = 0 + for chunk in inp.chunks: + idx_on_axis = chunk.index[axis] + if len(chunk_idx_params[idx_on_axis][0]) > 0: + chunk_op = op.copy().reset_key() + chunk_index_obj = chunk_idx_params[idx_on_axis][0] + shape = tuple( + s + len(chunk_index_obj) if j == axis else s + for j, s in enumerate(chunk.shape) + ) + if isinstance(values, int): + chunk_op._index_obj = chunk_index_obj + out_chunks.append( + chunk_op.new_chunk([chunk], shape=shape, index=chunk.index) + ) + elif isinstance(values, ENTITY_TYPE): + chunk_op._values = values.chunks[0] + if chunk.index[axis] + 1 == len(inp.nsplits[axis]): + chunk_op._range_on_axis = ( + offset, + offset + chunk.shape[axis] + 1, + ) + else: + chunk_op._range_on_axis = ( + offset, + offset + chunk.shape[axis], + ) + out_chunks.append( + chunk_op.new_chunk( + [chunk, values.chunks[0]], + shape=shape, + index=chunk.index, + ) + ) + offset += chunk.shape[axis] + else: + chunk_op._index_obj = chunk_index_obj + values = np.asarray(values) + to_shape = [ + calc_object_length(index_obj, chunk.shape[axis]) + ] + [s for j, s in enumerate(inp.shape) if j != axis] + if all(j == k for j, k in zip(to_shape, values.shape)): + chunk_values = np.asarray(values)[ + chunk_idx_params[idx_on_axis][1] + ] + chunk_op._values = chunk_values + out_chunks.append( + chunk_op.new_chunk( + [chunk], shape=shape, index=chunk.index + ) + ) + else: + out_chunks.append( + chunk_op.new_chunk( + [chunk], shape=shape, index=chunk.index + ) + ) + + nsplits_on_axis.append(shape[axis]) + else: + out_chunks.append(chunk) + nsplits_on_axis.append(chunk.shape[axis]) + + nsplits = tuple( + s if i != axis else tuple(nsplits_on_axis) + for i, s in enumerate(inp.nsplits) + ) + out = op.outputs[0] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op: "TensorInsert"): + inp = ctx[op.input.key] + index_obj = ( + ctx[op.index_obj.key] if hasattr(op.index_obj, "key") else op.index_obj + ) + values = ctx[op.values.key] if hasattr(op.values, "key") else op.values + if op.range_on_axis is None: + ctx[op.outputs[0].key] = np.insert(inp, index_obj, values, axis=op.axis) + else: + if isinstance(index_obj, slice): + index_obj = np.arange( + index_obj.step or 0, index_obj.stop, index_obj.step or 1 + ) + else: + index_obj = np.array(index_obj) + values = np.asarray(values) + + part_index = [ + i + for i, idx in enumerate(index_obj) + if ((idx >= op.range_on_axis[0]) and idx < op.range_on_axis[1]) + ] + if ( + (values.ndim > 0) + and len(index_obj) == len(values) + and (values[0].ndim > 0 or inp.ndim == 1) + ): + ctx[op.outputs[0].key] = 
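The tiling logic above relies on one piece of bookkeeping in both the scalar and the sequence branches: build cumulative chunk offsets along the insert axis, then use ``searchsorted`` to find which chunk a global index falls into and what its chunk-local position is (the last offset is bumped by one so an index equal to the total length still lands in the last chunk). A standalone NumPy sketch of that mapping, with hypothetical chunk sizes:

```python
import numpy as np

splits = (3, 3, 2)                       # hypothetical chunk sizes along the axis (total length 8)
cum_splits = np.cumsum([0] + list(splits))
cum_splits[-1] += 1                      # allow inserting at position == total length

def locate(global_idx):
    """Return (chunk index, index local to that chunk) for a global insert position."""
    chunk = int(cum_splits.searchsorted(global_idx, side="right")) - 1
    return chunk, global_idx - cum_splits[chunk]

assert locate(0) == (0, 0)   # start of chunk 0
assert locate(3) == (1, 0)   # first position of chunk 1
assert locate(8) == (2, 2)   # "append" position still lands in the last chunk
```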
np.insert( + inp, + index_obj[part_index] - op.range_on_axis[0], + values[part_index], + axis=op.axis, + ) + else: + ctx[op.outputs[0].key] = np.insert( + inp, + index_obj[part_index] - op.range_on_axis[0], + values, + axis=op.axis, + ) + + def __call__(self, arr, obj, values, shape): + return self.new_tensor( + filter_inputs([arr, obj, values]), shape=shape, order=arr.order + ) + + +def insert(arr, obj, values, axis=None): + """ + Insert values along the given axis before the given indices. + + Parameters + ---------- + arr : array like + Input array. + obj : int, slice or sequence of ints + Object that defines the index or indices before which `values` is + inserted. + values : array_like + Values to insert into `arr`. If the type of `values` is different + from that of `arr`, `values` is converted to the type of `arr`. + `values` should be shaped so that ``arr[...,obj,...] = values`` + is legal. + axis : int, optional + Axis along which to insert `values`. If `axis` is None then `arr` + is flattened first. + Returns + ------- + out : ndarray + A copy of `arr` with `values` inserted. Note that `insert` + does not occur in-place: a new array is returned. If + `axis` is None, `out` is a flattened array. + See Also + -------- + append : Append elements at the end of an array. + concatenate : Join a sequence of arrays along an existing axis. + delete : Delete elements from an array. + Notes + ----- + Note that for higher dimensional inserts `obj=0` behaves very different + from `obj=[0]` just like `arr[:,0,:] = values` is different from + `arr[:,[0],:] = values`. + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[1, 1], [2, 2], [3, 3]]) + >>> a.execute() + array([[1, 1], + [2, 2], + [3, 3]]) + >>> mt.insert(a, 1, 5).execute() + array([1, 5, 1, ..., 2, 3, 3]) + >>> mt.insert(a, 1, 5, axis=1).execute() + array([[1, 5, 1], + [2, 5, 2], + [3, 5, 3]]) + Difference between sequence and scalars: + >>> mt.insert(a, [1], [[1],[2],[3]], axis=1).execute() + array([[1, 1, 1], + [2, 2, 2], + [3, 3, 3]]) + >>> b = a.flatten() + >>> b.execute() + array([1, 1, 2, 2, 3, 3]) + >>> mt.insert(b, [2, 2], [5, 6]).execute() + array([1, 1, 5, ..., 2, 3, 3]) + >>> mt.insert(b, slice(2, 4), [5, 6]).execute() + array([1, 1, 5, ..., 2, 3, 3]) + >>> mt.insert(b, [2, 2], [7.13, False]).execute() # type casting + array([1, 1, 7, ..., 2, 3, 3]) + >>> x = mt.arange(8).reshape(2, 4) + >>> idx = (1, 3) + >>> mt.insert(x, idx, 999, axis=1).execute() + array([[ 0, 999, 1, 2, 999, 3], + [ 4, 999, 5, 6, 999, 7]]) + """ + arr = astensor(arr) + if getattr(obj, "ndim", 0) > 1: # pragma: no cover + raise ValueError( + "index array argument obj to insert must be one dimensional or scalar" + ) + + if axis is None: + # if axis is None, array will be flatten + arr_size = arr.size + idx_length = calc_object_length(obj, size=arr_size) + shape = (arr_size + idx_length,) + else: + validate_axis(arr.ndim, axis) + idx_length = calc_object_length(obj, size=arr.shape[axis]) + shape = tuple( + s + idx_length if i == axis else s for i, s in enumerate(arr.shape) + ) + + op = TensorInsert(index_obj=obj, values=values, axis=axis, dtype=arr.dtype) + return op(arr, obj, values, shape) diff --git a/python/xorbits/_mars/tensor/base/isin.py b/python/xorbits/_mars/tensor/base/isin.py new file mode 100644 index 000000000..95fe76e0d --- /dev/null +++ b/python/xorbits/_mars/tensor/base/isin.py @@ -0,0 +1,199 @@ +# Copyright 2022-2023 XProbe Inc. 
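`insert` computes the output shape up front: the length along the insert axis grows by the number of positions that `obj` describes (a scalar contributes one, a slice or sequence its length). `calc_object_length` is defined elsewhere in this package, so the helper below is only an illustrative stand-in for that rule:

```python
import numpy as np

def object_length(obj, size):
    """Illustrative stand-in: how many positions does `obj` describe for an axis of length `size`?"""
    if isinstance(obj, slice):
        return len(range(*obj.indices(size)))
    return np.size(obj)            # scalar -> 1, sequence -> its length

arr_shape, axis = (3, 2), 1
for obj, expected in [(1, (3, 3)), ([1, 1], (3, 4)), (slice(0, 2), (3, 4))]:
    grown = object_length(obj, arr_shape[axis])
    out_shape = tuple(s + grown if i == axis else s for i, s in enumerate(arr_shape))
    assert out_shape == expected
```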
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import BoolField +from ...typing import TileableType +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorIsIn(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.ISIN + + assume_unique = BoolField("assume_unique") + invert = BoolField("invert") + + def __call__(self, element, test_elements): + self.dtype = np.dtype(bool) + return self.new_tensor( + [element, test_elements], shape=element.shape, order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + from ..merge.stack import TensorStack + from ..reduction import TensorAll, TensorAny + + ar1, ar2 = op.inputs + invert = op.invert + out = op.outputs[0] + + out_chunks = [] + for ar1_chunk in ar1.chunks: + to_concat_chunks = [] + for ar2_chunk in ar2.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [ar1_chunk, ar2_chunk], + dtype=out.dtype, + shape=ar1_chunk.shape, + order=out.order, + index=ar1_chunk.index, + ) + to_concat_chunks.append(out_chunk) + if len(to_concat_chunks) == 1: + out_chunks.append(to_concat_chunks[0]) + else: + # concat chunks + concat_op = TensorStack(axis=0) + shape = (len(to_concat_chunks),) + ar1_chunk.shape + concat_chunk = concat_op.new_chunk( + to_concat_chunks, shape=shape, dtype=out.dtype, order=out.order + ) + if not invert: + chunk_op = TensorAny(axis=(0,), dtype=out.dtype) + out_chunk = chunk_op.new_chunk( + [concat_chunk], + shape=ar1_chunk.shape, + dtype=out.dtype, + order=out.order, + index=ar1_chunk.index, + ) + else: + chunk_op = TensorAll(axis=(0,), dtype=out.dtype) + out_chunk = chunk_op.new_chunk( + [concat_chunk], + shape=ar1_chunk.shape, + dtype=out.dtype, + order=out.order, + index=ar1_chunk.index, + ) + out_chunks.append(out_chunk) + + params = out.params.copy() + params["nsplits"] = ar1.nsplits + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op): + (element, test_elements), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.isin( + element, test_elements, assume_unique=op.assume_unique, invert=op.invert + ) + + +def isin( + element: Union[TileableType, np.ndarray], + test_elements: Union[TileableType, np.ndarray, list], + assume_unique: bool = False, + invert: bool = False, +): + """ + Calculates `element in test_elements`, broadcasting over `element` only. + Returns a boolean array of the same shape as `element` that is True + where an element of `element` is in `test_elements` and False otherwise. + + Parameters + ---------- + element : array_like + Input tensor. 
+ test_elements : array_like + The values against which to test each value of `element`. + This argument is flattened if it is a tensor or array_like. + See notes for behavior with non-array-like parameters. + assume_unique : bool, optional + If True, the input tensors are both assumed to be unique, which + can speed up the calculation. Default is False. + invert : bool, optional + If True, the values in the returned tensor are inverted, as if + calculating `element not in test_elements`. Default is False. + ``mt.isin(a, b, invert=True)`` is equivalent to (but faster + than) ``mt.invert(mt.isin(a, b))``. + + Returns + ------- + isin : Tensor, bool + Has the same shape as `element`. The values `element[isin]` + are in `test_elements`. + + See Also + -------- + in1d : Flattened version of this function. + + Notes + ----- + + `isin` is an element-wise function version of the python keyword `in`. + ``isin(a, b)`` is roughly equivalent to + ``mt.array([item in b for item in a])`` if `a` and `b` are 1-D sequences. + + `element` and `test_elements` are converted to tensors if they are not + already. If `test_elements` is a set (or other non-sequence collection) + it will be converted to an object tensor with one element, rather than a + tensor of the values contained in `test_elements`. This is a consequence + of the `tensor` constructor's way of handling non-sequence collections. + Converting the set to a list usually gives the desired behavior. + + Examples + -------- + >>> import mars.tensor as mt + + >>> element = 2*mt.arange(4).reshape((2, 2)) + >>> element.execute() + array([[0, 2], + [4, 6]]) + >>> test_elements = [1, 2, 4, 8] + >>> mask = mt.isin(element, test_elements) + >>> mask.execute() + array([[ False, True], + [ True, False]]) + >>> element[mask].execute() + array([2, 4]) + >>> mask = mt.isin(element, test_elements, invert=True) + >>> mask.execute() + array([[ True, False], + [ False, True]]) + >>> element[mask] + array([0, 6]) + + Because of how `array` handles sets, the following does not + work as expected: + + >>> test_set = {1, 2, 4, 8} + >>> mt.isin(element, test_set).execute() + array([[ False, False], + [ False, False]]) + + Casting the set to a list gives the expected result: + + >>> mt.isin(element, list(test_set)).execute() + array([[ False, True], + [ True, False]]) + """ + element, test_elements = astensor(element), astensor(test_elements).ravel() + op = TensorIsIn(assume_unique=assume_unique, invert=invert) + return op(element, test_elements) diff --git a/python/xorbits/_mars/tensor/base/map_chunk.py b/python/xorbits/_mars/tensor/base/map_chunk.py new file mode 100644 index 000000000..0f884c326 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/map_chunk.py @@ -0,0 +1,221 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
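The `tile` method above computes `isin` chunk by chunk: every chunk of `element` is tested against every chunk of `test_elements`, the per-pair boolean results are stacked, and the stack is reduced with `any` (or with `all` when ``invert=True``, since an element is absent overall only if it is absent from every test chunk). A NumPy sketch of that reduction under an assumed 1-D chunking of the test values:

```python
import numpy as np

element = np.array([0, 2, 4, 6])
test_chunks = [np.array([1, 2]), np.array([4, 8])]   # assumed chunking of test_elements

# plain isin: present in *any* test chunk
stacked = np.stack([np.isin(element, c) for c in test_chunks])
assert np.array_equal(stacked.any(axis=0),
                      np.isin(element, np.concatenate(test_chunks)))

# invert=True: absent from *all* test chunks
stacked_inv = np.stack([np.isin(element, c, invert=True) for c in test_chunks])
assert np.array_equal(stacked_inv.all(axis=0),
                      np.isin(element, np.concatenate(test_chunks), invert=True))
```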
import opcodes +from ...core import CHUNK_TYPE, ENTITY_TYPE, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import ( + BoolField, + DictField, + FunctionField, + TupleField, +) +from ...utils import ( + enter_current_session, + find_objects, + has_unknown_shape, + quiet_stdio, + replace_objects, +) +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorMapChunk(TensorOperand, TensorOperandMixin): + _op_type_ = opcodes.MAP_CHUNK + + _func = FunctionField("func") + _elementwise = BoolField("elementwise") + _args = TupleField("args") + _kwargs = DictField("kwargs") + _with_chunk_index = BoolField("with_chunk_index") + + def __init__( + self, + func=None, + args=None, + kwargs=None, + elementwise=None, + with_chunk_index=None, + **kw + ): + super().__init__( + _func=func, + _args=args, + _kwargs=kwargs, + _elementwise=elementwise, + _with_chunk_index=with_chunk_index, + **kw + ) + + @property + def func(self): + return self._func + + @property + def elementwise(self): + return self._elementwise + + @property + def args(self): + return self._args + + @property + def kwargs(self): + return self._kwargs + + @property + def with_chunk_index(self): + return self._with_chunk_index + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + old_inputs = find_objects(self._args, ENTITY_TYPE) + find_objects( + self._kwargs, ENTITY_TYPE + ) + mapping = {o: n for o, n in zip(old_inputs, self._inputs[1:])} + self._args = replace_objects(self._args, mapping) + self._kwargs = replace_objects(self._kwargs, mapping) + + def __call__(self, t, dtype=None, shape=None): + if dtype is None: + try: + kwargs = self.kwargs or dict() + if self.with_chunk_index: + kwargs["chunk_index"] = (0,) * t.ndim + with np.errstate(all="ignore"), quiet_stdio(): + mock_result = self.func( + np.random.rand(2, 2).astype(t.dtype), + *(self.args or ()), + **kwargs + ) + except: + raise TypeError("Cannot estimate output type of map_chunk call") + dtype = mock_result.dtype + + if shape is not None: + new_shape = shape + else: + new_shape = t.shape if self.elementwise else (np.nan,) * t.ndim + inputs = ( + [t] + + find_objects(self.args, ENTITY_TYPE) + + find_objects(self.kwargs, ENTITY_TYPE) + ) + return self.new_tensor(inputs, dtype=dtype, shape=new_shape) + + @classmethod + def tile(cls, op: "TensorMapChunk"): + inp = op.inputs[0] + out = op.outputs[0] + + new_inputs = [op.inputs[0]] + if has_unknown_shape(*op.inputs[1:]): + yield + for other_inp in op.inputs[1:]: + other_inp = yield from recursive_tile(other_inp.rechunk(other_inp.shape)) + new_inputs.append(other_inp) + + chunks = [] + for c in inp.chunks: + params = c.params + params["dtype"] = out.dtype + if not op.elementwise: + params["shape"] = (np.nan,) * out.ndim + params["index"] = params["index"][: out.ndim] + + new_op = op.copy().reset_key() + new_op.tileable_op_key = out.key + chunk_inputs = [c] + for other_inp in new_inputs[1:]: + chunk_inputs.append(other_inp.chunks[0]) + chunks.append(new_op.new_chunk(chunk_inputs, **params)) + + new_op = op.copy().reset_key() + params = out.params + nsplits = inp.nsplits[: out.ndim] + if not op.elementwise: + nsplits = tuple((np.nan,) * len(sp) for sp in nsplits) + return new_op.new_tileables(op.inputs, chunks=chunks, nsplits=nsplits, **params) + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op: "TensorMapChunk"): + in_data = ctx[op.inputs[0].key] + out_chunk = op.outputs[0] + + args = op.args or tuple() + kwargs 
= op.kwargs or dict() + if op.with_chunk_index: + kwargs["chunk_index"] = out_chunk.index + + chunks = find_objects(args, CHUNK_TYPE) + find_objects(kwargs, CHUNK_TYPE) + mapping = {chunk: ctx[chunk.key] for chunk in chunks} + args = replace_objects(args, mapping) + kwargs = replace_objects(kwargs, mapping) + + ctx[op.outputs[0].key] = op.func(in_data, *args, **kwargs) + + +def map_chunk(t, func, args=(), **kwargs): + """ + Apply function to each chunk. + + Parameters + ---------- + func : function + Function to apply to each chunk. + args : tuple + Positional arguments to pass to func in addition to the array. + **kwargs + Additional keyword arguments to pass as keywords arguments to func. + + Returns + ------- + Tensor + Result of applying ``func`` to each chunk of the Tensor. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[4, 9]] * 3) + >>> a.execute() + array([[4, 9], + [4, 9], + [4, 9]]) + + Output dtype will be auto inferred. + + >>> a.map_chunk(lambda c: c * 0.5).execute() + array([[2. , 4.5], + [2. , 4.5], + [2. , 4.5]]) + + You can specify ``dtype`` by yourself if auto infer failed. + """ + elementwise = kwargs.pop("elementwise", None) + dtype = np.dtype(kwargs.pop("dtype")) if "dtype" in kwargs else None + shape = kwargs.pop("shape", None) + with_chunk_index = kwargs.pop("with_chunk_index", False) + + op = TensorMapChunk( + func=func, + args=args, + kwargs=kwargs, + elementwise=elementwise, + with_chunk_index=with_chunk_index, + ) + return op(t, dtype=dtype, shape=shape) diff --git a/python/xorbits/_mars/tensor/base/moveaxis.py b/python/xorbits/_mars/tensor/base/moveaxis.py new file mode 100644 index 000000000..9caa0654e --- /dev/null +++ b/python/xorbits/_mars/tensor/base/moveaxis.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numpy.core.numeric import normalize_axis_tuple + +from ..datasource import tensor as astensor +from .transpose import transpose + + +def moveaxis(a, source, destination): + """ + Move axes of a tensor to new positions. + + Other axes remain in their original order. + + Parameters + ---------- + a : Tensor + The tensor whose axes should be reordered. + source : int or sequence of int + Original positions of the axes to move. These must be unique. + destination : int or sequence of int + Destination positions for each of the original axes. These must also be + unique. + + Returns + ------- + result : Tensor + Array with moved axes. This tensor is a view of the input tensor. + + See Also + -------- + transpose: Permute the dimensions of an array. + swapaxes: Interchange two axes of an array. 
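When the caller does not pass ``dtype``, `TensorMapChunk.__call__` above infers the output dtype by running the user function once on a tiny random array of the input dtype. A stripped-down sketch of that probing step, omitting the stdout-silencing and chunk-index plumbing used in the real operand:

```python
import numpy as np

def infer_map_chunk_dtype(func, input_dtype, *args, **kwargs):
    """Probe `func` on a small mock chunk to guess the output dtype."""
    mock = np.random.rand(2, 2).astype(input_dtype)
    with np.errstate(all="ignore"):
        try:
            return np.asarray(func(mock, *args, **kwargs)).dtype
        except Exception as exc:        # mirror the "cannot infer" failure mode
            raise TypeError("Cannot estimate output type of map_chunk call") from exc

assert infer_map_chunk_dtype(lambda c: c * 0.5, np.dtype(int)) == np.dtype(float)
assert infer_map_chunk_dtype(lambda c: c > 0, np.dtype(float)) == np.dtype(bool)
```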
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.zeros((3, 4, 5)) + >>> mt.moveaxis(x, 0, -1).shape + (4, 5, 3) + >>> mt.moveaxis(x, -1, 0).shape + (5, 3, 4), + + These all achieve the same result: + + >>> mt.transpose(x).shape + (5, 4, 3) + >>> mt.swapaxes(x, 0, -1).shape + (5, 4, 3) + >>> mt.moveaxis(x, [0, 1], [-1, -2]).shape + (5, 4, 3) + >>> mt.moveaxis(x, [0, 1, 2], [-1, -2, -3]).shape + (5, 4, 3) + + """ + a = astensor(a) + + source = normalize_axis_tuple(source, a.ndim, "source") + destination = normalize_axis_tuple(destination, a.ndim, "destination") + if len(source) != len(destination): + raise ValueError( + "`source` and `destination` arguments must have " + "the same number of elements" + ) + + order = [n for n in range(a.ndim) if n not in source] + + for dest, src in sorted(zip(destination, source)): + order.insert(dest, src) + + return transpose(a, order) diff --git a/python/xorbits/_mars/tensor/base/ndim.py b/python/xorbits/_mars/tensor/base/ndim.py new file mode 100644 index 000000000..cb7a8c9f1 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/ndim.py @@ -0,0 +1,53 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def ndim(a): + """ + Return the number of dimensions of a tensor. + + Parameters + ---------- + a : array_like + Input tebsir. If it is not already a tensor, a conversion is + attempted. + + Returns + ------- + number_of_dimensions : int + The number of dimensions in `a`. Scalars are zero-dimensional. + + See Also + -------- + ndarray.ndim : equivalent method + shape : dimensions of tensor + Tensor.shape : dimensions of tensor + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.ndim([[1,2,3],[4,5,6]]) + 2 + >>> mt.ndim(mt.array([[1,2,3],[4,5,6]])) + 2 + >>> mt.ndim(1) + 0 + + """ + from ..datasource import asarray + + try: + return a.ndim + except AttributeError: + return asarray(a).ndim diff --git a/python/xorbits/_mars/tensor/base/partition.py b/python/xorbits/_mars/tensor/base/partition.py new file mode 100644 index 000000000..c57f832a2 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/partition.py @@ -0,0 +1,774 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
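`moveaxis` above builds the permutation for `transpose` in two steps: keep every axis that is not being moved, then re-insert each moved axis at its destination, processing the pairs in destination order so earlier insertions do not displace later ones. A plain-Python sketch of just that ordering logic:

```python
def moveaxis_order(ndim, source, destination):
    """Compute the permutation that `transpose` receives (axes assumed already normalized)."""
    order = [n for n in range(ndim) if n not in source]   # axes that stay put, in order
    for dest, src in sorted(zip(destination, source)):    # re-insert moved axes at their targets
        order.insert(dest, src)
    return order

# moving axis 0 to the end of a 3-D tensor: shape (3, 4, 5) -> (4, 5, 3)
assert moveaxis_order(3, [0], [2]) == [1, 2, 0]
# moving axes 0 and 1 to the last two positions: shape (3, 4, 5) -> (5, 4, 3)
assert moveaxis_order(3, [0, 1], [2, 1]) == [2, 1, 0]
```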
import opcodes as OperandDef +from ...core import ENTITY_TYPE, ExecutableTuple, recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + Int32Field, + KeyField, + ListField, + StringField, +) +from ...utils import flatten, has_unknown_shape, stack_back +from ..array_utils import as_same_device, device +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorShuffleProxy +from ..utils import validate_axis, validate_order +from .psrs import TensorPSRSOperandMixin + + +class ParallelPartitionMixin(TensorPSRSOperandMixin): + @classmethod + def calc_paritions_info(cls, op, kth, size, sort_info_chunks): + # stage5, collect sort infos and calculate partition info for each partitions + if isinstance(kth, TENSOR_TYPE): + kth = kth.chunks[0] + is_kth_input = True + else: + is_kth_input = False + calc_op = CalcPartitionsInfo( + kth=kth, size=size, dtype=np.dtype(np.int32), gpu=op.gpu + ) + kws = [] + for i, sort_info_chunk in enumerate(sort_info_chunks): + kws.append( + { + "shape": sort_info_chunk.shape + (len(kth),), + "order": sort_info_chunk.order, + "index": sort_info_chunk.index, + "pos": i, + } + ) + inputs = list(sort_info_chunks) + if is_kth_input: + inputs.insert(0, kth) + return calc_op.new_chunks(inputs, kws=kws, output_limit=len(kws)) + + @classmethod + def partition_on_merged( + cls, + op, + need_align, + partition_merged_chunks, + partition_indices_chunks, + partition_info_chunks, + ): + # Stage 6: partition on each partitions + return_value, return_indices = op.return_value, op.return_indices + partitioned_chunks, partitioned_indices_chunks = [], [] + for i, partition_merged_chunk, partition_info_chunk in zip( + itertools.count(), partition_merged_chunks, partition_info_chunks + ): + partition_op = PartitionMerged( + return_value=return_value, + return_indices=return_indices, + order=op.order, + kind=op.kind, + need_align=need_align, + dtype=partition_merged_chunk.dtype, + gpu=op.gpu, + ) + chunk_inputs = [] + kws = [] + if return_value: + chunk_inputs.append(partition_merged_chunk) + kws.append( + { + "shape": partition_merged_chunk.shape, + "order": partition_merged_chunk.order, + "index": partition_merged_chunk.index, + "dtype": partition_merged_chunk.dtype, + "type": "partitioned", + } + ) + if return_indices: + if not return_value: + # value is required even it's not returned + chunk_inputs.append(partition_merged_chunk) + chunk_inputs.append(partition_indices_chunks[i]) + kws.append( + { + "shape": partition_merged_chunk.shape, + "order": TensorOrder.C_ORDER, + "index": partition_merged_chunk.index, + "dtype": np.dtype(np.int64), + "type": "argpartition", + } + ) + chunk_inputs.append(partition_info_chunk) + partition_chunks = partition_op.new_chunks(chunk_inputs, kws=kws) + if return_value: + partitioned_chunks.append(partition_chunks[0]) + if return_indices: + partitioned_indices_chunks.append(partition_chunks[-1]) + + return partitioned_chunks, partitioned_indices_chunks + + +class TensorPartition(TensorOperand, ParallelPartitionMixin): + _op_type_ = OperandDef.PARTITION + + _input = KeyField("input") + _kth = AnyField("kth") + _axis = Int32Field("axis") + _kind = StringField("kind") + _order = ListField("order", FieldTypes.string) + _need_align = BoolField("need_align") + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + + def __init__( + self, + kth=None, + axis=None, + kind=None, + 
order=None, + need_align=None, + return_value=None, + return_indices=None, + dtype=None, + gpu=None, + **kw, + ): + super().__init__( + _kth=kth, + _axis=axis, + _kind=kind, + _order=order, + _need_align=need_align, + _return_value=return_value, + _return_indices=return_indices, + dtype=dtype, + gpu=gpu, + **kw, + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(self._inputs) > 1: + self._kth = self._inputs[1] + + @property + def psrs_kinds(self): + # to keep compatibility with PSRS + # remember when merging data in PSRSShuffle(reduce), + # we don't need sort, thus set psrs_kinds[2] to None + return ["quicksort", "mergesort", None] + + @property + def need_align(self): + return self._need_align + + @property + def input(self): + return self._input + + @property + def kth(self): + return self._kth + + @property + def axis(self): + return self._axis + + @property + def kind(self): + return self._kind + + @property + def order(self): + return self._order + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def output_limit(self): + return int(bool(self._return_value)) + int(bool(self._return_indices)) + + def __call__(self, a, kth): + inputs = [a] + if isinstance(kth, TENSOR_TYPE): + inputs.append(kth) + kws = [] + if self._return_value: + kws.append( + { + "shape": a.shape, + "order": a.order, + "type": "sorted", + "dtype": a.dtype, + } + ) + if self._return_indices: + kws.append( + { + "shape": a.shape, + "order": TensorOrder.C_ORDER, + "type": "argsort", + "dtype": np.dtype(np.int64), + } + ) + ret = self.new_tensors(inputs, kws=kws) + if len(kws) == 1: + return ret[0] + return ExecutableTuple(ret) + + @classmethod + def _tile_psrs(cls, op, kth): + """ + Approach here would be almost like PSRSSorter, but there are definitely some differences + Main processes are listed below: + Stage 1, local sort and regular samples collected + State 2, gather and merge samples, choose and broadcast p-1 pivots + Stage 3, Local data is partitioned + Stage 4: all *ith* classes are gathered and merged, sizes should be calculated as well + Stage 5: collect sizes from partitions, calculate how to partition given kth + Stage 6: partition on each partitions + Stage 7: align if axis is given, and more than 1 dimension + """ + out_tensor = op.outputs[0] + return_value, return_indices = op.return_value, op.return_indices + # preprocess, to make sure chunk shape on axis are approximately same + in_tensor, axis_chunk_shape, out_idxes, need_align = yield from cls.preprocess( + op + ) + axis_offsets = [0] + np.cumsum(in_tensor.nsplits[op.axis]).tolist()[:-1] + + out_chunks, out_indices_chunks = [], [] + for out_idx in out_idxes: + # stage 1: local sort and regular samples collected + ( + sorted_chunks, + indices_chunks, + sampled_chunks, + ) = cls.local_sort_and_regular_sample( + op, in_tensor, axis_chunk_shape, axis_offsets, out_idx + ) + + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + concat_pivot_chunk = cls.concat_and_pivot( + op, axis_chunk_shape, out_idx, sorted_chunks, sampled_chunks + ) + + # stage 3: Local data is partitioned + partition_chunks = cls.partition_local_data( + op, axis_chunk_shape, sorted_chunks, indices_chunks, concat_pivot_chunk + ) + + proxy_chunk = TensorShuffleProxy(dtype=partition_chunks[0].dtype).new_chunk( + partition_chunks, shape=() + ) + + # stage 4: all *ith* classes are gathered and merged, + # note 
that we don't need sort here, op.psrs_kinds[2] is None + # force need_align=True to get sort info + ( + partition_merged_chunks, + partition_indices_chunks, + sort_info_chunks, + ) = cls.partition_merge_data(op, True, True, partition_chunks, proxy_chunk) + + # stage5, collect sort infos and calculate partition info for each partitions + partition_info_chunks = cls.calc_paritions_info( + op, kth, in_tensor.shape[op.axis], sort_info_chunks + ) + + # Stage 6: partition on each partitions + partitioned_chunks, partitioned_indices_chunks = cls.partition_on_merged( + op, + need_align, + partition_merged_chunks, + partition_indices_chunks, + partition_info_chunks, + ) + + if not need_align: + if return_value: + out_chunks.extend(partitioned_chunks) + if return_indices: + out_indices_chunks.extend(partitioned_indices_chunks) + else: + ( + align_reduce_chunks, + align_reduce_indices_chunks, + ) = cls.align_partitions_data( + op, + out_idx, + in_tensor, + partitioned_chunks, + partitioned_indices_chunks, + sort_info_chunks, + ) + if return_value: + out_chunks.extend(align_reduce_chunks) + if return_indices: + out_indices_chunks.extend(align_reduce_indices_chunks) + + new_op = op.copy() + nsplits = list(in_tensor.nsplits) + if not need_align: + nsplits[op.axis] = (np.nan,) * axis_chunk_shape + kws = [] + if return_value: + kws.append( + { + "shape": out_tensor.shape, + "order": out_tensor.order, + "chunks": out_chunks, + "nsplits": tuple(nsplits), + "dtype": out_tensor.dtype, + "type": "partitioned", + } + ) + if return_indices: + kws.append( + { + "shape": out_tensor.shape, + "order": TensorOrder.C_ORDER, + "chunks": out_indices_chunks, + "nsplits": tuple(nsplits), + "dtype": np.dtype(np.int64), + "type": "argpartition", + } + ) + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def tile(cls, op): + in_tensor = op.input + if np.isnan(in_tensor.shape[op.axis]): + yield + + kth = op.kth + if isinstance(kth, TENSOR_TYPE): + # if `kth` is a tensor, make sure no unknown shape + if has_unknown_shape(kth): + yield + kth = yield from recursive_tile(kth.rechunk(kth.shape)) + + return_value, return_indices = op.return_value, op.return_indices + if in_tensor.chunk_shape[op.axis] == 1: + out_chunks, out_indices_chunks = [], [] + for chunk in in_tensor.chunks: + chunk_op = op.copy().reset_key() + kws = [] + if return_value: + kws.append( + { + "shape": chunk.shape, + "index": chunk.index, + "order": chunk.order, + "dtype": chunk.dtype, + "type": "partitioned", + } + ) + if return_indices: + kws.append( + { + "shape": chunk.shape, + "index": chunk.index, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "type": "argpartition", + } + ) + chunk_inputs = [chunk] + if isinstance(kth, TENSOR_TYPE): + chunk_inputs.append(kth.chunks[0]) + chunks = chunk_op.new_chunks(chunk_inputs, kws=kws) + if return_value: + out_chunks.append(chunks[0]) + if return_indices: + out_indices_chunks.append(chunks[-1]) + + new_op = op.copy() + kws = [out.params for out in op.outputs] + if return_value: + kws[0]["nsplits"] = in_tensor.nsplits + kws[0]["chunks"] = out_chunks + if return_indices: + kws[-1]["nsplits"] = in_tensor.nsplits + kws[-1]["chunks"] = out_indices_chunks + return new_op.new_tensors([in_tensor], kws=kws) + else: + return (yield from cls._tile_psrs(op, kth)) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + a = inputs[0] + if len(inputs) == 2: + kth = inputs[1] + else: + kth = 
op.kth + return_value, return_indices = op.return_value, op.return_indices + + with device(device_id): + kw = {} + if op.kind is not None: + kw["kind"] = op.kind + if op.order is not None: + kw["order"] = op.order + + if return_indices: + if not return_value: + ctx[op.outputs[0].key] = xp.argpartition(a, kth, axis=op.axis, **kw) + else: + argparts = ctx[op.outputs[1].key] = xp.argpartition( + a, kth, axis=op.axis, **kw + ) + ctx[op.outputs[0].key] = xp.take_along_axis(a, argparts, op.axis) + else: + ctx[op.outputs[0].key] = xp.partition(a, kth, axis=op.axis, **kw) + + +class CalcPartitionsInfo(TensorOperand, TensorPSRSOperandMixin): + _op_type_ = OperandDef.CALC_PARTITIONS_INFO + + _kth = AnyField("kth") + _size = Int32Field("size") + + def __init__(self, kth=None, size=None, dtype=None, gpu=None, **kw): + super().__init__(_kth=kth, _size=size, dtype=dtype, gpu=gpu, **kw) + + @property + def kth(self): + return self._kth + + @property + def size(self): + return self._size + + @property + def output_limit(self): + return np.inf + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if isinstance(self._kth, ENTITY_TYPE): + self._kth = self._inputs[0] + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if isinstance(op.kth, TENSOR_CHUNK_TYPE): + kth = inputs[0] + sort_infos = inputs[1:] + # make kth all positive + kth = _validate_kth_value(kth, op.size) + else: + kth = op.kth + sort_infos = inputs + + sort_info_shape = sort_infos[0].shape + # create arrays filled with -1, -1 means do nothing about partition + partition_infos = [ + xp.full(sort_info_shape + (len(kth),), -1) for _ in sort_infos + ] + concat_sort_info = xp.stack([sort_info.ravel() for sort_info in sort_infos]) + cumsum_sort_info = xp.cumsum(concat_sort_info, axis=0) + + for j in range(cumsum_sort_info.shape[1]): + idx = xp.unravel_index(j, sort_infos[0].shape) + sizes = cumsum_sort_info[:, j] + to_partition_chunk_idxes = xp.searchsorted(sizes, kth, side="right") + for i, to_partition_chunk_idx in enumerate(to_partition_chunk_idxes): + partition_idx = tuple(idx) + (i,) + k = kth[i] + # if to partition on chunk 0, just set to kth + # else kth - {size of previous chunks} + chunk_k = ( + k + if to_partition_chunk_idx == 0 + else k - sizes[to_partition_chunk_idx - 1] + ) + partition_infos[to_partition_chunk_idx][partition_idx] = chunk_k + + for out, partition_info in zip(op.outputs, partition_infos): + ctx[out.key] = partition_info + + +class PartitionMerged(TensorOperand, TensorPSRSOperandMixin): + _op_type_ = OperandDef.PARTITION_MERGED + + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + _order = ListField("order", FieldTypes.string) + _kind = StringField("kind") + _need_align = BoolField("need_align") + + def __init__( + self, + return_value=None, + return_indices=None, + order=None, + kind=None, + need_align=None, + **kw, + ): + super().__init__( + _return_value=return_value, + _return_indices=return_indices, + _order=order, + _kind=kind, + _need_align=need_align, + **kw, + ) + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def order(self): + return self._order + + @property + def kind(self): + return self._kind + + @property + def need_align(self): + return self._need_align + + @property + def output_limit(self): + return 
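`CalcPartitionsInfo.execute` above answers one question per requested ``kth``: after the shuffle, which merged partition holds the k-th element overall, and what is the partition-local k? It does so with a cumulative sum of the per-partition sizes followed by ``searchsorted``. A NumPy sketch of that lookup for a single sort lane, with hypothetical sizes:

```python
import numpy as np

sizes = np.array([4, 3, 5])               # hypothetical lengths of the merged partitions
kth = np.array([2, 6, 11])                # global positions to partition on

cumsum = np.cumsum(sizes)                  # [4, 7, 12]
owners = np.searchsorted(cumsum, kth, side="right")   # which partition owns each kth
local_kth = np.where(owners == 0, kth, kth - cumsum[owners - 1])

assert owners.tolist() == [0, 1, 2]        # kth=2 -> partition 0, 6 -> 1, 11 -> 2
assert local_kth.tolist() == [2, 2, 4]     # positions inside the owning partition
```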
int(bool(self._return_value)) + int(bool(self._return_indices)) + + @classmethod + def execute(cls, ctx, op): + return_value, return_indices = op.return_value, op.return_indices + + raw_inputs = [ctx[inp.key] for inp in op.inputs] + flatten_inputs = flatten(raw_inputs) + inputs, device_id, xp = as_same_device( + flatten_inputs, device=op.device, ret_extra=True + ) + inputs = stack_back(inputs, raw_inputs) + partition_info = inputs[-1] + merged_data, merged_indices = None, None + if return_value: + merged_data = inputs[0] + if return_indices: + # if return indices, value should be returned + assert len(inputs) == 3 + if not return_value: + merged_data = inputs[0] + merged_indices = inputs[1] + + outs, out_indices = [], [] + with device(device_id): + kw = {} + if op.kind is not None: + kw["kind"] = op.kind + if op.order is not None: + kw["order"] = op.order + + ravel_partition_info = partition_info.reshape(-1, partition_info.shape[-1]) + for i, merged_vec, kth in zip( + itertools.count(), merged_data, ravel_partition_info + ): + kth = kth[kth > -1] + if kth.size == 0: + if return_value: + outs.append(merged_vec) + if return_indices: + out_indices.append(merged_indices[i]) + else: + if return_indices: + argparts = xp.argpartition(merged_vec, kth, **kw) + if return_value: + outs.append(xp.take(merged_vec, argparts)) + out_indices.append(xp.take(merged_indices[i], argparts)) + else: + outs.append(xp.partition(merged_vec, kth, **kw)) + + if not op.need_align: + assert len(outs or out_indices) == 1 + i = 0 + if return_value: + ctx[op.outputs[0].key] = outs[0] + i += 1 + if return_indices: + ctx[op.outputs[i].key] = out_indices[0] + else: + i = 0 + if return_value: + ctx[op.outputs[0].key] = tuple(outs) + i += 1 + if return_indices: + ctx[op.outputs[i].key] = tuple(out_indices) + + +def _check_kth_dtype(dtype): + if not np.issubdtype(dtype, np.integer): + raise TypeError("Partition index must be integer") + + +def _validate_kth_value(kth, size): + kth = np.where(kth < 0, kth + size, kth) + if np.any((kth < 0) | (kth >= size)): + invalid_kth = next(k for k in kth if k < 0 or k >= size) + raise ValueError(f"kth(={invalid_kth}) out of bounds ({size})") + return kth + + +def _validate_partition_arguments(a, kth, axis, kind, order, kw): + a = astensor(a) + if axis is None: + a = a.flatten() + axis = 0 + else: + axis = validate_axis(a.ndim, axis) + if isinstance(kth, ENTITY_TYPE): + kth = astensor(kth) + _check_kth_dtype(kth.dtype) + else: + kth = np.atleast_1d(kth) + kth = _validate_kth_value(kth, a.shape[axis]) + if kth.ndim > 1: + raise ValueError("object too deep for desired array") + if kind != "introselect": + raise ValueError(f"{kind} is an unrecognized kind of select") + # if a is structure type and order is not None + order = validate_order(a.dtype, order) + need_align = kw.pop("need_align", None) + if len(kw) > 0: + raise TypeError( + f"partition() got an unexpected keyword argument '{next(iter(kw))}'" + ) + + return a, kth, axis, kind, order, need_align + + +def partition(a, kth, axis=-1, kind="introselect", order=None, **kw): + r""" + Return a partitioned copy of a tensor. + + Creates a copy of the tensor with its elements rearranged in such a + way that the value of the element in k-th position is in the + position it would be in a sorted tensor. All elements smaller than + the k-th element are moved before this element and all equal or + greater are moved behind it. The ordering of the elements in the two + partitions is undefined. 
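`_validate_kth_value` above normalizes the requested indices before any tiling happens: negative ``kth`` values wrap around the axis length, and anything still out of range raises. The equivalent standalone check:

```python
import numpy as np

def normalize_kth(kth, size):
    """Wrap negative kth values and reject out-of-range ones, as the validator above does."""
    kth = np.atleast_1d(kth)
    kth = np.where(kth < 0, kth + size, kth)
    if np.any((kth < 0) | (kth >= size)):
        bad = next(int(k) for k in kth if k < 0 or k >= size)
        raise ValueError(f"kth(={bad}) out of bounds ({size})")
    return kth

assert normalize_kth([-1, 2], size=6).tolist() == [5, 2]
try:
    normalize_kth(7, size=6)
except ValueError as e:
    assert "out of bounds" in str(e)
```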
+ + Parameters + ---------- + a : array_like + Tensor to be sorted. + kth : int or sequence of ints + Element index to partition by. The k-th value of the element + will be in its final sorted position and all smaller elements + will be moved before it and all equal or greater elements behind + it. The order of all elements in the partitions is undefined. If + provided with a sequence of k-th it will partition all elements + indexed by k-th of them into their sorted position at once. + axis : int or None, optional + Axis along which to sort. If None, the tensor is flattened before + sorting. The default is -1, which sorts along the last axis. + kind : {'introselect'}, optional + Selection algorithm. Default is 'introselect'. + order : str or list of str, optional + When `a` is a tensor with fields defined, this argument + specifies which fields to compare first, second, etc. A single + field can be specified as a string. Not all fields need be + specified, but unspecified fields will still be used, in the + order in which they come up in the dtype, to break ties. + + Returns + ------- + partitioned_tensor : Tensor + Tensor of the same type and shape as `a`. + + See Also + -------- + Tensor.partition : Method to sort a tensor in-place. + argpartition : Indirect partition. + sort : Full sorting + + Notes + ----- + The various selection algorithms are characterized by their average + speed, worst case performance, work space size, and whether they are + stable. A stable sort keeps items with the same key in the same + relative order. The available algorithms have the following + properties: + + ================= ======= ============= ============ ======= + kind speed worst case work space stable + ================= ======= ============= ============ ======= + 'introselect' 1 O(n) 0 no + ================= ======= ============= ============ ======= + + All the partition algorithms make temporary copies of the data when + partitioning along any but the last axis. Consequently, + partitioning along the last axis is faster and uses less space than + partitioning along any other axis. + + The sort order for complex numbers is lexicographic. If both the + real and imaginary parts are non-nan then the order is determined by + the real parts except when they are equal, in which case the order + is determined by the imaginary parts. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([3, 4, 2, 1]) + >>> mt.partition(a, 3).execute() + array([2, 1, 3, 4]) + + >>> mt.partition(a, (1, 3)).execute() + array([1, 2, 3, 4]) + """ + return_indices = kw.pop("return_index", False) + a, kth, axis, kind, order, need_align = _validate_partition_arguments( + a, kth, axis, kind, order, kw + ) + op = TensorPartition( + kth=kth, + axis=axis, + kind=kind, + order=order, + need_align=need_align, + return_value=True, + return_indices=return_indices, + dtype=a.dtype, + gpu=a.op.gpu, + ) + return op(a, kth) diff --git a/python/xorbits/_mars/tensor/base/psrs.py b/python/xorbits/_mars/tensor/base/psrs.py new file mode 100644 index 000000000..ff8c712fc --- /dev/null +++ b/python/xorbits/_mars/tensor/base/psrs.py @@ -0,0 +1,993 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
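The public ``partition`` wrapper also accepts a ``return_index`` keyword; when it is set, the execute path shown earlier obtains indices with ``argpartition`` and recovers the values with ``take_along_axis``. The same pairing in plain NumPy, checking only the invariants that a partition guarantees:

```python
import numpy as np

a = np.array([[3, 4, 2, 1],
              [9, 7, 8, 6]])
kth, axis = 2, -1

idx = np.argpartition(a, kth, axis=axis)        # indices that partition each row
vals = np.take_along_axis(a, idx, axis=axis)    # partitioned values recovered from the indices

# the kth column holds each row's kth-smallest element ...
assert vals[:, kth].tolist() == [3, 8]
# ... with nothing larger before it and nothing smaller after it
assert (vals[:, :kth] <= vals[:, [kth]]).all()
assert (vals[:, kth + 1:] >= vals[:, [kth]]).all()
```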
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from functools import partial + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + Int32Field, + ListField, + StringField, +) +from ...utils import flatten, stack_back +from ..array_utils import as_same_device, cp, device +from ..core import TensorOrder +from ..operands import ( + TensorMapReduceOperand, + TensorOperand, + TensorOperandMixin, + TensorShuffleProxy, +) + + +class PSRSOperandMixin: + @classmethod + def preprocess(cls, op, in_data=None): + if in_data is None: + in_data = op.inputs[0] + axis_shape = in_data.shape[op.axis] + axis_chunk_shape = in_data.chunk_shape[op.axis] + + # rechunk to ensure all chunks on axis have rough same size + has_unknown_shape = False + for ns in in_data.nsplits: + if any(np.isnan(s) for s in ns): + has_unknown_shape = True + break + + if not has_unknown_shape: + axis_chunk_shape = min(axis_chunk_shape, int(np.sqrt(axis_shape))) + if np.isnan(axis_shape) or any( + np.isnan(s) for s in in_data.nsplits[op.axis] + ): + yield + chunk_size = int(axis_shape / axis_chunk_shape) + chunk_sizes = [chunk_size for _ in range(int(axis_shape // chunk_size))] + if axis_shape % chunk_size > 0: + chunk_sizes[-1] += axis_shape % chunk_size + in_data = yield from recursive_tile( + in_data.rechunk({op.axis: tuple(chunk_sizes)}) + ) + axis_chunk_shape = in_data.chunk_shape[op.axis] + + left_chunk_shape = ( + in_data.chunk_shape[: op.axis] + in_data.chunk_shape[op.axis + 1 :] + ) + if len(left_chunk_shape) > 0: + out_idxes = itertools.product(*(range(s) for s in left_chunk_shape)) + else: + out_idxes = [()] + # if the size except axis has more than 1, the sorted values on each one may be different + # another shuffle would be required to make sure each axis except to sort + # has elements with identical size + extra_shape = [s for i, s in enumerate(in_data.shape) if i != op.axis] + if getattr(op, "need_align", None) is None: + need_align = bool(np.prod(extra_shape, dtype=int) != 1) + else: + need_align = op.need_align + + return in_data, axis_chunk_shape, out_idxes, need_align + + @classmethod + def local_sort_and_regular_sample( + cls, op, in_data, axis_chunk_shape, axis_offsets, out_idx + ): + raise NotImplementedError + + @classmethod + def concat_and_pivot( + cls, op, axis_chunk_shape, out_idx, sorted_chunks, sampled_chunks + ): + raise NotImplementedError + + @classmethod + def partition_local_data( + cls, op, axis_chunk_shape, sorted_chunks, indices_chunks, concat_pivot_chunk + ): + raise NotImplementedError + + @classmethod + def partition_merge_data( + cls, op, need_align, return_value, partition_chunks, proxy_chunk + ): + raise NotImplementedError + + @classmethod + def align_partitions_data( + cls, + op, + out_idx, + in_data, + partition_sort_chunks, + partition_indices_chunks, + sort_info_chunks, + ): + raise NotImplementedError + + +class TensorPSRSOperandMixin(TensorOperandMixin, PSRSOperandMixin): + @classmethod + def local_sort_and_regular_sample( + cls, op, 
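`preprocess` above first normalizes the chunking along the sort axis: when shapes are known it caps the number of chunks at roughly the square root of the axis length and rechunks to near-equal pieces, folding any remainder into the last chunk. A small sketch of that size calculation with hypothetical numbers:

```python
import numpy as np

def psrs_chunk_sizes(axis_shape, axis_chunk_shape):
    """Cap the chunk count at ~sqrt(n) and split the axis into near-equal pieces."""
    axis_chunk_shape = min(axis_chunk_shape, int(np.sqrt(axis_shape)))
    chunk_size = int(axis_shape / axis_chunk_shape)
    chunk_sizes = [chunk_size] * (axis_shape // chunk_size)
    if axis_shape % chunk_size > 0:
        chunk_sizes[-1] += axis_shape % chunk_size   # remainder goes to the last chunk
    return chunk_sizes

assert psrs_chunk_sizes(100, 20) == [10] * 10        # capped at sqrt(100) = 10 chunks
assert sum(psrs_chunk_sizes(103, 20)) == 103          # remainder folded into the last chunk
```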
in_data, axis_chunk_shape, axis_offsets, out_idx + ): + # stage 1: local sort and regular samples collected + sorted_chunks, indices_chunks, sampled_chunks = [], [], [] + sampled_dtype = ( + np.dtype([(o, in_data.dtype[o]) for o in op.order]) + if op.order is not None + else in_data.dtype + ) + for i in range(axis_chunk_shape): + idx = list(out_idx) + idx.insert(op.axis, i) + in_chunk = in_data.cix[tuple(idx)] + kind = None if op.psrs_kinds is None else op.psrs_kinds[0] + chunk_op = PSRSSortRegularSample( + axis=op.axis, + order=op.order, + kind=kind, + return_indices=op.return_indices, + n_partition=axis_chunk_shape, + axis_offset=axis_offsets[i], + gpu=op.gpu, + ) + kws = [] + sort_shape = in_chunk.shape + kws.append( + { + "shape": sort_shape, + "order": in_chunk.order, + "dtype": in_chunk.dtype, + "index": in_chunk.index, + "type": "sorted", + } + ) + if op.return_indices: + kws.append( + { + "shape": sort_shape, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "index": in_chunk.index, + "type": "argsort", + } + ) + sampled_shape = (axis_chunk_shape,) + kws.append( + { + "shape": sampled_shape, + "order": in_chunk.order, + "dtype": sampled_dtype, + "index": (i,), + "type": "regular_sampled", + } + ) + chunks = chunk_op.new_chunks([in_chunk], kws=kws, output_limit=len(kws)) + if len(chunks) == 2: + sort_chunk, sampled_chunk = chunks + sorted_chunks.append(sort_chunk) + sampled_chunks.append(sampled_chunk) + else: + sort_chunk, indices_chunk, sampled_chunk = chunks + sorted_chunks.append(sort_chunk) + indices_chunks.append(indices_chunk) + sampled_chunks.append(sampled_chunk) + + return sorted_chunks, indices_chunks, sampled_chunks + + @classmethod + def concat_and_pivot( + cls, op, axis_chunk_shape, out_idx, sorted_chunks, sampled_chunks + ): + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + concat_pivot_op = PSRSConcatPivot( + axis=op.axis, + order=op.order, + kind=None if op.psrs_kinds is None else op.psrs_kinds[1], + dtype=sampled_chunks[0].dtype, + gpu=op.gpu, + ) + concat_pivot_shape = ( + sorted_chunks[0].shape[: op.axis] + + (axis_chunk_shape - 1,) + + sorted_chunks[0].shape[op.axis + 1 :] + ) + concat_pivot_index = out_idx[: op.axis] + (0,) + out_idx[op.axis :] + concat_pivot_chunk = concat_pivot_op.new_chunk( + sampled_chunks, shape=concat_pivot_shape, index=concat_pivot_index + ) + return concat_pivot_chunk + + @classmethod + def partition_local_data( + cls, op, axis_chunk_shape, sorted_chunks, indices_chunks, concat_pivot_chunk + ): + # stage 3: Local data is partitioned + return_value = op.return_value + return_indices = op.return_indices + if return_indices: + # if return indices and psrs_kind[2] is not None + # value has to be output + map_return_value = True + else: + map_return_value = return_value + partition_chunks = [] + length = len(sorted_chunks or indices_chunks) + for i in range(length): + chunk_inputs = [] + if sorted_chunks: + chunk_inputs.append(sorted_chunks[i]) + if indices_chunks: + chunk_inputs.append(indices_chunks[i]) + chunk_inputs.append(concat_pivot_chunk) + partition_shuffle_map = PSRSShuffle( + return_value=map_return_value, + return_indices=return_indices, + stage=OperandStage.map, + axis=op.axis, + n_partition=axis_chunk_shape, + input_sorted=op.psrs_kinds[0] is not None, + order=op.order, + dtype=chunk_inputs[0].dtype, + gpu=chunk_inputs[0].op.gpu, + ) + partition_chunk = partition_shuffle_map.new_chunk( + chunk_inputs, + shape=chunk_inputs[0].shape, + index=chunk_inputs[0].index, + 
order=chunk_inputs[0].order, + ) + partition_chunks.append(partition_chunk) + return partition_chunks + + @classmethod + def partition_merge_data( + cls, op, need_align, return_value, partition_chunks, proxy_chunk + ): + # stage 4: all *ith* classes are gathered and merged + return_value = return_value if return_value is not None else op.return_value + return_indices = op.return_indices + partition_sort_chunks, partition_indices_chunks, sort_info_chunks = [], [], [] + for i, partition_chunk in enumerate(partition_chunks): + kind = None if op.psrs_kinds is None else op.psrs_kinds[2] + partition_shuffle_reduce = PSRSShuffle( + return_value=return_value, + return_indices=return_indices, + stage=OperandStage.reduce, + axis=op.axis, + order=op.order, + kind=kind, + reducer_index=(i,), + n_reducers=len(partition_chunks), + dtype=partition_chunk.dtype, + gpu=partition_chunk.op.gpu, + need_align=need_align, + ) + kws = [] + chunk_shape = list(partition_chunk.shape) + chunk_shape[op.axis] = np.nan + if return_value: + kws.append( + { + "shape": tuple(chunk_shape), + "order": partition_chunk.order, + "index": partition_chunk.index, + "dtype": partition_chunk.dtype, + "type": "sorted", + } + ) + if return_indices: + kws.append( + { + "shape": tuple(chunk_shape), + "order": TensorOrder.C_ORDER, + "index": partition_chunk.index, + "dtype": np.dtype(np.int64), + "type": "argsort", + } + ) + if need_align: + s = list(chunk_shape) + s.pop(op.axis) + kws.append( + { + "shape": tuple(s), + "order": TensorOrder.C_ORDER, + "index": partition_chunk.index, + "dtype": np.dtype(np.int32), + "type": "sort_info", + } + ) + cs = partition_shuffle_reduce.new_chunks([proxy_chunk], kws=kws) + i = 0 + if return_value: + partition_sort_chunks.append(cs[0]) + i += 1 + if return_indices: + partition_indices_chunks.append(cs[i]) + if need_align: + sort_info_chunks.append(cs[-1]) + + return partition_sort_chunks, partition_indices_chunks, sort_info_chunks + + @classmethod + def align_partitions_data( + cls, + op, + out_idx, + in_tensor, + partition_sort_chunks, + partition_indices_chunks, + sort_info_chunks, + ): + return_value, return_indices = op.return_value, op.return_indices + align_map_chunks = [] + length = len(partition_sort_chunks or partition_indices_chunks) + for i in range(length): + chunk_inputs = [] + if return_value: + chunk_inputs.append(partition_sort_chunks[i]) + if return_indices: + chunk_inputs.append(partition_indices_chunks[i]) + chunk_inputs.extend(sort_info_chunks) + align_map_op = PSRSAlign( + return_value=return_value, + return_indices=return_indices, + stage=OperandStage.map, + axis=op.axis, + output_sizes=list(in_tensor.nsplits[op.axis]), + dtype=chunk_inputs[0].dtype, + gpu=chunk_inputs[0].op.gpu, + ) + align_map_chunk = align_map_op.new_chunk( + chunk_inputs, + shape=chunk_inputs[0].shape, + index=chunk_inputs[0].index, + order=TensorOrder.C_ORDER, + ) + align_map_chunks.append(align_map_chunk) + proxy_chunk = TensorShuffleProxy(dtype=align_map_chunks[0].dtype).new_chunk( + align_map_chunks, shape=() + ) + align_reduce_value_chunks, align_reduce_indices_chunks = [], [] + for i, align_map_chunk in enumerate(align_map_chunks): + align_reduce_op = PSRSAlign( + return_value=return_value, + return_indices=return_indices, + stage=OperandStage.reduce, + axis=op.axis, + reducer_index=(i,), + n_reducers=len(align_map_chunks), + dtype=align_map_chunk.dtype, + gpu=align_map_chunk.op.gpu, + ) + idx = list(out_idx) + idx.insert(op.axis, i) + in_chunk = in_tensor.cix[tuple(idx)] + kws = [] + if return_value: 
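+                # the reduce chunk reuses the shape and index of the matching input
+                # chunk (in_tensor.cix), so the aligned output keeps the input
+                # tensor's original chunk layout along the sorted axis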
+                kws.append(
+                    {
+                        "shape": in_chunk.shape,
+                        "index": in_chunk.index,
+                        "order": in_chunk.order,
+                        "dtype": in_chunk.dtype,
+                        "type": "sorted",
+                    }
+                )
+            if return_indices:
+                kws.append(
+                    {
+                        "shape": in_chunk.shape,
+                        "index": in_chunk.index,
+                        "order": TensorOrder.C_ORDER,
+                        "dtype": np.dtype(np.int64),
+                        "type": "argsort",
+                    }
+                )
+            align_reduce_chunks = align_reduce_op.new_chunks([proxy_chunk], kws=kws)
+            if return_value:
+                align_reduce_value_chunks.append(align_reduce_chunks[0])
+            if return_indices:
+                align_reduce_indices_chunks.append(align_reduce_chunks[-1])
+
+        return align_reduce_value_chunks, align_reduce_indices_chunks
+
+
+def _sort(a, op, xp, axis=None, kind=None, order=None, inplace=False):
+    axis = axis if axis is not None else op.axis
+    kind = kind if kind is not None else op.kind
+    order = order if order is not None else op.order
+    if xp is np:
+        method = a.sort if inplace else partial(np.sort, a)
+        return method(axis=axis, kind=kind, order=order)
+    else:  # pragma: no cover
+        # cupy does not support structure type, so `order` must not be set here
+        assert xp is cp
+        assert order is None
+        method = a.sort if inplace else partial(cp.sort, a)
+        # cupy does not support kind, thus just ignore it
+        return method(axis=axis)
+
+
+def _argsort(a, op, xp, axis=None, kind=None, order=None):
+    axis = axis if axis is not None else op.axis
+    kind = kind if kind is not None else op.kind
+    order = order if order is not None else op.order
+    if xp is np:
+        return np.argsort(a, axis=axis, kind=kind, order=order)
+    else:  # pragma: no cover
+        # cupy does not support structure type, so `order` must not be set here
+        assert xp is cp
+        assert order is None
+        return cp.argsort(a, axis=axis)
+
+
+class PSRSSortRegularSample(TensorOperand, TensorOperandMixin):
+    _op_type_ = OperandDef.PSRS_SORT_REGULAR_SMAPLE
+
+    _axis = Int32Field("axis")
+    _order = ListField("order", FieldTypes.string)
+    _kind = StringField("kind")
+    _return_indices = BoolField("return_indices")
+    _n_partition = Int32Field("n_partition")
+    _axis_offset = AnyField("axis_offset")
+
+    def __init__(
+        self,
+        axis=None,
+        order=None,
+        kind=None,
+        return_indices=None,
+        n_partition=None,
+        axis_offset=None,
+        dtype=None,
+        gpu=None,
+        **kw
+    ):
+        super().__init__(
+            _axis=axis,
+            _order=order,
+            _kind=kind,
+            _return_indices=return_indices,
+            _n_partition=n_partition,
+            _axis_offset=axis_offset,
+            dtype=dtype,
+            gpu=gpu,
+            **kw
+        )
+
+    @property
+    def axis(self):
+        return self._axis
+
+    @property
+    def order(self):
+        return self._order
+
+    @property
+    def kind(self):
+        return self._kind
+
+    @property
+    def return_indices(self):
+        return self._return_indices
+
+    @property
+    def n_partition(self):
+        return self._n_partition
+
+    @property
+    def axis_offset(self):
+        return self._axis_offset
+
+    @property
+    def output_limit(self):
+        # return sorted tensor, indices(optional) and regular sampled tensor
+        return 2 if not self._return_indices else 3
+
+    @classmethod
+    def execute(cls, ctx, op):
+        (a,), device_id, xp = as_same_device(
+            [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True
+        )
+
+        if len(a) == 0:
+            # when chunk is empty, return the empty chunk itself
+            ctx[op.outputs[0].key] = ctx[op.outputs[-1].key] = a
+            return
+
+        with device(device_id):
+            n = op.n_partition
+            w = a.shape[op.axis] * 1.0 / (n + 1)
+            if not op.return_indices:
+                if op.kind is not None:
+                    # sort
+                    res = ctx[op.outputs[0].key] = _sort(a, op, xp)
+                else:
+                    # do not sort, prepare for sample by `xp.partition`
+                    kth = xp.linspace(
+                        max(w - 1, 0), a.shape[op.axis] - 1, num=n, endpoint=False
+                    ).astype(int)
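+                    # kth matches the regular-sample positions taken further below
+                    # (same linspace), so xp.partition already places the exact sample
+                    # values at those positions and a full sort is unnecessary here.
+                    # illustrative check with plain NumPy:
+                    #   a = np.array([9, 3, 7, 1, 5]); kth = np.array([1, 3])
+                    #   np.partition(a, kth)[kth]  ->  array([3, 7]) == np.sort(a)[kth]
+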
ctx[op.outputs[0].key] = res = xp.partition( + a, kth, axis=op.axis, order=op.order + ) + else: + if op.kind is not None: + # argsort + indices = _argsort(a, op, xp) + else: + # do not sort, use `xp.argpartition` + kth = xp.linspace( + max(w - 1, 0), a.shape[op.axis] - 1, num=n, endpoint=False + ).astype(int) + indices = xp.argpartition(a, kth, axis=op.axis, order=op.order) + ctx[op.outputs[0].key] = res = xp.take_along_axis(a, indices, op.axis) + ctx[op.outputs[1].key] = op.axis_offset + indices + + # do regular sample + if op.order is not None: + res = res[op.order] + slc = xp.linspace( + max(w - 1, 0), a.shape[op.axis] - 1, num=n, endpoint=False + ).astype(int) + slc = (slice(None),) * op.axis + (slc,) + ctx[op.outputs[-1].key] = res[slc] + + +class PSRSConcatPivot(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.PSRS_CONCAT_PIVOT + + _axis = Int32Field("axis") + _order = ListField("order", FieldTypes.string) + _kind = StringField("kind") + + def __init__(self, axis=None, order=None, kind=None, dtype=None, gpu=None, **kw): + super().__init__( + _axis=axis, _order=order, _kind=kind, dtype=dtype, gpu=gpu, **kw + ) + + @property + def axis(self): + return self._axis + + @property + def order(self): + return self._order + + @property + def kind(self): + return self._kind + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs if len(ctx[c.key]) > 0], + device=op.device, + ret_extra=True, + ) + + with device(device_id): + a = xp.concatenate(inputs, axis=op.axis) + p = len(inputs) + assert a.shape[op.axis] == p * len(op.inputs) + + if op.kind is not None: + # sort + _sort(a, op, xp, inplace=True) + else: + # prepare for sampling via `partition` + kth = xp.linspace( + p - 1, a.shape[op.axis] - 1, num=p - 1, endpoint=False + ).astype(int) + a.partition(kth, axis=op.axis) + + select = xp.linspace( + p - 1, a.shape[op.axis] - 1, num=len(op.inputs) - 1, endpoint=False + ).astype(int) + slc = (slice(None),) * op.axis + (select,) + ctx[op.outputs[0].key] = a[slc] + + +class PSRSShuffle(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.PSRS_SHUFFLE + + # public + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + + # for shuffle map + _axis = Int32Field("axis") + _order = ListField("order", FieldTypes.string) + _n_partition = Int32Field("n_partition") + _input_sorted = BoolField("input_sorted") + + # for shuffle reduce + _kind = StringField("kind") + _need_align = BoolField("need_align") + + def __init__( + self, + return_value=None, + return_indices=None, + axis=None, + order=None, + n_partition=None, + input_sorted=None, + kind=None, + need_align=None, + **kw + ): + super().__init__( + _return_value=return_value, + _return_indices=return_indices, + _axis=axis, + _order=order, + _n_partition=n_partition, + _input_sorted=input_sorted, + _kind=kind, + _need_align=need_align, + **kw + ) + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def axis(self): + return self._axis + + @property + def order(self): + return self._order + + @property + def n_partition(self): + return self._n_partition + + @property + def input_sorted(self): + return self._input_sorted + + @property + def kind(self): + return self._kind + + @property + def need_align(self): + return self._need_align + + @property + def output_limit(self): + if self.stage == OperandStage.map: + return 1 + else: 
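+            # reduce stage: one output for values and/or one for indices, plus a
+            # trailing sort_info chunk when re-alignment is needed, e.g. all three
+            # enabled -> output_limit == 3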
+ limit = int(bool(self._return_value)) + int(bool(self._return_indices)) + if self._need_align: + limit += 1 + return limit + + @classmethod + def _execute_map(cls, ctx, op): + return_value = op.return_value + return_indices = op.return_indices + + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + out = op.outputs[0] + a = inputs[0] + pivots = inputs[-1] + a_indices = None + if return_indices: + a_indices = inputs[-2] + + with device(device_id): + shape = tuple(s for i, s in enumerate(a.shape) if i != op.axis) + reduce_outputs = [ + np.empty(shape, dtype=object) for _ in range(op.n_partition) + ] + for idx in itertools.product(*(range(s) for s in shape)): + slc = list(idx) + slc.insert(op.axis, slice(None)) + slc = tuple(slc) + a_1d, pivots_1d = a[slc], pivots[slc] + a_indices_1d = a_indices[slc] if a_indices is not None else None + raw_a_1d = a_1d + if op.order is not None: + a_1d = a_1d[op.order] + if op.input_sorted: + # a is sorted already + poses = xp.searchsorted(a_1d, pivots_1d, side="right") + poses = (None,) + tuple(poses) + (None,) + for i in range(op.n_partition): + reduce_out = [] + if return_value: + values = raw_a_1d[poses[i] : poses[i + 1]] + reduce_out.append(values) + if return_indices: + indices = a_indices_1d[poses[i] : poses[i + 1]] + reduce_out.append(indices) + reduce_outputs[i][idx] = tuple(reduce_out) + else: + # a is not sorted, search every element in pivots + out_idxes = xp.searchsorted(pivots_1d, a_1d, side="right") + for i in range(op.n_partition): + cond = out_idxes == i + reduce_out = [] + if return_value: + values = raw_a_1d[cond] + reduce_out.append(values) + if return_indices: + indices = a_indices_1d[cond] + reduce_out.append(indices) + reduce_outputs[i][idx] = tuple(reduce_out) + for i in range(op.n_partition): + ctx[out.key, (i,)] = tuple(reduce_outputs[i].ravel()) + + @classmethod + def _execute_reduce(cls, ctx, op: "PSRSShuffle"): + raw_inputs = list(op.iter_mapper_data(ctx)) + # flatten inputs + flatten_inputs = flatten(raw_inputs) + inputs, device_id, xp = as_same_device( + flatten_inputs, device=op.device, ret_extra=True + ) + # organize back inputs + inputs = stack_back(inputs, raw_inputs) + + out = op.outputs[0] + extra_shape = list(out.shape) + extra_shape.pop(op.axis) + + return_value = op.return_value + return_indices = op.return_indices + + with device(device_id): + sort_res = np.empty(len(inputs[0]), dtype=object) + if extra_shape: + sort_res = sort_res.reshape(*extra_shape) + sort_info = np.empty(sort_res.shape, dtype=np.int32) + it = itertools.count(0) + for inps in zip(*inputs): + cur = itertools.count() + values, indices = None, None + ret = [] + if return_value or len(inps[0]) == 2: + i = next(cur) + values = xp.concatenate([inp[i] for inp in inps]) + if return_value: + ret.append(values) + if return_indices: + i = next(cur) + indices = xp.concatenate([inp[i] for inp in inps]) + ret.append(indices) + + if op.kind is not None: + # sort only if kind specified + if return_indices: + # if kind specified and return_indices + # values cannot be None + assert values is not None + values_indices = _argsort(values, op, xp, axis=0) + if return_value: + xp.take(values, values_indices, out=values) + xp.take(indices, values_indices, out=indices) + else: + _sort(values, op, xp, axis=0, inplace=True) + + j = next(it) + sort_res.ravel()[j] = ret + sort_info.ravel()[j] = len(ret[0]) + + if not op.need_align: + assert len(sort_res) == 1 + shape = list(extra_shape) + shape.insert(op.axis, 
len(sort_res[0])) + i = 0 + if return_value: + ctx[op.outputs[0].key] = sort_res[0][i] + i += 1 + if return_indices: + ctx[op.outputs[i].key] = sort_res[0][i] + else: + i = 0 + if return_value: + ctx[op.outputs[0].key] = tuple(r[0] for r in sort_res.ravel()) + i += 1 + if return_indices: + ctx[op.outputs[i].key] = tuple(r[i] for r in sort_res.ravel()) + ctx[op.outputs[-1].key] = sort_info + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) + + +class PSRSAlign(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.PSRS_ALIGN + + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + _axis = Int32Field("axis") + _output_sizes = ListField("output_sizes", FieldTypes.int32) + + def __init__( + self, return_value=None, return_indices=None, axis=None, output_sizes=None, **kw + ): + super().__init__( + _return_value=return_value, + _return_indices=return_indices, + _axis=axis, + _output_sizes=output_sizes, + **kw + ) + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def axis(self): + return self._axis + + @property + def output_sizes(self): + return self._output_sizes + + @property + def output_limit(self): + if self.stage == OperandStage.map: + return 1 + else: + return int(bool(self._return_value)) + int(bool(self._return_indices)) + + @classmethod + def _execute_map(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + sort_res, sort_indices = None, None + i = 0 + if op.return_value: + sort_res = inputs[0] + i += 1 + if op.return_indices: + sort_indices = inputs[i] + i += 1 + sort_infos = inputs[i:] + out = op.outputs[0] + + with device(device_id): + length = len(sort_res or sort_indices) + outs = np.empty((len(op.output_sizes), length), dtype=object) + out_sizes = op.output_sizes + cum_out_sizes = (0,) + tuple(np.cumsum(out_sizes)) + for i in range(length): + sort_1d = sort_res[i] if sort_res is not None else None + indices_1d = sort_indices[i] if sort_indices is not None else None + sort_lengths = [sort_info.flat[i] for sort_info in sort_infos] + cum_sort_lengths = (0,) + tuple(np.cumsum(sort_lengths)) + j = out.index[op.axis] + start_pos = cum_sort_lengths[j] + end_pos = cum_sort_lengths[j + 1] + out_idx_start, out_idx_end = np.searchsorted( + cum_out_sizes, [start_pos, end_pos] + ) + out_idx_start = max(out_idx_start - 1, 0) + for out_idx in range(out_idx_start, out_idx_end): + out_start_pos = cum_out_sizes[out_idx] + out_end_pos = cum_out_sizes[out_idx + 1] + s = max(start_pos, out_start_pos) + size = max(min(end_pos, out_end_pos) - s, 0) + s = max(0, s - start_pos) + ret = [] + if sort_1d is not None: + ret.append(sort_1d[s : s + size]) + if indices_1d is not None: + ret.append(indices_1d[s : s + size]) + outs[out_idx, i] = tuple(ret) + + for idx in range(len(op.output_sizes)): + ret = [] + for ar in outs[idx]: + if ar is None: + item = [] + if sort_res is not None: + item.append(xp.empty((0,), dtype=out.dtype)) + if sort_indices is not None: + item.append(xp.empty((0,), dtype=np.dtype(np.int64))) + ret.append(tuple(item)) + else: + ret.append(ar) + ctx[op.outputs[0].key, (idx,)] = tuple(ret) + + @classmethod + def _execute_reduce(cls, ctx, op: "PSRSAlign"): + axis = op.axis + raw_inputs = list(op.iter_mapper_data(ctx)) + flatten_inputs = flatten(raw_inputs) + 
inputs, device_id, xp = as_same_device( + flatten_inputs, device=op.device, ret_extra=True + ) + inputs = stack_back(flatten_inputs, raw_inputs) + + out = op.outputs[0] + extra_shape = list(out.shape) + extra_shape.pop(axis) + + return_value = op.return_value + return_indices = op.return_indices + + with device(device_id): + if return_value: + values_res = xp.empty(out.shape, dtype=out.dtype) + else: + values_res = None + if return_indices: + indices_res = xp.empty(out.shape, dtype=np.dtype(np.int64)) + else: + indices_res = None + it = itertools.product( + *(range(s) for i, s in enumerate(out.shape) if i != axis) + ) + for inps in zip(*inputs): + slc = list(next(it)) + slc.insert(op.axis, slice(None)) + i = 0 + if return_value: + value_concat_1d = xp.concatenate([inp[0] for inp in inps]) + values_res[tuple(slc)] = value_concat_1d + i += 1 + if return_indices: + ind_concat_id = xp.concatenate([inp[i] for inp in inps]) + indices_res[tuple(slc)] = ind_concat_id + + i = 0 + if return_value: + ctx[op.outputs[0].key] = values_res.astype( + values_res.dtype, order=op.outputs[0].order.value + ) + i += 1 + if return_indices: + ctx[op.outputs[i].key] = indices_res.astype( + indices_res.dtype, order=op.outputs[i].order.value + ) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) diff --git a/python/xorbits/_mars/tensor/base/ravel.py b/python/xorbits/_mars/tensor/base/ravel.py new file mode 100644 index 000000000..d2e6d6130 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/ravel.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..datasource import tensor as astensor + + +def ravel(a, order="C"): + """Return a contiguous flattened tensor. + + A 1-D tensor, containing the elements of the input, is returned. A copy is + made only if needed. + + Parameters + ---------- + a : array_like + Input tensor. The elements in `a` are packed as a 1-D tensor. + order : {'C','F', 'A', 'K'}, optional + + The elements of `a` are read using this index order. 'C' means + to index the elements in row-major, C-style order, + with the last axis index changing fastest, back to the first + axis index changing slowest. 'F' means to index the elements + in column-major, Fortran-style order, with the + first index changing fastest, and the last index changing + slowest. Note that the 'C' and 'F' options take no account of + the memory layout of the underlying array, and only refer to + the order of axis indexing. 'A' means to read the elements in + Fortran-like index order if `a` is Fortran *contiguous* in + memory, C-like order otherwise. 'K' means to read the + elements in the order they occur in memory, except for + reversing the data when strides are negative. By default, 'C' + index order is used. + + Returns + ------- + y : array_like + If `a` is a matrix, y is a 1-D tensor, otherwise y is a tensor of + the same subtype as `a`. 
The shape of the returned array is + ``(a.size,)``. Matrices are special cased for backward + compatibility. + + See Also + -------- + Tensor.flat : 1-D iterator over an array. + Tensor.flatten : 1-D array copy of the elements of an array + in row-major order. + Tensor.reshape : Change the shape of an array without changing its data. + + Examples + -------- + It is equivalent to ``reshape(-1)``. + + >>> import mars.tensor as mt + + >>> x = mt.array([[1, 2, 3], [4, 5, 6]]) + >>> print(mt.ravel(x).execute()) + [1 2 3 4 5 6] + + >>> print(x.reshape(-1).execute()) + [1 2 3 4 5 6] + + >>> print(mt.ravel(x.T).execute()) + [1 4 2 5 3 6] + + >>> a = mt.arange(12).reshape(2,3,2).swapaxes(1,2); a.execute() + array([[[ 0, 2, 4], + [ 1, 3, 5]], + [[ 6, 8, 10], + [ 7, 9, 11]]]) + >>> a.ravel().execute() + array([ 0, 2, 4, 1, 3, 5, 6, 8, 10, 7, 9, 11]) + + """ + a = astensor(a) + if a.ndim == 0: + return a[np.newaxis] + return a.reshape(-1, order=order) diff --git a/python/xorbits/_mars/tensor/base/rebalance.py b/python/xorbits/_mars/tensor/base/rebalance.py new file mode 100644 index 000000000..c9bc519ea --- /dev/null +++ b/python/xorbits/_mars/tensor/base/rebalance.py @@ -0,0 +1,142 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes +from ...core import recursive_tile +from ...core.context import get_context +from ...serialization.serializables import Float64Field, Int64Field, KeyField +from ...tensor.datasource import tensor as astensor +from ...utils import ceildiv, has_unknown_shape +from ..operands import TensorOperand, TensorOperandMixin + + +class RebalanceMixin: + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, df_or_series): + self._output_types = df_or_series.op.output_types + return self.new_tileable([df_or_series], kws=[df_or_series.params]) + + def _get_input_object(self): + raise NotImplementedError + + @classmethod + def tile(cls, op: "RebalanceMixin"): + in_obj = op._get_input_object() + ctx = get_context() + + if ctx is None and op.factor is not None: + return [in_obj] + + if has_unknown_shape(in_obj): + yield + + size = in_obj.shape[op.axis] + if op.factor is not None: + cluster_cpu_count = ctx.get_total_n_cpu() + assert cluster_cpu_count > 0 + expect_n_chunk = int(cluster_cpu_count * op.factor) + else: + expect_n_chunk = op.num_partitions + + expect_chunk_size = max(ceildiv(size, expect_n_chunk), 1) + r = yield from recursive_tile( + in_obj.rechunk( + {op.axis: expect_chunk_size}, reassign_worker=op.reassign_worker + ) + ) + return r + + +class TensorRebalance(RebalanceMixin, TensorOperandMixin, TensorOperand): + _op_type_ = opcodes.REBALANCE + + _input = KeyField("input") + _factor = Float64Field("factor") + _axis = Int64Field("axis") + _num_partitions = Int64Field("num_partitions") + + def __init__( + self, + input=None, + factor=None, + axis=None, # pylint: disable=redefined-builtin + num_partitions=None, + output_types=None, + **kw + ): + super().__init__( + _input=input, + _factor=factor, + _axis=axis, + _num_partitions=num_partitions, + _output_types=output_types, + **kw + ) + + @property + def input(self): + return self._input + + @property + def factor(self): + return self._factor + + @property + def axis(self): + return self._axis + + @property + def num_partitions(self): + return self._num_partitions + + def _get_input_object(self): + return astensor(self.inputs[0]) + + +def rebalance(tensor, factor=None, axis=0, num_partitions=None, reassign_worker=True): + """ + Make Data more balanced across entire cluster. + + Parameters + ---------- + factor : float + Specified so that number of chunks after balance is + total CPU count of cluster * factor. + axis : int + The axis to rebalance. + num_partitions : int + Specified so the number of chunks are at most + num_partitions. + reassign_worker : bool + If True, workers will be reassigned. + + Returns + ------- + Series or DataFrame + Result of DataFrame or Series after rebalanced. + """ + if num_partitions is None: + factor = factor if factor is not None else 1.2 + + op = TensorRebalance( + input=tensor, + factor=factor, + axis=axis, + num_partitions=num_partitions, + reassign_worker=reassign_worker, + ) + return op(tensor) diff --git a/python/xorbits/_mars/tensor/base/repeat.py b/python/xorbits/_mars/tensor/base/repeat.py new file mode 100644 index 000000000..522a7921e --- /dev/null +++ b/python/xorbits/_mars/tensor/base/repeat.py @@ -0,0 +1,227 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from numbers import Integral + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import AnyField, Int32Field, KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import broadcast_shape, unify_chunks +from .ravel import ravel + + +class TensorRepeat(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.REPEAT + + _input = KeyField("input") + _repeats = AnyField("repeats") + _axis = Int32Field("axis") + + def __init__(self, axis=None, dtype=None, sparse=False, **kw): + super().__init__(_axis=axis, dtype=dtype, sparse=sparse, **kw) + + @property + def repeats(self): + return self._repeats + + @property + def axis(self): + return self._axis + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(inputs) > 1: + self._repeats = self._inputs[1] + + def __call__(self, a, repeats): + axis = self._axis + a = astensor(a) + if axis is None: + a = ravel(a) + + ax = axis or 0 + + if not isinstance(repeats, Integral): + if not isinstance(repeats, Tensor): + repeats = np.asarray(repeats) + if repeats.size == 1: + repeats = int(repeats[0]) + size = repeats * a.shape[axis or 0] + elif a.shape[ax] == 1: + size = repeats = int(repeats.sum()) + else: + size = int(repeats.sum()) + else: + size = np.nan + if not isinstance(repeats, Integral): + if repeats.ndim != 1: + raise ValueError("repeats should be 1-d tensor") + broadcast_shape(repeats.shape, a.shape[ax : ax + 1]) + else: + size = a.shape[axis or 0] * repeats + + shape = a.shape[:ax] + (size,) + a.shape[ax + 1 :] + self.dtype = a.dtype + self.sparse = a.issparse() + + inputs = [a] + if isinstance(repeats, Tensor): + inputs.append(repeats) + else: + self._repeats = repeats + + return self.new_tensor(inputs, shape, order=TensorOrder.C_ORDER) + + @classmethod + def tile(cls, op): + a = op.input + repeats = op.repeats + axis = op.axis + ax = axis or 0 + out = op.outputs[0] + + if has_unknown_shape(*op.inputs): + yield + + if isinstance(repeats, TENSOR_TYPE): + a, repeats = yield from unify_chunks(a, (repeats, (ax,))) + + nsplit = a.nsplits[axis or 0] + + if isinstance(repeats, Integral): + new_nsplit = [] + for split in nsplit: + s = max(split // repeats, 1) + c = split // s + new_nsplit.extend([s] * c) + if split % s != 0: + new_nsplit.append(split % s) + + a = yield from recursive_tile(a.rechunk({ax: new_nsplit})) + + out_chunks = [] + ax_cum_count = np.cumsum((0,) + a.nsplits[ax]) + is_repeats_ndarray = isinstance(repeats, np.ndarray) + for out_idx in itertools.product(*[range(len(s)) for s in a.nsplits]): + in_chunk = a.cix[out_idx] + ax_idx = out_idx[ax] + if is_repeats_ndarray: + start = ax_cum_count[ax_idx] + stop = ax_cum_count[ax_idx + 1] + rp = repeats[start:stop] + size = int(rp.sum()) + elif not isinstance(repeats, Integral): + rp = 
repeats.cix[ax_idx,] + size = np.nan + else: + rp = repeats + size = in_chunk.shape[ax] * rp + + chunk_inputs = [in_chunk] + if isinstance(rp, TENSOR_CHUNK_TYPE): + chunk_inputs.append(rp) + + chunk_shape = in_chunk.shape[:ax] + (size,) + in_chunk.shape[ax + 1 :] + chunk_op = op.copy().reset_key() + if len(chunk_inputs) < 2: + # repeats is not chunk + chunk_op._repeats = rp + out_chunk = chunk_op.new_chunk( + chunk_inputs, shape=chunk_shape, index=out_idx, order=out.order + ) + out_chunks.append(out_chunk) + + nsplits = [ + tuple( + c.shape[i] + for c in out_chunks + if all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + for i in range(len(out_chunks[0].shape)) + ] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, out.shape, order=out.order, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + a = inputs[0] + if len(inputs) > 1: + repeats = inputs[1] + else: + repeats = op.repeats + + with device(device_id): + ctx[op.outputs[0].key] = xp.repeat(a, repeats=repeats, axis=op.axis) + + +def repeat(a, repeats, axis=None): + """ + Repeat elements of a tensor. + + Parameters + ---------- + a : array_like + Input tensor. + repeats : int or tensor of ints + The number of repetitions for each element. `repeats` is broadcasted + to fit the shape of the given axis. + axis : int, optional + The axis along which to repeat values. By default, use the + flattened input tensor, and return a flat output tensor. + + Returns + ------- + repeated_tensor : Tensor + Output array which has the same shape as `a`, except along + the given axis. + + See Also + -------- + tile : Tile a tensor. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.repeat(3, 4).execute() + array([3, 3, 3, 3]) + >>> x = mt.array([[1,2],[3,4]]) + >>> mt.repeat(x, 2).execute() + array([1, 1, 2, 2, 3, 3, 4, 4]) + >>> mt.repeat(x, 3, axis=1).execute() + array([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> mt.repeat(x, [1, 2], axis=0).execute() + array([[1, 2], + [3, 4], + [3, 4]]) + + """ + op = TensorRepeat(axis=axis) + return op(a, repeats) diff --git a/python/xorbits/_mars/tensor/base/result_type.py b/python/xorbits/_mars/tensor/base/result_type.py new file mode 100644 index 000000000..8f1fad04c --- /dev/null +++ b/python/xorbits/_mars/tensor/base/result_type.py @@ -0,0 +1,88 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +def result_type(*tensors_and_dtypes): + """ + Returns the type that results from applying the NumPy + type promotion rules to the arguments. + + Type promotion in Mars works similarly to the rules in languages + like C++, with some slight differences. When both scalars and + arrays are used, the array's type takes precedence and the actual value + of the scalar is taken into account. 
+ + For example, calculating 3*a, where a is an array of 32-bit floats, + intuitively should result in a 32-bit float output. If the 3 is a + 32-bit integer, the NumPy rules indicate it can't convert losslessly + into a 32-bit float, so a 64-bit float should be the result type. + By examining the value of the constant, '3', we see that it fits in + an 8-bit integer, which can be cast losslessly into the 32-bit float. + + Parameters + ---------- + tensors_and_dtypes : list of tensors and dtypes + The operands of some operation whose result type is needed. + + Returns + ------- + out : dtype + The result type. + + See also + -------- + dtype, promote_types, min_scalar_type, can_cast + + Notes + ----- + The specific algorithm used is as follows. + + Categories are determined by first checking which of boolean, + integer (int/uint), or floating point (float/complex) the maximum + kind of all the arrays and the scalars are. + + If there are only scalars or the maximum category of the scalars + is higher than the maximum category of the arrays, + the data types are combined with :func:`promote_types` + to produce the return value. + + Otherwise, `min_scalar_type` is called on each array, and + the resulting data types are all combined with :func:`promote_types` + to produce the return value. + + The set of int values is not a subset of the uint values for types + with the same number of bits, something not reflected in + :func:`min_scalar_type`, but handled as a special case in `result_type`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.result_type(3, mt.arange(7, dtype='i1')) + dtype('int8') + + >>> mt.result_type('i4', 'c8') + dtype('complex128') + + >>> mt.result_type(3.0, -2) + dtype('float64') + """ + from ..core import Tensor + + arrays_and_dtypes = [ + a.dtype if isinstance(a, Tensor) else a for a in tensors_and_dtypes + ] + return np.result_type(*arrays_and_dtypes) diff --git a/python/xorbits/_mars/tensor/base/roll.py b/python/xorbits/_mars/tensor/base/roll.py new file mode 100644 index 000000000..6da3833e8 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/roll.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.' + +from collections.abc import Iterable + +import numpy as np + +from ..datasource import tensor as astensor +from ..utils import validate_axis +from .ravel import ravel + + +def roll(a, shift, axis=None): + """ + Roll tensor elements along a given axis. + + Elements that roll beyond the last position are re-introduced at + the first. + + Parameters + ---------- + a : array_like + Input tensor. + shift : int or tuple of ints + The number of places by which elements are shifted. If a tuple, + then `axis` must be a tuple of the same size, and each of the + given axes is shifted by the corresponding number. If an int + while `axis` is a tuple of ints, then the same value is used for + all given axes. + axis : int or tuple of ints, optional + Axis or axes along which elements are shifted. 
By default, the + tensor is flattened before shifting, after which the original + shape is restored. + + Returns + ------- + res : Tensor + Output tensor, with the same shape as `a`. + + See Also + -------- + rollaxis : Roll the specified axis backwards, until it lies in a + given position. + + Notes + ----- + + Supports rolling over multiple dimensions simultaneously. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(10) + >>> mt.roll(x, 2).execute() + array([8, 9, 0, 1, 2, 3, 4, 5, 6, 7]) + + >>> x2 = mt.reshape(x, (2,5)) + >>> x2.execute() + array([[0, 1, 2, 3, 4], + [5, 6, 7, 8, 9]]) + >>> mt.roll(x2, 1).execute() + array([[9, 0, 1, 2, 3], + [4, 5, 6, 7, 8]]) + >>> mt.roll(x2, 1, axis=0).execute() + array([[5, 6, 7, 8, 9], + [0, 1, 2, 3, 4]]) + >>> mt.roll(x2, 1, axis=1).execute() + array([[4, 0, 1, 2, 3], + [9, 5, 6, 7, 8]]) + + """ + from ..merge import concatenate + + a = astensor(a) + raw = a + + if axis is None: + a = ravel(a) + axis = 0 + + if not isinstance(shift, Iterable): + shift = (shift,) + else: + shift = tuple(shift) + if not isinstance(axis, Iterable): + axis = (axis,) + else: + axis = tuple(axis) + + for ax in axis: + validate_axis(a.ndim, ax) + broadcasted = np.broadcast(shift, axis) + if broadcasted.ndim > 1: + raise ValueError("'shift' and 'axis' should be scalars or 1D sequences") + + shifts = {ax: 0 for ax in range(a.ndim)} + for s, ax in broadcasted: + shifts[ax] += s + + for ax, s in shifts.items(): + if s == 0: + continue + + s = -s + s %= a.shape[ax] + + slc1 = (slice(None),) * ax + (slice(s, None),) + slc2 = (slice(None),) * ax + (slice(s),) + + a = concatenate([a[slc1], a[slc2]], axis=ax) + + return a.reshape(raw.shape) diff --git a/python/xorbits/_mars/tensor/base/rollaxis.py b/python/xorbits/_mars/tensor/base/rollaxis.py new file mode 100644 index 000000000..586294d69 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/rollaxis.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..utils import validate_axis + + +def rollaxis(tensor, axis, start=0): + """ + Roll the specified axis backwards, until it lies in a given position. + + This function continues to be supported for backward compatibility, but you + should prefer `moveaxis`. + + Parameters + ---------- + a : Tensor + Input tensor. + axis : int + The axis to roll backwards. The positions of the other axes do not + change relative to one another. + start : int, optional + The axis is rolled until it lies before this position. The default, + 0, results in a "complete" roll. + + Returns + ------- + res : Tensor + a view of `a` is always returned. + + See Also + -------- + moveaxis : Move array axes to new positions. + roll : Roll the elements of an array by a number of positions along a + given axis. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.ones((3,4,5,6)) + >>> mt.rollaxis(a, 3, 1).shape + (3, 6, 4, 5) + >>> mt.rollaxis(a, 2).shape + (5, 3, 4, 6) + >>> mt.rollaxis(a, 1, 4).shape + (3, 5, 6, 4) + + """ + n = tensor.ndim + axis = validate_axis(n, axis) + if start < 0: + start += n + msg = "'%s' arg requires %d <= %s < %d, but %d was passed in" + if not (0 <= start < n + 1): + raise np.AxisError(msg % ("start", -n, "start", n + 1, start)) + if axis < start: + # it's been removed + start -= 1 + if axis == start: + return tensor + axes = list(range(0, n)) + axes.remove(axis) + axes.insert(start, axis) + return tensor.transpose(axes) diff --git a/python/xorbits/_mars/tensor/base/searchsorted.py b/python/xorbits/_mars/tensor/base/searchsorted.py new file mode 100644 index 000000000..98d323ddf --- /dev/null +++ b/python/xorbits/_mars/tensor/base/searchsorted.py @@ -0,0 +1,381 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from typing import Any, List, Tuple, Type + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...core import TILEABLE_TYPE +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, Int32Field, Int64Field, StringField +from ...typing import ChunkType, TileableType +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TENSOR_TYPE, TensorOrder +from ..datasource.array import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorSearchsorted(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.SEARCHSORTED + + v = AnyField("v") + side = StringField("side") + combine_size = Int32Field("combine_size") + # for chunk + offset = Int64Field("offset") + size = Int64Field("size") + n_chunk = Int64Field("n_chunk") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if isinstance(self.v, TILEABLE_TYPE): + self.v = self._inputs[1] + + def __call__(self, a, v): + inputs = [a] + if isinstance(v, TILEABLE_TYPE): + inputs.append(v) + shape = v.shape + else: + shape = () + return self.new_tensor(inputs, shape=shape, order=TensorOrder.C_ORDER) + + @classmethod + def _tile_one_chunk(cls, op, a, v, out): + chunks = [] + if len(op.inputs) == 1: + v_chunks = [v] + else: + v_chunks = v.chunks + for v_chunk in v_chunks: + chunk_op = op.copy().reset_key() + in_chunks = [a.chunks[0]] + if len(op.inputs) == 2: + in_chunks.append(v_chunk) + v_shape = v_chunk.shape if hasattr(v_chunk, "shape") else () + chunk_idx = v_chunk.index if len(op.inputs) == 2 else (0,) + chunk = chunk_op.new_chunk( + in_chunks, shape=v_shape, index=chunk_idx, order=out.order + ) + chunks.append(chunk) + new_op = op.copy().reset_key() + nsplits = ((s,) for s in out.shape) if len(op.inputs) == 1 else v.nsplits + return new_op.new_tensors(op.inputs, out.shape, chunks=chunks, nsplits=nsplits) + + @classmethod + def _combine_chunks( + cls, + 
+        to_combine: List[ChunkType],
+        op_type: Type,
+        v: Any,
+        stage: OperandStage,
+        chunk_index: Tuple[int],
+    ):
+        from ..merge import TensorStack
+
+        dtype = np.dtype(np.intp)
+        v_shape = v.shape if hasattr(v, "shape") else ()
+        combine_op = TensorStack(axis=0, dtype=dtype)
+        combine_chunk = combine_op.new_chunk(to_combine, shape=v_shape)
+        chunk_op = op_type(dtype=dtype, axis=(0,), stage=stage)
+        return chunk_op.new_chunk(
+            [combine_chunk], shape=v_shape, index=chunk_index, order=TensorOrder.C_ORDER
+        )
+
+    @classmethod
+    def _tile_tree_reduction(
+        cls, op: "TensorSearchsorted", a: TileableType, v: Any, out: TileableType
+    ):
+        from ..indexing import TensorSlice
+        from ..merge import TensorConcatenate
+        from ..reduction import TensorMax, TensorMin
+
+        if has_unknown_shape(a):
+            yield
+
+        combine_size = op.combine_size or options.combine_size
+        n_chunk = len(a.chunks)
+        input_len = len(op.inputs)
+        v_chunks = [v] if input_len == 1 else v.chunks
+        cum_nsplits = [0] + np.cumsum(a.nsplits[0]).tolist()
+
+        input_chunks = []
+        offsets = []
+        for i in range(n_chunk):
+            offset = cum_nsplits[i]
+            cur_chunk = a.chunks[i]
+            # length of this chunk along the searched axis; boundary elements
+            # borrowed from the neighbouring chunks below are added on top
+            chunk_size = cur_chunk.shape[0]
+            chunks = []
+            if i > 0:
+                last_chunk = a.chunks[i - 1]
+                if last_chunk.shape[0] > 0:
+                    slice_chunk_op = TensorSlice(
+                        slices=[slice(-1, None)], dtype=cur_chunk.dtype
+                    )
+                    slice_chunk = slice_chunk_op.new_chunk(
+                        [last_chunk], shape=(1,), order=out.order
+                    )
+                    chunks.append(slice_chunk)
+                    chunk_size += 1
+                    offset -= 1
+            chunks.append(cur_chunk)
+            if i < n_chunk - 1:
+                next_chunk = a.chunks[i + 1]
+                if next_chunk.shape[0] > 0:
+                    slice_chunk_op = TensorSlice(
+                        slices=[slice(1)], dtype=cur_chunk.dtype
+                    )
+                    slice_chunk = slice_chunk_op.new_chunk(
+                        [next_chunk], shape=(1,), order=out.order
+                    )
+                    chunks.append(slice_chunk)
+                    chunk_size += 1
+
+            concat_op = TensorConcatenate(dtype=cur_chunk.dtype)
+            concat_chunk = concat_op.new_chunk(
+                chunks, shape=(chunk_size,), order=out.order, index=cur_chunk.index
+            )
+            input_chunks.append(concat_chunk)
+            offsets.append(offset)
+
+        out_chunks = []
+        for v_chunk in v_chunks:
+            chunks = []
+            v_shape = v_chunk.shape if hasattr(v_chunk, "shape") else ()
+            v_index = v_chunk.index if hasattr(v_chunk, "index") else (0,)
+            for inp_chunk, offset in zip(input_chunks, offsets):
+                chunk_op = op.copy().reset_key()
+                chunk_op.stage = OperandStage.map
+                chunk_op.offset = offset
+                chunk_op.n_chunk = n_chunk
+                chunk_op.size = a.shape[0]
+                chunk_inputs = [inp_chunk]
+                if input_len > 1:
+                    chunk_inputs.append(v_chunk)
+                map_chunk = chunk_op.new_chunk(
+                    chunk_inputs, shape=v_shape, index=inp_chunk.index, order=out.order
+                )
+                chunks.append(map_chunk)
+
+            op_type = TensorMax if op.side == "right" else TensorMin
+            while len(chunks) > combine_size:
+                new_chunks = []
+                it = itertools.count(0)
+                while True:
+                    j = next(it)
+                    to_combine = chunks[j * combine_size : (j + 1) * combine_size]
+                    if len(to_combine) == 0:
+                        break
+
+                    new_chunks.append(
+                        cls._combine_chunks(
+                            to_combine, op_type, v_chunk, OperandStage.combine, (j,)
+                        )
+                    )
+                chunks = new_chunks
+
+            chunk = cls._combine_chunks(
+                chunks, op_type, v_chunk, OperandStage.agg, v_index
+            )
+            out_chunks.append(chunk)
+
+        new_op = op.copy().reset_key()
+        nsplits = ((s,) for s in out.shape) if len(op.inputs) == 1 else v.nsplits
+        return new_op.new_tensors(
+            op.inputs, out.shape, chunks=out_chunks, nsplits=nsplits
+        )
+
+    @classmethod
+    def tile(cls, op):
+        a = op.inputs[0]
+        out = op.outputs[0]
+        input_len = len(op.inputs)
+        if input_len == 1:
+            v = op.v
+        else:
+            v = op.inputs[1]
+
+        if 
len(a.chunks) == 1: + return cls._tile_one_chunk(op, a, v, out) + return (yield from cls._tile_tree_reduction(op, a, v, out)) + + @classmethod + def _execute_without_stage(cls, xp, a, v, op): + return xp.searchsorted(a, v, side=op.side) + + @classmethod + def _execute_map(cls, xp: Any, a: np.ndarray, v: Any, op: "TensorSearchsorted"): + out = op.outputs[0] + i = out.index[0] + side = op.side + + raw_v = v + v = xp.atleast_1d(v) + searched = xp.searchsorted(a, v, side=op.side) + xp.add(searched, op.offset, out=searched) + a_min, a_max = a[0], a[-1] + if i == 0: + # the first chunk + if a_min == a_max: + miss = v > a_max + else: + miss = v > a_max if side == "left" else v >= a_max + elif i == op.n_chunk - 1: + # the last chunk + if a_min == a_max: + miss = v < a_min + else: + miss = v <= a_min if side == "left" else v < a_min + else: + if side == "left" and a_min < a_max: + miss = (v <= a_min) | (v > a_max) + elif a_min < a_max: + miss = (v < a_min) | (v >= a_max) + else: + assert a_min == a_max + miss = v != a_min + if side == "right": + searched[miss] = -1 + else: + searched[miss] = op.size + 1 + + return searched[0] if np.isscalar(raw_v) else searched + + @classmethod + def execute(cls, ctx, op): + a = ctx[op.inputs[0].key] + v = ctx[op.inputs[1].key] if len(op.inputs) == 2 else op.v + + data = [] + if isinstance(a, tuple): + data.extend(a) + else: + data.append(a) + if len(op.inputs) == 2: + data.append(v) + + data, device_id, xp = as_same_device(data, device=op.device, ret_extra=True) + + if isinstance(a, tuple): + a = data[:2] + else: + a = data[0] + if len(op.inputs) == 2: + v = data[-1] + + with device(device_id): + if op.stage is None: + ret = cls._execute_without_stage(xp, a, v, op) + else: + assert op.stage == OperandStage.map + ret = cls._execute_map(xp, a, v, op) + ctx[op.outputs[0].key] = ret + + +def searchsorted(a, v, side="left", sorter=None, combine_size=None): + """ + Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted tensor `a` such that, if the + corresponding elements in `v` were inserted before the indices, the + order of `a` would be preserved. + + Assuming that `a` is sorted: + + ====== ============================ + `side` returned index `i` satisfies + ====== ============================ + left ``a[i-1] < v <= a[i]`` + right ``a[i-1] <= v < a[i]`` + ====== ============================ + + Parameters + ---------- + a : 1-D array_like + Input tensor. If `sorter` is None, then it must be sorted in + ascending order, otherwise `sorter` must be an array of indices + that sort it. + v : array_like + Values to insert into `a`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `a`). + sorter : 1-D array_like, optional + Optional tensor of integer indices that sort array a into ascending + order. They are typically the result of argsort. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + indices : tensor of ints + Array of insertion points with the same shape as `v`. + + See Also + -------- + sort : Return a sorted copy of a tensor. + histogram : Produce histogram from 1-D data. + + Notes + ----- + Binary search is used to find the required insertion points. 
+ + This function is a faster version of the builtin python `bisect.bisect_left` + (``side='left'``) and `bisect.bisect_right` (``side='right'``) functions, + which is also vectorized in the `v` argument. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.searchsorted([1,2,3,4,5], 3).execute() + 2 + >>> mt.searchsorted([1,2,3,4,5], 3, side='right').execute() + 3 + >>> mt.searchsorted([1,2,3,4,5], [-10, 10, 2, 3]).execute() + array([0, 5, 1, 2]) + + """ + + if ( + not isinstance(a, TENSOR_TYPE) + and sorter is not None + and not isinstance(sorter, TENSOR_TYPE) + ): + a = astensor(np.asarray(a)[sorter]) + else: + a = astensor(a) + if sorter is not None: + a = a[sorter] + + if a.ndim != 1: + raise ValueError("`a` should be 1-d tensor") + if a.issparse(): + # does not support sparse tensor + raise ValueError("`a` should be a dense tensor") + if side not in {"left", "right"}: + raise ValueError(f"'{side}' is an invalid value for keyword 'side'") + + if not np.isscalar(v): + v = astensor(v) + + op = TensorSearchsorted( + v=v, side=side, dtype=np.dtype(np.intp), combine_size=combine_size + ) + return op(a, v) diff --git a/python/xorbits/_mars/tensor/base/setdiff1d.py b/python/xorbits/_mars/tensor/base/setdiff1d.py new file mode 100644 index 000000000..1becec87d --- /dev/null +++ b/python/xorbits/_mars/tensor/base/setdiff1d.py @@ -0,0 +1,58 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def setdiff1d(ar1, ar2, assume_unique=False): + """ + Find the set difference of two tensors. + + Return the unique values in `ar1` that are not in `ar2`. + + Parameters + ---------- + ar1 : array_like + Input tensor. + ar2 : array_like + Input comparison tensor. + assume_unique : bool + If True, the input tensors are both assumed to be unique, which + can speed up the calculation. Default is False. + + Returns + ------- + setdiff1d : Tensor + 1D tensor of values in `ar1` that are not in `ar2`. The result + is sorted when `assume_unique=False`, but otherwise only sorted + if the input is sorted. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([1, 2, 3, 2, 4, 1]) + >>> b = mt.array([3, 4, 5, 6]) + >>> mt.setdiff1d(a, b).execute() + array([1, 2]) + + """ + + from ..datasource.array import asarray + from .in1d import in1d + from .unique import unique + + if assume_unique: + ar1 = asarray(ar1).ravel() + else: + ar1 = unique(ar1) + ar2 = unique(ar2) + return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)] diff --git a/python/xorbits/_mars/tensor/base/shape.py b/python/xorbits/_mars/tensor/base/shape.py new file mode 100644 index 000000000..722a6a1a9 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/shape.py @@ -0,0 +1,139 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ...core import ExecutableTuple +from ...serialization.serializables import Int32Field, KeyField +from ...utils import calc_nsplits +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorGetShape(TensorOperand, TensorOperandMixin): + _op_type_ = opcodes.GET_SHAPE + + _a = KeyField("a") + _ndim = Int32Field("ndim") + + def __init__(self, pure_depends=None, a=None, ndim=None, dtype=None, **kw): + super().__init__( + dtype=dtype, _a=a, _ndim=ndim, _pure_depends=pure_depends, **kw + ) + + @property + def a(self): + return self._a + + @property + def ndim(self): + return self._ndim + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._a is not None: + self._a = self._inputs[0] + + @property + def output_limit(self): + return self._ndim + + def __call__(self, a): + if not np.isnan(a.size): + return ExecutableTuple([astensor(s) for s in a.shape]) + + self._a = a + kws = [] + for i in range(self.output_limit): + kws.append( + { + "shape": (), + "dtype": np.dtype(np.intc), + "order": TensorOrder.C_ORDER, + "i": i, + } + ) + return ExecutableTuple(self.new_tensors([a], kws=kws)) + + @classmethod + def tile(cls, op): + a = op.a + outs = op.outputs + + yield a.chunks + + chunk_op = TensorGetShape(pure_depends=[True] * len(a.chunks), ndim=op.ndim) + chunk_kws = [] + for out in outs: + params = out.params + params["index"] = () + chunk_kws.append(params) + chunks = chunk_op.new_chunks(a.chunks, kws=chunk_kws) + + kws = [] + for c, out in zip(chunks, outs): + params = out.params + params["chunks"] = [c] + params["nsplits"] = () + kws.append(params) + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=kws, output_limit=op.output_limit) + + @classmethod + def execute(cls, ctx, op): + chunk_idx_to_chunk_shapes = dict((c.index, c.shape) for c in op.inputs) + nsplits = calc_nsplits(chunk_idx_to_chunk_shapes) + shape = tuple(sum(ns) for ns in nsplits) + for o, s in zip(op.outputs, shape): + ctx[o.key] = s + + +def shape(a): + """ + Return the shape of a tensor. + + Parameters + ---------- + a : array_like + Input tensor. + + Returns + ------- + shape : ExecutableTuple of tensors + The elements of the shape tuple give the lengths of the + corresponding array dimensions. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.shape(mt.eye(3)).execute() + (3, 3) + >>> mt.shape([[1, 2]]).execute() + (1, 2) + >>> mt.shape([0]).execute() + (1,) + >>> mt.shape(0).execute() + () + + >>> a = mt.array([(1, 2), (3, 4)], dtype=[('x', 'i4'), ('y', 'i4')]) + >>> mt.shape(a).execute() + (2,) + + """ + a = astensor(a) + op = TensorGetShape(ndim=a.ndim) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/sort.py b/python/xorbits/_mars/tensor/base/sort.py new file mode 100644 index 000000000..98df72138 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/sort.py @@ -0,0 +1,508 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ExecutableTuple +from ...serialization.serializables import ( + BoolField, + FieldTypes, + Int32Field, + ListField, + StringField, +) +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorShuffleProxy +from ..utils import validate_axis, validate_order +from .psrs import TensorPSRSOperandMixin + + +class TensorSort(TensorOperand, TensorPSRSOperandMixin): + _op_type_ = OperandDef.SORT + + _axis = Int32Field("axis") + _kind = StringField("kind") + _parallel_kind = StringField("parallel_kind") + _order = ListField("order", FieldTypes.string) + _psrs_kinds = ListField("psrs_kinds", FieldTypes.string) + _need_align = BoolField("need_align") + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + + def __init__( + self, + axis=None, + kind=None, + parallel_kind=None, + order=None, + psrs_kinds=None, + need_align=None, + return_value=None, + return_indices=None, + dtype=None, + gpu=None, + **kw, + ): + super().__init__( + _axis=axis, + _kind=kind, + _parallel_kind=parallel_kind, + _order=order, + _psrs_kinds=psrs_kinds, + _need_align=need_align, + _return_value=return_value, + _return_indices=return_indices, + dtype=dtype, + gpu=gpu, + **kw, + ) + + @property + def axis(self): + return self._axis + + @property + def kind(self): + return self._kind + + @property + def parallel_kind(self): + return self._parallel_kind + + @property + def order(self): + return self._order + + @property + def psrs_kinds(self): + return self._psrs_kinds + + @property + def need_align(self): + return self._need_align + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def output_limit(self): + return int(bool(self._return_value)) + int(bool(self._return_indices)) + + def __call__(self, a): + kws = [] + if self._return_value: + kws.append( + {"shape": a.shape, "order": a.order, "dtype": a.dtype, "type": "sorted"} + ) + if self._return_indices: + kws.append( + { + "shape": a.shape, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "type": "argsort", + } + ) + ret = self.new_tensors([a], kws=kws) + if len(kws) == 1: + return ret[0] + return ExecutableTuple(ret) + + @classmethod + def _tile_psrs(cls, op): + """ + Refer to http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html + to see explanation of parallel sorting by regular sampling + """ + out_tensor = op.outputs[0] + in_tensor, axis_chunk_shape, out_idxes, need_align = yield from cls.preprocess( + op + ) + axis_offsets = [0] + np.cumsum(in_tensor.nsplits[op.axis]).tolist()[:-1] + return_value, return_indices = op.return_value, op.return_indices + + out_value_chunks, out_indices_chunks = [], [] + for out_idx in out_idxes: + # stage 1: local sort and 
regular samples collected + ( + sorted_chunks, + indices_chunks, + sampled_chunks, + ) = cls.local_sort_and_regular_sample( + op, in_tensor, axis_chunk_shape, axis_offsets, out_idx + ) + + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + concat_pivot_chunk = cls.concat_and_pivot( + op, axis_chunk_shape, out_idx, sorted_chunks, sampled_chunks + ) + + # stage 3: Local data is partitioned + partition_chunks = cls.partition_local_data( + op, axis_chunk_shape, sorted_chunks, indices_chunks, concat_pivot_chunk + ) + + proxy_chunk = TensorShuffleProxy(dtype=partition_chunks[0].dtype).new_chunk( + partition_chunks, shape=() + ) + + # stage 4: all *ith* classes are gathered and merged + ( + partition_sort_chunks, + partition_indices_chunks, + sort_info_chunks, + ) = cls.partition_merge_data( + op, need_align, None, partition_chunks, proxy_chunk + ) + + if not need_align: + if return_value: + out_value_chunks.extend(partition_sort_chunks) + if return_indices: + out_indices_chunks.extend(partition_indices_chunks) + else: + ( + align_reduce_value_chunks, + align_reduce_indices_chunks, + ) = cls.align_partitions_data( + op, + out_idx, + in_tensor, + partition_sort_chunks, + partition_indices_chunks, + sort_info_chunks, + ) + if return_value: + out_value_chunks.extend(align_reduce_value_chunks) + if return_indices: + out_indices_chunks.extend(align_reduce_indices_chunks) + + new_op = op.copy() + nsplits = list(in_tensor.nsplits) + if not need_align: + nsplits[op.axis] = (np.nan,) * axis_chunk_shape + kws = [] + if return_value: + kws.append( + { + "shape": out_tensor.shape, + "order": out_tensor.order, + "chunks": out_value_chunks, + "nsplits": nsplits, + "dtype": out_tensor.dtype, + } + ) + if return_indices: + kws.append( + { + "shape": out_tensor.shape, + "order": TensorOrder.C_ORDER, + "chunks": out_indices_chunks, + "nsplits": nsplits, + "dtype": np.dtype(np.int64), + } + ) + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def tile(cls, op): + in_tensor = op.inputs[0] + return_value, return_indices = op.return_value, op.return_indices + + if in_tensor.chunk_shape[op.axis] == 1: + out_chunks, out_indices_chunks = [], [] + for chunk in in_tensor.chunks: + chunk_op = op.copy().reset_key() + kws = [] + if return_value: + kws.append( + { + "shape": chunk.shape, + "index": chunk.index, + "order": chunk.order, + "dtype": chunk.dtype, + "type": "sorted", + } + ) + if return_indices: + kws.append( + { + "shape": chunk.shape, + "index": chunk.index, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "type": "argsort", + } + ) + chunks = chunk_op.new_chunks([chunk], kws=kws) + if return_value: + out_chunks.append(chunks[0]) + if return_indices: + out_indices_chunks.append(chunks[-1]) + + new_op = op.copy() + kws = [out.params for out in op.outputs] + if return_value: + kws[0]["nsplits"] = in_tensor.nsplits + kws[0]["chunks"] = out_chunks + if return_indices: + kws[-1]["nsplits"] = in_tensor.nsplits + kws[-1]["chunks"] = out_indices_chunks + return new_op.new_tensors([in_tensor], kws=kws) + else: + # use parallel sorting by regular sampling + return (yield from cls._tile_psrs(op)) + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + return_value, return_indices = op.return_value, op.return_indices + + with device(device_id): + kw = {} + if op.kind is not None: + kw["kind"] = op.kind + if op.order is not None: + kw["order"] = op.order + + if 
return_indices: + if not return_value: + ctx[op.outputs[0].key] = xp.argsort(a, axis=op.axis, **kw) + else: + indices = ctx[op.outputs[1].key] = xp.argsort(a, axis=op.axis, **kw) + ctx[op.outputs[0].key] = xp.take_along_axis(a, indices, op.axis) + else: + ctx[op.outputs[0].key] = xp.sort(a, axis=op.axis, **kw) + + +_AVAILABLE_KINDS = {"QUICKSORT", "MERGESORT", "HEAPSORT", "STABLE"} + + +def _validate_sort_psrs_kinds(psrs_kinds): + if psrs_kinds is not None: + if isinstance(psrs_kinds, (list, tuple)): + psrs_kinds = list(psrs_kinds) + if len(psrs_kinds) != 3: + raise ValueError("psrs_kinds should have 3 elements") + for i, psrs_kind in enumerate(psrs_kinds): + if psrs_kind is None: + if i < 2: + continue + else: + raise ValueError( + "3rd element of psrs_kinds should be specified" + ) + upper_psrs_kind = psrs_kind.upper() + if upper_psrs_kind not in _AVAILABLE_KINDS: + raise ValueError( + f"{psrs_kind} is an unrecognized kind in psrs_kinds" + ) + else: + raise TypeError("psrs_kinds should be list or tuple") + else: + psrs_kinds = ["quicksort", "mergesort", "mergesort"] + return psrs_kinds + + +def _validate_sort_arguments(a, axis, kind, parallel_kind, psrs_kinds, order): + a = astensor(a) + if axis is None: + a = a.flatten() + axis = 0 + else: + axis = validate_axis(a.ndim, axis) + if kind is not None: + raw_kind = kind + kind = kind.upper() + if kind not in _AVAILABLE_KINDS: + # check kind + raise ValueError(f"{raw_kind} is an unrecognized kind of sort") + if parallel_kind is not None: + raw_parallel_kind = parallel_kind + parallel_kind = parallel_kind.upper() + if parallel_kind not in {"PSRS"}: + raise ValueError( + f"{raw_parallel_kind} is an unrecognized kind of parallel sort" + ) + + order = validate_order(a.dtype, order) + psrs_kinds = _validate_sort_psrs_kinds(psrs_kinds) + return a, axis, kind, parallel_kind, psrs_kinds, order + + +def sort( + a, + axis=-1, + kind=None, + parallel_kind=None, + psrs_kinds=None, + order=None, + return_index=False, + **kw, +): + r""" + Return a sorted copy of a tensor. + + Parameters + ---------- + a : array_like + Tensor to be sorted. + axis : int or None, optional + Axis along which to sort. If None, the tensor is flattened before + sorting. The default is -1, which sorts along the last axis. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Sorting algorithm. The default is 'quicksort'. Note that both 'stable' + and 'mergesort' use timsort or radix sort under the covers and, in general, + the actual implementation will vary with data type. The 'mergesort' option + is retained for backwards compatibility. + Note that this argument would not take effect if `a` has more than + 1 chunk on the sorting axis. + parallel_kind: {'PSRS'}, optional + Parallel sorting algorithm, for the details, refer to: + http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html + psrs_kinds: list with 3 elements, optional + Sorting algorithms during PSRS algorithm. + order : str or list of str, optional + When `a` is a tensor with fields defined, this argument specifies + which fields to compare first, second, etc. A single field can + be specified as a string, and not all fields need be specified, + but unspecified fields will still be used, in the order in which + they come up in the dtype, to break ties. + return_index: bool + Return indices as well if True. + + Returns + ------- + sorted_tensor : Tensor + Tensor of the same type and shape as `a`. + + See Also + -------- + Tensor.sort : Method to sort a tensor in-place. 
+ argsort : Indirect sort. + lexsort : Indirect stable sort on multiple keys. + searchsorted : Find elements in a sorted tensor. + partition : Partial sort. + + Notes + ----- + The various sorting algorithms are characterized by their average speed, + worst case performance, work space size, and whether they are stable. A + stable sort keeps items with the same key in the same relative + order. The four algorithms implemented in NumPy have the following + properties: + + =========== ======= ============= ============ ======== + kind speed worst case work space stable + =========== ======= ============= ============ ======== + 'quicksort' 1 O(n^2) 0 no + 'heapsort' 3 O(n*log(n)) 0 no + 'mergesort' 2 O(n*log(n)) ~n/2 yes + 'timsort' 2 O(n*log(n)) ~n/2 yes + =========== ======= ============= ============ ======== + + .. note:: The datatype determines which of 'mergesort' or 'timsort' + is actually used, even if 'mergesort' is specified. User selection + at a finer scale is not currently available. + + All the sort algorithms make temporary copies of the data when + sorting along any but the last axis. Consequently, sorting along + the last axis is faster and uses less space than sorting along + any other axis. + + The sort order for complex numbers is lexicographic. If both the real + and imaginary parts are non-nan then the order is determined by the + real parts except when they are equal, in which case the order is + determined by the imaginary parts. + + quicksort has been changed to an introsort which will switch + to heapsort when it does not make enough progress. This makes its + worst case O(n*log(n)). + + 'stable' automatically chooses the best stable sorting algorithm + for the data type being sorted. It, along with 'mergesort', is + currently mapped to timsort or radix sort depending on the + data type. API forward compatibility currently limits the + ability to select the implementation and it is hardwired for the different + data types. + + Timsort is added for better performance on already or nearly + sorted data. On random data timsort is almost identical to + mergesort. It is now used for stable sort while quicksort is still the + default sort if none is chosen. For details of timsort, refer to + `CPython listsort.txt <https://github.com/python/cpython/blob/main/Objects/listsort.txt>`_. + 'mergesort' and 'stable' are mapped to radix sort for integer data types. Radix sort is an + O(n) sort instead of O(n log n). + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[1,4],[3,1]]) + >>> mt.sort(a).execute() # sort along the last axis + array([[1, 4], + [1, 3]]) + >>> mt.sort(a, axis=None).execute() # sort the flattened tensor + array([1, 1, 3, 4]) + >>> mt.sort(a, axis=0).execute() # sort along the first axis + array([[1, 1], + [3, 4]]) + + Use the `order` keyword to specify a field to use when sorting a + structured array: + + >>> dtype = [('name', 'S10'), ('height', float), ('age', int)] + >>> values = [('Arthur', 1.8, 41), ('Lancelot', 1.9, 38), + ... 
('Galahad', 1.7, 38)] + >>> a = mt.array(values, dtype=dtype) # create a structured tensor + >>> mt.sort(a, order='height').execute() # doctest: +SKIP + array([('Galahad', 1.7, 38), ('Arthur', 1.8, 41), + ('Lancelot', 1.8999999999999999, 38)], + dtype=[('name', '|S10'), ('height', '<f8'), ('age', '<i4')]) + + >>> mt.sort(a, order=['age', 'height']).execute() # doctest: +SKIP + array([('Galahad', 1.7, 38), ('Lancelot', 1.8999999999999999, 38), + ('Arthur', 1.8, 41)], + dtype=[('name', '|S10'), ('height', '<f8'), ('age', '<i4')]) + + """ + need_align = kw.pop("need_align", None) + if len(kw) > 0: + raise TypeError(f"sort() got an unexpected keyword argument '{next(iter(kw))}'") + a, axis, kind, parallel_kind, psrs_kinds, order = _validate_sort_arguments( + a, axis, kind, parallel_kind, psrs_kinds, order + ) + op = TensorSort( + axis=axis, + kind=kind, + parallel_kind=parallel_kind, + order=order, + psrs_kinds=psrs_kinds, + need_align=need_align, + return_value=True, + return_indices=return_index, + dtype=a.dtype, + gpu=a.op.gpu, + ) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/split.py b/python/xorbits/_mars/tensor/base/split.py new file mode 100644 index 000000000..f6aac57d7 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/split.py @@ -0,0 +1,218 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core import ExecutableTuple, recursive_tile +from ...lib.sparse.core import get_array_module +from ...serialization.serializables import AnyField, Int32Field, KeyField +from ..core import Tensor +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import calc_sliced_size + + +class TensorSplit(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.ARRAY_SPLIT + + _input = KeyField("input") + _indices_or_sections = AnyField("indices_or_sections") + _axis = Int32Field("axis") + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @property + def indices_or_sections(self): + return self._indices_or_sections + + @property + def axis(self): + return getattr(self, "_axis", 0) + + @property + def output_limit(self): + return float("inf") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(self._inputs) > 1: + self._indices_or_sections = self._inputs[1] + + def __call__(self, a, indices_or_sections, is_split=False): + axis = self._axis + size = a.shape[axis] + if np.isnan(size): + raise ValueError( + "cannot split array with unknown shape, " + "call `.execute()` on input tensor first" + ) + + if ( + isinstance(indices_or_sections, Tensor) + and hasattr(indices_or_sections.op, "data") + and indices_or_sections.op.data is not None + ): + indices_or_sections = indices_or_sections.op.data + + try: + indices_or_sections = int(indices_or_sections) + if is_split: + if size % indices_or_sections: + raise ValueError( + "tensor split does not result in an equal division" + ) + nparts = indices_or_sections + nsplit = (size // indices_or_sections,) * nparts + else: + nparts = indices_or_sections + if size % indices_or_sections == 0: + nsplit = (size // indices_or_sections,) * nparts + else: + nsplit = (size // indices_or_sections + 1,) * ( + size % indices_or_sections + ) + (size // indices_or_sections,) * ( + size - size % indices_or_sections + ) + except TypeError: + if isinstance(indices_or_sections, Tensor): + nparts = indices_or_sections.shape[0] + 1 + nsplit = (np.nan,) * nparts + else: + ind = indices_or_sections = get_array_module( + indices_or_sections + ).asarray(indices_or_sections) + if indices_or_sections.ndim != 1 or not np.issubdtype( + indices_or_sections.dtype, np.integer + ): + raise TypeError("slice indices must be integers or None") + nparts = indices_or_sections.shape[0] + 1 + get = lambda i: None if i < 0 or i >= len(ind) else ind[i] + nsplit = [ + calc_sliced_size(size, slice(get(j - 1), get(j))) + for j in range(nparts) + ] + + inputs = [a] + if isinstance(indices_or_sections, Tensor): + inputs.append(indices_or_sections) + else: + self._indices_or_sections = indices_or_sections + + kws = [ + { + "i": i, + "shape": a.shape[:axis] + (nsplit[i],) + a.shape[axis + 1 :], + "order": a.order, + } + for i in range(nparts) + ] + return ExecutableTuple(self.new_tensors(inputs, kws=kws, output_limit=nparts)) + + @classmethod + def tile(cls, op): + in_tensor = op.input + splits = op.outputs + axis = op.axis + + acc_shapes = np.cumsum([s.shape[axis] for s in splits]) + out_kws = [dict() for _ in splits] + for i, split in enumerate(splits): + slc = slice(0 if i == 0 else acc_shapes[i - 1], acc_shapes[i]) + new_s = yield from recursive_tile(in_tensor[(slice(None),) * axis + (slc,)]) + out_kws[i]["chunks"] = new_s.chunks + out_kws[i]["nsplits"] = new_s.nsplits + out_kws[i]["shape"] = split.shape + 
out_kws[i]["order"] = op.outputs[i].order + + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=out_kws, output_limit=len(out_kws)) + + +def _split(a, indices_or_sections, axis=0, is_split=False): + op = TensorSplit(axis=axis, dtype=a.dtype) + return op(a, indices_or_sections, is_split=is_split) + + +def split(ary, indices_or_sections, axis=0): + """ + Split a tensor into multiple sub-tensors. + + Parameters + ---------- + ary : Tensor + Tensor to be divided into sub-tensors. + indices_or_sections : int or 1-D tensor + If `indices_or_sections` is an integer, N, the array will be divided + into N equal tensors along `axis`. If such a split is not possible, + an error is raised. + + If `indices_or_sections` is a 1-D tensor of sorted integers, the entries + indicate where along `axis` the array is split. For example, + ``[2, 3]`` would, for ``axis=0``, result in + + - ary[:2] + - ary[2:3] + - ary[3:] + + If an index exceeds the dimension of the tensor along `axis`, + an empty sub-tensor is returned correspondingly. + axis : int, optional + The axis along which to split, default is 0. + + Returns + ------- + sub-tensors : list of Tensors + A list of sub-tensors. + + Raises + ------ + ValueError + If `indices_or_sections` is given as an integer, but + a split does not result in equal division. + + See Also + -------- + array_split : Split a tensor into multiple sub-tensors of equal or + near-equal size. Does not raise an exception if + an equal division cannot be made. + hsplit : Split into multiple sub-arrays horizontally (column-wise). + vsplit : Split tensor into multiple sub-tensors vertically (row wise). + dsplit : Split tensor into multiple sub-tensors along the 3rd axis (depth). + concatenate : Join a sequence of tensors along an existing axis. + stack : Join a sequence of tensors along a new axis. + hstack : Stack tensors in sequence horizontally (column wise). + vstack : Stack tensors in sequence vertically (row wise). + dstack : Stack tensors in sequence depth wise (along third dimension). + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(9.0) + >>> mt.split(x, 3).execute() + [array([ 0., 1., 2.]), array([ 3., 4., 5.]), array([ 6., 7., 8.])] + + >>> x = mt.arange(8.0) + >>> mt.split(x, [3, 5, 6, 10]).execute() + [array([ 0., 1., 2.]), + array([ 3., 4.]), + array([ 5.]), + array([ 6., 7.]), + array([], dtype=float64)] + + """ + return _split(astensor(ary), indices_or_sections, axis=axis, is_split=True) diff --git a/python/xorbits/_mars/tensor/base/squeeze.py b/python/xorbits/_mars/tensor/base/squeeze.py new file mode 100644 index 000000000..3bdf1476e --- /dev/null +++ b/python/xorbits/_mars/tensor/base/squeeze.py @@ -0,0 +1,163 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, KeyField, TupleField +from ..array_utils import as_same_device, device +from ..operands import TensorHasInput, TensorOperandMixin + + +def _get_squeeze_shape(shape, axis): + if axis is not None: + if isinstance(axis, Iterable): + axis = tuple(axis) + else: + axis = (axis,) + + for ax in axis: + if shape[ax] != 1: + raise ValueError( + "cannot select an axis to squeeze out " + "which has size not equal to one" + ) + shape = tuple(s for i, s in enumerate(shape) if i not in axis) + else: + axis = tuple(i for i, s in enumerate(shape) if s == 1) + shape = tuple(s for s in shape if s != 1) + + return shape, axis + + +class TensorSqueeze(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.SQUEEZE + + _input = KeyField("input") + _axis = TupleField("axis", FieldTypes.int32) + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, create_view=True, **kw) + + def on_output_modify(self, new_output): + slcs = [slice(None)] * new_output.ndim + for axis in self._axis: + slcs.insert(axis, None) + return new_output[slcs] + + def on_input_modify(self, new_input): + op = self.copy().reset_key() + return op(new_input, self.outputs[0].shape) + + @property + def axis(self): + return self._axis + + def __call__(self, a, shape): + return self.new_tensor([a], shape, order=a.order) + + @classmethod + def tile(cls, op): + in_tensor = op.input + out_tensor = op.outputs[0] + axis_set = set(op.axis) + + out_chunks = [] + for c in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk_shape = _get_squeeze_shape(c.shape, op.axis)[0] + chunk_idx = tuple(idx for i, idx in enumerate(c.index) if i not in axis_set) + out_chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=chunk_idx, order=out_tensor.order + ) + out_chunks.append(out_chunk) + nsplits = [ + nsplit for i, nsplit in enumerate(in_tensor.nsplits) if i not in axis_set + ] + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + op.outputs[0].shape, + order=out_tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.squeeze(a, axis=op.axis) + + +def squeeze(a, axis=None): + """ + Remove single-dimensional entries from the shape of a tensor. + + Parameters + ---------- + a : array_like + Input data. + axis : None or int or tuple of ints, optional + Selects a subset of the single-dimensional entries in the + shape. If an axis is selected with shape entry greater than + one, an error is raised. + + Returns + ------- + squeezed : Tensor + The input tensor, but with all or a subset of the + dimensions of length 1 removed. This is always `a` itself + or a view into `a`. + + Raises + ------ + ValueError + If `axis` is not `None`, and an axis being squeezed is not of length 1 + + See Also + -------- + expand_dims : The inverse operation, adding singleton dimensions + reshape : Insert, remove, and combine dimensions, and resize existing ones + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([[[0], [1], [2]]]) + >>> x.shape + (1, 3, 1) + >>> mt.squeeze(x).shape + (3,) + >>> mt.squeeze(x, axis=0).shape + (3, 1) + >>> mt.squeeze(x, axis=1).shape + Traceback (most recent call last): + ... 
+ ValueError: cannot select an axis to squeeze out which has size not equal to one + >>> mt.squeeze(x, axis=2).shape + (1, 3) + + """ + shape, axis = _get_squeeze_shape(a.shape, axis) + + if 1 not in a.shape: + return a + + op = TensorSqueeze(axis=axis, dtype=a.dtype, sparse=a.issparse()) + return op(a, shape) diff --git a/python/xorbits/_mars/tensor/base/swapaxes.py b/python/xorbits/_mars/tensor/base/swapaxes.py new file mode 100644 index 000000000..1e3178732 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/swapaxes.py @@ -0,0 +1,163 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...serialization.serializables import Int32Field, KeyField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import reverse_order, validate_axis + + +def _swap(it, axis1, axis2): + new_it = list(it) + new_it[axis1], new_it[axis2] = it[axis2], it[axis1] + + return tuple(new_it) + + +class TensorSwapAxes(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.SWAPAXES + + _input = KeyField("input") + _axis1 = Int32Field("axis1") + _axis2 = Int32Field("axis2") + + def __init__(self, axis1=None, axis2=None, **kw): + super().__init__(_axis1=axis1, _axis2=axis2, create_view=True, **kw) + + @property + def axis1(self): + return self._axis1 + + @property + def axis2(self): + return self._axis2 + + def __call__(self, a): + axis1, axis2 = self._axis1, self._axis2 + if (axis1 == 0 and axis2 == a.ndim - 1) or (axis1 == a.ndim - 1 and axis2 == 0): + tensor_order = reverse_order(a.order) + else: + tensor_order = TensorOrder.C_ORDER + shape = _swap(a.shape, self.axis1, self.axis2) + return self.new_tensor([a], shape, order=tensor_order) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def on_output_modify(self, new_output): + op = TensorSwapAxes( + axis1=self._axis2, + axis2=self._axis1, + dtype=new_output.dtype, + sparse=new_output.issparse(), + ) + return op(new_output) + + def on_input_modify(self, new_input): + op = self.copy().reset_key() + return op(new_input) + + @classmethod + def tile(cls, op): + axis1, axis2 = op.axis1, op.axis2 + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + + out_chunks = [] + for c in in_tensor.chunks: + chunk_shape = _swap(c.shape, axis1, axis2) + chunk_idx = _swap(c.index, axis1, axis2) + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=chunk_idx, order=out_tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + nsplits = _swap(in_tensor.nsplits, axis1, axis2) + return new_op.new_tensors( + [in_tensor], + out_tensor.shape, + order=out_tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, 
ret_extra=True + ) + + axis1, axis2 = op.axis1, op.axis2 + with device(device_id): + ctx[op.outputs[0].key] = xp.swapaxes(x, axis1, axis2) + + +def swapaxes(a, axis1, axis2): + """ + Interchange two axes of a tensor. + + Parameters + ---------- + a : array_like + Input tensor. + axis1 : int + First axis. + axis2 : int + Second axis. + + Returns + ------- + a_swapped : Tensor + If `a` is a Tensor, then a view of `a` is + returned; otherwise a new tensor is created. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([[1,2,3]]) + >>> mt.swapaxes(x,0,1).execute() + array([[1], + [2], + [3]]) + + >>> x = mt.array([[[0,1],[2,3]],[[4,5],[6,7]]]) + >>> x.execute() + array([[[0, 1], + [2, 3]], + [[4, 5], + [6, 7]]]) + + >>> mt.swapaxes(x,0,2).execute() + array([[[0, 4], + [2, 6]], + [[1, 5], + [3, 7]]]) + + """ + a = astensor(a) + axis1 = validate_axis(a.ndim, axis1) + axis2 = validate_axis(a.ndim, axis2) + + if axis1 == axis2: + return a + + op = TensorSwapAxes(axis1, axis2, dtype=a.dtype, sparse=a.issparse()) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/tests/__init__.py b/python/xorbits/_mars/tensor/base/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/base/tests/test_base.py b/python/xorbits/_mars/tensor/base/tests/test_base.py new file mode 100644 index 000000000..f41f595ae --- /dev/null +++ b/python/xorbits/_mars/tensor/base/tests/test_base.py @@ -0,0 +1,797 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ....core import tile +from ....core.operand import OperandStage +from ...datasource import arange, ones, tensor +from .. 
import ( + TensorCopyTo, + argwhere, + array_split, + atleast_1d, + atleast_2d, + atleast_3d, + broadcast_to, + copyto, + isin, + moveaxis, + partition, + ravel, + repeat, + result_type, + searchsorted, + sort, + split, + squeeze, + to_cpu, + to_gpu, + topk, + transpose, + unique, + where, +) + + +def test_dir(): + a = tensor([0, 1, 2], chunk_size=2) + tensor_dir = dir(a) + for attr in dir(a.data): + assert attr in tensor_dir + + +def test_copyto(): + a = ones((10, 20), chunk_size=3) + b = ones(10, chunk_size=4) + + with pytest.raises(ValueError): + copyto(a, b) + + tp = type(a.op) + b = ones(20, chunk_size=4) + copyto(a, b) + + assert isinstance(a.op, TensorCopyTo) + assert a.inputs[0] is b.data + assert isinstance(a.inputs[1].op, tp) + + a = tile(a) + + assert isinstance(a.chunks[0].op, TensorCopyTo) + assert len(a.chunks[0].inputs) == 2 + + a = ones((10, 20), chunk_size=3, dtype="i4") + b = ones(20, chunk_size=4, dtype="f8") + + with pytest.raises(TypeError): + copyto(a, b) + + b = ones(20, chunk_size=4, dtype="i4") + copyto(a, b, where=b > 0) + + assert a.op.where is not None + + a = tile(a) + + assert isinstance(a.chunks[0].op, TensorCopyTo) + assert len(a.chunks[0].inputs) == 3 + + with pytest.raises(ValueError): + copyto(a, a, where=np.ones(30, dtype="?")) + + +def test_astype(): + arr = ones((10, 20, 30), chunk_size=3) + + arr2 = arr.astype(np.int32) + arr2 = tile(arr2) + + assert arr2.shape == (10, 20, 30) + assert np.issubdtype(arr2.dtype, np.int32) is True + assert arr2.op.casting == "unsafe" + + with pytest.raises(TypeError): + arr.astype(np.int32, casting="safe") + + arr3 = arr.astype(arr.dtype, order="F") + assert arr3.flags["F_CONTIGUOUS"] is True + assert arr3.flags["C_CONTIGUOUS"] is False + + arr3 = tile(arr3) + + assert arr3.chunks[0].order.value == "F" + + +def test_transpose(): + arr = ones((10, 20, 30), chunk_size=[4, 3, 5]) + + arr2 = transpose(arr) + arr2 = tile(arr2) + + assert arr2.shape == (30, 20, 10) + assert len(arr2.chunks) == 126 + assert arr2.chunks[0].shape == (5, 3, 4) + assert arr2.chunks[-1].shape == (5, 2, 2) + + with pytest.raises(ValueError): + transpose(arr, axes=(1, 0)) + + arr3 = transpose(arr, (-2, 2, 0)) + arr3 = tile(arr3) + + assert arr3.shape == (20, 30, 10) + assert len(arr3.chunks) == 126 + assert arr3.chunks[0].shape == (3, 5, 4) + assert arr3.chunks[-1].shape == (2, 5, 2) + + arr4 = arr.transpose(-2, 2, 0) + arr4 = tile(arr4) + + assert arr4.shape == (20, 30, 10) + assert len(arr4.chunks) == 126 + assert arr4.chunks[0].shape == (3, 5, 4) + assert arr4.chunks[-1].shape == (2, 5, 2) + + arr5 = arr.T + arr5 = tile(arr5) + + assert arr5.shape == (30, 20, 10) + assert len(arr5.chunks) == 126 + assert arr5.chunks[0].shape == (5, 3, 4) + assert arr5.chunks[-1].shape == (5, 2, 2) + + +def test_swapaxes(): + arr = ones((10, 20, 30), chunk_size=[4, 3, 5]) + arr2 = arr.swapaxes(0, 1) + arr, arr2 = tile(arr, arr2) + + assert arr2.shape == (20, 10, 30) + assert len(arr.chunks) == len(arr2.chunks) + + +def test_broadcast_to(): + arr = ones((10, 5), chunk_size=2) + arr2 = broadcast_to(arr, (20, 10, 5)) + arr, arr2 = tile(arr, arr2) + + assert arr2.shape == (20, 10, 5) + assert len(arr2.chunks) == len(arr.chunks) + assert arr2.chunks[0].shape == (20, 2, 2) + + arr = ones((10, 5, 1), chunk_size=2) + arr3 = broadcast_to(arr, (5, 10, 5, 6)) + arr, arr3 = tile(arr, arr3) + + assert arr3.shape == (5, 10, 5, 6) + assert len(arr3.chunks) == len(arr.chunks) + assert arr3.nsplits == ((5,), (2, 2, 2, 2, 2), (2, 2, 1), (6,)) + assert arr3.chunks[0].shape == (5, 2, 
2, 6) + + arr = ones((10, 1), chunk_size=2) + arr4 = broadcast_to(arr, (20, 10, 5)) + arr, arr4 = tile(arr, arr4) + + assert arr4.shape == (20, 10, 5) + assert len(arr4.chunks) == len(arr.chunks) + assert arr4.chunks[0].shape == (20, 2, 5) + + with pytest.raises(ValueError): + broadcast_to(arr, (10,)) + + with pytest.raises(ValueError): + broadcast_to(arr, (5, 1)) + + arr = ones((4, 5), chunk_size=2) + with pytest.raises((ValueError)): + broadcast_to(arr[arr < 2], (3, 20)) + + +def test_where(): + cond = tensor([[True, False], [False, True]], chunk_size=1) + x = tensor([1, 2], chunk_size=1) + y = tensor([3, 4], chunk_size=1) + + arr = where(cond, x, y) + arr = tile(arr) + + assert len(arr.chunks) == 4 + np.testing.assert_equal(arr.chunks[0].inputs[0].op.data, [[True]]) + np.testing.assert_equal(arr.chunks[0].inputs[1].op.data, [1]) + np.testing.assert_equal(arr.chunks[0].inputs[2].op.data, [3]) + np.testing.assert_equal(arr.chunks[1].inputs[0].op.data, [[False]]) + np.testing.assert_equal(arr.chunks[1].inputs[1].op.data, [2]) + np.testing.assert_equal(arr.chunks[1].inputs[2].op.data, [4]) + np.testing.assert_equal(arr.chunks[2].inputs[0].op.data, [[False]]) + np.testing.assert_equal(arr.chunks[2].inputs[1].op.data, [1]) + np.testing.assert_equal(arr.chunks[2].inputs[2].op.data, [3]) + np.testing.assert_equal(arr.chunks[3].inputs[0].op.data, [[True]]) + np.testing.assert_equal(arr.chunks[3].inputs[1].op.data, [2]) + np.testing.assert_equal(arr.chunks[3].inputs[2].op.data, [4]) + + with pytest.raises(ValueError): + where(cond, x) + + x = arange(9.0).reshape(3, 3) + y = where(x < 5, x, -1) + + assert y.dtype == np.float64 + + +def test_argwhere(): + cond = tensor([[True, False], [False, True]], chunk_size=1) + indices = argwhere(cond) + + assert np.isnan(indices.shape[0]) + assert indices.shape[1] == 2 + + indices = tile(indices) + + assert indices.nsplits[1] == (1, 1) + + +def test_argwhere_order(): + data = np.asfortranarray([[True, False], [False, True]]) + cond = tensor(data, chunk_size=1) + indices = argwhere(cond) + + assert indices.flags["F_CONTIGUOUS"] is True + assert indices.flags["C_CONTIGUOUS"] is False + + indices = tile(indices) + + assert indices.chunks[0].order.value == "F" + + +def test_array_split(): + a = arange(8, chunk_size=2) + + splits = array_split(a, 3) + assert len(splits) == 3 + assert [s.shape[0] for s in splits] == [3, 3, 2] + + splits = tile(*splits) + assert splits[0].nsplits == ((2, 1),) + assert splits[1].nsplits == ((1, 2),) + assert splits[2].nsplits == ((2,),) + + a = arange(7, chunk_size=2) + + splits = array_split(a, 3) + assert len(splits) == 3 + assert [s.shape[0] for s in splits] == [3, 2, 2] + + splits = tile(*splits) + assert splits[0].nsplits == ((2, 1),) + assert splits[1].nsplits == ((1, 1),) + assert splits[2].nsplits == ((1, 1),) + + +def test_split(): + a = arange(9, chunk_size=2) + + splits = split(a, 3) + assert len(splits) == 3 + assert all(s.shape == (3,) for s in splits) is True + + splits = tile(*splits) + assert splits[0].nsplits == ((2, 1),) + assert splits[1].nsplits == ((1, 2),) + assert splits[2].nsplits == ((2, 1),) + + a = arange(8, chunk_size=2) + + splits = split(a, [3, 5, 6, 10]) + assert len(splits) == 5 + assert splits[0].shape == (3,) + assert splits[1].shape == (2,) + assert splits[2].shape == (1,) + assert splits[3].shape == (2,) + assert splits[4].shape == (0,) + + splits = tile(*splits) + assert splits[0].nsplits == ((2, 1),) + assert splits[1].nsplits == ((1, 1),) + assert splits[2].nsplits == ((1,),) + assert 
splits[3].nsplits == ((2,),) + assert splits[4].nsplits == ((0,),) + + a = tensor(np.asfortranarray(np.random.rand(9, 10)), chunk_size=4) + splits = split(a, 3) + assert splits[0].flags["F_CONTIGUOUS"] is True + assert splits[0].flags["C_CONTIGUOUS"] is False + assert splits[1].flags["F_CONTIGUOUS"] is True + assert splits[0].flags["C_CONTIGUOUS"] is False + assert splits[2].flags["F_CONTIGUOUS"] is True + assert splits[0].flags["C_CONTIGUOUS"] is False + + for a in ((1, 1, 1, 2, 2, 3), [1, 1, 1, 2, 2, 3]): + splits = split(a, (3, 5)) + assert len(splits) == 3 + + +def test_squeeze(): + data = np.array([[[0], [1], [2]]]) + x = tensor(data) + + t = squeeze(x) + assert t.shape == (3,) + assert t.dtype is not None + + t = squeeze(x, axis=0) + assert t.shape == (3, 1) + + with pytest.raises(ValueError): + squeeze(x, axis=1) + + t = squeeze(x, axis=2) + assert t.shape == (1, 3) + + +def test_result_type(): + x = tensor([2, 3], dtype="i4") + y = 3 + z = np.array([3, 4], dtype="f4") + + r = result_type(x, y, z) + e = np.result_type(x.dtype, y, z) + assert r == e + + +def test_repeat(): + a = arange(10, chunk_size=2).reshape(2, 5) + + t = repeat(a, 3) + assert t.shape == (30,) + + t = repeat(a, 3, axis=0) + assert t.shape == (6, 5) + + t = repeat(a, 3, axis=1) + assert t.shape == (2, 15) + + t = repeat(a, [3], axis=1) + assert t.shape == (2, 15) + + t = repeat(a, [3, 4], axis=0) + assert t.shape == (7, 5) + + with pytest.raises(ValueError): + repeat(a, [3, 4], axis=1) + + a = tensor(np.random.randn(10), chunk_size=5) + + t = repeat(a, 3) + t = tile(t) + assert sum(t.nsplits[0]) == 30 + + a = tensor(np.random.randn(100), chunk_size=10) + + t = repeat(a, 3) + t = tile(t) + assert sum(t.nsplits[0]) == 300 + + a = tensor(np.random.randn(4)) + b = tensor((4,)) + + t = repeat(a, b) + + t = tile(t) + assert np.isnan(t.nsplits[0]) + + +def test_isin(): + element = 2 * arange(4, chunk_size=1).reshape(2, 2) + test_elements = [1, 2, 4, 8] + + mask = isin(element, test_elements) + assert mask.shape == (2, 2) + assert mask.dtype == np.bool_ + + mask, element = tile(mask, element) + + assert len(mask.chunks) == len(element.chunks) + assert len(mask.op.inputs[1].chunks) == 1 + assert mask.chunks[0].inputs[0] is element.chunks[0].data + + element = 2 * arange(4, chunk_size=1).reshape(2, 2) + test_elements = tensor([1, 2, 4, 8], chunk_size=2) + + mask = isin(element, test_elements, invert=True) + assert mask.shape == (2, 2) + assert mask.dtype == np.bool_ + + +def test_create_view(): + arr = ones((10, 20, 30), chunk_size=[4, 3, 5]) + arr2 = transpose(arr) + assert arr2.op.create_view is True + + arr3 = transpose(arr) + assert arr3.op.create_view is True + + arr4 = arr.swapaxes(0, 1) + assert arr4.op.create_view is True + + arr5 = moveaxis(arr, 1, 0) + assert arr5.op.create_view is True + + arr6 = atleast_1d(1) + assert arr6.op.create_view is True + + arr7 = atleast_2d([1, 1]) + assert arr7.op.create_view is True + + arr8 = atleast_3d([1, 1]) + assert arr8.op.create_view is True + + arr9 = arr[:3, [1, 2, 3]] + # no view cuz of fancy indexing + assert arr9.op.create_view is False + + arr9[0][0][0] = 100 + assert arr9.op.create_view is False + + arr10 = arr[:3, None, :5] + assert arr10.op.create_view is True + + arr10[0][0][0] = 100 + assert arr10.op.create_view is False + + data = np.array([[[0], [1], [2]]]) + x = tensor(data) + + t = squeeze(x) + assert t.op.create_view is True + + y = x.reshape(3) + assert y.op.create_view is True + + +def test_ravel(): + arr = ones((10, 5), chunk_size=2) + flat_arr = ravel(arr) + 
assert flat_arr.shape == (50,) + + +def test_searchsorted(): + raw = np.sort(np.random.randint(100, size=(16,))) + arr = tensor(raw, chunk_size=3).cumsum() + + t1 = searchsorted(arr, 10) + + assert t1.shape == () + assert ( + t1.flags["C_CONTIGUOUS"] + == np.searchsorted(raw.cumsum(), 10).flags["C_CONTIGUOUS"] + ) + assert ( + t1.flags["F_CONTIGUOUS"] + == np.searchsorted(raw.cumsum(), 10).flags["F_CONTIGUOUS"] + ) + + t1 = tile(t1) + + assert t1.nsplits == () + assert len(t1.chunks) == 1 + assert t1.chunks[0].op.stage == OperandStage.agg + + with pytest.raises(ValueError): + searchsorted(np.random.randint(10, size=(14, 14)), 1) + + with pytest.raises(ValueError): + searchsorted(arr, 10, side="both") + + with pytest.raises(ValueError): + searchsorted(arr.tosparse(), 10) + + raw2 = np.asfortranarray(np.sort(np.random.randint(100, size=(16,)))) + arr = tensor(raw2, chunk_size=3) + to_search = np.asfortranarray([[1, 2], [3, 4]]) + + t1 = searchsorted(arr, to_search) + expected = np.searchsorted(raw2, to_search) + + assert t1.shape == to_search.shape + assert t1.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert t1.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_to_gpu(): + x = tensor(np.random.rand(10, 10), chunk_size=3) + + gx = to_gpu(x) + + assert gx.dtype == x.dtype + assert gx.order == x.order + assert gx.op.gpu is True + + gx, x = tile(gx, x) + + assert gx.chunks[0].dtype == x.chunks[0].dtype + assert gx.chunks[0].order == x.chunks[0].order + assert gx.chunks[0].op.gpu is True + + +def test_to_cpu(): + x = tensor(np.random.rand(10, 10), chunk_size=3, gpu=True) + + cx = to_cpu(x) + + assert cx.dtype == x.dtype + assert cx.order == x.order + assert cx.op.gpu is False + + cx, x = tile(cx, x) + + assert cx.chunks[0].dtype == x.chunks[0].dtype + assert cx.chunks[0].order == x.chunks[0].order + assert cx.chunks[0].op.gpu is False + + +def test_unique(): + x = unique(np.int64(1)) + + assert len(x.shape) == 1 + assert np.isnan(x.shape[0]) + assert x.dtype == np.dtype(np.int64) + + x = tile(x) + + assert len(x.chunks) == 1 + assert len(x.chunks[0].shape) == 1 + assert np.isnan(x.chunks[0].shape[0]) + assert x.chunks[0].dtype == np.dtype(np.int64) + + x, indices = unique(0.1, return_index=True) + + assert len(x.shape) == 1 + assert np.isnan(x.shape[0]) + assert x.dtype == np.dtype(np.float64) + assert len(indices.shape) == 1 + assert np.isnan(indices.shape[0]) + assert indices.dtype == np.dtype(np.intp) + + x, indices = tile(x, indices) + + assert len(x.chunks) == 1 + assert len(x.chunks[0].shape) == 1 + assert np.isnan(x.chunks[0].shape[0]) + assert x.chunks[0].dtype == np.dtype(np.float64) + assert len(indices.chunks) == 1 + assert len(indices.chunks[0].shape) == 1 + assert np.isnan(indices.chunks[0].shape[0]) + assert indices.chunks[0].dtype == np.dtype(np.intp) + + with pytest.raises(np.AxisError): + unique(0.1, axis=1) + + raw = np.random.randint(10, size=(10), dtype=np.int64) + a = tensor(raw, chunk_size=4) + + x = unique(a, aggregate_size=2) + + assert len(x.shape) == len(raw.shape) + assert np.isnan(x.shape[0]) + assert x.dtype == np.dtype(np.int64) + + x = tile(x) + + assert len(x.chunks) == 2 + assert x.nsplits == ((np.nan, np.nan),) + for i in range(2): + assert x.chunks[i].shape == (np.nan,) + assert x.chunks[i].dtype == raw.dtype + + raw = np.random.randint(10, size=(10, 20), dtype=np.int64) + a = tensor(raw, chunk_size=(4, 6)) + + x, indices, inverse, counts = unique( + a, + axis=1, + aggregate_size=2, + return_index=True, + return_inverse=True, + 
return_counts=True, + ) + + assert x.shape == (10, np.nan) + assert x.dtype == np.dtype(np.int64) + assert indices.shape == (np.nan,) + assert indices.dtype == np.dtype(np.intp) + assert inverse.shape == (20,) + assert inverse.dtype == np.dtype(np.intp) + assert counts.shape == (np.nan,) + assert counts.dtype == np.dtype(np.int_) + + x, indices, inverse, counts = tile(x, indices, inverse, counts) + + assert len(x.chunks) == 2 + assert x.nsplits == ((10,), (np.nan, np.nan)) + for i in range(2): + assert x.chunks[i].shape == (10, np.nan) + assert x.chunks[i].dtype == raw.dtype + assert x.chunks[i].index == (0, i) + + assert len(indices.chunks) == 2 + assert indices.nsplits == ((np.nan, np.nan),) + for i in range(2): + assert indices.chunks[i].shape == (np.nan,) + assert indices.chunks[i].dtype == raw.dtype + assert indices.chunks[i].index == (i,) + + assert len(inverse.chunks) == 4 + assert inverse.nsplits == ((6, 6, 6, 2),) + for i in range(4): + assert inverse.chunks[i].shape == ((6, 6, 6, 2)[i],) + assert inverse.chunks[i].dtype == np.dtype(np.int64) + assert inverse.chunks[i].index == (i,) + + assert len(counts.chunks) == 2 + assert counts.nsplits == ((np.nan, np.nan),) + for i in range(2): + assert counts.chunks[i].shape == (np.nan,) + assert counts.chunks[i].dtype == np.dtype(np.int_) + assert counts.chunks[i].index == (i,) + + +def test_sort(): + a = tensor(np.random.rand(10, 10), chunk_size=(5, 10)) + + sa = sort(a) + assert type(sa.op).__name__ == "TensorSort" + + sa = tile(sa) + + assert len(sa.chunks) == 2 + for c in sa.chunks: + assert type(c.op).__name__ == "TensorSort" + assert type(c.inputs[0].op).__name__ == "ArrayDataSource" + + a = tensor(np.random.rand(100), chunk_size=(10)) + + sa = sort(a) + assert type(sa.op).__name__ == "TensorSort" + + sa = tile(sa) + + for c in sa.chunks: + assert type(c.op).__name__ == "PSRSShuffle" + assert c.op.stage == OperandStage.reduce + assert c.shape == (np.nan,) + + a = tensor( + np.empty((10, 10), dtype=[("id", np.int32), ("size", np.int64)]), + chunk_size=(10, 5), + ) + sa = sort(a) + assert sa.op.order == ["id", "size"] + + with pytest.raises(ValueError): + sort(a, order=["unknown_field"]) + + with pytest.raises(np.AxisError): + sort(np.random.rand(100), axis=1) + + with pytest.raises(ValueError): + sort(np.random.rand(100), kind="non_valid_kind") + + with pytest.raises(ValueError): + sort(np.random.rand(100), parallel_kind="non_valid_parallel_kind") + + with pytest.raises(TypeError): + sort(np.random.rand(100), psrs_kinds="non_valid_psrs_kinds") + + with pytest.raises(ValueError): + sort(np.random.rand(100), psrs_kinds=["quicksort"] * 2) + + with pytest.raises(ValueError): + sort(np.random.rand(100), psrs_kinds=["non_valid_kind"] * 3) + + with pytest.raises(ValueError): + sort(np.random.rand(100), psrs_kinds=[None, None, None]) + + with pytest.raises(ValueError): + sort(np.random.rand(100), psrs_kinds=["quicksort", "mergesort", None]) + + +def test_partition(): + a = tensor(np.random.rand(10, 10), chunk_size=(5, 10)) + + pa = partition(a, [4, 9]) + assert type(pa.op).__name__ == "TensorPartition" + + pa = tile(pa) + + assert len(pa.chunks) == 2 + for c in pa.chunks: + assert type(c.op).__name__ == "TensorPartition" + assert type(c.inputs[0].op).__name__ == "ArrayDataSource" + + a = tensor(np.random.rand(100), chunk_size=(10)) + + pa = partition(a, 4) + assert type(pa.op).__name__ == "TensorPartition" + + pa = tile(pa) + + for c in pa.chunks: + assert type(c.op).__name__ == "PartitionMerged" + assert c.shape == (np.nan,) + + a = tensor( 
+ np.empty((10, 10), dtype=[("id", np.int32), ("size", np.int64)]), + chunk_size=(10, 5), + ) + pa = partition(a, 3) + assert pa.op.order == ["id", "size"] + + with pytest.raises(ValueError): + partition(a, 4, order=["unknown_field"]) + + with pytest.raises(np.AxisError): + partition(np.random.rand(100), 4, axis=1) + + with pytest.raises(ValueError): + partition(np.random.rand(100), 4, kind="non_valid_kind") + + with pytest.raises(ValueError): + partition(np.random.rand(10), 10) + + with pytest.raises(TypeError): + partition(np.random.rand(10), tensor([1.0, 2.0])) + + with pytest.raises(ValueError): + partition(np.random.rand(10), tensor([[1, 2]])) + + with pytest.raises(ValueError): + partition(np.random.rand(10), [-11, 2]) + + +def test_topk(): + raw = np.random.rand(20) + a = tensor(raw, chunk_size=10) + + t = topk(a, 2) + t = tile(t) + assert t.op.parallel_kind == "tree" + + t = topk(a, 3) + t = tile(t) + assert t.op.parallel_kind == "psrs" + + t = topk(sort(a), 3) + t = tile(t) + # k is less than 100 + assert t.op.parallel_kind == "tree" + + with pytest.raises(ValueError): + topk(a, 3, parallel_kind="unknown") + + +def test_map_chunk(): + raw = np.random.rand(20) + a = tensor(raw, chunk_size=10) + + mapped = tile(a.map_chunk(lambda x: x * 0.5)) + assert np.issubdtype(mapped.dtype, np.floating) is True + assert mapped.shape == (np.nan,) + assert len(mapped.chunks) == 2 + + mapped = tile(a.map_chunk(lambda x: x * 0.5, elementwise=True)) + assert np.issubdtype(mapped.dtype, np.floating) is True + assert mapped.shape == (20,) + assert len(mapped.chunks) == 2 diff --git a/python/xorbits/_mars/tensor/base/tests/test_base_execution.py b/python/xorbits/_mars/tensor/base/tests/test_base_execution.py new file mode 100644 index 000000000..78092cf63 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/tests/test_base_execution.py @@ -0,0 +1,1989 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest +import scipy.sparse as sps + +from .... import dataframe as md +from .... import execute, fetch +from .... import tensor as mt +from ....tests.core import require_cupy +from ...datasource import arange, ones, tensor, zeros +from .. 
import ( + argpartition, + argsort, + argtopk, + argwhere, + array_split, + atleast_1d, + atleast_2d, + atleast_3d, + broadcast_arrays, + broadcast_to, + copyto, + diff, + dsplit, + ediff1d, + expand_dims, + flip, + fliplr, + flipud, + hsplit, + isin, + moveaxis, + partition, + repeat, + roll, + rollaxis, + searchsorted, + shape, + sort, + split, + squeeze, + swapaxes, + tile, + to_cpu, + to_gpu, + topk, + transpose, + trapz, + unique, + vsplit, + where, +) + + +def test_rechunk_execution(setup): + raw = np.random.RandomState(0).random((11, 8)) + arr = tensor(raw, chunk_size=3) + arr2 = arr.rechunk(4) + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw) + + +def test_copyto_execution(setup): + a = ones((2, 3), chunk_size=1) + b = tensor([3, -1, 3], chunk_size=2) + + copyto(a, b, where=b > 1) + + res = a.execute().fetch() + expected = np.array([[3, 1, 3], [3, 1, 3]]) + + np.testing.assert_equal(res, expected) + + a = ones((2, 3), chunk_size=1) + b = tensor(np.asfortranarray(np.random.rand(2, 3)), chunk_size=2) + + copyto(b, a) + + res = b.execute().fetch() + expected = np.asfortranarray(np.ones((2, 3))) + + np.testing.assert_array_equal(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + +@pytest.mark.ray_dag +def test_astype_execution(setup): + raw = np.random.random((10, 5)) + arr = tensor(raw, chunk_size=3) + arr2 = arr.astype("i8") + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw.astype("i8")) + + raw = sps.random(10, 5, density=0.2) + arr = tensor(raw, chunk_size=3) + arr2 = arr.astype("i8") + + res = arr2.execute().fetch() + assert np.array_equal(res.toarray(), raw.astype("i8").toarray()) is True + + raw = np.asfortranarray(np.random.random((10, 5))) + arr = tensor(raw, chunk_size=3) + arr2 = arr.astype("i8", order="C") + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw.astype("i8")) + assert res.flags["C_CONTIGUOUS"] is True + assert res.flags["F_CONTIGUOUS"] is False + + +def test_transpose_execution(setup): + raw = np.random.random((11, 8, 5)) + arr = tensor(raw, chunk_size=3) + arr2 = transpose(arr) + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw.T) + + arr3 = transpose(arr, axes=(-2, -1, -3)) + + res = arr3.execute().fetch() + np.testing.assert_array_equal(res, raw.transpose(1, 2, 0)) + + raw = sps.random(11, 8) + arr = tensor(raw, chunk_size=3) + arr2 = transpose(arr) + + assert arr2.issparse() is True + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res.toarray(), raw.T.toarray()) + + # test order + raw = np.asfortranarray(np.random.random((11, 8, 5))) + + arr = tensor(raw, chunk_size=3) + arr2 = transpose(arr) + + res = arr2.execute().fetch() + expected = np.transpose(raw).copy(order="A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + arr = tensor(raw, chunk_size=3) + arr2 = transpose(arr, (1, 2, 0)) + + res = arr2.execute().fetch() + expected = np.transpose(raw, (1, 2, 0)).copy(order="A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + df = md.DataFrame(mt.random.rand(10, 5, chunk_size=5)) + df = df[df[0] < 1] + # generate tensor with unknown shape + t = df.to_tensor() + t2 = transpose(t) + + res = t2.execute().fetch() + assert 
res.shape == (5, 10) + + +def test_swapaxes_execution(setup): + raw = np.random.random((11, 8, 5)) + arr = swapaxes(raw, 2, 0) + + res = arr.execute().fetch() + np.testing.assert_array_equal(res, raw.swapaxes(2, 0)) + + raw = np.random.random((11, 8, 5)) + arr = tensor(raw, chunk_size=3) + arr2 = arr.swapaxes(2, 0) + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw.swapaxes(2, 0)) + + raw = sps.random(11, 8, density=0.2) + arr = tensor(raw, chunk_size=3) + arr2 = arr.swapaxes(1, 0) + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res.toarray(), raw.toarray().swapaxes(1, 0)) + + # test order + raw = np.asfortranarray(np.random.rand(11, 8, 5)) + + arr = tensor(raw, chunk_size=3) + arr2 = arr.swapaxes(2, 0) + + res = arr2.execute().fetch() + expected = raw.swapaxes(2, 0).copy(order="A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + arr = tensor(raw, chunk_size=3) + arr2 = arr.swapaxes(0, 2) + + res = arr2.execute().fetch() + expected = raw.swapaxes(0, 2).copy(order="A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + arr = tensor(raw, chunk_size=3) + arr2 = arr.swapaxes(1, 0) + + res = arr2.execute().fetch() + expected = raw.swapaxes(1, 0).copy(order="A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_moveaxis_execution(setup): + x = zeros((3, 4, 5), chunk_size=2) + + t = moveaxis(x, 0, -1) + + res = t.execute().fetch() + assert res.shape == (4, 5, 3) + + t = moveaxis(x, -1, 0) + + res = t.execute().fetch() + assert res.shape == (5, 3, 4) + + t = moveaxis(x, [0, 1], [-1, -2]) + + res = t.execute().fetch() + assert res.shape == (5, 4, 3) + + t = moveaxis(x, [0, 1, 2], [-1, -2, -3]) + + res = t.execute().fetch() + assert res.shape == (5, 4, 3) + + +def test_broadcast_to_execution(setup): + raw = np.random.random((10, 5, 1)) + arr = tensor(raw, chunk_size=2) + arr2 = broadcast_to(arr, (5, 10, 5, 6)) + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, np.broadcast_to(raw, (5, 10, 5, 6))) + + # test chunk with unknown shape + arr1 = mt.random.rand(3, 4, chunk_size=2) + arr2 = mt.random.permutation(arr1) + arr3 = broadcast_to(arr2, (2, 3, 4)) + + res = arr3.execute().fetch() + assert res.shape == (2, 3, 4) + + +def test_broadcast_arrays_executions(setup): + x_data = [[1, 2, 3]] + x = tensor(x_data, chunk_size=1) + y_data = [[1], [2], [3]] + y = tensor(y_data, chunk_size=2) + + a = broadcast_arrays(x, y) + + res = [arr.execute().fetch() for arr in a] + expected = np.broadcast_arrays(x_data, y_data) + + for r, e in zip(res, expected): + np.testing.assert_equal(r, e) + + +def test_where_execution(setup): + raw_cond = np.random.randint(0, 2, size=(4, 4), dtype="?") + raw_x = np.random.rand(4, 1) + raw_y = np.random.rand(4, 4) + + cond, x, y = ( + tensor(raw_cond, chunk_size=2), + tensor(raw_x, chunk_size=2), + tensor(raw_y, chunk_size=2), + ) + + arr = where(cond, x, y) + res = arr.execute().fetch() + assert np.array_equal(res, np.where(raw_cond, raw_x, raw_y)) is True + + raw_cond = sps.csr_matrix(np.random.randint(0, 2, size=(4, 4), dtype="?")) + raw_x = sps.random(4, 1, density=0.1) + raw_y = sps.random(4, 4, 
density=0.1) + + cond, x, y = ( + tensor(raw_cond, chunk_size=2), + tensor(raw_x, chunk_size=2), + tensor(raw_y, chunk_size=2), + ) + + arr = where(cond, x, y) + res = arr.execute().fetch() + assert ( + np.array_equal( + res.toarray(), + np.where(raw_cond.toarray(), raw_x.toarray(), raw_y.toarray()), + ) + is True + ) + + # GH 2009 + raw_x = np.arange(9.0).reshape(3, 3) + x = arange(9.0).reshape(3, 3) + arr = where(x < 5, 2, -1) + res = arr.execute().fetch() + np.testing.assert_array_equal(res, np.where(raw_x < 5, 2, -1)) + + +@pytest.mark.ray_dag +def test_reshape_execution(setup): + raw_data = np.random.rand(5, 10, 30) + x = tensor(raw_data, chunk_size=8) + + y = x.reshape(-1, 30) + + res = y.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(-1, 30)) + + y2 = x.reshape(10, -1) + + res = y2.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(10, -1)) + + y3 = x.reshape(-1) + + res = y3.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(-1)) + + y4 = x.ravel() + + res = y4.execute().fetch() + np.testing.assert_array_equal(res, raw_data.ravel()) + + raw_data = np.random.rand(6, 20, 4) + x = tensor(raw_data, chunk_size=5) + + y = x.reshape(-1, 4, 5, 2, 2) + + res = y.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(-1, 4, 5, 2, 2)) + + y2 = x.reshape(120, 2, 2) + + res = y2.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(120, 2, 2)) + + y3 = x.reshape(12, 5, 8) + + res = y3.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(12, 5, 8)) + + y4 = x.reshape(12, 5, 8) + y4.op.extra_params["_reshape_with_shuffle"] = True + + # size_res = self.executor.execute_tensor(y4, mock=True) + res = y4.execute().fetch() + # assert res[0].nbytes == sum(v[0] for v in size_res) + assert np.array_equal(res, raw_data.reshape(12, 5, 8)) is True + + y5 = x.ravel(order="F") + + res = y5.execute().fetch() + expected = raw_data.ravel(order="F") + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_expand_dims_execution(setup): + raw_data = np.random.rand(10, 20, 30) + x = tensor(raw_data, chunk_size=6) + + y = expand_dims(x, 1) + + res = y.execute().fetch() + assert np.array_equal(res, np.expand_dims(raw_data, 1)) is True + + y = expand_dims(x, 0) + + res = y.execute().fetch() + assert np.array_equal(res, np.expand_dims(raw_data, 0)) is True + + y = expand_dims(x, 3) + + res = y.execute().fetch() + assert np.array_equal(res, np.expand_dims(raw_data, 3)) is True + + y = expand_dims(x, -1) + + res = y.execute().fetch() + assert np.array_equal(res, np.expand_dims(raw_data, -1)) is True + + y = expand_dims(x, -4) + + res = y.execute().fetch() + assert np.array_equal(res, np.expand_dims(raw_data, -4)) is True + + with pytest.raises(np.AxisError): + expand_dims(x, -5) + + with pytest.raises(np.AxisError): + expand_dims(x, 4) + + +def test_rollaxis_execution(setup): + x = ones((3, 4, 5, 6), chunk_size=1) + y = rollaxis(x, 3, 1) + + res = y.execute().fetch() + np.testing.assert_array_equal(res, np.rollaxis(np.ones((3, 4, 5, 6)), 3, 1)) + + +def test_atleast1d_execution(setup): + x = 1 + y = ones(3, chunk_size=2) + z = ones((3, 4), chunk_size=2) + + t = atleast_1d(x, y, z) + + res = [i.execute().fetch() for i in t] + + np.testing.assert_array_equal(res[0], np.array([1])) + np.testing.assert_array_equal(res[1], np.ones(3)) + 
np.testing.assert_array_equal(res[2], np.ones((3, 4))) + + +def test_atleast2d_execution(setup): + x = 1 + y = ones(3, chunk_size=2) + z = ones((3, 4), chunk_size=2) + + t = atleast_2d(x, y, z) + + res = [i.execute().fetch() for i in t] + + np.testing.assert_array_equal(res[0], np.array([[1]])) + np.testing.assert_array_equal(res[1], np.atleast_2d(np.ones(3))) + assert np.array_equal(res[2], np.ones((3, 4))) is True + + +def test_atleast3d_execution(setup): + x = 1 + y = ones(3, chunk_size=2) + z = ones((3, 4), chunk_size=2) + + t = atleast_3d(x, y, z) + + res = [i.execute().fetch() for i in t] + + np.testing.assert_array_equal(res[0], np.atleast_3d(x)) + np.testing.assert_array_equal(res[1], np.atleast_3d(np.ones(3))) + np.testing.assert_array_equal(res[2], np.atleast_3d(np.ones((3, 4)))) + + +def test_argwhere_execution(setup): + x = arange(6, chunk_size=2).reshape(2, 3) + t = argwhere(x > 1) + + res = t.execute().fetch() + expected = np.argwhere(np.arange(6).reshape(2, 3) > 1) + + np.testing.assert_array_equal(res, expected) + + data = np.asfortranarray(np.random.rand(10, 20)) + x = tensor(data, chunk_size=10) + + t = argwhere(x > 0.5) + + res = t.execute().fetch() + expected = np.argwhere(data > 0.5) + + np.testing.assert_array_equal(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + +def test_array_split_execution(setup): + x = arange(48, chunk_size=3).reshape(2, 3, 8) + ss = array_split(x, 3, axis=2) + + res = [i.execute().fetch() for i in ss] + expected = np.array_split(np.arange(48).reshape(2, 3, 8), 3, axis=2) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + ss = array_split(x, [3, 5, 6, 10], axis=2) + + res = [i.execute().fetch() for i in ss] + expected = np.array_split(np.arange(48).reshape(2, 3, 8), [3, 5, 6, 10], axis=2) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + +def test_split_execution(setup): + for a in ((1, 1, 1, 2, 2, 3), [1, 1, 1, 2, 2, 3]): + splits = split(a, (3, 5)) + assert len(splits) == 3 + splits0 = splits[0].execute().fetch() + np.testing.assert_array_equal(splits0, (1, 1, 1)) + splits1 = splits[1].execute().fetch() + np.testing.assert_array_equal(splits1, (2, 2)) + splits2 = splits[2].execute().fetch() + np.testing.assert_array_equal(splits2, (3,)) + + x = arange(48, chunk_size=3).reshape(2, 3, 8) + ss = split(x, 4, axis=2) + + res = [i.execute().fetch() for i in ss] + expected = np.split(np.arange(48).reshape(2, 3, 8), 4, axis=2) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + ss = split(x, [3, 5, 6, 10], axis=2) + + res = [i.execute().fetch() for i in ss] + expected = np.split(np.arange(48).reshape(2, 3, 8), [3, 5, 6, 10], axis=2) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + # hsplit + x = arange(120, chunk_size=3).reshape(2, 12, 5) + ss = hsplit(x, 4) + + res = [i.execute().fetch() for i in ss] + expected = np.hsplit(np.arange(120).reshape(2, 12, 5), 4) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + # vsplit + x = arange(48, chunk_size=3).reshape(8, 3, 2) + ss = vsplit(x, 4) + + res = [i.execute().fetch() for i in ss] + expected = np.vsplit(np.arange(48).reshape(8, 3, 2), 4) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + # dsplit + x = arange(48, 
chunk_size=3).reshape(2, 3, 8) + ss = dsplit(x, 4) + + res = [i.execute().fetch() for i in ss] + expected = np.dsplit(np.arange(48).reshape(2, 3, 8), 4) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + x_data = sps.random(12, 8, density=0.1) + x = tensor(x_data, chunk_size=3) + ss = split(x, 4, axis=0) + + res = [i.execute().fetch() for i in ss] + expected = np.split(x_data.toarray(), 4, axis=0) + assert len(res) == len(expected) + [np.testing.assert_equal(r.toarray(), e) for r, e in zip(res, expected)] + + +def test_roll_execution(setup): + x = arange(10, chunk_size=2) + + t = roll(x, 2) + + res = t.execute().fetch() + expected = np.roll(np.arange(10), 2) + np.testing.assert_equal(res, expected) + + x2 = x.reshape(2, 5) + + t = roll(x2, 1) + + res = t.execute().fetch() + expected = np.roll(np.arange(10).reshape(2, 5), 1) + np.testing.assert_equal(res, expected) + + t = roll(x2, 1, axis=0) + + res = t.execute().fetch() + expected = np.roll(np.arange(10).reshape(2, 5), 1, axis=0) + np.testing.assert_equal(res, expected) + + t = roll(x2, 1, axis=1) + + res = t.execute().fetch() + expected = np.roll(np.arange(10).reshape(2, 5), 1, axis=1) + np.testing.assert_equal(res, expected) + + +def test_squeeze_execution(setup): + data = np.array([[[0], [1], [2]]]) + x = tensor(data, chunk_size=1) + + t = squeeze(x) + + res = t.execute().fetch() + expected = np.squeeze(data) + np.testing.assert_equal(res, expected) + + t = squeeze(x, axis=2) + + res = t.execute().fetch() + expected = np.squeeze(data, axis=2) + np.testing.assert_equal(res, expected) + + +def test_diff_execution(setup): + data = np.array([1, 2, 4, 7, 0]) + x = tensor(data, chunk_size=2) + + t = diff(x) + + res = t.execute().fetch() + expected = np.diff(data) + np.testing.assert_equal(res, expected) + + t = diff(x, n=2) + + res = t.execute().fetch() + expected = np.diff(data, n=2) + np.testing.assert_equal(res, expected) + + data = np.array([[1, 3, 6, 10], [0, 5, 6, 8]]) + x = tensor(data, chunk_size=2) + + t = diff(x) + + res = t.execute().fetch() + expected = np.diff(data) + np.testing.assert_equal(res, expected) + + t = diff(x, axis=0) + + res = t.execute().fetch() + expected = np.diff(data, axis=0) + np.testing.assert_equal(res, expected) + + x = mt.arange("1066-10-13", "1066-10-16", dtype=mt.datetime64) + t = diff(x) + + res = t.execute().fetch() + expected = np.diff(np.arange("1066-10-13", "1066-10-16", dtype=np.datetime64)) + np.testing.assert_equal(res, expected) + + +def test_ediff1d(setup): + data = np.array([1, 2, 4, 7, 0]) + x = tensor(data, chunk_size=2) + + t = ediff1d(x) + + res = t.execute().fetch() + expected = np.ediff1d(data) + np.testing.assert_equal(res, expected) + + to_begin = tensor(-99, chunk_size=2) + to_end = tensor([88, 99], chunk_size=2) + t = ediff1d(x, to_begin=to_begin, to_end=to_end) + + res = t.execute().fetch() + expected = np.ediff1d(data, to_begin=-99, to_end=np.array([88, 99])) + np.testing.assert_equal(res, expected) + + data = [[1, 2, 4], [1, 6, 24]] + + t = ediff1d(tensor(data, chunk_size=2)) + + res = t.execute().fetch() + expected = np.ediff1d(data) + np.testing.assert_equal(res, expected) + + +def test_flip_execution(setup): + a = arange(8, chunk_size=2).reshape((2, 2, 2)) + + t = flip(a, 0) + + res = t.execute().fetch() + expected = np.flip(np.arange(8).reshape(2, 2, 2), 0) + np.testing.assert_equal(res, expected) + + t = flip(a, 1) + + res = t.execute().fetch() + expected = np.flip(np.arange(8).reshape(2, 2, 2), 1) + 
np.testing.assert_equal(res, expected) + + t = flipud(a) + + res = t.execute().fetch() + expected = np.flipud(np.arange(8).reshape(2, 2, 2)) + np.testing.assert_equal(res, expected) + + t = fliplr(a) + + res = t.execute().fetch() + expected = np.fliplr(np.arange(8).reshape(2, 2, 2)) + np.testing.assert_equal(res, expected) + + +def test_repeat_execution(setup): + a = repeat(3, 4) + + res = a.execute().fetch() + expected = np.repeat(3, 4) + np.testing.assert_equal(res, expected) + + x_data = np.random.randn(20, 30) + x = tensor(x_data, chunk_size=(12, 16)) + + t = repeat(x, 2) + + res = t.execute().fetch() + expected = np.repeat(x_data, 2) + np.testing.assert_equal(res, expected) + + t = repeat(x, 3, axis=1) + + res = t.execute().fetch() + expected = np.repeat(x_data, 3, axis=1) + np.testing.assert_equal(res, expected) + + t = repeat(x, np.arange(20), axis=0) + + res = t.execute().fetch() + expected = np.repeat(x_data, np.arange(20), axis=0) + np.testing.assert_equal(res, expected) + + t = repeat(x, arange(20, chunk_size=10), axis=0) + + res = t.execute().fetch() + expected = np.repeat(x_data, np.arange(20), axis=0) + np.testing.assert_equal(res, expected) + + x_data = sps.random(20, 30, density=0.1) + x = tensor(x_data, chunk_size=(12, 16)) + + t = repeat(x, 2, axis=1) + + res = t.execute().fetch() + expected = np.repeat(x_data.toarray(), 2, axis=1) + np.testing.assert_equal(res.toarray(), expected) + + +def test_tile_execution(setup): + a_data = np.array([0, 1, 2]) + a = tensor(a_data, chunk_size=2) + + t = tile(a, 2) + + res = t.execute().fetch() + expected = np.tile(a_data, 2) + np.testing.assert_equal(res, expected) + + t = tile(a, (2, 2)) + + res = t.execute().fetch() + expected = np.tile(a_data, (2, 2)) + np.testing.assert_equal(res, expected) + + t = tile(a, (2, 1, 2)) + + res = t.execute().fetch() + expected = np.tile(a_data, (2, 1, 2)) + np.testing.assert_equal(res, expected) + + b_data = np.array([[1, 2], [3, 4]]) + b = tensor(b_data, chunk_size=1) + + t = tile(b, 2) + + res = t.execute().fetch() + expected = np.tile(b_data, 2) + np.testing.assert_equal(res, expected) + + t = tile(b, (2, 1)) + + res = t.execute().fetch() + expected = np.tile(b_data, (2, 1)) + np.testing.assert_equal(res, expected) + + c_data = np.array([1, 2, 3, 4]) + c = tensor(c_data, chunk_size=3) + + t = tile(c, (4, 1)) + + res = t.execute().fetch() + expected = np.tile(c_data, (4, 1)) + np.testing.assert_equal(res, expected) + + +@pytest.mark.ray_dag +def test_isin_execution(setup): + element = 2 * arange(4, chunk_size=1).reshape((2, 2)) + test_elements = [1, 2, 4, 8] + + mask = isin(element, test_elements) + + res = mask.execute().fetch() + expected = np.isin(2 * np.arange(4).reshape((2, 2)), test_elements) + np.testing.assert_equal(res, expected) + + res = element[mask].execute().fetch() + expected = np.array([2, 4]) + np.testing.assert_equal(res, expected) + + mask = isin(element, test_elements, invert=True) + + res = mask.execute().fetch() + expected = np.isin(2 * np.arange(4).reshape((2, 2)), test_elements, invert=True) + np.testing.assert_equal(res, expected) + + res = element[mask].execute().fetch() + expected = np.array([0, 6]) + np.testing.assert_equal(res, expected) + + test_set = {1, 2, 4, 8} + mask = isin(element, test_set) + + res = mask.execute().fetch() + expected = np.isin(2 * np.arange(4).reshape((2, 2)), test_set) + np.testing.assert_equal(res, expected) + + +def test_ravel_execution(setup): + arr = ones((10, 5), chunk_size=2) + flat_arr = mt.ravel(arr) + + res = flat_arr.execute().fetch() 
+ assert len(res) == 50 + np.testing.assert_equal(res, np.ones(50)) + + +def test_searchsorted_execution(setup): + raw = np.sort(np.random.randint(100, size=(16,))) + + # test different chunk_size, 3 will have combine, 6 will skip combine + for chunk_size in (3, 8): + arr = tensor(raw, chunk_size=chunk_size) + + # test scalar, with value in the middle + t1 = searchsorted(arr, 20) + + res = t1.execute().fetch() + expected = np.searchsorted(raw, 20) + np.testing.assert_array_equal(res, expected) + + # test scalar, with value larger than 100 + t2 = searchsorted(arr, 200) + + res = t2.execute().fetch() + expected = np.searchsorted(raw, 200) + np.testing.assert_array_equal(res, expected) + + # test scalar, side left, with value exact in the middle of the array + t3 = searchsorted(arr, raw[10], side="left") + + res = t3.execute().fetch() + expected = np.searchsorted(raw, raw[10], side="left") + np.testing.assert_array_equal(res, expected) + + # test scalar, side right, with value exact in the middle of the array + t4 = searchsorted(arr, raw[10], side="right") + + res = t4.execute().fetch() + expected = np.searchsorted(raw, raw[10], side="right") + np.testing.assert_array_equal(res, expected) + + # test scalar, side left, with value exact in the end of the array + t5 = searchsorted(arr, raw[15], side="left") + + res = t5.execute().fetch() + expected = np.searchsorted(raw, raw[15], side="left") + np.testing.assert_array_equal(res, expected) + + # test scalar, side right, with value exact in the end of the array + t6 = searchsorted(arr, raw[15], side="right") + + res = t6.execute().fetch() + expected = np.searchsorted(raw, raw[15], side="right") + np.testing.assert_array_equal(res, expected) + + # test scalar, side left, with value exact in the start of the array + t7 = searchsorted(arr, raw[0], side="left") + + res = t7.execute().fetch() + expected = np.searchsorted(raw, raw[0], side="left") + np.testing.assert_array_equal(res, expected) + + # test scalar, side right, with value exact in the start of the array + t8 = searchsorted(arr, raw[0], side="right") + + res = t8.execute().fetch() + expected = np.searchsorted(raw, raw[0], side="right") + np.testing.assert_array_equal(res, expected) + + raw2 = np.random.randint(100, size=(3, 4)) + + # test tensor, side left + t9 = searchsorted(arr, tensor(raw2, chunk_size=2), side="left") + + res = t9.execute().fetch() + expected = np.searchsorted(raw, raw2, side="left") + np.testing.assert_array_equal(res, expected) + + # test tensor, side right + t10 = searchsorted(arr, tensor(raw2, chunk_size=2), side="right") + + res = t10.execute().fetch() + expected = np.searchsorted(raw, raw2, side="right") + np.testing.assert_array_equal(res, expected) + + # test one chunk + arr = tensor(raw, chunk_size=16) + + # test scalar, tensor to search has 1 chunk + t11 = searchsorted(arr, 20) + res = t11.execute().fetch() + expected = np.searchsorted(raw, 20) + np.testing.assert_array_equal(res, expected) + + # test tensor with 1 chunk, tensor to search has 1 chunk + t12 = searchsorted(arr, tensor(raw2, chunk_size=4)) + + res = t12.execute().fetch() + expected = np.searchsorted(raw, raw2) + np.testing.assert_array_equal(res, expected) + + # test tensor with more than 1 chunk, tensor to search has 1 chunk + t13 = searchsorted(arr, tensor(raw2, chunk_size=2)) + + res = t13.execute().fetch() + expected = np.searchsorted(raw, raw2) + np.testing.assert_array_equal(res, expected) + + # test sorter + raw3 = np.random.randint(100, size=(16,)) + arr = tensor(raw3, chunk_size=3) + order 
= np.argsort(raw3) + order_arr = tensor(order, chunk_size=4) + + t14 = searchsorted(arr, 20, sorter=order_arr) + + res = t14.execute().fetch() + expected = np.searchsorted(raw3, 20, sorter=order) + np.testing.assert_array_equal(res, expected) + + # all data same + raw4 = np.ones(8) + arr = tensor(raw4, chunk_size=2) + + for val in (0, 1, 2): + for side in ("left", "right"): + t15 = searchsorted(arr, val, side=side) + + res = t15.execute().fetch() + expected = np.searchsorted(raw4, val, side=side) + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.ray_dag +def test_unique_execution(setup): + rs = np.random.RandomState(0) + raw = rs.randint(10, size=(10,)) + + for chunk_size in (10, 3): + x = tensor(raw, chunk_size=chunk_size) + + y = unique(x) + + res = y.execute().fetch() + expected = np.unique(raw) + np.testing.assert_array_equal(res, expected) + + y, indices = unique(x, return_index=True) + + res = fetch(execute(y, indices)) + expected = np.unique(raw, return_index=True) + assert len(res) == 2 + assert len(expected) == 2 + np.testing.assert_array_equal(res[0], expected[0]) + np.testing.assert_array_equal(res[1], expected[1]) + + y, inverse = unique(x, return_inverse=True) + + res = fetch(*execute(y, inverse)) + expected = np.unique(raw, return_inverse=True) + assert len(res) == 2 + assert len(expected) == 2 + np.testing.assert_array_equal(res[0], expected[0]) + np.testing.assert_array_equal(res[1], expected[1]) + + y, counts = unique(x, return_counts=True) + + res = fetch(*execute(y, counts)) + expected = np.unique(raw, return_counts=True) + assert len(res) == 2 + assert len(expected) == 2 + np.testing.assert_array_equal(res[0], expected[0]) + np.testing.assert_array_equal(res[1], expected[1]) + + y, indices, inverse, counts = unique( + x, return_index=True, return_inverse=True, return_counts=True + ) + + res = fetch(*execute(y, indices, inverse, counts)) + expected = np.unique( + raw, return_index=True, return_inverse=True, return_counts=True + ) + assert len(res) == 4 + assert len(expected) == 4 + np.testing.assert_array_equal(res[0], expected[0]) + np.testing.assert_array_equal(res[1], expected[1]) + np.testing.assert_array_equal(res[2], expected[2]) + np.testing.assert_array_equal(res[3], expected[3]) + + y, indices, counts = unique(x, return_index=True, return_counts=True) + + res = fetch(*execute(y, indices, counts)) + expected = np.unique(raw, return_index=True, return_counts=True) + assert len(res) == 3 + assert len(expected) == 3 + np.testing.assert_array_equal(res[0], expected[0]) + np.testing.assert_array_equal(res[1], expected[1]) + np.testing.assert_array_equal(res[2], expected[2]) + + raw2 = rs.randint(10, size=(4, 5, 6)) + x2 = tensor(raw2, chunk_size=chunk_size) + + y2 = unique(x2) + + res = y2.execute().fetch() + expected = np.unique(raw2) + np.testing.assert_array_equal(res, expected) + + y2 = unique(x2, axis=1) + + res = y2.execute().fetch() + expected = np.unique(raw2, axis=1) + np.testing.assert_array_equal(res, expected) + + y2 = unique(x2, axis=2) + + res = y2.execute().fetch() + expected = np.unique(raw2, axis=2) + np.testing.assert_array_equal(res, expected) + + raw = rs.randint(10, size=(10, 20)) + raw[:, 0] = raw[:, 11] = rs.randint(10, size=(10,)) + x = tensor(raw, chunk_size=2) + y, ind, inv, counts = unique( + x, + aggregate_size=3, + axis=1, + return_index=True, + return_inverse=True, + return_counts=True, + ) + + res_unique, res_ind, res_inv, res_counts = fetch(*execute(y, ind, inv, counts)) + exp_unique, exp_ind, exp_counts = np.unique( + 
raw, axis=1, return_index=True, return_counts=True + ) + raw_res_unique = res_unique + res_unique_df = pd.DataFrame(res_unique) + res_unique_ind = np.asarray( + res_unique_df.sort_values(list(range(res_unique.shape[0])), axis=1).columns + ) + res_unique = res_unique[:, res_unique_ind] + res_ind = res_ind[res_unique_ind] + res_counts = res_counts[res_unique_ind] + + np.testing.assert_array_equal(res_unique, exp_unique) + np.testing.assert_array_equal(res_ind, exp_ind) + np.testing.assert_array_equal(raw_res_unique[:, res_inv], raw) + np.testing.assert_array_equal(res_counts, exp_counts) + + x = (mt.random.RandomState(0).rand(1000, chunk_size=20) > 0.5).astype(np.int32) + y = unique(x) + res = np.sort(y.execute().fetch()) + np.testing.assert_array_equal(res, np.array([0, 1])) + + # test sparse + sparse_raw = sps.random(10, 3, density=0.1, format="csr", random_state=rs) + x = tensor(sparse_raw, chunk_size=2) + y = unique(x) + res = np.sort(y.execute().fetch()) + np.testing.assert_array_equal(res, np.unique(sparse_raw.data)) + + # test empty + x = tensor([]) + y = unique(x) + res = y.execute().fetch() + np.testing.assert_array_equal(res, np.unique([])) + + x = tensor([[]]) + y = unique(x) + res = y.execute().fetch() + np.testing.assert_array_equal(res, np.unique([[]])) + + +@require_cupy +def test_to_gpu_execution(setup_gpu): + raw = np.random.rand(10, 10) + x = tensor(raw, chunk_size=3) + + gx = to_gpu(x) + + res = gx.execute().fetch() + np.testing.assert_array_equal(res.get(), raw) + + +@require_cupy +def test_to_cpu_execution(setup_gpu): + raw = np.random.rand(10, 10) + x = tensor(raw, chunk_size=3, gpu=True) + + cx = to_cpu(x) + + res = cx.execute().fetch() + np.testing.assert_array_equal(res, raw) + + +@pytest.mark.ray_dag +def test_sort_execution(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=20) + + sx = sort(x) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + # 1-d chunk + raw = np.random.rand(100) + x = tensor(raw, chunk_size=20) + + sx = sort(x) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + # test force need_align=True + sx = sort(x) + sx.op._need_align = True + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + # test psrs_kinds + sx = sort(x, psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + # structured dtype + raw = np.empty(100, dtype=[("id", np.int32), ("size", np.int64)]) + raw["id"] = np.random.randint(1000, size=100, dtype=np.int32) + raw["size"] = np.random.randint(1000, size=100, dtype=np.int64) + x = tensor(raw, chunk_size=10) + + sx = sort(x, order=["size", "id"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, order=["size", "id"])) + + # test psrs_kinds with structured dtype + sx = sort(x, order=["size", "id"], psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, order=["size", "id"])) + + # test flatten case + raw = np.random.rand(10, 10) + x = tensor(raw, chunk_size=(5, 10)) + + sx = sort(x, axis=None) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=None)) + + # test multi-dimension + raw = np.random.rand(10, 100) + x = tensor(raw, chunk_size=(5, 40)) + + sx = sort(x, psrs_kinds=["quicksort"] * 3) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + sx = sort(x, 
psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + raw = np.random.rand(10, 99) + x = tensor(raw, chunk_size=(5, 20)) + + sx = sort(x) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + # test 3-d + raw = np.random.rand(20, 25, 28) + x = tensor(raw, chunk_size=(10, 15, 14)) + + sx = sort(x) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + sx = sort(x, psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + sx = sort(x, axis=0) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=0)) + + sx = sort(x, axis=0, psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=0)) + + sx = sort(x, axis=1) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=1)) + + sx = sort(x, axis=1, psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=1)) + + # test multi-dimension with structured type + raw = np.empty((10, 100), dtype=[("id", np.int32), ("size", np.int64)]) + raw["id"] = np.random.randint(1000, size=(10, 100), dtype=np.int32) + raw["size"] = np.random.randint(1000, size=(10, 100), dtype=np.int64) + x = tensor(raw, chunk_size=(7, 30)) + + sx = sort(x) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + sx = sort(x, order=["size", "id"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, order=["size", "id"])) + + sx = sort(x, order=["size"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, order=["size"])) + + sx = sort(x, axis=0, order=["size", "id"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=0, order=["size", "id"])) + + sx = sort(x, axis=0, order=["size", "id"], psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=0, order=["size", "id"])) + + # test inplace sort + raw = np.random.rand(10, 12) + a = tensor(raw, chunk_size=(5, 4)) + a.sort(axis=1) + + res = a.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=1)) + + a.sort(axis=0) + + res = a.execute().fetch() + np.testing.assert_array_equal(res, np.sort(np.sort(raw, axis=1), axis=0)) + + # test with empty chunk + raw = np.random.rand(20, 10) + raw[:, :8] = 1 + a = tensor(raw, chunk_size=5) + filtered = a[a < 1] + filtered.sort() + + res = filtered.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw[raw < 1])) + + +@pytest.mark.ray_dag +def test_sort_indices_execution(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=20) + + r = sort(x, return_index=True) + + sr, si = r.execute().fetch() + np.testing.assert_array_equal(sr, np.take_along_axis(raw, si, axis=-1)) + + x = tensor(raw, chunk_size=(22, 4)) + + r = sort(x, return_index=True) + + sr, si = r.execute().fetch() + np.testing.assert_array_equal(sr, np.take_along_axis(raw, si, axis=-1)) + + raw = np.random.rand(100) + + x = tensor(raw, chunk_size=23) + + r = sort(x, axis=0, return_index=True) + + sr, si = r.execute().fetch() + np.testing.assert_array_equal(sr, raw[si]) + + +@pytest.mark.ray_dag +def test_argsort(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = 
tensor(raw, chunk_size=10) + + xa = argsort(x) + + r = xa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw), np.take_along_axis(raw, r, axis=-1)) + + x = tensor(raw, chunk_size=(22, 4)) + + xa = argsort(x) + + r = xa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw), np.take_along_axis(raw, r, axis=-1)) + + raw = np.random.rand(100) + + x = tensor(raw, chunk_size=23) + + xa = argsort(x, axis=0) + + r = xa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw, axis=0), raw[r]) + + +@pytest.mark.ray_dag +def test_partition_execution(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=20) + + px = partition(x, [1, 8]) + + res = px.execute().fetch() + np.testing.assert_array_equal(res, np.partition(raw, [1, 8])) + + # 1-d chunk + raw = np.random.rand(100) + x = tensor(raw, chunk_size=20) + + kth = np.random.RandomState(0).randint(-100, 100, size=(10,)) + px = partition(x, kth) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[kth], np.partition(raw, kth)[kth]) + + # structured dtype + raw = np.empty(100, dtype=[("id", np.int32), ("size", np.int64)]) + raw["id"] = np.random.randint(1000, size=100, dtype=np.int32) + raw["size"] = np.random.randint(1000, size=100, dtype=np.int64) + x = tensor(raw, chunk_size=20) + + px = partition(x, kth, order=["size", "id"]) + + res = px.execute().fetch() + np.testing.assert_array_equal( + res[kth], np.partition(raw, kth, order=["size", "id"])[kth] + ) + + # test flatten case + raw = np.random.rand(10, 10) + x = tensor(raw, chunk_size=5) + + px = partition(x, kth, axis=None) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[kth], np.partition(raw, kth, axis=None)[kth]) + + # test multi-dimension + raw = np.random.rand(10, 100) + x = tensor(raw, chunk_size=(5, 20)) + + kth = np.random.RandomState(0).randint(-10, 10, size=(3,)) + px = partition(x, kth) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[:, kth], np.partition(raw, kth)[:, kth]) + + raw = np.random.rand(10, 99) + x = tensor(raw, chunk_size=(5, 20)) + + px = partition(x, kth) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[:, kth], np.partition(raw, kth)[:, kth]) + + # test 3-d + raw = np.random.rand(20, 25, 28) + x = tensor(raw, chunk_size=(10, 15, 14)) + + kth = np.random.RandomState(0).randint(-28, 28, size=(3,)) + px = partition(x, kth) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[:, :, kth], np.partition(raw, kth)[:, :, kth]) + + kth = np.random.RandomState(0).randint(-20, 20, size=(3,)) + px = partition(x, kth, axis=0) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[kth], np.partition(raw, kth, axis=0)[kth]) + + kth = np.random.RandomState(0).randint(-25, 25, size=(3,)) + px = partition(x, kth, axis=1) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[:, kth], np.partition(raw, kth, axis=1)[:, kth]) + + # test multi-dimension with structured type + raw = np.empty((10, 100), dtype=[("id", np.int32), ("size", np.int64)]) + raw["id"] = np.random.randint(1000, size=(10, 100), dtype=np.int32) + raw["size"] = np.random.randint(1000, size=(10, 100), dtype=np.int64) + x = tensor(raw, chunk_size=(7, 30)) + + kth = np.random.RandomState(0).randint(-100, 100, size=(10,)) + px = partition(x, kth) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[:, kth], np.partition(raw, kth)[:, kth]) + + px = partition(x, kth, order=["size", "id"]) + + res = px.execute().fetch() + 
np.testing.assert_array_equal( + res[:, kth], np.partition(raw, kth, order=["size", "id"])[:, kth] + ) + + px = partition(x, kth, order=["size"]) + + res = px.execute().fetch() + np.testing.assert_array_equal( + res[:, kth], np.partition(raw, kth, order=["size"])[:, kth] + ) + + kth = np.random.RandomState(0).randint(-10, 10, size=(5,)) + px = partition(x, kth, axis=0, order=["size", "id"]) + + res = px.execute().fetch() + np.testing.assert_array_equal( + res[kth], np.partition(raw, kth, axis=0, order=["size", "id"])[kth] + ) + + raw = np.random.rand(10, 12) + a = tensor(raw, chunk_size=(5, 4)) + kth = np.random.RandomState(0).randint(-12, 12, size=(2,)) + a.partition(kth, axis=1) + + res = a.execute().fetch() + np.testing.assert_array_equal(res[:, kth], np.partition(raw, kth, axis=1)[:, kth]) + + kth = np.random.RandomState(0).randint(-10, 10, size=(2,)) + a.partition(kth, axis=0) + + raw_base = res + res = a.execute().fetch() + np.testing.assert_array_equal(res[kth], np.partition(raw_base, kth, axis=0)[kth]) + + # test kth which is tensor + raw = np.random.rand(10, 12) + a = tensor(raw, chunk_size=(3, 5)) + kth = (mt.random.rand(5) * 24 - 12).astype(int) + + px = partition(a, kth) + sx = sort(a) + + res = px.execute().fetch() + kth_res = kth.execute().fetch() + sort_res = sx.execute().fetch() + np.testing.assert_array_equal(res[:, kth_res], sort_res[:, kth_res]) + + a = tensor(raw, chunk_size=(10, 12)) + kth = (mt.random.rand(5) * 24 - 12).astype(int) + + px = partition(a, kth) + sx = sort(a) + + res = px.execute().fetch() + kth_res = kth.execute().fetch() + sort_res = sx.execute().fetch() + np.testing.assert_array_equal(res[:, kth_res], sort_res[:, kth_res]) + + +@pytest.mark.ray_dag +def test_partition_indices_execution(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=10) + + kth = [2, 5, 9] + r = partition(x, kth, return_index=True) + + pr, pi = r.execute().fetch() + np.testing.assert_array_equal(pr, np.take_along_axis(raw, pi, axis=-1)) + np.testing.assert_array_equal(np.sort(raw)[:, kth], pr[:, kth]) + + x = tensor(raw, chunk_size=(22, 4)) + + r = partition(x, kth, return_index=True) + + pr, pi = r.execute().fetch() + np.testing.assert_array_equal(pr, np.take_along_axis(raw, pi, axis=-1)) + np.testing.assert_array_equal(np.sort(raw)[:, kth], pr[:, kth]) + + raw = np.random.rand(100) + + x = tensor(raw, chunk_size=23) + + r = partition(x, kth, axis=0, return_index=True) + + pr, pi = r.execute().fetch() + np.testing.assert_array_equal(pr, np.take_along_axis(raw, pi, axis=-1)) + np.testing.assert_array_equal(np.sort(raw)[kth], pr[kth]) + + +@pytest.mark.ray_dag +def test_argpartition_execution(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=10) + + kth = [6, 3, 8] + pa = argpartition(x, kth) + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, kth], np.take_along_axis(raw, r, axis=-1)[:, kth] + ) + + x = tensor(raw, chunk_size=(22, 4)) + + pa = argpartition(x, kth) + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, kth], np.take_along_axis(raw, r, axis=-1)[:, kth] + ) + + raw = np.random.rand(100) + + x = tensor(raw, chunk_size=23) + + pa = argpartition(x, kth, axis=0) + + r = pa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw, axis=0)[kth], raw[r][kth]) + + +def _topk_slow(a, k, axis, largest, order): + if axis is None: + a = a.flatten() + axis = 0 + a = np.sort(a, axis=axis, order=order) + if largest: + a = 
a[(slice(None),) * axis + (slice(None, None, -1),)] + return a[(slice(None),) * axis + (slice(k),)] + + +def _handle_result(result, axis, largest, order): + result = np.sort(result, axis=axis, order=order) + if largest: + ax = axis if axis is not None else 0 + result = result[(slice(None),) * ax + (slice(None, None, -1),)] + return result + + +@pytest.mark.parametrize("chunk_size", [7, 4]) +@pytest.mark.parametrize("axis", [0, 1, 2, None]) +@pytest.mark.parametrize("largest", [True, False]) +@pytest.mark.parametrize("to_sort", [True, False]) +@pytest.mark.parametrize("parallel_kind", ["tree", "psrs"]) +def test_topk_execution(setup, chunk_size, axis, largest, to_sort, parallel_kind): + raw1, order1 = np.random.rand(5, 6, 7), None + raw2 = np.empty((5, 6, 7), dtype=[("a", np.int32), ("b", np.float64)]) + raw2["a"] = np.random.randint(1000, size=(5, 6, 7), dtype=np.int32) + raw2["b"] = np.random.rand(5, 6, 7) + order2 = ["b", "a"] + + for raw, order in [(raw1, order1), (raw2, order2)]: + a = tensor(raw, chunk_size=chunk_size) + size = raw.shape[axis] if axis is not None else raw.size + for k in [2, size - 2, size, size + 2]: + r = topk( + a, + k, + axis=axis, + largest=largest, + sorted=to_sort, + order=order, + parallel_kind=parallel_kind, + ) + + result = r.execute().fetch() + + if not to_sort: + result = _handle_result(result, axis, largest, order) + expected = _topk_slow(raw, k, axis, largest, order) + np.testing.assert_array_equal(result, expected) + + r = topk( + a, + k, + axis=axis, + largest=largest, + sorted=to_sort, + order=order, + parallel_kind=parallel_kind, + return_index=True, + ) + + ta, ti = r.execute().fetch() + raw2 = raw + if axis is None: + raw2 = raw.flatten() + np.testing.assert_array_equal(ta, np.take_along_axis(raw2, ti, axis)) + if not to_sort: + ta = _handle_result(ta, axis, largest, order) + np.testing.assert_array_equal(ta, expected) + + +def test_argtopk(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=20) + + pa = argtopk(x, 3, parallel_kind="tree") + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, -1:-4:-1], np.take_along_axis(raw, r, axis=-1) + ) + + pa = argtopk(x, 3, parallel_kind="psrs") + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, -1:-4:-1], np.take_along_axis(raw, r, axis=-1) + ) + + x = tensor(raw, chunk_size=(22, 4)) + + pa = argtopk(x, 3, parallel_kind="tree") + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, -1:-4:-1], np.take_along_axis(raw, r, axis=-1) + ) + + pa = argtopk(x, 3, parallel_kind="psrs") + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, -1:-4:-1], np.take_along_axis(raw, r, axis=-1) + ) + + raw = np.random.rand(100) + + x = tensor(raw, chunk_size=23) + + pa = argtopk(x, 3, axis=0, parallel_kind="tree") + + r = pa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw, axis=0)[-1:-4:-1], raw[r]) + + pa = argtopk(x, 3, axis=0, parallel_kind="psrs") + + r = pa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw, axis=0)[-1:-4:-1], raw[r]) + + +def test_copy(setup): + x = tensor([1, 2, 3]) + y = mt.copy(x) + z = x + + x[0] = 10 + y_res = y.execute().fetch() + np.testing.assert_array_equal(y_res, np.array([1, 2, 3])) + + z_res = z.execute().fetch() + np.testing.assert_array_equal(z_res, np.array([10, 2, 3])) + + +def test_trapz_execution(setup): + raws = [np.random.rand(10), np.random.rand(10, 3)] + + for raw in raws: + for chunk_size in (4, 10): + 
for dx in (1.0, 2.0): + t = tensor(raw, chunk_size=chunk_size) + r = trapz(t, dx=dx) + + result = r.execute().fetch() + expected = np.trapz(raw, dx=dx) + np.testing.assert_almost_equal( + result, + expected, + err_msg=f"failed when raw={raw}, " + f"chunk_size={chunk_size}, dx={dx}", + ) + + # test x not None + raw_ys = [np.random.rand(10), np.random.rand(10, 3)] + raw_xs = [np.random.rand(10), np.random.rand(10, 3)] + + for raw_y, raw_x in zip(raw_ys, raw_xs): + ys = [tensor(raw_y, chunk_size=5), tensor(raw_y, chunk_size=10)] + x = tensor(raw_x, chunk_size=4) + + for y in ys: + r = trapz(y, x=x) + + result = r.execute().fetch() + expected = np.trapz(raw_y, x=raw_x) + np.testing.assert_almost_equal(result, expected) + + +@pytest.mark.ray_dag +def test_shape(setup): + raw = np.random.RandomState(0).rand(4, 3) + x = mt.tensor(raw, chunk_size=2) + + s = shape(x) + + result = s.execute().fetch() + assert result == [4, 3] + + s = shape(x[x > 0.5]) + + result = s.execute().fetch() + expected = np.shape(raw[raw > 0.5]) + assert result == expected + + s = shape(0) + + result = s.execute().fetch() + expected = np.shape(0) + assert result == expected + + +@pytest.mark.ray_dag +def test_rebalance_execution(setup): + session = setup + + raw = np.random.rand(10, 3) + x = mt.tensor(raw) + + r = x.rebalance(num_partitions=3) + result = r.execute().fetch() + np.testing.assert_array_equal(result, raw) + assert len(session._session._tileable_to_fetch[r.data].chunks) == 3 + + r = x.rebalance(factor=1.5) + result = r.execute().fetch() + np.testing.assert_array_equal(result, raw) + + r = x.rebalance() + result = r.execute().fetch() + np.testing.assert_array_equal(result, raw) + assert len(session._session._tileable_to_fetch[r.data].chunks) == 2 + + +def test_map_chunk_execution(setup): + raw = np.random.rand(20) + a = tensor(raw, chunk_size=10) + + r = a.map_chunk(lambda x: x * 0.5) + results = r.execute().fetch() + np.testing.assert_array_equal(raw * 0.5, results) + + r = a.map_chunk(lambda x: x * 0.5, elementwise=True) + results = r.execute().fetch() + np.testing.assert_array_equal(raw * 0.5, results) + + r = a.map_chunk( + lambda x, chunk_index: x * 0.5 + chunk_index[0], with_chunk_index=True + ) + results = r.execute().fetch() + np.testing.assert_array_equal(raw * 0.5 + np.arange(0, 20) // 10, results) + + +def test_insert_execution(setup): + raw = np.random.randint(0, 100, size=(20, 10)) + a = tensor(raw, chunk_size=6) + + r1 = mt.insert(a, 1, 5) + result = r1.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, 1, 5), result) + + r2 = mt.insert(a, [3, 50, 10], 10) + result = r2.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [3, 50, 10], 10), result) + + r3 = mt.insert(a, [2, 3, 4], [5, 6, 7]) + result = r3.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [2, 3, 4], [5, 6, 7]), result) + + # specify axis + r4 = mt.insert(a, 5, 4, axis=0) + result = r4.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, 5, 4, axis=0), result) + + r5 = mt.insert(a, [1, 2, 6], np.arange(20).reshape((20, 1)), axis=1) + result = r5.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 2, 6], np.arange(20).reshape((20, 1)), axis=1), result + ) + + r6 = mt.insert(a, [1, 16, 10], np.arange(30).reshape((3, 10)), axis=0) + result = r6.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 16, 10], np.arange(30).reshape((3, 10)), axis=0), result + ) + + # test mt.tensor as values + r5 = mt.insert(a, [1, 2, 6], mt.arange(20).reshape((20, 1)), 
axis=1) + result = r5.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 2, 6], np.arange(20).reshape((20, 1)), axis=1), result + ) + + r6 = mt.insert(a, [1, 16, 10], mt.arange(30).reshape((3, 10)), axis=0) + result = r6.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 16, 10], np.arange(30).reshape((3, 10)), axis=0), result + ) + + r7 = mt.insert(a, [20, 30, 50], mt.tensor([5, 6, 7])) + result = r7.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [20, 30, 50], [5, 6, 7]), result) + + # test mt.tensor as index + r8 = mt.insert(a, mt.tensor([1, 2, 6]), mt.arange(20).reshape((20, 1)), axis=1) + result = r8.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 2, 6], np.arange(20).reshape((20, 1)), axis=1), result + ) + + r9 = mt.insert(a, mt.tensor([1, 16, 10]), mt.arange(30).reshape((3, 10)), axis=0) + result = r9.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 16, 10], np.arange(30).reshape((3, 10)), axis=0), result + ) + + r10 = mt.insert(a, mt.tensor([20, 30, 50]), mt.tensor([5, 6, 7])) + result = r10.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [20, 30, 50], [5, 6, 7]), result) + + r11 = mt.insert(a, slice(0, 10), mt.arange(10), axis=0) + result = r11.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, slice(0, 10), np.arange(10), axis=0), result + ) + + r12 = mt.insert(a, 10, 5, axis=1) + result = r12.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, 10, 5, axis=1), result) + + r13 = mt.insert(a, [2, 10], 5, axis=1) + result = r13.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [2, 10], 5, axis=1), result) + + r14 = mt.insert(a, mt.tensor([2, 20]), 5, axis=0) + result = r14.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [2, 20], 5, axis=0), result) + + r15 = mt.insert(a, 7, mt.arange(20), axis=1) + result = r15.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, 7, mt.arange(20), axis=1), result) + + +def test_delete_execution(setup): + raw = np.random.randint(0, 100, size=(20, 10)) + a = tensor(raw, chunk_size=6) + + r1 = mt.delete(a, 1) + result = r1.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, 1), result) + + r2 = mt.delete(a, [3, 50, 10]) + result = r2.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, [3, 50, 10]), result) + + # specify axis + r4 = mt.delete(a, 5, axis=0) + result = r4.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, 5, axis=0), result) + + r5 = mt.delete(a, [1, 2, 6], axis=1) + result = r5.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, [1, 2, 6], axis=1), result) + + r6 = mt.delete(a, mt.tensor([1, 2, 6, 8], chunk_size=3), axis=1) + result = r6.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, [1, 2, 6, 8], axis=1), result) + + r7 = mt.delete(a, slice(0, 10), axis=0) + result = r7.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, slice(0, 10), axis=0), result) + + r8 = mt.delete(a, mt.tensor([10, 20, 6, 80])) + result = r8.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, [10, 20, 6, 80]), result) + + r9 = mt.delete(a, 9, axis=1) + result = r9.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, 9, axis=1), result) + + +@pytest.mark.parametrize("chunk_size", [3, 5]) +@pytest.mark.parametrize("invert", [True, False]) +def test_in1d_execute(setup, chunk_size, invert): + rs = np.random.RandomState(0) + raw1 = rs.randint(10, size=10) + ar1 = 
mt.tensor(raw1, chunk_size=5) + raw2 = np.arange(5) + ar2 = mt.tensor(raw2, chunk_size=chunk_size) + ar = mt.in1d(ar1, ar2, invert=invert) + result = ar.execute().fetch() + expected = np.in1d(raw1, raw2, invert=invert) + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.parametrize("chunk_size", [3, 5]) +def test_setdiff1d_execute(setup, chunk_size): + rs = np.random.RandomState(0) + raw1 = rs.randint(10, size=10) + ar1 = mt.tensor(raw1, chunk_size=5) + raw2 = np.arange(5) + ar2 = mt.tensor(raw2, chunk_size=chunk_size) + ar = mt.setdiff1d(ar1, ar2) + result = ar.execute().fetch() + expected = np.setdiff1d(raw1, raw2) + np.testing.assert_array_equal(result, expected) + + # rs.shuffle shuffles in place and returns None, so build the unique sample first + raw3 = rs.choice(np.arange(100), 10, replace=False) + rs.shuffle(raw3) + ar3 = mt.tensor(raw3, chunk_size=5) + ar = mt.setdiff1d(ar3, ar2, assume_unique=True) + result = ar.execute().fetch() + expected = np.setdiff1d(raw3, raw2, assume_unique=True) + np.testing.assert_array_equal(result, expected) diff --git a/python/xorbits/_mars/tensor/base/tile.py b/python/xorbits/_mars/tensor/base/tile.py new file mode 100644 index 000000000..76b7174b1 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/tile.py @@ -0,0 +1,109 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +def tile(A, reps): + """ + Construct a tensor by repeating A the number of times given by reps. + + If `reps` has length ``d``, the result will have dimension of + ``max(d, A.ndim)``. + + If ``A.ndim < d``, `A` is promoted to be d-dimensional by prepending new + axes. So a shape (3,) array is promoted to (1, 3) for 2-D replication, + or shape (1, 1, 3) for 3-D replication. If this is not the desired + behavior, promote `A` to d-dimensions manually before calling this + function. + + If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it. + Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as + (1, 1, 2, 2). + + Note : Although tile may be used for broadcasting, it is strongly + recommended to use Mars' broadcasting operations and functions. + + Parameters + ---------- + A : array_like + The input tensor. + reps : array_like + The number of repetitions of `A` along each axis. + + Returns + ------- + c : Tensor + The tiled output tensor. + + See Also + -------- + repeat : Repeat elements of a tensor.
+ broadcast_to : Broadcast a tensor to a new shape + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([0, 1, 2]) + >>> mt.tile(a, 2).execute() + array([0, 1, 2, 0, 1, 2]) + >>> mt.tile(a, (2, 2)).execute() + array([[0, 1, 2, 0, 1, 2], + [0, 1, 2, 0, 1, 2]]) + >>> mt.tile(a, (2, 1, 2)).execute() + array([[[0, 1, 2, 0, 1, 2]], + [[0, 1, 2, 0, 1, 2]]]) + + >>> b = mt.array([[1, 2], [3, 4]]) + >>> mt.tile(b, 2).execute() + array([[1, 2, 1, 2], + [3, 4, 3, 4]]) + >>> mt.tile(b, (2, 1)).execute() + array([[1, 2], + [3, 4], + [1, 2], + [3, 4]]) + + >>> c = mt.array([1,2,3,4]) + >>> mt.tile(c,(4,1)).execute() + array([[1, 2, 3, 4], + [1, 2, 3, 4], + [1, 2, 3, 4], + [1, 2, 3, 4]]) + """ + from ..merge import concatenate + + try: + tup = tuple(reps) + except TypeError: + tup = (reps,) + + d = len(tup) + if A.ndim < d: + A = A[tuple(np.newaxis for _ in range(d - A.ndim))] + elif A.ndim > d: + tup = (1,) * (A.ndim - d) + tup + + a = A + for axis, rep in enumerate(tup): + if rep == 0: + slc = (slice(None),) * axis + (slice(0),) + a = a[slc] + elif rep < 0: + raise ValueError("negative dimensions are not allowed") + elif rep > 1: + a = concatenate([a] * rep, axis=axis) + + return a diff --git a/python/xorbits/_mars/tensor/base/to_cpu.py b/python/xorbits/_mars/tensor/base/to_cpu.py new file mode 100644 index 000000000..56b5138e8 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/to_cpu.py @@ -0,0 +1,40 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorDeviceConversionBase + + +class TensorToCPU(TensorDeviceConversionBase): + _op_type_ = OperandDef.TO_CPU + + def __init__(self, dtype=None, gpu=None, sparse=None, **kw): + super().__init__(dtype=dtype, gpu=gpu, sparse=sparse, **kw) + if self.gpu or self.gpu is None: + self.gpu = False + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = ctx[op.input.key].get() + + +def to_cpu(x): + x = astensor(x) + + if x.op.gpu is False: + return x + + op = TensorToCPU(dtype=x.dtype) + return op(x) diff --git a/python/xorbits/_mars/tensor/base/to_gpu.py b/python/xorbits/_mars/tensor/base/to_gpu.py new file mode 100644 index 000000000..44677191d --- /dev/null +++ b/python/xorbits/_mars/tensor/base/to_gpu.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ..array_utils import move_to_device +from ..datasource import tensor as astensor +from .core import TensorDeviceConversionBase + + +class TensorToGPU(TensorDeviceConversionBase): + _op_type_ = OperandDef.TO_GPU + + def __init__(self, dtype=None, gpu=None, sparse=None, **kw): + super().__init__(dtype=dtype, gpu=gpu, sparse=sparse, **kw) + if not self.gpu: + self.gpu = True + + @classmethod + def execute(cls, ctx, op): + device = op.device or 0 + ctx[op.outputs[0].key] = move_to_device(ctx[op.input.key], device) + + +def to_gpu(x): + x = astensor(x) + + if x.op.gpu: + return x + + op = TensorToGPU(dtype=x.dtype) + return op(x) diff --git a/python/xorbits/_mars/tensor/base/topk.py b/python/xorbits/_mars/tensor/base/topk.py new file mode 100644 index 000000000..e8792b6e8 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/topk.py @@ -0,0 +1,597 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...core import ExecutableTuple, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + BoolField, + FieldTypes, + Int32Field, + Int64Field, + KeyField, + ListField, + StringField, +) +from ...utils import ceildiv, flatten +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import validate_axis, validate_order +from .sort import _validate_sort_psrs_kinds + + +class TensorTopk(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.TOPK + + _input = KeyField("input") + _k = Int64Field("k") + _axis = Int32Field("axis") + _largest = BoolField("largest") + _sorted = BoolField("sorted") + _order = ListField("order", FieldTypes.string) + _parallel_kind = StringField("parallel_kind") + _psrs_kinds = ListField("psrs_kinds", FieldTypes.string) + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + _axis_offset = Int64Field( + "axis_offset", + on_serialize=lambda x: -1 if x is not None and np.isnan(x) else x, + on_deserialize=lambda x: np.nan if x == -1 else x, + ) + + def __init__( + self, + k=None, + axis=None, + largest=None, + sorted=None, + order=None, + parallel_kind=None, + psrs_kinds=None, + return_value=None, + return_indices=None, + axis_offset=None, + **kw + ): + super().__init__( + _k=k, + _axis=axis, + _largest=largest, + _sorted=sorted, + _parallel_kind=parallel_kind, + _psrs_kinds=psrs_kinds, + _return_value=return_value, + _return_indices=return_indices, + _order=order, + _axis_offset=axis_offset, + **kw + ) + + @property + def input(self): + return self._input + + @property + def k(self): + return self._k + + @property + def axis(self): + return self._axis + + @property + def largest(self): + return self._largest + + @property + def sorted(self): + return self._sorted + + 
@property + def order(self): + return self._order + + @property + def parallel_kind(self): + return self._parallel_kind + + @property + def psrs_kinds(self): + return self._psrs_kinds + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def axis_offset(self): + return self._axis_offset + + @property + def output_limit(self): + if self.stage != OperandStage.agg: + return 1 + else: + return int(bool(self._return_value)) + int(bool(self._return_indices)) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + shape = list(a.shape) + shape[self._axis] = min(a.shape[self._axis], self._k) + kws = [] + if self._return_value: + kws.append( + { + "shape": tuple(shape), + "order": a.order, + "dtype": a.dtype, + "type": "topk", + } + ) + if self._return_indices: + kws.append( + { + "shape": tuple(shape), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "type": "argtopk", + } + ) + ret = self.new_tensors([a], kws=kws) + if len(kws) == 1: + return ret[0] + return ExecutableTuple(ret) + + @classmethod + def _tile_one_chunk(cls, op): + return_value, return_indices = op.return_value, op.return_indices + out = op.outputs[0] + chunk_op = op.copy().reset_key() + kws = [] + if return_value: + kws.append( + { + "shape": out.shape, + "order": out.order, + "index": (0,) * out.ndim, + "dtype": out.dtype, + "type": "topk", + } + ) + if return_indices: + kws.append( + { + "shape": out.shape, + "order": TensorOrder.C_ORDER, + "index": (0,) * out.ndim, + "dtype": np.dtype(np.int64), + "type": "argtopk", + } + ) + chunks = chunk_op.new_chunks([op.input.chunks[0]], kws=kws) + kws = [out.params for out in op.outputs] + nsplits = tuple((s,) for s in out.shape) + if return_value: + kws[0]["nsplits"] = nsplits + kws[0]["chunks"] = [chunks[0]] + if return_indices: + kws[-1]["nsplits"] = nsplits + kws[-1]["chunks"] = [chunks[1]] + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def _tile_via_psrs(cls, op): + from .sort import TensorSort + + return_value = op.return_value + return_indices = op.return_indices + + # just sort, force need_align=True + psrs_kinds = op.psrs_kinds or ["quicksort", "mergesort", "mergesort"] + sort_op = TensorSort( + axis=op.axis, + order=op.order, + psrs_kinds=psrs_kinds, + need_align=True, + return_value=return_value, + return_indices=return_indices, + ) + ret = sort_op(op.input) + + if not isinstance(ret, tuple): + ret = (ret,) + + base_slcs = (slice(None),) * op.axis + if op.largest: + ret = [r[base_slcs + (slice(-1, -op.k - 1, -1),)] for r in ret] + else: + ret = [r[base_slcs + (slice(op.k),)] for r in ret] + + ret = yield from recursive_tile(ret) + new_op = op.copy() + kws = [o.params for o in op.outputs] + if return_value: + kws[0]["nsplits"] = ret[0].nsplits + kws[0]["chunks"] = ret[0].chunks + if return_indices: + kws[-1]["nsplits"] = ret[-1].nsplits + kws[-1]["chunks"] = ret[-1].chunks + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def _gen_topk_chunk( + cls, input_chunk, op, is_terminate_node, axis_offset=None, chunk_index=None + ): + chunk_op = op.copy().reset_key() + if axis_offset is not None: + chunk_op._axis_offset = axis_offset + if not is_terminate_node: + # no need to sort if not the terminated node + chunk_op._sorted = False + shape = list(input_chunk.shape) + shape[op.axis] = min(op.k, input_chunk.shape[op.axis]) + if not 
is_terminate_node: + # whenever return_indices, value is required + chunk_op._return_value = True + if axis_offset is not None: + chunk_op.stage = OperandStage.map + else: + chunk_op.stage = OperandStage.combine + return chunk_op.new_chunk( + [input_chunk], + shape=tuple(shape), + order=input_chunk.order, + index=chunk_index, + ) + else: + chunk_op.stage = OperandStage.agg + kws = [] + if op.return_value: + kws.append( + { + "shape": tuple(shape), + "order": input_chunk.order, + "dtype": input_chunk.dtype, + "index": chunk_index, + "type": "topk", + } + ) + if op.return_indices: + kws.append( + { + "shape": tuple(shape), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "index": chunk_index, + "type": "argtopk", + } + ) + return chunk_op.new_chunks([input_chunk], kws=kws) + + @classmethod + def _merge_chunks(cls, input_chunks, axis): + from ..merge import TensorConcatenate + + if len(input_chunks) == 1: + return input_chunks[0] + + shape = list(input_chunks[0].shape) + shape[axis] = sum(c.shape[axis] for c in input_chunks) + + merge_op = TensorConcatenate(axis=axis, dtype=input_chunks[0].dtype) + return merge_op.new_chunk( + input_chunks, shape=tuple(shape), order=input_chunks[0].order + ) + + @classmethod + def _tile_via_tree(cls, op): + a = op.input + axis = op.axis + return_value, return_indices = op.return_value, op.return_indices + combine_size = options.combine_size + axis_offsets = [0] + np.cumsum(a.nsplits[axis]).tolist()[:-1] + + out_chunks, indices_chunks = [], [] + for other_idx in itertools.product( + *(range(s) for i, s in enumerate(a.chunk_shape) if i != axis) + ): + merge_chunks = [] + for j in range(a.chunk_shape[axis]): + idx = list(other_idx) + idx.insert(axis, j) + input_chunk = a.cix[tuple(idx)] + merge_chunks.append( + cls._gen_topk_chunk( + input_chunk, op, False, axis_offset=axis_offsets[j] + ) + ) + while len(merge_chunks) > combine_size: + new_size = ceildiv(len(merge_chunks), combine_size) + new_merge_chunks = [] + for i in range(new_size): + to_merge_chunks = merge_chunks[ + i * combine_size : (i + 1) * combine_size + ] + merge_chunk = cls._merge_chunks(to_merge_chunks, axis) + topk_chunk = cls._gen_topk_chunk(merge_chunk, op, False) + new_merge_chunks.append(topk_chunk) + merge_chunks = new_merge_chunks + + merge_chunk = cls._merge_chunks(merge_chunks, axis) + chunk_index = list(other_idx) + chunk_index.insert(axis, 0) + chunks = cls._gen_topk_chunk( + merge_chunk, op, True, chunk_index=tuple(chunk_index) + ) + if return_value: + out_chunks.append(chunks[0]) + if return_indices: + indices_chunks.append(chunks[-1]) + + new_op = op.copy() + nsplits = list(a.nsplits) + nsplits[axis] = (min(a.shape[axis], op.k),) + kws = [out.params for out in op.outputs] + if return_value: + kws[0]["nsplits"] = nsplits + kws[0]["chunks"] = out_chunks + if return_indices: + kws[-1]["nsplits"] = nsplits + kws[-1]["chunks"] = indices_chunks + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def tile(cls, op): + a = op.input + combine_size = options.combine_size + k = op.k + axis = op.axis + + if len(a.chunks) == 1: + return cls._tile_one_chunk(op) + + parallel_kind = op.parallel_kind.lower() + + if parallel_kind == "auto": + nsplit = a.nsplits[axis] + max_chunk_size = max(nsplit) + if np.isnan(max_chunk_size): + # has unknown chunk shape and k > 100 just choose 'psrs' + parallel_kind = "psrs" if k > 100 else "tree" + else: + if combine_size * k <= max_chunk_size: + # each chunk will have k elements on specified axis, + # if combined chunk which generated 
in the tree reduction + # is less than max chunk size, parallel kind `tree` will be adopted + parallel_kind = "tree" + else: + parallel_kind = "psrs" + + if parallel_kind == "tree": + op._parallel_kind = "tree" + return cls._tile_via_tree(op) + else: + assert parallel_kind == "psrs" + op._parallel_kind = "psrs" + return (yield from cls._tile_via_psrs(op)) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + flatten([ctx[inp.key] for inp in op.inputs]), + device=op.device, + ret_extra=True, + ) + if len(inputs) == 2: + a, indices = inputs + else: + a, indices = inputs[0], None + + k = op.k + axis = op.axis + to_sort = op.sorted + largest = op.largest + return_value = op.return_value + return_indices = op.return_indices + axis_offset = op.axis_offset + + with device(device_id): + av, ap = _topk_helper( + xp, + a, + k, + axis=axis, + largest=largest, + sorted=to_sort, + order=op.order, + indices=indices, + axis_offset=axis_offset, + return_value=return_value, + return_indices=return_indices, + ) + if op.stage != OperandStage.agg: + out = [av] + if op.return_indices: + out.append(ap) + ctx[op.outputs[0].key] = tuple(out) + else: + if op.return_value: + ctx[op.outputs[0].key] = av + if op.return_indices: + ctx[op.outputs[-1].key] = ap + + +def _gen_indices(shape, axis, xp): + ap = xp.swapaxes(xp.empty(shape, dtype=np.int64), axis, -1) + ap[...] = xp.arange(shape[axis]).reshape((1,) * (ap.ndim - 1) + (-1,)) + return xp.swapaxes(ap, -1, axis) + + +def _topk_helper( + xp, + a, + k, + axis=-1, + largest=True, + sorted=True, + order=None, + indices=None, + axis_offset=None, + return_value=True, + return_indices=False, +): + size = a.shape[axis] + base_slc = (slice(None),) * axis + kw = {} + if order is not None: + kw["order"] = order + + ap = None + if return_indices: + # do partition + if largest: + if k < size: + length = size - k + ap = xp.argpartition(a, length, axis=axis, **kw)[ + base_slc + (slice(-k, None),) + ] + av = xp.take_along_axis(a, ap, axis) + if indices is not None: + ap = xp.take_along_axis(indices, ap, axis) + else: + av = a + if indices is not None: + ap = indices + else: + ap = _gen_indices(a.shape, axis, xp) + if sorted: + # sort then reverse + ags = xp.argsort(av, axis=axis, **kw)[ + base_slc + (slice(None, None, -1),) + ] + ap = xp.take_along_axis(ap, ags, axis) + av = xp.take_along_axis(av, ags, axis) + else: + if k < size: + ap = xp.argpartition(a, k, axis=axis, **kw)[base_slc + (slice(k),)] + av = xp.take_along_axis(a, ap, axis) + if indices is not None: + ap = xp.take_along_axis(indices, ap, axis) + else: + av = a + if indices is not None: + ap = indices + else: + ap = _gen_indices(a.shape, axis, xp) + if sorted: + ags = xp.argsort(av, axis=axis, **kw) + ap = xp.take_along_axis(ap, ags, axis) + av = xp.take_along_axis(av, ags, axis) + if axis_offset: + ap = ap + axis_offset + else: + assert return_value + if largest: + if k < size: + length = size - k + av = xp.partition(a, length, axis=axis, **kw)[ + base_slc + (slice(-k, None),) + ] + else: + av = a + if sorted: + # sort then reverse + av = xp.sort(av, axis=axis, **kw)[base_slc + (slice(None, None, -1),)] + else: + if k < size: + av = xp.partition(a, k, axis=axis, **kw)[base_slc + (slice(k),)] + else: + av = a + if sorted: + av = xp.sort(av, axis=axis, **kw) + + return av, ap + + +def _validate_topk_arguments( + a, k, axis, largest, sorted, order, parallel_kind, psrs_kinds +): + a = astensor(a) + if axis is None: + a = a.flatten() + axis = 0 + else: + axis = validate_axis(a.ndim, 
axis) + # if a is structure type and order is not None + order = validate_order(a.dtype, order) + if parallel_kind.lower() not in {"auto", "tree", "psrs"}: + raise ValueError("`parallel_kind` could only be `auto`, `tree`, or `psrs`") + # if psrs is chosen, sort will be used, + # psrs_kinds will be passed into it, so use the validation logic in sort + psrs_kinds = _validate_sort_psrs_kinds(psrs_kinds) + return a, k, axis, largest, sorted, order, parallel_kind, psrs_kinds + + +def topk( + a, + k, + axis=-1, + largest=True, + sorted=True, + order=None, + parallel_kind="auto", + psrs_kinds=None, + return_index=False, +): + ( + a, + k, + axis, + largest, + sorted, + order, + parallel_kind, + psrs_kinds, + ) = _validate_topk_arguments( + a, k, axis, largest, sorted, order, parallel_kind, psrs_kinds + ) + op = TensorTopk( + k=k, + axis=axis, + largest=largest, + sorted=sorted, + order=order, + parallel_kind=parallel_kind, + psrs_kinds=psrs_kinds, + dtype=a.dtype, + return_value=True, + return_indices=return_index, + stage=OperandStage.agg, + ) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/transpose.py b/python/xorbits/_mars/tensor/base/transpose.py new file mode 100644 index 000000000..24028ac5e --- /dev/null +++ b/python/xorbits/_mars/tensor/base/transpose.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, KeyField, ListField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import reverse_order + + +def _reorder(x, axes): + if x is None: + return + return type(x)(x[ax] for ax in axes) + + +class TensorTranspose(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.TRANSPOSE + + _input = KeyField("input") + _axes = ListField("axes", FieldTypes.int32) + + def __init__(self, axes=None, **kw): + super().__init__( + _axes=axes, + # transpose will create a view + create_view=True, + **kw + ) + + @property + def axes(self): + return getattr(self, "_axes", None) + + def __call__(self, a): + shape = tuple( + s if np.isnan(s) else int(s) for s in _reorder(a.shape, self._axes) + ) + if self._axes == list(reversed(range(a.ndim))): + # order reversed + tensor_order = reverse_order(a.order) + else: + tensor_order = TensorOrder.C_ORDER + return self.new_tensor([a], shape, order=tensor_order) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def on_output_modify(self, new_output): + op = self.copy().reset_key() + return op(new_output) + + def on_input_modify(self, new_input): + op = self.copy().reset_key() + return op(new_input) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + + out_chunks = [] + for c in op.inputs[0].chunks: + chunk_op = op.copy().reset_key() + chunk_shape = tuple( + s if np.isnan(s) else int(s) for s in _reorder(c.shape, op.axes) + ) + chunk_idx = _reorder(c.index, op.axes) + out_chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=chunk_idx, order=tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + nsplits = _reorder(op.inputs[0].nsplits, op.axes) + return new_op.new_tensors( + op.inputs, + op.outputs[0].shape, + order=tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axes = op.axes + with device(device_id): + ctx[op.outputs[0].key] = xp.transpose(x, axes or None) + + +def transpose(a, axes=None): + """ + Permute the dimensions of a tensor. + + Parameters + ---------- + a : array_like + Input tensor. + axes : list of ints, optional + By default, reverse the dimensions, otherwise permute the axes + according to the values given. + + Returns + ------- + p : Tensor + `a` with its axes permuted. A view is returned whenever + possible. + + See Also + -------- + moveaxis + argsort + + Notes + ----- + Use `transpose(a, argsort(axes))` to invert the transposition of tensors + when using the `axes` keyword argument. + + Transposing a 1-D array returns an unchanged view of the original tensor. 
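The note just above, that ``transpose(a, argsort(axes))`` inverts a transposition, can be checked directly with NumPy. The sketch below assumes only NumPy and mirrors the permutation that the view-based TensorTranspose above applies chunk by chunk; it is an illustration, not part of the patch.

import numpy as np

a = np.arange(24).reshape(2, 3, 4)
axes = (2, 0, 1)

t = np.transpose(a, axes)                      # shape becomes (4, 2, 3)
# argsort(axes) is the inverse permutation, so transposing again restores `a`
restored = np.transpose(t, np.argsort(axes))

assert restored.shape == a.shape
assert np.array_equal(restored, a)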
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(4).reshape((2,2)) + >>> x.execute() + array([[0, 1], + [2, 3]]) + + >>> mt.transpose(x).execute() + array([[0, 2], + [1, 3]]) + + >>> x = mt.ones((1, 2, 3)) + >>> mt.transpose(x, (1, 0, 2)).shape + (2, 1, 3) + + """ + a = astensor(a) + if axes: + if len(axes) != a.ndim: + raise ValueError("axes don't match tensor") + + if not axes: + axes = list(range(a.ndim))[::-1] + else: + axes = list(axes) + op = TensorTranspose(axes, dtype=a.dtype, sparse=a.issparse()) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/trapz.py b/python/xorbits/_mars/tensor/base/trapz.py new file mode 100644 index 000000000..7e37e651e --- /dev/null +++ b/python/xorbits/_mars/tensor/base/trapz.py @@ -0,0 +1,213 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ...core import recursive_tile +from ...serialization.serializables import Float64Field, Int8Field, KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import validate_axis + + +class TensorTrapz(TensorOperand, TensorOperandMixin): + _op_type_ = opcodes.TRAPZ + + _y = KeyField("y") + _x = KeyField("x") + _dx = Float64Field("dx") + _axis = Int8Field("axis") + + def __init__(self, y=None, x=None, dx=None, axis=None, **kw): + super().__init__(_y=y, _x=x, _dx=dx, _axis=axis, **kw) + + @property + def y(self): + return self._y + + @property + def x(self): + return self._x + + @property + def dx(self): + return self._dx + + @property + def axis(self): + return self._axis + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._y = self._inputs[0] + if self._x is not None: + self._x = self._inputs[-1] + + def __call__(self, y, x=None): + inputs = [y] + order = y.order + if x is not None: + x = astensor(x) + inputs.append(x) + if x.order == TensorOrder.C_ORDER: + order = TensorOrder.C_ORDER + + shape = tuple(s for ax, s in enumerate(y.shape) if ax != self._axis) + dtype = np.trapz(np.empty(1, dtype=y.dtype)).dtype + return self.new_tensor(inputs, shape=shape, dtype=dtype, order=order) + + @classmethod + def tile(cls, op: "TensorTrapz"): + from .diff import diff + + y = astensor(op.y) + x = op.x + axis = op.axis + + if x is not None: + x = astensor(x) + # rechunk x to make x.nsplits == y.nsplits + if has_unknown_shape(x, y): + yield + x = yield from recursive_tile(x.rechunk(y.nsplits)) + + if len(y.chunks) == 1: + return cls._tile_one_chunk(op, y, x) + + if x is None: + d = op.dx + else: + if x.ndim == 1: + d = diff(x) + # reshape to correct shape + shape = [1] * y.ndim + shape[axis] = d.shape[0] + d = d.reshape(shape) + else: + d = diff(x, axis=axis) + nd = y.ndim + slice1 = [slice(None)] * nd + slice2 = [slice(None)] * nd + slice1[axis] = slice(1, None) + slice2[axis] = slice(None, -1) + ret 
= (d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0).sum(axis) + return [(yield from recursive_tile(ret))] + + @classmethod + def _tile_one_chunk(cls, op, y, x): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + inputs = [y.chunks[0]] + if x is not None: + inputs.append(x.chunks[0]) + chunk = chunk_op.new_chunk( + inputs, shape=out.shape, order=out.order, index=(0,) * out.ndim + ) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + nsplits=tuple((s,) for s in out.shape), + chunks=[chunk], + ) + + @classmethod + def execute(cls, ctx, op: "TensorTrapz"): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + y = inputs[0] + if len(inputs) > 1: + x = inputs[-1] + else: + x = None + + with device(device_id): + ctx[op.outputs[0].key] = xp.trapz(y, x=x, dx=op.dx, axis=op.axis) + + +def trapz(y, x=None, dx=1.0, axis=-1): + """ + Integrate along the given axis using the composite trapezoidal rule. + + Integrate `y` (`x`) along given axis. + + Parameters + ---------- + y : array_like + Input tensor to integrate. + x : array_like, optional + The sample points corresponding to the `y` values. If `x` is None, + the sample points are assumed to be evenly spaced `dx` apart. The + default is None. + dx : scalar, optional + The spacing between sample points when `x` is None. The default is 1. + axis : int, optional + The axis along which to integrate. + + Returns + ------- + trapz : float + Definite integral as approximated by trapezoidal rule. + + See Also + -------- + sum, cumsum + + Notes + ----- + Image [2]_ illustrates trapezoidal rule -- y-axis locations of points + will be taken from `y` tensor, by default x-axis distances between + points will be 1.0, alternatively they can be provided with `x` tensor + or with `dx` scalar. Return value will be equal to combined area under + the red lines. + + + References + ---------- + .. [1] Wikipedia page: https://en.wikipedia.org/wiki/Trapezoidal_rule + + .. [2] Illustration image: + https://en.wikipedia.org/wiki/File:Composite_trapezoidal_rule_illustration.png + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.trapz([1,2,3]).execute() + 4.0 + >>> mt.trapz([1,2,3], x=[4,6,8]).execute() + 8.0 + >>> mt.trapz([1,2,3], dx=2).execute() + 8.0 + >>> a = mt.arange(6).reshape(2, 3) + >>> a.execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.trapz(a, axis=0).execute() + array([1.5, 2.5, 3.5]) + >>> mt.trapz(a, axis=1).execute() + array([2., 8.]) + + """ + y = astensor(y) + axis = validate_axis(y.ndim, axis) + op = TensorTrapz(y=y, x=x, dx=dx, axis=axis) + return op(y, x=x) diff --git a/python/xorbits/_mars/tensor/base/unique.py b/python/xorbits/_mars/tensor/base/unique.py new file mode 100644 index 000000000..caf169d53 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/unique.py @@ -0,0 +1,603 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...lib import sparse +from ...lib.sparse.core import get_array_module as get_sparse_array_module +from ...serialization.serializables import BoolField, Int32Field, Int64Field +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..operands import TensorMapReduceOperand, TensorOperandMixin, TensorShuffleProxy +from ..utils import hash_on_axis, validate_axis + + +class TensorUnique(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.UNIQUE + + _return_index = BoolField("return_index") + _return_inverse = BoolField("return_inverse") + _return_counts = BoolField("return_counts") + _axis = Int32Field("axis") + _aggregate_size = Int32Field("aggregate_size") + + _start_pos = Int64Field("start_pos") + + def __init__( + self, + return_index=None, + return_inverse=None, + return_counts=None, + axis=None, + start_pos=None, + aggregate_size=None, + **kw + ): + super().__init__( + _return_index=return_index, + _return_inverse=return_inverse, + _return_counts=return_counts, + _axis=axis, + _start_pos=start_pos, + _aggregate_size=aggregate_size, + **kw + ) + + @property + def output_limit(self): + if self.stage == OperandStage.map: + return 1 + return ( + 1 + + bool(self._return_index) + + bool(self._return_inverse) + + bool(self._return_counts) + ) + + @property + def return_index(self): + return self._return_index + + @property + def return_inverse(self): + return self._return_inverse + + @property + def return_counts(self): + return self._return_counts + + @property + def axis(self): + return self._axis + + @property + def aggregate_size(self): + return self._aggregate_size + + @property + def start_pos(self): + return self._start_pos + + @classmethod + def _gen_kws(cls, op, input_obj, chunk=False, chunk_index=None): + kws = [] + + # unique tensor + shape = list(input_obj.shape) + shape[op.axis] = np.nan + kw = {"shape": tuple(shape), "dtype": input_obj.dtype, "gpu": input_obj.op.gpu} + if chunk: + idx = [0] * len(shape) + idx[op.axis] = chunk_index or 0 + kw["index"] = tuple(idx) + kws.append(kw) + + # unique indices tensor + if op.return_index: + kw = { + "shape": (np.nan,), + "dtype": np.dtype(np.intp), + "gpu": input_obj.op.gpu, + "type": "indices", + } + if chunk: + kw["index"] = (chunk_index or 0,) + kws.append(kw) + + # unique inverse tensor + if op.return_inverse: + kw = { + "shape": (input_obj.shape[op.axis],), + "dtype": np.dtype(np.intp), + "gpu": input_obj.op.gpu, + "type": "inverse", + } + if chunk: + kw["index"] = (chunk_index or 0,) + kws.append(kw) + + # unique counts tensor + if op.return_counts: + kw = { + "shape": (np.nan,), + "dtype": np.dtype(np.int_), + "gpu": input_obj.op.gpu, + "type": "counts", + } + if chunk: + kw["index"] = (chunk_index or 0,) + kws.append(kw) + + return kws + + def __call__(self, ar): + from .atleast_1d import atleast_1d + + ar = atleast_1d(ar) + if self.axis is None: + if ar.ndim > 1: + ar = ar.flatten() + self._axis = 0 + else: + self._axis = validate_axis(ar.ndim, self._axis) + + kws = self._gen_kws(self, ar) + tensors = self.new_tensors([ar], kws=kws, order=TensorOrder.C_ORDER) + if len(tensors) == 1: + return tensors[0] + return tensors + + @classmethod + def _tile_one_chunk(cls, op): + outs = op.outputs + ins = op.inputs + + chunk_op = op.copy().reset_key() + in_chunk = ins[0].chunks[0] + kws = cls._gen_kws(chunk_op, 
in_chunk, chunk=True) + out_chunks = chunk_op.new_chunks([in_chunk], kws=kws, order=outs[0].order) + new_op = op.copy() + kws = [out.params.copy() for out in outs] + for kw, out_chunk in zip(kws, out_chunks): + kw["chunks"] = [out_chunk] + kw["nsplits"] = tuple((s,) for s in out_chunk.shape) + return new_op.new_tensors(ins, kws=kws, order=outs[0].order) + + @classmethod + def _tile_via_shuffle(cls, op): + # rechunk the axes except the axis to do unique into 1 chunk + inp = op.inputs[0] + if has_unknown_shape(inp): + yield + + if inp.ndim > 1: + new_chunk_size = dict() + for axis in range(inp.ndim): + if axis == op.axis: + continue + if np.isnan(inp.shape[axis]): + yield + new_chunk_size[axis] = inp.shape[axis] + if has_unknown_shape(inp): + yield + inp = yield from recursive_tile(inp.rechunk(new_chunk_size)) + + aggregate_size = op.aggregate_size + if aggregate_size is None: + aggregate_size = max(inp.chunk_shape[op.axis] // options.combine_size, 1) + + unique_on_chunk_sizes = inp.nsplits[op.axis] + start_poses = np.cumsum((0,) + unique_on_chunk_sizes).tolist()[:-1] + map_chunks = [] + for c in inp.chunks: + map_op = TensorUnique( + stage=OperandStage.map, + return_index=op.return_index, + return_inverse=op.return_inverse, + return_counts=op.return_counts, + axis=op.axis, + aggregate_size=aggregate_size, + start_pos=start_poses[c.index[op.axis]], + dtype=inp.dtype, + ) + shape = list(c.shape) + shape[op.axis] = np.nan + map_chunks.append(map_op.new_chunk([c], shape=tuple(shape), index=c.index)) + + shuffle_chunk = TensorShuffleProxy( + dtype=inp.dtype, _tensor_keys=[inp.op.key] + ).new_chunk(map_chunks, shape=()) + + reduce_chunks = [list() for _ in range(len(op.outputs))] + for i in range(aggregate_size): + reduce_op = TensorUnique( + stage=OperandStage.reduce, + return_index=op.return_index, + return_inverse=op.return_inverse, + return_counts=op.return_counts, + axis=op.axis, + reducer_index=(i,), + reducer_phase="agg", + n_reducers=aggregate_size, + ) + kws = cls._gen_kws(op, inp, chunk=True, chunk_index=i) + chunks = reduce_op.new_chunks( + [shuffle_chunk], kws=kws, order=op.outputs[0].order + ) + if op.return_inverse: + inverse_idx = 2 if op.return_index else 1 + for j, chk in enumerate(chunks): + if j == inverse_idx: + chk.is_mapper = True + else: + chk.is_mapper = False + for j, c in enumerate(chunks): + reduce_chunks[j].append(c) + + if op.return_inverse: + inverse_pos = 2 if op.return_index else 1 + map_inverse_chunks = reduce_chunks[inverse_pos] + inverse_shuffle_chunk = TensorShuffleProxy( + dtype=map_inverse_chunks[0].dtype + ).new_chunk(map_inverse_chunks, shape=()) + inverse_chunks = [] + for j, cs in enumerate(unique_on_chunk_sizes): + chunk_op = TensorUnique( + stage=OperandStage.reduce, + n_reducers=len(unique_on_chunk_sizes), + dtype=map_inverse_chunks[0].dtype, + reducer_index=(j,), + reducer_phase="inverse", + ) + inverse_chunk = chunk_op.new_chunk( + [inverse_shuffle_chunk], shape=(cs,), index=(j,) + ) + inverse_chunks.append(inverse_chunk) + reduce_chunks[inverse_pos] = inverse_chunks + + kws = [out.params for out in op.outputs] + for kw, chunks in zip(kws, reduce_chunks): + kw["chunks"] = chunks + unique_nsplits = list(inp.nsplits) + unique_nsplits[op.axis] = (np.nan,) * len(reduce_chunks[0]) + kws[0]["nsplits"] = tuple(unique_nsplits) + i = 1 + if op.return_index: + kws[i]["nsplits"] = ((np.nan,) * len(reduce_chunks[i]),) + i += 1 + if op.return_inverse: + kws[i]["nsplits"] = (inp.nsplits[op.axis],) + i += 1 + if op.return_counts: + kws[i]["nsplits"] = ((np.nan,) * 
len(reduce_chunks[i]),) + + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def tile(cls, op: "TensorUnique"): + if len(op.inputs[0].chunks) == 1: + return cls._tile_one_chunk(op) + else: + return (yield from cls._tile_via_shuffle(op)) + + @classmethod + def _execute_map(cls, ctx, op: "TensorUnique"): + (ar,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + n_reducers = op.aggregate_size + + with device(device_id): + results = xp.unique( + ar, + return_index=op.return_index, + return_inverse=op.return_inverse, + return_counts=op.return_counts, + axis=op.axis, + ) + results = (results,) if not isinstance(results, tuple) else results + results_iter = iter(results) + unique_ar = next(results_iter) + indices_ar = next(results_iter) + op.start_pos if op.return_index else None + inverse_ar = next(results_iter) if op.return_inverse else None + counts_ar = next(results_iter) if op.return_counts else None + + if xp is sparse: + dense_xp = get_sparse_array_module(unique_ar) + else: + dense_xp = xp + unique_index = ( + dense_xp.arange(unique_ar.shape[op.axis]) + if inverse_ar is not None + else None + ) + if unique_ar.size > 0: + unique_reducers = dense_xp.asarray( + hash_on_axis(unique_ar, op.axis, n_reducers) + ) + else: + unique_reducers = dense_xp.empty_like(unique_ar) + ind_ar = dense_xp.arange(ar.shape[op.axis]) + + for reducer in range(n_reducers): + res = [] + cond = unique_reducers == reducer + # unique + slc = (slice(None),) * op.axis + (cond,) + res.append(unique_ar[slc]) + # indices + if indices_ar is not None: + res.append(indices_ar[cond]) + # inverse + if inverse_ar is not None: + index_selected = unique_index[cond] + inv_cond = xp.isin(inverse_ar, index_selected) + inv_selected = xp.searchsorted(index_selected, inverse_ar[inv_cond]) + ind_selected = ind_ar[inv_cond] + res.append(xp.stack([ind_selected, inv_selected])) + # counts + if counts_ar is not None: + res.append(counts_ar[cond]) + ctx[op.outputs[0].key, (reducer,)] = ( + ctx.get_current_chunk().index, + tuple(res), + ) + + @classmethod + def _execute_agg_reduce(cls, ctx, op: "TensorUnique"): + input_indexes, input_data = zip(*list(op.iter_mapper_data(ctx))) + + inputs = list(zip(*input_data)) + flatten, device_id, xp = as_same_device( + list(itertools.chain(*inputs)), device=op.device, ret_extra=True + ) + n_ret = len(inputs[0]) + inputs = [flatten[i * n_ret : (i + 1) * n_ret] for i in range(len(inputs))] + + inputs_iter = iter(inputs) + unique_arrays = next(inputs_iter) + indices_arrays = next(inputs_iter) if op.return_index else None + inverse_arrays = next(inputs_iter) if op.return_inverse else None + counts_arrays = next(inputs_iter) if op.return_counts else None + + with device(device_id): + ar = xp.concatenate(unique_arrays, axis=op.axis) + result_return_inverse = op.return_inverse or op.return_counts + axis = op.axis + if ar.size == 0 or ar.shape[axis] == 0: + # empty array on the axis + results = [xp.empty(ar.shape)] + i = 1 + for it in (op.return_index, op.return_inverse, op.return_counts): + if it: + results.append(xp.empty([], dtype=op.outputs[i].dtype)) + i += 1 + results = tuple(results) + else: + results = xp.unique( + ar, + return_index=op.return_index, + return_inverse=result_return_inverse, + axis=axis, + ) + results = (results,) if not isinstance(results, tuple) else results + results_iter = iter(results) + outputs_iter = iter(op.outputs) + # unique array + ctx[next(outputs_iter).key] = next(results_iter) + + if 
op.output_limit == 1: + return + + # calc indices + if op.return_index: + ctx[next(outputs_iter).key] = xp.concatenate(indices_arrays)[ + next(results_iter) + ] + # calc inverse + try: + inverse_result = next(results_iter) + if op.return_inverse: + unique_sizes = tuple(ua.shape[op.axis] for ua in unique_arrays) + cum_unique_sizes = np.cumsum((0,) + unique_sizes) + indices_out_key = next(outputs_iter).key + for i, inverse_array in enumerate(inverse_arrays): + p = inverse_result[ + cum_unique_sizes[i] : cum_unique_sizes[i + 1] + ] + r = xp.empty(inverse_array.shape, dtype=inverse_array.dtype) + if inverse_array.size > 0: + r[0] = inverse_array[0] + r[1] = p[inverse_array[1]] + # return unique length and + ctx[indices_out_key, (input_indexes[i][op.axis],)] = ( + results[0].shape[op.axis], + r, + ) + # calc counts + if op.return_counts: + result_counts = xp.zeros(results[0].shape[op.axis], dtype=int) + t = np.stack([inverse_result, np.concatenate(counts_arrays)]) + + def acc(a): + i, v = a + result_counts[i] += v + + np.apply_along_axis(acc, 0, t) + ctx[next(outputs_iter).key] = xp.asarray(result_counts) + except StopIteration: + pass + + @classmethod + def _execute_inverse_reduce(cls, ctx, op: "TensorUnique"): + out = op.outputs[0] + inputs = list(op.iter_mapper_data(ctx)) + unique_sizes = [inp[0] for inp in inputs] + cum_unique_sizes = np.cumsum([0] + unique_sizes) + invs, device_id, xp = as_same_device( + [inp[1] for inp in inputs], device=op.device, ret_extra=True + ) + with device(device_id): + ret = xp.empty(out.shape, dtype=out.dtype) + for i, inv in enumerate(invs): + ret[inv[0]] = cum_unique_sizes[i] + inv[1] + ctx[out.key] = ret + + @classmethod + def execute(cls, ctx, op: "TensorUnique"): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + if op.reducer_phase == "agg": + cls._execute_agg_reduce(ctx, op) + else: + assert op.reducer_phase == "inverse" + cls._execute_inverse_reduce(ctx, op) + else: + (ar,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = dict( + return_index=op.return_index, + return_inverse=op.return_inverse, + return_counts=op.return_counts, + ) + if ar.dtype != object and sum(ar.shape) > 0: + # axis cannot pass when dtype is object or array size is 0 + kw["axis"] = op.axis + results = xp.unique(ar, **kw) + outs = op.outputs + if len(outs) == 1: + ctx[outs[0].key] = results + return + + assert len(outs) == len(results) + for out, result in zip(outs, results): + ctx[out.key] = result + + +def unique( + ar, + return_index=False, + return_inverse=False, + return_counts=False, + axis=None, + aggregate_size=None, +): + """ + Find the unique elements of a tensor. + + Returns the sorted unique elements of a tensor. There are three optional + outputs in addition to the unique elements: + + * the indices of the input tensor that give the unique values + * the indices of the unique tensor that reconstruct the input tensor + * the number of times each unique value comes up in the input tensor + + Parameters + ---------- + ar : array_like + Input tensor. Unless `axis` is specified, this will be flattened if it + is not already 1-D. + return_index : bool, optional + If True, also return the indices of `ar` (along the specified axis, + if provided, or in the flattened tensor) that result in the unique tensor. 
+ return_inverse : bool, optional + If True, also return the indices of the unique tensor (for the specified + axis, if provided) that can be used to reconstruct `ar`. + return_counts : bool, optional + If True, also return the number of times each unique item appears + in `ar`. + axis : int or None, optional + The axis to operate on. If None, `ar` will be flattened. If an integer, + the subarrays indexed by the given axis will be flattened and treated + as the elements of a 1-D tensor with the dimension of the given axis, + see the notes for more details. Object tensors or structured tensors + that contain objects are not supported if the `axis` kwarg is used. The + default is None. + aggregate_size: int or None, optional + How many chunks will be after unique, default as #input.chunks / options.combine_size + + Returns + ------- + unique : Tensor + The sorted unique values. + unique_indices : Tensor, optional + The indices of the first occurrences of the unique values in the + original tensor. Only provided if `return_index` is True. + unique_inverse : Tensor, optional + The indices to reconstruct the original tensor from the + unique tensor. Only provided if `return_inverse` is True. + unique_counts : Tensor, optional + The number of times each of the unique values comes up in the + original tensor. Only provided if `return_counts` is True. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.unique([1, 1, 2, 2, 3, 3]).execute() + array([1, 2, 3]) + >>> a = mt.array([[1, 1], [2, 3]]) + >>> mt.unique(a).execute() + array([1, 2, 3]) + + Return the unique rows of a 2D tensor + + >>> a = mt.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]]) + >>> mt.unique(a, axis=0).execute() + array([[1, 0, 0], [2, 3, 4]]) + + Return the indices of the original tensor that give the unique values: + + >>> a = mt.array(['a', 'b', 'b', 'c', 'a']) + >>> u, indices = mt.unique(a, return_index=True) + >>> u.execute() + array(['a', 'b', 'c'], + dtype='|S1') + >>> indices.execute() + array([0, 1, 3]) + >>> a[indices].execute() + array(['a', 'b', 'c'], + dtype='|S1') + + Reconstruct the input array from the unique values: + + >>> a = mt.array([1, 2, 6, 4, 2, 3, 2]) + >>> u, indices = mt.unique(a, return_inverse=True) + >>> u.execute() + array([1, 2, 3, 4, 6]) + >>> indices.execute() + array([0, 1, 4, 3, 1, 2, 1]) + >>> u[indices].execute() + array([1, 2, 6, 4, 2, 3, 2]) + """ + op = TensorUnique( + return_index=return_index, + return_inverse=return_inverse, + return_counts=return_counts, + axis=axis, + aggregate_size=aggregate_size, + ) + return op(ar) diff --git a/python/xorbits/_mars/tensor/base/vsplit.py b/python/xorbits/_mars/tensor/base/vsplit.py new file mode 100644 index 000000000..328af84aa --- /dev/null +++ b/python/xorbits/_mars/tensor/base/vsplit.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
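Before moving on to vsplit, it may help to make explicit the invariant that the map and aggregation stages of the distributed unique implementation above rely on: taking the unique of the concatenated per-chunk unique values gives the same result as taking the unique of the whole tensor. A minimal NumPy-only sketch of that property (the chunking is hypothetical, no Mars operands are involved):

import numpy as np

# two hypothetical chunks of one logical 1-D tensor
chunk_a = np.array([3, 1, 2, 3, 1])
chunk_b = np.array([2, 5, 3, 5])

# map stage: each chunk computes its local unique values
local_uniques = [np.unique(chunk_a), np.unique(chunk_b)]

# agg stage: the unique of the concatenated local results equals the
# unique of the full data, so combining partial results loses nothing
merged = np.unique(np.concatenate(local_uniques))
full = np.unique(np.concatenate([chunk_a, chunk_b]))
assert np.array_equal(merged, full)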
+ + +from ..datasource import tensor as astensor +from .split import split + + +def vsplit(a, indices_or_sections): + """ + Split a tensor into multiple sub-tensors vertically (row-wise). + + Please refer to the ``split`` documentation. ``vsplit`` is equivalent + to ``split`` with `axis=0` (default), the tensor is always split along the + first axis regardless of the tensor dimension. + + See Also + -------- + split : Split a tensor into multiple sub-tensors of equal size. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(16.0).reshape(4, 4) + >>> x.execute() + array([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [ 12., 13., 14., 15.]]) + >>> mt.vsplit(x, 2).execute() + [array([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.]]), + array([[ 8., 9., 10., 11.], + [ 12., 13., 14., 15.]])] + >>> mt.vsplit(x, mt.array([3, 6])).execute() + [array([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.]]), + array([[ 12., 13., 14., 15.]]), + array([], dtype=float64)] + + With a higher dimensional tensor the split is still along the first axis. + + >>> x = mt.arange(8.0).reshape(2, 2, 2) + >>> x.execute() + array([[[ 0., 1.], + [ 2., 3.]], + [[ 4., 5.], + [ 6., 7.]]]) + >>> mt.vsplit(x, 2).execute() + [array([[[ 0., 1.], + [ 2., 3.]]]), + array([[[ 4., 5.], + [ 6., 7.]]])] + + """ + ary = a + a = astensor(a) + + if a.ndim < 2: + raise ValueError("vsplit only works on tensors of 2 or more dimensions") + return split(ary, indices_or_sections, 0) diff --git a/python/xorbits/_mars/tensor/base/where.py b/python/xorbits/_mars/tensor/base/where.py new file mode 100644 index 000000000..be32b6619 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/where.py @@ -0,0 +1,193 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TENSOR_TYPE +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import broadcast_shape, unify_chunks +from .broadcast_to import broadcast_to + + +class TensorWhere(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.WHERE + + _condition = KeyField("condition") + _x = KeyField("x") + _y = KeyField("y") + + @property + def condition(self): + return self._condition + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._condition = self._inputs[0] + self._x = self._inputs[1] + self._y = self._inputs[2] + + def __call__(self, condition, x, y, shape=None): + shape = shape or broadcast_shape(condition.shape, x.shape, y.shape) + return self.new_tensor([condition, x, y], shape) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + inputs = yield from unify_chunks( + *[(input, list(range(input.ndim))[::-1]) for input in op.inputs] + ) + chunk_shapes = [ + t.chunk_shape if isinstance(t, TENSOR_TYPE) else t for t in inputs + ] + out_chunk_shape = broadcast_shape(*chunk_shapes) + output = op.outputs[0] + + out_chunks = [] + nsplits = [[np.nan] * shape for shape in out_chunk_shape] + get_index = lambda idx, t: tuple( + 0 if t.nsplits[i] == (1,) else ix for i, ix in enumerate(idx) + ) + for out_index in itertools.product(*(map(range, out_chunk_shape))): + in_chunks = [ + t.cix[get_index(out_index[-t.ndim :], t)] + if t.ndim != 0 + else t.chunks[0] + for t in inputs + ] + chunk_shape = broadcast_shape(*(c.shape for c in in_chunks)) + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + in_chunks, shape=chunk_shape, index=out_index, order=output.order + ) + ) + out_chunks.append(out_chunk) + for i, idx, s in zip(itertools.count(0), out_index, out_chunk.shape): + nsplits[i][idx] = s + + new_op = op.copy() + return new_op.new_tensors( + inputs, output.shape, order=output.order, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + (cond, x, y), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.where(cond, x, y) + + +def where(condition, x=None, y=None): + """ + Return elements, either from `x` or `y`, depending on `condition`. + + If only `condition` is given, return ``condition.nonzero()``. + + Parameters + ---------- + condition : array_like, bool + When True, yield `x`, otherwise yield `y`. + x, y : array_like, optional + Values from which to choose. `x`, `y` and `condition` need to be + broadcastable to some shape. + + Returns + ------- + out : Tensor or tuple of Tensors + If both `x` and `y` are specified, the output tensor contains + elements of `x` where `condition` is True, and elements from + `y` elsewhere. + + If only `condition` is given, return the tuple + ``condition.nonzero()``, the indices where `condition` is True. + + See Also + -------- + nonzero, choose + + Notes + ----- + If `x` and `y` are given and input arrays are 1-D, `where` is + equivalent to:: + + [xv if c else yv for (c,xv,yv) in zip(condition,x,y)] + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.where([[True, False], [True, True]], + ... [[1, 2], [3, 4]], + ... 
[[9, 8], [7, 6]]).execute() + array([[1, 8], + [3, 4]]) + + >>> mt.where([[0, 1], [1, 0]]).execute() + (array([0, 1]), array([1, 0])) + + >>> x = mt.arange(9.).reshape(3, 3) + >>> mt.where( x > 5 ).execute() + (array([2, 2, 2]), array([0, 1, 2])) + >>> mt.where(x < 5, x, -1).execute() # Note: broadcasting. + array([[ 0., 1., 2.], + [ 3., 4., -1.], + [-1., -1., -1.]]) + + Find the indices of elements of `x` that are in `goodvalues`. + + >>> goodvalues = [3, 4, 7] + >>> ix = mt.isin(x, goodvalues) + >>> ix.execute() + array([[False, False, False], + [ True, True, False], + [False, True, False]]) + >>> mt.where(ix).execute() + (array([1, 1, 2]), array([0, 1, 1])) + """ + if (x is None) != (y is None): + raise ValueError("either both or neither of x and y should be given") + + if x is None and y is None: + return astensor(condition).nonzero() + + x, y = astensor(x), astensor(y) + dtype = np.result_type(x.dtype, y.dtype) + shape = broadcast_shape(x.shape, y.shape) + + if np.isscalar(condition): + return broadcast_to(x if condition else y, shape).astype(dtype) + else: + condition = astensor(condition) + op = TensorWhere(dtype=dtype) + return op(condition, x, y, shape=shape) diff --git a/python/xorbits/_mars/tensor/core.py b/python/xorbits/_mars/tensor/core.py new file mode 100644 index 000000000..c50d052dd --- /dev/null +++ b/python/xorbits/_mars/tensor/core.py @@ -0,0 +1,727 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
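Before the tensor core module, a short NumPy-only sketch of the broadcasting contract that ``where`` above reproduces chunk by chunk, including broadcasting a scalar operand; this is an illustration of the semantics, not of the Mars operand itself.

import numpy as np

cond = np.array([[True, False], [False, True]])
x = np.array([[1, 2], [3, 4]])
y = 0                                   # scalar, broadcast against cond and x

# all three operands are broadcast to a common shape before selection
out = np.where(cond, x, y)
assert np.array_equal(out, np.array([[1, 0], [0, 4]]))

# the 1-D list-comprehension equivalent quoted in the docstring above
c = np.array([True, False, True])
xv = np.array([1, 2, 3])
yv = np.array([9, 8, 7])
expected = np.array([a if flag else b for flag, a, b in zip(c, xv, yv)])
assert np.array_equal(np.where(c, xv, yv), expected)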
+ +import logging +from collections.abc import Iterable +from enum import Enum +from operator import attrgetter +from typing import Any, Dict + +import numpy as np + +from ..core import ( + Chunk, + ChunkData, + HasShapeTileable, + HasShapeTileableData, + OutputType, + _ExecuteAndFetchMixin, + is_build_mode, + register_output_types, +) +from ..core.entity.utils import refresh_tileable_shape +from ..serialization.serializables import ( + AnyField, + DataTypeField, + FieldTypes, + ListField, + ReferenceField, + Serializable, + StringField, + TupleField, +) +from ..utils import on_deserialize_shape, on_serialize_shape +from .utils import fetch_corner_data, get_chunk_slices + +logger = logging.getLogger(__name__) + + +class TensorOrder(Enum): + # C order + C_ORDER = "C" + # Fortran order + F_ORDER = "F" + + +class TensorChunkData(ChunkData): + __slots__ = () + _no_copy_attrs_ = ChunkData._no_copy_attrs_ | {"dtype"} + type_name = "Tensor" + + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + _order = ReferenceField("order", TensorOrder) + # optional fields + _dtype = DataTypeField("dtype") + + def __init__(self, op=None, index=None, shape=None, dtype=None, order=None, **kw): + if isinstance(order, str): + order = getattr(TensorOrder, order) + super().__init__( + _op=op, _index=index, _shape=shape, _dtype=dtype, _order=order, **kw + ) + if self.order is None and self.op is not None: + if len(self.inputs) == 0: + self._order = TensorOrder.C_ORDER + elif all( + hasattr(inp, "order") and inp.order == TensorOrder.F_ORDER + for inp in self.inputs + ): + self._order = TensorOrder.F_ORDER + else: + self._order = TensorOrder.C_ORDER + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "shape": self.shape, + "dtype": self.dtype, + "order": self.order, + "index": self.index, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + order = params.pop("order", None) + if order is not None: + self._order = order + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + @classmethod + def get_params_from_data(cls, data: np.ndarray) -> Dict[str, Any]: + from .array_utils import is_cupy + + if not is_cupy(data): + data = np.asarray(data) + order = ( + TensorOrder.C_ORDER if data.flags["C_CONTIGUOUS"] else TensorOrder.F_ORDER + ) + return {"shape": data.shape, "dtype": data.dtype, "order": order} + + def __len__(self): + try: + return self.shape[0] + except IndexError: + if is_build_mode(): + return 0 + raise TypeError("len() of unsized object") + + @property + def shape(self): + return getattr(self, "_shape", None) + + @property + def ndim(self): + return len(self.shape) + + @property + def size(self): + return np.prod(self.shape).item() + + @property + def dtype(self): + return getattr(self, "_dtype", None) or getattr(self.op, "dtype", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def nbytes(self): + return np.prod(self.shape) * self.dtype.itemsize + + +class TensorChunk(Chunk): + __slots__ = () + _allow_data_type_ = (TensorChunkData,) + type_name = "Tensor" + + def __len__(self): 
+ return len(self._data) + + +class TensorData(HasShapeTileableData, _ExecuteAndFetchMixin): + __slots__ = () + type_name = "Tensor" + + # required fields + _order = StringField( + "order", on_serialize=attrgetter("value"), on_deserialize=TensorOrder + ) + # optional fields + _dtype = DataTypeField("dtype") + _chunks = ListField( + "chunks", + FieldTypes.reference(TensorChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [TensorChunk(it) for it in x] if x is not None else x, + ) + + def __init__( + self, + op=None, + shape=None, + dtype=None, + order=None, + nsplits=None, + chunks=None, + **kw, + ): + if isinstance(order, str): + order = getattr(TensorOrder, order) + super().__init__( + _op=op, + _shape=shape, + _dtype=dtype, + _order=order, + _nsplits=nsplits, + _chunks=chunks, + **kw, + ) + if self.order is None and self.op is not None: + if len(self.inputs) == 0: + self._order = TensorOrder.C_ORDER + elif all( + hasattr(inp, "order") and inp.order == TensorOrder.F_ORDER + for inp in self.inputs + ): + self._order = TensorOrder.F_ORDER + else: + self._order = TensorOrder.C_ORDER + + def _to_str(self, representation=False): + if is_build_mode() or len(self._executed_sessions) == 0: + # in build mode, or not executed, just return representation + if representation: + return f"Tensor " + else: + return f"Tensor(op={type(self._op).__name__}, shape={self._shape})" + else: + print_options = np.get_printoptions() + threshold = print_options["threshold"] + + corner_data = fetch_corner_data(self, session=self._executed_sessions[-1]) + # if less than default threshold, just set it as default, + # if not, set to corner_data.size - 1 make sure ... exists in repr + threshold = threshold if self.size <= threshold else corner_data.size - 1 + with np.printoptions(threshold=threshold): + corner_str = repr(corner_data) if representation else str(corner_data) + return corner_str + + def __str__(self): + return self._to_str(representation=False) + + def __repr__(self): + return self._to_str(representation=True) + + @property + def params(self): + # params return the properties which useful to rebuild a new tileable object + return {"shape": self.shape, "dtype": self.dtype, "order": self.order} + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + shape = params.pop("shape", None) + if shape is not None: + self._shape = shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + order = params.pop("order", None) + if order is not None: + self._order = order + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + def refresh_params(self): + refresh_tileable_shape(self) + if self._dtype is None: + self._dtype = self.chunks[0].dtype + + @property + def flags(self): + c_order = True if self.ndim <= 1 else self.order == TensorOrder.C_ORDER + f_order = True if self.ndim <= 1 else self.order == TensorOrder.F_ORDER + return {"C_CONTIGUOUS": c_order, "F_CONTIGUOUS": f_order} + + @property + def real(self): + from .arithmetic import real + + return real(self) + + @property + def imag(self): + from .arithmetic import imag + + return imag(self) + + @property + def dtype(self): + return getattr(self, "_dtype", None) or getattr(self.op, "dtype", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def nbytes(self): + return np.prod(self.shape) * self.dtype.itemsize + + def get_chunk_slices(self, idx): + return 
get_chunk_slices(self.nsplits, idx) + + def is_scalar(self): + return self.ndim == 0 + + isscalar = is_scalar + + def tosparse(self, missing=None): + if self.issparse(): + return self + + from .datasource import fromdense + + return fromdense(self, missing=missing) + + def todense(self, fill_value=None): + if not self.issparse(): + return self + + from .datasource import fromsparse + + return fromsparse(self, fill_value=fill_value) + + def transpose(self, *axes): + from .base import transpose + + if len(axes) == 1 and isinstance(axes[0], Iterable): + axes = axes[0] + + return transpose(self, axes) + + @property + def T(self): + return self.transpose() + + def reshape(self, shape, *shapes, **kw): + from .reshape import reshape + + order = kw.pop("order", "C") + if kw: + raise TypeError( + f"'{next(iter(kw))}' is an invalid keyword argument for this function" + ) + + if isinstance(shape, Iterable): + shape = tuple(shape) + else: + shape = (shape,) + shape += shapes + + return reshape(self, shape, order=order) + + def totiledb(self, uri, ctx=None, key=None, timestamp=None): + from .datastore import totiledb + + return totiledb(uri, self, ctx=ctx, key=key, timestamp=timestamp) + + @staticmethod + def from_dataframe(in_df): + from .datasource import from_dataframe + + return from_dataframe(in_df) + + def to_dataframe(self, *args, **kwargs): + from ..dataframe.datasource.from_tensor import dataframe_from_tensor + + return dataframe_from_tensor(self, *args, **kwargs) + + @property + def flat(self): + return flatiter(self) + + def to_numpy(self, session=None, **kw): + return self._execute_and_fetch(session=session, **kw) + + +class Tensor(HasShapeTileable): + __slots__ = () + _allow_data_type_ = (TensorData,) + type_name = "Tensor" + + def __len__(self): + return len(self._data) + + @property + def shape(self): + return self._data.shape + + @shape.setter + def shape(self, new_shape): + self._data = self._data.reshape(new_shape).data + + def _update_shape(self, new_shape): + self._data._update_shape(new_shape) + + @property + def real(self): + return self.data.real + + @real.setter + def real(self, new_real): + from .arithmetic.setreal import set_real + + self._data = set_real(self._data, new_real).data + + @property + def imag(self): + return self.data.imag + + @imag.setter + def imag(self, new_imag): + from .arithmetic.setimag import set_imag + + self._data = set_imag(self._data, new_imag).data + + def __array__(self, dtype=None): + return np.asarray(self.to_numpy(), dtype=dtype) + + def __array_function__(self, func, types, args, kwargs): + from .. import tensor as module + + for submodule in func.__module__.split(".")[1:]: + try: + module = getattr(module, submodule) + except AttributeError: + return NotImplemented + if not hasattr(module, func.__name__): + return NotImplemented + mars_func = getattr(module, func.__name__) + if mars_func is func: + # avoid Numpy func + return NotImplemented + return mars_func(*args, **kwargs) + + def view(self): + return self._view() + + @property + def ndim(self): + """ + Number of array dimensions. + + Examples + -------- + >>> import mars.tensor as mt + >>> x = mt.array([1, 2, 3]) + >>> x.ndim + 1 + >>> y = mt.zeros((2, 3, 4)) + >>> y.ndim + 3 + """ + return super().ndim + + def transpose(self, *axes): + """ + Returns a view of the tensor with axes transposed. + + For a 1-D tensor, this has no effect. (To change between column and + row vectors, first cast the 1-D tensor into a matrix object.) + For a 2-D tensor, this is the usual matrix transpose. 
+ For an n-D tensor, if axes are given, their order indicates how the + axes are permuted (see Examples). If axes are not provided and + ``a.shape = (i[0], i[1], ... i[n-2], i[n-1])``, then + ``a.transpose().shape = (i[n-1], i[n-2], ... i[1], i[0])``. + + Parameters + ---------- + axes : None, tuple of ints, or `n` ints + + * None or no argument: reverses the order of the axes. + + * tuple of ints: `i` in the `j`-th place in the tuple means `a`'s + `i`-th axis becomes `a.transpose()`'s `j`-th axis. + + * `n` ints: same as an n-tuple of the same ints (this form is + intended simply as a "convenience" alternative to the tuple form) + + Returns + ------- + out : Tensor + View of `a`, with axes suitably permuted. + + See Also + -------- + Tensor.T : Tensor property returning the tensor transposed. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4]]) + >>> a.execute() + array([[1, 2], + [3, 4]]) + >>> a.transpose().execute() + array([[1, 3], + [2, 4]]) + >>> a.transpose((1, 0)) + array([[1, 3], + [2, 4]]) + >>> a.transpose(1, 0).execute() + array([[1, 3], + [2, 4]]) + """ + return self._data.transpose(*axes) + + @property + def T(self): + """ + Same as self.transpose(), except that self is returned if + self.ndim < 2. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([[1.,2.],[3.,4.]]) + >>> x.execute() + array([[ 1., 2.], + [ 3., 4.]]) + >>> x.T.execute() + array([[ 1., 3.], + [ 2., 4.]]) + >>> x = mt.array([1.,2.,3.,4.]) + >>> x.execute() + array([ 1., 2., 3., 4.]) + >>> x.T.execute() + array([ 1., 2., 3., 4.]) + """ + return self._data.T + + def totiledb(self, uri, ctx=None, key=None, timestamp=None): + return self._data.totiledb(uri, ctx=ctx, key=key, timestamp=timestamp) + + def copy(self, order="C"): + return super().copy().astype(self.dtype, order=order, copy=False) + + def sort(self, axis=-1, kind=None, parallel_kind=None, psrs_kinds=None, order=None): + """ + Sort a tensor, in-place. + + Parameters + ---------- + axis : int, optional + Axis along which to sort. Default is -1, which means sort along the + last axis. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Sorting algorithm. Default is 'quicksort'. + parallel_kind: {'PSRS'}, optional + Parallel sorting algorithm, for the details, refer to: + http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html + psrs_kinds: list with 3 elements, optional + Sorting algorithms during PSRS algorithm. + order : str or list of str, optional + When `a` is a tensor with fields defined, this argument specifies + which fields to compare first, second, etc. A single field can + be specified as a string, and not all fields need be specified, + but unspecified fields will still be used, in the order in which + they come up in the dtype, to break ties. + + See Also + -------- + numpy.sort : Return a sorted copy of a tensor. + argsort : Indirect sort. + lexsort : Indirect stable sort on multiple keys. + searchsorted : Find elements in sorted tensor. + partition: Partial sort. + + Notes + ----- + See ``sort`` for notes on the different sorting algorithms. 
+
+        Examples
+        --------
+        >>> import mars.tensor as mt
+        >>> a = mt.array([[1,4], [3,1]])
+        >>> a.sort(axis=1)
+        >>> a.execute()
+        array([[1, 4],
+               [1, 3]])
+        >>> a.sort(axis=0)
+        >>> a.execute()
+        array([[1, 3],
+               [1, 4]])
+
+        Use the `order` keyword to specify a field to use when sorting a
+        structured tensor:
+
+        >>> a = mt.array([('a', 2), ('c', 1)], dtype=[('x', 'S1'), ('y', int)])
+        >>> a.sort(order='y')
+        >>> a.execute()
+        array([('c', 1), ('a', 2)],
+              dtype=[('x', '|S1'), ('y', '<i8')])
+        """
+        from .base import sort
+
+        self._data = sort(
+            self,
+            axis=axis,
+            kind=kind,
+            parallel_kind=parallel_kind,
+            psrs_kinds=psrs_kinds,
+            order=order,
+        ).data
+
+    def partition(self, kth, axis=-1, kind="introselect", order=None, **kw):
+        """
+        Partially sort a tensor, in-place.
+
+        Rearranges the elements in the tensor in such a way that the value of
+        the element in `kth` position is in the position it would be in a
+        sorted tensor. All elements smaller than the `kth` element are moved
+        before this element and all equal or greater are moved behind it.
+        The ordering of the elements in the two partitions is undefined.
+
+        Parameters
+        ----------
+        kth : int or sequence of ints
+            Element index to partition by. The `kth` element value will be in
+            its final sorted position and all smaller elements will be moved
+            before it and all equal or greater elements behind it. If provided
+            with a sequence of `kth`, it will partition all elements indexed
+            by `kth` of them into their sorted position at once.
+        axis : int, optional
+            Axis along which to sort. Default is -1, which means sort along the
+            last axis.
+        kind : {'introselect'}, optional
+            Selection algorithm. Default is 'introselect'.
+        order : str or list of str, optional
+            When `a` is a tensor with fields defined, this argument specifies
+            which fields to compare first, second, etc. A single field can
+            be specified as a string, and not all fields need be specified,
+            but unspecified fields will still be used, in the order in which
+            they come up in the dtype, to break ties.
+
+        See Also
+        --------
+        numpy.partition : Return a partitioned copy of an array.
+        argpartition : Indirect partition.
+        sort : Full sort.
+
+        Notes
+        -----
+        See ``np.partition`` for notes on the different algorithms.
+
+        Examples
+        --------
+        >>> import mars.tensor as mt
+        >>> a = mt.array([3, 4, 2, 1])
+        >>> a.partition(3)
+        >>> a.execute()
+        array([2, 1, 3, 4])
+
+        >>> a.partition((1, 3))
+        >>> a.execute()
+        array([1, 2, 3, 4])
+        """
+        from .base import partition
+
+        self._data = partition(self, kth, axis=axis, kind=kind, order=order, **kw).data
+
+    @property
+    def flat(self):
+        """
+        Flat iterator object to iterate over arrays.
+
+        A `flatiter` iterator is returned by ``x.flat`` for any tensor `x`.
+        It allows iterating over the tensor as if it were a 1-D array,
+        either in a for-loop or by calling its `next` method.
+
+        Iteration is done in row-major, C-style order (the last
+        index varying the fastest). The iterator can also be indexed using
+        basic slicing or advanced indexing.
+
+        See Also
+        --------
+        Tensor.flat : Return a flat iterator over a tensor.
+        Tensor.flatten : Returns a flattened copy of a tensor.
+
+        Examples
+        --------
+        >>> import mars.tensor as mt
+
+        >>> x = mt.arange(6).reshape(2, 3)
+        >>> fl = x.flat
+
+        >>> fl[2:4].execute()
+        array([2, 3])
+        """
+        return self._data.flat
+
+    def from_dataframe(self, in_df):
+        return self._data.from_dataframe(in_df)
+
+    def to_dataframe(self, *args, **kwargs):
+        return self._data.to_dataframe(*args, **kwargs)
+
+    def to_numpy(self, session=None, **kw):
+        return self._data.to_numpy(session, **kw)
+
+
+SparseTensor = Tensor
+
+
+class flatiter(object):
+    def __init__(self, tensor):
+        # flatten creates a copy
+        self._flatten_tensor = tensor.flatten()
+        # ravel creates a view
+        self._ravel_tensor = tensor.ravel()
+
+    def __getitem__(self, item):
+        # a.flat[item] creates a copy
+        return self._flatten_tensor[item]
+
+    def __setitem__(self, key, value):
+        # a.flat[item] = value will apply changes to the original tensor
+        self._ravel_tensor[key] = value
+
+
+class Indexes(Serializable):
+    indexes = AnyField("indexes")
+
+
+TENSOR_TYPE = (Tensor, TensorData)
+TENSOR_CHUNK_TYPE = (TensorChunk, TensorChunkData)
+
+register_output_types(OutputType.tensor, TENSOR_TYPE, TENSOR_CHUNK_TYPE)
+register_output_types(OutputType.scalar, TENSOR_TYPE, TENSOR_CHUNK_TYPE)
diff --git a/python/xorbits/_mars/tensor/datasource/__init__.py b/python/xorbits/_mars/tensor/datasource/__init__.py
new file mode 100644
index 000000000..74688b13c
--- /dev/null
+++ b/python/xorbits/_mars/tensor/datasource/__init__.py
@@ -0,0 +1,45 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
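+
+# Illustrative sketch (not part of the upstream module): the constructors
+# re-exported below all build tensors lazily; calling ``.execute()``
+# materializes the result, e.g.
+#
+#     >>> import mars.tensor as mt
+#     >>> mt.arange(3).execute()
+#     array([0, 1, 2])
+#     >>> mt.eye(2, dtype=int).execute()
+#     array([[1, 0],
+#            [0, 1]])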
+ + +from .arange import TensorArange, arange +from .array import ( + ArrayDataSource, + CSRMatrixDataSource, + array, + asarray, + ascontiguousarray, + asfortranarray, + tensor, +) +from .diag import TensorDiag, diag +from .diagflat import diagflat +from .empty import TensorEmpty, TensorEmptyLike, empty, empty_like +from .eye import TensorEye, eye +from .from_dataframe import TensorFromDataFrame, from_dataframe, from_series +from .from_dense import DenseToSparse, fromdense +from .from_hdf5 import TensorHDF5DataSource, fromhdf5 +from .from_sparse import SparseToDense, fromsparse +from .from_tiledb import TensorTileDBDataSource, fromtiledb +from .from_vineyard import TensorFromVineyard, TensorFromVineyardChunk, fromvineyard +from .from_zarr import TensorFromZarr, fromzarr +from .full import TensorFull, TensorFullLike, full, full_like +from .identity import identity +from .indices import TensorIndices, indices +from .linspace import TensorLinspace, linspace +from .meshgrid import meshgrid +from .ones import TensorOnes, TensorOnesLike, ones, ones_like +from .scalar import Scalar, scalar +from .tri import TensorTril, TensorTriu, tril, triu +from .zeros import TensorZeros, TensorZerosLike, zeros, zeros_like diff --git a/python/xorbits/_mars/tensor/datasource/arange.py b/python/xorbits/_mars/tensor/datasource/arange.py new file mode 100644 index 000000000..62975ebc8 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/arange.py @@ -0,0 +1,219 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...config import options +from ...serialization.serializables import AnyField +from ..array_utils import create_array +from ..utils import decide_chunk_sizes +from .core import TensorNoInput + + +class TensorArange(TensorNoInput): + _op_type_ = OperandDef.TENSOR_ARANGE + + _start = AnyField("start") + _stop = AnyField("stop") + _step = AnyField("step") + + def __init__(self, start=None, stop=None, step=None, dtype=None, **kw): + if dtype is not None: + dtype = np.dtype(dtype) + elif stop is not None and step is not None: + dtype = ( + np.dtype(dtype) + if dtype is not None + else np.arange(0, type(stop)(1), step).dtype + ) + super().__init__(_start=start, _stop=stop, _step=step, dtype=dtype, **kw) + + @property + def start(self): + return self._start + + @property + def stop(self): + return self._stop + + @property + def step(self): + return self._step + + def to_chunk_op(self, *args): + op = self.copy().reset_key() + start, stop, step = args + op._start = start + op._stop = stop + op._step = step + return op + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + + chunk_length = tensor.extra_params.raw_chunk_size or options.chunk_size + chunk_length = decide_chunk_sizes( + tensor.shape, chunk_length, tensor.dtype.itemsize + ) + + start, stop, step = op.start, op.stop, op.step # noqa: F841 + + out_chunks = [] + n_elem = 0 + for i, cs in enumerate(chunk_length[0]): + chunk_start = start + n_elem * step + chunk_stop = start + (n_elem + cs) * step + chunk_size = max(int(np.ceil((chunk_stop - chunk_start) / step)), 0) + if chunk_size > cs: + chunk_stop -= step + chunk_shape = (cs,) + chunk_idx = (i,) + chunk_op = op.to_chunk_op(chunk_start, chunk_stop, step) + out_chunk = chunk_op.new_chunk(None, shape=chunk_shape, index=chunk_idx) + n_elem += cs + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=chunk_length, + ) + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = create_array(op)( + "arange", op.start, op.stop, op.step, dtype=op.dtype + ) + + +def arange(*args, **kwargs): + """ + Return evenly spaced values within a given interval. + + Values are generated within the half-open interval ``[start, stop)`` + (in other words, the interval including `start` but excluding `stop`). + For integer arguments the function is equivalent to the Python built-in + `range `_ function, + but returns a tensor rather than a list. + + When using a non-integer step, such as 0.1, the results will often not + be consistent. It is better to use ``linspace`` for these cases. + + Parameters + ---------- + start : number, optional + Start of interval. The interval includes this value. The default + start value is 0. + stop : number + End of interval. The interval does not include this value, except + in some cases where `step` is not an integer and floating point + round-off affects the length of `out`. + step : number, optional + Spacing between values. For any output `out`, this is the distance + between two adjacent values, ``out[i+1] - out[i]``. The default + step size is 1. If `step` is specified as a position argument, + `start` must also be given. + dtype : dtype + The type of the output tensor. If `dtype` is not given, infer the data + type from the other input arguments. + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + + Returns + ------- + arange : Tensor + Tensor of evenly spaced values. 
+ + For floating point arguments, the length of the result is + ``ceil((stop - start)/step)``. Because of floating point overflow, + this rule may result in the last element of `out` being greater + than `stop`. + + See Also + -------- + linspace : Evenly spaced numbers with careful handling of endpoints. + ogrid: Tensors of evenly spaced numbers in N-dimensions. + mgrid: Grid-shaped tensors of evenly spaced numbers in N-dimensions. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.arange(3).execute() + array([0, 1, 2]) + >>> mt.arange(3.0).execute() + array([ 0., 1., 2.]) + >>> mt.arange(3,7).execute() + array([3, 4, 5, 6]) + >>> mt.arange(3,7,2).execute() + array([3, 5]) + """ + kw_args = [kwargs.get("start"), kwargs.get("stop"), kwargs.get("step")] + kw_def = any(arg is not None for arg in kw_args) + dtype = None + if not kw_def: + if len(args) == 1: + start = 0 + stop = args[0] + step = 1 + elif len(args) == 2: + start = args[0] + stop = args[1] + step = 1 + elif len(args) == 3: + start, stop, step = args + elif len(args) == 4: + start, stop, step, dtype = args + dtype = np.dtype(dtype) + else: + raise TypeError("Required argument 'start' (pos 1) not found") + else: + names = "start", "stop", "step" + for i, arg in enumerate(args): + if kw_args[i] is not None: + raise TypeError( + f"Argument given by name ('{names[i]}') and position ({i})" + ) + kw_args[i] = arg + start, stop, step = kw_args + + if dtype is None: + if "dtype" in kwargs: + dtype = np.dtype(kwargs["dtype"]) + else: + dtype = np.arange(0, type(stop)(1), step).dtype + + start, stop = dtype.type(start), dtype.type(stop) + if dtype == np.datetime64 and not start: + raise ValueError( + "arange requires both a start and a stop for Mars datetime64 ranges" + ) + if dtype == np.datetime64: + span = np.array([stop - start]) + span[0] = step + step = span[0] + dtype = np.dtype(stop.dtype) + else: + step = dtype.type(step) + size = max(int(np.ceil(np.true_divide(stop - start, step))), 0) + + op = TensorArange(start, stop, step, dtype=dtype, gpu=kwargs.get("gpu", False)) + shape = (size,) + return op(shape, chunk_size=kwargs.pop("chunk_size", None)) diff --git a/python/xorbits/_mars/tensor/datasource/array.py b/python/xorbits/_mars/tensor/datasource/array.py new file mode 100644 index 000000000..0342bba82 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/array.py @@ -0,0 +1,437 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import cp, cps, get_array_module, issparse, sps +from ...serialization.serializables import ( + AnyField, + FieldTypes, + NDArrayField, + TupleField, +) +from ...utils import on_deserialize_shape, on_serialize_shape +from ..array_utils import array_module, is_array, is_cupy +from ..core import TENSOR_TYPE, Tensor, TensorData, TensorOrder +from ..utils import get_chunk_slices +from .core import TensorNoInput +from .scalar import scalar + + +class ArrayDataSource(TensorNoInput): + """ + Represents data from numpy or cupy array + """ + + _op_type_ = OperandDef.TENSOR_DATA_SOURCE + + data = NDArrayField("data") + chunk_size = AnyField("chunk_size") + + def __init__(self, data=None, dtype=None, gpu=None, **kw): + if dtype is not None: + dtype = np.dtype(dtype) + elif data is not None: + dtype = np.dtype(data.dtype) + + if gpu is None and is_cupy(data): # pragma: no cover + gpu = True + + super().__init__(data=data, dtype=dtype, gpu=gpu, **kw) + + def to_chunk_op(self, *args): + _, idx, chunk_size = args + chunk_op = self.copy().reset_key() + chunk_op.data = self.data[get_chunk_slices(chunk_size, idx)].astype( + chunk_op.dtype, order=self.outputs[0].order.value, copy=False + ) + chunk_op.chunk_size = None + + return chunk_op + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = array_module(op.gpu).asarray(op.data) + + +class CSRMatrixDataSource(TensorNoInput): + """ + Represents data from sparse array include scipy sparse or cupy sparse matrix. + """ + + _op_type_ = OperandDef.SPARSE_MATRIX_DATA_SOURCE + + indices = NDArrayField("indices") + indptr = NDArrayField("indptr") + data = NDArrayField("data") + shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + chunk_size = AnyField("chunk_size") + + def __init__(self, data=None, **kw): + kw["sparse"] = True + if is_cupy(data): # pragma: no cover + kw["gpu"] = True + super().__init__(data=data, **kw) + + def to_chunk_op(self, *args): + _, idx, chunk_size = args + + xps = cps if self.gpu else sps + if len(self.shape) == 1: + shape = (1, self.shape[0]) + else: + shape = self.shape + data = xps.csr_matrix((self.data, self.indices, self.indptr), shape) + chunk_data = data[get_chunk_slices(chunk_size, idx)] + + chunk_op = self.copy().reset_key() + chunk_op.data = chunk_data.data + chunk_op.indices = chunk_data.indices + chunk_op.indptr = chunk_data.indptr + chunk_shape = chunk_data.shape[1:] if len(self.shape) == 1 else chunk_data.shape + chunk_op.shape = chunk_shape + + return chunk_op + + @classmethod + def execute(cls, ctx, op: "CSRMatrixDataSource"): + xps = cps if op.gpu else sps + chunk_shape = (1, op.shape[0]) if op.outputs[0].ndim == 1 else op.shape + ctx[op.outputs[0].key] = SparseNDArray( + xps.csr_matrix((op.data, op.indices, op.indptr), shape=chunk_shape), + shape=op.shape, + ) + + +def _from_spmatrix(spmatrix, dtype=None, chunk_size=None, gpu=None): + if gpu is None: + m = get_array_module(spmatrix) + if cp is not None and m is cp: + gpu = True + elif cp is np: + gpu = False + if dtype and spmatrix.dtype != dtype: + spmatrix = spmatrix.astype(dtype) + spmatrix = spmatrix.tocsr() + op = CSRMatrixDataSource( + indices=spmatrix.indices, + indptr=spmatrix.indptr, + data=spmatrix.data, + shape=spmatrix.shape, + dtype=spmatrix.dtype, + gpu=gpu, + chunk_size=chunk_size, + ) + return op(spmatrix.shape, chunk_size=chunk_size) + + +def tensor( + data=None, 
dtype=None, order="K", chunk_size=None, gpu=None, sparse=False +) -> Tensor: + order = order or "K" + if isinstance(data, TENSOR_TYPE): + if isinstance(data, TensorData): + data = Tensor(data) + return data.astype(dtype or data.dtype, order=order, copy=False) + elif ( + isinstance(data, (tuple, list)) + and len(data) > 0 + and all(isinstance(d, TENSOR_TYPE) for d in data) + ): + from ..merge import stack + + data = stack(data) + return data.astype(dtype or data.dtype, order=order, copy=False) + elif np.isscalar(data): + return scalar(data, dtype=dtype) + elif issparse(data): + return _from_spmatrix(data, dtype=dtype, chunk_size=chunk_size, gpu=gpu) + elif hasattr(data, "__mars_tensor__"): + return data.__mars_tensor__(dtype=dtype, order=order) + else: + m = get_array_module(data) + try: + data = m.asarray(data, dtype=dtype, order=order) + except ValueError: + arr = data.__array__(dtype=dtype) + if isinstance(arr, TENSOR_TYPE): + return arr.astype(arr.dtype, order=order, copy=False) + raise + if gpu is None: + if cp is not None and m is cp: + gpu = True + + if is_array(data): + if data.ndim == 0: + return scalar(data.item(), dtype=dtype) + tensor_order = ( + TensorOrder.C_ORDER if data.flags["C_CONTIGUOUS"] else TensorOrder.F_ORDER + ) + op = ArrayDataSource(data, dtype=dtype, gpu=gpu, chunk_size=chunk_size) + t = op(data.shape, chunk_size=chunk_size, order=tensor_order) + if sparse and not t.issparse(): + return t.tosparse() + return t + else: + raise ValueError(f"Cannot create tensor by given data: {data}") + + +def array(x, dtype=None, copy=True, order="K", ndmin=None, chunk_size=None): + """ + Create a tensor. + + Parameters + ---------- + object : array_like + An array, any object exposing the array interface, an object whose + __array__ method returns an array, or any (nested) sequence. + dtype : data-type, optional + The desired data-type for the array. If not given, then the type will + be determined as the minimum type required to hold the objects in the + sequence. This argument can only be used to 'upcast' the array. For + downcasting, use the .astype(t) method. + copy : bool, optional + If true (default), then the object is copied. Otherwise, a copy will + only be made if __array__ returns a copy, if obj is a nested sequence, + or if a copy is needed to satisfy any of the other requirements + (`dtype`, `order`, etc.). + order : {'K', 'A', 'C', 'F'}, optional + Specify the memory layout of the array. If object is not an array, the + newly created array will be in C order (row major) unless 'F' is + specified, in which case it will be in Fortran order (column major). + If object is an array the following holds. + + ===== ========= =================================================== + order no copy copy=True + ===== ========= =================================================== + 'K' unchanged F & C order preserved, otherwise most similar order + 'A' unchanged F order if input is F and not C, otherwise C order + 'C' C order C order + 'F' F order F order + ===== ========= =================================================== + + When ``copy=False`` and a copy is made for other reasons, the result is + the same as if ``copy=True``, with some exceptions for `A`, see the + Notes section. The default order is 'K'. + ndmin : int, optional + Specifies the minimum number of dimensions that the resulting + array should have. Ones will be prepended to the shape as + needed to meet this requirement. + chunk_size: int, tuple, optional + Specifies chunk size for each dimension. 
+ + Returns + ------- + out : Tensor + An tensor object satisfying the specified requirements. + + See Also + -------- + empty, empty_like, zeros, zeros_like, ones, ones_like, full, full_like + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.array([1, 2, 3]).execute() + array([1, 2, 3]) + + Upcasting: + + >>> mt.array([1, 2, 3.0]).execute() + array([ 1., 2., 3.]) + + More than one dimension: + + >>> mt.array([[1, 2], [3, 4]]).execute() + array([[1, 2], + [3, 4]]) + + Minimum dimensions 2: + + >>> mt.array([1, 2, 3], ndmin=2).execute() + array([[1, 2, 3]]) + + Type provided: + + >>> mt.array([1, 2, 3], dtype=complex).execute() + array([ 1.+0.j, 2.+0.j, 3.+0.j]) + + """ + raw_x = x + order = order or "K" + x = tensor(x, dtype=dtype, order=order, chunk_size=chunk_size) + while ndmin is not None and x.ndim < ndmin: + x = x[np.newaxis] + + if copy and x is raw_x: + x = x.copy(order=order) + elif ( + not copy + and isinstance(raw_x, TENSOR_TYPE) + and raw_x.dtype == x.dtype + and raw_x.order == x.order + and raw_x.shape == x.shape + and raw_x is not x + and hasattr(raw_x, "data") + ): + raw_x.data = x.data + + return x + + +def asarray(x, dtype=None, order=None, chunk_size=None): + """Convert the input to an array. + + Parameters + ---------- + a : array_like + Input data, in any form that can be converted to a tensor. This + includes lists, lists of tuples, tuples, tuples of tuples, tuples + of lists and tensors. + dtype : data-type, optional + By default, the data-type is inferred from the input data. + order : {'C', 'F'}, optional + Whether to use row-major (C-style) or + column-major (Fortran-style) memory representation. + chunk_size: int, tuple, optional + Specifies chunk size for each dimension. + + Returns + ------- + out : Tensor + Tensor interpretation of `a`. No copy is performed if the input + is already an ndarray with matching dtype and order. If `a` is a + subclass of ndarray, a base class ndarray is returned. + + See Also + -------- + ascontiguousarray : Convert input to a contiguous tensor. + asfortranarray : Convert input to a tensor with column-major + memory order. + + Examples + -------- + Convert a list into a tensor: + + >>> import mars.tensor as mt + + >>> a = [1, 2] + >>> mt.asarray(a).execute() + array([1, 2]) + + Existing arrays are not copied: + + >>> a = mt.array([1, 2]) + >>> mt.asarray(a) is a + True + + If `dtype` is set, array is copied only if dtype does not match: + + >>> a = mt.array([1, 2], dtype=mt.float32) + >>> mt.asarray(a, dtype=mt.float32) is a + True + >>> mt.asarray(a, dtype=mt.float64) is a + False + """ + return array(x, dtype=dtype, copy=False, order=order, chunk_size=chunk_size) + + +def ascontiguousarray(a, dtype=None, chunk_size=None): + """ + Return a contiguous tensor (ndim >= 1) in memory (C order). + + Parameters + ---------- + a : array_like + Input tensor. + dtype : str or dtype object, optional + Data-type of returned tensor. + chunk_size: int, tuple, optional + Specifies chunk size for each dimension. + + Returns + ------- + out : Tensor + Contiguous tensor of same shape and content as `a`, with type `dtype` + if specified. + + See Also + -------- + asfortranarray : Convert input to a tensor with column-major + memory order. + Tensor.flags : Information about the memory layout of the tensor. 
+ + Examples + -------- + >>> import mars.tensor as mt + >>> x = mt.arange(6).reshape(2,3) + >>> mt.ascontiguousarray(x, dtype=mt.float32) + array([[ 0., 1., 2.], + [ 3., 4., 5.]], dtype=float32) + >>> x.flags['C_CONTIGUOUS'] + True + + Note: This function returns a tensor with at least one-dimension (1-d) + so it will not preserve 0-d tensors. + + """ + + return array(a, dtype, copy=False, order="C", ndmin=1, chunk_size=chunk_size) + + +def asfortranarray(a, dtype=None, chunk_size=None): + """ + Return a tensor (ndim >= 1) laid out in Fortran order in memory. + + Parameters + ---------- + a : array_like + Input tensor. + dtype : str or dtype object, optional + By default, the data-type is inferred from the input data. + chunk_size: int, tuple, optional + Specifies chunk size for each dimension. + + Returns + ------- + out : Tensor + The input `a` in Fortran, or column-major, order. + + See Also + -------- + ascontiguousarray : Convert input to a contiguous (C order) tensor. + + Examples + -------- + >>> import mars.tensor as mt + >>> x = mt.arange(6).reshape(2,3) + >>> y = mt.asfortranarray(x) + >>> x.flags['F_CONTIGUOUS'] + False + >>> y.flags['F_CONTIGUOUS'] + True + + Note: This function returns a tensor with at least one-dimension (1-d) + so it will not preserve 0-d tensors. + + """ + return array(a, dtype, copy=False, order="F", ndmin=1, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/datasource/core.py b/python/xorbits/_mars/tensor/datasource/core.py new file mode 100644 index 000000000..00f0dab1d --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/core.py @@ -0,0 +1,205 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import itertools + +import numpy as np + +from ...config import options +from ...serialization.serializables import FieldTypes, StringField, TupleField +from ..core import TensorOrder +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import decide_chunk_sizes, normalize_shape + + +class TensorDataSource(TensorOperand, TensorOperandMixin): + """ + Tensor data source base class, provide universal tile logic, + subclass can overwrite tile method. 
+ """ + + __slots__ = () + + def to_chunk_op(self, *args): + chunk_shape = args[0] + chunk_op = self.copy().reset_key() + chunk_op.extra_params = {"size": chunk_shape} # to make op key different + return chunk_op + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + chunk_size = decide_chunk_sizes(tensor.shape, chunk_size, tensor.dtype.itemsize) + chunk_size_idxes = (range(len(size)) for size in chunk_size) + + out_chunks = [] + for chunk_shape, chunk_idx in zip( + itertools.product(*chunk_size), itertools.product(*chunk_size_idxes) + ): + chunk_op = op.to_chunk_op(chunk_shape, chunk_idx, chunk_size) + out_chunk = chunk_op.new_chunk( + None, shape=chunk_shape, index=chunk_idx, order=tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + chunks=out_chunks, + nsplits=chunk_size, + order=tensor.order, + **tensor.extra_params + ) + + +class TensorNoInput(TensorDataSource): + """ + Tensor operand with no inputs. + """ + + def check_inputs(self, inputs): + # no inputs + if inputs and len(inputs) > 0: + raise ValueError("Tensor data source has no inputs") + + def _new_chunks(self, inputs, kws=None, **kw): + shape = kw.get("shape", None) + self.extra_params[ + "shape" + ] = shape # set shape to make the operand key different + return super()._new_chunks(inputs, kws=kws, **kw) + + def _new_tileables(self, inputs, kws=None, **kw): + shape = kw.get("shape", None) + self.extra_params[ + "shape" + ] = shape # set shape to make the operand key different + return super()._new_tileables(inputs, kws=kws, **kw) + + def __call__(self, shape, chunk_size=None, order=None): + shape = normalize_shape(shape) + order = TensorOrder.C_ORDER if order is None else order + return self.new_tensor(None, shape, raw_chunk_size=chunk_size, order=order) + + +class TensorHasInput(TensorDataSource): + """ + Tensor operand with a single input. 
+ """ + + @property + def input(self): + return self._input + + def check_inputs(self, inputs): + # no inputs + if len(inputs) != 1: + raise ValueError("Tensor can only have 1 input") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def tile(cls, op): + output = op.outputs[0] + + out_chunks = [] + for c in op.input.chunks: + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([c], shape=c.shape, index=c.index, order=output.order) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + output.shape, + order=output.order, + chunks=out_chunks, + nsplits=op.input.nsplits, + ) + + def __call__(self, a, order=None): + order = a.order if order is None else order + return self.new_tensor([a], a.shape, order=order) + + +class TensorLike(TensorHasInput): + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.dtype is None: + self.dtype = self.input.dtype + if self.gpu is None: + self.gpu = self.input.op.gpu + + # FIXME: remove when cupy supports other dtypes + if self.gpu and self.dtype not in (np.float32, np.float64): + raise NotImplementedError( + "Sparse tensor on GPU only supports float32 and float64" + ) + + +class TensorFromHDF5Like(TensorNoInput): + _filename = StringField("filename") + _group = StringField("group") + _dataset = StringField("dataset") + _axis_offsets = TupleField("axis_offsets", FieldTypes.int64) + + def __init__(self, filename=None, group=None, dataset=None, **kw): + super().__init__(_filename=filename, _group=group, _dataset=dataset, **kw) + + @property + def filename(self): + return self._filename + + @property + def group(self): + return self._group + + @property + def dataset(self): + return self._dataset + + @property + def axis_offsets(self): + return self._axis_offsets + + @property + def path(self): + return self.get_path(self.group, self.dataset) + + def to_chunk_op(self, *args): + _, chunk_index, nsplits = args + chunk_op = super().to_chunk_op(*args) + cum_offsets = [[0] + np.cumsum(ns).tolist() for ns in nsplits] + axis_offsets = [] + for axis, idx in enumerate(chunk_index): + axis_offsets.append(cum_offsets[axis][idx]) + chunk_op._axis_offsets = tuple(axis_offsets) + return chunk_op + + @staticmethod + def get_path(group, dataset): + paths = [] + if group: + paths.append(group) + paths.append(dataset) + return "/".join(paths) diff --git a/python/xorbits/_mars/tensor/datasource/diag.py b/python/xorbits/_mars/tensor/datasource/diag.py new file mode 100644 index 000000000..8362e7896 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/diag.py @@ -0,0 +1,297 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...lib import sparse +from ...lib.sparse import diag as sparse_diag +from ...lib.sparse.core import get_array_module, get_sparse_module, issparse +from ...serialization.serializables import Int32Field, KeyField +from ...utils import has_unknown_shape +from ..array_utils import create_array +from ..core import TENSOR_TYPE, TensorOrder +from .array import tensor +from .core import TensorHasInput +from .zeros import TensorZeros + + +def _get_diag_shape(v_shape, k): + size_0, size_1 = 0, 0 + if k > 0: + size_1 += k + elif k < 0: + size_0 -= k + size = min(v_shape[0] - size_0, v_shape[1] - size_1) + return (size,) + + +class TensorDiagBase: + __slots__ = () + + def to_chunk_op(self, *args): + op = self.copy().reset_key() + (k,) = args + op._k = k + return op + + @classmethod + def _get_nsplits(cls, op): + raise NotImplementedError + + @classmethod + def _get_chunk(cls, op, chunk_k, chunk_shape, chunk_idx): + raise NotImplementedError + + @classmethod + def tile(cls, op): + if op.inputs: + if has_unknown_shape(*op.inputs): + yield + tensor = op.outputs[0] + + # op can be TensorDiag or TensorEye + k = op.k + nsplits = op._get_nsplits(op) + + fx = lambda x, y: x - y + k + cum_size = [np.cumsum(s).tolist() for s in nsplits] + out_chunks = [] + for out_idx in itertools.product(*[range(len(s)) for s in nsplits]): + i, j = out_idx + ld_pos = cum_size[0][i] - 1, cum_size[1][j] - nsplits[1][j] + ru_pos = cum_size[0][i] - nsplits[0][i], cum_size[1][j] - 1 + + ld_fx = fx(*ld_pos) + ru_fx = fx(*ru_pos) + + chunk_shape = (nsplits[0][i], nsplits[1][j]) + if (ld_fx > 0 and ru_fx > 0) or (ld_fx < 0 and ru_fx < 0): + # does not cross, fill with zeros + chunk_op = TensorZeros( + dtype=op.dtype, + gpu=op.gpu, + sparse=op.sparse, + shape=chunk_shape, + order=tensor.order.value, + ) + chunk = chunk_op.new_chunk(None, shape=chunk_shape, index=out_idx) + else: + lu_pos = ru_pos[0], ld_pos[1] + chunk_k = fx(*lu_pos) + chunk = op._get_chunk(op, chunk_k, chunk_shape, out_idx) + + out_chunks.append(chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, tensor.shape, chunks=out_chunks, nsplits=nsplits + ) + + +class TensorDiag(TensorDiagBase, TensorHasInput): + _op_type_ = OperandDef.TENSOR_DIAG + + _input = KeyField("input") + _k = Int32Field("k") + + def __init__(self, k=None, **kw): + super().__init__(_k=k, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.dtype is None: + self._dtype = self.input.dtype + + def to_chunk_op(self, *args): + return TensorDiagBase.to_chunk_op(self, *args) + + @classmethod + def _get_nsplits(cls, op): + assert op.input.ndim == 1 + k = op.k + nsplits_1d = op.input.nsplits[0] + nsplit_0, nsplit_1 = list(nsplits_1d), list(nsplits_1d) + if k > 0: + nsplit_0.append(k) + nsplit_1.insert(0, k) + elif k < 0: + nsplit_0.insert(0, abs(k)) + nsplit_1.append(abs(k)) + return nsplit_0, nsplit_1 + + @classmethod + def _get_chunk(cls, op, chunk_k, chunk_shape, chunk_idx): + assert chunk_shape[0] == chunk_shape[1] + input_idx = chunk_idx[1] if op.k < 0 else chunk_idx[0] + input_chunk = op.inputs[0].cix[input_idx,] + op = TensorDiag(k=chunk_k, dtype=op.dtype, gpu=op.gpu, sparse=op.sparse) + return op.new_chunk([input_chunk], shape=chunk_shape, index=chunk_idx) + + def __call__(self, v, shape, chunk_size=None): + return self.new_tensor( + [v], shape, raw_chunk_size=chunk_size, order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + + v = op.input + k = op.k + idx = itertools.count(0) + if 
v.ndim == 2: + if has_unknown_shape(*op.inputs): + yield + chunks = [] + nsplit = [] + + fx = lambda x, y: x - y + k + in_nsplits = v.nsplits + cum_size = [np.cumsum(s).tolist() for s in in_nsplits] + for c in v.chunks: + i, j = c.index + ld_pos = cum_size[0][i] - 1, cum_size[1][j] - in_nsplits[1][j] + ru_pos = cum_size[0][i] - in_nsplits[0][i], cum_size[1][j] - 1 + + ld_fx = fx(*ld_pos) + ru_fx = fx(*ru_pos) + + if (ld_fx > 0 and ru_fx > 0) or (ld_fx < 0 and ru_fx < 0): + continue + + lu_pos = ru_pos[0], ld_pos[1] + chunk_k = fx(*lu_pos) + + chunk_shape = _get_diag_shape(c.shape, chunk_k) + chunk_idx = (next(idx),) + chunk_op = op.to_chunk_op(chunk_k) + chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=chunk_idx, order=tensor.order + ) + nsplit.append(chunk_shape[0]) + chunks.append(chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + op.outputs[0].shape, + order=tensor.order, + chunks=chunks, + nsplits=(tuple(nsplit),), + ) + else: + return (yield from super().tile(op)) + + @property + def k(self): + return getattr(self, "_k", 0) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if op.sparse: + ctx[chunk.key] = sparse.diag(ctx[op.inputs[0].key], k=op.k, gpu=op.gpu) + else: + ctx[chunk.key] = create_array(op)("diag", ctx[op.inputs[0].key], k=op.k) + + +def diag(v, k=0, sparse=None, gpu=None, chunk_size=None): + """ + Extract a diagonal or construct a diagonal tensor. + + See the more detailed documentation for ``mt.diagonal`` if you use this + function to extract a diagonal and wish to write to the resulting tensor + + Parameters + ---------- + v : array_like + If `v` is a 2-D tensor, return its `k`-th diagonal. + If `v` is a 1-D tensor, return a 2-D tensor with `v` on the `k`-th + diagonal. + k : int, optional + Diagonal in question. The default is 0. Use `k>0` for diagonals + above the main diagonal, and `k<0` for diagonals below the main + diagonal. + sparse: bool, optional + Create sparse tensor if True, False as default + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + out : Tensor + The extracted diagonal or constructed diagonal tensor. + + See Also + -------- + diagonal : Return specified diagonals. + diagflat : Create a 2-D array with the flattened input as a diagonal. + trace : Sum along diagonals. + triu : Upper triangle of a tensor. + tril : Lower triangle of a tensor. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(9).reshape((3,3)) + >>> x.execute() + array([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + + >>> mt.diag(x).execute() + array([0, 4, 8]) + >>> mt.diag(x, k=1).execute() + array([1, 5]) + >>> mt.diag(x, k=-1).execute() + array([3, 7]) + + >>> mt.diag(mt.diag(x)).execute() + array([[0, 0, 0], + [0, 4, 0], + [0, 0, 8]]) + + """ + if not isinstance(v, TENSOR_TYPE): + tensor_v = tensor(v) + if tensor_v.issparse(): + xps = get_sparse_module(tensor_v.data) + v = xps.csr_matrix( + (tensor_v.op.data, tensor_v.op.indices, tensor_v.op.indptr), + tensor_v.shape, + ) + diag_v = sparse_diag(v, k=k) + else: + v = tensor(v).op.data + diag_v = get_array_module(v).diag(v, k=k) + sparse = sparse if sparse is not None else issparse(v) + return tensor(diag_v, gpu=gpu, sparse=sparse, chunk_size=chunk_size) + + sparse = sparse if sparse is not None else v.issparse() + + if v.ndim == 1: + shape = (v.size + abs(k),) * 2 + elif v.ndim == 2: + shape = _get_diag_shape(v.shape, k) + else: + raise ValueError("Input must be 1- or 2-d.") + + op = TensorDiag(k, dtype=v.dtype, gpu=gpu, sparse=sparse) + return op(v, shape) diff --git a/python/xorbits/_mars/tensor/datasource/diagflat.py b/python/xorbits/_mars/tensor/datasource/diagflat.py new file mode 100644 index 000000000..b66bab858 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/diagflat.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..core import Tensor +from .array import tensor as astensor +from .diag import diag + + +def diagflat(v, k=0, sparse=None, gpu=None, chunk_size=None): + """ + Create a two-dimensional tensor with the flattened input as a diagonal. + + Parameters + ---------- + v : array_like + Input data, which is flattened and set as the `k`-th + diagonal of the output. + k : int, optional + Diagonal to set; 0, the default, corresponds to the "main" diagonal, + a positive (negative) `k` giving the number of the diagonal above + (below) the main. + sparse: bool, optional + Create sparse tensor if True, False as default + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + out : Tensor + The 2-D output tensor. + + See Also + -------- + diag : MATLAB work-alike for 1-D and 2-D tensors. + diagonal : Return specified diagonals. + trace : Sum along diagonals. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.diagflat([[1,2], [3,4]]).execute() + array([[1, 0, 0, 0], + [0, 2, 0, 0], + [0, 0, 3, 0], + [0, 0, 0, 4]]) + + >>> mt.diagflat([1,2], 1).execute() + array([[0, 1, 0], + [0, 0, 2], + [0, 0, 0]]) + + """ + if not isinstance(v, Tensor): + v = astensor(v).op.data + return diag(v.flatten(), k=k, sparse=sparse, gpu=gpu, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/datasource/empty.py b/python/xorbits/_mars/tensor/datasource/empty.py new file mode 100644 index 000000000..fa0d3df04 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/empty.py @@ -0,0 +1,213 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import get_array_module, get_sparse_module, naked +from ...serialization.serializables import KeyField, StringField +from ..array_utils import create_array +from ..utils import get_order +from .array import tensor +from .core import TensorLike, TensorNoInput + + +class TensorEmptyBase(object): + __slots__ = () + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._gen_rand() + + def _gen_rand(self): + if getattr(self, "_rand", None) is None: + self._obj_set("_rand", np.random.random()) + + def to_chunk_op(self, *args): + op = self.copy().reset_key() + op._rand = None + op._gen_rand() + return op + + +class TensorEmpty(TensorEmptyBase, TensorNoInput): + __slots__ = ("_rand",) + _op_type_ = OperandDef.TENSOR_EMPTY + + _order = StringField("order") + + def __init__(self, dtype=None, order=None, **kw): + dtype = np.dtype(dtype or "f8") + super().__init__(dtype=dtype, _order=order, **kw) + + @property + def order(self): + return self._order + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + ctx[chunk.key] = create_array(op)( + "empty", chunk.shape, dtype=op.dtype, order=op.order + ) + + +def empty(shape, dtype=None, chunk_size=None, gpu=None, order="C"): + """ + Return a new tensor of given shape and type, without initializing entries. + + Parameters + ---------- + shape : int or tuple of int + Shape of the empty tensor + dtype : data-type, optional + Desired output data-type. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + order : {'C', 'F'}, optional, default: 'C' + Whether to store multi-dimensional data in row-major + (C-style) or column-major (Fortran-style) order in + memory. + + Returns + ------- + out : Tensor + Tensor of uninitialized (arbitrary) data of the given shape, dtype, and + order. Object arrays will be initialized to None. + + See Also + -------- + empty_like, zeros, ones + + Notes + ----- + `empty`, unlike `zeros`, does not set the array values to zero, + and may therefore be marginally faster. 
On the other hand, it requires + the user to manually set all the values in the array, and should be + used with caution. + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.empty([2, 2]).execute() + array([[ -9.74499359e+001, 6.69583040e-309], + [ 2.13182611e-314, 3.06959433e-309]]) #random + >>> mt.empty([2, 2], dtype=int).execute() + array([[-1073741821, -1067949133], + [ 496041986, 19249760]]) #random + """ + tensor_order = get_order( + order, + None, + available_options="CF", + err_msg="only 'C' or 'F' order is permitted", + ) + op = TensorEmpty(dtype=dtype, gpu=gpu, order=order) + return op(shape, chunk_size=chunk_size, order=tensor_order) + + +class TensorEmptyLike(TensorEmptyBase, TensorLike): + __slots__ = ("_rand",) + _op_type_ = OperandDef.TENSOR_EMPTY_LIKE + + _input = KeyField("input") + _order = StringField("order") + + def __init__(self, dtype=None, gpu=None, sparse=False, order=None, **kw): + dtype = np.dtype(dtype) if dtype is not None else None + super().__init__(dtype=dtype, gpu=gpu, _order=order, sparse=sparse, **kw) + + @property + def order(self): + return self._order + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if op.issparse(): + in_data = naked(ctx[op.inputs[0].key]) + xps = get_sparse_module(in_data) + xp = get_array_module(in_data) + ctx[chunk.key] = SparseNDArray( + xps.csr_matrix( + ( + xp.empty_like(in_data.data, dtype=op.dtype), + in_data.indices, + in_data.indptr, + ), + shape=in_data.shape, + ) + ) + else: + ctx[chunk.key] = create_array(op)( + "empty_like", ctx[op.inputs[0].key], dtype=op.dtype, order=op.order + ) + + +def empty_like(a, dtype=None, gpu=None, order="K"): + """ + Return a new tensor with the same shape and type as a given tensor. + + Parameters + ---------- + a : array_like + The shape and data-type of `a` define these same attributes of the + returned tensor. + dtype : data-type, optional + Overrides the data type of the result. + gpu : bool, optional + Allocate the tensor on GPU if True, None as default + order : {'C', 'F', 'A', or 'K'}, optional + Overrides the memory layout of the result. 'C' means C-order, + 'F' means F-order, 'A' means 'F' if ``prototype`` is Fortran + contiguous, 'C' otherwise. 'K' means match the layout of ``prototype`` + as closely as possible. + + Returns + ------- + out : Tensor + Array of uninitialized (arbitrary) data with the same + shape and type as `a`. + See Also + -------- + ones_like : Return a tensor of ones with shape and type of input. + zeros_like : Return a tensor of zeros with shape and type of input. + empty : Return a new uninitialized tensor. + ones : Return a new tensor setting values to one. + zeros : Return a new tensor setting values to zero. + Notes + ----- + This function does *not* initialize the returned tensor; to do that use + `zeros_like` or `ones_like` instead. It may be marginally faster than + the functions that do set the array values. 
+ Examples + -------- + >>> import mars.tensor as mt + >>> a = ([1,2,3], [4,5,6]) # a is array-like + >>> mt.empty_like(a).execute() + array([[-1073741821, -1073741821, 3], #ranm + [ 0, 0, -1073741821]]) + >>> a = mt.array([[1., 2., 3.],[4.,5.,6.]]) + >>> mt.empty_like(a).execute() + array([[ -2.00000715e+000, 1.48219694e-323, -2.00000572e+000],#random + [ 4.38791518e-305, -2.00000715e+000, 4.17269252e-309]]) + """ + a = tensor(a) + tensor_order = get_order(order, a.order) + gpu = a.op.gpu if gpu is None else gpu + op = TensorEmptyLike(dtype=dtype, gpu=gpu, sparse=a.issparse(), order=order) + return op(a, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/eye.py b/python/xorbits/_mars/tensor/datasource/eye.py new file mode 100644 index 000000000..9c4ceda80 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/eye.py @@ -0,0 +1,139 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...lib import sparse +from ...serialization.serializables import Int32Field, StringField +from ..array_utils import create_array +from ..utils import decide_chunk_sizes, get_order +from .core import TensorNoInput +from .diag import TensorDiagBase + + +class TensorEye(TensorNoInput, TensorDiagBase): + _op_type_ = OperandDef.TENSOR_EYE + + _k = Int32Field("k") + _order = StringField("order") + + def __init__(self, k=None, dtype=None, order=None, **kw): + dtype = np.dtype(dtype or "f8") + super().__init__(_k=k, dtype=dtype, _order=order, **kw) + + @property + def k(self): + return getattr(self, "_k", 0) + + @property + def order(self): + return self._order + + @classmethod + def _get_nsplits(cls, op): + tensor = op.outputs[0] + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + return decide_chunk_sizes(tensor.shape, chunk_size, tensor.dtype.itemsize) + + @classmethod + def _get_chunk(cls, op, chunk_k, chunk_shape, chunk_idx): + chunk_op = TensorEye(k=chunk_k, dtype=op.dtype, gpu=op.gpu, sparse=op.sparse) + return chunk_op.new_chunk(None, shape=chunk_shape, index=chunk_idx) + + @classmethod + def tile(cls, op): + return (yield from TensorDiagBase.tile(op)) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if op.sparse: + ctx[chunk.key] = sparse.eye( + chunk.shape[0], M=chunk.shape[1], k=op.k, dtype=op.dtype, gpu=op.gpu + ) + else: + ctx[chunk.key] = create_array(op)( + "eye", + chunk.shape[0], + M=chunk.shape[1], + k=op.k, + dtype=op.dtype, + order=op.order, + ) + + +def eye(N, M=None, k=0, dtype=None, sparse=False, gpu=None, chunk_size=None, order="C"): + """ + Return a 2-D tensor with ones on the diagonal and zeros elsewhere. + + Parameters + ---------- + N : int + Number of rows in the output. + M : int, optional + Number of columns in the output. If None, defaults to `N`. 
+ k : int, optional + Index of the diagonal: 0 (the default) refers to the main diagonal, + a positive value refers to an upper diagonal, and a negative value + to a lower diagonal. + dtype : data-type, optional + Data-type of the returned tensor. + sparse: bool, optional + Create sparse tensor if True, False as default + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + order : {'C', 'F'}, optional + Whether the output should be stored in row-major (C-style) or + column-major (Fortran-style) order in memory. + + Returns + ------- + I : Tensor of shape (N,M) + An tensor where all elements are equal to zero, except for the `k`-th + diagonal, whose values are equal to one. + + See Also + -------- + identity : (almost) equivalent function + diag : diagonal 2-D tensor from a 1-D tensor specified by the user. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.eye(2, dtype=int).execute() + array([[1, 0], + [0, 1]]) + >>> mt.eye(3, k=1).execute() + array([[ 0., 1., 0.], + [ 0., 0., 1.], + [ 0., 0., 0.]]) + + """ + if M is None: + M = N + + shape = (N, M) + tensor_order = get_order( + order, + None, + available_options="CF", + err_msg="only 'C' or 'F' order is permitted", + ) + op = TensorEye(k, dtype=dtype, gpu=gpu, sparse=sparse, order=order) + return op(shape, chunk_size=chunk_size, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/from_dataframe.py b/python/xorbits/_mars/tensor/datasource/from_dataframe.py new file mode 100644 index 000000000..6500639f4 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_dataframe.py @@ -0,0 +1,109 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import BoolField, KeyField +from ..core import TensorOrder +from ..utils import to_numpy +from .core import TensorHasInput + + +class TensorFromDataFrame(TensorHasInput): + """represent tensor from DataFrame""" + + _op_type_ = OperandDef.TENSOR_FROM_DATAFRAME + _input = KeyField("_input") + _extract_multi_index = BoolField("extract_multi_index") + + def __init__(self, extract_multi_index=False, **kw): + super().__init__(_extract_multi_index=extract_multi_index, **kw) + + @classmethod + def execute(cls, ctx, op: "TensorFromDataFrame"): + df = ctx[op.inputs[0].key] + if op._extract_multi_index: + df = df.to_frame() + ctx[op.outputs[0].key] = to_numpy(df).astype(op.dtype, order="F") + + @classmethod + def tile(cls, op: "TensorFromDataFrame"): + output = op.outputs[0] + + out_chunks = [] + for c in op.input.chunks: + shape = ( + (c.shape[0], output.shape[1]) if op._extract_multi_index else c.shape + ) + index = (c.index[0], 0) if op._extract_multi_index else c.index + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([c], shape=shape, index=index, order=output.order) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + nsplits = ( + (op.input.nsplits[0], (output.shape[1],)) + if op._extract_multi_index + else op.input.nsplits + ) + return new_op.new_tensors( + op.inputs, + output.shape, + order=output.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + def __call__(self, a, order=None): + from ...dataframe.core import INDEX_TYPE, IndexValue + + if ( + self._extract_multi_index + and isinstance(a, INDEX_TYPE) + and isinstance(a.index_value.value, IndexValue.MultiIndex) + ): + order = a.order if order is None else order + return self.new_tensor( + [a], (a.shape[0], len(a.index_value.value.names)), order=order + ) + else: + self._extract_multi_index = False + + return super().__call__(a, order=order) + + +def from_dataframe(in_df, dtype=None): + from ...dataframe.utils import build_empty_df + + if dtype is None: + empty_pdf = build_empty_df(in_df.dtypes) + dtype = to_numpy(empty_pdf).dtype + op = TensorFromDataFrame(dtype=dtype, gpu=in_df.op.gpu) + return op(in_df, order=TensorOrder.F_ORDER) # return tensor with F-order always + + +def from_series(in_series, dtype=None): + op = TensorFromDataFrame(dtype=dtype or in_series.dtype, gpu=in_series.op.gpu) + return op(in_series, order=TensorOrder.F_ORDER) # return tensor with F-order always + + +def from_index(in_index, dtype=None, extract_multi_index=False): + op = TensorFromDataFrame( + dtype=dtype or in_index.dtype, + gpu=in_index.op.gpu, + extract_multi_index=extract_multi_index, + ) + return op(in_index, order=TensorOrder.F_ORDER) # return tensor with F-order always diff --git a/python/xorbits/_mars/tensor/datasource/from_dense.py b/python/xorbits/_mars/tensor/datasource/from_dense.py new file mode 100644 index 000000000..0d6bfc11f --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_dense.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable +from functools import reduce +from operator import and_ + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import cps, naked, sps +from ...serialization.serializables import AnyField, KeyField +from .array import tensor +from .core import TensorHasInput + + +class DenseToSparse(TensorHasInput): + _op_type_ = OperandDef.DENSE_TO_SPARSE + + _input = KeyField("input") + _missing = AnyField("missing") + + def __init__(self, missing=None, **kw): + super().__init__(sparse=True, _missing=missing, **kw) + + @property + def missing(self): + return self._missing + + @staticmethod + def _get_mask(data, missing): + if isinstance(missing, Iterable): + return reduce(and_, (DenseToSparse._get_mask(data, m) for m in missing)) + elif pd.isna(missing): + return ~pd.isna(data) + else: + return data != missing + + @classmethod + def execute(cls, ctx, op): + out = op.outputs[0] + in_data = naked(ctx[op.inputs[0].key]) + missing = op.missing + shape = in_data.shape if any(np.isnan(s) for s in out.shape) else out.shape + + xps = cps if op.gpu else sps + if missing is None: + ctx[out.key] = SparseNDArray(xps.csr_matrix(in_data), shape=shape) + else: + mask = cls._get_mask(in_data, missing) + spmatrix = xps.csr_matrix((in_data[mask], mask.nonzero()), shape=shape) + ctx[out.key] = SparseNDArray(spmatrix) + + +def fromdense(a, missing=None): + a = tensor(a) + if a.issparse(): + return a + + op = DenseToSparse(dtype=a.dtype, gpu=a.op.gpu, missing=missing) + return op(a) diff --git a/python/xorbits/_mars/tensor/datasource/from_hdf5.py b/python/xorbits/_mars/tensor/datasource/from_hdf5.py new file mode 100644 index 000000000..f845cb6e4 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_hdf5.py @@ -0,0 +1,85 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...lib.filesystem import open_file +from .core import TensorFromHDF5Like, TensorOrder + + +class TensorHDF5DataSource(TensorFromHDF5Like): + _op_type_ = OperandDef.TENSOR_FROM_HDF5 + + @classmethod + def execute(cls, ctx, op): + import h5py + + axis_offsets = op.axis_offsets + shape = op.outputs[0].shape + + with h5py.File(open_file(op.filename), mode="r") as f: + ds = f[op.path] + data = ds[ + tuple( + slice(offset, offset + size) + for offset, size in zip(axis_offsets, shape) + ) + ] + ctx[op.outputs[0].key] = data + + +def fromhdf5(hdf5_file, group=None, dataset=None, chunk_size=None): + import h5py + + if isinstance(hdf5_file, h5py.Dataset): + filename = hdf5_file.file.filename + group = hdf5_file.parent.name + dataset = hdf5_file.name.rsplit("/", 1)[1] + chunk_size = chunk_size if chunk_size is not None else hdf5_file.chunks + shape = hdf5_file.shape + dtype = hdf5_file.dtype + elif isinstance(hdf5_file, h5py.File): + filename = hdf5_file.filename + if dataset is None: + raise ValueError("`dataset` should be provided") + try: + h5_dataset = hdf5_file[TensorHDF5DataSource.get_path(group, dataset)] + except KeyError: + raise ValueError(f"dataset({dataset}) does not exist") + chunk_size = chunk_size if chunk_size is not None else h5_dataset.chunks + shape = h5_dataset.shape + dtype = h5_dataset.dtype + elif isinstance(hdf5_file, str): + filename = hdf5_file + try: + with h5py.File(open_file(filename), mode="r") as f: + if dataset is None: + raise ValueError("`dataset` should be provided") + h5_dataset = f[TensorHDF5DataSource.get_path(group, dataset)] + + chunk_size = chunk_size if chunk_size is not None else h5_dataset.chunks + shape = h5_dataset.shape + dtype = h5_dataset.dtype + except KeyError: + raise ValueError(f"dataset({dataset}) does not exist") + else: + raise TypeError( + "`hdf5_file` passed has wrong type, " + "expect str, h5py.File or h5py.Dataset, " + f"got {type(hdf5_file)}" + ) + + op = TensorHDF5DataSource( + filename=filename, group=group, dataset=dataset, dtype=dtype + ) + return op(shape, chunk_size=chunk_size, order=TensorOrder.C_ORDER) diff --git a/python/xorbits/_mars/tensor/datasource/from_sparse.py b/python/xorbits/_mars/tensor/datasource/from_sparse.py new file mode 100644 index 000000000..5726117cf --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_sparse.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField, KeyField, StringField +from ..array_utils import as_same_device, device, get_array_module +from ..utils import get_order +from .array import tensor +from .core import TensorHasInput + + +class SparseToDense(TensorHasInput): + _op_type_ = OperandDef.SPARSE_TO_DENSE + + _input = KeyField("input") + _order = StringField("order") + _fill_value = AnyField("fill_value") + + def __init__(self, fill_value=None, order=None, **kw): + super().__init__(_fill_value=fill_value, sparse=False, _order=order, **kw) + + @property + def order(self): + return self._order + + @property + def fill_value(self): + return self._fill_value + + @classmethod + def execute(cls, ctx, op): + fill_value = op.fill_value + out = op.outputs[0] + (inp,), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if fill_value is None: + ctx[out.key] = inp.toarray().astype( + out.dtype, order=op.order, copy=False + ) + else: + xp = get_array_module(xp) + spmatrix = inp.spmatrix + inds = spmatrix.nonzero() + ret = xp.full(inp.shape, fill_value, dtype=out.dtype, order=op.order) + ret[inds] = spmatrix.data + ctx[out.key] = ret + + +def fromsparse(a, order="C", fill_value=None): + a = tensor(a) + if not a.issparse(): + return a.astype(a.dtype, order=order, copy=False) + + tensor_order = get_order( + order, + None, + available_options="CF", + err_msg="only 'C' or 'F' order is permitted", + ) + op = SparseToDense(dtype=a.dtype, gpu=a.op.gpu, order=order, fill_value=fill_value) + return op(a, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/from_tiledb.py b/python/xorbits/_mars/tensor/datasource/from_tiledb.py new file mode 100644 index 000000000..751c8d18f --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_tiledb.py @@ -0,0 +1,205 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import sps +from ...serialization.serializables import ( + DictField, + FieldTypes, + Int64Field, + StringField, + TupleField, +) +from ..core import TensorOrder +from .core import TensorNoInput + + +class TensorTileDBDataSource(TensorNoInput): + _op_type_ = OperandDef.TENSOR_FROM_TILEDB + + _tiledb_config = DictField("tiledb_config") + # URI of array to open + _tiledb_uri = StringField("tiledb_uri") + # tiledb dim start + _tiledb_dim_starts = TupleField("tiledb_dim_starts", FieldTypes.int64) + # encryption key to decrypt if provided + _tiledb_key = StringField("tiledb_key") + # open array at a given timestamp if provided + _tiledb_timestamp = Int64Field("tiledb_timestamp") + _axis_offsets = TupleField("axis_offsets", FieldTypes.int64) + + def __init__( + self, + tiledb_config=None, + tiledb_uri=None, + tiledb_dim_starts=None, + tiledb_key=None, + tiledb_timstamp=None, + **kw + ): + super().__init__( + _tiledb_config=tiledb_config, + _tiledb_uri=tiledb_uri, + _tiledb_dim_starts=tiledb_dim_starts, + _tiledb_key=tiledb_key, + _tiledb_timestamp=tiledb_timstamp, + **kw + ) + + @property + def tiledb_config(self): + return self._tiledb_config + + @property + def tiledb_uri(self): + return self._tiledb_uri + + @property + def tiledb_dim_starts(self): + return self._tiledb_dim_starts + + @property + def tiledb_key(self): + return self._tiledb_key + + @property + def tiledb_timestamp(self): + return self._tiledb_timestamp + + @property + def axis_offsets(self): + return self._axis_offsets + + def to_chunk_op(self, *args): + _, chunk_idx, nsplits = args + chunk_op = super().to_chunk_op(*args) + axis_offsets = [] + for axis, idx in enumerate(chunk_idx): + axis_offsets.append(sum(nsplits[axis][:idx])) + chunk_op._axis_offsets = tuple(axis_offsets) + return chunk_op + + @classmethod + def execute(cls, ctx, op): + import tiledb + + chunk = op.outputs[0] + from ..array_utils import array_module + from ..utils import get_tiledb_ctx + + xp = array_module(op.gpu) + + axis_offsets = [ + offset + dim_start + for offset, dim_start in zip(op.axis_offsets, op.tiledb_dim_starts) + ] + tiledb_ctx = get_tiledb_ctx(op.tiledb_config) + uri = op.tiledb_uri + key = op.tiledb_key + timestamp = op.tiledb_timestamp + + slcs = [] + for axis in range(chunk.ndim): + axis_offset = axis_offsets[axis] + axis_length = chunk.shape[axis] + slcs.append(slice(axis_offset, axis_offset + axis_length)) + + if not op.sparse: + # read dense array from tiledb + with tiledb.DenseArray( + uri=uri, ctx=tiledb_ctx, key=key, timestamp=timestamp + ) as tiledb_arr: + ctx[chunk.key] = tiledb_arr[tuple(slcs)] + else: + # read sparse array from tiledb + with tiledb.SparseArray( + uri=uri, ctx=tiledb_ctx, key=key, timestamp=timestamp + ) as tiledb_arr: + if tiledb_arr.ndim > 2: + raise NotImplementedError( + "Does not support to read array with more than 2 dimensions" + ) + + data = tiledb_arr[tuple(slcs)] + coords = data["coords"] + + value = data[tiledb_arr.attr(0).name] + if tiledb_arr.ndim == 2: + # 2-d + ij = tuple( + coords[tiledb_arr.domain.dim(k).name] - axis_offsets[k] + for k in range(tiledb_arr.ndim) + ) + spmatrix = sps.coo_matrix((value, ij), shape=chunk.shape) + ctx[chunk.key] = SparseNDArray(spmatrix) + else: + # 1-d + ij = ( + xp.zeros(coords.shape), + coords[tiledb_arr.domain.dim(0).name] - axis_offsets[0], + ) + spmatrix = sps.coo_matrix((value, ij), shape=(1,) + chunk.shape) + ctx[chunk.key] = SparseNDArray(spmatrix, shape=chunk.shape) 
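# A minimal usage sketch for the TileDB data source, assuming a dense TileDB
# array already exists at a hypothetical URI (the path and the sum() call are
# illustrative only; ``fromtiledb`` below is the entry point, and the chunk
# layout follows the array's tile extents):
#
#     >>> import mars.tensor as mt
#     >>> t = mt.fromtiledb("/tmp/dense_example")   # hypothetical URI
#     >>> t.sum().execute()                         # tiles are read per chunk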
+ + +def fromtiledb(uri, ctx=None, key=None, timestamp=None, gpu=None): + import tiledb + + raw_ctx = ctx + if raw_ctx is None: + ctx = tiledb.Ctx() + + # get metadata from tiledb + try: + tiledb_arr = tiledb.DenseArray(uri=uri, ctx=ctx, key=key, timestamp=timestamp) + sparse = False + except ValueError: + # if the array is not dense, ValueError will be raised by tiledb + tiledb_arr = tiledb.SparseArray(uri=uri, ctx=ctx, key=key, timestamp=timestamp) + sparse = True + + if tiledb_arr.nattr > 1: + raise NotImplementedError( + "Does not supported TileDB array schema with more than 1 attr" + ) + tiledb_dim_starts = tuple( + tiledb_arr.domain.dim(j).domain[0].item() for j in range(tiledb_arr.ndim) + ) + if any(isinstance(s, float) for s in tiledb_dim_starts): + raise ValueError( + "Does not support TileDB array schema whose dimensions has float domain" + ) + + dtype = tiledb_arr.attr(0).dtype + tiledb_config = None if raw_ctx is None else ctx.config().dict() + tensor_order = ( + TensorOrder.C_ORDER + if tiledb_arr.schema.cell_order == "row-major" + else TensorOrder.F_ORDER + ) + op = TensorTileDBDataSource( + tiledb_config=tiledb_config, + tiledb_uri=uri, + tiledb_key=key, + tiledb_timstamp=timestamp, + tiledb_dim_starts=tiledb_dim_starts, + gpu=gpu, + sparse=sparse, + dtype=dtype, + ) + chunk_size = tuple( + int(tiledb_arr.domain.dim(i).tile) for i in range(tiledb_arr.domain.ndim) + ) + return op(tiledb_arr.shape, chunk_size=chunk_size, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/from_vineyard.py b/python/xorbits/_mars/tensor/datasource/from_vineyard.py new file mode 100644 index 000000000..535e81ed8 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_vineyard.py @@ -0,0 +1,192 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core.context import get_context +from ...serialization.serializables import Int32Field, StringField +from ...storage.base import StorageLevel +from ...utils import calc_nsplits, has_unknown_shape, lazy_import +from ..operands import TensorOperand, TensorOperandMixin +from .core import TensorNoInput + +vineyard = lazy_import("vineyard") +vy_data_utils = lazy_import("vineyard.data.utils", rename="vy_data_utils") + + +def resolve_vineyard_socket(ctx, op): + if op.vineyard_socket is None: # pragma: no cover + storage_backend = ctx.get_storage_info(level=StorageLevel.MEMORY) + if storage_backend.get("name", None) == "vineyard": + return storage_backend["socket"] + else: + return op.vineyard_socket + else: + return op.vineyard_socket + + +class TensorFromVineyard(TensorNoInput): + _op_type_ = OperandDef.TENSOR_FROM_VINEYARD_META + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # ObjectID in vineyard + object_id = StringField("object_id") + + # a dummy attr to make sure ops have different keys + operator_index = Int32Field("operator_index") + + def __init__(self, vineyard_socket=None, object_id=None, **kw): + super().__init__(vineyard_socket=vineyard_socket, object_id=object_id, **kw) + + @classmethod + def tile(cls, op): + ctx = get_context() + workers = ctx.get_worker_addresses() + + out_chunks = [] + for index, worker in enumerate(workers): + chunk_op = op.copy().reset_key() + chunk_op.expect_worker = worker + chunk_op.operator_index = index + out_chunk = chunk_op.new_chunk( + [], dtype=np.dtype(object), shape=(1,), index=(index,) + ) + out_chunks.append(out_chunk) + + new_op = op.copy().reset_key() + return new_op.new_tensors( + op.inputs, + shape=(np.nan,), + dtype=np.dtype(object), + chunks=out_chunks, + nsplits=((np.nan,) * len(workers),), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + meta = client.get_meta(vineyard.ObjectID(op.object_id)) + chunks = [] + for idx in range(meta["partitions_-size"]): + chunk_meta = meta["partitions_-%d" % idx] + if not chunk_meta.islocal: + continue + dtype = vy_data_utils.normalize_dtype( + chunk_meta["value_type_"], chunk_meta.get("value_type_meta_", None) + ) + shape = tuple(json.loads(chunk_meta["shape_"])) + chunk_index = tuple(json.loads(chunk_meta["partition_index_"])) + # chunk: (chunk_id, worker_address, dtype, shape, index) + chunks.append( + (repr(chunk_meta.id), ctx.worker_address, dtype, shape, chunk_index) + ) + + holder = np.empty((1,), dtype=object) + holder[0] = chunks + ctx[op.outputs[0].key] = np.asarray(holder) + + +class TensorFromVineyardChunk(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.TENSOR_FROM_VINEYARD_CHUNK + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # ObjectID of chunk in vineyard + object_id = StringField("object_id") + + def __init__(self, vineyard_socket=None, object_id=None, **kw): + super().__init__(vineyard_socket=vineyard_socket, object_id=object_id, **kw) + + def __call__(self, meta): + return self.new_tensor([meta], shape=(np.nan,)) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + + ctx = get_context() + + in_chunk_keys = [chunk.key for chunk in op.inputs[0].chunks] + out_chunks = [] + chunk_map = dict() + dtype = None + for chunk, infos in zip( + op.inputs[0].chunks, ctx.get_chunks_result(in_chunk_keys) 
+ ): + for info in infos[0]: # n.b. 1-element ndarray + chunk_op = op.copy().reset_key() + chunk_op.object_id = info[0] + chunk_op.expect_worker = info[1] + dtype = info[2] + shape = info[3] + chunk_index = info[4] + chunk_map[chunk_index] = info[3] + out_chunk = chunk_op.new_chunk( + [chunk], shape=shape, dtype=dtype, index=chunk_index + ) + out_chunks.append(out_chunk) + + nsplits = calc_nsplits(chunk_map) + shape = [np.sum(nsplit) for nsplit in nsplits] + new_op = op.copy().reset_key() + return new_op.new_tensors( + op.inputs, shape=shape, dtype=dtype, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + client = vineyard.connect(socket) + ctx[op.outputs[0].key] = client.get(vineyard.ObjectID(op.object_id)) + + +def fromvineyard(tensor, vineyard_socket=None): + if vineyard is not None and isinstance(tensor, vineyard.Object): # pragma: no cover + if "vineyard::GlobalTensor" not in tensor.typename: + raise TypeError( + "The input tensor %r is not a vineyard' GlobalTensor" % tensor + ) + object_id = tensor.id + else: + object_id = tensor + if vineyard is not None and isinstance(object_id, vineyard.ObjectID): + object_id = repr(object_id) + metaop = TensorFromVineyard( + vineyard_socket=vineyard_socket, + object_id=object_id, + dtype=np.dtype("byte"), + gpu=None, + ) + meta = metaop(shape=(np.nan,), chunk_size=(np.nan,)) + op = TensorFromVineyardChunk( + vineyard_socket=vineyard_socket, object_id=object_id, gpu=None + ) + return op(meta) diff --git a/python/xorbits/_mars/tensor/datasource/from_zarr.py b/python/xorbits/_mars/tensor/datasource/from_zarr.py new file mode 100644 index 000000000..4899697f5 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_zarr.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...lib.filesystem import FSMap, get_fs +from ..core import TensorOrder +from .core import TensorFromHDF5Like + + +class TensorFromZarr(TensorFromHDF5Like): + _op_type_ = OperandDef.TENSOR_FROM_ZARR + + @classmethod + def execute(cls, ctx, op): + import zarr + + axis_offsets = op.axis_offsets + shape = op.outputs[0].shape + + fs = get_fs(op.filename, None) + fs_map = FSMap(op.filename, fs) + + root = zarr.group(store=fs_map) + path = cls.get_path(op.group, op.dataset) + arr = root[path] + + data = arr[ + tuple( + slice(offset, offset + size) + for offset, size in zip(axis_offsets, shape) + ) + ] + ctx[op.outputs[0].key] = data + + +def fromzarr(path, group=None, dataset=None, chunk_size=None): + import zarr + + try: + # since v2.11.0, zarr convert mutable mappings to KVStore + from zarr.storage import KVStore as zarr_kvstore + except ImportError: # pragma: no cover + zarr_kvstore = None + + if isinstance(path, zarr.Array): + arr = path + if zarr_kvstore is None and isinstance(arr.store, FSMap): # pragma: no cover + root = arr.store.root + path, dataset = root.rsplit("/", 1) + elif zarr_kvstore and isinstance(arr.store, zarr_kvstore): + root = arr.store._mutable_mapping.root + path, dataset = root.rsplit("/", 1) + else: + path = arr.store.path + if "/" in arr.path and group is None: + group = arr.path.rsplit("/", 1)[0] + dataset = arr.basename + if not dataset: + path, dataset = path.rsplit("/", 1) + shape = arr.shape + elif isinstance(path, str): + fs = get_fs(path, None) + fs_map = FSMap(path, fs) + + if group is None and dataset is None: + arr = zarr.open(fs_map) + if isinstance(arr, zarr.Array): + return fromzarr(arr, chunk_size=chunk_size) + + g = zarr.group(store=fs_map) + arr = g[TensorFromZarr.get_path(group, dataset)] + shape = arr.shape + else: + raise TypeError( + "`path` passed has wrong type, " + "expect str, or zarr.Array" + f"got {type(path)}" + ) + + chunk_size = chunk_size if chunk_size is not None else arr.chunks + op = TensorFromZarr(filename=path, group=group, dataset=dataset, dtype=arr.dtype) + return op(shape, chunk_size=chunk_size, order=TensorOrder(arr.order)) diff --git a/python/xorbits/_mars/tensor/datasource/full.py b/python/xorbits/_mars/tensor/datasource/full.py new file mode 100644 index 000000000..65a089fda --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/full.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import get_array_module, get_sparse_module, naked +from ...serialization.serializables import AnyField, KeyField, StringField +from ..array_utils import create_array +from ..utils import get_order +from .array import tensor +from .core import TensorLike, TensorNoInput + + +class TensorFull(TensorNoInput): + _op_type_ = OperandDef.TENSOR_FULL + + _fill_value = AnyField("fill_value") + _order = StringField("order") + + def __init__(self, fill_value=None, dtype=None, order=None, **kw): + if dtype is not None: + dtype = np.dtype(dtype) + if fill_value is not None: + fill_value = dtype.type(fill_value) + elif fill_value is not None: + dtype = np.array(fill_value).dtype + super().__init__(_fill_value=fill_value, dtype=dtype, _order=order, **kw) + + @property + def fill_value(self): + return self._fill_value + + @property + def order(self): + return self._order + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + ctx[chunk.key] = create_array(op)( + "full", chunk.shape, op.fill_value, dtype=op.dtype, order=op.order + ) + + +def full(shape, fill_value, dtype=None, chunk_size=None, gpu=None, order="C"): + """ + Return a new tensor of given shape and type, filled with `fill_value`. + + Parameters + ---------- + shape : int or sequence of ints + Shape of the new tensor, e.g., ``(2, 3)`` or ``2``. + fill_value : scalar + Fill value. + dtype : data-type, optional + The desired data-type for the tensor The default, `None`, means + `np.array(fill_value).dtype`. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + order : {'C', 'F'}, optional + Whether to store multidimensional data in C- or Fortran-contiguous + (row- or column-wise) order in memory. + + Returns + ------- + out : Tensor + Tensor of `fill_value` with the given shape, dtype, and order. + + See Also + -------- + zeros_like : Return a tensor of zeros with shape and type of input. + ones_like : Return a tensor of ones with shape and type of input. + empty_like : Return an empty tensor with shape and type of input. + full_like : Fill a tensor with shape and type of input. + zeros : Return a new tensor setting values to zero. + ones : Return a new tensor setting values to one. + empty : Return a new uninitialized tensor. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.full((2, 2), mt.inf).execute() + array([[ inf, inf], + [ inf, inf]]) + >>> mt.full((2, 2), 10).execute() + array([[10, 10], + [10, 10]]) + + """ + v = np.asarray(fill_value) + if len(v.shape) > 0: + from ..base import broadcast_to + + return broadcast_to( + tensor(v, dtype=dtype, chunk_size=chunk_size, gpu=gpu, order=order), shape + ) + + tensor_order = get_order( + order, + None, + available_options="CF", + err_msg="only 'C' or 'F' order is permitted", + ) + op = TensorFull(fill_value, dtype=dtype, gpu=gpu, order=order) + return op(shape, chunk_size=chunk_size, order=tensor_order) + + +class TensorFullLike(TensorLike): + _op_type_ = OperandDef.TENSOR_FULL_LIKE + + _input = KeyField("input") + _fill_value = AnyField("fill_value") + _order = StringField("order") + + def __init__( + self, fill_value=None, dtype=None, gpu=None, sparse=False, order=None, **kw + ): + if dtype is not None: + dtype = np.dtype(dtype) + if fill_value is not None: + fill_value = dtype.type(fill_value) + elif fill_value is not None: + dtype = np.array(fill_value).dtype + super().__init__( + _fill_value=fill_value, + _order=order, + dtype=dtype, + gpu=gpu, + sparse=sparse, + **kw + ) + + @property + def fill_value(self): + return self._fill_value + + @property + def order(self): + return self._order + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if op.issparse(): + in_data = naked(ctx[op.inputs[0].key]) + xps = get_sparse_module(in_data) + xp = get_array_module(in_data) + ctx[chunk.key] = SparseNDArray( + xps.csr_matrix( + ( + xp.full_like(in_data.data, op.fill_value, dtype=op.dtype), + in_data.indices, + in_data.indptr, + ), + shape=in_data.shape, + ) + ) + else: + ctx[chunk.key] = create_array(op)( + "full_like", + ctx[op.inputs[0].key], + op.fill_value, + dtype=op.dtype, + order=op.order, + ) + + +def full_like(a, fill_value, dtype=None, gpu=None, order="K"): + """ + Return a full tensor with the same shape and type as a given tensor. + + Parameters + ---------- + a : array_like + The shape and data-type of `a` define these same attributes of + the returned tensor. + fill_value : scalar + Fill value. + dtype : data-type, optional + Overrides the data type of the result. + gpu : bool, optional + Allocate the tensor on GPU if True, None as default + order : {'C', 'F', 'A', or 'K'}, optional + Overrides the memory layout of the result. 'C' means C-order, + 'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous, + 'C' otherwise. 'K' means match the layout of `a` as closely + as possible. + + Returns + ------- + out : Tensor + Tensor of `fill_value` with the same shape and type as `a`. + + See Also + -------- + empty_like : Return an empty tensor with shape and type of input. + ones_like : Return a tensor of ones with shape and type of input. + zeros_like : Return a tensor of zeros with shape and type of input. + full : Return a new tensor of given shape filled with value. 
+ + Examples + -------- + >>> import mars.tensor as mt + >>> x = mt.arange(6, dtype=int) + >>> mt.full_like(x, 1).execute() + array([1, 1, 1, 1, 1, 1]) + >>> mt.full_like(x, 0.1).execute() + array([0, 0, 0, 0, 0, 0]) + >>> mt.full_like(x, 0.1, dtype=mt.double).execute() + array([ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]) + >>> mt.full_like(x, mt.nan, dtype=mt.double).execute() + array([ nan, nan, nan, nan, nan, nan]) + + >>> y = mt.arange(6, dtype=mt.double) + >>> mt.full_like(y, 0.1).execute() + array([ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]) + + """ + a = tensor(a) + tensor_order = get_order(order, a.order) + if dtype is None: + dtype = a.dtype + gpu = a.op.gpu if gpu is None else gpu + op = TensorFullLike( + fill_value=fill_value, dtype=dtype, gpu=gpu, sparse=a.issparse() + ) + return op(a, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/identity.py b/python/xorbits/_mars/tensor/datasource/identity.py new file mode 100644 index 000000000..c60580bb2 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/identity.py @@ -0,0 +1,54 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .eye import eye + + +def identity(n, dtype=None, sparse=False, gpu=None, chunk_size=None): + """ + Return the identity tensor. + + The identity tensor is a square array with ones on + the main diagonal. + + Parameters + ---------- + n : int + Number of rows (and columns) in `n` x `n` output. + dtype : data-type, optional + Data-type of the output. Defaults to ``float``. + sparse: bool, optional + Create sparse tensor if True, False as default + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunks : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + out : Tensor + `n` x `n` array with its main diagonal set to one, + and all other elements 0. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.identity(3).execute() + array([[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]) + + """ + return eye(n, dtype=dtype, sparse=sparse, gpu=gpu, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/datasource/indices.py b/python/xorbits/_mars/tensor/datasource/indices.py new file mode 100644 index 000000000..0cb389ad9 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/indices.py @@ -0,0 +1,131 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, ListField +from .arange import arange +from .core import TensorNoInput +from .empty import empty +from .meshgrid import meshgrid + + +class TensorIndices(TensorNoInput): + _op_type_ = OperandDef.TENSOR_INDICES + + _dimensions = ListField("dimensions", FieldTypes.uint64) + + def __init__(self, dimensions=None, **kw): + super().__init__(_dimensions=dimensions, **kw) + + @property + def dimensions(self): + return self._dimensions + + +def indices(dimensions, dtype=int, chunk_size=None): + """ + Return a tensor representing the indices of a grid. + + Compute a tensor where the subtensors contain index values 0,1,... + varying only along the corresponding axis. + + Parameters + ---------- + dimensions : sequence of ints + The shape of the grid. + dtype : dtype, optional + Data type of the result. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + grid : Tensor + The tensor of grid indices, + ``grid.shape = (len(dimensions),) + tuple(dimensions)``. + + See Also + -------- + mgrid, meshgrid + + Notes + ----- + The output shape is obtained by prepending the number of dimensions + in front of the tuple of dimensions, i.e. if `dimensions` is a tuple + ``(r0, ..., rN-1)`` of length ``N``, the output shape is + ``(N,r0,...,rN-1)``. + + The subtensors ``grid[k]`` contains the N-D array of indices along the + ``k-th`` axis. Explicitly:: + + grid[k,i0,i1,...,iN-1] = ik + + Examples + -------- + >>> import mars.tensor as mt + + >>> grid = mt.indices((2, 3)) + >>> grid.shape + (2, 2, 3) + >>> grid[0].execute() # row indices + array([[0, 0, 0], + [1, 1, 1]]) + >>> grid[1].execute() # column indices + array([[0, 1, 2], + [0, 1, 2]]) + + The indices can be used as an index into a tensor. + + >>> x = mt.arange(20).reshape(5, 4) + >>> row, col = mt.indices((2, 3)) + >>> # x[row, col] # TODO(jisheng): accomplish this if multiple fancy indexing is supported + + Note that it would be more straightforward in the above example to + extract the required elements directly with ``x[:2, :3]``. + + """ + from ..merge import stack + + dimensions = tuple(dimensions) + dtype = np.dtype(dtype) + raw_chunk_size = chunk_size + if chunk_size is not None and isinstance(chunk_size, Iterable): + chunk_size = tuple(chunk_size) + else: + chunk_size = (chunk_size,) * len(dimensions) + + xi = [] + for ch, dim in zip(chunk_size, dimensions): + xi.append(arange(dim, dtype=dtype, chunk_size=ch)) + + grid = None + if np.prod(dimensions): + grid = meshgrid(*xi, indexing="ij") + + if grid: + grid = stack(grid) + else: + if raw_chunk_size is None: + empty_chunk_size = None + else: + empty_chunk_size = (1,) + chunk_size + grid = empty( + (len(dimensions),) + dimensions, dtype=dtype, chunk_size=empty_chunk_size + ) + + return grid diff --git a/python/xorbits/_mars/tensor/datasource/linspace.py b/python/xorbits/_mars/tensor/datasource/linspace.py new file mode 100644 index 000000000..fc8954f3b --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/linspace.py @@ -0,0 +1,219 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...core import ExecutableTuple +from ...serialization.serializables import AnyField, BoolField, Int64Field +from ..array_utils import create_array +from ..utils import decide_chunk_sizes +from .core import TensorNoInput + + +class TensorLinspace(TensorNoInput): + _op_type_ = OperandDef.TENSOR_LINSPACE + + _start = AnyField("start") + _stop = AnyField("stop") + _num = Int64Field("num") + _endpoint = BoolField("endpoint") + + def __init__( + self, start=None, stop=None, num=None, endpoint=None, dtype=None, **kw + ): + dtype = np.dtype(np.linspace(0, 1, 1).dtype if dtype is None else dtype) + super().__init__( + _start=start, _stop=stop, _num=num, _endpoint=endpoint, dtype=dtype, **kw + ) + + def to_chunk_op(self, *args): + start, stop, num, endpoint = args + op = self.copy().reset_key() + op._start = start + op._stop = stop + op._num = num + op._endpoint = endpoint + return op + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + + chunk_length = tensor.extra_params.raw_chunk_size or options.chunk_size + chunk_length = decide_chunk_sizes( + tensor.shape, chunk_length, tensor.dtype.itemsize + ) + + start, stop, num, endpoint = ( + tensor.op.start, + tensor.op.stop, + tensor.op.num, + tensor.op.endpoint, + ) + if num > 1: + step = float(stop - start) / (num if not endpoint else num - 1) + else: + step = 0.0 + + chunks = [] + chunk_start = start + nsplit = [] + for i, cs in enumerate(chunk_length[0]): + chunk_stop = chunk_start + (cs - 1) * step + chunk_op = op.to_chunk_op(chunk_start, chunk_stop, cs, True) + chunk_shape = (cs,) + chunk_idx = (i,) + chunk = chunk_op.new_chunk(None, shape=chunk_shape, index=chunk_idx) + chunks.append(chunk) + nsplit.append(cs) + chunk_start = chunk_start + cs * step + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, op.outputs[0].shape, chunks=chunks, nsplits=(tuple(nsplit),) + ) + + @property + def start(self): + return self._start + + @property + def stop(self): + return self._stop + + @property + def num(self): + return self._num + + @property + def endpoint(self): + return self._endpoint + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = create_array(op)( + "linspace", + op.start, + op.stop, + num=op.num, + endpoint=op.endpoint, + dtype=op.dtype, + ) + + +def linspace( + start, + stop, + num=50, + endpoint=True, + retstep=False, + dtype=None, + gpu=None, + chunk_size=None, +): + """ + Return evenly spaced numbers over a specified interval. + + Returns `num` evenly spaced samples, calculated over the + interval [`start`, `stop`]. + + The endpoint of the interval can optionally be excluded. + + Parameters + ---------- + start : scalar + The starting value of the sequence. + stop : scalar + The end value of the sequence, unless `endpoint` is set to False. + In that case, the sequence consists of all but the last of ``num + 1`` + evenly spaced samples, so that `stop` is excluded. Note that the step + size changes when `endpoint` is False. + num : int, optional + Number of samples to generate. Default is 50. 
Must be non-negative. + endpoint : bool, optional + If True, `stop` is the last sample. Otherwise, it is not included. + Default is True. + retstep : bool, optional + If True, return (`samples`, `step`), where `step` is the spacing + between samples. + dtype : dtype, optional + The type of the output tensor. If `dtype` is not given, infer the data + type from the other input arguments. + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + samples : Tensor + There are `num` equally spaced samples in the closed interval + ``[start, stop]`` or the half-open interval ``[start, stop)`` + (depending on whether `endpoint` is True or False). + step : float, optional + Only returned if `retstep` is True + + Size of spacing between samples. + + + See Also + -------- + arange : Similar to `linspace`, but uses a step size (instead of the + number of samples). + logspace : Samples uniformly distributed in log space. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.linspace(2.0, 3.0, num=5).execute() + array([ 2. , 2.25, 2.5 , 2.75, 3. ]) + >>> mt.linspace(2.0, 3.0, num=5, endpoint=False).execute() + array([ 2. , 2.2, 2.4, 2.6, 2.8]) + >>> mt.linspace(2.0, 3.0, num=5, retstep=True).execute() + (array([ 2. , 2.25, 2.5 , 2.75, 3. ]), 0.25) + + Graphical illustration: + + >>> import matplotlib.pyplot as plt + >>> N = 8 + >>> y = mt.zeros(N) + >>> x1 = mt.linspace(0, 10, N, endpoint=True) + >>> x2 = mt.linspace(0, 10, N, endpoint=False) + >>> plt.plot(x1.execute(), y.execute(), 'o') + [] + >>> plt.plot(x2.execute(), y.execute() + 0.5, 'o') + [] + >>> plt.ylim([-0.5, 1]) + (-0.5, 1) + >>> plt.show() + + """ + num = int(num) + + op = TensorLinspace(start, stop, num, endpoint, dtype=dtype, gpu=gpu) + shape = (num,) + ret = op(shape, chunk_size=chunk_size) + + if not retstep: + return ret + + if num > 1: + step = float(stop - start) / (num if not endpoint else num - 1) + else: + step = np.nan + + return ExecutableTuple([ret, step]) diff --git a/python/xorbits/_mars/tensor/datasource/meshgrid.py b/python/xorbits/_mars/tensor/datasource/meshgrid.py new file mode 100644 index 000000000..bb2817518 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/meshgrid.py @@ -0,0 +1,135 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .array import tensor + + +def meshgrid(*xi, **kwargs): + """ + Return coordinate matrices from coordinate vectors. + + Make N-D coordinate arrays for vectorized evaluations of + N-D scalar/vector fields over N-D grids, given + one-dimensional coordinate tensors x1, x2,..., xn. + + Parameters + ---------- + x1, x2,..., xn : array_like + 1-D arrays representing the coordinates of a grid. + indexing : {'xy', 'ij'}, optional + Cartesian ('xy', default) or matrix ('ij') indexing of output. + See Notes for more details. 
+ sparse : bool, optional + If True a sparse grid is returned in order to conserve memory. + Default is False. + + Returns + ------- + X1, X2,..., XN : Tensor + For vectors `x1`, `x2`,..., 'xn' with lengths ``Ni=len(xi)`` , + return ``(N1, N2, N3,...Nn)`` shaped tensors if indexing='ij' + or ``(N2, N1, N3,...Nn)`` shaped tensors if indexing='xy' + with the elements of `xi` repeated to fill the matrix along + the first dimension for `x1`, the second for `x2` and so on. + + Notes + ----- + This function supports both indexing conventions through the indexing + keyword argument. Giving the string 'ij' returns a meshgrid with + matrix indexing, while 'xy' returns a meshgrid with Cartesian indexing. + In the 2-D case with inputs of length M and N, the outputs are of shape + (N, M) for 'xy' indexing and (M, N) for 'ij' indexing. In the 3-D case + with inputs of length M, N and P, outputs are of shape (N, M, P) for + 'xy' indexing and (M, N, P) for 'ij' indexing. The difference is + illustrated by the following code snippet:: + + xv, yv = mt.meshgrid(x, y, sparse=False, indexing='ij') + for i in range(nx): + for j in range(ny): + # treat xv[i,j], yv[i,j] + + xv, yv = mt.meshgrid(x, y, sparse=False, indexing='xy') + for i in range(nx): + for j in range(ny): + # treat xv[j,i], yv[j,i] + + In the 1-D and 0-D case, the indexing and sparse keywords have no effect. + + Examples + -------- + >>> import mars.tensor as mt + + >>> nx, ny = (3, 2) + >>> x = mt.linspace(0, 1, nx) + >>> y = mt.linspace(0, 1, ny) + >>> xv, yv = mt.meshgrid(x, y) + >>> xv.execute() + array([[ 0. , 0.5, 1. ], + [ 0. , 0.5, 1. ]]) + >>> yv.execute() + array([[ 0., 0., 0.], + [ 1., 1., 1.]]) + >>> xv, yv = mt.meshgrid(x, y, sparse=True) # make sparse output arrays + >>> xv.execute() + array([[ 0. , 0.5, 1. ]]) + >>> yv.execute() + array([[ 0.], + [ 1.]]) + + `meshgrid` is very useful to evaluate functions on a grid. + + >>> import matplotlib.pyplot as plt + >>> x = mt.arange(-5, 5, 0.1) + >>> y = mt.arange(-5, 5, 0.1) + >>> xx, yy = mt.meshgrid(x, y, sparse=True) + >>> z = mt.sin(xx**2 + yy**2) / (xx**2 + yy**2) + >>> h = plt.contourf(x,y,z) + + """ + from ..base import broadcast_to + + indexing = kwargs.pop("indexing", "xy") + sparse = kwargs.pop("sparse", False) + + if kwargs: + raise TypeError( + f"meshgrid() got an unexpected keyword argument '{list(kwargs)[0]}'" + ) + if indexing not in ("xy", "ij"): + raise ValueError("Valid values for `indexing` are 'xy' and 'ij'.") + + xi = [tensor(x) for x in xi] + xi = [a.ravel() for a in xi] + shape = [x.size for x in xi] + + if indexing == "xy" and len(xi) > 1: + xi[0], xi[1] = xi[1], xi[0] + shape[0], shape[1] = shape[1], shape[0] + + grid = [] + for i, x in enumerate(xi): + slc = [None] * len(shape) + slc[i] = slice(None) + + r = x[tuple(slc)] + + if not sparse: + r = broadcast_to(r, shape) + + grid.append(r) + + if indexing == "xy" and len(xi) > 1: + grid[0], grid[1] = grid[1], grid[0] + + return grid diff --git a/python/xorbits/_mars/tensor/datasource/ones.py b/python/xorbits/_mars/tensor/datasource/ones.py new file mode 100644 index 000000000..01c1a1b5a --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/ones.py @@ -0,0 +1,217 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import get_array_module, get_sparse_module, naked +from ...serialization.serializables import ( + AnyField, + FieldTypes, + KeyField, + StringField, + TupleField, +) +from ..array_utils import convert_order, create_array +from ..utils import get_order +from .array import tensor +from .core import TensorLike, TensorNoInput + + +class TensorOnes(TensorNoInput): + _op_type_ = OperandDef.TENSOR_ONES + + order = StringField("order") + shape = TupleField("shape", FieldTypes.int64) + chunk_size = AnyField("chunk_size") + + def __init__(self, shape=None, **kwargs): + if type(shape) is int: + shape = (shape,) + super().__init__(shape=shape, **kwargs) + + def to_chunk_op(self, *args): + chunk_op = super().to_chunk_op(*args) + chunk_op.shape = args[0] + chunk_op.chunk_size = None + return chunk_op + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + try: + ctx[chunk.key] = create_array(op)( + "ones", op.shape, dtype=op.dtype, order=op.order + ) + except TypeError: # in case that cp.ones does not have arg ``order`` + x = create_array(op)("ones", op.shape, dtype=op.dtype) + ctx[chunk.key] = convert_order(x, op.order) + + +def ones(shape, dtype=None, chunk_size=None, gpu=None, order="C"): + """ + Return a new tensor of given shape and type, filled with ones. + + Parameters + ---------- + shape : int or sequence of ints + Shape of the new tensor, e.g., ``(2, 3)`` or ``2``. + dtype : data-type, optional + The desired data-type for the tensor, e.g., `mt.int8`. Default is + `mt.float64`. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + order : {'C', 'F'}, optional, default: C + Whether to store multi-dimensional data in row-major + (C-style) or column-major (Fortran-style) order in + memory. + + Returns + ------- + out : Tensor + Tensor of ones with the given shape, dtype, and order. 
+ + See Also + -------- + zeros, ones_like + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.ones(5).execute() + array([ 1., 1., 1., 1., 1.]) + + >>> mt.ones((5,), dtype=int).execute() + array([1, 1, 1, 1, 1]) + + >>> mt.ones((2, 1)).execute() + array([[ 1.], + [ 1.]]) + + >>> s = (2,2) + >>> mt.ones(s).execute() + array([[ 1., 1.], + [ 1., 1.]]) + + """ + tensor_order = get_order( + order, + None, + available_options="CF", + err_msg="only 'C' or 'F' order is permitted", + ) + dtype = np.dtype(dtype or "f8") + op = TensorOnes( + dtype=dtype, shape=shape, chunk_size=chunk_size, gpu=gpu, order=order + ) + return op(shape, chunk_size=chunk_size, order=tensor_order) + + +class TensorOnesLike(TensorLike): + _op_type_ = OperandDef.TENSOR_ONES_LIKE + + _input = KeyField("input") + + def __init__(self, dtype=None, sparse=False, **kw): + dtype = np.dtype(dtype) if dtype is not None else None + super().__init__(dtype=dtype, sparse=sparse, **kw) + + @classmethod + def execute_sparse(cls, ctx, op): + chunk = op.outputs[0] + in_data = naked(ctx[op.input.key]) + xps = get_sparse_module(in_data) + xp = get_array_module(in_data) + ctx[chunk.key] = SparseNDArray( + xps.csr_matrix( + ( + xp.ones_like(in_data.data, dtype=chunk.op.dtype), + in_data.indices, + in_data.indptr, + ), + shape=in_data.shape, + ) + ) + + @classmethod + def execute(cls, ctx, op): + if op.sparse: + cls.execute_sparse(ctx, op) + else: + ctx[op.outputs[0].key] = create_array(op)( + "ones_like", ctx[op.inputs[0].key], dtype=op.dtype + ) + + +def ones_like(a, dtype=None, gpu=None, order="K"): + """ + Return a tensor of ones with the same shape and type as a given tensor. + + Parameters + ---------- + a : array_like + The shape and data-type of `a` define these same attributes of + the returned tensor. + dtype : data-type, optional + Overrides the data type of the result. + gpu : bool, optional + Allocate the tensor on GPU if True, None as default + order : {'C', 'F', 'A', or 'K'}, optional + Overrides the memory layout of the result. 'C' means C-order, + 'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous, + 'C' otherwise. 'K' means match the layout of `a` as closely + as possible. + + Returns + ------- + out : Tensor + Tensor of ones with the same shape and type as `a`. + + See Also + -------- + zeros_like : Return a tensor of zeros with shape and type of input. + empty_like : Return a empty tensor with shape and type of input. + zeros : Return a new tensor setting values to zero. + ones : Return a new tensor setting values to one. + empty : Return a new uninitialized tensor. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(6) + >>> x = x.reshape((2, 3)) + >>> x.execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.ones_like(x).execute() + array([[1, 1, 1], + [1, 1, 1]]) + + >>> y = mt.arange(3, dtype=float) + >>> y.execute() + array([ 0., 1., 2.]) + >>> mt.ones_like(y).execute() + array([ 1., 1., 1.]) + + """ + a = tensor(a) + tensor_order = get_order(order, a.order) + gpu = a.op.gpu if gpu is None else gpu + op = TensorOnesLike(dtype=dtype, gpu=gpu, sparse=a.issparse(), order=order) + return op(a, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/scalar.py b/python/xorbits/_mars/tensor/datasource/scalar.py new file mode 100644 index 000000000..0e303a058 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/scalar.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..array_utils import create_array +from .core import TensorNoInput + + +class Scalar(TensorNoInput): + """ + Operand represents scalar type. + """ + + _op_type_ = OperandDef.SCALAR + + _data = AnyField("data") + + def __init__(self, data=None, **kw): + super().__init__(_data=data, **kw) + + @classmethod + def tile(cls, op): + chunk_op = op.copy().reset_key() + chunk = chunk_op.new_chunk(None, shape=(), index=()) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, op.outputs[0].shape, chunks=[chunk], nsplits=() + ) + + @property + def data(self): + return self._data + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if chunk.ndim != 0: + raise ValueError("Missing op for chunk") + ctx[chunk.key] = create_array(op)("asarray", op.data) + + +def scalar(data, dtype=None, gpu=None): + try: + arr = np.array(data, dtype=dtype) + op = Scalar(arr, dtype=arr.dtype, gpu=gpu) + shape = () + return op(shape) + except ValueError: + raise TypeError(f"Expect scalar, got: {data}") diff --git a/python/xorbits/_mars/tensor/datasource/tests/__init__.py b/python/xorbits/_mars/tensor/datasource/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/datasource/tests/test_datasource.py b/python/xorbits/_mars/tensor/datasource/tests/test_datasource.py new file mode 100644 index 000000000..1b51c8413 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/tests/test_datasource.py @@ -0,0 +1,647 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
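# The tests below build tensors lazily and call ``tile`` to materialize chunk
# metadata before asserting on shapes, nsplits and chunk keys. A minimal
# sketch of that pattern, assuming the public ``mars.tensor`` API:
#
#     >>> import mars.tensor as mt
#     >>> from mars.core import tile
#     >>> t = tile(mt.ones((10, 10), chunk_size=5))
#     >>> t.nsplits
#     ((5, 5), (5, 5))
#     >>> len(t.chunks)
#     4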
+ +import shutil +import tempfile +from copy import copy + +import numpy as np +import pytest +import scipy.sparse as sps + +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None + +from .... import dataframe as md +from ....core import enter_mode, tile +from ... import arange, diag, full, linspace, ones, ones_like, tensor, tril, triu, zeros +from ...core import SparseTensor, Tensor +from .. import ( + TensorTileDBDataSource, + array, + asarray, + ascontiguousarray, + asfortranarray, + fromdense, + fromtiledb, +) +from ..array import CSRMatrixDataSource +from ..from_dataframe import from_dataframe +from ..from_dense import DenseToSparse +from ..ones import TensorOnes, TensorOnesLike +from ..tri import TensorTril, TensorTriu +from ..zeros import TensorZeros + + +def test_array(): + a = tensor([0, 1, 2], chunk_size=2) + + b = array(a) + assert a is not b + + c = asarray(a) + assert a is c + + +def test_ascontiguousarray(): + # dtype different + raw_a = np.asfortranarray(np.random.rand(2, 4)) + raw_b = np.ascontiguousarray(raw_a, dtype="f4") + + a = tensor(raw_a, chunk_size=2) + b = ascontiguousarray(a, dtype="f4") + + assert a.dtype == raw_a.dtype + assert a.flags["C_CONTIGUOUS"] == raw_a.flags["C_CONTIGUOUS"] + assert a.flags["F_CONTIGUOUS"] == raw_a.flags["F_CONTIGUOUS"] + + assert b.dtype == raw_b.dtype + assert b.flags["C_CONTIGUOUS"] == raw_b.flags["C_CONTIGUOUS"] + assert b.flags["F_CONTIGUOUS"] == raw_b.flags["F_CONTIGUOUS"] + + # no copy + raw_a = np.random.rand(2, 4) + raw_b = np.ascontiguousarray(raw_a) + + a = tensor(raw_a, chunk_size=2) + b = ascontiguousarray(a) + + assert a.dtype == raw_a.dtype + assert a.flags["C_CONTIGUOUS"] == raw_a.flags["C_CONTIGUOUS"] + assert a.flags["F_CONTIGUOUS"] == raw_a.flags["F_CONTIGUOUS"] + + assert b.dtype == raw_b.dtype + assert b.flags["C_CONTIGUOUS"] == raw_b.flags["C_CONTIGUOUS"] + assert b.flags["F_CONTIGUOUS"] == raw_b.flags["F_CONTIGUOUS"] + + +def test_asfortranarray(): + # dtype different + raw_a = np.random.rand(2, 4) + raw_b = np.asfortranarray(raw_a, dtype="f4") + + a = tensor(raw_a, chunk_size=2) + b = asfortranarray(a, dtype="f4") + + assert a.dtype == raw_a.dtype + assert a.flags["C_CONTIGUOUS"] == raw_a.flags["C_CONTIGUOUS"] + assert a.flags["F_CONTIGUOUS"] == raw_a.flags["F_CONTIGUOUS"] + + assert b.dtype == raw_b.dtype + assert b.flags["C_CONTIGUOUS"] == raw_b.flags["C_CONTIGUOUS"] + assert b.flags["F_CONTIGUOUS"] == raw_b.flags["F_CONTIGUOUS"] + + # no copy + raw_a = np.asfortranarray(np.random.rand(2, 4)) + raw_b = np.asfortranarray(raw_a) + + a = tensor(raw_a, chunk_size=2) + b = asfortranarray(a) + + assert a.dtype == raw_a.dtype + assert a.flags["C_CONTIGUOUS"] == raw_a.flags["C_CONTIGUOUS"] + assert a.flags["F_CONTIGUOUS"] == raw_a.flags["F_CONTIGUOUS"] + + assert b.dtype == raw_b.dtype + assert b.flags["C_CONTIGUOUS"] == raw_b.flags["C_CONTIGUOUS"] + assert b.flags["F_CONTIGUOUS"] == raw_b.flags["F_CONTIGUOUS"] + + +def test_ones(): + tensor = ones((10, 10, 8), chunk_size=(3, 3, 5)) + tensor = tile(tensor) + assert tensor.shape == (10, 10, 8) + assert len(tensor.chunks) == 32 + + tensor = ones((10, 3), chunk_size=(4, 2)) + tensor = tile(tensor) + assert tensor.shape == (10, 3) + + chunk = tensor.cix[1, 1] + assert tensor.get_chunk_slices(chunk.index) == (slice(4, 8), slice(2, 3)) + + tensor = ones((10, 5), chunk_size=(2, 3), gpu=True) + tensor = tile(tensor) + + assert tensor.op.gpu is True + assert tensor.chunks[0].op.gpu is True + + tensor1 = ones((10, 10, 8), chunk_size=(3, 3, 5)) + 
tensor1 = tile(tensor1) + + tensor2 = ones((10, 10, 8), chunk_size=(3, 3, 5)) + tensor2 = tile(tensor2) + + assert tensor1.chunks[0].op.key == tensor2.chunks[0].op.key + assert tensor1.chunks[0].key == tensor2.chunks[0].key + assert tensor1.chunks[0].op.key != tensor1.chunks[1].op.key + assert tensor1.chunks[0].key != tensor1.chunks[1].key + + tensor = ones((2, 3, 4)) + assert len(list(tensor)) == 2 + + tensor2 = ones((2, 3, 4), chunk_size=1) + assert tensor.op.key != tensor2.op.key + assert tensor.key != tensor2.key + + tensor3 = ones((2, 3, 3)) + assert tensor.op.key != tensor3.op.key + assert tensor.key != tensor3.key + + # test create chunk op of ones manually + chunk_op1 = TensorOnes(dtype=tensor.dtype) + chunk1 = chunk_op1.new_chunk(None, shape=(3, 3), index=(0, 0)) + chunk_op2 = TensorOnes(dtype=tensor.dtype) + chunk2 = chunk_op2.new_chunk(None, shape=(3, 4), index=(0, 1)) + assert chunk1.op.key != chunk2.op.key + assert chunk1.key != chunk2.key + + tensor = ones((100, 100), chunk_size=50) + tensor = tile(tensor) + assert len({c.op.key for c in tensor.chunks}) == 1 + assert len({c.key for c in tensor.chunks}) == 1 + + +def test_zeros(): + tensor = zeros((2, 3, 4)) + assert len(list(tensor)) == 2 + assert tensor.op.gpu is None + + tensor2 = zeros((2, 3, 4), chunk_size=1) + # tensor's op key must be equal to tensor2 + assert tensor.op.key != tensor2.op.key + assert tensor.key != tensor2.key + + tensor3 = zeros((2, 3, 3)) + assert tensor.op.key != tensor3.op.key + assert tensor.key != tensor3.key + + # test create chunk op of zeros manually + chunk_op1 = TensorZeros(dtype=tensor.dtype) + chunk1 = chunk_op1.new_chunk(None, shape=(3, 3), index=(0, 0)) + chunk_op2 = TensorZeros(dtype=tensor.dtype) + chunk2 = chunk_op2.new_chunk(None, shape=(3, 4), index=(0, 1)) + assert chunk1.op.key != chunk2.op.key + assert chunk1.key != chunk2.key + + tensor = zeros((100, 100), chunk_size=50) + tensor = tile(tensor) + assert len({c.op.key for c in tensor.chunks}) == 1 + assert len({c.key for c in tensor.chunks}) == 1 + + +def test_data_source(): + from ...base.broadcast_to import TensorBroadcastTo + + data = np.random.random((10, 3)) + t = tensor(data, chunk_size=2) + assert t.op.gpu is None + t = tile(t) + assert (t.chunks[0].op.data == data[:2, :2]).all() + assert (t.chunks[1].op.data == data[:2, 2:3]).all() + assert (t.chunks[2].op.data == data[2:4, :2]).all() + assert (t.chunks[3].op.data == data[2:4, 2:3]).all() + + assert t.key == tile(tensor(data, chunk_size=2)).key + assert t.key != tile(tensor(data, chunk_size=3)).key + assert t.key != tile(tensor(np.random.random((10, 3)), chunk_size=2)).key + + t = tensor(data, chunk_size=2, gpu=True) + t = tile(t) + + assert t.op.gpu is True + assert t.chunks[0].op.gpu is True + + t = full((2, 2), 2, dtype="f4") + assert t.op.gpu is None + assert t.shape == (2, 2) + assert t.dtype == np.float32 + + t = full((2, 2), [1.0, 2.0], dtype="f4") + assert t.shape == (2, 2) + assert t.dtype == np.float32 + assert isinstance(t.op, TensorBroadcastTo) + + with pytest.raises(ValueError): + full((2, 2), [1.0, 2.0, 3.0], dtype="f4") + + +def test_ufunc(): + t = ones((3, 10), chunk_size=2) + + x = np.add(t, [[1], [2], [3]]) + assert isinstance(x, Tensor) + + y = np.sum(t, axis=1) + assert isinstance(y, Tensor) + + +def test_arange(): + t = arange(10, chunk_size=3) + + assert t.op.gpu is False + t = tile(t) + + assert t.shape == (10,) + assert t.nsplits == ((3, 3, 3, 1),) + assert t.chunks[1].op.start == 3 + assert t.chunks[1].op.stop == 6 + + t = arange(0, 10, 3, 
chunk_size=2) + t = tile(t) + + assert t.shape == (4,) + assert t.nsplits == ((2, 2),) + assert t.chunks[0].op.start == 0 + assert t.chunks[0].op.stop == 6 + assert t.chunks[0].op.step == 3 + assert t.chunks[1].op.start == 6 + assert t.chunks[1].op.stop == 12 + assert t.chunks[1].op.step == 3 + + pytest.raises(TypeError, lambda: arange(10, start=0)) + pytest.raises(TypeError, lambda: arange(0, 10, stop=0)) + pytest.raises(TypeError, lambda: arange()) + pytest.raises( + ValueError, lambda: arange("1066-10-13", dtype=np.datetime64, chunks=3) + ) + + +def test_diag(): + # test 2-d, shape[0] == shape[1], k == 0 + v = tensor(np.arange(16).reshape(4, 4), chunk_size=2) + t = diag(v) + + assert t.shape == (4,) + assert t.op.gpu is None + t = tile(t) + assert t.nsplits == ((2, 2),) + + v = tensor(np.arange(16).reshape(4, 4), chunk_size=(2, 3)) + t = diag(v) + + assert t.shape == (4,) + t = tile(t) + assert t.nsplits == ((2, 1, 1),) + + # test 1-d, k == 0 + v = tensor(np.arange(3), chunk_size=2) + t = diag(v, sparse=True) + + assert t.shape == (3, 3) + t = tile(t) + assert t.nsplits == ((2, 1), (2, 1)) + assert len([c for c in t.chunks if c.op.__class__.__name__ == "TensorDiag"]) == 2 + assert t.chunks[0].op.sparse is True + + # test 2-d, shape[0] != shape[1] + v = tensor(np.arange(24).reshape(4, 6), chunk_size=2) + t = diag(v) + + assert t.shape == np.diag(np.arange(24).reshape(4, 6)).shape + t = tile(t) + assert tuple(sum(s) for s in t.nsplits) == t.shape + + v = tensor(np.arange(24).reshape(4, 6), chunk_size=2) + + t = diag(v, k=1) + assert t.shape == np.diag(np.arange(24).reshape(4, 6), k=1).shape + t = tile(t) + assert tuple(sum(s) for s in t.nsplits) == t.shape + + t = diag(v, k=2) + assert t.shape == np.diag(np.arange(24).reshape(4, 6), k=2).shape + t = tile(t) + assert tuple(sum(s) for s in t.nsplits) == t.shape + + t = diag(v, k=-1) + assert t.shape == np.diag(np.arange(24).reshape(4, 6), k=-1).shape + t = tile(t) + assert tuple(sum(s) for s in t.nsplits) == t.shape + + t = diag(v, k=-2) + assert t.shape == np.diag(np.arange(24).reshape(4, 6), k=-2).shape + t = tile(t) + assert tuple(sum(s) for s in t.nsplits) == t.shape + + # test tiled zeros' keys + a = arange(5, chunk_size=2) + t = diag(a) + t = tile(t) + # 1 and 2 of t.chunks is ones, they have different shapes + assert t.chunks[1].op.key != t.chunks[2].op.key + + +def test_linspace(): + a = linspace(2.0, 3.0, num=5, chunk_size=2) + + assert a.shape == (5,) + + a = tile(a) + assert a.nsplits == ((2, 2, 1),) + assert a.chunks[0].op.start == 2.0 + assert a.chunks[0].op.stop == 2.25 + assert a.chunks[1].op.start == 2.5 + assert a.chunks[1].op.stop == 2.75 + assert a.chunks[2].op.start == 3.0 + assert a.chunks[2].op.stop == 3.0 + + a = linspace(2.0, 3.0, num=5, endpoint=False, chunk_size=2) + + assert a.shape == (5,) + + a = tile(a) + assert a.nsplits == ((2, 2, 1),) + assert a.chunks[0].op.start == 2.0 + assert a.chunks[0].op.stop == 2.2 + assert a.chunks[1].op.start == 2.4 + assert a.chunks[1].op.stop == 2.6 + assert a.chunks[2].op.start == 2.8 + assert a.chunks[2].op.stop == 2.8 + + _, step = linspace(2.0, 3.0, num=5, chunk_size=2, retstep=True) + assert step == 0.25 + + +def test_triu_tril(): + a_data = np.arange(12).reshape(4, 3) + a = tensor(a_data, chunk_size=2) + + t = triu(a) + + assert t.op.gpu is None + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTriu) + assert isinstance(t.chunks[1].op, TensorTriu) + assert isinstance(t.chunks[2].op, TensorZeros) + assert isinstance(t.chunks[3].op, 
TensorTriu) + + t = triu(a, k=1) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTriu) + assert isinstance(t.chunks[1].op, TensorTriu) + assert isinstance(t.chunks[2].op, TensorZeros) + assert isinstance(t.chunks[3].op, TensorZeros) + + t = triu(a, k=2) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorZeros) + assert isinstance(t.chunks[1].op, TensorTriu) + assert isinstance(t.chunks[2].op, TensorZeros) + assert isinstance(t.chunks[3].op, TensorZeros) + + t = triu(a, k=-1) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTriu) + assert isinstance(t.chunks[1].op, TensorTriu) + assert isinstance(t.chunks[2].op, TensorTriu) + assert isinstance(t.chunks[3].op, TensorTriu) + + t = tril(a) + + assert t.op.gpu is None + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTril) + assert isinstance(t.chunks[1].op, TensorZeros) + assert isinstance(t.chunks[2].op, TensorTril) + assert isinstance(t.chunks[3].op, TensorTril) + + t = tril(a, k=1) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTril) + assert isinstance(t.chunks[1].op, TensorTril) + assert isinstance(t.chunks[2].op, TensorTril) + assert isinstance(t.chunks[3].op, TensorTril) + + t = tril(a, k=-1) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTril) + assert isinstance(t.chunks[1].op, TensorZeros) + assert isinstance(t.chunks[2].op, TensorTril) + assert isinstance(t.chunks[3].op, TensorTril) + + t = tril(a, k=-2) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorZeros) + assert isinstance(t.chunks[1].op, TensorZeros) + assert isinstance(t.chunks[2].op, TensorTril) + assert isinstance(t.chunks[3].op, TensorZeros) + + +def test_set_tensor_inputs(): + t1 = tensor([1, 2], chunk_size=2) + t2 = tensor([2, 3], chunk_size=2) + t3 = t1 + t2 + + t1c = copy(t1) + t2c = copy(t2) + + assert t1c is not t1 + assert t2c is not t2 + + assert t3.op.lhs is t1.data + assert t3.op.rhs is t2.data + assert t3.op.inputs == [t1.data, t2.data] + assert t3.inputs == [t1.data, t2.data] + + with pytest.raises(StopIteration): + t3.inputs = [] + + t1 = tensor([1, 2], chunk_size=2) + t2 = tensor([True, False], chunk_size=2) + t3 = t1[t2] + + t1c = copy(t1) + t2c = copy(t2) + t3c = copy(t3) + t3c.inputs = [t1c, t2c] + + with enter_mode(build=True): + assert t3c.op.input is t1c.data + assert t3c.op.indexes[0] is t2c.data + + +def test_from_spmatrix(): + t = tensor(sps.csr_matrix([[0, 0, 1], [1, 0, 0]], dtype="f8"), chunk_size=2) + + assert isinstance(t, SparseTensor) + assert isinstance(t.op, CSRMatrixDataSource) + assert t.issparse() is True + assert not t.op.gpu + + t = tile(t) + assert t.chunks[0].index == (0, 0) + assert isinstance(t.op, CSRMatrixDataSource) + assert not t.op.gpu + m = sps.csr_matrix([[0, 0], [1, 0]]) + assert np.array_equal(t.chunks[0].op.indices, m.indices) is True + assert np.array_equal(t.chunks[0].op.indptr, m.indptr) is True + assert np.array_equal(t.chunks[0].op.data, m.data) is True + assert np.array_equal(t.chunks[0].op.shape, m.shape) is True + + +def test_from_dense(): + t = fromdense(tensor([[0, 0, 1], [1, 0, 0]], chunk_size=2)) + + assert isinstance(t, SparseTensor) + assert isinstance(t.op, DenseToSparse) + assert t.issparse() is True + + t = tile(t) + assert t.chunks[0].index == (0, 0) + assert isinstance(t.op, DenseToSparse) + + +def test_ones_like(): + t1 = tensor([[0, 0, 1], [1, 0, 
0]], chunk_size=2).tosparse() + t = ones_like(t1, dtype="f8") + + assert isinstance(t, SparseTensor) + assert isinstance(t.op, TensorOnesLike) + assert t.issparse() is True + assert t.op.gpu is None + + t = tile(t) + assert t.chunks[0].index == (0, 0) + assert isinstance(t.op, TensorOnesLike) + assert t.chunks[0].issparse() is True + + +def test_from_array(): + x = array([1, 2, 3]) + assert x.shape == (3,) + + y = array([x, x]) + assert y.shape == (2, 3) + + z = array((x, x, x)) + assert z.shape == (3, 3) + + +@pytest.mark.skipif(tiledb is None, reason="TileDB not installed") +def test_from_tile_db(): + ctx = tiledb.Ctx() + + for sparse in (True, False): + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, name="i", domain=(1, 30), tile=7, dtype=np.int32), + tiledb.Dim(ctx=ctx, name="j", domain=(1, 20), tile=3, dtype=np.int32), + tiledb.Dim(ctx=ctx, name="k", domain=(1, 10), tile=4, dtype=np.int32), + ctx=ctx, + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=sparse, + attrs=[tiledb.Attr(ctx=ctx, name="a", dtype=np.float32)], + ) + + tempdir = tempfile.mkdtemp() + try: + # create tiledb array + array_type = tiledb.DenseArray if not sparse else tiledb.SparseArray + array_type.create(tempdir, schema) + + tensor = fromtiledb(tempdir) + assert isinstance(tensor.op, TensorTileDBDataSource) + assert tensor.op.issparse() == sparse + assert tensor.shape == (30, 20, 10) + assert tensor.extra_params.raw_chunk_size == (7, 3, 4) + assert tensor.op.tiledb_config is None + assert tensor.op.tiledb_uri == tempdir + assert tensor.op.tiledb_key is None + assert tensor.op.tiledb_timestamp is None + + tensor = tile(tensor) + + assert len(tensor.chunks) == 105 + assert isinstance(tensor.chunks[0].op, TensorTileDBDataSource) + assert tensor.chunks[0].op.issparse() == sparse + assert tensor.chunks[0].shape == (7, 3, 4) + assert tensor.chunks[0].op.tiledb_config is None + assert tensor.chunks[0].op.tiledb_uri == tempdir + assert tensor.chunks[0].op.tiledb_key is None + assert tensor.chunks[0].op.tiledb_timestamp is None + assert tensor.chunks[0].op.tiledb_dim_starts == (1, 1, 1) + + # test axis_offsets of chunk op + assert tensor.chunks[0].op.axis_offsets == (0, 0, 0) + assert tensor.chunks[1].op.axis_offsets == (0, 0, 4) + assert tensor.cix[0, 2, 2].op.axis_offsets == (0, 6, 8) + assert tensor.cix[0, 6, 2].op.axis_offsets == (0, 18, 8) + assert tensor.cix[4, 6, 2].op.axis_offsets == (28, 18, 8) + + tensor2 = fromtiledb(tempdir, ctx=ctx) + assert tensor2.op.tiledb_config == ctx.config().dict() + + tensor2 = tile(tensor2) + + assert tensor2.chunks[0].op.tiledb_config == ctx.config().dict() + finally: + shutil.rmtree(tempdir) + + +@pytest.mark.skipif(tiledb is None, reason="TileDB not installed") +def test_dim_start_float(): + ctx = tiledb.Ctx() + + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, name="i", domain=(0.0, 6.0), tile=6, dtype=np.float64), + ctx=ctx, + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=True, + attrs=[tiledb.Attr(ctx=ctx, name="a", dtype=np.float32)], + ) + + tempdir = tempfile.mkdtemp() + try: + # create tiledb array + tiledb.SparseArray.create(tempdir, schema) + + with pytest.raises(ValueError): + fromtiledb(tempdir, ctx=ctx) + finally: + shutil.rmtree(tempdir) + + +def test_from_dataframe(): + mdf = md.DataFrame( + {"a": [0, 1, 2], "b": [3, 4, 5], "c": [0.1, 0.2, 0.3]}, + index=["c", "d", "e"], + chunk_size=2, + ) + tensor = from_dataframe(mdf) + assert tensor.shape == (3, 3) + assert np.float64 == tensor.dtype diff --git 
a/python/xorbits/_mars/tensor/datasource/tests/test_datasource_execution.py b/python/xorbits/_mars/tensor/datasource/tests/test_datasource_execution.py new file mode 100644 index 000000000..85769a551 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/tests/test_datasource_execution.py @@ -0,0 +1,1168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile +import time + +import numpy as np +import pandas as pd +import pytest + +try: + import scipy.sparse as sps +except ImportError: + sps = None +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None +try: + import h5py +except ImportError: # pragma: no cover + h5py = None +try: + import zarr +except ImportError: # pragma: no cover + zarr = None + +from .... import dataframe as md +from .... import tensor as mt +from ....lib.sparse import SparseNDArray +from ....tests.core import require_cupy +from ....utils import lazy_import +from ...lib import nd_grid +from .. import ( + arange, + diag, + diagflat, + empty, + empty_like, + eye, + from_dataframe, + fromhdf5, + fromtiledb, + fromzarr, + full, + full_like, + indices, + linspace, + meshgrid, + ones_like, + tensor, + tril, + triu, + zeros, + zeros_like, +) + +cupy = lazy_import("cupy") + + +@require_cupy +def test_array_gpu_execution(setup_gpu): + raw = cupy.random.rand(20, 30) + t = tensor(raw, dtype="f8", chunk_size=10) + + res = t.execute().fetch() + expected = raw.astype("f8") + cupy.testing.assert_array_equal(res, expected) + + +def test_create_sparse_execution(setup): + mat = sps.csr_matrix([[0, 0, 2], [2, 0, 0]]) + t = tensor(mat, dtype="f8", chunk_size=2) + + res = t.execute().fetch() + assert isinstance(res, SparseNDArray) + assert res.dtype == np.float64 + np.testing.assert_array_equal(res.toarray(), mat.toarray()) + + t2 = ones_like(t, dtype="f4") + + res = t2.execute().fetch() + expected = sps.csr_matrix([[0, 0, 1], [1, 0, 0]]) + assert isinstance(res, SparseNDArray) + assert res.dtype == np.float32 + np.testing.assert_array_equal(res.toarray(), expected.toarray()) + + t3 = tensor(np.array([[0, 0, 2], [2, 0, 0]]), chunk_size=2).tosparse() + + res = t3.execute().fetch() + assert isinstance(res, SparseNDArray) + assert res.dtype == np.int_ + np.testing.assert_array_equal(res.toarray(), mat.toarray()) + + # test missing argument + t4 = tensor(np.array([[0, 0, 2], [2, 0, 0]]), chunk_size=2).tosparse(missing=2) + t4 = t4 + 1 + expected = mat.toarray() + raw = expected.copy() + expected[raw == 0] += 1 + expected[raw != 0] = 0 + + res = t4.execute().fetch() + assert isinstance(res, SparseNDArray) + assert res.dtype == np.int_ + np.testing.assert_array_equal(res.toarray(), expected) + + # test missing argument that is np.nan + t5 = tensor( + np.array([[np.nan, np.nan, 2], [2, np.nan, -999]]), chunk_size=2 + ).tosparse(missing=[np.nan, -999]) + t5 = (t5 + 1).todense(fill_value=np.nan) + expected = mat.toarray().astype(float) + expected[expected != 0] += 1 + expected[expected == 
0] = np.nan + + res = t5.execute().fetch() + assert res.dtype == np.float64 + np.testing.assert_array_equal(res, expected) + + +def test_zeros_execution(setup): + t = zeros((20, 30), dtype="i8", chunk_size=10) + + res = t.execute().fetch() + np.testing.assert_array_equal(res, np.zeros((20, 30), dtype="i8")) + assert res[0].dtype == np.int64 + + t2 = zeros_like(t) + res = t2.execute().fetch() + np.testing.assert_array_equal(res, np.zeros((20, 30), dtype="i8")) + assert res[0].dtype == np.int64 + + t = zeros((20, 30), dtype="i4", chunk_size=5, sparse=True) + res = t.execute().fetch() + + assert res[0].nnz == 0 + + t = zeros((20, 30), dtype="i8", chunk_size=6, order="F") + res = t.execute().fetch() + expected = np.zeros((20, 30), dtype="i8", order="F") + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_empty_execution(setup): + t = empty((20, 30), dtype="i8", chunk_size=5) + + res = t.execute().fetch() + assert res.shape == (20, 30) + assert res.dtype == np.int64 + + t = empty((20, 30), chunk_size=10) + + res = t.execute().fetch() + assert res.shape == (20, 30) + assert res.dtype == np.float64 + + t2 = empty_like(t) + res = t2.execute().fetch() + assert res.shape == (20, 30) + assert res.dtype == np.float64 + + t = empty((20, 30), dtype="i8", chunk_size=5, order="F") + + res = t.execute().fetch() + expected = np.empty((20, 30), dtype="i8", order="F") + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_full_execution(setup): + t = full((2, 2), 1, dtype="f4", chunk_size=1) + + res = t.execute().fetch() + np.testing.assert_array_equal(res, np.full((2, 2), 1, dtype="f4")) + + t = full((2, 2), [1, 2], dtype="f8", chunk_size=1) + + res = t.execute().fetch() + np.testing.assert_array_equal(res, np.full((2, 2), [1, 2], dtype="f8")) + + t = full((2, 2), 1, dtype="f4", chunk_size=1, order="F") + + res = t.execute().fetch() + expected = np.full((2, 2), 1, dtype="f4", order="F") + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + t2 = full_like(t, 10, order="F") + + res = t2.execute().fetch() + expected = np.full((2, 2), 10, dtype="f4", order="F") + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_arange_execution(setup): + t = arange(1, 20, 3, chunk_size=2) + + res = t.execute().fetch() + assert np.array_equal(res, np.arange(1, 20, 3)) is True + + t = arange(1, 20, 0.3, chunk_size=4) + + res = t.execute().fetch() + expected = np.arange(1, 20, 0.3) + assert np.allclose(res, expected) is True + + t = arange(1.0, 1.8, 0.3, chunk_size=4) + + res = t.execute().fetch() + expected = np.arange(1.0, 1.8, 0.3) + assert np.allclose(res, expected) is True + + t = arange("1066-10-13", "1066-10-31", dtype=np.datetime64, chunk_size=3) + + res = t.execute().fetch() + expected = np.arange("1066-10-13", "1066-10-31", dtype=np.datetime64) + assert np.array_equal(res, expected) is True + + +def test_diag_execution(setup): + # 2-d 6 * 6 + a = arange(36, chunk_size=5).reshape(6, 6) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(6, 6)) + np.testing.assert_equal(res, expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = 
np.diag(np.arange(36).reshape(6, 6), k=1) + np.testing.assert_equal(res, expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(6, 6), k=3) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(6, 6), k=-2) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-5) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(6, 6), k=-5) + np.testing.assert_equal(res, expected) + + # 2-d 6 * 6 sparse, no tensor + a = sps.rand(6, 6, density=0.1) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(a.toarray()) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=1) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=3) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=-2) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-5) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=-5) + np.testing.assert_equal(res.toarray(), expected) + + # 2-d 6 * 6 sparse, from tensor + raw_a = sps.rand(6, 6, density=0.1) + a = tensor(raw_a, chunk_size=2) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray()) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=1) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=3) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=-2) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-5) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=-5) + np.testing.assert_equal(res.toarray(), expected) + + # 2-d 4 * 9 + a = arange(36, chunk_size=2).reshape(4, 9) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(4, 9)) + np.testing.assert_equal(res, expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(4, 9), k=1) + np.testing.assert_equal(res, expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(4, 9), k=3) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(4, 9), k=-2) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-3) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(4, 9), k=-3) + np.testing.assert_equal(res, expected) + + # 2-d 4 * 9 sparse, no tensor + a = sps.rand(4, 9, density=0.1) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(a.toarray()) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=1) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=3) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=-2) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-3) + res = d.execute().fetch() + expected = 
np.diag(a.toarray(), k=-3) + np.testing.assert_equal(res.toarray(), expected) + + # 2-d 4 * 9 sparse, from tensor + raw_a = sps.rand(4, 9, density=0.1) + a = tensor(raw_a, chunk_size=2) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray()) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=1) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=3) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=-2) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-3) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=-3) + np.testing.assert_equal(res.toarray(), expected) + + # 1-d + a = arange(5, chunk_size=2) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(np.arange(5)) + np.testing.assert_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] is True + assert res.flags["F_CONTIGUOUS"] is False + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=1) + np.testing.assert_equal(res, expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=3) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=-2) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-3) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=-3) + np.testing.assert_equal(res, expected) + + d = diag(a, sparse=True) + res = d.execute().fetch() + expected = np.diag(np.arange(5)) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=1, sparse=True) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=2, sparse=True) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-2, sparse=True) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=-2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-3, sparse=True) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=-3) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + +def test_diagflat_execution(setup): + a = diagflat([[1, 2], [3, 4]], chunk_size=1) + + res = a.execute().fetch() + expected = np.diagflat([[1, 2], [3, 4]]) + np.testing.assert_equal(res, expected) + + d = tensor([[1, 2], [3, 4]], chunk_size=1) + a = diagflat(d) + + res = a.execute().fetch() + expected = np.diagflat([[1, 2], [3, 4]]) + np.testing.assert_equal(res, expected) + + a = diagflat([1, 2], 1, chunk_size=1) + + res = a.execute().fetch() + expected = np.diagflat([1, 2], 1) + np.testing.assert_equal(res, expected) + + d = tensor([[1, 2]], chunk_size=1) + a = diagflat(d, 1) + + res = a.execute().fetch() + expected = np.diagflat([1, 2], 1) + np.testing.assert_equal(res, expected) + + +def test_eye_execution(setup): + t = eye(5, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5) + np.testing.assert_equal(res, expected) + + t = eye(5, k=1, chunk_size=2) + + res = t.execute().fetch() + expected 
= np.eye(5, k=1) + np.testing.assert_equal(res, expected) + + t = eye(5, k=2, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=2) + np.testing.assert_equal(res, expected) + + t = eye(5, k=-1, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=-1) + np.testing.assert_equal(res, expected) + + t = eye(5, k=-3, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=-3) + np.testing.assert_equal(res, expected) + + t = eye(5, M=3, k=1, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=3, k=1) + np.testing.assert_equal(res, expected) + + t = eye(5, M=3, k=-3, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=3, k=-3) + np.testing.assert_equal(res, expected) + + t = eye(5, M=7, k=1, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=7, k=1) + np.testing.assert_equal(res, expected) + + t = eye(5, M=8, k=-3, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=8, k=-3) + np.testing.assert_equal(res, expected) + + t = eye(2, dtype=int) + + res = t.execute().fetch() + assert res.dtype == np.int_ + + # test sparse + t = eye(5, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, k=1, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, k=2, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, k=-1, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=-1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, k=-3, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=-3) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, M=3, k=1, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=3, k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, M=3, k=-3, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=3, k=-3) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, M=7, k=1, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=7, k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, M=8, k=-3, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=8, k=-3) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, M=9, k=-3, chunk_size=2, order="F") + + res = t.execute().fetch() + assert res.flags["C_CONTIGUOUS"] is True + assert res.flags["F_CONTIGUOUS"] is False + + +def test_linspace_execution(setup): + a = linspace(2.0, 9.0, num=11, chunk_size=3) + + res = a.execute().fetch() + expected = np.linspace(2.0, 9.0, num=11) + np.testing.assert_allclose(res, expected) + + a = linspace(2.0, 9.0, num=11, endpoint=False, chunk_size=3) + + res = a.execute().fetch() + expected = np.linspace(2.0, 9.0, num=11, endpoint=False) + np.testing.assert_allclose(res, expected) + + a = linspace(2.0, 9.0, num=11, 
chunk_size=3, dtype=int) + + res = a.execute().fetch() + assert res.dtype == np.int_ + + +def test_meshgrid_execution(setup): + a = arange(5, chunk_size=2) + b = arange(6, 12, chunk_size=3) + c = arange(12, 19, chunk_size=4) + + A, B, C = meshgrid(a, b, c) + + A_res = A.execute().fetch() + A_expected = np.meshgrid(np.arange(5), np.arange(6, 12), np.arange(12, 19))[0] + np.testing.assert_equal(A_res, A_expected) + + B_res = B.execute().fetch() + B_expected = np.meshgrid(np.arange(5), np.arange(6, 12), np.arange(12, 19))[1] + np.testing.assert_equal(B_res, B_expected) + + C_res = C.execute().fetch() + C_expected = np.meshgrid(np.arange(5), np.arange(6, 12), np.arange(12, 19))[2] + np.testing.assert_equal(C_res, C_expected) + + A, B, C = meshgrid(a, b, c, indexing="ij") + + A_res = A.execute().fetch() + A_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij" + )[0] + np.testing.assert_equal(A_res, A_expected) + + B_res = B.execute().fetch() + B_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij" + )[1] + np.testing.assert_equal(B_res, B_expected) + + C_res = C.execute().fetch() + C_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij" + )[2] + np.testing.assert_equal(C_res, C_expected) + + A, B, C = meshgrid(a, b, c, sparse=True) + + A_res = A.execute().fetch() + A_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), sparse=True + )[0] + np.testing.assert_equal(A_res, A_expected) + + B_res = B.execute().fetch() + B_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), sparse=True + )[1] + np.testing.assert_equal(B_res, B_expected) + + C_res = C.execute().fetch() + C_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), sparse=True + )[2] + np.testing.assert_equal(C_res, C_expected) + + A, B, C = meshgrid(a, b, c, indexing="ij", sparse=True) + + A_res = A.execute().fetch() + A_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij", sparse=True + )[0] + np.testing.assert_equal(A_res, A_expected) + + B_res = B.execute().fetch() + B_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij", sparse=True + )[1] + np.testing.assert_equal(B_res, B_expected) + + C_res = C.execute().fetch() + C_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij", sparse=True + )[2] + np.testing.assert_equal(C_res, C_expected) + + +def test_indices_execution(setup): + grid = indices((2, 3), chunk_size=1) + + res = grid.execute().fetch() + expected = np.indices((2, 3)) + np.testing.assert_equal(res, expected) + + res = grid[0].execute().fetch() + np.testing.assert_equal(res, expected[0]) + + res = grid[1].execute().fetch() + np.testing.assert_equal(res, expected[1]) + + +def test_triu_execution(setup): + a = arange(24, chunk_size=2).reshape(2, 3, 4) + + t = triu(a) + + res = t.execute().fetch() + expected = np.triu(np.arange(24).reshape(2, 3, 4)) + np.testing.assert_equal(res, expected) + + t = triu(a, k=1) + + res = t.execute().fetch() + expected = np.triu(np.arange(24).reshape(2, 3, 4), k=1) + np.testing.assert_equal(res, expected) + + t = triu(a, k=2) + + res = t.execute().fetch() + expected = np.triu(np.arange(24).reshape(2, 3, 4), k=2) + np.testing.assert_equal(res, expected) + + t = triu(a, k=-1) + + res = t.execute().fetch() + expected = np.triu(np.arange(24).reshape(2, 3, 4), k=-1) + np.testing.assert_equal(res, expected) + + t 
= triu(a, k=-2) + + res = t.execute().fetch() + expected = np.triu(np.arange(24).reshape(2, 3, 4), k=-2) + np.testing.assert_equal(res, expected) + + # test sparse + a = arange(12, chunk_size=2).reshape(3, 4).tosparse() + + t = triu(a) + + res = t.execute().fetch() + expected = np.triu(np.arange(12).reshape(3, 4)) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = triu(a, k=1) + + res = t.execute().fetch() + expected = np.triu(np.arange(12).reshape(3, 4), k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = triu(a, k=2) + + res = t.execute().fetch() + expected = np.triu(np.arange(12).reshape(3, 4), k=2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = triu(a, k=-1) + + res = t.execute().fetch() + expected = np.triu(np.arange(12).reshape(3, 4), k=-1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = triu(a, k=-2) + + res = t.execute().fetch() + expected = np.triu(np.arange(12).reshape(3, 4), k=-2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + raw = np.asfortranarray(np.random.rand(10, 7)) + a = tensor(raw, chunk_size=3) + + t = triu(a, k=-2) + + res = t.execute().fetch() + expected = np.triu(raw, k=-2) + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_tril_execution(setup): + a = arange(24, chunk_size=2).reshape(2, 3, 4) + + t = tril(a) + + res = t.execute().fetch() + expected = np.tril(np.arange(24).reshape(2, 3, 4)) + np.testing.assert_equal(res, expected) + + t = tril(a, k=1) + + res = t.execute().fetch() + expected = np.tril(np.arange(24).reshape(2, 3, 4), k=1) + np.testing.assert_equal(res, expected) + + t = tril(a, k=2) + + res = t.execute().fetch() + expected = np.tril(np.arange(24).reshape(2, 3, 4), k=2) + np.testing.assert_equal(res, expected) + + t = tril(a, k=-1) + + res = t.execute().fetch() + expected = np.tril(np.arange(24).reshape(2, 3, 4), k=-1) + np.testing.assert_equal(res, expected) + + t = tril(a, k=-2) + + res = t.execute().fetch() + expected = np.tril(np.arange(24).reshape(2, 3, 4), k=-2) + np.testing.assert_equal(res, expected) + + a = arange(12, chunk_size=2).reshape(3, 4).tosparse() + + t = tril(a) + + res = t.execute().fetch() + expected = np.tril(np.arange(12).reshape(3, 4)) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = tril(a, k=1) + + res = t.execute().fetch() + expected = np.tril(np.arange(12).reshape(3, 4), k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = tril(a, k=2) + + res = t.execute().fetch() + expected = np.tril(np.arange(12).reshape(3, 4), k=2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = tril(a, k=-1) + + res = t.execute().fetch() + expected = np.tril(np.arange(12).reshape(3, 4), k=-1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = tril(a, k=-2) + + res = t.execute().fetch() + expected = np.tril(np.arange(12).reshape(3, 4), k=-2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + +def test_index_trick_execution(setup): + mgrid = nd_grid() + t = mgrid[0:5, 0:5] + + res = t.execute().fetch() + expected = np.lib.index_tricks.nd_grid()[0:5, 0:5] + np.testing.assert_equal(res, expected) + + t = mgrid[-1:1:5j] + + 
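+    # the complex step in -1:1:5j asks nd_grid for 5 evenly spaced samples
+    # from -1 to 1 (endpoint included), mirroring np.mgrid's imaginary-step
+    # semantics; the result is compared against the NumPy reference below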
res = t.execute().fetch() + expected = np.lib.index_tricks.nd_grid()[-1:1:5j] + np.testing.assert_equal(res, expected) + + ogrid = nd_grid(sparse=True) + + t = ogrid[0:5, 0:5] + + res = [o.execute().fetch() for o in t] + expected = np.lib.index_tricks.nd_grid(sparse=True)[0:5, 0:5] + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + +@pytest.mark.skipif(tiledb is None, reason="tiledb not installed") +def test_read_tile_db_execution(setup): + ctx = tiledb.Ctx() + + tempdir = tempfile.mkdtemp() + try: + # create TileDB dense array + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, domain=(1, 100), tile=30, dtype=np.int32), + tiledb.Dim(ctx=ctx, domain=(0, 90), tile=22, dtype=np.int32), + tiledb.Dim(ctx=ctx, domain=(0, 9), tile=8, dtype=np.int32), + ctx=ctx, + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=False, + attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)], + ) + tiledb.DenseArray.create(tempdir, schema) + + expected = np.random.rand(100, 91, 10) + with tiledb.DenseArray(uri=tempdir, ctx=ctx, mode="w") as arr: + arr.write_direct(expected) + + a = fromtiledb(tempdir, ctx=ctx) + result = a.execute().fetch() + + np.testing.assert_allclose(expected, result) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # create 2-d TileDB sparse array + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, domain=(0, 99), tile=30, dtype=np.int32), + tiledb.Dim(ctx=ctx, domain=(2, 11), tile=8, dtype=np.int32), + ctx=ctx, + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=True, + attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)], + ) + tiledb.SparseArray.create(tempdir, schema) + + expected = sps.rand(100, 10, density=0.01) + with tiledb.SparseArray(uri=tempdir, ctx=ctx, mode="w") as arr: + I, J = expected.row, expected.col + 2 + arr[I, J] = {arr.attr(0).name: expected.data} + + a = fromtiledb(tempdir, ctx=ctx) + result = a.execute().fetch() + + np.testing.assert_allclose(expected.toarray(), result.toarray()) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # create 1-d TileDB sparse array + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, domain=(1, 100), tile=30, dtype=np.int32), ctx=ctx + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=True, + attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)], + ) + tiledb.SparseArray.create(tempdir, schema) + + expected = sps.rand(1, 100, density=0.05) + with tiledb.SparseArray(uri=tempdir, ctx=ctx, mode="w") as arr: + arr[expected.col + 1] = expected.data + + a = fromtiledb(tempdir, ctx=ctx) + result = a.execute().fetch() + + np.testing.assert_allclose(expected.toarray()[0], result.toarray()) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # create TileDB dense array with column-major + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, domain=(1, 100), tile=30, dtype=np.int32), + tiledb.Dim(ctx=ctx, domain=(0, 90), tile=22, dtype=np.int32), + tiledb.Dim(ctx=ctx, domain=(0, 9), tile=8, dtype=np.int32), + ctx=ctx, + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=False, + cell_order="F", + attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)], + ) + tiledb.DenseArray.create(tempdir, schema) + + expected = np.asfortranarray(np.random.rand(100, 91, 10)) + with tiledb.DenseArray(uri=tempdir, ctx=ctx, mode="w") as arr: + arr.write_direct(expected) + + a = fromtiledb(tempdir, ctx=ctx) + result = a.execute().fetch() + + np.testing.assert_allclose(expected, result) + assert result.flags["F_CONTIGUOUS"] is True + assert 
result.flags["C_CONTIGUOUS"] is False + finally: + shutil.rmtree(tempdir) + + +def test_from_dataframe_execution(setup): + mdf = md.DataFrame( + {"angle": [0, 3, 4], "degree": [360, 180, 360]}, + index=["circle", "triangle", "rectangle"], + ) + tensor_result = from_dataframe(mdf).execute().fetch() + tensor_expected = mt.tensor([[0, 360], [3, 180], [4, 360]]).execute().fetch() + np.testing.assert_equal(tensor_result, tensor_expected) + + # test up-casting + mdf2 = md.DataFrame({"a": [0.1, 0.2, 0.3], "b": [1, 2, 3]}) + tensor_result2 = from_dataframe(mdf2).execute().fetch() + np.testing.assert_equal(tensor_result2[0].dtype, np.dtype("float64")) + tensor_expected2 = mt.tensor([[0.1, 1.0], [0.2, 2.0], [0.3, 3.0]]).execute().fetch() + np.testing.assert_equal(tensor_result2, tensor_expected2) + + raw = [[0.1, 0.2, 0.4], [0.4, 0.7, 0.3]] + mdf3 = md.DataFrame(raw, columns=list("abc"), chunk_size=2) + tensor_result3 = from_dataframe(mdf3).execute().fetch() + np.testing.assert_array_equal(tensor_result3, np.asarray(raw)) + assert tensor_result3.flags["F_CONTIGUOUS"] is True + assert tensor_result3.flags["C_CONTIGUOUS"] is False + + # test from series + series = md.Series([1, 2, 3]) + tensor_result = series.to_tensor().execute().fetch() + np.testing.assert_array_equal(tensor_result, np.array([1, 2, 3])) + + series = md.Series(range(10), chunk_size=3) + tensor_result = series.to_tensor().execute().fetch() + np.testing.assert_array_equal(tensor_result, np.arange(10)) + + # test from index + index = md.Index(pd.MultiIndex.from_tuples([(0, 1), (2, 3), (4, 5)])) + tensor_result = index.to_tensor(extract_multi_index=True).execute().fetch() + np.testing.assert_array_equal(tensor_result, np.arange(6).reshape((3, 2))) + + index = md.Index(pd.MultiIndex.from_tuples([(0, 1), (2, 3), (4, 5)])) + tensor_result = index.to_tensor(extract_multi_index=False).execute().fetch() + np.testing.assert_array_equal( + tensor_result, pd.MultiIndex.from_tuples([(0, 1), (2, 3), (4, 5)]).to_series() + ) + + +@pytest.mark.skipif(h5py is None, reason="h5py not installed") +def test_read_hdf5_execution(setup): + test_array = np.random.RandomState(0).rand(20, 10) + group_name = "test_group" + dataset_name = "test_dataset" + + with pytest.raises(TypeError): + fromhdf5(object()) + + with tempfile.TemporaryDirectory() as d: + filename = os.path.join(d, f"test_read_{int(time.time())}.hdf5") + with h5py.File(filename, "w") as f: + g = f.create_group(group_name) + g.create_dataset(dataset_name, chunks=(7, 4), data=test_array) + + # test filename + r = fromhdf5(filename, group=group_name, dataset=dataset_name) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + assert r.extra_params["raw_chunk_size"] == (7, 4) + + with pytest.raises(ValueError): + fromhdf5(filename) + + with pytest.raises(ValueError): + fromhdf5(filename, dataset="non_exist") + + with h5py.File(filename, "r") as f: + # test file + r = fromhdf5(f, group=group_name, dataset=dataset_name) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + + with pytest.raises(ValueError): + fromhdf5(f) + + with pytest.raises(ValueError): + fromhdf5(f, dataset="non_exist") + + # test dataset + ds = f[f"{group_name}/{dataset_name}"] + r = fromhdf5(ds) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + + +@pytest.mark.skipif(zarr is None, reason="zarr not installed") +def test_read_zarr_execution(setup): + session = setup + + test_array = np.random.RandomState(0).rand(20, 10) + group_name 
= "test_group" + dataset_name = "test_dataset" + + with pytest.raises(TypeError): + fromzarr(object()) + + with tempfile.TemporaryDirectory() as d: + path = os.path.join(d, f"test_read_{int(time.time())}.zarr") + + group = zarr.group(path) + arr = group.array(group_name + "/" + dataset_name, test_array, chunks=(7, 4)) + + r = fromzarr(arr) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + assert len(session._session._tileable_to_fetch[r.data].chunks) > 1 + + arr = zarr.open_array(f"{path}/{group_name}/{dataset_name}") + r = fromzarr(arr) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + assert len(session._session._tileable_to_fetch[r.data].chunks) > 1 + + r = fromzarr(path, group=group_name, dataset=dataset_name) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + assert len(session._session._tileable_to_fetch[r.data].chunks) > 1 + + r = fromzarr(path + "/" + group_name + "/" + dataset_name) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + assert len(session._session._tileable_to_fetch[r.data].chunks) > 1 diff --git a/python/xorbits/_mars/tensor/datasource/tri.py b/python/xorbits/_mars/tensor/datasource/tri.py new file mode 100644 index 000000000..c1c0ca4ed --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/tri.py @@ -0,0 +1,204 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...lib import sparse +from ...serialization.serializables import Int32Field, KeyField +from ...utils import has_unknown_shape +from ..array_utils import create_array +from ..core import TensorOrder +from .array import tensor as astensor +from .core import TensorHasInput +from .zeros import TensorZeros + + +class TensorTri(TensorHasInput): + def __call__(self, m, order=None): + order = TensorOrder.C_ORDER if order is None else order + return self.new_tensor([m], shape=m.shape, order=order) + + def to_chunk_op(self, *args): + (k,) = args + op = self.copy().reset_key() + op._k = k + return op + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + tensor = op.outputs[0] + + m = op.input + k = op.k + is_triu = type(op) == TensorTriu + + fx = lambda x, y: x - y + k + nsplits = m.nsplits + cum_size = [np.cumsum(s).tolist() for s in nsplits] + + out_chunks = [] + for out_idx in itertools.product(*[range(len(s)) for s in nsplits]): + i, j = out_idx[-2:] + ld_pos = cum_size[-2][i] - 1, cum_size[-1][j] - nsplits[-1][j] + ru_pos = cum_size[-2][i] - nsplits[-2][i], cum_size[-1][j] - 1 + + ld_fx = fx(*ld_pos) + ru_fx = fx(*ru_pos) + + chunk_shape = tuple(nsplits[i][idx] for i, idx in enumerate(out_idx)) + if (is_triu and ld_fx > 0 and ru_fx > 0) or ( + not is_triu and ld_fx < 0 and ru_fx < 0 + ): + # does not cross, fill with zeros + chunk_op = TensorZeros( + dtype=op.dtype, + gpu=op.gpu, + sparse=op.sparse, + shape=chunk_shape, + order=tensor.order.value, + ) + out_chunk = chunk_op.new_chunk( + None, shape=chunk_shape, index=out_idx, order=tensor.order + ) + else: + lu_pos = ru_pos[0], ld_pos[1] + chunk_k = fx(*lu_pos) + + input_chunk = m.cix[out_idx] + chunk_op = op.to_chunk_op(chunk_k) + out_chunk = chunk_op.new_chunk( + [input_chunk], shape=chunk_shape, index=out_idx, order=tensor.order + ) + + out_chunks.append(out_chunk) + + new_op = op.copy() + params = tensor.params + params["chunks"] = out_chunks + params["nsplits"] = m.nsplits + return new_op.new_tensors(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + f = "triu" if isinstance(op, TensorTriu) else "tril" + if op.sparse: + ctx[chunk.key] = getattr(sparse, f)(ctx[op.inputs[0].key], k=op.k) + else: + ctx[chunk.key] = create_array(op)(f, ctx[op.inputs[0].key], op.k) + + +class TensorTriu(TensorTri): + _op_type_ = OperandDef.TENSOR_TRIU + + _input = KeyField("input") + _k = Int32Field("k") + + def __init__(self, k=None, **kw): + super().__init__(_k=k, **kw) + + @property + def k(self): + return self._k + + +def triu(m, k=0, gpu=None): + """ + Upper triangle of a tensor. + + Return a copy of a matrix with the elements below the `k`-th diagonal + zeroed. + + Please refer to the documentation for `tril` for further details. + + See Also + -------- + tril : lower triangle of a tensor + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.triu([[1,2,3],[4,5,6],[7,8,9],[10,11,12]], -1).execute() + array([[ 1, 2, 3], + [ 4, 5, 6], + [ 0, 8, 9], + [ 0, 0, 12]]) + + """ + m = astensor(m) + gpu = m.op.gpu if gpu is None else gpu + op = TensorTriu(k, dtype=m.dtype, sparse=m.issparse(), gpu=gpu) + return op(m) + + +class TensorTril(TensorTri): + _op_type_ = OperandDef.TENSOR_TRIL + + _input = KeyField("input") + _k = Int32Field("k") + + def __init__(self, k=None, **kw): + super().__init__(_k=k, **kw) + + @property + def k(self): + return self._k + + +def tril(m, k=0, gpu=None): + """ + Lower triangle of a tensor. 
+ + Return a copy of a tensor with elements above the `k`-th diagonal zeroed. + + Parameters + ---------- + m : array_like, shape (M, N) + Input tensor. + k : int, optional + Diagonal above which to zero elements. `k = 0` (the default) is the + main diagonal, `k < 0` is below it and `k > 0` is above. + gpu : bool, optional + Allocate the tensor on GPU if True, None as default + + Returns + ------- + tril : Tensor, shape (M, N) + Lower triangle of `m`, of same shape and data-type as `m`. + + See Also + -------- + triu : same thing, only for the upper triangle + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.tril([[1,2,3],[4,5,6],[7,8,9],[10,11,12]], -1).execute() + array([[ 0, 0, 0], + [ 4, 0, 0], + [ 7, 8, 0], + [10, 11, 12]]) + + """ + m = astensor(m) + gpu = m.op.gpu if gpu is None else gpu + op = TensorTril(k, dtype=m.dtype, sparse=m.issparse(), gpu=gpu) + return op(m) diff --git a/python/xorbits/_mars/tensor/datasource/zeros.py b/python/xorbits/_mars/tensor/datasource/zeros.py new file mode 100644 index 000000000..250dc9824 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/zeros.py @@ -0,0 +1,231 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np + +from ... import opcodes as OperandDef +from ...lib import sparse +from ...lib.sparse.core import get_array_module, get_sparse_module, naked +from ...serialization.serializables import ( + AnyField, + FieldTypes, + KeyField, + StringField, + TupleField, +) +from ..array_utils import create_array +from ..utils import get_order +from .array import tensor +from .core import TensorLike, TensorNoInput + + +class TensorZeros(TensorNoInput): + _op_type_ = OperandDef.TENSOR_ZEROS + + order = StringField("order") + shape = TupleField("shape", FieldTypes.int64) + chunk_size = AnyField("chunk_size") + + def __init__(self, shape=None, **kwargs): + if type(shape) is int: + shape = (shape,) + super().__init__(shape=shape, **kwargs) + + def to_chunk_op(self, *args): + chunk_op = super().to_chunk_op(*args) + chunk_op.shape = args[0] + chunk_op.chunk_size = None + return chunk_op + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if op.sparse: + ctx[chunk.key] = sparse.zeros(op.shape, dtype=op.dtype, gpu=op.gpu) + else: + ctx[chunk.key] = create_array(op)( + "zeros", op.shape, dtype=op.dtype, order=op.order + ) + + +def zeros(shape, dtype=None, chunk_size=None, gpu=None, sparse=False, order="C"): + """ + Return a new tensor of given shape and type, filled with zeros. + + Parameters + ---------- + shape : int or sequence of ints + Shape of the new tensor, e.g., ``(2, 3)`` or ``2``. + dtype : data-type, optional + The desired data-type for the array, e.g., `mt.int8`. Default is + `mt.float64`. 
+ chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + sparse: bool, optional + Create sparse tensor if True, False as default + order : {'C', 'F'}, optional, default: 'C' + Whether to store multi-dimensional data in row-major + (C-style) or column-major (Fortran-style) order in + memory. + + Returns + ------- + out : Tensor + Tensor of zeros with the given shape, dtype, and order. + + See Also + -------- + zeros_like : Return a tensor of zeros with shape and type of input. + ones_like : Return a tensor of ones with shape and type of input. + empty_like : Return a empty tensor with shape and type of input. + ones : Return a new tensor setting values to one. + empty : Return a new uninitialized tensor. + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.zeros(5).execute() + array([ 0., 0., 0., 0., 0.]) + + >>> mt.zeros((5,), dtype=int).execute() + array([0, 0, 0, 0, 0]) + + >>> mt.zeros((2, 1)).execute() + array([[ 0.], + [ 0.]]) + + >>> s = (2,2) + >>> mt.zeros(s).execute() + array([[ 0., 0.], + [ 0., 0.]]) + + >>> mt.zeros((2,), dtype=[('x', 'i4'), ('y', 'i4')]).execute() # custom dtype + array([(0, 0), (0, 0)], + dtype=[('x', '>> import mars.tensr as mt + >>> x = mt.arange(6) + >>> x = x.reshape((2, 3)) + >>> x.execute() + array([[0, 1, 2], + [3, 4, 5]]) + + >>> mt.zeros_like(x).execute() + array([[0, 0, 0], + [0, 0, 0]]) + + >>> y = mt.arange(3, dtype=float) + >>> y.execute() + array([ 0., 1., 2.]) + + >>> mt.zeros_like(y).execute() + array([ 0., 0., 0.]) + """ + a = tensor(a) + tensor_order = get_order(order, a.order) + gpu = a.op.gpu if gpu is None else gpu + op = TensorZerosLike(dtype=dtype, gpu=gpu, sparse=a.issparse(), order=order) + return op(a, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datastore/__init__.py b/python/xorbits/_mars/tensor/datastore/__init__.py new file mode 100644 index 000000000..bdf35e360 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .to_hdf5 import TensorHDF5DataStore, tohdf5 +from .to_tiledb import TensorTileDBConsolidate, TensorTileDBDataStore, totiledb +from .to_vineyard import ( + TensorVineyardDataStoreChunk, + TensorVineyardDataStoreMeta, + tovineyard, +) +from .to_zarr import TensorToZarrDataStore, tozarr diff --git a/python/xorbits/_mars/tensor/datastore/core.py b/python/xorbits/_mars/tensor/datastore/core.py new file mode 100644 index 000000000..0bee60ff7 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/core.py @@ -0,0 +1,56 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorDataStore(TensorHasInput, TensorOperandMixin): + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = inputs[0] + + def __call__(self, a, order=None): + shape = (0,) * a.ndim + order = a.order if order is None else order + return self.new_tensor([a], shape, order=order) + + @classmethod + def _get_out_chunk(cls, op, in_chunk): + chunk_op = op.copy().reset_key() + out_chunk_shape = (0,) * in_chunk.ndim + return chunk_op.new_chunk( + [in_chunk], out_chunk_shape, index=in_chunk.index, order=op.outputs[0].order + ) + + @classmethod + def _process_out_chunks(cls, op, out_chunks): + return out_chunks + + @classmethod + def tile(cls, op): + in_tensor = op.input + + out_chunks = [] + for chunk in in_tensor.chunks: + out_chunk = cls._get_out_chunk(op, chunk) + out_chunks.append(out_chunk) + out_chunks = cls._process_out_chunks(op, out_chunks) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + op.outputs[0].shape, + chunks=out_chunks, + nsplits=((0,) for _ in range(in_tensor.ndim)), + ) diff --git a/python/xorbits/_mars/tensor/datastore/tests/__init__.py b/python/xorbits/_mars/tensor/datastore/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/datastore/tests/test_datastore.py b/python/xorbits/_mars/tensor/datastore/tests/test_datastore.py new file mode 100644 index 000000000..4430f2852 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/tests/test_datastore.py @@ -0,0 +1,109 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +import tempfile + +import numpy as np +import pytest + +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None + +from ....core import tile +from ... import random +from .. 
import totiledb +from ..utils import check_tiledb_array_with_tensor, get_tiledb_schema_from_tensor + + +@pytest.mark.skipif(tiledb is None, reason="TileDB not installed") +def test_get_tile_db_schema(): + ctx = tiledb.Ctx() + + nsplits = ((1, 2), (3, 1), (2, 2, 1)) + a = random.rand(3, 4, 5, dtype=np.float64, chunk_size=nsplits) + schema = get_tiledb_schema_from_tensor(a, ctx, nsplits) + assert schema.ndim == 3 + assert schema.shape == (3, 4, 5) + assert [schema.domain.dim(i).tile for i in range(a.ndim)] == [2, 3, 2] + assert schema.attr(0).dtype == a.dtype + + +@pytest.mark.skipif(tiledb is None, reason="TileDB not installed") +def test_check_tile_db(): + ctx = tiledb.Ctx() + + tempdir = tempfile.mkdtemp() + try: + np_a = np.random.rand(2, 3) + tiledb_a = tiledb.DenseArray.from_numpy(ctx=ctx, uri=tempdir, array=np_a) + + with pytest.raises(ValueError): + # ndim not match + check_tiledb_array_with_tensor(random.rand(2, 3, 4), tiledb_a) + + with pytest.raises(ValueError): + # shape not matchn + check_tiledb_array_with_tensor(random.rand(2, 4), tiledb_a) + + with pytest.raises(ValueError): + # dtype not match + check_tiledb_array_with_tensor( + random.rand(2, 3, dtype=np.float32), tiledb_a + ) + + # legal + check_tiledb_array_with_tensor(random.rand(2, 3), tiledb_a) + finally: + shutil.rmtree(tempdir) + + +@pytest.mark.skipif(tiledb is None, reason="TileDB not installed") +def test_store_tile_db(): + ctx = tiledb.Ctx() + tempdir = tempfile.mkdtemp() + try: + t = random.rand(50, 30, chunk_size=13) + t2 = t + 1 + + saved = totiledb(tempdir, t2) + assert saved.shape == (0, 0) + assert saved.op.tiledb_config is None + assert saved.op.tiledb_uri == tempdir + + with pytest.raises(tiledb.TileDBError): + tiledb.DenseArray(ctx=ctx, uri=tempdir) + + # tiledb array is created in the tile + saved = tile(saved) + + # no error + tiledb.DenseArray(ctx=ctx, uri=tempdir) + + # TileDB consolidation + assert len(saved.chunks) == 1 + + assert saved.chunks[0].inputs[0].op.axis_offsets == (0, 0) + assert saved.chunks[0].inputs[1].op.axis_offsets == (0, 13) + assert saved.chunks[0].inputs[2].op.axis_offsets == (0, 26) # input (0, 2) + assert saved.chunks[0].inputs[5].op.axis_offsets == (13, 26) # input (1, 2) + assert saved.chunks[0].inputs[11].op.axis_offsets == (39, 26) # input (3, 2) + + with pytest.raises(ValueError): + t3 = random.rand(30, 50) + totiledb(tempdir, t3, ctx=ctx) # shape incompatible + finally: + shutil.rmtree(tempdir) diff --git a/python/xorbits/_mars/tensor/datastore/tests/test_datastore_execution.py b/python/xorbits/_mars/tensor/datastore/tests/test_datastore_execution.py new file mode 100644 index 000000000..a858e9937 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/tests/test_datastore_execution.py @@ -0,0 +1,245 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
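For orientation before the execution tests, a minimal sketch of the TileDB round trip these store operands implement might look as follows. It assumes `tiledb` is installed and that a default local session is available when `execute()` is called; the temporary-directory handling is illustrative and not part of this change.

    import tempfile

    import numpy as np
    import tiledb

    import mars.tensor as mt

    # Build a chunked tensor lazily and write it into a TileDB dense array.
    # The store tensor itself has shape (0, 0); executing it performs the write.
    tempdir = tempfile.mkdtemp()
    raw = np.random.rand(8, 4)
    t = mt.tensor(raw, chunk_size=3)
    mt.totiledb(tempdir, t).execute()

    # Read the array back with TileDB to verify the round trip.
    with tiledb.DenseArray(uri=tempdir, ctx=tiledb.Ctx()) as arr:
        np.testing.assert_allclose(arr.read_direct(), raw)

The same pattern, with `totiledb` swapped for `tohdf5`, `tozarr` or `tovineyard`, covers the other storage targets exercised below.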
+ +import os +import shutil +import tempfile +import time + +import numpy as np +import pytest +import scipy.sparse as sps + +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None +try: + import h5py +except ImportError: # pragma: no cover + h5py = None +try: + import zarr + from numcodecs import Blosc, Delta, Zstd +except ImportError: # pragma: no cover + zarr = None +try: + import vineyard +except ImportError: + vineyard = None + +from ... import arange, tensor, tohdf5, totiledb, tovineyard, tozarr +from ...datasource import fromvineyard + +_exec_timeout = 120 if "CI" in os.environ else -1 + + +@pytest.mark.skipif(tiledb is None, reason="tiledb not installed") +def test_store_tiledb_execution(setup): + ctx = tiledb.Ctx() + + tempdir = tempfile.mkdtemp() + try: + # store TileDB dense array + expected = np.random.rand(8, 4, 3) + a = tensor(expected, chunk_size=(3, 3, 2)) + save = totiledb(tempdir, a, ctx=ctx) + save.execute() + + with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr: + np.testing.assert_allclose(expected, arr.read_direct()) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # store tensor with 1 chunk to TileDB dense array + a = arange(12) + save = totiledb(tempdir, a, ctx=ctx) + save.execute() + + with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr: + np.testing.assert_allclose(np.arange(12), arr.read_direct()) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # store 2-d TileDB sparse array + expected = sps.random(8, 7, density=0.1) + a = tensor(expected, chunk_size=(3, 5)) + save = totiledb(tempdir, a, ctx=ctx) + save.execute() + + with tiledb.SparseArray(uri=tempdir, ctx=ctx) as arr: + data = arr[:, :] + coords = data["coords"] + value = data[arr.attr(0).name] + ij = tuple(coords[arr.domain.dim(k).name] for k in range(arr.ndim)) + result = sps.coo_matrix((value, ij), shape=arr.shape) + + np.testing.assert_allclose(expected.toarray(), result.toarray()) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # store TileDB dense array + expected = np.asfortranarray(np.random.rand(8, 4, 3)) + a = tensor(expected, chunk_size=(3, 3, 2)) + save = totiledb(tempdir, a, ctx=ctx) + save.execute() + + with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr: + np.testing.assert_allclose(expected, arr.read_direct()) + assert arr.schema.cell_order == "col-major" + finally: + shutil.rmtree(tempdir) + + +@pytest.mark.skipif(h5py is None, reason="h5py not installed") +@pytest.mark.ray_dag +def test_store_hdf5_execution(setup): + raw = np.random.RandomState(0).rand(10, 20) + + group_name = "test_group" + dataset_name = "test_dataset" + + t1 = tensor(raw, chunk_size=20) + t2 = tensor(raw, chunk_size=9) + + with pytest.raises(TypeError): + tohdf5(object(), t2) + + with tempfile.TemporaryDirectory() as d: + filename = os.path.join(d, f"test_store_{int(time.time())}.hdf5") + + # test 1 chunk + r = tohdf5(filename, t1, group=group_name, dataset=dataset_name) + r.execute() + + with h5py.File(filename, "r") as f: + result = np.asarray(f[f"{group_name}/{dataset_name}"]) + np.testing.assert_array_equal(result, raw) + + # test filename + r = tohdf5(filename, t2, group=group_name, dataset=dataset_name) + r.execute() + + with h5py.File(filename, "r") as f: + result = np.asarray(f[f"{group_name}/{dataset_name}"]) + np.testing.assert_array_equal(result, raw) + + with pytest.raises(ValueError): + tohdf5(filename, t2) + + with h5py.File(filename, "r") as f: + # test file + r = tohdf5(f, t2, 
group=group_name, dataset=dataset_name) + r.execute() + + with h5py.File(filename, "r") as f: + result = np.asarray(f[f"{group_name}/{dataset_name}"]) + np.testing.assert_array_equal(result, raw) + + with pytest.raises(ValueError): + with h5py.File(filename, "r") as f: + tohdf5(f, t2) + + with h5py.File(filename, "r") as f: + # test dataset + ds = f[f"{group_name}/{dataset_name}"] + # test file + r = tohdf5(ds, t2) + r.execute() + + with h5py.File(filename, "r") as f: + result = np.asarray(f[f"{group_name}/{dataset_name}"]) + np.testing.assert_array_equal(result, raw) + + +@pytest.mark.skipif(zarr is None, reason="zarr not installed") +def test_store_zarr_execution(setup): + raw = np.random.RandomState(0).rand(10, 20) + + group_name = "test_group" + dataset_name = "test_dataset" + + t = tensor(raw, chunk_size=6) + + with pytest.raises(TypeError): + tozarr(object(), t) + + with tempfile.TemporaryDirectory() as d: + filename = os.path.join(d, f"test_store_{int(time.time())}.zarr") + path = f"{filename}/{group_name}/{dataset_name}" + + r = tozarr( + filename, + t, + group=group_name, + dataset=dataset_name, + compressor=Zstd(level=3), + ) + r.execute() + + arr = zarr.open(path) + np.testing.assert_array_equal(arr, raw) + assert arr.compressor == Zstd(level=3) + + r = tozarr(path, t + 2) + r.execute() + + arr = zarr.open(path) + np.testing.assert_array_equal(arr, raw + 2) + + filters = [Delta(dtype="i4")] + compressor = Blosc(cname="zstd", clevel=1, shuffle=Blosc.SHUFFLE) + arr = zarr.open(path, compressor=compressor, filters=filters) + + r = tozarr(arr, t + 1) + r.execute() + result = zarr.open_array(path) + np.testing.assert_array_equal(result, raw + 1) + + +@pytest.mark.skipif(vineyard is None, reason="vineyard not installed") +def test_vineyard_execution(setup): + raw = np.random.RandomState(0).rand(55, 55) + + extra_config = { + "check_dtype": False, + "check_nsplits": False, + "check_shape": False, + } + + with vineyard.deploy.local.start_vineyardd() as (_, vineyard_socket, _): + a = tensor(raw, chunk_size=15) + a.execute() # n.b.: pre-execute + + b = tovineyard(a, vineyard_socket=vineyard_socket) + object_id = b.execute(extra_config=extra_config).fetch()[0] + + c = fromvineyard(object_id, vineyard_socket=vineyard_socket) + value = c.execute(extra_config=extra_config).fetch() + np.testing.assert_allclose(value, raw) + + a = tensor(raw, chunk_size=15) # n.b.: no pre-execute + + b = tovineyard(a, vineyard_socket=vineyard_socket) + object_id = b.execute(extra_config=extra_config).fetch()[0] + + c = fromvineyard(object_id, vineyard_socket=vineyard_socket) + value = c.execute(extra_config=extra_config).fetch() + np.testing.assert_allclose(value, raw) diff --git a/python/xorbits/_mars/tensor/datastore/to_hdf5.py b/python/xorbits/_mars/tensor/datastore/to_hdf5.py new file mode 100644 index 000000000..84a295d12 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/to_hdf5.py @@ -0,0 +1,254 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
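The `to_hdf5` module that follows implements the `tohdf5` entry point exercised by the tests above. A rough, self-contained sketch of the intended round trip is shown here; it assumes `h5py` is installed and a default local session, and the file name and group/dataset names are illustrative only.

    import os
    import tempfile

    import h5py
    import numpy as np

    import mars.tensor as mt

    raw = np.random.rand(10, 20)
    t = mt.tensor(raw, chunk_size=9)

    with tempfile.TemporaryDirectory() as d:
        filename = os.path.join(d, "example.hdf5")
        # Write the chunked tensor into /test_group/test_dataset of the HDF5 file;
        # the result is a zero-shaped placeholder tensor until executed.
        mt.tohdf5(filename, t, group="test_group", dataset="test_dataset").execute()

        with h5py.File(filename, "r") as f:
            result = np.asarray(f["test_group/test_dataset"])
            np.testing.assert_array_equal(result, raw)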
+ +import threading +import time +from typing import List + +import numpy as np + +from ... import opcodes as OperandDef +from ...core.context import get_context +from ...lib.filesystem import open_file +from ...oscar import ActorNotExist +from ...serialization.serializables import ( + DictField, + FieldTypes, + KeyField, + StringField, + TupleField, +) +from ...utils import has_unknown_shape +from ..datasource import tensor as astensor +from .core import TensorDataStore + + +class _HDF5Container: + def __init__(self, all_chunk_op_keys: List[str]): + self._all_chunk_op_keys = set(all_chunk_op_keys) + self._done_chunk_op_keys = set() + self._lock = threading.Lock() + + def acquire(self): + return self._lock.acquire() + + def release(self): + return self._lock.release() + + def mark_done(self, op_key: str): + self._done_chunk_op_keys.add(op_key) + + def is_done(self): + return self._done_chunk_op_keys == self._all_chunk_op_keys + + +class TensorHDF5DataStore(TensorDataStore): + _op_type_ = OperandDef.TENSOR_STORE_HDF5 + + _input = KeyField("input") + _filename = StringField("filename") + _group = StringField("group") + _dataset = StringField("dataset") + _dataset_kwds = DictField("dataset_kwds", key_type=FieldTypes.string) + _axis_offsets = TupleField("axis_offsets", FieldTypes.int32) + _out_shape = TupleField("out_shape", FieldTypes.int32) + _container_name = StringField("container_name") + + def __init__( + self, + filename=None, + group=None, + dataset=None, + dataset_kwds=None, + container_name=None, + **kw, + ): + super().__init__( + _filename=filename, + _group=group, + _dataset=dataset, + _dataset_kwds=dataset_kwds, + _container_name=container_name, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def filename(self): + return self._filename + + @property + def group(self): + return self._group + + @property + def dataset(self): + return self._dataset + + @property + def dataset_kwds(self): + return self._dataset_kwds + + @property + def axis_offsets(self): + return self._axis_offsets + + @property + def out_shape(self): + return self._out_shape + + @property + def container_name(self): + return self._container_name + + @property + def path(self): + paths = [] + if self._group is not None: + paths.append(self.group) + paths.append(self.dataset) + return "/".join(paths) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + in_tensor = op.input + + with open_file(op.filename, "w"): + # create file if not exist + pass + + nsplits = tuple([(0,) * len(ns) for ns in in_tensor.nsplits]) + if len(in_tensor.chunks) == 1: + in_chunk = in_tensor.chunks[0] + chunk_op = op.copy().reset_key() + chunk_op._axis_offsets = (0,) * in_chunk.ndim + chunk_op._out_shape = in_tensor.shape + out_chunk = chunk_op.new_chunk( + [in_chunk], shape=(0,) * in_chunk.ndim, index=in_chunk.index + ) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=(0,) * in_tensor.ndim, + nsplits=nsplits, + chunks=[out_chunk], + ) + + container_name = f"{op.key}_{int(time.time() * 1000)}" + + out_chunks = [] + acc = [[0] + np.cumsum(ns).tolist() for ns in in_tensor.nsplits] + chunk_op_keys = [] + for chunk in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk_op._out_shape = in_tensor.shape + chunk_op._container_name = container_name + chunk_op._axis_offsets = tuple( + acc[ax][i] for ax, i in enumerate(chunk.index) + ) + out_chunk = 
chunk_op.new_chunk(
+                [chunk], shape=(0,) * chunk.ndim, index=chunk.index
+            )
+            out_chunks.append(out_chunk)
+            chunk_op_keys.append(out_chunk.op.key)
+
+        ctx = get_context()
+        ctx.create_remote_object(container_name, _HDF5Container, chunk_op_keys)
+
+        new_op = op.copy()
+        return new_op.new_tensors(
+            op.inputs, shape=(0,) * in_tensor.ndim, nsplits=nsplits, chunks=out_chunks
+        )
+
+    @classmethod
+    def execute(cls, ctx, op: "TensorHDF5DataStore"):
+        import h5py
+
+        to_store = ctx[op.inputs[0].key]
+        axis_offsets = op.axis_offsets
+
+        container_name = op.container_name
+        container: _HDF5Container = None
+        if container_name:
+            container = ctx.get_remote_object(container_name)
+            container.acquire()
+        try:
+            with h5py.File(open_file(op.filename, mode="r+b"), mode="r+") as f:
+                try:
+                    ds = f[op.path]
+                except KeyError:
+                    ds = f.create_dataset(
+                        op.path,
+                        shape=op.out_shape,
+                        dtype=to_store.dtype,
+                        **op.dataset_kwds,
+                    )
+                ds[
+                    tuple(
+                        slice(offset, offset + size)
+                        for offset, size in zip(axis_offsets, to_store.shape)
+                    )
+                ] = to_store
+                ctx[op.outputs[0].key] = np.empty(
+                    (0,) * to_store.ndim, dtype=to_store.dtype
+                )
+                if container:
+                    container.mark_done(op.key)
+        finally:
+            if container:
+                try:
+                    container.release()
+                    if container.is_done():
+                        ctx.destroy_remote_object(container_name)
+                except ActorNotExist:
+                    # destroyed by other execution, just ignore
+                    return
+
+
+def tohdf5(hdf5_file, x, group=None, dataset=None, **kwds):
+    import h5py
+
+    x = astensor(x)
+    if isinstance(hdf5_file, h5py.Dataset):
+        filename = hdf5_file.file.filename
+        group = hdf5_file.parent.name
+        dataset = hdf5_file.name.rsplit("/", 1)[1]
+    elif isinstance(hdf5_file, h5py.File):
+        filename = hdf5_file.filename
+        if dataset is None:
+            raise ValueError("`dataset` should be provided")
+    elif isinstance(hdf5_file, str):
+        filename = hdf5_file
+        if dataset is None:
+            raise ValueError("`dataset` should be provided")
+    else:
+        raise TypeError(
+            "`hdf5_file` passed has wrong type, "
+            "expect str, h5py.File or h5py.Dataset, "
+            f"got {type(hdf5_file)}"
+        )
+
+    op = TensorHDF5DataStore(
+        filename=filename, group=group, dataset=dataset, dataset_kwds=kwds
+    )
+    return op(x)
diff --git a/python/xorbits/_mars/tensor/datastore/to_tiledb.py b/python/xorbits/_mars/tensor/datastore/to_tiledb.py
new file mode 100644
index 000000000..48683ef4f
--- /dev/null
+++ b/python/xorbits/_mars/tensor/datastore/to_tiledb.py
@@ -0,0 +1,263 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+try:
+    import tiledb
+except (ImportError, OSError):  # pragma: no cover
+    tiledb = None
+
+from ...
import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import sps +from ...serialization.serializables import ( + DictField, + FieldTypes, + Int64Field, + KeyField, + StringField, + TupleField, +) +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import get_tiledb_ctx +from .core import TensorDataStore +from .utils import check_tiledb_array_with_tensor, get_tiledb_schema_from_tensor + + +class TensorTileDBDataStore(TensorDataStore): + _op_type_ = OperandDef.TENSOR_STORE_TILEDB + + _input = KeyField("input") + _tiledb_config = DictField("tiledb_config") + # URI of array to write + _tiledb_uri = StringField("tiledb_uri") + # encryption key to decrypt if provided + _tiledb_key = StringField("tiledb_key") + # open array at a given timestamp if provided + _tiledb_timestamp = Int64Field("tiledb_timestamp") + _axis_offsets = TupleField("axis_offsets", FieldTypes.int64) + + def __init__( + self, + tiledb_config=None, + tiledb_uri=None, + tiledb_key=None, + tiledb_timestamp=None, + **kw, + ): + super().__init__( + _tiledb_config=tiledb_config, + _tiledb_uri=tiledb_uri, + _tiledb_key=tiledb_key, + _tiledb_timestamp=tiledb_timestamp, + **kw, + ) + + @property + def tiledb_config(self): + return self._tiledb_config + + @property + def tiledb_uri(self): + return self._tiledb_uri + + @property + def tiledb_key(self): + return self._tiledb_key + + @property + def tiledb_timestamp(self): + return self._tiledb_timestamp + + @property + def axis_offsets(self): + return self._axis_offsets + + @classmethod + def _get_out_chunk(cls, op, in_chunk): + chunk_op = op.copy().reset_key() + nsplits = op.input.nsplits + axis_offsets = [] + for axis, idx in enumerate(in_chunk.index): + axis_offsets.append(sum(nsplits[axis][:idx])) + chunk_op._axis_offsets = tuple(axis_offsets) + out_chunk_shape = (0,) * in_chunk.ndim + return chunk_op.new_chunk( + [in_chunk], shape=out_chunk_shape, index=in_chunk.index + ) + + @classmethod + def _process_out_chunks(cls, op, out_chunks): + if len(out_chunks) == 1: + return out_chunks + + consolidate_op = TensorTileDBConsolidate( + tiledb_config=op.tiledb_config, + tiledb_uri=op.tiledb_uri, + tiledb_key=op.tiledb_key, + sparse=op.sparse, + dtype=op.dtype, + ) + return consolidate_op.new_chunks( + out_chunks, shape=out_chunks[0].shape, index=(0,) * out_chunks[0].ndim + ) + + @classmethod + def tile(cls, op): + import tiledb + + tensor = super().tile(op)[0] + + ctx = tiledb.Ctx(op.tiledb_config) + tiledb_array_type = ( + tiledb.SparseArray if tensor.issparse() else tiledb.DenseArray + ) + try: + tiledb_array_type( + uri=op.tiledb_uri, + key=op.tiledb_key, + timestamp=op.tiledb_timestamp, + ctx=ctx, + ) + except tiledb.TileDBError: + # not exist, try to create TileDB Array by given uri + tiledb_array_schema = get_tiledb_schema_from_tensor( + op.input, ctx, op.input.nsplits + ) + tiledb_array_type.create( + op.tiledb_uri, tiledb_array_schema, key=op.tiledb_key + ) + + return [tensor] + + @classmethod + def execute(cls, ctx, op): + tiledb_ctx = get_tiledb_ctx(op.tiledb_config) + uri = op.tiledb_uri + key = op.tiledb_key + timestamp = op.tiledb_timestamp + axis_offsets = op.axis_offsets + + chunk = op.outputs[0] + if not chunk.issparse(): + # dense + to_store = np.ascontiguousarray(ctx[op.input.key]) + slcs = [] + for axis in range(chunk.ndim): + axis_offset = int(axis_offsets[axis]) + axis_length = int(op.input.shape[axis]) + slcs.append(slice(axis_offset, axis_offset + axis_length)) + 
with tiledb.DenseArray( + uri=uri, ctx=tiledb_ctx, mode="w", key=key, timestamp=timestamp + ) as arr: + arr[tuple(slcs)] = to_store + ctx[chunk.key] = np.empty((0,) * chunk.ndim, dtype=chunk.dtype) + else: + # sparse + to_store = ctx[op.input.key].spmatrix.tocoo() + if to_store.nnz > 0: + with tiledb.SparseArray( + uri=uri, ctx=tiledb_ctx, mode="w", key=key, timestamp=timestamp + ) as arr: + if chunk.ndim == 1: + vec = to_store.col if to_store.shape[0] == 1 else to_store.row + vec += axis_offsets[0] + arr[vec] = to_store.data + else: + i, j = ( + to_store.row + axis_offsets[0], + to_store.col + axis_offsets[1], + ) + arr[i, j] = to_store.data + ctx[chunk.key] = SparseNDArray( + sps.csr_matrix((0, 0), dtype=chunk.dtype), shape=chunk.shape + ) + + +class TensorTileDBConsolidate(TensorOperandMixin, TensorOperand): + _op_type_ = OperandDef.TENSOR_STORE_TILEDB_CONSOLIDATE + + _tiledb_config = DictField("tiledb_config") + # URI of array to write + _tiledb_uri = StringField("tiledb_uri") + # encryption key to decrypt if provided + _tiledb_key = StringField("tiledb_key") + + def __init__(self, tiledb_config=None, tiledb_uri=None, tiledb_key=None, **kw): + super().__init__( + _tiledb_config=tiledb_config, + _tiledb_uri=tiledb_uri, + _tiledb_key=tiledb_key, + **kw, + ) + + def calc_shape(self, *inputs_shape): + return self.outputs[0].shape + + @property + def tiledb_config(self): + return self._tiledb_config + + @property + def tiledb_uri(self): + return self._tiledb_uri + + @property + def tiledb_key(self): + return self._tiledb_key + + @classmethod + def tile(cls, op): + raise TypeError(f"{cls.__name__} is a chunk op, cannot be tiled") + + @classmethod + def execute(cls, ctx, op): + tiledb_config = tiledb.Config(op.tiledb_config) + uri = op.tiledb_uri + key = op.tiledb_key + + tiledb.consolidate(config=tiledb_config, uri=uri, key=key) + ctx[op.outputs[0].key] = ctx[op.inputs[0].key] + + +def totiledb(uri, x, ctx=None, key=None, timestamp=None): + import tiledb + + x = astensor(x) + raw_ctx = ctx + if raw_ctx is None: + ctx = tiledb.Ctx() + + tiledb_array_type = tiledb.SparseArray if x.issparse() else tiledb.DenseArray + try: + tiledb_array = tiledb_array_type(uri=uri, key=key, timestamp=timestamp, ctx=ctx) + # if already created, we will check the shape and dtype + check_tiledb_array_with_tensor(x, tiledb_array) + except tiledb.TileDBError: + # not exist, as we don't know the tile, + # we will create the tiledb array in the tile of tensor + pass + + tiledb_config = None if raw_ctx is None else raw_ctx.config().dict() + op = TensorTileDBDataStore( + tiledb_config=tiledb_config, + tiledb_uri=uri, + tiledb_key=key, + tiledb_timestamp=timestamp, + dtype=x.dtype, + sparse=x.issparse(), + ) + return op(x) diff --git a/python/xorbits/_mars/tensor/datastore/to_vineyard.py b/python/xorbits/_mars/tensor/datastore/to_vineyard.py new file mode 100644 index 000000000..fd771e40b --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/to_vineyard.py @@ -0,0 +1,179 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple + +import numpy as np + +from ... import opcodes as OperandDef +from ...core.operand.base import SchedulingHint +from ...serialization.serializables import FieldTypes, KeyField, StringField, TupleField +from ...storage.base import StorageLevel +from ...utils import lazy_import +from ..datasource import tensor as astensor +from .core import TensorDataStore + +vineyard = lazy_import("vineyard") +vy_data_tensor = lazy_import("vineyard.data.tensor", rename="vy_data_tensor") +vy_data_utils = lazy_import("vineyard.data.utils", rename="vy_data_utils") + + +def resolve_vineyard_socket(ctx, op) -> Tuple[str, bool]: + storage_backend = ctx.get_storage_info(level=StorageLevel.MEMORY) + if storage_backend.get("name", None) == "vineyard": # pragma: no cover + if ( + op.vineyard_socket is not None + and op.vineyard_socket != storage_backend["socket"] + ): + return op.vineyard_socket, True + else: + return storage_backend["socket"], False + else: + return op.vineyard_socket, True + + +class TensorVineyardDataStoreChunk(TensorDataStore): + _op_type_ = OperandDef.TENSOR_STORE_VINEYARD_CHUNK + + _input = KeyField("input") + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # a dummy attr to make sure ops have different keys + operator_index = TupleField("operator_index", FieldTypes.int32) + + def __init__(self, vineyard_socket=None, **kw): + super().__init__(vineyard_socket=vineyard_socket, **kw) + + @classmethod + def _process_out_chunks(cls, op, out_chunks): + merge_op = TensorVineyardDataStoreMeta( + vineyard_socket=op.vineyard_socket, sparse=op.sparse, dtype=np.dtype("O") + ) + return merge_op.new_chunks( + out_chunks, shape=(1,), dtype=np.dtype("O"), index=(0,) * out_chunks[0].ndim + ) + + @classmethod + def tile(cls, op): + out_chunks = [] + scheduling_hint = SchedulingHint(fuseable=False) + for idx, chunk in enumerate(op.inputs[0].chunks): + chunk_op = op.copy().reset_key() + chunk_op.scheduling_hint = scheduling_hint + chunk_op.operator_index = chunk.index + out_chunk = chunk_op.new_chunk( + [chunk], dtype=np.dtype("O"), shape=(1,), index=(idx,) + ) + out_chunks.append(out_chunk) + out_chunks = cls._process_out_chunks(op, out_chunks) + + new_op = op.copy().reset_key() + return new_op.new_tensors( + op.inputs, + shape=(len(out_chunks),), + dtype=np.dtype("O"), + chunks=out_chunks, + nsplits=((1,),), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + socket, needs_put = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + # some op might be fused and executed twice on different workers + if not needs_put: + # might be fused + try: # pragma: no cover + meta = ctx.get_chunks_meta([op.inputs[0].key])[0] + tensor_id = vineyard.ObjectID(meta["object_ref"]) + if not client.exists(tensor_id): + needs_put = True + except KeyError: + needs_put = True + if needs_put: + tensor_id = client.put( + np.asarray(ctx[op.inputs[0].key]), partition_index=op.inputs[0].index + ) + else: # pragma: no cover + meta = client.get_meta(tensor_id) + new_meta = vineyard.ObjectMeta() + for k, v in meta.items(): + if k not in ["id", "signature", "instance_id"]: + if isinstance(v, vineyard.ObjectMeta): + new_meta.add_member(k, v) + else: + new_meta[k] = v + new_meta["partition_index_"] = vy_data_utils.to_json(op.inputs[0].index) + tensor_id = client.create_metadata(new_meta).id + + 
client.persist(tensor_id) + holder = np.empty((1,), dtype=object) + holder[0] = tensor_id + ctx[op.outputs[0].key] = holder + + +class TensorVineyardDataStoreMeta(TensorDataStore): + _op_type_ = OperandDef.TENSOR_STORE_VINEYARD_META + + _input = KeyField("input") + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + def __init__(self, vineyard_socket=None, dtype=None, sparse=None, **kw): + super().__init__( + vineyard_socket=vineyard_socket, dtype=dtype, sparse=sparse, **kw + ) + + @classmethod + def tile(cls, op): + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + op.inputs[0].chunks, dtype=np.dtype("O"), shape=(1,), index=(0,) + ) + new_op = op.copy().reset_key() + return new_op.new_tensors( + op.inputs, + shape=(1,), + dtype=np.dtype("O"), + chunks=[out_chunk], + nsplits=((1,),), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket, _ = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + # # store the result object id to execution context + chunks = [ctx[chunk.key][0] for chunk in op.inputs] + holder = np.empty((1,), dtype=object) + holder[0] = vy_data_tensor.make_global_tensor(client, chunks).id + ctx[op.outputs[0].key] = holder + + +def tovineyard(x, vineyard_socket=None): + x = astensor(x) + op = TensorVineyardDataStoreChunk( + vineyard_socket=vineyard_socket, dtype=x.dtype, sparse=x.issparse() + ) + return op(x) diff --git a/python/xorbits/_mars/tensor/datastore/to_zarr.py b/python/xorbits/_mars/tensor/datastore/to_zarr.py new file mode 100644 index 000000000..91aae4e3f --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/to_zarr.py @@ -0,0 +1,212 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle +from typing import Dict + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...lib.filesystem import FSMap, get_fs +from ...serialization.serializables import ( + BytesField, + FieldTypes, + KeyField, + StringField, + TupleField, +) +from ...utils import has_unknown_shape +from .core import TensorDataStore + + +class ZarrOptions(object): + def __init__(self, options: Dict): + self._options = options + + def todict(self): + return self._options + + @staticmethod + def _stringfy(v): + return pickle.dumps(v) if not isinstance(v, str) else v + + def __mars_tokenize__(self): + return ( + list(self._options.keys()), + list(self._stringfy(v) for v in self._options.values()), + ) + + def __getstate__(self): + return self._options + + def __setstate__(self, state): + self._options = state + + +class TensorToZarrDataStore(TensorDataStore): + _op_type_ = OperandDef.TENSOR_STORE_ZARR + + _input = KeyField("input") + _path = StringField("path") + _group = StringField("group") + _dataset = StringField("dataset") + _zarr_options = BytesField( + "zarr_options", on_serialize=pickle.dumps, on_deserialize=pickle.loads + ) + _axis_offsets = TupleField("axis_offsets", FieldTypes.int32) + + def __init__( + self, + path=None, + group=None, + dataset=None, + zarr_options=None, + axis_offsets=None, + **kw, + ): + super().__init__( + _path=path, + _group=group, + _dataset=dataset, + _zarr_options=zarr_options, + _axis_offsets=axis_offsets, + **kw, + ) + + @property + def path(self): + return self._path + + @property + def group(self): + return self._group + + @property + def dataset(self): + return self._dataset + + @property + def zarr_options(self): + return self._zarr_options + + @property + def axis_offsets(self): + return self._axis_offsets + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def tile(cls, op): + import zarr + + if has_unknown_shape(*op.inputs): + yield + in_tensor = op.input + + # create dataset + fs = get_fs(op.path, None) + path = op.path + if op.group is not None: + path += "/" + op.group + fs_map = FSMap(path, fs) + zarr.open( + fs_map, + "w", + path=op.dataset, + dtype=in_tensor.dtype, + shape=in_tensor.shape, + chunks=tuple(max(ns) for ns in in_tensor.nsplits), + **op.zarr_options.todict(), + ) + + cum_nsplits = [[0] + np.cumsum(ns).tolist() for ns in in_tensor.nsplits] + out_chunks = [] + for chunk in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk_op._axis_offsets = tuple( + cs[i] for i, cs in zip(chunk.index, cum_nsplits) + ) + out_chunks.append( + chunk_op.new_chunk([chunk], shape=(0,) * chunk.ndim, index=chunk.index) + ) + + new_op = op.copy() + out = op.outputs[0] + nsplits = tuple((0,) * len(ns) for ns in in_tensor.nsplits) + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + nsplits=nsplits, + chunks=out_chunks, + ) + + @classmethod + def execute(cls, ctx, op): + import zarr + + fs = get_fs(op.path, None) + fs_map = FSMap(op.path, fs) + + group = zarr.Group(store=fs_map, path=op.group) + array = group[op.dataset] + + to_store = ctx[op.inputs[0].key] + axis_offsets = op.axis_offsets + shape = to_store.shape + + array[ + tuple( + slice(offset, offset + size) + for offset, size in zip(axis_offsets, shape) + ) + ] = to_store + + ctx[op.outputs[0].key] = np.empty((0,) * to_store.ndim, dtype=to_store.dtype) + + +def tozarr(path, x, group=None, dataset=None, **zarr_options): + import zarr + + if isinstance(path, zarr.Array): + arr = path + if isinstance(arr.store, FSMap): + root = arr.store.root + path, dataset = 
root.rsplit("/", 1) + else: + path = arr.store.path + if "/" in arr.path and group is None: + group = arr.path.rsplit("/", 1)[0] + dataset = arr.basename + if not dataset: + path, dataset = path.rsplit("/", 1) + for attr in ["compressor", "filters"]: + if getattr(arr, attr): + zarr_options[attr] = getattr(arr, attr) + elif isinstance(path, str): + if dataset is None: + path, dataset = path.rsplit("/", 1) + else: + raise TypeError( + "`path` passed has wrong type, " + "expect str, or zarr.Array" + f"got {type(path)}" + ) + + op = TensorToZarrDataStore( + path=path, group=group, dataset=dataset, zarr_options=ZarrOptions(zarr_options) + ) + return op(x) diff --git a/python/xorbits/_mars/tensor/datastore/utils.py b/python/xorbits/_mars/tensor/datastore/utils.py new file mode 100644 index 000000000..91db50a33 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/utils.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None + + +def get_tiledb_schema_from_tensor(tensor, tiledb_ctx, nsplits, **kw): + from ..core import TensorOrder + + ctx = tiledb_ctx + + dims = [] + for d in range(tensor.ndim): + extent = tensor.shape[d] + domain = (0, extent - 1) + tile = max(nsplits[d]) + dims.append( + tiledb.Dim(name="", domain=domain, tile=tile, dtype=np.int64, ctx=ctx) + ) + dom = tiledb.Domain(*dims, ctx=ctx) + att = tiledb.Attr(ctx=ctx, dtype=tensor.dtype) + cell_order = "C" if tensor.order == TensorOrder.C_ORDER else "F" + return tiledb.ArraySchema( + ctx=ctx, + domain=dom, + attrs=(att,), + sparse=tensor.issparse(), + cell_order=cell_order, + **kw, + ) + + +def check_tiledb_array_with_tensor(tensor, tiledb_array): + if tensor.ndim != tiledb_array.ndim: + # ndim + raise ValueError( + "ndim of TileDB Array to store is different to tensor, " + f"expect {tensor.ndim}, got {tiledb_array.ndim}" + ) + if tensor.shape != tiledb_array.shape: + # shape + raise ValueError( + "shape of TileDB Array to store is different to tensor, " + f"expect {tensor.shape}, got {tiledb_array.shape}" + ) + if tensor.dtype != tiledb_array.attr(0).dtype: + # dtype + raise ValueError( + "dtype of TileDB Array to store is different to tensor, " + f"expect {tensor.dtype}, got {tiledb_array.domain.dtype}" + ) diff --git a/python/xorbits/_mars/tensor/einsum/__init__.py b/python/xorbits/_mars/tensor/einsum/__init__.py new file mode 100644 index 000000000..f21419396 --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import einsum diff --git a/python/xorbits/_mars/tensor/einsum/core.py b/python/xorbits/_mars/tensor/einsum/core.py new file mode 100644 index 000000000..924743c47 --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/core.py @@ -0,0 +1,486 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections import defaultdict + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import AnyField, StringField +from ..arithmetic.utils import chunk_tree_add +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import decide_unify_split +from .einsumfunc import einsum_path, parse_einsum_input + + +class TensorEinsum(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.EINSUM + + _subscripts = StringField("subscripts") + _optimize = AnyField("optimize") + _order = StringField("order") + _casting = StringField("casting") + + def __init__(self, subscripts=None, optimize=None, order=None, casting=None, **kw): + super().__init__( + _subscripts=subscripts, + _optimize=optimize, + _order=order, + _casting=casting, + **kw + ) + + @property + def subscripts(self): + return self._subscripts + + @property + def optimize(self): + return self._optimize + + @property + def order(self): + return self._order + + @property + def casting(self): + return self._casting + + def __call__(self, input_tensors, shape): + if self.order in "KA": + if any(t.order == TensorOrder.C_ORDER for t in input_tensors): + order = TensorOrder.C_ORDER + else: + order = TensorOrder.F_ORDER + else: + if self.order == "C": + order = TensorOrder.C_ORDER + else: + order = TensorOrder.F_ORDER + return self.new_tensor( + input_tensors, shape=shape, dtype=self.dtype, order=order + ) + + @classmethod + def tile(cls, op): + out_tensor = op.outputs[0] + input_scripts, output_scripts = op.subscripts.split("->") + tensor_axes = list(zip(op.inputs, input_scripts.split(","))) + + # rechunk to unify nsplits + input_nsplits = defaultdict(list) + for t, axes in tensor_axes: + for splits, ax in zip(t.nsplits, axes): + input_nsplits[ax].append(splits) + input_tensors = [] + for t, axes in tensor_axes: + new_nsplits = tuple( + decide_unify_split(*input_nsplits[ax]) + if t.shape[j] > 1 + else (t.shape[j],) + for j, ax in enumerate(axes) + ) + input_tensors.append((yield from recursive_tile(t.rechunk(new_nsplits)))) + + tensor_indexes = dict() + output_axes = defaultdict(list) + axes_splits = 
dict() + tensor_contract_axes = [] + for i, (t, axes) in enumerate(zip(input_tensors, input_scripts.split(","))): + for j in range(t.ndim): + if axes[j] in output_scripts: + # Record the output tensor's axes and its nsplit. + tensor_indexes[axes[j]] = range(len(t.nsplits[j])) + output_axes[axes[j]].append((i, j)) + axis_splits = dict((axes[j], t.nsplits[j]) for j in range(t.ndim)) + axes_splits.update(axis_splits) + tensor_contract_axes.append([ax for ax in axes if ax not in output_scripts]) + + out_chunks = [] + output_indexes = [tensor_indexes[ax] for ax in output_scripts] + for out_idx in itertools.product(*output_indexes): + all_indexes = [[None] * t.ndim for t in input_tensors] + tensor_shape = [] + for i, idx in enumerate(out_idx): + tensor_shape.append(axes_splits[output_scripts[i]][idx]) + for t_idx, axis in output_axes[output_scripts[i]]: + if input_tensors[t_idx].shape[axis] == 1: + all_indexes[t_idx][axis] = 0 + else: + all_indexes[t_idx][axis] = idx + tensor_shape = tuple(tensor_shape) + einsum_chunks = [] + contract_axes = [ + s + for s in set(input_scripts.replace(",", "")) + if s not in output_scripts + ] + for contract_indexes in itertools.product( + *[range(len(axes_splits[ax])) for ax in contract_axes] + ): + for j, t_contract_axes in enumerate(tensor_contract_axes): + for axis in t_contract_axes: + axis_index = tensor_axes[j][1].index(axis) + all_indexes[j][axis_index] = contract_indexes[ + contract_axes.index(axis) + ] + einsum_op = op.copy().reset_key() + in_chunks = [ + t.cix[tuple(indices)] + for t, indices in zip(input_tensors, all_indexes) + ] + chunk = einsum_op.new_chunk( + in_chunks, shape=tensor_shape, order=out_tensor.order + ) + einsum_chunks.append(chunk) + + if len(einsum_chunks) == 1: + c = einsum_chunks[0] + chunk_op = c.op.copy() + chunk = chunk_op.new_chunk( + c.inputs, shape=c.shape, index=out_idx, order=out_tensor.order + ) + else: + chunk = chunk_tree_add( + op.dtype, einsum_chunks, out_idx, tensor_shape, sparse=op.sparse + ) + out_chunks.append(chunk) + + nsplits = [axes_splits[ax] for ax in output_scripts] + new_op = op.copy() + return new_op.new_tensors( + input_tensors, out_tensor.shape, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if xp is np: + ctx[op.outputs[0].key] = xp.einsum( + op.subscripts, + *inputs, + optimize=op.optimize, + dtype=op.dtype, + order=op.order, + casting=op.casting + ) + else: + # Cupy doesn't support `optimize`, `order` and `casting`. + ctx[op.outputs[0].key] = xp.einsum( + op.subscripts, *inputs, dtype=op.dtype + ) + + +def einsum( + subscripts, *operands, dtype=None, order="K", casting="safe", optimize=False +): + """ + Evaluates the Einstein summation convention on the operands. + + Using the Einstein summation convention, many common multi-dimensional, + linear algebraic array operations can be represented in a simple fashion. + In *implicit* mode `einsum` computes these values. + + In *explicit* mode, `einsum` provides further flexibility to compute + other array operations that might not be considered classical Einstein + summation operations, by disabling, or forcing summation over specified + subscript labels. + + See the notes and examples for clarification. + + Parameters + ---------- + subscripts : str + Specifies the subscripts for summation as comma separated list of + subscript labels. 
An implicit (classical Einstein summation) + calculation is performed unless the explicit indicator '->' is + included as well as subscript labels of the precise output form. + operands : list of array_like + These are the arrays for the operation. + dtype : {data-type, None}, optional + If provided, forces the calculation to use the data type specified. + Note that you may have to also give a more liberal `casting` + parameter to allow the conversions. Default is None. + order : {'C', 'F', 'A', 'K'}, optional + Controls the memory layout of the output. 'C' means it should + be C contiguous. 'F' means it should be Fortran contiguous, + 'A' means it should be 'F' if the inputs are all 'F', 'C' otherwise. + 'K' means it should be as close to the layout as the inputs as + is possible, including arbitrarily permuted axes. + Default is 'K'. + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur. Setting this to + 'unsafe' is not recommended, as it can adversely affect accumulations. + + * 'no' means the data types should not be cast at all. + * 'equiv' means only byte-order changes are allowed. + * 'safe' means only casts which can preserve values are allowed. + * 'same_kind' means only safe casts or casts within a kind, + like float64 to float32, are allowed. + * 'unsafe' means any data conversions may be done. + + Default is 'safe'. + optimize : {False, True, 'greedy', 'optimal'}, optional + Controls if intermediate optimization should occur. No optimization + will occur if False and True will default to the 'greedy' algorithm. + Also accepts an explicit contraction list from the ``np.einsum_path`` + function. See ``np.einsum_path`` for more details. Defaults to False. + + Returns + ------- + output : Mars.tensor + The calculation based on the Einstein summation convention. + + The Einstein summation convention can be used to compute + many multi-dimensional, linear algebraic array operations. `einsum` + provides a succinct way of representing these. + + A non-exhaustive list of these operations, + which can be computed by `einsum`, is shown below along with examples: + + * Trace of an array, :py:func:`numpy.trace`. + * Return a diagonal, :py:func:`numpy.diag`. + * Array axis summations, :py:func:`numpy.sum`. + * Transpositions and permutations, :py:func:`numpy.transpose`. + * Matrix multiplication and dot product, :py:func:`numpy.matmul` :py:func:`numpy.dot`. + * Vector inner and outer products, :py:func:`numpy.inner` :py:func:`numpy.outer`. + * Broadcasting, element-wise and scalar multiplication, :py:func:`numpy.multiply`. + * Tensor contractions, :py:func:`numpy.tensordot`. + * Chained array operations, in efficient calculation order, :py:func:`numpy.einsum_path`. + + The subscripts string is a comma-separated list of subscript labels, + where each label refers to a dimension of the corresponding operand. + Whenever a label is repeated it is summed, so ``mt.einsum('i,i', a, b)`` + is equivalent to :py:func:`mt.inner(a,b) `. If a label + appears only once, it is not summed, so ``mt.einsum('i', a)`` produces a + view of ``a`` with no changes. A further example ``mt.einsum('ij,jk', a, b)`` + describes traditional matrix multiplication and is equivalent to + :py:func:`mt.matmul(a,b) `. + + In *implicit mode*, the chosen subscripts are important + since the axes of the output are reordered alphabetically. This + means that ``mt.einsum('ij', a)`` doesn't affect a 2D array, while + ``mt.einsum('ji', a)`` takes its transpose. 
Additionally, + ``mt.einsum('ij,jk', a, b)`` returns a matrix multiplication, while, + ``mt.einsum('ij,jh', a, b)`` returns the transpose of the + multiplication since subscript 'h' precedes subscript 'i'. + + In *explicit mode* the output can be directly controlled by + specifying output subscript labels. This requires the + identifier '->' as well as the list of output subscript labels. + This feature increases the flexibility of the function since + summing can be disabled or forced when required. The call + ``mt.einsum('i->', a)`` is like :py:func:`mt.sum(a, axis=-1) `, + and ``mt.einsum('ii->i', a)`` is like :py:func:`mt.diag(a) `. + The difference is that `einsum` does not allow broadcasting by default. + Additionally ``mt.einsum('ij,jh->ih', a, b)`` directly specifies the + order of the output subscript labels and therefore returns matrix + multiplication, unlike the example above in implicit mode. + + To enable and control broadcasting, use an ellipsis. Default + NumPy-style broadcasting is done by adding an ellipsis + to the left of each term, like ``mt.einsum('...ii->...i', a)``. + To take the trace along the first and last axes, + you can do ``mt.einsum('i...i', a)``, or to do a matrix-matrix + product with the left-most indices instead of rightmost, one can do + ``mt.einsum('ij...,jk...->ik...', a, b)``. + + When there is only one operand, no axes are summed, and no output + parameter is provided, a view into the operand is returned instead + of a new array. Thus, taking the diagonal as ``mt.einsum('ii->i', a)`` + produces a view (changed in version 1.10.0). + + `einsum` also provides an alternative way to provide the subscripts + and operands as ``einsum(op0, sublist0, op1, sublist1, ..., [sublistout])``. + If the output shape is not provided in this format `einsum` will be + calculated in implicit mode, otherwise it will be performed explicitly. + The examples below have corresponding `einsum` calls with the two + parameter methods. 
+ + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.arange(25).reshape(5,5) + >>> b = mt.arange(5) + >>> c = mt.arange(6).reshape(2,3) + Trace of a matrix: + >>> mt.einsum('ii', a).execute() + 60 + >>> mt.einsum(a, [0,0]).execute() + 60 + Extract the diagonal (requires explicit form): + >>> mt.einsum('ii->i', a).execute() + array([ 0, 6, 12, 18, 24]) + >>> mt.einsum(a, [0,0], [0]).execute() + array([ 0, 6, 12, 18, 24]) + >>> mt.diag(a).execute() + array([ 0, 6, 12, 18, 24]) + Sum over an axis (requires explicit form): + >>> mt.einsum('ij->i', a).execute() + array([ 10, 35, 60, 85, 110]) + >>> mt.einsum(a, [0,1], [0]).execute() + array([ 10, 35, 60, 85, 110]) + >>> mt.sum(a, axis=1).execute() + array([ 10, 35, 60, 85, 110]) + For higher dimensional arrays summing a single axis can be done with ellipsis: + >>> mt.einsum('...j->...', a).execute() + array([ 10, 35, 60, 85, 110]) + >>> mt.einsum(a, [Ellipsis,1], [Ellipsis]).execute() + array([ 10, 35, 60, 85, 110]) + Compute a matrix transpose, or reorder any number of axes: + >>> mt.einsum('ji', c).execute() + array([[0, 3], + [1, 4], + [2, 5]]) + >>> mt.einsum('ij->ji', c).execute() + array([[0, 3], + [1, 4], + [2, 5]]) + >>> mt.einsum(c, [1,0]).execute() + array([[0, 3], + [1, 4], + [2, 5]]) + >>> mt.transpose(c).execute() + array([[0, 3], + [1, 4], + [2, 5]]) + Vector inner products: + >>> mt.einsum('i,i', b, b).execute() + 30 + >>> mt.einsum(b, [0], b, [0]).execute() + 30 + >>> mt.inner(b,b).execute() + 30 + Matrix vector multiplication: + >>> mt.einsum('ij,j', a, b).execute() + array([ 30, 80, 130, 180, 230]) + >>> mt.einsum(a, [0,1], b, [1]).execute() + array([ 30, 80, 130, 180, 230]) + >>> mt.dot(a, b).execute() + array([ 30, 80, 130, 180, 230]) + >>> mt.einsum('...j,j', a, b).execute() + array([ 30, 80, 130, 180, 230]) + Broadcasting and scalar multiplication: + >>> mt.einsum('..., ...', 3, c).execute() + array([[ 0, 3, 6], + [ 9, 12, 15]]) + >>> mt.einsum(',ij', 3, c).execute() + array([[ 0, 3, 6], + [ 9, 12, 15]]) + >>> mt.einsum(3, [Ellipsis], c, [Ellipsis]).execute() + array([[ 0, 3, 6], + [ 9, 12, 15]]) + >>> mt.multiply(3, c).execute() + array([[ 0, 3, 6], + [ 9, 12, 15]]) + Vector outer product: + >>> mt.einsum('i,j', mt.arange(2)+1, b).execute() + array([[0, 1, 2, 3, 4], + [0, 2, 4, 6, 8]]) + >>> mt.einsum(mt.arange(2)+1, [0], b, [1]).execute() + array([[0, 1, 2, 3, 4], + [0, 2, 4, 6, 8]]) + >>> mt.outer(mt.arange(2)+1, b).execute() + array([[0, 1, 2, 3, 4], + [0, 2, 4, 6, 8]]) + Tensor contraction: + >>> a = mt.arange(60.).reshape(3,4,5) + >>> b = mt.arange(24.).reshape(4,3,2) + >>> mt.einsum('ijk,jil->kl', a, b).execute() + array([[4400., 4730.], + [4532., 4874.], + [4664., 5018.], + [4796., 5162.], + [4928., 5306.]]) + >>> mt.einsum(a, [0,1,2], b, [1,0,3], [2,3]).execute() + array([[4400., 4730.], + [4532., 4874.], + [4664., 5018.], + [4796., 5162.], + [4928., 5306.]]) + >>> mt.tensordot(a,b, axes=([1,0],[0,1])).execute() + array([[4400., 4730.], + [4532., 4874.], + [4664., 5018.], + [4796., 5162.], + [4928., 5306.]]) + Writeable returned arrays (since version 1.10.0): + >>> a = mt.zeros((3, 3)) + >>> mt.einsum('ii->i', a)[:] = 1 + >>> a.execute() + array([[1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]) + Example of ellipsis use: + >>> a = mt.arange(6).reshape((3,2)) + >>> b = mt.arange(12).reshape((4,3)) + >>> mt.einsum('ki,jk->ij', a, b).execute() + array([[10, 28, 46, 64], + [13, 40, 67, 94]]) + >>> mt.einsum('ki,...k->i...', a, b).execute() + array([[10, 28, 46, 64], + [13, 40, 67, 94]]) + >>> 
mt.einsum('k...,jk', a, b).execute() + array([[10, 28, 46, 64], + [13, 40, 67, 94]]) + Chained array operations. For more complicated contractions, speed ups + might be achieved by repeatedly computing a 'greedy' path or pre-computing the + 'optimal' path and repeatedly applying it, using an + `einsum_path` insertion (since version 1.12.0). Performance improvements can be + particularly significant with larger arrays: + >>> a = mt.ones(64).reshape(2,4,8) + Basic `einsum`: ~1520ms (benchmarked on 3.1GHz Intel i5.) + >>> for iteration in range(500): + ... _ = mt.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a) + Sub-optimal `einsum` (due to repeated path calculation time): ~330ms + >>> for iteration in range(500): + ... _ = mt.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='optimal') + Greedy `einsum` (faster optimal path approximation): ~160ms + >>> for iteration in range(500): + ... _ = mt.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='greedy') + Optimal `einsum` (best usage pattern in some use cases): ~110ms + >>> path = mt.einsum_path('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='optimal')[0] + >>> for iteration in range(500): + ... _ = mt.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize=path) + + """ + + all_inputs = [subscripts] + list(operands) + inputs, outputs, operands = parse_einsum_input(all_inputs) + subscripts = "->".join((inputs, outputs)) + axes_shape = dict() + for axes, op in zip(inputs.split(","), operands): + for ax, s in zip(axes, op.shape): + axes_shape[ax] = s + + if optimize: + optimize, _ = einsum_path(*all_inputs, optimize=optimize) + + shape = tuple(axes_shape[ax] for ax in outputs) + op = TensorEinsum( + subscripts=subscripts, + optimize=optimize, + dtype=dtype or operands[0].dtype, + order=order, + casting=casting, + ) + return op(operands, shape) diff --git a/python/xorbits/_mars/tensor/einsum/einsumfunc.py b/python/xorbits/_mars/tensor/einsum/einsumfunc.py new file mode 100644 index 000000000..23988b0c2 --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/einsumfunc.py @@ -0,0 +1,1027 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +from numpy.compat import basestring + +from ..datasource.array import tensor as astensor + +__all__ = ["parse_einsum_input", "einsum_path"] + +einsum_symbols = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +einsum_symbols_set = set(einsum_symbols) + + +def _flop_count(idx_contraction, inner, num_terms, size_dictionary): + """ + Computes the number of FLOPS in the contraction. + + Parameters + ---------- + idx_contraction : iterable + The indices involved in the contraction + inner : bool + Does this contraction require an inner product? + num_terms : int + The number of terms in a contraction + size_dictionary : dict + The size of each of the indices in idx_contraction + + Returns + ------- + flop_count : int + The total number of FLOPS required for the contraction. 
+ + Examples + -------- + + >>> _flop_count('abc', False, 1, {'a': 2, 'b':3, 'c':5}) + 30 + + >>> _flop_count('abc', True, 2, {'a': 2, 'b':3, 'c':5}) + 60 + + """ + + overall_size = _compute_size_by_dict(idx_contraction, size_dictionary) + op_factor = max(1, num_terms - 1) + if inner: + op_factor += 1 + + return overall_size * op_factor + + +def _compute_size_by_dict(indices, idx_dict): + """ + Computes the product of the elements in indices based on the dictionary + idx_dict. + + Parameters + ---------- + indices : iterable + Indices to base the product on. + idx_dict : dictionary + Dictionary of index sizes + + Returns + ------- + ret : int + The resulting product. + + Examples + -------- + >>> _compute_size_by_dict('abbc', {'a': 2, 'b':3, 'c':5}) + 90 + + """ + ret = 1 + for i in indices: + ret *= idx_dict[i] + return ret + + +def _find_contraction(positions, input_sets, output_set): + """ + Finds the contraction for a given set of input and output sets. + + Parameters + ---------- + positions : iterable + Integer positions of terms used in the contraction. + input_sets : list + List of sets that represent the lhs side of the einsum subscript + output_set : set + Set that represents the rhs side of the overall einsum subscript + + Returns + ------- + new_result : set + The indices of the resulting contraction + remaining : list + List of sets that have not been contracted, the new set is appended to + the end of this list + idx_removed : set + Indices removed from the entire contraction + idx_contraction : set + The indices used in the current contraction + + Examples + -------- + + # A simple dot product test case + >>> pos = (0, 1) + >>> isets = [set('ab'), set('bc')] + >>> oset = set('ac') + >>> _find_contraction(pos, isets, oset) + ({'a', 'c'}, [{'a', 'c'}], {'b'}, {'a', 'b', 'c'}) + + # A more complex case with additional terms in the contraction + >>> pos = (0, 2) + >>> isets = [set('abd'), set('ac'), set('bdc')] + >>> oset = set('ac') + >>> _find_contraction(pos, isets, oset) + ({'a', 'c'}, [{'a', 'c'}, {'a', 'c'}], {'b', 'd'}, {'a', 'b', 'c', 'd'}) + """ + + idx_contract = set() + idx_remain = output_set.copy() + remaining = [] + for ind, value in enumerate(input_sets): + if ind in positions: + idx_contract |= value + else: + remaining.append(value) + idx_remain |= value + + new_result = idx_remain & idx_contract + idx_removed = idx_contract - new_result + remaining.append(new_result) + + return (new_result, remaining, idx_removed, idx_contract) + + +def _optimal_path(input_sets, output_set, idx_dict, memory_limit): + """ + Computes all possible pair contractions, sieves the results based + on ``memory_limit`` and returns the lowest cost path. This algorithm + scales factorial with respect to the elements in the list ``input_sets``. + + Parameters + ---------- + input_sets : list + List of sets that represent the lhs side of the einsum subscript + output_set : set + Set that represents the rhs side of the overall einsum subscript + idx_dict : dictionary + Dictionary of index sizes + memory_limit : int + The maximum number of elements in a temporary array + + Returns + ------- + path : list + The optimal contraction order within the memory limit constraint. 
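For intuition, the docstring example below can also be reproduced through NumPy's public `einsum_path` with concrete operands whose sizes match the index sizes used there (a sketch only; the reported path depends on those sizes):

    import numpy as np

    x = np.ones((1, 2, 4))  # subscripts 'abd' with sizes a=1, b=2, d=4
    y = np.ones((1, 3))     # subscripts 'ac'  with sizes a=1, c=3
    z = np.ones((2, 4, 3))  # subscripts 'bdc' with sizes b=2, d=4, c=3

    path, report = np.einsum_path("abd,ac,bdc->", x, y, z, optimize="optimal")
    # For these sizes the exhaustive search contracts operands (0, 2) first and
    # then (0, 1), i.e. path == ['einsum_path', (0, 2), (0, 1)].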
+ + Examples + -------- + >>> isets = [set('abd'), set('ac'), set('bdc')] + >>> oset = set() + >>> idx_sizes = {'a': 1, 'b':2, 'c':3, 'd':4} + >>> _optimal_path(isets, oset, idx_sizes, 5000) + [(0, 2), (0, 1)] + """ + + full_results = [(0, [], input_sets)] + for iteration in range(len(input_sets) - 1): + iter_results = [] + + # Compute all unique pairs + for curr in full_results: + cost, positions, remaining = curr + for con in itertools.combinations(range(len(input_sets) - iteration), 2): + # Find the contraction + cont = _find_contraction(con, remaining, output_set) + new_result, new_input_sets, idx_removed, idx_contract = cont + + # Sieve the results based on memory_limit + new_size = _compute_size_by_dict(new_result, idx_dict) + if new_size > memory_limit: + continue + + # Build (total_cost, positions, indices_remaining) + total_cost = cost + _flop_count( + idx_contract, idx_removed, len(con), idx_dict + ) + new_pos = positions + [con] + iter_results.append((total_cost, new_pos, new_input_sets)) + + # Update combinatorial list, if we did not find anything return best + # path + remaining contractions + if iter_results: + full_results = iter_results + else: + path = min(full_results, key=lambda x: x[0])[1] + path += [tuple(range(len(input_sets) - iteration))] + return path + + # If we have not found anything return single einsum contraction + if len(full_results) == 0: + return [tuple(range(len(input_sets)))] + + path = min(full_results, key=lambda x: x[0])[1] + return path + + +def _parse_possible_contraction( + positions, input_sets, output_set, idx_dict, memory_limit, path_cost, naive_cost +): + """Compute the cost (removed size + flops) and resultant indices for + performing the contraction specified by ``positions``. + + Parameters + ---------- + positions : tuple of int + The locations of the proposed tensors to contract. + input_sets : list of sets + The indices found on each tensors. + output_set : set + The output indices of the expression. + idx_dict : dict + Mapping of each index to its size. + memory_limit : int + The total allowed size for an intermediary tensor. + path_cost : int + The contraction cost so far. + naive_cost : int + The cost of the unoptimized expression. + + Returns + ------- + cost : (int, int) + A tuple containing the size of any indices removed, and the flop cost. + positions : tuple of int + The locations of the proposed tensors to contract. + new_input_sets : list of sets + The resulting new list of indices if this proposed contraction is performed. 
+ + """ + + # Find the contraction + contract = _find_contraction(positions, input_sets, output_set) + idx_result, new_input_sets, idx_removed, idx_contract = contract + + # Sieve the results based on memory_limit + new_size = _compute_size_by_dict(idx_result, idx_dict) + if new_size > memory_limit: + return None + + # Build sort tuple + old_sizes = (_compute_size_by_dict(input_sets[p], idx_dict) for p in positions) + removed_size = sum(old_sizes) - new_size + + # NB: removed_size used to be just the size of any removed indices i.e.: + # helpers.compute_size_by_dict(idx_removed, idx_dict) + cost = _flop_count(idx_contract, idx_removed, len(positions), idx_dict) + sort = (-removed_size, cost) + + # Sieve based on total cost as well + if (path_cost + cost) > naive_cost: + return None + + # Add contraction to possible choices + return [sort, positions, new_input_sets] + + +def _update_other_results(results, best): + """Update the positions and provisional input_sets of ``results`` based on + performing the contraction result ``best``. Remove any involving the tensors + contracted. + + Parameters + ---------- + results : list + List of contraction results produced by ``_parse_possible_contraction``. + best : list + The best contraction of ``results`` i.e. the one that will be performed. + + Returns + ------- + mod_results : list + The list of modified results, updated with outcome of ``best`` contraction. + """ + + best_con = best[1] + bx, by = best_con + mod_results = [] + + for cost, (x, y), con_sets in results: + # Ignore results involving tensors just contracted + if x in best_con or y in best_con: + continue + + # Update the input_sets + del con_sets[by - int(by > x) - int(by > y)] + del con_sets[bx - int(bx > x) - int(bx > y)] + con_sets.insert(-1, best[2][-1]) + + # Update the position indices + mod_con = x - int(x > bx) - int(x > by), y - int(y > bx) - int(y > by) + mod_results.append((cost, mod_con, con_sets)) + + return mod_results + + +def _greedy_path(input_sets, output_set, idx_dict, memory_limit): + """ + Finds the path by contracting the best pair until the input list is + exhausted. The best pair is found by minimizing the tuple + ``(-prod(indices_removed), cost)``. What this amounts to is prioritizing + matrix multiplication or inner product operations, then Hadamard like + operations, and finally outer operations. Outer products are limited by + ``memory_limit``. This algorithm scales cubically with respect to the + number of elements in the list ``input_sets``. + + Parameters + ---------- + input_sets : list + List of sets that represent the lhs side of the einsum subscript + output_set : set + Set that represents the rhs side of the overall einsum subscript + idx_dict : dictionary + Dictionary of index sizes + memory_limit_limit : int + The maximum number of elements in a temporary array + + Returns + ------- + path : list + The greedy contraction order within the memory limit constraint. 
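As a rough worked example of the `sort = (-removed_size, cost)` key built above, computed by hand for inputs 'abd', 'ac', 'bdc' with index sizes a=1, b=2, c=3, d=4 (illustration only, not a call into the helpers): contracting operands (0, 2) frees the most intermediate volume, so a greedy strategy picks it first.

    from math import prod

    sizes = {"a": 1, "b": 2, "c": 3, "d": 4}

    def size(indices):
        return prod(sizes[i] for i in indices)

    # Candidate pair (0, 2): contract 'abd' with 'bdc', leaving 'ac'.
    removed_size = size("abd") + size("bdc") - size("ac")  # 8 + 24 - 3 = 29
    flops = size("abcd") * 2                                # inner contraction of two terms
    sort_key = (-removed_size, flops)                       # (-29, 48); the smallest key wins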
+ + Examples + -------- + >>> isets = [set('abd'), set('ac'), set('bdc')] + >>> oset = set() + >>> idx_sizes = {'a': 1, 'b':2, 'c':3, 'd':4} + >>> _greedy_path(isets, oset, idx_sizes, 5000) + [(0, 2), (0, 1)] + """ + + # Handle trivial cases that leaked through + if len(input_sets) == 1: + return [(0,)] + elif len(input_sets) == 2: + return [(0, 1)] + + # Build up a naive cost + contract = _find_contraction(range(len(input_sets)), input_sets, output_set) + idx_result, new_input_sets, idx_removed, idx_contract = contract + naive_cost = _flop_count(idx_contract, idx_removed, len(input_sets), idx_dict) + + # Initially iterate over all pairs + comb_iter = itertools.combinations(range(len(input_sets)), 2) + known_contractions = [] + + path_cost = 0 + path = [] + + for iteration in range(len(input_sets) - 1): + # Iterate over all pairs on first step, only previously found pairs on subsequent steps + for positions in comb_iter: + # Always initially ignore outer products + if input_sets[positions[0]].isdisjoint(input_sets[positions[1]]): + continue + + result = _parse_possible_contraction( + positions, + input_sets, + output_set, + idx_dict, + memory_limit, + path_cost, + naive_cost, + ) + if result is not None: + known_contractions.append(result) + + # If we do not have a inner contraction, rescan pairs including outer products + if len(known_contractions) == 0: # pragma: no cover + # Then check the outer products + for positions in itertools.combinations(range(len(input_sets)), 2): + result = _parse_possible_contraction( + positions, + input_sets, + output_set, + idx_dict, + memory_limit, + path_cost, + naive_cost, + ) + if result is not None: + known_contractions.append(result) + + # If we still did not find any remaining contractions, default back to einsum like behavior + if len(known_contractions) == 0: + path.append(tuple(range(len(input_sets)))) + break + + # Sort based on first index + best = min(known_contractions, key=lambda x: x[0]) + + # Now propagate as many unused contractions as possible to next iteration + known_contractions = _update_other_results(known_contractions, best) + + # Next iteration only compute contractions with the new tensor + # All other contractions have been accounted for + input_sets = best[2] + new_tensor_pos = len(input_sets) - 1 + comb_iter = ((i, new_tensor_pos) for i in range(new_tensor_pos)) + + # Update path and total cost + path.append(best[1]) + path_cost += best[0][1] + + return path + + +def _can_dot(inputs, result, idx_removed): + """ + Checks if we can use BLAS (np.tensordot) call and its beneficial to do so. + + Parameters + ---------- + inputs : list of str + Specifies the subscripts for summation. + result : str + Resulting summation. + idx_removed : set + Indices that are removed in the summation + + + Returns + ------- + type : bool + Returns true if BLAS should and can be used, else False + + Notes + ----- + If the operations is BLAS level 1 or 2 and is not already aligned + we default back to einsum as the memory movement to copy is more + costly than the operation itself. 
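The point of the check above is that a contraction shaped like a plain matrix product can be handed to a BLAS-backed `tensordot` instead of a generic einsum loop. A small NumPy sketch of the equivalence that makes such a dispatch safe:

    import numpy as np

    a = np.random.rand(3, 4)
    b = np.random.rand(4, 5)

    via_einsum = np.einsum("ij,jk->ik", a, b)
    via_blas = np.tensordot(a, b, axes=([1], [0]))  # GEMM-style contraction over 'j'

    np.testing.assert_allclose(via_einsum, via_blas)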
+ + + Examples + -------- + + # Standard GEMM operation + >>> _can_dot(['ij', 'jk'], 'ik', set('j')) + True + + # Can use the standard BLAS, but requires odd data movement + >>> _can_dot(['ijj', 'jk'], 'ik', set('j')) + False + + # DDOT where the memory is not aligned + >>> _can_dot(['ijk', 'ikj'], '', set('ijk')) + False + + """ + + # All `dot` calls remove indices + if len(idx_removed) == 0: + return False + + # BLAS can only handle two operands + if len(inputs) != 2: + return False + + input_left, input_right = inputs + + for c in set(input_left + input_right): + # can't deal with repeated indices on same input or more than 2 total + nl, nr = input_left.count(c), input_right.count(c) + if (nl > 1) or (nr > 1) or (nl + nr > 2): + return False + + # can't do implicit summation or dimension collapse e.g. + # "ab,bc->c" (implicitly sum over 'a') + # "ab,ca->ca" (take diagonal of 'a') + if nl + nr - 1 == int(c in result): + return False + + # Build a few temporaries + set_left = set(input_left) + set_right = set(input_right) + keep_left = set_left - idx_removed + keep_right = set_right - idx_removed + rs = len(idx_removed) + + # At this point we are a DOT, GEMV, or GEMM operation + + # Handle inner products + + # DDOT with aligned data + if input_left == input_right: + return True + + # DDOT without aligned data (better to use einsum) + if set_left == set_right: + return False + + # Handle the 4 possible (aligned) GEMV or GEMM cases + + # GEMM or GEMV no transpose + if input_left[-rs:] == input_right[:rs]: + return True + + # GEMM or GEMV transpose both + if input_left[:rs] == input_right[-rs:]: + return True + + # GEMM or GEMV transpose right + if input_left[-rs:] == input_right[-rs:]: + return True + + # GEMM or GEMV transpose left + if input_left[:rs] == input_right[:rs]: + return True + + # Einsum is faster than GEMV if we have to copy data + if not keep_left or not keep_right: + return False + + # We are a matrix-matrix product, but we need to copy data + return True + + +def parse_einsum_input(operands): + """ + A reproduction of einsum c side einsum parsing in python. 
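This parser has to reconcile the two calling conventions einsum supports, the subscript string and the interleaved operand/axis-list form, and reduce both to a single subscript string. A quick NumPy illustration of the two spellings it normalizes:

    import numpy as np

    a = np.arange(6).reshape(2, 3)
    b = np.arange(12).reshape(3, 4)

    r1 = np.einsum("ij,jk->ik", a, b)             # subscript-string form
    r2 = np.einsum(a, [0, 1], b, [1, 2], [0, 2])  # interleaved operand/axis-list form

    assert np.array_equal(r1, r2)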
+ + Returns + ------- + input_strings : str + Parsed input strings + output_string : str + Parsed output string + operands : list of array_like + The operands to use in the numpy contraction + + Examples + -------- + The operand list is simplified to reduce printing: + + >>> import mars.tensor as mt + >>> mt.random.seed(123) + >>> a = mt.random.rand(4, 4) + >>> b = mt.random.rand(4, 4, 4) + >>> parse_einsum_input(('...a,...a->...', a, b)) + ('za,xza', 'xz', [a, b]) # may vary + + >>> parse_einsum_input((a, [Ellipsis, 0], b, [Ellipsis, 0])) + ('za,xza', 'xz', [a, b]) # may vary + """ + + if len(operands) == 0: + raise ValueError("No input operands") + + if isinstance(operands[0], basestring): + subscripts = operands[0].replace(" ", "") + operands = [astensor(v) for v in operands[1:]] + + # Ensure all characters are valid + for s in subscripts: + if s in ".,->": + continue + if s not in einsum_symbols: + raise ValueError(f"Character {s} is not a valid symbol.") + + else: # pragma: no cover + tmp_operands = list(operands) + operand_list = [] + subscript_list = [] + for p in range(len(operands) // 2): + operand_list.append(tmp_operands.pop(0)) + subscript_list.append(tmp_operands.pop(0)) + + output_list = tmp_operands[-1] if len(tmp_operands) else None + operands = [astensor(v) for v in operand_list] + subscripts = "" + last = len(subscript_list) - 1 + for num, sub in enumerate(subscript_list): + for s in sub: + if s is Ellipsis: + subscripts += "..." + elif isinstance(s, int): + subscripts += einsum_symbols[s] + else: + raise TypeError( + "For this input type lists must contain " + "either int or Ellipsis" + ) + if num != last: + subscripts += "," + + if output_list is not None: + subscripts += "->" + for s in output_list: + if s is Ellipsis: + subscripts += "..." + elif isinstance(s, int): + subscripts += einsum_symbols[s] + else: + raise TypeError( + "For this input type lists must contain " + "either int or Ellipsis" + ) + # Check for proper "->" + if ("-" in subscripts) or (">" in subscripts): + invalid = (subscripts.count("-") > 1) or (subscripts.count(">") > 1) + if invalid or (subscripts.count("->") != 1): + raise ValueError("Subscripts can only contain one '->'.") + + # Parse ellipses + if "." in subscripts: + used = subscripts.replace(".", "").replace(",", "").replace("->", "") + unused = list(einsum_symbols_set - set(used)) + ellipse_inds = "".join(unused) + longest = 0 + + if "->" in subscripts: + input_tmp, output_sub = subscripts.split("->") + split_subscripts = input_tmp.split(",") + out_sub = True + else: + split_subscripts = subscripts.split(",") + out_sub = False + + for num, sub in enumerate(split_subscripts): + if "." 
in sub: + if (sub.count(".") != 3) or (sub.count("...") != 1): + raise ValueError("Invalid Ellipses.") + + # Take into account numerical values + if operands[num].shape == (): + ellipse_count = 0 + else: + ellipse_count = max(operands[num].ndim, 1) + ellipse_count -= len(sub) - 3 + + if ellipse_count > longest: + longest = ellipse_count + + if ellipse_count < 0: + raise ValueError("Ellipses lengths do not match.") + elif ellipse_count == 0: + split_subscripts[num] = sub.replace("...", "") + else: + rep_inds = ellipse_inds[-ellipse_count:] + split_subscripts[num] = sub.replace("...", rep_inds) + + subscripts = ",".join(split_subscripts) + if longest == 0: + out_ellipse = "" + else: + out_ellipse = ellipse_inds[-longest:] + + if out_sub: + subscripts += "->" + output_sub.replace("...", out_ellipse) + else: + # Special care for outputless ellipses + output_subscript = "" + tmp_subscripts = subscripts.replace(",", "") + for s in sorted(set(tmp_subscripts)): + if s not in (einsum_symbols): + raise ValueError(f"Character {s} is not a valid symbol.") + if tmp_subscripts.count(s) == 1: + output_subscript += s + normal_inds = "".join(sorted(set(output_subscript) - set(out_ellipse))) + + subscripts += "->" + out_ellipse + normal_inds + + # Build output string if does not exist + if "->" in subscripts: + input_subscripts, output_subscript = subscripts.split("->") + else: + input_subscripts = subscripts + # Build output subscripts + tmp_subscripts = subscripts.replace(",", "") + output_subscript = "" + for s in sorted(set(tmp_subscripts)): + if s not in einsum_symbols: + raise ValueError(f"Character {s} is not a valid symbol.") + if tmp_subscripts.count(s) == 1: + output_subscript += s + + # Make sure output subscripts are in the input + for char in output_subscript: + if char not in input_subscripts: + raise ValueError(f"Output character {char} did not appear in the input") + + # Make sure number operands is equivalent to the number of terms + if len(input_subscripts.split(",")) != len(operands): + raise ValueError( + "Number of einsum subscripts must be equal to the number of operands." + ) + + return (input_subscripts, output_subscript, operands) + + +def _einsum_path_dispatcher(*operands, **kwargs): + # NOTE: technically, we should only dispatch on array-like arguments, not + # subscripts (given as strings). But separating operands into + # arrays/subscripts is a little tricky/slow (given einsum's two supported + # signatures), so as a practical shortcut we dispatch on everything. + # Strings will be ignored for dispatching since they don't define + # __array_function__. + return operands + + +def einsum_path(*operands, **kwargs): + """ + einsum_path(subscripts, *operands, optimize='greedy') + + Evaluates the lowest cost contraction order for an einsum expression by + considering the creation of intermediate arrays. + + Parameters + ---------- + subscripts : str + Specifies the subscripts for summation. + *operands : list of array_like + These are the arrays for the operation. + optimize : {bool, list, tuple, 'greedy', 'optimal'} + Choose the type of path. If a tuple is provided, the second argument is + assumed to be the maximum intermediate size created. If only a single + argument is provided the largest input or output array size is used + as a maximum intermediate size. 
+
+ * if a list is given that starts with ``einsum_path``, uses this as the
+ contraction path
+ * if False no optimization is taken
+ * if True defaults to the 'greedy' algorithm
+ * 'optimal' An algorithm that combinatorially explores all possible
+ ways of contracting the listed tensors and chooses the least costly
+ path. Scales exponentially with the number of terms in the
+ contraction.
+ * 'greedy' An algorithm that chooses the best pair contraction
+ at each step. Effectively, this algorithm searches the largest inner,
+ Hadamard, and then outer products at each step. Scales cubically with
+ the number of terms in the contraction. Equivalent to the 'optimal'
+ path for most contractions.
+
+ Default is 'greedy'.
+
+ Returns
+ -------
+ path : list of tuples
+ A list representation of the einsum path.
+ string_repr : str
+ A printable representation of the einsum path.
+
+ Notes
+ -----
+ The resulting path indicates which terms of the input contraction should be
+ contracted first, the result of this contraction is then appended to the
+ end of the contraction list. This list can then be iterated over until all
+ intermediate contractions are complete.
+
+ See Also
+ --------
+ einsum, linalg.multi_dot
+
+ Examples
+ --------
+
+ We can begin with a chain dot example. In this case, it is optimal to
+ contract the ``b`` and ``c`` tensors first as represented by the first
+ element of the path ``(1, 2)``. The resulting tensor is added to the end
+ of the contraction and the remaining contraction ``(0, 1)`` is then
+ completed.
+
+ >>> np.random.seed(123)
+ >>> a = np.random.rand(2, 2)
+ >>> b = np.random.rand(2, 5)
+ >>> c = np.random.rand(5, 2)
+ >>> path_info = np.einsum_path('ij,jk,kl->il', a, b, c, optimize='greedy')
+ >>> print(path_info[0])
+ ['einsum_path', (1, 2), (0, 1)]
+ >>> print(path_info[1])
+ Complete contraction: ij,jk,kl->il # may vary
+ Naive scaling: 4
+ Optimized scaling: 3
+ Naive FLOP count: 1.600e+02
+ Optimized FLOP count: 5.600e+01
+ Theoretical speedup: 2.857
+ Largest intermediate: 4.000e+00 elements
+ -------------------------------------------------------------------------
+ scaling current remaining
+ -------------------------------------------------------------------------
+ 3 kl,jk->jl ij,jl->il
+ 3 jl,ij->il il->il
+
+
+ A more complex index transformation example.
+
+ >>> I = np.random.rand(10, 10, 10, 10)
+ >>> C = np.random.rand(10, 10)
+ >>> path_info = np.einsum_path('ea,fb,abcd,gc,hd->efgh', C, C, I, C, C,
+ ...
optimize='greedy') + + >>> print(path_info[0]) + ['einsum_path', (0, 2), (0, 3), (0, 2), (0, 1)] + >>> print(path_info[1]) + Complete contraction: ea,fb,abcd,gc,hd->efgh # may vary + Naive scaling: 8 + Optimized scaling: 5 + Naive FLOP count: 8.000e+08 + Optimized FLOP count: 8.000e+05 + Theoretical speedup: 1000.000 + Largest intermediate: 1.000e+04 elements + -------------------------------------------------------------------------- + scaling current remaining + -------------------------------------------------------------------------- + 5 abcd,ea->bcde fb,gc,hd,bcde->efgh + 5 bcde,fb->cdef gc,hd,cdef->efgh + 5 cdef,gc->defg hd,defg->efgh + 5 defg,hd->efgh efgh->efgh + """ + + # Make sure all keywords are valid + valid_contract_kwargs = ["optimize", "einsum_call"] + unknown_kwargs = [k for (k, v) in kwargs.items() if k not in valid_contract_kwargs] + if len(unknown_kwargs): + raise TypeError(f"Did not understand the following kwargs: {unknown_kwargs!r}") + + # Figure out what the path really is + path_type = kwargs.pop("optimize", True) + if path_type is True: + path_type = "greedy" + if path_type is None: + path_type = False + + memory_limit = None + + # No optimization or a named path algorithm + if (path_type is False) or isinstance(path_type, basestring): + pass + + # Given an explicit path + elif len(path_type) and (path_type[0] == "einsum_path"): # pragma: no cover + pass + + # Path tuple with memory limit + elif ( + (len(path_type) == 2) + and isinstance(path_type[0], basestring) + and isinstance(path_type[1], (int, float)) + ): # pragma: no cover + memory_limit = int(path_type[1]) + path_type = path_type[0] + + else: # pragma: no cover + raise TypeError(f"Did not understand the path: {path_type}") + + # Hidden option, only einsum should call this + einsum_call_arg = kwargs.pop("einsum_call", False) + + # Python side parsing + input_subscripts, output_subscript, operands = parse_einsum_input(operands) + + # Build a few useful list and sets + input_list = input_subscripts.split(",") + input_sets = [set(x) for x in input_list] + output_set = set(output_subscript) + indices = set(input_subscripts.replace(",", "")) + + # Get length of each unique dimension and ensure all dimensions are correct + dimension_dict = {} + broadcast_indices = [[] for x in range(len(input_list))] + for tnum, term in enumerate(input_list): + sh = operands[tnum].shape + if len(sh) != len(term): + raise ValueError( + "Einstein sum subscript %s does not contain the " + "correct number of indices for operand %d." + % (input_subscripts[tnum], tnum) + ) + for cnum, char in enumerate(term): + dim = sh[cnum] + + # Build out broadcast indices + if dim == 1: + broadcast_indices[tnum].append(char) + + if char in dimension_dict.keys(): + # For broadcasting cases we always want the largest dim size + if dimension_dict[char] == 1: + dimension_dict[char] = dim + elif dim not in (1, dimension_dict[char]): + raise ValueError( + "Size of label '%s' for operand %d (%d) " + "does not match previous terms (%d)." 
+ % (char, tnum, dimension_dict[char], dim) + ) + else: + dimension_dict[char] = dim + + # Convert broadcast inds to sets + broadcast_indices = [set(x) for x in broadcast_indices] + + # Compute size of each input array plus the output array + size_list = [ + _compute_size_by_dict(term, dimension_dict) + for term in input_list + [output_subscript] + ] + max_size = max(size_list) + + if memory_limit is None: + memory_arg = max_size + else: + memory_arg = memory_limit + + # Compute naive cost + # This isn't quite right, need to look into exactly how einsum does this + inner_product = (sum(len(x) for x in input_sets) - len(indices)) > 0 + naive_cost = _flop_count(indices, inner_product, len(input_list), dimension_dict) + + # Compute the path + if (path_type is False) or (len(input_list) in [1, 2]) or (indices == output_set): + # Nothing to be optimized, leave it to einsum + path = [tuple(range(len(input_list)))] + elif path_type == "greedy": + path = _greedy_path(input_sets, output_set, dimension_dict, memory_arg) + elif path_type == "optimal": + path = _optimal_path(input_sets, output_set, dimension_dict, memory_arg) + elif path_type[0] == "einsum_path": # pragma: no cover + path = path_type[1:] + else: # pragma: no cover + raise KeyError("Path name %s not found", path_type) + + cost_list, scale_list, size_list, contraction_list = [], [], [], [] + + # Build contraction tuple (positions, gemm, einsum_str, remaining) + for cnum, contract_inds in enumerate(path): + # Make sure we remove inds from right to left + contract_inds = tuple(sorted(list(contract_inds), reverse=True)) + + contract = _find_contraction(contract_inds, input_sets, output_set) + out_inds, input_sets, idx_removed, idx_contract = contract + + cost = _flop_count( + idx_contract, idx_removed, len(contract_inds), dimension_dict + ) + cost_list.append(cost) + scale_list.append(len(idx_contract)) + size_list.append(_compute_size_by_dict(out_inds, dimension_dict)) + + bcast = set() + tmp_inputs = [] + for x in contract_inds: + tmp_inputs.append(input_list.pop(x)) + bcast |= broadcast_indices.pop(x) + + new_bcast_inds = bcast - idx_removed + + # If we're broadcasting, nix blas + if not len(idx_removed & bcast): + do_blas = _can_dot(tmp_inputs, out_inds, idx_removed) + else: + do_blas = False + + # Last contraction + if (cnum - len(path)) == -1: + idx_result = output_subscript + else: + sort_result = [(dimension_dict[ind], ind) for ind in out_inds] + idx_result = "".join([x[1] for x in sorted(sort_result)]) + + input_list.append(idx_result) + broadcast_indices.append(new_bcast_inds) + einsum_str = ",".join(tmp_inputs) + "->" + idx_result + + contraction = (contract_inds, idx_removed, einsum_str, input_list[:], do_blas) + contraction_list.append(contraction) + + opt_cost = sum(cost_list) + 1 + + if einsum_call_arg: + return (operands, contraction_list) + + # Return the path along with a nice string representation + overall_contraction = input_subscripts + "->" + output_subscript + header = ("scaling", "current", "remaining") + + speedup = naive_cost / opt_cost + max_i = max(size_list) + + path_print = " Complete contraction: %s\n" % overall_contraction # noqa: E221 + path_print += " Naive scaling: %d\n" % len(indices) + path_print += " Optimized scaling: %d\n" % max(scale_list) + path_print += " Naive FLOP count: %.3e\n" % naive_cost + path_print += " Optimized FLOP count: %.3e\n" % opt_cost + path_print += " Theoretical speedup: %3.3f\n" % speedup + path_print += " Largest intermediate: %.3e elements\n" % max_i + path_print += "-" * 74 
+ "\n" + path_print += "%6s %24s %40s\n" % header + path_print += "-" * 74 + + for n, contraction in enumerate(contraction_list): + inds, idx_rm, einsum_str, remaining, blas = contraction + remaining_str = ",".join(remaining) + "->" + output_subscript + path_run = (scale_list[n], einsum_str, remaining_str) + path_print += "\n%4d %24s %40s" % path_run + + path = ["einsum_path"] + path + return path, path_print diff --git a/python/xorbits/_mars/tensor/einsum/tests/__init__.py b/python/xorbits/_mars/tensor/einsum/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/einsum/tests/test_einsum.py b/python/xorbits/_mars/tensor/einsum/tests/test_einsum.py new file mode 100644 index 000000000..271f3d3db --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/tests/test_einsum.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ....core import tile +from ... 
import einsum +from ...datasource import tensor + + +def test_einsum(): + data1 = np.random.rand(3, 4, 5) + data2 = np.random.rand(4, 3, 2) + + t1 = tensor(data1, chunk_size=2) + t2 = tensor(data2, chunk_size=3) + t = einsum("ijk, jil -> kl", t1, t2) + + assert t.shape == (5, 2) + + t = tile(t) + assert len(t.chunks) == 3 + + # multiply(data1, data2) + data1 = np.random.rand(6, 6) + data2 = np.random.rand(6, 6) + t1 = tensor(data1, chunk_size=3) + t2 = tensor(data2, chunk_size=3) + t = einsum("..., ...", t1, t2) + + assert t.shape == (6, 6) + + t = tile(t) + assert len(t.chunks) == 4 + + t = einsum("..., ...", t1, t2, optimize=True) + assert t.op.optimize == ["einsum_path", (0, 1)] + + # test broadcast + data1 = np.random.rand(1, 10, 9) + data2 = np.random.rand(9, 6) + data3 = np.random.rand(10, 6) + data4 = np.random.rand(8) + + t1 = tensor(data1, chunk_size=(1, (5, 5), (3, 3, 3))) + t2 = tensor(data2, chunk_size=((3, 3, 3), (3, 3))) + t3 = tensor(data3, chunk_size=((6, 4), (4, 2))) + t4 = tensor(data4, chunk_size=3) + t = einsum("ajk,kl,jl,a->a", t1, t2, t3, t4, optimize="") + + assert t.shape == (8,) + + t = tile(t) + assert len(t.chunks) == 3 diff --git a/python/xorbits/_mars/tensor/einsum/tests/test_einsum_execution.py b/python/xorbits/_mars/tensor/einsum/tests/test_einsum_execution.py new file mode 100644 index 000000000..675e7f672 --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/tests/test_einsum_execution.py @@ -0,0 +1,85 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import einsum +from ...datasource import tensor + + +def test_einsum_execution(setup): + data1 = np.random.rand(3, 4, 5) + data2 = np.random.rand(4, 3, 2) + + t1 = tensor(data1, chunk_size=2) + t2 = tensor(data2, chunk_size=3) + t = einsum("ijk, jil -> kl", t1, t2) + res = t.execute().fetch() + expected = np.einsum("ijk, jil -> kl", data1, data2) + np.testing.assert_almost_equal(res, expected) + + # dot + t = einsum("ijk, jil", t1, t2, optimize=True) + res = t.execute().fetch() + expected = np.einsum("ijk, jil", data1, data2, optimize=True) + np.testing.assert_almost_equal(res, expected) + + # multiply(data1, data2) + data1 = np.random.rand(6, 6) + data2 = np.random.rand(6, 6) + t1 = tensor(data1, chunk_size=3) + t2 = tensor(data2, chunk_size=3) + t = einsum("..., ...", t1, t2, order="C") + res = t.execute().fetch() + expected = np.einsum("..., ...", data1, data2, order="C") + np.testing.assert_almost_equal(res, expected) + + # sum(data, axis=-1) + data = np.random.rand(10) + t1 = tensor(data, chunk_size=3) + t = einsum("i->", t1, order="F") + res = t.execute().fetch() + expected = np.einsum("i->", data, order="F") + np.testing.assert_almost_equal(res, expected) + + # sum(data, axis=0) + t1 = tensor(data) + t = einsum("...i->...", t1) + res = t.execute().fetch() + expected = np.einsum("...i->...", data) + np.testing.assert_almost_equal(res, expected) + + # test broadcast + data1 = np.random.rand(1, 10, 9) + data2 = np.random.rand(9, 6) + data3 = np.random.rand(10, 6) + data4 = np.random.rand(8) + + t1 = tensor(data1, chunk_size=(1, (5, 5), (3, 3, 3))) + t2 = tensor(data2, chunk_size=((3, 3, 3), (3, 3))) + t3 = tensor(data3, chunk_size=((6, 4), (4, 2))) + t4 = tensor(data4, chunk_size=4) + t = einsum("ajk,kl,jl,a->a", t1, t2, t3, t4, optimize="optimal") + res = t.execute().fetch() + expected = np.einsum( + "ajk,kl,jl,a->a", data1, data2, data3, data4, optimize="optimal" + ) + np.testing.assert_almost_equal(res, expected) + + t = einsum("ajk,kl,jl,a->a", t1, t2, t3, t4, optimize="greedy") + res = t.execute().fetch() + expected = np.einsum( + "ajk,kl,jl,a->a", data1, data2, data3, data4, optimize="greedy" + ) + np.testing.assert_almost_equal(res, expected) diff --git a/python/xorbits/_mars/tensor/fetch/__init__.py b/python/xorbits/_mars/tensor/fetch/__init__.py new file mode 100644 index 000000000..eec1e1702 --- /dev/null +++ b/python/xorbits/_mars/tensor/fetch/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import TensorFetch, TensorFetchShuffle diff --git a/python/xorbits/_mars/tensor/fetch/core.py b/python/xorbits/_mars/tensor/fetch/core.py new file mode 100644 index 000000000..d84a9c7f3 --- /dev/null +++ b/python/xorbits/_mars/tensor/fetch/core.py @@ -0,0 +1,59 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import OutputType, register_fetch_class +from ...core.operand import Fetch, FetchMixin, FetchShuffle +from ...serialization.serializables import DataTypeField +from ..operands import TensorOperandMixin + + +class TensorFetchMixin(TensorOperandMixin, FetchMixin): + __slots__ = () + _output_type_ = OutputType.tensor + + +class TensorFetch(TensorFetchMixin, Fetch): + dtype = DataTypeField("dtype") + + def __init__(self, **kw): + kw.pop("output_types", None) + kw.pop("_output_types", None) + super().__init__(**kw) + + def _new_chunks(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + return super()._new_chunks(inputs, kws=kws, **kw) + + def _new_tileables(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + return super()._new_tileables(inputs, kws=kws, **kw) + + +class TensorFetchShuffle(TensorFetchMixin, FetchShuffle): + _dtype = DataTypeField("dtype") + + def __init__(self, **kw): + kw.pop("output_types", None) + kw.pop("_output_types", None) + super().__init__(**kw) + + @property + def dtype(self): + return getattr(self, "_dtype", None) + + +register_fetch_class(OutputType.tensor, TensorFetch, TensorFetchShuffle) +register_fetch_class(OutputType.scalar, TensorFetch, TensorFetchShuffle) diff --git a/python/xorbits/_mars/tensor/fft/__init__.py b/python/xorbits/_mars/tensor/fft/__init__.py new file mode 100644 index 000000000..b80c1d708 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
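Back in the fetch module above, `register_fetch_class` follows a simple registry pattern: each output type maps to the fetch operand classes used to materialize data of that type. A toy sketch of the idea only; the real registry lives in the core package and its signature may differ:

    # Hypothetical, simplified registry for illustration.
    _fetch_registry = {}

    def register_fetch_class(output_type, fetch_cls, fetch_shuffle_cls):
        _fetch_registry[output_type] = (fetch_cls, fetch_shuffle_cls)

    def get_fetch_class(output_type, shuffle=False):
        fetch_cls, fetch_shuffle_cls = _fetch_registry[output_type]
        return fetch_shuffle_cls if shuffle else fetch_cls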
+ +from .fft import TensorFFT, fft +from .fft2 import TensorFFT2, fft2 +from .fftfreq import TensorFFTFreq, fftfreq +from .fftn import TensorFFTN, fftn +from .fftshift import TensorFFTShift, fftshift +from .hfft import TensorHFFT, hfft +from .ifft import TensorIFFT, ifft +from .ifft2 import TensorIFFT2, ifft2 +from .ifftn import TensorIFFTN, ifftn +from .ifftshift import TensorIFFTShift, ifftshift +from .ihfft import TensorIHFFT, ihfft +from .irfft import TensorIRFFT, irfft +from .irfft2 import TensorIRFFT2, irfft2 +from .irfftn import TensorIRFFTN, irfftn +from .rfft import TensorRFFT, rfft +from .rfft2 import TensorRFFT2, rfft2 +from .rfftfreq import TensorRFFTFreq, rfftfreq +from .rfftn import TensorRFFTN, rfftn diff --git a/python/xorbits/_mars/tensor/fft/core.py b/python/xorbits/_mars/tensor/fft/core.py new file mode 100644 index 000000000..241f55eb8 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/core.py @@ -0,0 +1,300 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +from ...core import recursive_tile +from ...serialization.serializables import ( + FieldTypes, + Int32Field, + Int64Field, + KeyField, + StringField, + TupleField, +) +from ...utils import has_unknown_shape +from ..array_utils import get_array_module +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import decide_chunk_sizes, validate_axis + + +class TensorFFTBaseMixin(TensorOperandMixin): + __slots__ = () + + @classmethod + def _get_shape(cls, op, shape): + raise NotImplementedError + + @classmethod + def _tile_fft(cls, op, axes): + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + + if any(in_tensor.chunk_shape[axis] != 1 for axis in axes): + if has_unknown_shape(in_tensor): + yield + # fft requires only 1 chunk for the specified axis, so we do rechunk first + chunks = { + validate_axis(in_tensor.ndim, axis): in_tensor.shape[axis] + for axis in axes + } + new_chunks = decide_chunk_sizes( + in_tensor.shape, chunks, in_tensor.dtype.itemsize + ) + in_tensor = yield from recursive_tile(in_tensor.rechunk(new_chunks)) + + out_chunks = [] + for c in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk_shape = cls._get_shape(op, c.shape) + out_chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=c.index, order=out_tensor.order + ) + out_chunks.append(out_chunk) + + nsplits = [ + tuple( + c.shape[i] + for c in out_chunks + if all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + for i in range(len(out_chunks[0].shape)) + ] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + out_tensor.shape, + order=out_tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + def __call__(self, a, order=None): + shape = self._get_shape(self, a.shape) + order = a.order if order is None else order + return self.new_tensor([a], shape, order=order) + + +class TensorFFTMixin(TensorFFTBaseMixin): + __slots__ = () + + @classmethod + def tile(cls, op): + return (yield from cls._tile_fft(op, [op.axis])) + 
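The rechunk performed in `_tile_fft` above exists because a discrete Fourier transform along an axis cannot be assembled from independent transforms of pieces of that axis; only once the transformed axis sits in a single chunk can the operation be applied chunk by chunk over the remaining axes. A small NumPy check of that fact:

    import numpy as np

    x = np.arange(8.0)

    full = np.fft.fft(x)
    piecewise = np.concatenate([np.fft.fft(x[:4]), np.fft.fft(x[4:])])

    # Transforming the halves independently does not reproduce the full FFT,
    # which is why the transformed axes are rechunked to a single chunk first.
    assert not np.allclose(full, piecewise)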
+ +class TensorComplexFFTMixin(TensorFFTMixin): + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + if op.n is not None: + new_shape[op.axis] = op.n + return tuple(new_shape) + + +def validate_fft(tensor, axis=-1, norm=None): + validate_axis(tensor.ndim, axis) + if norm is not None and norm not in ("ortho",): + raise ValueError(f'Invalid norm value {norm}, should be None or "ortho"') + + +class TensorFFTNMixin(TensorFFTBaseMixin): + @classmethod + def tile(cls, op): + return (yield from cls._tile_fft(op, op.axes)) + + @staticmethod + def _merge_shape(op, shape): + new_shape = list(shape) + if op.shape is not None: + for ss, axis in zip(op.shape, op.axes): + new_shape[axis] = ss + return new_shape + + +class TensorComplexFFTNMixin(TensorFFTNMixin): + @classmethod + def _get_shape(cls, op, shape): + return tuple(cls._merge_shape(op, shape)) + + +class TensorRealFFTNMixin(TensorFFTNMixin): + @classmethod + def _get_shape(cls, op, shape): + new_shape = cls._merge_shape(op, shape) + new_shape[op.axes[-1]] = new_shape[op.axes[-1]] // 2 + 1 + return tuple(new_shape) + + +class TensorRealIFFTNMixin(TensorFFTNMixin): + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + new_shape[op.axes[-1]] = 2 * (new_shape[op.axes[-1]] - 1) + return tuple(cls._merge_shape(op, new_shape)) + + +def validate_fftn(tensor, s=None, axes=None, norm=None): + if axes is None: + if s is None: + axes = tuple(range(tensor.ndim)) + else: + axes = tuple(range(len(s))) + else: + for axis in axes: + validate_axis(tensor.ndim, axis) + if len(set(axes)) < len(axes): + raise ValueError("Duplicate axes not allowed") + + if norm is not None and norm not in ("ortho",): + raise ValueError(f'Invalid norm value {norm}, should be None or "ortho"') + + return axes + + +class TensorFFTShiftMixin(TensorOperandMixin): + __slots__ = () + + @classmethod + def _is_inverse(cls): + return False + + @classmethod + def _process_axes(cls, x, axes): + if axes is None: + axes = tuple(range(x.ndim)) + elif isinstance(axes, Iterable): + axes = tuple(axes) + else: + axes = (axes,) + + return axes + + @classmethod + def tile(cls, op): + from ..merge import concatenate + + axes = op.axes + in_tensor = op.input + is_inverse = cls._is_inverse() + + if has_unknown_shape(in_tensor): + yield + + x = in_tensor + for axis in axes: + size = in_tensor.shape[axis] + slice_on = (size + 1) // 2 if not is_inverse else size // 2 + slc1 = [slice(None)] * axis + [slice(slice_on)] + slc2 = [slice(None)] * axis + [slice(slice_on, None)] + x = concatenate([x[slc2], x[slc1]], axis=axis) + + x = yield from recursive_tile(x) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, op.outputs[0].shape, chunks=x.chunks, nsplits=x.nsplits + ) + + +class TensorDiscreteFourierTransform(TensorHasInput): + __slots__ = () + + +class TensorBaseFFT(TensorDiscreteFourierTransform): + _input = KeyField("input") + _norm = StringField("norm") + + @property + def norm(self): + return getattr(self, "_norm", None) + + +class TensorBaseSingleDimensionFFT(TensorBaseFFT): + _n = Int64Field("n") + _axis = Int32Field("axis") + + @property + def n(self): + return self._n + + @property + def axis(self): + return self._axis + + @classmethod + def execute(cls, ctx, op): + a = ctx[op.inputs[0].key] + xp = get_array_module(a) + fun = _get_fft_func(op, xp) + res = fun(a, n=op.n, axis=op.axis, norm=op.norm) + if res.dtype != op.dtype: + res = res.astype(op.dtype) + ctx[op.outputs[0].key] = res + + +class TensorBaseMultipleDimensionFFT(TensorBaseFFT): + 
_shape = TupleField("shape", FieldTypes.int64) + _axes = TupleField("axes", FieldTypes.int32) + + @property + def shape(self): + return self._shape + + @property + def axes(self): + return self._axes + + @classmethod + def execute(cls, ctx, op): + a = ctx[op.inputs[0].key] + xp = get_array_module(a) + fun = _get_fft_func(op, xp) + res = fun(a, s=op.shape, axes=op.axes, norm=op.norm) + if res.dtype != op.dtype: + res = res.astype(op.dtype) + ctx[op.outputs[0].key] = res + + +def _get_fft_func(op, xp): + fun_name = type(op).__name__.lower()[6:] # all op starts with tensor + return getattr(xp.fft, fun_name) + + +class TensorStandardFFT(TensorBaseSingleDimensionFFT): + pass + + +class TensorStandardFFTN(TensorBaseMultipleDimensionFFT): + pass + + +class TensorFFTShiftBase(TensorHasInput): + _input = KeyField("input") + _axes = TupleField("axes", FieldTypes.int32) + + @property + def axes(self): + return self._axes + + +class TensorRealFFT(TensorBaseSingleDimensionFFT): + pass + + +class TensorRealFFTN(TensorBaseMultipleDimensionFFT): + pass + + +class TensorHermitianFFT(TensorBaseSingleDimensionFFT): + pass diff --git a/python/xorbits/_mars/tensor/fft/fft.py b/python/xorbits/_mars/tensor/fft/fft.py new file mode 100644 index 000000000..f1a3e9e8d --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/fft.py @@ -0,0 +1,114 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTMixin, TensorStandardFFT, validate_fft + + +class TensorFFT(TensorStandardFFT, TensorComplexFFTMixin): + _op_type_ = OperandDef.FFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + +def fft(a, n=None, axis=-1, norm=None): + """ + Compute the one-dimensional discrete Fourier Transform. + + This function computes the one-dimensional *n*-point discrete Fourier + Transform (DFT) with the efficient Fast Fourier Transform (FFT) + algorithm [CT]. + + Parameters + ---------- + a : array_like + Input tensor, can be complex. + n : int, optional + Length of the transformed axis of the output. + If `n` is smaller than the length of the input, the input is cropped. + If it is larger, the input is padded with zeros. If `n` is not given, + the length of the input along the axis specified by `axis` is used. + axis : int, optional + Axis over which to compute the FFT. If not given, the last axis is + used. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + + Raises + ------ + IndexError + if `axes` is larger than the last axis of `a`. + + See Also + -------- + mt.fft : for definition of the DFT and conventions used. + ifft : The inverse of `fft`. + fft2 : The two-dimensional FFT. 
+ fftn : The *n*-dimensional FFT. + rfftn : The *n*-dimensional FFT of real input. + fftfreq : Frequency bins for given FFT parameters. + + Notes + ----- + FFT (Fast Fourier Transform) refers to a way the discrete Fourier + Transform (DFT) can be calculated efficiently, by using symmetries in the + calculated terms. The symmetry is highest when `n` is a power of 2, and + the transform is therefore most efficient for these sizes. + + The DFT is defined, with the conventions used in this implementation, in + the documentation for the `numpy.fft` module. + + References + ---------- + .. [CT] Cooley, James W., and John W. Tukey, 1965, "An algorithm for the + machine calculation of complex Fourier series," *Math. Comput.* + 19: 297-301. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fft.fft(mt.exp(2j * mt.pi * mt.arange(8) / 8)).execute() + array([-2.33486982e-16+1.14423775e-17j, 8.00000000e+00-6.89018570e-16j, + 2.33486982e-16+2.33486982e-16j, 0.00000000e+00+0.00000000e+00j, + -1.14423775e-17+2.33486982e-16j, 0.00000000e+00+1.99159850e-16j, + 1.14423775e-17+1.14423775e-17j, 0.00000000e+00+0.00000000e+00j]) + + In this example, real input has an FFT which is Hermitian, i.e., symmetric + in the real part and anti-symmetric in the imaginary part, as described in + the `numpy.fft` documentation: + + >>> import matplotlib.pyplot as plt + >>> t = mt.arange(256) + >>> sp = mt.fft.fft(mt.sin(t)) + >>> freq = mt.fft.fftfreq(t.shape[-1]) + >>> plt.plot(freq.execute(), sp.real.execute(), freq.execute(), sp.imag.execute()) + [, ] + >>> plt.show() + + """ + a = astensor(a) + validate_fft(a, axis, norm) + op = TensorFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/fft2.py b/python/xorbits/_mars/tensor/fft/fft2.py new file mode 100644 index 000000000..3f76cb009 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/fft2.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTNMixin, TensorStandardFFTN, validate_fftn + + +class TensorFFT2(TensorStandardFFTN, TensorComplexFFTNMixin): + _op_type_ = OperandDef.FFT2 + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def fft2(a, s=None, axes=(-2, -1), norm=None): + """ + Compute the 2-dimensional discrete Fourier Transform + + This function computes the *n*-dimensional discrete Fourier Transform + over any axes in an *M*-dimensional array by means of the + Fast Fourier Transform (FFT). By default, the transform is computed over + the last two axes of the input array, i.e., a 2-dimensional FFT. + + Parameters + ---------- + a : array_like + Input tensor, can be complex + s : sequence of ints, optional + Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). 
+ This corresponds to ``n`` for ``fft(x, n)``. + Along each axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes : sequence of ints, optional + Axes over which to compute the FFT. If not given, the last two + axes are used. A repeated index in `axes` means the transform over + that axis is performed multiple times. A one-element sequence means + that a one-dimensional FFT is performed. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or the last two axes if `axes` is not given. + + Raises + ------ + ValueError + If `s` and `axes` have different length, or `axes` not given and + ``len(s) != 2``. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + mt.fft : Overall view of discrete Fourier transforms, with definitions + and conventions used. + ifft2 : The inverse two-dimensional FFT. + fft : The one-dimensional FFT. + fftn : The *n*-dimensional FFT. + fftshift : Shifts zero-frequency terms to the center of the array. + For two-dimensional input, swaps first and third quadrants, and second + and fourth quadrants. + + Notes + ----- + `fft2` is just `fftn` with a different default for `axes`. + + The output, analogously to `fft`, contains the term for zero frequency in + the low-order corner of the transformed axes, the positive frequency terms + in the first half of these axes, the term for the Nyquist frequency in the + middle of the axes and the negative frequency terms in the second half of + the axes, in order of decreasingly negative frequency. + + See `fftn` for details and a plotting example, and `mt.fft` for + definitions and conventions used. + + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.mgrid[:5, :5][0] + >>> mt.fft.fft2(a).execute() + array([[ 50.0 +0.j , 0.0 +0.j , 0.0 +0.j , + 0.0 +0.j , 0.0 +0.j ], + [-12.5+17.20477401j, 0.0 +0.j , 0.0 +0.j , + 0.0 +0.j , 0.0 +0.j ], + [-12.5 +4.0614962j , 0.0 +0.j , 0.0 +0.j , + 0.0 +0.j , 0.0 +0.j ], + [-12.5 -4.0614962j , 0.0 +0.j , 0.0 +0.j , + 0.0 +0.j , 0.0 +0.j ], + [-12.5-17.20477401j, 0.0 +0.j , 0.0 +0.j , + 0.0 +0.j , 0.0 +0.j ]]) + + """ + if len(axes) != 2: + raise ValueError("axes length should be 2") + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorFFT2(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/fftfreq.py b/python/xorbits/_mars/tensor/fft/fftfreq.py new file mode 100644 index 000000000..b24d2cddb --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/fftfreq.py @@ -0,0 +1,160 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core import NotSupportTile, recursive_tile +from ...serialization.serializables import Float64Field, Int32Field, KeyField +from ..core import TensorOrder +from ..datasource import arange +from ..operands import TensorHasInput, TensorOperand, TensorOperandMixin + + +class TensorFFTFreq(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.FFTFREQ + + _n = Int32Field("n") + _d = Float64Field("d") + + def __init__(self, n=None, d=None, **kw): + super().__init__(_n=n, _d=d, **kw) + + @property + def n(self): + return self._n + + @property + def d(self): + return self._d + + def __call__(self, chunk_size=None): + shape = (self.n,) + return self.new_tensor( + None, shape, raw_chunk_size=chunk_size, order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + in_tensor = yield from recursive_tile( + arange( + op.n, + gpu=op.gpu, + dtype=op.dtype, + chunks=tensor.extra_params.raw_chunk_size, + ) + ) + + out_chunks = [] + for c in in_tensor.chunks: + chunk_op = TensorFFTFreqChunk(n=op.n, d=op.d, dtype=op.dtype) + out_chunk = chunk_op.new_chunk( + [c], shape=c.shape, index=c.index, order=tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=in_tensor.nsplits, + **tensor.extra_params + ) + + +class TensorFFTFreqChunk(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.FFTFREQ_CHUNK + + _input = KeyField("input") + _n = Int32Field("n") + _d = Float64Field("d") + + def __init__(self, n=None, d=None, dtype=None, **kw): + super().__init__(_n=n, _d=d, dtype=dtype, **kw) + + @property + def n(self): + return self._n + + @property + def d(self): + return self._d + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def tile(cls, op): + raise NotSupportTile( + "FFTFreqChunk is a chunk operand which does not support tile" + ) + + @classmethod + def execute(cls, ctx, op): + n, d = op.n, op.d + x = ctx[op.inputs[0].key].copy() + x[x >= (n + 1) // 2] -= n + x /= n * d + ctx[op.outputs[0].key] = x + + +def fftfreq(n, d=1.0, gpu=None, chunk_size=None): + """ + Return the Discrete Fourier Transform sample frequencies. + + The returned float tensor `f` contains the frequency bin centers in cycles + per unit of the sample spacing (with zero at the start). For instance, if + the sample spacing is in seconds, then the frequency unit is cycles/second. + + Given a window length `n` and a sample spacing `d`:: + + f = [0, 1, ..., n/2-1, -n/2, ..., -1] / (d*n) if n is even + f = [0, 1, ..., (n-1)/2, -(n-1)/2, ..., -1] / (d*n) if n is odd + + Parameters + ---------- + n : int + Window length. + d : scalar, optional + Sample spacing (inverse of the sampling rate). Defaults to 1. + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + f : Tensor + Array of length `n` containing the sample frequencies. + + Examples + -------- + >>> import mars.tensor as mt + + >>> signal = mt.array([-2, 8, 6, 4, 1, 0, 3, 5], dtype=float) + >>> fourier = mt.fft.fft(signal) + >>> n = signal.size + >>> timestep = 0.1 + >>> freq = mt.fft.fftfreq(n, d=timestep) + >>> freq.execute() + array([ 0. , 1.25, 2.5 , 3.75, -5. 
, -3.75, -2.5 , -1.25]) + + """ + n, d = int(n), float(d) + op = TensorFFTFreq(n=n, d=d, dtype=np.dtype(float), gpu=gpu) + return op(chunk_size) diff --git a/python/xorbits/_mars/tensor/fft/fftn.py b/python/xorbits/_mars/tensor/fft/fftn.py new file mode 100644 index 000000000..cf4ce1ec6 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/fftn.py @@ -0,0 +1,125 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTNMixin, TensorStandardFFTN, validate_fftn + + +class TensorFFTN(TensorStandardFFTN, TensorComplexFFTNMixin): + _op_type_ = OperandDef.FFTN + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def fftn(a, s=None, axes=None, norm=None): + """ + Compute the N-dimensional discrete Fourier Transform. + + This function computes the *N*-dimensional discrete Fourier Transform over + any number of axes in an *M*-dimensional tensor by means of the Fast Fourier + Transform (FFT). + + Parameters + ---------- + a : array_like + Input tensor, can be complex. + s : sequence of ints, optional + Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + This corresponds to ``n`` for ``fft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes : sequence of ints, optional + Axes over which to compute the FFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + Repeated indices in `axes` means that the transform over that axis is + performed multiple times. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or by a combination of `s` and `a`, + as explained in the parameters section above. + + Raises + ------ + ValueError + If `s` and `axes` have different length. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + mt.fft : Overall view of discrete Fourier transforms, with definitions + and conventions used. + ifftn : The inverse of `fftn`, the inverse *n*-dimensional FFT. + fft : The one-dimensional FFT, with definitions and conventions used. + rfftn : The *n*-dimensional FFT of real input. + fft2 : The two-dimensional FFT. 
+ fftshift : Shifts zero-frequency terms to centre of tensor + + Notes + ----- + The output, analogously to `fft`, contains the term for zero frequency in + the low-order corner of all axes, the positive frequency terms in the + first half of all axes, the term for the Nyquist frequency in the middle + of all axes and the negative frequency terms in the second half of all + axes, in order of decreasingly negative frequency. + + See `mt.fft` for details, definitions and conventions used. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.mgrid[:3, :3, :3][0] + >>> mt.fft.fftn(a, axes=(1, 2)).execute() + array([[[ 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j]], + [[ 9.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j]], + [[ 18.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j]]]) + >>> mt.fft.fftn(a, (2, 2), axes=(0, 1)).execute() + array([[[ 2.+0.j, 2.+0.j, 2.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j]], + [[-2.+0.j, -2.+0.j, -2.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j]]]) + + >>> import matplotlib.pyplot as plt + >>> [X, Y] = mt.meshgrid(2 * mt.pi * mt.arange(200) / 12, + ... 2 * mt.pi * mt.arange(200) / 34) + >>> S = mt.sin(X) + mt.cos(Y) + mt.random.uniform(0, 1, X.shape) + >>> FS = mt.fft.fftn(S) + >>> plt.imshow(mt.log(mt.abs(mt.fft.fftshift(FS))**2).execute()) + + >>> plt.show() + + """ + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorFFTN(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/fftshift.py b/python/xorbits/_mars/tensor/fft/fftshift.py new file mode 100644 index 000000000..801eb4fa8 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/fftshift.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTShiftBase, TensorFFTShiftMixin + + +class TensorFFTShift(TensorFFTShiftBase, TensorFFTShiftMixin): + _op_type_ = OperandDef.FFTSHIFT + + def __init__(self, axes=None, **kw): + super().__init__(_axes=axes, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, x): + return self.new_tensor([x], x.shape) + + +def fftshift(x, axes=None): + """ + Shift the zero-frequency component to the center of the spectrum. + + This function swaps half-spaces for all axes listed (defaults to all). + Note that ``y[0]`` is the Nyquist component only if ``len(x)`` is even. + + Parameters + ---------- + x : array_like + Input tensor. + axes : int or shape tuple, optional + Axes over which to shift. Default is None, which shifts all axes. + + Returns + ------- + y : Tensor + The shifted tensor. + + See Also + -------- + ifftshift : The inverse of `fftshift`. 
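+
+    Notes
+    -----
+    As an informal sketch (assuming ``mt.roll`` mirrors ``numpy.roll``), for a
+    one-dimensional tensor of length ``n`` the reordering performed by
+    ``fftshift`` is simply a roll by ``n // 2``:
+
+    >>> import mars.tensor as mt
+
+    >>> mt.fft.fftshift(mt.arange(6)).execute()
+    array([3, 4, 5, 0, 1, 2])
+    >>> mt.roll(mt.arange(6), 3).execute()  # same reordering
+    array([3, 4, 5, 0, 1, 2])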
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> freqs = mt.fft.fftfreq(10, 0.1) + >>> freqs.execute() + array([ 0., 1., 2., 3., 4., -5., -4., -3., -2., -1.]) + >>> mt.fft.fftshift(freqs).execute() + array([-5., -4., -3., -2., -1., 0., 1., 2., 3., 4.]) + + Shift the zero-frequency component only along the second axis: + + >>> freqs = mt.fft.fftfreq(9, d=1./9).reshape(3, 3) + >>> freqs.execute() + array([[ 0., 1., 2.], + [ 3., 4., -4.], + [-3., -2., -1.]]) + >>> mt.fft.fftshift(freqs, axes=(1,)).execute() + array([[ 2., 0., 1.], + [-4., 3., 4.], + [-1., -3., -2.]]) + + """ + x = astensor(x) + dtype = np.fft.fftshift(np.empty((1,) * max(1, x.ndim), dtype=x.dtype)).dtype + axes = TensorFFTShift._process_axes(x, axes) + op = TensorFFTShift(axes=axes, dtype=dtype) + return op(x) diff --git a/python/xorbits/_mars/tensor/fft/hfft.py b/python/xorbits/_mars/tensor/fft/hfft.py new file mode 100644 index 000000000..6e3b071cd --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/hfft.py @@ -0,0 +1,115 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTMixin, TensorHermitianFFT, validate_fft + + +class TensorHFFT(TensorHermitianFFT, TensorFFTMixin): + _op_type_ = OperandDef.HFFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + if op.n is not None: + new_shape[op.axis] = op.n + else: + new_shape[op.axis] = 2 * (shape[op.axis] - 1) + return tuple(new_shape) + + +def hfft(a, n=None, axis=-1, norm=None): + """ + Compute the FFT of a signal that has Hermitian symmetry, i.e., a real + spectrum. + + Parameters + ---------- + a : array_like + The input tensor. + n : int, optional + Length of the transformed axis of the output. For `n` output + points, ``n//2 + 1`` input points are necessary. If the input is + longer than this, it is cropped. If it is shorter than this, it is + padded with zeros. If `n` is not given, it is determined from the + length of the input along the axis specified by `axis`. + axis : int, optional + Axis over which to compute the FFT. If not given, the last + axis is used. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + The length of the transformed axis is `n`, or, if `n` is not given, + ``2*m - 2`` where ``m`` is the length of the transformed axis of + the input. To get an odd number of output points, `n` must be + specified, for instance as ``2*m - 1`` in the typical case, + + Raises + ------ + IndexError + If `axis` is larger than the last axis of `a`. + + See also + -------- + rfft : Compute the one-dimensional FFT for real input. 
+ ihfft : The inverse of `hfft`. + + Notes + ----- + `hfft`/`ihfft` are a pair analogous to `rfft`/`irfft`, but for the + opposite case: here the signal has Hermitian symmetry in the time + domain and is real in the frequency domain. So here it's `hfft` for + which you must supply the length of the result if it is to be odd. + + * even: ``ihfft(hfft(a, 2*len(a) - 2) == a``, within roundoff error, + * odd: ``ihfft(hfft(a, 2*len(a) - 1) == a``, within roundoff error. + + Examples + -------- + >>> import mars.tensor as mt + + >>> signal = mt.array([1, 2, 3, 4, 3, 2]) + >>> mt.fft.fft(signal).execute() + array([ 15.+0.j, -4.+0.j, 0.+0.j, -1.-0.j, 0.+0.j, -4.+0.j]) + >>> mt.fft.hfft(signal[:4]).execute() # Input first half of signal + array([ 15., -4., 0., -1., 0., -4.]) + >>> mt.fft.hfft(signal, 6).execute() # Input entire signal and truncate + array([ 15., -4., 0., -1., 0., -4.]) + + + >>> signal = mt.array([[1, 1.j], [-1.j, 2]]) + >>> (mt.conj(signal.T) - signal).execute() # check Hermitian symmetry + array([[ 0.-0.j, 0.+0.j], + [ 0.+0.j, 0.-0.j]]) + >>> freq_spectrum = mt.fft.hfft(signal) + >>> freq_spectrum.execute() + array([[ 1., 1.], + [ 2., -2.]]) + + """ + a = astensor(a) + validate_fft(a, axis=axis, norm=norm) + op = TensorHFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.float_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/ifft.py b/python/xorbits/_mars/tensor/fft/ifft.py new file mode 100644 index 000000000..7256f6a72 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/ifft.py @@ -0,0 +1,116 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTMixin, TensorStandardFFT, validate_fft + + +class TensorIFFT(TensorStandardFFT, TensorComplexFFTMixin): + _op_type_ = OperandDef.IFFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + +def ifft(a, n=None, axis=-1, norm=None): + """ + Compute the one-dimensional inverse discrete Fourier Transform. + + This function computes the inverse of the one-dimensional *n*-point + discrete Fourier transform computed by `fft`. In other words, + ``ifft(fft(a)) == a`` to within numerical accuracy. + For a general description of the algorithm and definitions, + see `mt.fft`. + + The input should be ordered in the same way as is returned by `fft`, + i.e., + + * ``a[0]`` should contain the zero frequency term, + * ``a[1:n//2]`` should contain the positive-frequency terms, + * ``a[n//2 + 1:]`` should contain the negative-frequency terms, in + increasing order starting from the most negative frequency. + + For an even number of input points, ``A[n//2]`` represents the sum of + the values at the positive and negative Nyquist frequencies, as the two + are aliased together. See `numpy.fft` for details. + + Parameters + ---------- + a : array_like + Input tensor, can be complex. 
+ n : int, optional + Length of the transformed axis of the output. + If `n` is smaller than the length of the input, the input is cropped. + If it is larger, the input is padded with zeros. If `n` is not given, + the length of the input along the axis specified by `axis` is used. + See notes about padding issues. + axis : int, optional + Axis over which to compute the inverse DFT. If not given, the last + axis is used. + norm : {None, "ortho"}, optional + Normalization mode (see `numpy.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + + Raises + ------ + IndexError + If `axes` is larger than the last axis of `a`. + + See Also + -------- + mt.fft : An introduction, with definitions and general explanations. + fft : The one-dimensional (forward) FFT, of which `ifft` is the inverse + ifft2 : The two-dimensional inverse FFT. + ifftn : The n-dimensional inverse FFT. + + Notes + ----- + If the input parameter `n` is larger than the size of the input, the input + is padded by appending zeros at the end. Even though this is the common + approach, it might lead to surprising results. If a different padding is + desired, it must be performed before calling `ifft`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fft.ifft([0, 4, 0, 0]).execute() + array([ 1.+0.j, 0.+1.j, -1.+0.j, 0.-1.j]) + + Create and plot a band-limited signal with random phases: + + >>> import matplotlib.pyplot as plt + >>> t = mt.arange(400) + >>> n = mt.zeros((400,), dtype=complex) + >>> n[40:60] = mt.exp(1j*mt.random.uniform(0, 2*mt.pi, (20,))) + >>> s = mt.fft.ifft(n) + >>> plt.plot(t.execute(), s.real.execute(), 'b-', t.execute(), s.imag.execute(), 'r--') + ... + >>> plt.legend(('real', 'imaginary')) + ... + >>> plt.show() + + """ + a = astensor(a) + validate_fft(a, axis, norm) + op = TensorIFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/ifft2.py b/python/xorbits/_mars/tensor/fft/ifft2.py new file mode 100644 index 000000000..37ce72d9f --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/ifft2.py @@ -0,0 +1,117 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTNMixin, TensorStandardFFTN, validate_fftn + + +class TensorIFFT2(TensorStandardFFTN, TensorComplexFFTNMixin): + _op_type_ = OperandDef.IFFT2 + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def ifft2(a, s=None, axes=(-2, -1), norm=None): + """ + Compute the 2-dimensional inverse discrete Fourier Transform. 
+ + This function computes the inverse of the 2-dimensional discrete Fourier + Transform over any number of axes in an M-dimensional array by means of + the Fast Fourier Transform (FFT). In other words, ``ifft2(fft2(a)) == a`` + to within numerical accuracy. By default, the inverse transform is + computed over the last two axes of the input array. + + The input, analogously to `ifft`, should be ordered in the same way as is + returned by `fft2`, i.e. it should have the term for zero frequency + in the low-order corner of the two axes, the positive frequency terms in + the first half of these axes, the term for the Nyquist frequency in the + middle of the axes and the negative frequency terms in the second half of + both axes, in order of decreasingly negative frequency. + + Parameters + ---------- + a : array_like + Input tensor, can be complex. + s : sequence of ints, optional + Shape (length of each axis) of the output (``s[0]`` refers to axis 0, + ``s[1]`` to axis 1, etc.). This corresponds to `n` for ``ifft(x, n)``. + Along each axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. See notes for issue on `ifft` zero padding. + axes : sequence of ints, optional + Axes over which to compute the FFT. If not given, the last two + axes are used. A repeated index in `axes` means the transform over + that axis is performed multiple times. A one-element sequence means + that a one-dimensional FFT is performed. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or the last two axes if `axes` is not given. + + Raises + ------ + ValueError + If `s` and `axes` have different length, or `axes` not given and + ``len(s) != 2``. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + mt.fft : Overall view of discrete Fourier transforms, with definitions + and conventions used. + fft2 : The forward 2-dimensional FFT, of which `ifft2` is the inverse. + ifftn : The inverse of the *n*-dimensional FFT. + fft : The one-dimensional FFT. + ifft : The one-dimensional inverse FFT. + + Notes + ----- + `ifft2` is just `ifftn` with a different default for `axes`. + + See `ifftn` for details and a plotting example, and `numpy.fft` for + definition and conventions used. + + Zero-padding, analogously with `ifft`, is performed by appending zeros to + the input along the specified dimension. Although this is the common + approach, it might lead to surprising results. If another form of zero + padding is desired, it must be performed before `ifft2` is called. 
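+
+    As a rough sanity-check sketch of the inverse relationship stated above
+    (``mt.random.rand`` and ``mt.allclose`` are assumed here to mirror their
+    NumPy counterparts):
+
+    >>> import mars.tensor as mt
+
+    >>> a = mt.random.rand(4, 4)
+    >>> mt.allclose(mt.fft.ifft2(mt.fft.fft2(a)), a).execute()
+    True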
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = 4 * mt.eye(4) + >>> mt.fft.ifft2(a).execute() + array([[ 1.+0.j, 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j, 1.+0.j], + [ 0.+0.j, 0.+0.j, 1.+0.j, 0.+0.j], + [ 0.+0.j, 1.+0.j, 0.+0.j, 0.+0.j]]) + + """ + if len(axes) != 2: + raise ValueError("axes length should be 2") + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorIFFT2(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/ifftn.py b/python/xorbits/_mars/tensor/fft/ifftn.py new file mode 100644 index 000000000..4c563d5ca --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/ifftn.py @@ -0,0 +1,125 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTNMixin, TensorStandardFFTN, validate_fftn + + +class TensorIFFTN(TensorStandardFFTN, TensorComplexFFTNMixin): + _op_type_ = OperandDef.IFFTN + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def ifftn(a, s=None, axes=None, norm=None): + """ + Compute the N-dimensional inverse discrete Fourier Transform. + + This function computes the inverse of the N-dimensional discrete + Fourier Transform over any number of axes in an M-dimensional tensor by + means of the Fast Fourier Transform (FFT). In other words, + ``ifftn(fftn(a)) == a`` to within numerical accuracy. + For a description of the definitions and conventions used, see `mt.fft`. + + The input, analogously to `ifft`, should be ordered in the same way as is + returned by `fftn`, i.e. it should have the term for zero frequency + in all axes in the low-order corner, the positive frequency terms in the + first half of all axes, the term for the Nyquist frequency in the middle + of all axes and the negative frequency terms in the second half of all + axes, in order of decreasingly negative frequency. + + Parameters + ---------- + a : array_like + Input tensor, can be complex. + s : sequence of ints, optional + Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + This corresponds to ``n`` for ``ifft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. See notes for issue on `ifft` zero padding. + axes : sequence of ints, optional + Axes over which to compute the IFFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + Repeated indices in `axes` means that the inverse transform over that + axis is performed multiple times. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. 
+ + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or by a combination of `s` or `a`, + as explained in the parameters section above. + + Raises + ------ + ValueError + If `s` and `axes` have different length. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + mt.fft : Overall view of discrete Fourier transforms, with definitions + and conventions used. + fftn : The forward *n*-dimensional FFT, of which `ifftn` is the inverse. + ifft : The one-dimensional inverse FFT. + ifft2 : The two-dimensional inverse FFT. + ifftshift : Undoes `fftshift`, shifts zero-frequency terms to beginning + of tensor. + + Notes + ----- + See `mt.fft` for definitions and conventions used. + + Zero-padding, analogously with `ifft`, is performed by appending zeros to + the input along the specified dimension. Although this is the common + approach, it might lead to surprising results. If another form of zero + padding is desired, it must be performed before `ifftn` is called. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.eye(4) + >>> mt.fft.ifftn(mt.fft.fftn(a, axes=(0,)), axes=(1,)).execute() + array([[ 1.+0.j, 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 1.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 1.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j, 1.+0.j]]) + + + Create and plot an image with band-limited frequency content: + + >>> import matplotlib.pyplot as plt + >>> n = mt.zeros((200,200), dtype=complex) + >>> n[60:80, 20:40] = mt.exp(1j*mt.random.uniform(0, 2*mt.pi, (20, 20))) + >>> im = mt.fft.ifftn(n).real + >>> plt.imshow(im.execute()) + + >>> plt.show() + + """ + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorIFFTN(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/ifftshift.py b/python/xorbits/_mars/tensor/fft/ifftshift.py new file mode 100644 index 000000000..e83943aa9 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/ifftshift.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTShiftBase, TensorFFTShiftMixin + + +class TensorIFFTShift(TensorFFTShiftBase, TensorFFTShiftMixin): + _op_type_ = OperandDef.IFFTSHIFT + + def __init__(self, axes=None, **kw): + super().__init__(_axes=axes, **kw) + + @classmethod + def _is_inverse(cls): + return True + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, x): + return self.new_tensor([x], x.shape) + + +def ifftshift(x, axes=None): + """ + The inverse of `fftshift`. Although identical for even-length `x`, the + functions differ by one sample for odd-length `x`. + + Parameters + ---------- + x : array_like + Input tensor. 
+ axes : int or shape tuple, optional + Axes over which to calculate. Defaults to None, which shifts all axes. + + Returns + ------- + y : Tensor + The shifted tensor. + + See Also + -------- + fftshift : Shift zero-frequency component to the center of the spectrum. + + Examples + -------- + >>> import mars.tensor as mt + + >>> freqs = mt.fft.fftfreq(9, d=1./9).reshape(3, 3) + >>> freqs.execute() + array([[ 0., 1., 2.], + [ 3., 4., -4.], + [-3., -2., -1.]]) + >>> mt.fft.ifftshift(mt.fft.fftshift(freqs)).execute() + array([[ 0., 1., 2.], + [ 3., 4., -4.], + [-3., -2., -1.]]) + + """ + x = astensor(x) + dtype = np.fft.ifftshift(np.empty((1,) * max(1, x.ndim), dtype=x.dtype)).dtype + axes = TensorIFFTShift._process_axes(x, axes) + op = TensorIFFTShift(axes=axes, dtype=dtype) + return op(x) diff --git a/python/xorbits/_mars/tensor/fft/ihfft.py b/python/xorbits/_mars/tensor/fft/ihfft.py new file mode 100644 index 000000000..bc4b1c150 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/ihfft.py @@ -0,0 +1,95 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTMixin, TensorHermitianFFT, validate_fft + + +class TensorIHFFT(TensorHermitianFFT, TensorFFTMixin): + _op_type_ = OperandDef.IHFFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + shape = op.n if op.n is not None else shape[op.axis] + if shape % 2 == 0: + shape = (shape // 2) + 1 + else: + shape = (shape + 1) // 2 + new_shape[op.axis] = shape + return tuple(new_shape) + + +def ihfft(a, n=None, axis=-1, norm=None): + """ + Compute the inverse FFT of a signal that has Hermitian symmetry. + + Parameters + ---------- + a : array_like + Input tensor. + n : int, optional + Length of the inverse FFT, the number of points along + transformation axis in the input to use. If `n` is smaller than + the length of the input, the input is cropped. If it is larger, + the input is padded with zeros. If `n` is not given, the length of + the input along the axis specified by `axis` is used. + axis : int, optional + Axis over which to compute the inverse FFT. If not given, the last + axis is used. + norm : {None, "ortho"}, optional + Normalization mode (see `numpy.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + The length of the transformed axis is ``n//2 + 1``. + + See also + -------- + hfft, irfft + + Notes + ----- + `hfft`/`ihfft` are a pair analogous to `rfft`/`irfft`, but for the + opposite case: here the signal has Hermitian symmetry in the time + domain and is real in the frequency domain. 
So here it's `hfft` for + which you must supply the length of the result if it is to be odd: + + * even: ``ihfft(hfft(a, 2*len(a) - 2) == a``, within roundoff error, + * odd: ``ihfft(hfft(a, 2*len(a) - 1) == a``, within roundoff error. + + Examples + -------- + >>> import mars.tensor as mt + + >>> spectrum = mt.array([ 15, -4, 0, -1, 0, -4]) + >>> mt.fft.ifft(spectrum).execute() + array([ 1.+0.j, 2.-0.j, 3.+0.j, 4.+0.j, 3.+0.j, 2.-0.j]) + >>> mt.fft.ihfft(spectrum).execute() + array([ 1.-0.j, 2.-0.j, 3.-0.j, 4.-0.j]) + + """ + a = astensor(a) + validate_fft(a, axis=axis, norm=norm) + op = TensorIHFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/irfft.py b/python/xorbits/_mars/tensor/fft/irfft.py new file mode 100644 index 000000000..ece39e164 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/irfft.py @@ -0,0 +1,121 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTMixin, TensorRealFFT, validate_fft + + +class TensorIRFFT(TensorRealFFT, TensorFFTMixin): + _op_type_ = OperandDef.IRFFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + if op.n is not None: + new_shape[op.axis] = op.n + else: + new_shape[op.axis] = 2 * (new_shape[op.axis] - 1) + return tuple(new_shape) + + +def irfft(a, n=None, axis=-1, norm=None): + """ + Compute the inverse of the n-point DFT for real input. + + This function computes the inverse of the one-dimensional *n*-point + discrete Fourier Transform of real input computed by `rfft`. + In other words, ``irfft(rfft(a), len(a)) == a`` to within numerical + accuracy. (See Notes below for why ``len(a)`` is necessary here.) + + The input is expected to be in the form returned by `rfft`, i.e. the + real zero-frequency term followed by the complex positive frequency terms + in order of increasing frequency. Since the discrete Fourier Transform of + real input is Hermitian-symmetric, the negative frequency terms are taken + to be the complex conjugates of the corresponding positive frequency terms. + + Parameters + ---------- + a : array_like + The input tensor. + n : int, optional + Length of the transformed axis of the output. + For `n` output points, ``n//2+1`` input points are necessary. If the + input is longer than this, it is cropped. If it is shorter than this, + it is padded with zeros. If `n` is not given, it is determined from + the length of the input along the axis specified by `axis`. + axis : int, optional + Axis over which to compute the inverse FFT. If not given, the last + axis is used. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. 
+ + Returns + ------- + out : Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + The length of the transformed axis is `n`, or, if `n` is not given, + ``2*(m-1)`` where ``m`` is the length of the transformed axis of the + input. To get an odd number of output points, `n` must be specified. + + Raises + ------ + IndexError + If `axis` is larger than the last axis of `a`. + + See Also + -------- + mt.fft : For definition of the DFT and conventions used. + rfft : The one-dimensional FFT of real input, of which `irfft` is inverse. + fft : The one-dimensional FFT. + irfft2 : The inverse of the two-dimensional FFT of real input. + irfftn : The inverse of the *n*-dimensional FFT of real input. + + Notes + ----- + Returns the real valued `n`-point inverse discrete Fourier transform + of `a`, where `a` contains the non-negative frequency terms of a + Hermitian-symmetric sequence. `n` is the length of the result, not the + input. + + If you specify an `n` such that `a` must be zero-padded or truncated, the + extra/removed values will be added/removed at high frequencies. One can + thus resample a series to `m` points via Fourier interpolation by: + ``a_resamp = irfft(rfft(a), m)``. + + Examples + -------- + >>> import mars.tenosr as mt + + >>> mt.fft.ifft([1, -1j, -1, 1j]).execute() + array([ 0.+0.j, 1.+0.j, 0.+0.j, 0.+0.j]) + >>> mt.fft.irfft([1, -1j, -1]).execute() + array([ 0., 1., 0., 0.]) + + Notice how the last term in the input to the ordinary `ifft` is the + complex conjugate of the second term, and the output has zero imaginary + part everywhere. When calling `irfft`, the negative frequencies are not + specified, and the output array is purely real. + + """ + a = astensor(a) + validate_fft(a, axis=axis, norm=norm) + op = TensorIRFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.float_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/irfft2.py b/python/xorbits/_mars/tensor/fft/irfft2.py new file mode 100644 index 000000000..0a975605b --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/irfft2.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorRealFFTN, TensorRealIFFTNMixin, validate_fftn + + +class TensorIRFFT2(TensorRealFFTN, TensorRealIFFTNMixin): + _op_type_ = OperandDef.IRFFT2 + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def irfft2(a, s=None, axes=(-2, -1), norm=None): + """ + Compute the 2-dimensional inverse FFT of a real array. + + Parameters + ---------- + a : array_like + The input tensor + s : sequence of ints, optional + Shape of the inverse FFT. + axes : sequence of ints, optional + The axes over which to compute the inverse fft. + Default is the last two axes. 
+ norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : Tensor + The result of the inverse real 2-D FFT. + + See Also + -------- + irfftn : Compute the inverse of the N-dimensional FFT of real input. + + Notes + ----- + This is really `irfftn` with different defaults. + For more details see `irfftn`. + + """ + if len(axes) != 2: + raise ValueError("axes length should be 2") + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorIRFFT2(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.float_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/irfftn.py b/python/xorbits/_mars/tensor/fft/irfftn.py new file mode 100644 index 000000000..ea6cf21be --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/irfftn.py @@ -0,0 +1,117 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorRealFFTN, TensorRealIFFTNMixin, validate_fftn + + +class TensorIRFFTN(TensorRealFFTN, TensorRealIFFTNMixin): + _op_type_ = OperandDef.IRFFTN + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def irfftn(a, s=None, axes=None, norm=None): + """ + Compute the inverse of the N-dimensional FFT of real input. + + This function computes the inverse of the N-dimensional discrete + Fourier Transform for real input over any number of axes in an + M-dimensional tensor by means of the Fast Fourier Transform (FFT). In + other words, ``irfftn(rfftn(a), a.shape) == a`` to within numerical + accuracy. (The ``a.shape`` is necessary like ``len(a)`` is for `irfft`, + and for the same reason.) + + The input should be ordered in the same way as is returned by `rfftn`, + i.e. as for `irfft` for the final transformation axis, and as for `ifftn` + along all the other axes. + + Parameters + ---------- + a : array_like + Input tensor. + s : sequence of ints, optional + Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the + number of input points used along this axis, except for the last axis, + where ``s[-1]//2+1`` points of the input are used. + Along any axis, if the shape indicated by `s` is smaller than that of + the input, the input is cropped. If it is larger, the input is padded + with zeros. If `s` is not given, the shape of the input along the + axes specified by `axes` is used. + axes : sequence of ints, optional + Axes over which to compute the inverse FFT. If not given, the last + `len(s)` axes are used, or all axes if `s` is also not specified. + Repeated indices in `axes` means that the inverse transform over that + axis is performed multiple times. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. 
+ + Returns + ------- + out : Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or by a combination of `s` or `a`, + as explained in the parameters section above. + The length of each transformed axis is as given by the corresponding + element of `s`, or the length of the input in every axis except for the + last one if `s` is not given. In the final transformed axis the length + of the output when `s` is not given is ``2*(m-1)`` where ``m`` is the + length of the final transformed axis of the input. To get an odd + number of output points in the final axis, `s` must be specified. + + Raises + ------ + ValueError + If `s` and `axes` have different length. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + rfftn : The forward n-dimensional FFT of real input, + of which `ifftn` is the inverse. + fft : The one-dimensional FFT, with definitions and conventions used. + irfft : The inverse of the one-dimensional FFT of real input. + irfft2 : The inverse of the two-dimensional FFT of real input. + + Notes + ----- + See `fft` for definitions and conventions used. + + See `rfft` for definitions and conventions used for real input. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.zeros((3, 2, 2)) + >>> a[0, 0, 0] = 3 * 2 * 2 + >>> mt.fft.irfftn(a).execute() + array([[[ 1., 1.], + [ 1., 1.]], + [[ 1., 1.], + [ 1., 1.]], + [[ 1., 1.], + [ 1., 1.]]]) + + """ + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorIRFFTN(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.float_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/rfft.py b/python/xorbits/_mars/tensor/fft/rfft.py new file mode 100644 index 000000000..110c77966 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/rfft.py @@ -0,0 +1,118 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTMixin, TensorRealFFT, validate_fft + + +class TensorRFFT(TensorRealFFT, TensorFFTMixin): + _op_type_ = OperandDef.RFFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + if op.n is not None: + new_shape[op.axis] = op.n + new_shape[op.axis] = new_shape[op.axis] // 2 + 1 + return tuple(new_shape) + + +def rfft(a, n=None, axis=-1, norm=None): + """ + Compute the one-dimensional discrete Fourier Transform for real input. + + This function computes the one-dimensional *n*-point discrete Fourier + Transform (DFT) of a real-valued array by means of an efficient algorithm + called the Fast Fourier Transform (FFT). + + Parameters + ---------- + a : array_like + Input tensor + n : int, optional + Number of points along transformation axis in the input to use. 
+ If `n` is smaller than the length of the input, the input is cropped. + If it is larger, the input is padded with zeros. If `n` is not given, + the length of the input along the axis specified by `axis` is used. + axis : int, optional + Axis over which to compute the FFT. If not given, the last axis is + used. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + If `n` is even, the length of the transformed axis is ``(n/2)+1``. + If `n` is odd, the length is ``(n+1)/2``. + + Raises + ------ + IndexError + If `axis` is larger than the last axis of `a`. + + See Also + -------- + mt.fft : For definition of the DFT and conventions used. + irfft : The inverse of `rfft`. + fft : The one-dimensional FFT of general (complex) input. + fftn : The *n*-dimensional FFT. + rfftn : The *n*-dimensional FFT of real input. + + Notes + ----- + When the DFT is computed for purely real input, the output is + Hermitian-symmetric, i.e. the negative frequency terms are just the complex + conjugates of the corresponding positive-frequency terms, and the + negative-frequency terms are therefore redundant. This function does not + compute the negative frequency terms, and the length of the transformed + axis of the output is therefore ``n//2 + 1``. + + When ``A = rfft(a)`` and fs is the sampling frequency, ``A[0]`` contains + the zero-frequency term 0*fs, which is real due to Hermitian symmetry. + + If `n` is even, ``A[-1]`` contains the term representing both positive + and negative Nyquist frequency (+fs/2 and -fs/2), and must also be purely + real. If `n` is odd, there is no term at fs/2; ``A[-1]`` contains + the largest positive frequency (fs/2*(n-1)/n), and is complex in the + general case. + + If the input `a` contains an imaginary part, it is silently discarded. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fft.fft([0, 1, 0, 0]).execute() + array([ 1.+0.j, 0.-1.j, -1.+0.j, 0.+1.j]) + >>> mt.fft.rfft([0, 1, 0, 0]).execute() + array([ 1.+0.j, 0.-1.j, -1.+0.j]) + + Notice how the final element of the `fft` output is the complex conjugate + of the second element, for real input. For `rfft`, this symmetry is + exploited to compute only the non-negative frequency terms. + + """ + a = astensor(a) + validate_fft(a, axis=axis, norm=norm) + op = TensorRFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/rfft2.py b/python/xorbits/_mars/tensor/fft/rfft2.py new file mode 100644 index 000000000..8a47517d5 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/rfft2.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorRealFFTN, TensorRealFFTNMixin, validate_fftn + + +class TensorRFFT2(TensorRealFFTN, TensorRealFFTNMixin): + _op_type_ = OperandDef.RFFT2 + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def rfft2(a, s=None, axes=(-2, -1), norm=None): + """ + Compute the 2-dimensional FFT of a real tensor. + + Parameters + ---------- + a : array_like + Input tensor, taken to be real. + s : sequence of ints, optional + Shape of the FFT. + axes : sequence of ints, optional + Axes over which to compute the FFT. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : Tensor + The result of the real 2-D FFT. + + See Also + -------- + rfftn : Compute the N-dimensional discrete Fourier Transform for real + input. + + Notes + ----- + This is really just `rfftn` with different default behavior. + For more details see `rfftn`. + + """ + if len(axes) != 2: + raise ValueError("axes length should be 2") + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorRFFT2(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/rfftfreq.py b/python/xorbits/_mars/tensor/fft/rfftfreq.py new file mode 100644 index 000000000..1d4819cee --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/rfftfreq.py @@ -0,0 +1,122 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import Float64Field, Int32Field +from ..core import TensorOrder +from ..datasource import arange +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorRFFTFreq(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.RFFTFREQ + + _n = Int32Field("n") + _d = Float64Field("d") + + def __init__(self, n=None, d=None, **kw): + super().__init__(_n=n, _d=d, **kw) + + @property + def n(self): + return self._n + + @property + def d(self): + return self._d + + def __call__(self, chunk_size=None): + shape = (self.n // 2 + 1,) + return self.new_tensor( + None, shape, raw_chunk_size=chunk_size, order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + t = arange( + tensor.shape[0], + dtype=op.dtype, + gpu=op.gpu, + chunk_size=tensor.extra_params.raw_chunk_size, + ) + t = t / (op.n * op.d) + t = yield from recursive_tile(t) + + new_op = op.copy() + return new_op.new_tensors( + None, + tensor.shape, + order=tensor.order, + chunks=t.chunks, + nsplits=t.nsplits, + **tensor.extra_params + ) + + +def rfftfreq(n, d=1.0, gpu=None, chunk_size=None): + """ + Return the Discrete Fourier Transform sample frequencies + (for usage with rfft, irfft). 
+ + The returned float tensor `f` contains the frequency bin centers in cycles + per unit of the sample spacing (with zero at the start). For instance, if + the sample spacing is in seconds, then the frequency unit is cycles/second. + + Given a window length `n` and a sample spacing `d`:: + + f = [0, 1, ..., n/2-1, n/2] / (d*n) if n is even + f = [0, 1, ..., (n-1)/2-1, (n-1)/2] / (d*n) if n is odd + + Unlike `fftfreq` (but like `scipy.fftpack.rfftfreq`) + the Nyquist frequency component is considered to be positive. + + Parameters + ---------- + n : int + Window length. + d : scalar, optional + Sample spacing (inverse of the sampling rate). Defaults to 1. + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + f : Tensor + Tensor of length ``n//2 + 1`` containing the sample frequencies. + + Examples + -------- + >>> import mars.tensor as mt + + >>> signal = mt.array([-2, 8, 6, 4, 1, 0, 3, 5, -3, 4], dtype=float) + >>> fourier = mt.fft.rfft(signal) + >>> n = signal.size + >>> sample_rate = 100 + >>> freq = mt.fft.fftfreq(n, d=1./sample_rate) + >>> freq.execute() + array([ 0., 10., 20., 30., 40., -50., -40., -30., -20., -10.]) + >>> freq = mt.fft.rfftfreq(n, d=1./sample_rate) + >>> freq.execute() + array([ 0., 10., 20., 30., 40., 50.]) + + """ + n, d = int(n), float(d) + op = TensorRFFTFreq(n=n, d=d, dtype=np.dtype(float), gpu=gpu) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/fft/rfftn.py b/python/xorbits/_mars/tensor/fft/rfftn.py new file mode 100644 index 000000000..bdfceeb1a --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/rfftn.py @@ -0,0 +1,115 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorRealFFTN, TensorRealFFTNMixin, validate_fftn + + +class TensorRFFTN(TensorRealFFTN, TensorRealFFTNMixin): + _op_type_ = OperandDef.RFFTN + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def rfftn(a, s=None, axes=None, norm=None): + """ + Compute the N-dimensional discrete Fourier Transform for real input. + + This function computes the N-dimensional discrete Fourier Transform over + any number of axes in an M-dimensional real tensor by means of the Fast + Fourier Transform (FFT). By default, all axes are transformed, with the + real transform performed over the last axis, while the remaining + transforms are complex. + + Parameters + ---------- + a : array_like + Input tensor, taken to be real. + s : sequence of ints, optional + Shape (length along each transformed axis) to use from the input. + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). 
+ The final element of `s` corresponds to `n` for ``rfft(x, n)``, while + for the remaining axes, it corresponds to `n` for ``fft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes : sequence of ints, optional + Axes over which to compute the FFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or by a combination of `s` and `a`, + as explained in the parameters section above. + The length of the last axis transformed will be ``s[-1]//2+1``, + while the remaining transformed axes will have lengths according to + `s`, or unchanged from the input. + + Raises + ------ + ValueError + If `s` and `axes` have different length. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + irfftn : The inverse of `rfftn`, i.e. the inverse of the n-dimensional FFT + of real input. + fft : The one-dimensional FFT, with definitions and conventions used. + rfft : The one-dimensional FFT of real input. + fftn : The n-dimensional FFT. + rfft2 : The two-dimensional FFT of real input. + + Notes + ----- + The transform for real input is performed over the last transformation + axis, as by `rfft`, then the transform over the remaining axes is + performed as by `fftn`. The order of the output is as for `rfft` for the + final transformation axis, and as for `fftn` for the remaining + transformation axes. + + See `fft` for details, definitions and conventions used. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.ones((2, 2, 2)) + >>> mt.fft.rfftn(a).execute() + array([[[ 8.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j]], + [[ 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j]]]) + + >>> mt.fft.rfftn(a, axes=(2, 0)).execute() + array([[[ 4.+0.j, 0.+0.j], + [ 4.+0.j, 0.+0.j]], + [[ 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j]]]) + + """ + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorRFFTN(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/tests/__init__.py b/python/xorbits/_mars/tensor/fft/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/fft/tests/test_fft.py b/python/xorbits/_mars/tensor/fft/tests/test_fft.py new file mode 100644 index 000000000..84e02f43f --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/tests/test_fft.py @@ -0,0 +1,186 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ....core import tile +from ...datasource import ones +from .. import ( + fft, + fft2, + fftfreq, + fftn, + fftshift, + hfft, + ifft, + ifft2, + ifftn, + ifftshift, + ihfft, + irfft, + irfft2, + irfftn, + rfft, + rfft2, + rfftfreq, + rfftn, +) + + +def test_standard_fft(): + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = fft(t) + assert t1.shape == (10, 20, 30) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = ifft(t) + assert t1.shape == (10, 20, 30) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = fft2(t, s=(23, 21)) + assert t1.shape == (10, 23, 21) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = ifft2(t, s=(11, 9), axes=(1, 2)) + assert t1.shape == (10, 11, 9) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = fftn(t, s=(11, 9), axes=(1, 2)) + assert t1.shape == (10, 11, 9) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = ifftn(t, s=(11, 9), axes=(1, 2)) + assert t1.shape == (10, 11, 9) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + +def test_real_fft(): + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = rfft(t) + assert t1.shape == np.fft.rfft(np.ones(t.shape)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = irfft(t) + assert t1.shape == np.fft.irfft(np.ones(t.shape)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = rfft2(t, s=(23, 21)) + assert t1.shape == np.fft.rfft2(np.ones(t.shape), s=(23, 21)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = irfft2(t, s=(11, 9), axes=(1, 2)) + assert t1.shape == np.fft.irfft2(np.ones(t.shape), s=(11, 9), axes=(1, 2)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = rfftn(t, s=(11, 30), axes=(1, 2)) + assert t1.shape == np.fft.rfftn(np.ones(t.shape), s=(11, 30), axes=(1, 2)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = irfftn(t, s=(11, 9), axes=(1, 2)) + assert t1.shape == np.fft.irfftn(np.ones(t.shape), s=(11, 9), axes=(1, 2)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + +def test_hermitian_fft(): + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = hfft(t) + assert t1.shape == np.fft.hfft(np.ones(t.shape)).shape + t1 = tile(t1) + 
assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = hfft(t, n=100) + assert t1.shape == np.fft.hfft(np.ones(t.shape), n=100).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = ihfft(t) + assert t1.shape == np.fft.ihfft(np.ones(t.shape)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = ihfft(t, n=100) + assert t1.shape == np.fft.ihfft(np.ones(t.shape), n=100).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t1 = ihfft(t, n=101) + assert t1.shape == np.fft.ihfft(np.ones(t.shape), n=101).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + +def test_fft_shift(): + freqs = fftfreq(9, d=1.0 / 9).reshape(3, 3) + t = ifftshift(fftshift(freqs)) + + assert t.dtype is not None + expect_dtype = np.fft.ifftshift( + np.fft.fftshift(np.fft.fftfreq(9, d=1.0 / 9).reshape(3, 3)) + ).dtype + assert t.dtype == expect_dtype + + +def test_fft_freq(): + t = fftfreq(10, 0.1, chunk_size=3) + + assert t.shape == np.fft.fftfreq(10, 0.1).shape + t = tile(t) + assert t.shape == tuple(sum(ns) for ns in t.nsplits) + + t = rfftfreq(10, 0.1, chunk_size=3) + + assert t.shape == np.fft.rfftfreq(10, 0.1).shape + t = tile(t) + assert t.shape == tuple(sum(ns) for ns in t.nsplits) diff --git a/python/xorbits/_mars/tensor/fft/tests/test_fft_execution.py b/python/xorbits/_mars/tensor/fft/tests/test_fft_execution.py new file mode 100644 index 000000000..9e520bc90 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/tests/test_fft_execution.py @@ -0,0 +1,541 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ....lib.mkl_interface import mkl_free_buffers +from ...datasource import tensor +from .. 
import ( + fft, + fft2, + fftfreq, + fftn, + fftshift, + hfft, + ifft, + ifft2, + ifftn, + ifftshift, + ihfft, + irfft, + irfft2, + irfftn, + rfft, + rfft2, + rfftfreq, + rfftn, +) + + +def test_fft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = fft(t) + res = r.execute().fetch() + expected = np.fft.fft(raw) + np.testing.assert_allclose(res, expected) + + r = fft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fft(t, n=11) + res = r.execute().fetch() + expected = np.fft.fft(raw, n=11) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 8)) + + r = fft(t) + res = r.execute().fetch() + expected = np.fft.fft(raw) + np.testing.assert_allclose(res, expected) + + r = fft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fft(t, n=11) + res = r.execute().fetch() + expected = np.fft.fft(raw, n=11) + np.testing.assert_allclose(res, expected) + + +def test_ifft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = ifft(t) + res = r.execute().fetch() + expected = np.fft.ifft(raw) + np.testing.assert_allclose(res, expected) + + r = ifft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifft(t, n=11) + res = r.execute().fetch() + expected = np.fft.ifft(raw, n=11) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 10)) + + r = ifft(t) + res = r.execute().fetch() + expected = np.fft.ifft(raw) + np.testing.assert_allclose(res, expected) + + r = ifft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifft(t, n=11) + res = r.execute().fetch() + expected = np.fft.ifft(raw, n=11) + np.testing.assert_allclose(res, expected) + + +def test_fft2_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 20, 30)) + + r = fft2(t) + res = r.execute().fetch() + expected = np.fft.fft2(raw) + np.testing.assert_allclose(res, expected) + + r = fft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.fft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = fft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.fft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 10, 12)) + + r = fft2(t) + res = r.execute().fetch() + expected = np.fft.fft2(raw) + np.testing.assert_allclose(res, expected) + + r = fft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.fft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = fft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.fft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + +def test_ifft2_execution(setup): + raw = np.random.rand(10, 20, 30) + t = 
tensor(raw, chunk_size=(8, 20, 30)) + + r = ifft2(t) + res = r.execute().fetch() + expected = np.fft.ifft2(raw) + np.testing.assert_allclose(res, expected) + + r = ifft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.ifft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = ifft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.ifft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 6, 10)) + + r = ifft2(t) + res = r.execute().fetch() + expected = np.fft.ifft2(raw) + np.testing.assert_allclose(res, expected) + + r = ifft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.ifft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = ifft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.ifft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + +def test_fftn_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(10, 20, 30)) + + r = fftn(t) + res = r.execute().fetch() + expected = np.fft.fftn(raw) + np.testing.assert_allclose(res, expected) + + r = fftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fftn(t, s=(11, 12, 5)) + res = r.execute().fetch() + expected = np.fft.fftn(raw, s=(11, 12, 5)) + np.testing.assert_allclose(res, expected) + + r = fftn(t, s=(11, 12, 5), axes=(-1, -2, -3)) + res = r.execute().fetch() + expected = np.fft.fftn(raw, s=(11, 12, 5), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(6, 6, 8)) + + r = fftn(t) + res = r.execute().fetch() + expected = np.fft.fftn(raw) + np.testing.assert_allclose(res, expected) + + r = fftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fftn(t, s=(11, 12, 5)) + res = r.execute().fetch() + expected = np.fft.fftn(raw, s=(11, 12, 5)) + np.testing.assert_allclose(res, expected) + + r = fftn(t, s=(11, 12, 5), axes=(-1, -2, -3)) + res = r.execute().fetch() + expected = np.fft.fftn(raw, s=(11, 12, 5), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + +def test_ifftn_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(10, 20, 30)) + + r = ifftn(t) + res = r.execute().fetch() + expected = np.fft.ifftn(raw) + np.testing.assert_allclose(res, expected) + + r = ifftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifftn(t, s=(11, 12, 5)) + res = r.execute().fetch() + expected = np.fft.ifftn(raw, s=(11, 12, 5)) + np.testing.assert_allclose(res, expected) + + r = ifftn(t, s=(11, 12, 5), axes=(-1, -2, -3)) + res = r.execute().fetch() + expected = np.fft.ifftn(raw, s=(11, 12, 5), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(6, 8, 14)) + + r = ifftn(t) + res = r.execute().fetch() + expected = np.fft.ifftn(raw) + 
np.testing.assert_allclose(res, expected) + + r = ifftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifftn(t, s=(11, 12, 5)) + res = r.execute().fetch() + expected = np.fft.ifftn(raw, s=(11, 12, 5)) + np.testing.assert_allclose(res, expected) + + r = ifftn(t, s=(11, 12, 5), axes=(-1, -2, -3)) + res = r.execute().fetch() + expected = np.fft.ifftn(raw, s=(11, 12, 5), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + +def test_rfft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = rfft(t) + res = r.execute().fetch() + expected = np.fft.rfft(raw) + np.testing.assert_allclose(res, expected) + + r = rfft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.rfft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = rfft(t, n=11) + res = r.execute().fetch() + expected = np.fft.rfft(raw, n=11) + np.testing.assert_allclose(res, expected) + + +def test_irfft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = irfft(t) + res = r.execute().fetch() + expected = np.fft.irfft(raw) + np.testing.assert_allclose(res, expected) + + r = irfft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.irfft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = irfft(t, n=11) + res = r.execute().fetch() + expected = np.fft.irfft(raw, n=11) + np.testing.assert_allclose(res, expected) + + +def test_rfft2_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 20, 30)) + + r = rfft2(t) + res = r.execute().fetch() + expected = np.fft.rfft2(raw) + np.testing.assert_allclose(res, expected) + + r = rfft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.rfft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = rfft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.rfft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = rfft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.rfft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + +def test_irfft2_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 20, 30)) + + r = irfft2(t) + res = r.execute().fetch() + expected = np.fft.irfft2(raw) + np.testing.assert_allclose(res, expected) + + r = irfft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.irfft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = irfft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.irfft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = irfft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.irfft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + +def test_rfftn_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(10, 20, 30)) + + r = rfftn(t) + res = r.execute().fetch() + expected = np.fft.rfftn(raw) + np.testing.assert_allclose(res, expected) + + r = rfftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.rfftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = rfftn(t, s=(11, 12, 5)) + res = r.execute().fetch() + expected = np.fft.rfftn(raw, s=(11, 12, 5)) + np.testing.assert_allclose(res, expected) + + r = rfftn(t, s=(11, 12, 11), axes=(-1, -2, -3)) + res = 
r.execute().fetch() + expected = np.fft.rfftn(raw, s=(11, 12, 11), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + +def test_irfftn_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(10, 20, 30)) + + r = irfftn(t) + res = r.execute().fetch() + expected = np.fft.irfftn(raw) + np.testing.assert_allclose(res, expected) + + r = irfftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.irfftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = irfftn(t, s=(11, 21, 5)) + res = r.execute().fetch() + expected = np.fft.irfftn(raw, s=(11, 21, 5)) + np.testing.assert_allclose(res, expected) + + # a bug in mkl version will cause the section below to fail + if mkl_free_buffers is None: + r = irfftn(t, s=(11, 21, 30), axes=(-1, -2, -3)) + res = r.execute().fetch() + expected = np.fft.irfftn(raw, s=(11, 21, 30), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + +def test_hfft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = hfft(t) + res = r.execute().fetch() + expected = np.fft.hfft(raw) + np.testing.assert_allclose(res, expected) + + r = hfft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.hfft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = hfft(t, n=11) + res = r.execute().fetch() + expected = np.fft.hfft(raw, n=11) + np.testing.assert_allclose(res, expected) + + +def test_ihfft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = ihfft(t) + res = r.execute().fetch() + expected = np.fft.ihfft(raw) + np.testing.assert_allclose(res, expected) + + r = ihfft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ihfft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ihfft(t, n=11) + res = r.execute().fetch() + expected = np.fft.ihfft(raw, n=11) + np.testing.assert_allclose(res, expected) + + r = ihfft(t, n=12) + res = r.execute().fetch() + expected = np.fft.ihfft(raw, n=12) + np.testing.assert_allclose(res, expected) + + +def test_fft_freq_execution(setup): + t = fftfreq(10, 0.1, chunk_size=6) + + res = t.execute().fetch() + np.testing.assert_allclose(res, np.fft.fftfreq(10, 0.1)) + + t = fftfreq(11, 0.01, chunk_size=6) + + res = t.execute().fetch() + np.testing.assert_allclose(res, np.fft.fftfreq(11, 0.01)) + + +def test_rfft_freq_execution(setup): + t = rfftfreq(20, 0.1, chunk_size=6) + + res = t.execute().fetch() + np.testing.assert_allclose(res, np.fft.rfftfreq(20, 0.1)) + + t = rfftfreq(21, 0.01, chunk_size=6) + + res = t.execute().fetch() + np.testing.assert_allclose(res, np.fft.rfftfreq(21, 0.01)) + + +def test_fft_shift_execution(setup): + t = fftfreq(10, 0.1, chunk_size=6) + r = fftshift(t) + + res = r.execute().fetch() + np.testing.assert_allclose(res, np.fft.fftshift(np.fft.fftfreq(10, 0.1))) + + freqs = fftfreq(9, d=1.0 / 9, chunk_size=4).reshape(3, 3) + r = fftshift(freqs, axes=(1,)) + + res = r.execute().fetch() + expected = np.fft.fftshift(np.fft.fftfreq(9, d=1.0 / 9).reshape(3, 3), axes=(1,)) + np.testing.assert_allclose(res, expected) + + +def test_ifft_shift_execution(setup): + t = fftfreq(9, d=1.0 / 9, chunk_size=4).reshape(3, 3) + r = ifftshift(t) + + res = r.execute().fetch() + expected = np.fft.ifftshift(np.fft.fftfreq(9, d=1.0 / 9).reshape(3, 3)) + np.testing.assert_allclose(res, expected) diff --git a/python/xorbits/_mars/tensor/fuse/__init__.py b/python/xorbits/_mars/tensor/fuse/__init__.py new file mode 
100644 index 000000000..1b08a8e25 --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import TensorFuseChunk +from .cupy import TensorCpFuseChunk +from .numexpr import TensorNeFuseChunk diff --git a/python/xorbits/_mars/tensor/fuse/core.py b/python/xorbits/_mars/tensor/fuse/core.py new file mode 100644 index 000000000..8cda209c4 --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/core.py @@ -0,0 +1,25 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core.operand import FuseChunkMixin +from ..operands import TensorFuse, TensorOperandMixin + + +class TensorFuseChunkMixin(FuseChunkMixin, TensorOperandMixin): + __slots__ = () + + +class TensorFuseChunk(TensorFuse, TensorFuseChunkMixin): + def __init__(self, dtype=None, **kw): + super().__init__(dtype=dtype, **kw) diff --git a/python/xorbits/_mars/tensor/fuse/cupy.py b/python/xorbits/_mars/tensor/fuse/cupy.py new file mode 100644 index 000000000..f95db4a3b --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/cupy.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from string import ascii_letters + +from ...utils import tokenize +from .. 
import arithmetic +from ..operands import TensorFuse +from .core import TensorFuseChunkMixin + + +class TensorCpFuseChunk(TensorFuse, TensorFuseChunkMixin): + # use for cupy-fused operand + _op_type_ = None # no opcode, cannot be serialized + + @classmethod + def execute(cls, ctx, op): + import cupy as cp + + chunk = op.outputs[0] + func = cp.ElementwiseKernel(*_evaluate(chunk)) + ctx[chunk.key] = func(*[ctx[i.key] for i in op.inputs]) + + +# execution part +CP_BINOP_TO_STRING = { + arithmetic.TensorSubtract: "-", + arithmetic.TensorMultiply: "*", + arithmetic.TensorTrueDiv: "/", +} + +CP_UNARYOP_TO_STRING = { + arithmetic.TensorSqrt: "sqrt", +} + + +def _evaluate(chunk): + letters = iter(letter for letter in ascii_letters if letter not in "ni") + + input_types = [i.dtype.name for i in chunk.op.inputs] + input_names = {i: next(letters) for i in chunk.op.inputs} + input_arguments = ", ".join( + [f"{tp} {input_names[i]}" for i, tp in zip(chunk.op.inputs, input_types)] + ) + output_type = chunk.op.dtype.name + output_name = next(letters) + output_argument = f"{output_type} {output_name}" + body = dict(input_names) + + for node in chunk.composed: + op_cls = type(node.op) + if op_cls in CP_BINOP_TO_STRING: + input_bodies = [body.get(i, repr(i)) for i in (node.op.lhs, node.op.rhs)] + body[node] = f" {CP_BINOP_TO_STRING[op_cls]} ".join(input_bodies) + elif op_cls in CP_UNARYOP_TO_STRING: + input_data = body[node.op.inputs[0]] + body[node] = f"{CP_UNARYOP_TO_STRING[op_cls]}({input_data})" + else: + raise NotImplementedError + + body = f"{output_name} = {body[chunk.composed[-1]]}" + key = tokenize(input_arguments, output_argument, body) + return ( + input_arguments, + output_argument, + body, + f"{type(chunk.op).__name__.lower()}_{key}", + ) diff --git a/python/xorbits/_mars/tensor/fuse/numexpr.py b/python/xorbits/_mars/tensor/fuse/numexpr.py new file mode 100644 index 000000000..6546c1dcd --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/numexpr.py @@ -0,0 +1,198 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from itertools import count + +try: + import numexpr as ne + + NUMEXPR_INSTALLED = True +except ImportError: + ne = None + NUMEXPR_INSTALLED = False +import numpy as np + +from .. import arithmetic, reduction +from ..array_utils import as_same_device +from ..operands import TensorFuse +from .core import TensorFuseChunkMixin + + +class TensorNeFuseChunk(TensorFuse, TensorFuseChunkMixin): + _op_type_ = None # no opcode, cannot be serialized + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + inputs = as_same_device([ctx[c.key] for c in op.inputs], device=op.device) + counter = count() + # Unified the var names to V_0, V_1, ... for better cache hit. 
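+ # For example, a fused chunk that evaluates ``(a + b) * a`` over two input
+ # chunks is rendered as ``(V_0 + V_1) * V_0`` regardless of the actual chunk
+ # keys (placeholders are assigned in input order), so structurally identical
+ # expressions reuse numexpr's compiled-expression cache across executions.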
+ key_to_var = defaultdict(lambda: f"V_{counter.__next__()}") + local_dict = {key_to_var[c.key]: i for c, i in zip(op.inputs, inputs)} + expr = _evaluate(chunk).format_map(key_to_var) + # The numexpr.evaluate is thread safe: https://github.com/pydata/numexpr/pull/200 + try: + res = ne.evaluate(expr, local_dict=local_dict, global_dict={}) + except Exception as e: + raise RuntimeError( + f"Failed to evaluate numexpr {repr(expr)} on local dict {local_dict}." + ) from e + res = _maybe_keepdims(chunk, res) + if chunk.ndim == 0 and res.ndim == 1 and res.size == 0: + res = res.dtype.type(0) + ctx[chunk.key] = res + + +# execution part +NE_UNARYOP_TO_STRING = { + arithmetic.TensorNegative: "-", + arithmetic.TensorAbs: "abs", + arithmetic.TensorConj: "conj", + arithmetic.TensorExp: "exp", + arithmetic.TensorLog: "log", + arithmetic.TensorLog10: "log10", + arithmetic.TensorExpm1: "expm1", + arithmetic.TensorLog1p: "log1p", + arithmetic.TensorSqrt: "sqrt", + arithmetic.TensorSin: "sin", + arithmetic.TensorCos: "cos", + arithmetic.TensorTan: "tan", + arithmetic.TensorArcsin: "arcsin", + arithmetic.TensorArccos: "arccos", + arithmetic.TensorArctan: "arctan", + arithmetic.TensorSinh: "sinh", + arithmetic.TensorCosh: "cosh", + arithmetic.TensorTanh: "tanh", + arithmetic.TensorArcsinh: "arcsinh", + arithmetic.TensorArccosh: "arccosh", + arithmetic.TensorArctanh: "arctanh", + arithmetic.TensorFloor: "floor", + arithmetic.TensorCeil: "ceil", + arithmetic.TensorNot: "~", +} + + +NE_BINOP_TO_STRING = { + arithmetic.TensorAdd: "+", + arithmetic.TensorSubtract: "-", + arithmetic.TensorMultiply: "*", + arithmetic.TensorDivide: "/", + arithmetic.TensorMod: "%", + arithmetic.TensorPower: "**", + arithmetic.TensorLshift: "<<", + arithmetic.TensorRshift: ">>", + arithmetic.TensorEqual: "==", + arithmetic.TensorNotEqual: "!=", + arithmetic.TensorLessThan: "<", + arithmetic.TensorLessEqual: "<=", + arithmetic.TensorGreaterThan: ">", + arithmetic.TensorGreaterEqual: ">=", + arithmetic.TensorAnd: "and", + arithmetic.TensorOr: "or", +} + +NE_TREE_OP_TO_STRING = { + arithmetic.TensorTreeAdd: "+", + arithmetic.TensorTreeMultiply: "*", +} + +NE_REDUCTION_TO_STRING = { + reduction.TensorSum: "sum", + reduction.TensorProd: "prod", + reduction.TensorMax: "max", + reduction.TensorMin: "min", +} + + +class _Default(dict): + def __missing__(self, key): + return f"{{{key}}}" + + +def _handle_unary(chunk): + if len(chunk.inputs) != 1: + raise ValueError("unary operand inputs should be 1") + data = chunk.inputs[0] + unary_op = NE_UNARYOP_TO_STRING[type(chunk.op)] + return f"{unary_op}({{{data.key}}})" + + +def _decompose(chunk): + expr = f"{{{chunk.key}}}" + for node in reversed(chunk.composed): + _expr = _evaluate(node) + expr = expr.format_map(_Default([(node.key, f"({_expr})")])) + return expr + + +def _handle_bin(chunk): + op = chunk.op + lhs = str(op.lhs) if np.isscalar(op.lhs) else f"{{{op.lhs.key}}}" + rhs = str(op.rhs) if np.isscalar(op.rhs) else f"{{{op.rhs.key}}}" + reverse = getattr(op, "reverse", False) + op = NE_BINOP_TO_STRING[type(op)] + if reverse: + exprs = [rhs, lhs] + else: + exprs = [lhs, rhs] + return op.join(exprs) + + +def _handle_tree(chunk): + op = NE_TREE_OP_TO_STRING[type(chunk.op)] + return op.join(f"{{{c.key}}}" for c in chunk.inputs) + + +def _wrap_bool(data): + if data.dtype == np.bool_: + return f"where({{{data.key}}}, 1, 0)" + + return f"{{{data.key}}}" + + +def _handle_reduction(chunk): + ax = chunk.op.axis + data = chunk.inputs[0] + op_str = NE_REDUCTION_TO_STRING[type(chunk.op)] + # TODO(hks): 
delete it if numexpr.sum fixed + if len(ax) == data.ndim: + return f"{op_str}({_wrap_bool(data)})" + elif len(ax) == 1: + return f"{op_str}({_wrap_bool(data)},axis={ax[0]})" + else: + raise ValueError("numexpr cannot encode axis") + + +def _evaluate(chunk): + op_type = type(chunk.op) + if op_type in NE_UNARYOP_TO_STRING: + return _handle_unary(chunk) + elif op_type in NE_BINOP_TO_STRING: + return _handle_bin(chunk) + elif op_type in NE_TREE_OP_TO_STRING: + return _handle_tree(chunk) + elif op_type in NE_REDUCTION_TO_STRING: + return _handle_reduction(chunk) + elif op_type is TensorNeFuseChunk: + return _decompose(chunk) + else: + raise TypeError(f"unsupported operator in numexpr: {op_type.__name__}") + + +def _maybe_keepdims(chunk, res): + out_chunk = chunk.composed[-1] if type(chunk.op) == TensorNeFuseChunk else chunk + if type(out_chunk.op) in NE_REDUCTION_TO_STRING and out_chunk.op.keepdims: + res = np.reshape(res, out_chunk.shape) + return res diff --git a/python/xorbits/_mars/tensor/fuse/tests/__init__.py b/python/xorbits/_mars/tensor/fuse/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/fuse/tests/test_numexpr_execution.py b/python/xorbits/_mars/tensor/fuse/tests/test_numexpr_execution.py new file mode 100644 index 000000000..1d11bacc8 --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/tests/test_numexpr_execution.py @@ -0,0 +1,206 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
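+
+# Tests for the numexpr-fused execution path: results of fused tensor
+# expressions are validated against plain NumPy and against runs of the same
+# expression with ``fuse_enabled=False``, which bypasses fusion.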
+ +import numpy as np + +from ....utils import ignore_warning +from ...arithmetic import abs as mt_abs +from ...datasource import arange, tensor +from ...reduction import sum as mt_sum + + +def test_base_execution(setup): + rs = np.random.RandomState(0) + raw1 = rs.randint(10, size=(10, 10, 10)) + raw2 = rs.randint(10, size=(10, 10, 10)) + arr1 = tensor(raw1, chunk_size=5) + arr2 = tensor(raw2, chunk_size=5) + + arr3 = arr1 + arr2 + 10 + arr4 = 10 + arr1 + arr2 + res3 = arr3.execute().fetch() + res3_cmp = arr4.execute().fetch() + np.testing.assert_array_equal(res3, res3_cmp) + + a = arange(10) + b = arange(10) * 0.1 + raw_a = np.arange(10) + raw_b = np.arange(10) * 0.1 + c = a * b - 4.1 * a > 2.5 * b + res4_cmp = raw_a * raw_b - 4.1 * raw_a > 2.5 * raw_b + res4 = c.execute().fetch() + np.testing.assert_array_equal(res4, res4_cmp) + + c = mt_sum(1) * (-1) + r = c.execute().fetch() + assert r == -1 + + c = -mt_abs(mt_sum(mt_abs(-1))) + r = c.execute().fetch() + assert r == -1 + + +def _gen_pairs(seq): + test_seq = np.random.RandomState(0).permutation(seq) + for i in range(0, len(seq), 2): + j = (i + 1) % len(seq) + yield test_seq[i], test_seq[j] + + +@ignore_warning +def test_unary_execution(setup): + from ...arithmetic import UNARY_UFUNC, arccosh, conj, invert, logical_not, sin + + _sp_unary_ufunc = {arccosh, invert, conj, logical_not} + _new_unary_ufunc = list(UNARY_UFUNC - _sp_unary_ufunc)[:3] + + def _normalize_by_sin(func1, func2, arr): + return func1(abs(sin((func2(arr))))) + + tested = set() + rs = np.random.RandomState(0) + for func1, func2 in _gen_pairs(_new_unary_ufunc): + raw = rs.random((8, 8, 8)) + arr1 = tensor(raw, chunk_size=4) + + arr2 = _normalize_by_sin(func1, func2, arr1) + res = arr2.execute() + res_cmp = arr2.execute(fuse_enabled=False) + np.testing.assert_allclose(res[0], res_cmp[0]) + tested.update([func1, func2]) + # make sure all functions tested + assert tested == set(_new_unary_ufunc) + + raw = rs.randint(100, size=(8, 8, 8)) + arr1 = tensor(raw, chunk_size=4) + arr2 = arccosh(1 + abs(invert(arr1))) + res = arr2.execute(fuse_enabled=False).fetch() + res_cmp = arccosh(1 + abs(~raw)) + np.testing.assert_array_almost_equal(res[0], res_cmp[0]) + + +@ignore_warning +def test_bin_execution(setup): + from ...arithmetic import ( + BIN_UFUNC, + bitand, + bitor, + bitxor, + fmod, + ldexp, + logical_and, + logical_or, + lshift, + mod, + rshift, + ) + + _sp_bin_ufunc = [ + mod, + fmod, + bitand, + bitor, + bitxor, + lshift, + rshift, + logical_and, + logical_or, + ] + _new_bin_ufunc = list(BIN_UFUNC - set(_sp_bin_ufunc) - {ldexp}) + + tested = set() + rs = np.random.RandomState(0) + for func1, func2 in _gen_pairs(_new_bin_ufunc): + raw = rs.random((9, 9, 9)) + arr1 = tensor(raw, chunk_size=5) + + arr2 = func1(1, func2(2, arr1)) + res = arr2.execute().fetch() + res_cmp = arr2.execute(fuse_enabled=False).fetch() + np.testing.assert_array_almost_equal(res, res_cmp) + tested.update([func1, func2]) + # make sure all functions tested + assert tested == set(_new_bin_ufunc) + + tested = set() + for func1, func2 in _gen_pairs(_sp_bin_ufunc): + raw = rs.randint(1, 100, size=(10, 10, 10)) + arr1 = tensor(raw, chunk_size=6) + + arr2 = func1(10, func2(arr1, 5)) + res = arr2.execute().fetch() + res_cmp = arr2.execute(fuse_enabled=False).fetch() + np.testing.assert_array_almost_equal(res, res_cmp) + tested.update([func1, func2]) + # make sure all functions tested + assert tested == set(_sp_bin_ufunc) + + +def test_reduction_execution(setup): + rs = np.random.RandomState(0) + raw1 = 
rs.randint(5, size=(8, 8, 8)) + raw2 = rs.randint(5, size=(8, 8, 8)) + arr1 = tensor(raw1, chunk_size=4) + arr2 = tensor(raw2, chunk_size=4) + + res1 = (arr1 + 1).sum(keepdims=True).execute().fetch() + res2 = (arr1 + 1).prod(keepdims=True).execute().fetch() + np.testing.assert_array_equal((raw1 + 1).sum(keepdims=True), res1) + np.testing.assert_array_equal((raw1 + 1).prod(keepdims=True), res2) + + res1 = (arr1 + 1).sum(axis=1).execute().fetch() + res2 = (arr1 + 1).prod(axis=1).execute().fetch() + res3 = (arr1 + 1).max(axis=1).execute().fetch() + res4 = (arr1 + 1).min(axis=1).execute().fetch() + np.testing.assert_array_equal((raw1 + 1).sum(axis=1), res1) + np.testing.assert_array_equal((raw1 + 1).prod(axis=1), res2) + np.testing.assert_array_equal((raw1 + 1).max(axis=1), res3) + np.testing.assert_array_equal((raw1 + 1).min(axis=1), res4) + + raw3 = raw2 - raw1 + 10 + arr3 = -arr1 + arr2 + 10 + + res1 = arr3.sum(axis=(0, 1)).execute().fetch() + res2 = arr3.prod(axis=(0, 1)).execute().fetch() + res3 = arr3.max(axis=(0, 1)).execute().fetch() + res4 = arr3.min(axis=(0, 1)).execute().fetch() + np.testing.assert_array_equal(raw3.sum(axis=(0, 1)), res1) + np.testing.assert_array_equal(raw3.prod(axis=(0, 1)), res2) + np.testing.assert_array_equal(raw3.max(axis=(0, 1)), res3) + np.testing.assert_array_equal(raw3.min(axis=(0, 1)), res4) + + +def test_bool_reduction_execution(setup): + rs = np.random.RandomState(0) + raw = rs.randint(5, size=(8, 8, 8)) + arr = tensor(raw, chunk_size=4) + + res = (arr > 3).sum(axis=1).execute().fetch() + np.testing.assert_array_equal(res, (raw > 3).sum(axis=1)) + + res = (arr > 3).sum().execute().fetch() + np.testing.assert_array_equal(res, (raw > 3).sum()) + + +def test_order_execution(setup): + rs = np.random.RandomState(0) + raw = np.asfortranarray(rs.rand(4, 5, 6)) + arr = tensor(raw, chunk_size=3) + + res = (arr * 3 + 1).execute().fetch() + expected = raw * 3 + 1 + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] diff --git a/python/xorbits/_mars/tensor/images/__init__.py b/python/xorbits/_mars/tensor/images/__init__.py new file mode 100644 index 000000000..79fb234f5 --- /dev/null +++ b/python/xorbits/_mars/tensor/images/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .imread import imread diff --git a/python/xorbits/_mars/tensor/images/imread.py b/python/xorbits/_mars/tensor/images/imread.py new file mode 100644 index 000000000..41ed282f0 --- /dev/null +++ b/python/xorbits/_mars/tensor/images/imread.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...lib.filesystem import file_size, glob, open_file +from ...serialization.serializables import AnyField +from ...utils import ModulePlaceholder, ceildiv +from ..operands import TensorOperand, TensorOperandMixin + +try: + from PIL import Image +except ImportError: + Image = ModulePlaceholder("PIL") + + +def _read_image(fpath): + return np.asarray(Image.open(fpath)) + + +class TensorImread(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.IMREAD + + _filepath = AnyField("filepath") + + def __init__(self, filepath=None, **kwargs): + super().__init__(_filepath=filepath, **kwargs) + + @property + def filepath(self): + return self._filepath + + @classmethod + def tile(cls, op): + out_shape = op.outputs[0].shape + paths = ( + op.filepath if isinstance(op.filepath, (tuple, list)) else glob(op.filepath) + ) + chunk_size = op.outputs[0].extra_params.raw_chunk_size + n_chunks = ceildiv(len(paths), chunk_size) + if len(paths) > 1: + chunks = [] + splits = [] + for i in range(n_chunks): + chunk_op = op.copy().reset_key() + chunk_op._filepath = paths[i * chunk_size : (i + 1) * chunk_size] + file_nums = len(chunk_op._filepath) + shape = (file_nums,) + out_shape[1:] + chunk = chunk_op.new_chunk( + None, shape=shape, index=(i,) + (0,) * (len(out_shape) - 1) + ) + chunks.append(chunk) + splits.append(file_nums) + nsplits = (tuple(splits),) + tuple((s,) for s in out_shape[1:]) + else: + chunk_op = op.copy().reset_key() + chunks = [ + chunk_op.new_chunk(None, shape=out_shape, index=(0,) * len(out_shape)) + ] + nsplits = tuple((s,) for s in out_shape) + new_op = op.copy() + return new_op.new_tensors(None, shape=out_shape, chunks=chunks, nsplits=nsplits) + + @classmethod + def execute(cls, ctx, op): + if isinstance(op.filepath, list): + arrays = np.empty(op.outputs[0].shape) + for i, path in enumerate(op.filepath): + with open_file(path, "rb") as f: + arrays[i] = _read_image(f) + ctx[op.outputs[0].key] = np.array(arrays) + else: + with open_file(op.filepath, "rb") as f: + ctx[op.outputs[0].key] = np.array(_read_image(f)) + + def __call__(self, shape, chunk_size): + return self.new_tensor(None, shape, raw_chunk_size=chunk_size) + + +def imread(path, chunk_size=None): + paths = path if isinstance(path, (tuple, list)) else glob(path) + with open_file(paths[0], "rb") as f: + sample_data = _read_image(f) + img_shape = sample_data.shape + img_size = file_size(paths[0]) + if len(paths) > 1: + shape = (len(paths),) + img_shape + else: + shape = img_shape + if chunk_size is None: + chunk_size = int(options.chunk_store_limit / img_size) + op = TensorImread(filepath=path, chunk_size=chunk_size, dtype=sample_data.dtype) + return op(shape=shape, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/images/tests/__init__.py b/python/xorbits/_mars/tensor/images/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/images/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/images/tests/test_images.py b/python/xorbits/_mars/tensor/images/tests/test_images.py new file mode 100644 index 000000000..97994c069 --- /dev/null +++ b/python/xorbits/_mars/tensor/images/tests/test_images.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pytest + +try: + from PIL import Image +except ImportError: + Image = None + +from ....core import tile +from ...images import imread + + +@pytest.mark.skipif(not Image, reason="Pillow not installed") +def test_imread(): + with tempfile.TemporaryDirectory() as tempdir: + raws = [] + for i in range(10): + array = np.random.randint(0, 256, 2500 * 3, dtype=np.uint8).reshape( + (50, 50, 3) + ) + raws.append(array) + im = Image.fromarray(array) + im.save(os.path.join(tempdir, f"random_{i}.png")) + + t = imread(os.path.join(tempdir, "random_0.png")) + assert t.shape == (50, 50, 3) + assert t.dtype == np.dtype("uint8") + + tiled = tile(t) + assert len(tiled.chunks) == 1 + assert tiled.chunks[0].shape == (50, 50, 3) + assert tiled.chunks[0].dtype == np.dtype("uint8") + + t = imread(os.path.join(tempdir, "random_*.png"), chunk_size=3) + assert t.shape == (10, 50, 50, 3) + + tiled = tile(t) + assert len(tiled.chunks) == 4 + assert tiled.nsplits == ((3, 3, 3, 1), (50,), (50,), (3,)) + assert tiled.chunks[0].dtype == np.dtype("uint8") + assert tiled.chunks[0].index == (0, 0, 0, 0) + assert tiled.chunks[0].shape == (3, 50, 50, 3) + assert tiled.chunks[1].index == (1, 0, 0, 0) + assert tiled.chunks[1].shape == (3, 50, 50, 3) + assert tiled.chunks[2].index == (2, 0, 0, 0) + assert tiled.chunks[2].shape == (3, 50, 50, 3) + assert tiled.chunks[3].index == (3, 0, 0, 0) + assert tiled.chunks[3].shape == (1, 50, 50, 3) diff --git a/python/xorbits/_mars/tensor/images/tests/test_images_execution.py b/python/xorbits/_mars/tensor/images/tests/test_images_execution.py new file mode 100644 index 000000000..35b4816ba --- /dev/null +++ b/python/xorbits/_mars/tensor/images/tests/test_images_execution.py @@ -0,0 +1,53 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pytest + +try: + from PIL import Image +except ImportError: + Image = None + +from ...images import imread + + +@pytest.mark.skipif(not Image, reason="Pillow not installed") +def test_imread_execution(setup): + with tempfile.TemporaryDirectory() as tempdir: + raws = [] + for i in range(10): + array = np.random.randint(0, 256, 2500, dtype=np.uint8).reshape((50, 50)) + raws.append(array) + im = Image.fromarray(array) + im.save(os.path.join(tempdir, f"random_{i}.png")) + # Single image + t = imread(os.path.join(tempdir, "random_0.png")) + res = t.execute().fetch() + np.testing.assert_array_equal(res, raws[0]) + + t2 = imread(os.path.join(tempdir, "random_*.png")) + res = t2.execute().fetch() + np.testing.assert_array_equal(np.sort(res, axis=0), np.sort(raws, axis=0)) + + t3 = imread(os.path.join(tempdir, "random_*.png"), chunk_size=4) + res = t3.execute().fetch() + np.testing.assert_array_equal(np.sort(res, axis=0), np.sort(raws, axis=0)) + + t4 = imread(os.path.join(tempdir, "random_*.png"), chunk_size=4) + res = t4.execute().fetch() + np.testing.assert_array_equal(np.sort(res, axis=0), np.sort(raws, axis=0)) diff --git a/python/xorbits/_mars/tensor/indexing/__init__.py b/python/xorbits/_mars/tensor/indexing/__init__.py new file mode 100644 index 000000000..d1102bac3 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/__init__.py @@ -0,0 +1,47 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
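+
+# Besides re-exporting the indexing operands, the ``_install`` hook below
+# attaches ``__getitem__``/``__setitem__`` and the ``take``, ``compress``,
+# ``choose`` and ``nonzero`` methods to ``Tensor`` at import time, enabling
+# NumPy-style indexing syntax on tensors.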
+ +from .choose import TensorChoose, choose +from .compress import compress +from .extract import extract +from .fill_diagonal import TensorFillDiagonal, fill_diagonal +from .flatnonzero import flatnonzero +from .getitem import FancyIndexingConcat, FancyIndexingDistribute, TensorIndex +from .nonzero import TensorNonzero, nonzero +from .setitem import TensorIndexSetValue +from .slice import TensorSlice +from .take import take +from .unravel_index import TensorUnravelIndex, unravel_index + + +def _install(): + from ..core import Tensor, TensorData + from .getitem import _getitem + from .setitem import _setitem + + setattr(Tensor, "__getitem__", _getitem) + setattr(TensorData, "__getitem__", _getitem) + setattr(Tensor, "__setitem__", _setitem) + setattr(Tensor, "take", take) + setattr( + Tensor, + "compress", + lambda a, condition, axis=None: compress(condition, a, axis=axis), + ) + setattr(Tensor, "choose", choose) + setattr(Tensor, "nonzero", nonzero) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/indexing/choose.py b/python/xorbits/_mars/tensor/indexing/choose.py new file mode 100644 index 000000000..5209bb95c --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/choose.py @@ -0,0 +1,226 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, KeyField, ListField, StringField +from ..array_utils import as_same_device, device +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import broadcast_shape, check_out_param + + +class TensorChoose(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.CHOOSE + + _a = KeyField("a") + _choices = ListField("choices", FieldTypes.key) + _mode = StringField("mode") + + def __init__(self, mode=None, **kw): + super().__init__(_mode=mode, **kw) + + def __setattr__(self, key, value): + if key == "_mode" and value not in ("raise", "wrap", "clip"): + raise ValueError(f"mode should be raise, wrap or clip, not {value}") + + super().__setattr__(key, value) + + @property + def a(self): + return self._a + + @property + def choices(self): + return self._choices + + @property + def mode(self): + return self._mode + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a = self._inputs[0] + self._choices = self._inputs[1:] + + def __call__(self, a, choices, out=None): + if out is not None and not isinstance(out, Tensor): + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + + inputs = [a] + choices + shape = broadcast_shape(a.shape, *[c.shape for c in choices]) + order = TensorOrder.C_ORDER if out is None else out.order + t = self.new_tensor(inputs, shape, order=order) + + if out is None: + return t + + check_out_param(out, t, "unsafe") + out_shape, out_dtype = out.shape, out.dtype + # if `out` is specified, use out's dtype and shape + if out_shape != t.shape: + raise ValueError(f"output shape should be {t.shape}, got {out_shape}") + setattr(self, "dtype", out_dtype) + out.data = t.data + return out + + @classmethod + def tile(cls, op): + from ..arithmetic.core import TensorElementWise + + return (yield from TensorElementWise.tile(op)) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + a, choices = inputs[0], inputs[1:] + + out = op.outputs[0] + with device(device_id): + ctx[out.key] = xp.choose(a, choices, mode=op.mode).astype( + op.dtype, order=out.order.value, copy=False + ) + + +def choose(a, choices, out=None, mode="raise"): + """ + Construct a tensor from an index tensor and a set of tensors to choose from. + + First of all, if confused or uncertain, definitely look at the Examples - + in its full generality, this function is less simple than it might + seem from the following code description (below ndi = + `mt.lib.index_tricks`): + + ``mt.choose(a,c) == mt.array([c[a[I]][I] for I in ndi.ndindex(a.shape)])``. + + But this omits some subtleties. Here is a fully general summary: + + Given an "index" tensor (`a`) of integers and a sequence of `n` tensors + (`choices`), `a` and each choice tensor are first broadcast, as necessary, + to tensors of a common shape; calling these *Ba* and *Bchoices[i], i = + 0,...,n-1* we have that, necessarily, ``Ba.shape == Bchoices[i].shape`` + for each `i`. 
Then, a new array with shape ``Ba.shape`` is created as + follows: + + * if ``mode=raise`` (the default), then, first of all, each element of + `a` (and thus `Ba`) must be in the range `[0, n-1]`; now, suppose that + `i` (in that range) is the value at the `(j0, j1, ..., jm)` position + in `Ba` - then the value at the same position in the new array is the + value in `Bchoices[i]` at that same position; + + * if ``mode=wrap``, values in `a` (and thus `Ba`) may be any (signed) + integer; modular arithmetic is used to map integers outside the range + `[0, n-1]` back into that range; and then the new array is constructed + as above; + + * if ``mode=clip``, values in `a` (and thus `Ba`) may be any (signed) + integer; negative integers are mapped to 0; values greater than `n-1` + are mapped to `n-1`; and then the new tensor is constructed as above. + + Parameters + ---------- + a : int tensor + This tensor must contain integers in `[0, n-1]`, where `n` is the number + of choices, unless ``mode=wrap`` or ``mode=clip``, in which cases any + integers are permissible. + choices : sequence of tensors + Choice tensors. `a` and all of the choices must be broadcastable to the + same shape. If `choices` is itself a tensor (not recommended), then + its outermost dimension (i.e., the one corresponding to + ``choices.shape[0]``) is taken as defining the "sequence". + out : tensor, optional + If provided, the result will be inserted into this tensor. It should + be of the appropriate shape and dtype. + mode : {'raise' (default), 'wrap', 'clip'}, optional + Specifies how indices outside `[0, n-1]` will be treated: + + * 'raise' : an exception is raised + * 'wrap' : value becomes value mod `n` + * 'clip' : values < 0 are mapped to 0, values > n-1 are mapped to n-1 + + Returns + ------- + merged_array : Tensor + The merged result. + + Raises + ------ + ValueError: shape mismatch + If `a` and each choice tensor are not all broadcastable to the same + shape. + + See Also + -------- + Tensor.choose : equivalent method + + Notes + ----- + To reduce the chance of misinterpretation, even though the following + "abuse" is nominally supported, `choices` should neither be, nor be + thought of as, a single tensor, i.e., the outermost sequence-like container + should be either a list or a tuple. + + Examples + -------- + + >>> import mars.tensor as mt + + >>> choices = [[0, 1, 2, 3], [10, 11, 12, 13], + ... [20, 21, 22, 23], [30, 31, 32, 33]] + >>> mt.choose([2, 3, 1, 0], choices + ... # the first element of the result will be the first element of the + ... # third (2+1) "array" in choices, namely, 20; the second element + ... # will be the second element of the fourth (3+1) choice array, i.e., + ... # 31, etc. + ... 
).execute() + array([20, 31, 12, 3]) + >>> mt.choose([2, 4, 1, 0], choices, mode='clip').execute() # 4 goes to 3 (4-1) + array([20, 31, 12, 3]) + >>> # because there are 4 choice arrays + >>> mt.choose([2, 4, 1, 0], choices, mode='wrap').execute() # 4 goes to (4 mod 4) + array([20, 1, 12, 3]) + >>> # i.e., 0 + + A couple examples illustrating how choose broadcasts: + + >>> a = [[1, 0, 1], [0, 1, 0], [1, 0, 1]] + >>> choices = [-10, 10] + >>> mt.choose(a, choices).execute() + array([[ 10, -10, 10], + [-10, 10, -10], + [ 10, -10, 10]]) + + >>> # With thanks to Anne Archibald + >>> a = mt.array([0, 1]).reshape((2,1,1)) + >>> c1 = mt.array([1, 2, 3]).reshape((1,3,1)) + >>> c2 = mt.array([-1, -2, -3, -4, -5]).reshape((1,1,5)) + >>> mt.choose(a, (c1, c2)).execute() # result is 2x3x5, res[0,:,:]=c1, res[1,:,:]=c2 + array([[[ 1, 1, 1, 1, 1], + [ 2, 2, 2, 2, 2], + [ 3, 3, 3, 3, 3]], + [[-1, -2, -3, -4, -5], + [-1, -2, -3, -4, -5], + [-1, -2, -3, -4, -5]]]) + + """ + a = astensor(a, dtype="i8") + choices = [astensor(c) for c in choices] + + dtype = np.result_type(*[c.dtype for c in choices]) + op = TensorChoose(mode=mode, dtype=dtype) + return op(a, choices, out=out) diff --git a/python/xorbits/_mars/tensor/indexing/compress.py b/python/xorbits/_mars/tensor/indexing/compress.py new file mode 100644 index 000000000..f0168f93a --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/compress.py @@ -0,0 +1,122 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..core import Tensor +from ..datasource import tensor as astensor +from ..utils import validate_axis + + +def compress(condition, a, axis=None, out=None): + """ + Return selected slices of a tensor along given axis. + + When working along a given axis, a slice along that axis is returned in + `output` for each index where `condition` evaluates to True. When + working on a 1-D array, `compress` is equivalent to `extract`. + + Parameters + ---------- + condition : 1-D tensor of bools + Tensor that selects which entries to return. If len(condition) + is less than the size of `a` along the given axis, then output is + truncated to the length of the condition tensor. + a : array_like + Tensor from which to extract a part. + axis : int, optional + Axis along which to take slices. If None (default), work on the + flattened tensor. + out : Tensor, optional + Output tensor. Its type is preserved and it must be of the right + shape to hold the output. + + Returns + ------- + compressed_array : Tensor + A copy of `a` without the slices along axis for which `condition` + is false. 
+ + See Also + -------- + take, choose, diag, diagonal, select + Tensor.compress : Equivalent method in ndarray + mt.extract: Equivalent method when working on 1-D arrays + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4], [5, 6]]) + >>> a.execute() + array([[1, 2], + [3, 4], + [5, 6]]) + >>> mt.compress([0, 1], a, axis=0).execute() + array([[3, 4]]) + >>> mt.compress([False, True, True], a, axis=0).execute() + array([[3, 4], + [5, 6]]) + >>> mt.compress([False, True], a, axis=1).execute() + array([[2], + [4], + [6]]) + + Working on the flattened tensor does not return slices along an axis but + selects elements. + + >>> mt.compress([False, True], a).execute() + array([2]) + + """ + a = astensor(a) + condition = astensor(condition, dtype=bool) + + if condition.ndim != 1: + raise ValueError("condition must be an 1-d tensor") + + if axis is None: + a = a.ravel() + if len(condition) < a.size: + a = a[: len(condition)] + return a[condition] + + try: + axis = validate_axis(a.ndim, axis) + except ValueError: + raise np.AxisError( + f"axis {axis} is out of bounds for tensor of dimension {a.ndim}" + ) + + try: + if len(condition) < a.shape[axis]: + a = a[(slice(None),) * axis + (slice(len(condition)),)] + t = a[(slice(None),) * axis + (condition,)] + if out is None: + return t + + if out is not None and not isinstance(out, Tensor): + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + if not np.can_cast(out.dtype, t.dtype, "safe"): + raise TypeError( + f"Cannot cast array data from dtype('{out.dtype}') to dtype('{t.dtype}') " + "according to the rule 'safe'" + ) + # skip shape check because out shape is unknown + out.data = t.astype(out.dtype, order=out.order.value).data + return out + except IndexError: + raise np.AxisError( + f"axis {len(condition)} is out of bounds for tensor of dimension 1" + ) diff --git a/python/xorbits/_mars/tensor/indexing/core.py b/python/xorbits/_mars/tensor/indexing/core.py new file mode 100644 index 000000000..13d1136e4 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/core.py @@ -0,0 +1,193 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
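+
+# Shared helpers for tensor indexing: ``preprocess_index`` validates raw index
+# objects and converts list/ndarray indexes where needed, ``process_index``
+# pads the item so that every axis gets an explicit entry, and ``calc_shape``
+# computes the shape of the indexed result.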
+ + +import itertools +from numbers import Integral + +import numpy as np + +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ..datasource import tensor as astensor +from ..utils import broadcast_shape, calc_sliced_size, index_ndim, replace_ellipsis + +_INDEX_ERROR_MSG = ( + "only integers, slices (`:`), ellipsis (`...`), " + "numpy.newaxis (`None`) and integer or boolean arrays are valid indices" +) + + +def calc_shape(tensor_shape, index): + shape = [] + in_axis = 0 + out_axis = 0 + fancy_index = None + fancy_index_shapes = [] + for ind in index: + if ( + isinstance(ind, TENSOR_TYPE + TENSOR_CHUNK_TYPE + (np.ndarray,)) + and ind.dtype == np.bool_ + ): + # bool + shape.append(np.nan if not isinstance(ind, np.ndarray) else int(ind.sum())) + for i, t_size, size in zip( + itertools.count(0), + ind.shape, + tensor_shape[in_axis : ind.ndim + in_axis], + ): + if not np.isnan(t_size) and not np.isnan(size) and t_size != size: + raise IndexError( + f"boolean index did not match indexed array along dimension {in_axis + i}; " + f"dimension is {size} but corresponding boolean dimension is {t_size}" + ) + in_axis += ind.ndim + out_axis += 1 + elif isinstance(ind, TENSOR_TYPE + TENSOR_CHUNK_TYPE + (np.ndarray,)): + first_fancy_index = False + if fancy_index is None: + first_fancy_index = True + fancy_index = out_axis + if isinstance(ind, np.ndarray) and np.any(ind >= tensor_shape[in_axis]): + out_of_range_index = next( + i for i in ind.flat if i >= tensor_shape[in_axis] + ) + raise IndexError( + f"IndexError: index {out_of_range_index} is out of " + f"bounds with size {tensor_shape[in_axis]}" + ) + fancy_index_shapes.append(ind.shape) + in_axis += 1 + if first_fancy_index: + out_axis += ind.ndim + elif isinstance(ind, slice): + if np.isnan(tensor_shape[in_axis]): + shape.append(np.nan) + else: + shape.append(calc_sliced_size(tensor_shape[in_axis], ind)) + in_axis += 1 + out_axis += 1 + elif isinstance(ind, Integral): + size = tensor_shape[in_axis] + if not np.isnan(size) and ind >= size: + raise IndexError( + f"index {ind} is out of bounds for axis {in_axis} with size {size}" + ) + in_axis += 1 + else: + assert ind is None + shape.append(1) + + if fancy_index is not None: + try: + if any(np.isnan(np.prod(s)) for s in fancy_index_shapes): + fancy_index_shape = (np.nan,) * len(fancy_index_shapes[0]) + else: + fancy_index_shape = broadcast_shape(*fancy_index_shapes) + shape = shape[:fancy_index] + list(fancy_index_shape) + shape[fancy_index:] + except ValueError: + raise IndexError( + "shape mismatch: indexing arrays could not be broadcast together " + "with shapes {0}".format(" ".join(str(s) for s in fancy_index_shapes)) + ) + + return shape + + +def preprocess_index(index, convert_bool_to_fancy=None): + from .nonzero import nonzero + + inds = [] + fancy_indexes = [] + bool_indexes = [] + all_fancy_index_ndarray = True + all_bool_index_ndarray = True + for j, ind in enumerate(index): + if isinstance(ind, (list, np.ndarray) + TENSOR_TYPE): + if not isinstance(ind, TENSOR_TYPE): + ind = np.array(ind) + if ind.dtype.kind not in "biu": + raise IndexError(_INDEX_ERROR_MSG) + if ind.dtype.kind == "b": + # bool indexing + bool_indexes.append(j) + if not isinstance(ind, np.ndarray): + all_bool_index_ndarray = False + else: + # fancy indexing + fancy_indexes.append(j) + if not isinstance(ind, np.ndarray): + all_fancy_index_ndarray = False + elif ( + not isinstance(ind, (slice, Integral)) + and ind is not None + and ind is not Ellipsis + ): + raise IndexError(_INDEX_ERROR_MSG) + inds.append(ind) + + if 
convert_bool_to_fancy is None:
+        convert_bool_to_fancy = (fancy_indexes and len(bool_indexes) > 0) or len(
+            bool_indexes
+        ) > 1
+
+    if not all_fancy_index_ndarray or (
+        convert_bool_to_fancy and not all_bool_index_ndarray
+    ):
+        # if not all fancy indexes are ndarrays,
+        # or bool indexes need to be converted to fancy indexes
+        # and not all bool indexes are ndarrays,
+        # convert all fancy indexes to tensors
+        for fancy_index in fancy_indexes:
+            inds[fancy_index] = astensor(inds[fancy_index])
+
+    # convert bool indexes to fancy indexes when either of the situations
+    # below is met:
+    # 1. fancy indexes and bool indexes both exist
+    # 2. there is more than one bool index
+    if convert_bool_to_fancy:
+        default_m = None
+        if len(fancy_indexes) > 0:
+            default_m = (
+                np.nonzero
+                if isinstance(inds[fancy_indexes[0]], np.ndarray)
+                else nonzero
+            )
+        for bool_index in bool_indexes:
+            ind = inds[bool_index]
+            m = default_m
+            if m is None:
+                m = np.nonzero if isinstance(ind, np.ndarray) else nonzero
+            ind = m(ind)[0]
+            inds[bool_index] = ind
+
+    return tuple(inds)
+
+
+def process_index(tensor_ndim, item, convert_bool_to_fancy=None):
+    if isinstance(item, list):
+        arr = np.array(item)
+        if arr.dtype == object:
+            item = tuple(item)
+        elif arr.dtype.kind == "f":
+            raise IndexError(_INDEX_ERROR_MSG)
+        else:
+            item = (arr,)
+    elif not isinstance(item, tuple):
+        item = (item,)
+
+    index = preprocess_index(item, convert_bool_to_fancy=convert_bool_to_fancy)
+    index = replace_ellipsis(index, tensor_ndim)
+    missing = tensor_ndim - sum(index_ndim(i) for i in index)
+    if missing < 0:
+        raise IndexError("too many indices for tensor")
+    return index + (slice(None),) * missing
diff --git a/python/xorbits/_mars/tensor/indexing/extract.py b/python/xorbits/_mars/tensor/indexing/extract.py
new file mode 100644
index 000000000..243d6cdb0
--- /dev/null
+++ b/python/xorbits/_mars/tensor/indexing/extract.py
@@ -0,0 +1,69 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..datasource import tensor as astensor
+
+
+def extract(condition, a):
+    """
+    Return the elements of a tensor that satisfy some condition.
+
+    This is equivalent to ``mt.compress(ravel(condition), ravel(arr))``. If
+    `condition` is boolean ``mt.extract`` is equivalent to ``arr[condition]``.
+
+    Note that `place` does the exact opposite of `extract`.
+
+    Parameters
+    ----------
+    condition : array_like
+        An array whose nonzero or True entries indicate the elements of `arr`
+        to extract.
+    a : array_like
+        Input tensor of the same size as `condition`.
+
+    Returns
+    -------
+    extract : Tensor
+        Rank 1 tensor of values from `arr` where `condition` is True.
+ + See Also + -------- + take, put, copyto, compress, place + + Examples + -------- + >>> import mars.tensor as mt + + >>> arr = mt.arange(12).reshape((3, 4)) + >>> arr.execute() + array([[ 0, 1, 2, 3], + [ 4, 5, 6, 7], + [ 8, 9, 10, 11]]) + >>> condition = mt.mod(arr, 3)==0 + >>> condition.execute() + array([[ True, False, False, True], + [False, False, True, False], + [False, True, False, False]]) + >>> mt.extract(condition, arr).execute() + array([0, 3, 6, 9]) + + + If `condition` is boolean: + + >>> arr[condition].execute() + array([0, 3, 6, 9]) + + """ + condition = astensor(condition, dtype=bool) + return a[condition] diff --git a/python/xorbits/_mars/tensor/indexing/fill_diagonal.py b/python/xorbits/_mars/tensor/indexing/fill_diagonal.py new file mode 100644 index 000000000..5337e2dd1 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/fill_diagonal.py @@ -0,0 +1,465 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int32Field, KeyField +from ...utils import ceildiv, has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TENSOR_TYPE, Tensor +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import decide_unify_split + + +class TensorFillDiagonal(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.FILL_DIAGONAL + + _input = KeyField("input") + _val = AnyField("val") + _wrap = BoolField("wrap") + # used for chunk + _k = Int32Field("k") + + def __init__(self, val=None, wrap=None, k=None, **kw): + super().__init__(_val=val, _wrap=wrap, _k=k, **kw) + + @property + def input(self): + return self._input + + @property + def val(self): + return self._val + + @property + def wrap(self): + return self._wrap + + @property + def k(self): + return self._k + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(self._inputs) == 2: + self._val = self._inputs[1] + + def __call__(self, a, val=None): + inputs = [a] + if val is not None: + inputs.append(val) + return self.new_tensor(inputs, shape=a.shape, order=a.order) + + @staticmethod + def _process_val(val, a, wrap): + """ + given the `val`, `a`, `wrap` which are the arguments in `fill_diagonal`, + do some preprocess on `val` includes: + + 1. calculate the length to fill on diagonal, 2-d and n-d(n > 2) + as well as that `wrap` is True and `a` is a tall matrix need to be considered. + 2. if val is a Tensor, rechunk it into one chunk. 
+ """ + + from ..base import tile + from ..datasource import diag + + is_val_tensor = isinstance(val, TENSOR_TYPE) + + if a.ndim == 2: + if wrap and TensorFillDiagonal._is_tall(a): + size = sum( + diag(sub).shape[0] + for sub in TensorFillDiagonal._split_tall_matrix(a) + ) + else: + size = diag(a).shape[0] + else: + # every dimension has same shape + size = a.shape[0] + + repeat_method = tile if is_val_tensor else np.tile + val_size = val.size + if val_size < size: + n = ceildiv(size, val_size) + val = repeat_method(val, n)[:size] + elif val_size > size: + val = val[:size] + + if is_val_tensor and val.ndim > 0: + val = yield from recursive_tile(val) + val = val.rechunk({0: val.size}) + + return (yield from recursive_tile(val)) if is_val_tensor else val + + @staticmethod + def _gen_val(val, diag_idx, cum_sizes): + """ + Given a tensor-level `val`, calculate the chunk-level `val`. + Consider both the cases that `val` could be a tensor or ndarray. + + :param val: tensor-level `val` + :diag_idx: chunk index on the diagonal direction + :cum_sizes: accumulative chunk sizes on the diagonal direction + """ + from .slice import TensorSlice + + if val.ndim == 0: + if isinstance(val, TENSOR_TYPE): + return val.chunks[0] + else: + return val + + if isinstance(val, TENSOR_TYPE): + start, stop = cum_sizes[diag_idx], cum_sizes[diag_idx + 1] + slc = slice(start, stop) + slc_op = TensorSlice(slices=[slc], dtype=val.dtype) + return slc_op.new_chunk( + [val.chunks[0]], + shape=(stop - start,), + order=val.order, + index=(diag_idx,), + ) + else: + return val[cum_sizes[diag_idx] : cum_sizes[diag_idx + 1]] + + @classmethod + def _tile_2d(cls, op, val): + from ..datasource import diag + + d = yield from recursive_tile(diag(op.input)) + index_to_diag_chunk = {c.inputs[0].index: c for c in d.chunks} + cum_sizes = [0] + np.cumsum(d.nsplits[0]).tolist() + + out_chunks = [] + for chunk in op.input.chunks: + if chunk.index not in index_to_diag_chunk: + out_chunks.append(chunk) + else: + diag_chunk = index_to_diag_chunk[chunk.index] + diag_idx = diag_chunk.index[0] + input_chunks = [chunk] + chunk_val = cls._gen_val(val, diag_idx, cum_sizes) + if len(op.inputs) == 2: + input_chunks.append(chunk_val) + chunk_op = op.copy().reset_key() + chunk_op._wrap = False + chunk_op._k = diag_chunk.op.k + chunk_op._val = chunk_val + out_chunk = chunk_op.new_chunk( + input_chunks, + shape=chunk.shape, + order=chunk.order, + index=chunk.index, + ) + out_chunks.append(out_chunk) + + out = op.outputs[0] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=out_chunks, + nsplits=op.input.nsplits, + ) + + @classmethod + def _tile_nd(cls, op, val): + # if more than 3d, we will rechunk the tensor into square chunk + # on the diagonal direction + in_tensor = op.input + nsplits = [tuple(np.array(split)) for split in in_tensor.nsplits] + if len(set(nsplits)) != 1: + # need rechunk + nsplit = decide_unify_split(*in_tensor.nsplits) + in_tensor = yield from recursive_tile( + in_tensor.rechunk(tuple(nsplit for _ in range(in_tensor.ndim))) + ) + cum_sizes = [0] + np.cumsum(in_tensor.nsplits[0]).tolist() + + out_chunks = [] + for chunk in in_tensor.chunks: + if len(set(chunk.index)) == 1: + # chunk on the diagonal direction + chunk_op = op.copy().reset_key() + chunk_op._k = 0 + chunk_inputs = [chunk] + chunk_val = cls._gen_val(val, chunk.index[0], cum_sizes) + if len(op.inputs) == 2: + chunk_inputs.append(chunk_val) + chunk_op._val = chunk_val + out_chunk = chunk_op.new_chunk( + chunk_inputs, + 
shape=chunk.shape, + order=chunk.order, + index=chunk.index, + ) + out_chunks.append(out_chunk) + else: + out_chunks.append(chunk) + + out = op.outputs[0] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=out_chunks, + nsplits=in_tensor.nsplits, + ) + + @classmethod + def _tile_one_chunk(cls, op, val): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_inputs = [op.input.chunks[0]] + if isinstance(val, TENSOR_TYPE): + chunk_inputs.append(val.chunks[0]) + chunk = chunk_op.new_chunk( + chunk_inputs, shape=out.shape, order=out.order, index=(0,) * out.ndim + ) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=[chunk], + nsplits=((s,) for s in out.shape), + ) + + @staticmethod + def _is_tall(x): + return x.shape[0] > x.shape[1] + 1 + + @staticmethod + def _split_tall_matrix(a): + blocksize = a.shape[1] + 1 + n_block = ceildiv(a.shape[0], blocksize) + return [a[i * blocksize : (i + 1) * blocksize] for i in range(n_block)] + + @classmethod + def tile(cls, op): + # input tensor must have no unknown chunk shape + if has_unknown_shape(*op.inputs): + yield + + in_tensor = op.input + is_in_tensor_tall = cls._is_tall(in_tensor) + + if op.val.ndim > 0: + val = yield from cls._process_val(op.val, in_tensor, op.wrap) + else: + val = op.val + + if len(in_tensor.chunks) == 1: + return cls._tile_one_chunk(op, val) + + if op.input.ndim == 2: + if op.wrap and is_in_tensor_tall: + from ..merge import concatenate + + sub_tensors = cls._split_tall_matrix(in_tensor) + for i, sub_tensor in enumerate(sub_tensors): + if val.ndim > 0: + sub_val = val[ + i * sub_tensor.shape[1] : (i + 1) * sub_tensor.shape[1] + ] + else: + sub_val = val + fill_diagonal(sub_tensor, sub_val, wrap=False) + out_tensor = concatenate(sub_tensors) + return [(yield from recursive_tile(out_tensor))] + else: + return (yield from cls._tile_2d(op, val)) + else: + return (yield from cls._tile_nd(op, val)) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + a = inputs[0] + if len(inputs) == 2: + val = inputs[1] + else: + val = op.val + + with device(device_id): + if not op.k: + a = a.copy() + xp.fill_diagonal(a, val, wrap=op.wrap) + else: + assert a.ndim == 2 + k = op.k or 0 + n_rows, n_cols = a.shape + if k > 0: + n_cols -= k + elif k < 0: + n_rows += k + n = min(n_rows, n_cols) + + # generate indices + rows, cols = np.diag_indices(n) + if k > 0: + cols = cols.copy() + cols += k + elif k < 0: + rows = rows.copy() + rows -= k + + a = a.copy() + a[rows, cols] = val + + ctx[op.outputs[0].key] = a + + +def fill_diagonal(a, val, wrap=False): + """Fill the main diagonal of the given tensor of any dimensionality. + + For a tensor `a` with ``a.ndim >= 2``, the diagonal is the list of + locations with indices ``a[i, ..., i]`` all identical. This function + modifies the input tensor in-place, it does not return a value. + + Parameters + ---------- + a : Tensor, at least 2-D. + Tensor whose diagonal is to be filled, it gets modified in-place. + + val : scalar + Value to be written on the diagonal, its type must be compatible with + that of the tensor a. + + wrap : bool + For tall matrices in NumPy version up to 1.6.2, the + diagonal "wrapped" after N columns. You can have this behavior + with this option. This affects only tall matrices. 
+ + See also + -------- + diag_indices, diag_indices_from + + Notes + ----- + + This functionality can be obtained via `diag_indices`, but internally + this version uses a much faster implementation that never constructs the + indices and uses simple slicing. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.zeros((3, 3), int) + >>> mt.fill_diagonal(a, 5) + >>> a.execute() + array([[5, 0, 0], + [0, 5, 0], + [0, 0, 5]]) + + The same function can operate on a 4-D tensor: + + >>> a = mt.zeros((3, 3, 3, 3), int) + >>> mt.fill_diagonal(a, 4) + + We only show a few blocks for clarity: + + >>> a[0, 0].execute() + array([[4, 0, 0], + [0, 0, 0], + [0, 0, 0]]) + >>> a[1, 1].execute() + array([[0, 0, 0], + [0, 4, 0], + [0, 0, 0]]) + >>> a[2, 2].execute() + array([[0, 0, 0], + [0, 0, 0], + [0, 0, 4]]) + + The wrap option affects only tall matrices: + + >>> # tall matrices no wrap + >>> a = mt.zeros((5, 3), int) + >>> mt.fill_diagonal(a, 4) + >>> a.execute() + array([[4, 0, 0], + [0, 4, 0], + [0, 0, 4], + [0, 0, 0], + [0, 0, 0]]) + + >>> # tall matrices wrap + >>> a = mt.zeros((5, 3), int) + >>> mt.fill_diagonal(a, 4, wrap=True) + >>> a.execute() + array([[4, 0, 0], + [0, 4, 0], + [0, 0, 4], + [0, 0, 0], + [4, 0, 0]]) + + >>> # wide matrices + >>> a = mt.zeros((3, 5), int) + >>> mt.fill_diagonal(a, 4, wrap=True) + >>> a.execute() + array([[4, 0, 0, 0, 0], + [0, 4, 0, 0, 0], + [0, 0, 4, 0, 0]]) + + The anti-diagonal can be filled by reversing the order of elements + using either `numpy.flipud` or `numpy.fliplr`. + + >>> a = mt.zeros((3, 3), int) + >>> mt.fill_diagonal(mt.fliplr(a), [1,2,3]) # Horizontal flip + >>> a.execute() + array([[0, 0, 1], + [0, 2, 0], + [3, 0, 0]]) + >>> mt.fill_diagonal(mt.flipud(a), [1,2,3]) # Vertical flip + >>> a.execute() + array([[0, 0, 3], + [0, 2, 0], + [1, 0, 0]]) + + Note that the order in which the diagonal is filled varies depending + on the flip function. + """ + + if not isinstance(a, Tensor): + raise TypeError(f"`a` should be a tensor, got {type(a)}") + if a.ndim < 2: + raise ValueError("array must be at least 2-d") + if a.ndim > 2 and len(set(a.shape)) != 1: + raise ValueError("All dimensions of input must be of equal length") + + # process val + if isinstance(val, ENTITY_TYPE): + val = astensor(val) + if val.ndim > 1: + val = val.ravel() + val_input = val + else: + val = np.asarray(val) + if val.ndim > 1: + val = val.ravel() + val_input = None + + op = TensorFillDiagonal(val=val, wrap=wrap, dtype=a.dtype) + t = op(a, val=val_input) + a.data = t.data diff --git a/python/xorbits/_mars/tensor/indexing/flatnonzero.py b/python/xorbits/_mars/tensor/indexing/flatnonzero.py new file mode 100644 index 000000000..935011c07 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/flatnonzero.py @@ -0,0 +1,58 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .nonzero import nonzero + + +def flatnonzero(a): + """ + Return indices that are non-zero in the flattened version of a. 
+ + This is equivalent to a.ravel().nonzero()[0]. + + Parameters + ---------- + a : Tensor + Input tensor. + + Returns + ------- + res : Tensor + Output tensor, containing the indices of the elements of `a.ravel()` + that are non-zero. + + See Also + -------- + nonzero : Return the indices of the non-zero elements of the input tensor. + ravel : Return a 1-D tensor containing the elements of the input tensor. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(-2, 3) + >>> x.execute() + array([-2, -1, 0, 1, 2]) + >>> mt.flatnonzero(x).execute() + array([0, 1, 3, 4]) + + Use the indices of the non-zero elements as an index array to extract + these elements: + + >>> x.ravel()[mt.flatnonzero(x)].execute() # TODO(jisheng): accomplish this after fancy indexing is supported + + """ + from ..base import ravel + + return nonzero(ravel(a))[0] diff --git a/python/xorbits/_mars/tensor/indexing/getitem.py b/python/xorbits/_mars/tensor/indexing/getitem.py new file mode 100644 index 000000000..e2d33f9d1 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/getitem.py @@ -0,0 +1,388 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Integral + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE +from ...core.operand import OperandStage +from ...serialization.serializables import ( + FieldTypes, + Int32Field, + KeyField, + ListField, + TupleField, +) +from ..array_utils import get_array_module +from ..core import TENSOR_TYPE, TensorOrder +from ..operands import TensorHasInput, TensorMapReduceOperand, TensorOperandMixin +from ..utils import calc_pos, filter_inputs, split_indexes_into_chunks +from .core import calc_shape, process_index +from .index_lib import TensorIndexesHandler + +FANCY_INDEX_TYPES = TENSOR_TYPE + (np.ndarray,) + + +class TensorIndex(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.INDEX + + _input = KeyField("input") + _indexes = ListField("indexes") + + def __init__(self, indexes=None, **kw): + super().__init__(_indexes=indexes, **kw) + + @property + def indexes(self): + return self._indexes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs[1:]) + new_indexes = [ + next(inputs_iter) if isinstance(index, ENTITY_TYPE) else index + for index in self._indexes + ] + self._indexes = new_indexes + + def on_output_modify(self, new_output): + from .setitem import TensorIndexSetValue + + if self.create_view: + a = self.input + op = TensorIndexSetValue( + dtype=a.dtype, + sparse=a.issparse(), + indexes=tuple(self._indexes), + value=new_output, + ) + return op(a, self._indexes, new_output) + + def on_input_modify(self, new_input): + if self.create_view: + new_op = self.copy().reset_key() + new_inputs = [new_input] + self.inputs[1:] + return new_op.new_tensor(new_inputs, shape=self.outputs[0].shape) + + def __call__(self, a, index, shape, order): + self._indexes = list(index) + return self.new_tensor(filter_inputs([a] 
+ list(index)), shape, order=order) + + @classmethod + def _tile_one_chunk(cls, op: "TensorIndex"): + inp = op.inputs[0] + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_params = out.params.copy() + chunk_params["shape"] = shape = tuple(calc_shape(inp.shape, op.indexes)) + chunk_params["index"] = (0,) * out.ndim + chunk = chunk_op.new_chunk( + [inp.chunks[0] for inp in op.inputs], kws=[chunk_params] + ) + params = out.params.copy() + params["chunks"] = [chunk] + params["nsplits"] = tuple((s,) for s in shape) + return op.copy().new_tensors(op.inputs, kws=[params]) + + @classmethod + def tile(cls, op): + if all(len(inp.chunks) == 1 for inp in op.inputs): + return cls._tile_one_chunk(op) + + handler = TensorIndexesHandler() + return [(yield from handler.handle(op))] + + @classmethod + def execute(cls, ctx, op): + indexes = tuple( + ctx[index.key] if hasattr(index, "key") else index for index in op.indexes + ) + input_ = ctx[op.inputs[0].key] + xp = get_array_module(input_) + ret = xp.asarray(input_)[indexes] + if hasattr(ret, "astype"): + ret = ret.astype(ret.dtype, order=op.outputs[0].order.value, copy=False) + ctx[op.outputs[0].key] = ret + + @classmethod + def estimate_size(cls, ctx, op): + chunk = op.outputs[0] + shape = chunk.shape + + if any(np.isnan(s) for s in shape): + return super().estimate_size(ctx, op) + + new_indexes = [index for index in op._indexes if index is not None] + new_shape = [] + first_fancy_index = False + for index in new_indexes: + if isinstance(index, ENTITY_TYPE): + if index.dtype != np.bool_: + if not first_fancy_index: + first_fancy_index = True + else: + continue + new_shape.append(ctx[index.key][0] // index.dtype.itemsize) + + rough_shape = [] + idx = 0 + for s in shape: + if np.isnan(s): + rough_shape.append(new_shape[idx]) + idx += 1 + else: + rough_shape.append(s) + result = int(np.prod(rough_shape) * chunk.dtype.itemsize) + ctx[chunk.key] = (result, result) + + +class FancyIndexingDistribute(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.FANCY_INDEX_DISTRIBUTE + + _input = KeyField("input") + _dest_nsplits = TupleField("dest_nsplits", FieldTypes.tuple(FieldTypes.uint64)) + _axes = TupleField("axes", FieldTypes.int32) + + def __init__(self, dest_nsplits=None, axes=None, **kw): + super().__init__(_dest_nsplits=dest_nsplits, _axes=axes, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @property + def output_limit(self): + if self.stage == OperandStage.map: + return 1 + # return fancy indexes on each axis as well as original position + return len(self._axes) + 1 + + @property + def dest_nsplits(self): + return self._dest_nsplits + + @property + def axes(self): + return self._axes + + @classmethod + def _execute_map(cls, ctx, op): + nsplits = op.dest_nsplits + axes = op.axes + fancy_index_nsplits = [nsplits[ax] for ax in axes] + indexes = ctx[op.inputs[0].key] + flatten_indexes = indexes.reshape(indexes.shape[0], -1) + idx_to_fancy_indexes, idx_to_poses = split_indexes_into_chunks( + fancy_index_nsplits, flatten_indexes, False + ) + for idx in idx_to_fancy_indexes: + ctx[op.outputs[0].key, idx] = (idx_to_fancy_indexes[idx], idx_to_poses[idx]) + + @classmethod + def _execute_reduce(cls, ctx, op: "FancyIndexingDistribute"): + fancy_indexes = [] + poses = [] + xp = None + for fancy_index, pos in op.iter_mapper_data(ctx): + if xp is None: + xp = get_array_module(fancy_index) + if fancy_index.size == 0: + fancy_index = fancy_index.reshape(len(op.axes), 0) + 
fancy_indexes.append(fancy_index) + poses.append(pos) + + fancy_index = np.hstack(fancy_indexes) + pos = np.hstack(poses) + + assert len(op.outputs) - 1 == len(fancy_index) + for out_chunk, axis_fancy_index in zip(op.outputs[:-1], fancy_index): + ctx[out_chunk.key] = axis_fancy_index + ctx[op.outputs[-1].key] = np.asarray([len(p) for p in poses]), pos + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) + + @classmethod + def estimate_size(cls, ctx, op): + if op.stage == OperandStage.map: + fancy_index_size = len(op.axes) + inp_size = ctx[op.inputs[0].key][0] + factor = ( + 1 / float(fancy_index_size) + fancy_index_size + ) # 1/#fancy_index is the poses + ctx[op.outputs[0].key] = (inp_size * factor,) * 2 + else: + sum_size = 0 + for shuffle_input in op.inputs[0].inputs or (): + sum_size += ctx[shuffle_input.key] + for out_chunk in op.outputs: + ctx[out_chunk.key] = sum_size, sum_size + + +class FancyIndexingConcat(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.FANCY_INDEX_CONCAT + + _fancy_index_axis = Int32Field("fancy_index_axis") + _fancy_index_shape = TupleField("fancy_index_shape", FieldTypes.int64) + + def __init__(self, fancy_index_axis=None, fancy_index_shape=None, **kw): + super().__init__( + _fancy_index_axis=fancy_index_axis, + _fancy_index_shape=fancy_index_shape, + **kw + ) + + @property + def input(self): + return self._input + + @property + def fancy_index_axis(self): + return self._fancy_index_axis + + @property + def fancy_index_shape(self): + return self._fancy_index_shape + + @classmethod + def _execute_map(cls, ctx, op): + indexed_array = ctx[op.inputs[0].key] + sizes, pos = ctx[op.inputs[1].key] + acc_sizes = np.cumsum(sizes) + fancy_index_axis = op.fancy_index_axis + + for i in range(len(sizes)): + start = 0 if i == 0 else acc_sizes[i - 1] + end = acc_sizes[i] + select = (slice(None),) * fancy_index_axis + (slice(start, end),) + ctx[op.outputs[0].key, (i,)] = (indexed_array[select], pos[start:end]) + + @classmethod + def _execute_reduce(cls, ctx, op: "FancyIndexingConcat"): + fancy_index_axis = op.fancy_index_axis + fancy_index_shape = op.fancy_index_shape + + indexed_arrays = [] + poses = [] + for index_array, pos in op.iter_mapper_data(ctx): + indexed_arrays.append(index_array) + poses.append(pos) + + concat_array = get_array_module(indexed_arrays[0]).concatenate( + indexed_arrays, axis=fancy_index_axis + ) + concat_pos = get_array_module(poses[0]).hstack(poses) + select_pos = calc_pos( + fancy_index_shape, concat_pos, xp=get_array_module(poses[0]) + ) + select = (slice(None),) * fancy_index_axis + (select_pos,) + ctx[op.outputs[0].key] = concat_array[select] + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) + + @classmethod + def estimate_size(cls, ctx, op): + if op.stage == OperandStage.map: + input_size = ctx[op.inputs[0].key][0] + pos_size = ctx[op.inputs[0].key][0] + ctx[op.outputs[0].key] = input_size + pos_size, input_size + pos_size * 2 + else: + chunk = op.outputs[0] + input_sizes = [ctx[c.key][0] for c in op.inputs[0].inputs or ()] + ctx[chunk.key] = chunk.nbytes, chunk.nbytes + sum(input_sizes) + + +def _is_bool_index(index_obj): + return isinstance(index_obj, TENSOR_TYPE) and index_obj.dtype == np.bool_ + + +def _is_fancy_index(index_obj): + return isinstance(index_obj, FANCY_INDEX_TYPES) and index_obj.dtype != np.bool_ + + +def 
_is_create_view(index): + # is view if all of index is slice, int or newaxis + return all(isinstance(ind, (slice, Integral)) or ind is None for ind in index) + + +def _calc_order(a, index): + if a.order == TensorOrder.C_ORDER: + return TensorOrder.C_ORDER + + in_axis = 0 + for ind in index: + if _is_bool_index(ind): + in_axis += ind.ndim + return TensorOrder.C_ORDER + elif _is_fancy_index(ind): + in_axis += 1 + return TensorOrder.C_ORDER + elif ind is None: + continue + elif isinstance(ind, slice): + shape = a.shape[in_axis] + slc = ind.indices(shape) + if slc[0] == 0 and slc[1] == shape and slc[2] == 1: + continue + else: + return TensorOrder.C_ORDER + else: + assert isinstance(ind, Integral) + in_axis += 1 + return TensorOrder.C_ORDER + + return TensorOrder.F_ORDER + + +def _getitem_nocheck(a, item, convert_bool_to_fancy=None): + index = process_index(a.ndim, item, convert_bool_to_fancy=convert_bool_to_fancy) + if convert_bool_to_fancy is False: + # come from __setitem__, the bool index is not converted to fancy index + # if multiple bool indexes or bool + fancy indexes exist, + # thus the shape will be wrong, + # here we just convert when calculating shape, + # refer to issue #1282. + shape = calc_shape(a.shape, process_index(a.ndim, index)) + else: + shape = calc_shape(a.shape, index) + tensor_order = _calc_order(a, index) + op = TensorIndex( + dtype=a.dtype, + sparse=a.issparse(), + indexes=list(index), + create_view=_is_create_view(index), + ) + return op(a, index, tuple(shape), order=tensor_order) + + +def _getitem(a, item): + if isinstance(item, (list, tuple)) and all( + isinstance(it, slice) and it == slice(None) for it in item + ): + # nothing to do + return a + + # TODO(jisheng): field access, e.g. t['a'], t[['a', 'b']] + return _getitem_nocheck(a, item) diff --git a/python/xorbits/_mars/tensor/indexing/index_lib.py b/python/xorbits/_mars/tensor/indexing/index_lib.py new file mode 100644 index 000000000..54f8c73d5 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/index_lib.py @@ -0,0 +1,1062 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import itertools +from abc import ABC, abstractmethod +from collections import OrderedDict, namedtuple +from enum import Enum +from numbers import Integral +from operator import itemgetter +from typing import List, Tuple, Union + +import numpy as np + +from ...core import Tileable, recursive_tile +from ...core.operand import OperandStage +from ...utils import calc_nsplits, has_unknown_shape +from ..core import TENSOR_TYPE, Chunk, TensorOrder +from ..operands import TensorShuffleProxy +from ..utils import ( + broadcast_shape, + calc_pos, + calc_sliced_size, + filter_inputs, + slice_split, + split_indexes_into_chunks, + unify_chunks, +) + + +class IndexType(Enum): + new_axis = 0 + slice = 1 + label_slice = 2 # e.g. 'a': 'd' used for pandas etc + integer = 3 + label = 4 # e.g. 'a' used for pandas etc + bool_index = 5 + fancy_index = 6 + label_fancy_index = 7 # e.g. 
['a', 'b', 'c'] for pandas etc + + +class IndexInfo: + def __init__( + self, + index_type: IndexType, + input_axis: int, + output_axis: int, + raw_index, + handler, + ): + self.index_type = index_type + self.input_axis = input_axis + self.output_axis = output_axis + self.raw_index = raw_index + self.handler = handler + + +class FancyIndexInfo(IndexInfo): + def __init__( + self, + index_type: IndexType, + input_axis: int, + output_axis: int, + raw_index, + handler, + ): + super().__init__(index_type, input_axis, output_axis, raw_index, handler) + + # extra info for fancy index + # shape broadcast index + self.shape_unified_index = None + # split info + # - chunk_index_to_fancy_index_arrays + # - chunk_index_to_raw_positions + # - is_fancy_index_asc_sorted + self.split_info = None + + +ChunkIndexAxisInfo = namedtuple( + "chunk_index_axis_info", ["output_axis_index", "processed_index", "output_shape"] +) + + +class ChunkIndexInfo: + def __init__(self): + self.indexes = [] + self.output_chunk_index = [] + self.output_chunk_shape = [] + + def set(self, info: ChunkIndexAxisInfo): + output_axis_index = info.output_axis_index + if output_axis_index is not None: + self.output_chunk_index.append(output_axis_index) + self.indexes.append(info.processed_index) + output_shape = info.output_shape + if output_shape is not None: + if not isinstance(output_shape, tuple): + self.output_chunk_shape.append(output_shape) + else: + self.output_chunk_shape.extend(output_shape) + + +class IndexHandlerContext(ABC): + def __init__(self, op): + self.parsed_infos = [] + self.input_axis = 0 + self.output_axis = 0 + + # store index_type -> positions + # for a quick search on indexes of a specified index type + self._index_type_to_positions = dict() + + # store chunk index -> ChunkIndexInfo + # for the IndexHandler to process + self.chunk_index_to_info = OrderedDict() + self.op = op + self.tileable = op.input + self.set_tileable(self.tileable) + + # chunks and nsplits, used for store intermediate result + self.processed_chunks = None + self.out_chunks = None + self.out_nsplits = None + + def append(self, index_info: IndexInfo): + position = len(self.parsed_infos) + if index_info.index_type not in self._index_type_to_positions: + self._index_type_to_positions[index_info.index_type] = [] + self._index_type_to_positions[index_info.index_type].append(position) + self.parsed_infos.append(index_info) + + def get_positions(self, index_type: IndexType) -> List[int]: + return self._index_type_to_positions.get(index_type, []) + + def get_indexes(self, index_type: IndexType): + return [self.parsed_infos[i] for i in self.get_positions(index_type)] + + def set_tileable(self, tileable: Tileable): + for chunk in tileable.chunks: + self.chunk_index_to_info[chunk.index] = ChunkIndexInfo() + + @abstractmethod + def concat_chunks(self, chunks: List[Chunk], axis: Union[Tuple, int]) -> Chunk: + pass + + @abstractmethod + def create_chunk( + self, chunk_index: Tuple[int], chunk_index_info: ChunkIndexInfo + ) -> Chunk: + pass + + def create_tileable(self) -> Tileable: + out = self.op.outputs[0] + params = out.params + params["chunks"] = self.out_chunks + params["nsplits"] = self.out_nsplits + if "shape" in params and any(np.isnan(s) for s in params["shape"]): + params["shape"] = tuple(sum(ns) for ns in self.out_nsplits) + new_op = out.op.copy() + return new_op.new_tileable(out.inputs, kws=[params]) + + +class TensorIndexHandlerContext(IndexHandlerContext): + def concat_chunks(self, chunks: List[Chunk], axis: Union[Tuple[int], int]) -> Chunk: + 
from ..merge import TensorConcatenate + + assert isinstance(axis, int), "axis to concat could only be int for tensor" + + shape = list(chunks[0].shape) + shape[axis] = sum(c.shape[axis] for c in chunks) + chunk_index = list(chunks[0].index) + chunk_index[axis] = 0 + + op = TensorConcatenate( + axis=axis, dtype=chunks[0].dtype, sparse=chunks[0].issparse() + ) + return op.new_chunk( + chunks, + shape=tuple(shape), + index=tuple(chunk_index), + order=TensorOrder.C_ORDER, + ) + + def create_chunk( + self, chunk_index: Tuple[int], chunk_index_info: ChunkIndexInfo + ) -> Chunk: + chunk_op = self.op.copy().reset_key() + chunk_op._indexes = indexes = chunk_index_info.indexes + chunk_input = ( + self.tileable.chunks[0] + if self.tileable.ndim == 0 + else self.tileable.cix[chunk_index] + ) + chunk_inputs = filter_inputs([chunk_input] + indexes) + return chunk_op.new_chunk( + chunk_inputs, + shape=tuple(chunk_index_info.output_chunk_shape), + index=tuple(chunk_index_info.output_chunk_index), + order=self.op.outputs[0].order, + ) + + +_type_to_instance = {} + + +class IndexHandler(ABC): + @classmethod + def get_instance(cls): + if cls not in _type_to_instance: + _type_to_instance[cls] = cls() + return _type_to_instance[cls] + + @abstractmethod + def accept(cls, raw_index): + pass + + @abstractmethod + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + pass + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + pass + + @abstractmethod + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + pass + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + pass + + @classmethod + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + _ = context, index_info, chunk_index + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=index, + output_shape=output_shape, + ) + ) + + +class NewaxisIndexHandler(IndexHandler): + def accept(self, raw_index): + return raw_index is np.newaxis + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = IndexInfo( + IndexType.new_axis, context.input_axis, context.output_axis, raw_index, self + ) + context.output_axis += 1 + context.append(info) + return info + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + for chunk_index_info in context.chunk_index_to_info.values(): + # index on axis and index object + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=0, processed_index=None, output_shape=1 + ) + ) + + +class SliceIndexHandler(IndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, slice) + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = IndexInfo( + IndexType.slice, context.input_axis, context.output_axis, raw_index, self + ) + context.input_axis += 1 + context.output_axis += 1 + context.append(info) + return info + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + # make sure input tileable has known chunk shapes + if has_unknown_shape(context.tileable): + yield + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + # slice.step < 0 + is_reversed = (index_info.raw_index.step or 0) < 0 + + # 
e.g. slice_split(slice(3, 10), [2, 2, 7, 5]) + # return {1: slice(1, 2, 1), 2: slice(0, 6, 1)} + effected_i_to_slice = slice_split( + index_info.raw_index, tileable.nsplits[index_info.input_axis] + ) + output_axis_index_range = ( + range(len(effected_i_to_slice)) + if not is_reversed + else range(len(effected_i_to_slice) - 1, -1, -1) + ) + other_index_to_iter = dict() + + index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in index_to_info.items(): + i = chunk_index[input_axis] + other_index = chunk_index[:input_axis] + chunk_index[input_axis + 1 :] + size = tileable.nsplits[input_axis][i] + if i not in effected_i_to_slice: + # delete it, the input chunk could be ignored + del context.chunk_index_to_info[chunk_index] + else: + slc = effected_i_to_slice[i] + output_shape = calc_sliced_size(size, slc) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = iter(output_axis_index_range) + output_axis_index = next(other_index_to_iter[other_index]) + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + slc, + output_shape, + ) + + +class IntegralIndexHandler(IndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, Integral) + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = IndexInfo( + IndexType.integer, context.input_axis, context.output_axis, raw_index, self + ) + context.input_axis += 1 + context.append(info) + return info + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + if has_unknown_shape(context.tileable): # pragma: no cover + yield + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + + # e.g. 
slice_split(6, [2, 2, 7, 5]) + # return {2: 2} + effected_i_to_slice = slice_split( + index_info.raw_index, tileable.nsplits[index_info.input_axis] + ) + + index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in index_to_info.items(): + i = chunk_index[input_axis] + if i not in effected_i_to_slice: + # delete it, the input chunk could be ignored + del context.chunk_index_to_info[chunk_index] + else: + slc = effected_i_to_slice[i] + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=None, processed_index=slc, output_shape=None + ) + ) + + +class _BoolIndexHandler(IndexHandler): + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = IndexInfo( + IndexType.bool_index, + context.input_axis, + context.output_axis, + raw_index, + self, + ) + context.input_axis += raw_index.ndim + context.output_axis += 1 + context.append(info) + return info + + @classmethod + def _is_first_bool_index( + self, context: IndexHandlerContext, index_info: IndexInfo + ) -> bool: + bool_index_infos = [ + info + for info in context.parsed_infos + if info.index_type == IndexType.bool_index + ] + return bool_index_infos[0] is index_info + + +class NDArrayBoolIndexHandler(_BoolIndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, np.ndarray) and raw_index.dtype == np.bool_ + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + if has_unknown_shape(context.tileable): # pragma: no cover + yield + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + is_first_bool_index = self._is_first_bool_index(context, index_info) + + axes = list(range(input_axis, input_axis + index_info.raw_index.ndim)) + cum_sizes = [] + for axis in axes: + cum_sizes.append(np.cumsum((0,) + tileable.nsplits[axis])) + + other_index_to_iter = dict() + for chunk_index, chunk_index_info in context.chunk_index_to_info.items(): + slcs = [] + for j, axis in enumerate(axes): + axis_index = chunk_index[axis] + slcs.append( + slice(cum_sizes[j][axis_index], cum_sizes[j][axis_index + 1]) + ) + other_index = chunk_index[: axes[0]] + chunk_index[axes[-1] + 1 :] + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + index = index_info.raw_index[tuple(slcs)] + output_axis_index = next(other_index_to_iter[other_index]) + + # if more than 1 bool index, getitem will rewrite them into fancy + # but for now, setitem will keep them, thus we cannot record + # index or shape for this one + output_axis_index = None if not is_first_bool_index else output_axis_index + output_size = None if not is_first_bool_index else int(index.sum()) + + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + index, + output_size, + ) + + +class TensorBoolIndexHandler(_BoolIndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, TENSOR_TYPE) and raw_index.dtype == np.bool_ + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + # check both input tileable and index object itself + if has_unknown_shape(context.tileable): + yield + if has_unknown_shape(index_info.raw_index): # pragma: no cover + yield + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + index = index_info.raw_index + # rechunk index into the same chunk size 
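+        # (the boolean mask is retiled to the input tensor's nsplits over the
+        # axes it covers, so each input chunk below pairs with the mask chunk
+        # spanning exactly the same region)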
+ nsplits = tileable.nsplits[input_axis : input_axis + index.ndim] + index = yield from recursive_tile(index.rechunk(nsplits)) + is_first_bool_index = self._is_first_bool_index(context, index_info) + + other_index_to_iter = dict() + for chunk_index, chunk_index_info in context.chunk_index_to_info.items(): + effected_chunk_index = chunk_index[input_axis : input_axis + index.ndim] + other_index = ( + chunk_index[:input_axis] + chunk_index[input_axis + index.ndim :] + ) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + + # if more than 1 bool index, getitem will rewrite them into fancy + # but for now, setitem will keep them, thus we cannot record + # index or shape for this one + output_axis_index = None if not is_first_bool_index else output_axis_index + output_size = None if not is_first_bool_index else np.nan + + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + index.cix[tuple(effected_chunk_index)], + output_size, + ) + + +class _FancyIndexHandler(IndexHandler): + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + prev_fancy_indexes = context.get_indexes(IndexType.fancy_index) + is_first_fancy_index = len(prev_fancy_indexes) == 0 + + if is_first_fancy_index: + output_axis = context.output_axis + else: + output_axis = prev_fancy_indexes[0].output_axis + info = FancyIndexInfo( + IndexType.fancy_index, context.input_axis, output_axis, raw_index, self + ) + + context.input_axis += 1 + if is_first_fancy_index: + context.output_axis += 1 + context.append(info) + return info + + @classmethod + def is_first(cls, index_info: IndexInfo, context: IndexHandlerContext) -> bool: + # check if is first fancy index after parsing + fancy_indexes = context.get_indexes(index_info.index_type) + i = fancy_indexes.index(index_info) + if i > 0: + # only process for the first fancy indexes + return False + else: + return True + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + fancy_indexe_infos = context.get_indexes(index_info.index_type) + # check all fancy indexes are all ndarrays + for fancy_index_info in fancy_indexe_infos: + if not self.accept(fancy_index_info.raw_index): # pragma: no cover + raise TypeError("Fancy indexes should be all ndarrays or tensors") + + +class NDArrayFancyIndexHandler(_FancyIndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, np.ndarray) and raw_index.dtype != np.bool_ + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + is_first = self.is_first(index_info, context) + if not is_first: + return + + # check if all ndarrays + super().preprocess(index_info, context) + if has_unknown_shape(context.tileable): # pragma: no cover + yield + + fancy_index_infos = context.get_indexes(index_info.index_type) + # unify shapes of all fancy indexes + shape = broadcast_shape(*(info.raw_index.shape for info in fancy_index_infos)) + for fancy_index_info in fancy_index_infos: + fancy_index_info.shape_unified_index = np.broadcast_to( + fancy_index_info.raw_index, shape + ) + + # concat all fancy index together + concat_fancy_index = np.stack( + [info.shape_unified_index.ravel() for info in fancy_index_infos] + ) + effected_nsplits = [ + context.tileable.nsplits[info.input_axis] for info in fancy_index_infos + ] + # split concatenated fancy index into chunks according to input tileable + split_info = 
split_indexes_into_chunks(effected_nsplits, concat_fancy_index) + fancy_index_infos[0].split_info = split_info + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + fancy_index_infos = context.get_indexes(index_info.index_type) + fancy_index_axes = [info.input_axis for info in fancy_index_infos] + split_info = fancy_index_infos[0].split_info + chunk_index_to_fancy_index_arrays = split_info[0] + i_fancy_index = fancy_index_infos.index(index_info) + + other_index_to_iter = dict() + chunk_index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in chunk_index_to_info.items(): + effected_chunk_index = tuple(chunk_index[ax] for ax in fancy_index_axes) + fancy_index_array = chunk_index_to_fancy_index_arrays[effected_chunk_index][ + i_fancy_index + ] + + if fancy_index_array.size == 0: + # not effected + del context.chunk_index_to_info[chunk_index] + continue + + if i_fancy_index == 0: + other_index = tuple( + ci for i, ci in enumerate(chunk_index) if i not in fancy_index_axes + ) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + output_axis_shape = fancy_index_array.shape[0] + else: + output_axis_index = None + output_axis_shape = None + + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=fancy_index_array, + output_shape=output_axis_shape, + ) + ) + + @classmethod + def need_postprocess(cls, context: IndexHandlerContext) -> bool: + fancy_indexes = context.get_indexes(IndexType.fancy_index) + + if ( + fancy_indexes[0].split_info[2] + and fancy_indexes[0].shape_unified_index.ndim == 1 + ): + # if fancy indexes are asc sorted, + # and they are 1-d, no further computation required + return False + + return True + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + fancy_indexes = context.get_indexes(index_info.index_type) + + if not self.need_postprocess(context): + return + + is_first = self.is_first(index_info, context) + if not is_first: + # only need to postprocess fancy indexes once + return + + # current chunks and nsplits + chunks, nsplits = context.out_chunks, context.out_nsplits + + index_to_chunks = {c.index: c for c in chunks} + fancy_index_shape = fancy_indexes[0].shape_unified_index.shape + reorder_index = calc_pos(fancy_index_shape, fancy_indexes[0].split_info[1]) + + to_concat_axis = index_info.output_axis + new_out_chunks = [] + for chunk_index in itertools.product( + *(range(len(ns)) for ax, ns in enumerate(nsplits) if ax != to_concat_axis) + ): + # concat chunks on output axis of first fancy index + to_concat_chunks = [] + for i in range(len(nsplits[to_concat_axis])): + to_concat_index = list(chunk_index) + to_concat_index.insert(to_concat_axis, i) + to_concat_chunks.append(index_to_chunks[tuple(to_concat_index)]) + concat_chunk = context.concat_chunks(to_concat_chunks, to_concat_axis) + + reorder_chunk_op = context.op.copy().reset_key() + reorder_chunk_op._indexes = [slice(None)] * to_concat_axis + [reorder_index] + reorder_shape = ( + concat_chunk.shape[:to_concat_axis] + + fancy_index_shape + + concat_chunk.shape[to_concat_axis + 1 :] + ) + chunk_reorder_index = ( + concat_chunk.index[:to_concat_axis] + + (0,) * len(fancy_index_shape) + + concat_chunk.index[to_concat_axis + 1 :] + ) + reorder_chunk = reorder_chunk_op.new_chunk( + [concat_chunk], + shape=reorder_shape, + index=chunk_reorder_index, + 
order=TensorOrder.C_ORDER, + ) + new_out_chunks.append(reorder_chunk) + + new_nsplits = ( + nsplits[:to_concat_axis] + + tuple((s,) for s in fancy_index_shape) + + nsplits[to_concat_axis + 1 :] + ) + context.out_chunks = new_out_chunks + context.out_nsplits = new_nsplits + + +class TensorFancyIndexHandler(_FancyIndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, TENSOR_TYPE) and raw_index.dtype != np.bool_ + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + from ..base import broadcast_to + from ..merge import stack + + is_first = self.is_first(index_info, context) + if not is_first: + return + + fancy_index_infos = context.get_indexes(index_info.index_type) + + # check if all tensors + super().preprocess(index_info, context) + to_check = [context.tileable] + list( + info.raw_index for info in fancy_index_infos + ) + if has_unknown_shape(*to_check): + yield + + # unify shapes of all fancy indexes + shape = broadcast_shape(*(info.raw_index.shape for info in fancy_index_infos)) + fancy_indexes = [] + for fancy_index_info in fancy_index_infos: + fancy_index = yield from recursive_tile( + broadcast_to(fancy_index_info.raw_index, shape) + ) + fancy_indexes.append(fancy_index) + shape_unified_fancy_indexes = yield from unify_chunks(*fancy_indexes) + for fancy_index_info, shape_unified_fancy_index in zip( + fancy_index_infos, shape_unified_fancy_indexes + ): + fancy_index_info.shape_unified_index = shape_unified_fancy_index + + fancy_index_axes = tuple(info.input_axis for info in fancy_index_infos) + + # stack fancy indexes into one + concat_fancy_index = yield from recursive_tile( + stack( + [ + fancy_index_info.shape_unified_index + for fancy_index_info in fancy_index_infos + ] + ) + ) + concat_fancy_index = yield from recursive_tile( + concat_fancy_index.rechunk({0: len(fancy_index_infos)}) + ) + + self._shuffle_fancy_indexes( + concat_fancy_index, context, index_info, fancy_index_axes + ) + + @classmethod + def _shuffle_fancy_indexes( + cls, + concat_fancy_index: Tileable, + context: IndexHandlerContext, + index_info: IndexInfo, + axes: Tuple, + ): + from .getitem import FancyIndexingDistribute + + tileable = context.tileable + + # generate shuffle map, for concatenated fancy index, + # calculated a counterpart index chunk for each chunk of input tensor + map_chunks = [] + for chunk in concat_fancy_index.chunks: + map_op = FancyIndexingDistribute( + stage=OperandStage.map, + dest_nsplits=tileable.nsplits, + axes=axes, + dtype=chunk.dtype, + ) + map_chunk = map_op.new_chunk( + [chunk], shape=(np.nan,), index=chunk.index, order=TensorOrder.C_ORDER + ) + map_chunks.append(map_chunk) + # shuffle proxy + proxy_chunk = TensorShuffleProxy(dtype=concat_fancy_index.dtype).new_chunk( + map_chunks, shape=(), order=TensorOrder.C_ORDER + ) + chunk_index_to_fancy_index_chunks = OrderedDict() + chunk_index_to_raw_positions = OrderedDict() + out_indices = list( + itertools.product(*(range(tileable.chunk_shape[ax]) for ax in axes)) + ) + for chunk_index in out_indices: + reduce_op = FancyIndexingDistribute( + stage=OperandStage.reduce, + axes=axes, + dtype=proxy_chunk.dtype, + n_reducers=len(out_indices), + ) + # chunks of fancy indexes on each axis + kws = [ + { + "axis": ax, + "shape": (np.nan,), + "index": chunk_index, + "order": context.op.outputs[0].order, + } + for ax in axes + ] + kws.append({"pos": True, "shape": (np.nan,), "index": chunk_index}) + reduce_chunks = reduce_op.new_chunks([proxy_chunk], kws=kws) + 
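# A minimal, self-contained NumPy sketch (separate from the operand graph being
# built here) of the bucketing idea behind this shuffle: each element of the
# stacked fancy index is routed to the input chunk it falls into, rebased to
# chunk-local coordinates, and tagged with its original position so the
# concatenated result can be reordered afterwards. This is the same bucketing
# that the eager ndarray path performs via split_indexes_into_chunks; the
# helper name and return layout below are illustrative only, not the _mars API.
import numpy as np

def bucket_fancy_index(nsplits, stacked_index):
    # nsplits: per-axis chunk sizes, e.g. ((3, 3), (2, 2)) for a (6, 4) tensor
    # stacked_index: shape (n_axes, n) stacked fancy indexes, one row per axis
    offsets = [np.cumsum((0,) + tuple(ns)) for ns in nsplits]
    # which chunk each element falls into, per axis
    chunk_ids = np.stack(
        [np.searchsorted(off, idx, side="right") - 1
         for off, idx in zip(offsets, stacked_index)]
    )
    buckets = {}
    for pos in range(stacked_index.shape[1]):
        key = tuple(int(i) for i in chunk_ids[:, pos])
        rebased = tuple(
            int(stacked_index[ax, pos] - offsets[ax][key[ax]])
            for ax in range(stacked_index.shape[0])
        )
        # chunk index -> [(chunk-local index, original position), ...]
        buckets.setdefault(key, []).append((rebased, pos))
    return buckets

# bucket_fancy_index(((3, 3), (2, 2)), np.array([[5, 0, 4], [1, 3, 2]]))
# routes element 0 to chunk (1, 0), element 1 to chunk (0, 1), element 2 to (1, 1).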
chunk_index_to_fancy_index_chunks[chunk_index] = reduce_chunks[:-1] + chunk_index_to_raw_positions[chunk_index] = reduce_chunks[-1] + + # split info + # - chunk_index_to_fancy_index_chunks + # - chunk_index_to_raw_positions + # - is_fancy_index_asc_sorted, False for tensor fancy indexes + index_info.split_info = ( + chunk_index_to_fancy_index_chunks, + chunk_index_to_raw_positions, + False, + ) + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + fancy_index_infos = context.get_indexes(index_info.index_type) + fancy_index_axes = [info.input_axis for info in fancy_index_infos] + split_info = fancy_index_infos[0].split_info + chunk_index_to_fancy_index_chunks = split_info[0] + i_fancy_index = fancy_index_infos.index(index_info) + + other_index_to_iter = dict() + for chunk_index, chunk_index_info in context.chunk_index_to_info.items(): + effected_chunk_index = tuple(chunk_index[ax] for ax in fancy_index_axes) + fancy_index_chunk = chunk_index_to_fancy_index_chunks[effected_chunk_index][ + i_fancy_index + ] + + if i_fancy_index == 0: + other_index = tuple( + ci for i, ci in enumerate(chunk_index) if i not in fancy_index_axes + ) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + output_axis_shape = fancy_index_chunk.shape[0] + else: + output_axis_index = output_axis_shape = None + + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=fancy_index_chunk, + output_shape=output_axis_shape, + ) + ) + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + from .getitem import FancyIndexingConcat + + fancy_index_infos = context.get_indexes(index_info.index_type) + + is_first = self.is_first(index_info, context) + if not is_first: + # only need to postprocess fancy indexes once + return + + # current chunks and nsplits + chunks, nsplits = context.out_chunks, context.out_nsplits + chunk_shape = tuple(len(ns) for ns in nsplits) + to_concat_axis = index_info.output_axis + tileable = context.tileable + fancy_index_effected_input_chunk_shapes = tuple( + tileable.chunk_shape[info.input_axis] for info in fancy_index_infos + ) + fancy_indexes = [info.shape_unified_index for info in fancy_index_infos] + + concat_index_to_chunks = dict() + for chunk in chunks: + effected_chunk_index = np.unravel_index( + chunk.index[to_concat_axis], fancy_index_effected_input_chunk_shapes + ) + raw_position_chunk = fancy_index_infos[0].split_info[1][ + effected_chunk_index + ] + concat_map_op = FancyIndexingConcat( + stage=OperandStage.map, + fancy_index_axis=to_concat_axis, + sparse=chunk.issparse(), + dtype=chunk.dtype, + ) + map_chunk_shape = ( + chunk.shape[:to_concat_axis] + + (np.nan,) + + chunk.shape[to_concat_axis + 1 :] + ) + concat_map_chunk = concat_map_op.new_chunk( + [chunk, raw_position_chunk], + index=chunk.index, + shape=map_chunk_shape, + order=TensorOrder.C_ORDER, + ) + concat_index_to_chunks[concat_map_chunk.index] = concat_map_chunk + + other_index_chunk_shape = ( + chunk_shape[:to_concat_axis] + chunk_shape[to_concat_axis + 1 :] + ) + out_chunks = [] + for chunk_index in itertools.product( + *(range(s) for s in other_index_chunk_shape) + ): + to_shuffle_chunks = [] + other_shape = None + for i in range(chunk_shape[to_concat_axis]): + to_concat_chunk_index = ( + chunk_index[:to_concat_axis] + (i,) + chunk_index[to_concat_axis:] + ) + to_concat_chunk = 
concat_index_to_chunks[to_concat_chunk_index] + to_shuffle_chunks.append(to_concat_chunk) + if other_shape is None: + other_shape = tuple( + s + for ax, s in enumerate(to_concat_chunk.shape) + if ax != to_concat_axis + ) + + proxy_chunk = TensorShuffleProxy( + dtype=to_shuffle_chunks[0].dtype + ).new_chunk(to_shuffle_chunks, shape=(), order=TensorOrder.C_ORDER) + + it = itertools.count() + out_indices = list( + itertools.product(*(range(s) for s in fancy_indexes[0].chunk_shape)) + ) + for ordinal, reduce_index in enumerate(out_indices): + fancy_index_chunk = fancy_indexes[0].cix[reduce_index] + concat_reduce_op = FancyIndexingConcat( + stage=OperandStage.reduce, + fancy_index_axis=to_concat_axis, + fancy_index_shape=fancy_index_chunk.shape, + dtype=proxy_chunk.dtype, + sparse=to_shuffle_chunks[0].issparse(), + reducer_index=(next(it),), + n_reducers=len(out_indices), + ) + reduce_chunk_shape = ( + other_shape[:to_concat_axis] + + fancy_index_chunk.shape + + other_shape[to_concat_axis:] + ) + reduce_chunk_index = ( + chunk_index[:to_concat_axis] + + fancy_index_chunk.index + + chunk_index[to_concat_axis:] + ) + concat_reduce_chunk = concat_reduce_op.new_chunk( + [proxy_chunk], + shape=reduce_chunk_shape, + index=reduce_chunk_index, + order=TensorOrder.C_ORDER, + ) + out_chunks.append(concat_reduce_chunk) + + context.out_chunks = out_chunks + context.out_nsplits = ( + nsplits[:to_concat_axis] + + fancy_indexes[0].nsplits + + nsplits[to_concat_axis + 1 :] + ) + + +class IndexesHandler(ABC): + def __init__(self): + self.available_index_handlers = [] + + def register(self, *handlers): + self.available_index_handlers.extend(h.get_instance() for h in handlers) + + @abstractmethod + def create_context(self, op): + pass + + def handle(self, op, return_context: bool = False): + indexes = op.indexes + # create context + context = self.create_context(op) + + # parse index infos + index_infos = [] + for index in indexes: + parsed = False + for index_handler in self.available_index_handlers: + if index_handler.accept(index): + parsed = True + index_infos.append(index_handler.parse(index, context)) + break + if not parsed: + raise TypeError(f"unable to parse index {index}") + + yield from self._preprocess(context, index_infos) + yield from self._process(context, index_infos) + self._postprocess(context, index_infos) + + if return_context: + return context + else: + return context.create_tileable() + + @classmethod + def _preprocess(cls, context: IndexHandlerContext, index_infos: List[IndexInfo]): + # preprocess + for index_info in index_infos: + preprocess = index_info.handler.preprocess(index_info, context) + if inspect.isgenerator(preprocess): + yield from preprocess + + @classmethod + def _process(cls, context, index_infos): + # process + for index_info in index_infos: + process = index_info.handler.process(index_info, context) + if inspect.isgenerator(process): + yield from process + + context.processed_chunks = context.out_chunks = out_chunks = [] + for chunk_index, chunk_index_info in context.chunk_index_to_info.items(): + out_chunks.append(context.create_chunk(chunk_index, chunk_index_info)) + index_to_shape = OrderedDict( + sorted([(c.index, c.shape) for c in out_chunks], key=itemgetter(0)) + ) + context.out_nsplits = calc_nsplits(index_to_shape) + + @classmethod + def _postprocess(cls, context, index_infos): + # post process + for index_info in index_infos: + index_info.handler.postprocess(index_info, context) + + +class NDArrayIndexesHandler(IndexesHandler): + # indexes handler only for slice, 
integer, + # boolean ndarray, integer ndarray and None + def __init__(self): + super().__init__() + self.register( + NewaxisIndexHandler, + SliceIndexHandler, + IntegralIndexHandler, + NDArrayBoolIndexHandler, + NDArrayFancyIndexHandler, + ) + + def create_context(self, op): + return TensorIndexHandlerContext(op) + + +class TensorIndexesHandler(IndexesHandler): + def __init__(self): + super().__init__() + self.register( + NewaxisIndexHandler, + SliceIndexHandler, + IntegralIndexHandler, + NDArrayBoolIndexHandler, + TensorBoolIndexHandler, + NDArrayFancyIndexHandler, + TensorFancyIndexHandler, + ) + + def create_context(self, op): + return TensorIndexHandlerContext(op) diff --git a/python/xorbits/_mars/tensor/indexing/nonzero.py b/python/xorbits/_mars/tensor/indexing/nonzero.py new file mode 100644 index 000000000..58d9ba953 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/nonzero.py @@ -0,0 +1,139 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ExecutableTuple, recursive_tile +from ...serialization.serializables import KeyField +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from .unravel_index import unravel_index + + +class TensorNonzero(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.NONZERO + + _input = KeyField("input") + + @property + def output_limit(self): + return float("inf") + + def __call__(self, a): + kws = [ + {"shape": (np.nan,), "order": TensorOrder.C_ORDER, "_idx_": i} + for i in range(a.ndim) + ] + return ExecutableTuple(self.new_tensors([a], kws=kws, output_limit=len(kws))) + + @classmethod + def tile(cls, op): + from ..datasource import arange + + in_tensor = astensor(op.input) + + flattened = in_tensor.astype(bool).flatten() + flattened = yield from recursive_tile(flattened) + indices = arange(flattened.size, dtype=np.intp, chunk_size=flattened.nsplits) + indices = indices[flattened] + dim_indices = unravel_index(indices, in_tensor.shape) + dim_indices = yield from recursive_tile(dim_indices) + + kws = [ + {"nsplits": ind.nsplits, "chunks": ind.chunks, "shape": o.shape} + for ind, o in zip(dim_indices, op.outputs) + ] + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=kws, output_limit=len(kws)) + + +def nonzero(a): + """ + Return the indices of the elements that are non-zero. + + Returns a tuple of tensors, one for each dimension of `a`, + containing the indices of the non-zero elements in that + dimension. The values in `a` are always tested and returned. + The corresponding non-zero + values can be obtained with:: + + a[nonzero(a)] + + To group the indices by element, rather than dimension, use:: + + transpose(nonzero(a)) + + The result of this is always a 2-D array, with a row for + each non-zero element. + + Parameters + ---------- + a : array_like + Input tensor. 
+ + Returns + ------- + tuple_of_arrays : tuple + Indices of elements that are non-zero. + + See Also + -------- + flatnonzero : + Return indices that are non-zero in the flattened version of the input + tensor. + Tensor.nonzero : + Equivalent tensor method. + count_nonzero : + Counts the number of non-zero elements in the input tensor. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([[1,0,0], [0,2,0], [1,1,0]]) + >>> x.execute() + array([[1, 0, 0], + [0, 2, 0], + [1, 1, 0]]) + >>> mt.nonzero(x).execute() + (array([0, 1, 2, 2]), array([0, 1, 0, 1])) + + >>> x[mt.nonzero(x)].execute() # TODO(jisheng): accomplish this after fancy indexing is supported + + >>> mt.transpose(mt.nonzero(x)).execute() # TODO(jisheng): accomplish this later + + A common use for ``nonzero`` is to find the indices of an array, where + a condition is True. Given an array `a`, the condition `a` > 3 is a + boolean array and since False is interpreted as 0, np.nonzero(a > 3) + yields the indices of the `a` where the condition is true. + + >>> a = mt.array([[1,2,3],[4,5,6],[7,8,9]]) + >>> (a > 3).execute() + array([[False, False, False], + [ True, True, True], + [ True, True, True]]) + >>> mt.nonzero(a > 3).execute() + (array([1, 1, 1, 2, 2, 2]), array([0, 1, 2, 0, 1, 2])) + + The ``nonzero`` method of the boolean array can also be called. + + >>> (a > 3).nonzero().execute() + (array([1, 1, 1, 2, 2, 2]), array([0, 1, 2, 0, 1, 2])) + + """ + a = astensor(a) + op = TensorNonzero(dtype=np.dtype(np.intp)) + return op(a) diff --git a/python/xorbits/_mars/tensor/indexing/setitem.py b/python/xorbits/_mars/tensor/indexing/setitem.py new file mode 100644 index 000000000..74a89171d --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/setitem.py @@ -0,0 +1,372 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import itertools +import operator +from numbers import Integral +from typing import Union + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...core.context import Context +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, BoolField, KeyField, TupleField +from ...tensor import tensor as astensor +from ...utils import has_unknown_shape +from ..base import broadcast_to +from ..core import TENSOR_TYPE, TensorOrder +from ..operands import TensorMapReduceOperand, TensorOperandMixin, TensorShuffleProxy +from ..utils import broadcast_shape, filter_inputs +from .core import process_index + + +class TensorIndexSetValue(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.INDEXSETVALUE + + input = KeyField("input") + indexes = TupleField("indexes") + value = AnyField("value") + is_fancy_index = BoolField("is_fancy_index") + input_nsplits = TupleField("input_nsplits") + chunk_offsets = TupleField("chunk_offsets") + shuffle_axes = TupleField("shuffle_axes") + + def __init__( + self, + indexes=None, + value=None, + is_fancy_index=None, + input_nsplits=None, + chunk_offsets=None, + shuffle_axes=None, + **kw, + ): + super().__init__( + indexes=indexes, + value=value, + is_fancy_index=is_fancy_index, + input_nsplits=input_nsplits, + chunk_offsets=chunk_offsets, + shuffle_axes=shuffle_axes, + **kw, + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.stage == OperandStage.reduce: + self.input = self._inputs[0] + return + elif self.stage == OperandStage.map: + inputs_iter = iter(self._inputs) + else: + self.input = self._inputs[0] + inputs_iter = iter(self._inputs[1:]) + new_indexes = [ + next(inputs_iter) if isinstance(index, ENTITY_TYPE) else index + for index in self.indexes + ] + self.indexes = tuple(new_indexes) + if isinstance(self.value, ENTITY_TYPE): + self.value = next(inputs_iter) + + def __call__(self, a, index, value): + inputs = filter_inputs([a] + list(index) + [value]) + self.indexes = tuple(index) + self.value = value + return self.new_tensor(inputs, a.shape, order=a.order) + + def on_output_modify(self, new_output): + return new_output + + def on_input_modify(self, new_input): + new_op = self.copy().reset_key() + new_inputs = [new_input] + self.inputs[1:] + return new_op.new_tensor(new_inputs, shape=self.outputs[0].shape) + + @classmethod + def _tile_fancy_index(cls, op: "TensorIndexSetValue"): + from ..utils import unify_chunks + + tensor = op.outputs[0] + inp = op.inputs[0] + value = op.value + indexes = op.indexes + + if has_unknown_shape(inp): + yield + + fancy_indexes = [index for index in indexes if isinstance(index, ENTITY_TYPE)] + shape = broadcast_shape(*[ind.shape for ind in fancy_indexes]) + fancy_indexes = [broadcast_to(ind, shape) for ind in fancy_indexes] + if isinstance(value, ENTITY_TYPE): + value = broadcast_to(value, shape) + value, *fancy_indexes = yield from unify_chunks(value, *fancy_indexes) + value = value.chunks + else: + fancy_indexes = yield from unify_chunks(*fancy_indexes) + value = [value] * len(fancy_indexes[0].chunks) + input_nsplits = inp.nsplits + shuffle_axes = tuple( + axis for axis, ind in enumerate(indexes) if isinstance(ind, ENTITY_TYPE) + ) + + map_chunks = [] + for value_chunk, *index_chunks in zip( + value, *[index.chunks for index in fancy_indexes] + ): + map_op = TensorIndexSetValue( + stage=OperandStage.map, + input_nsplits=input_nsplits, + value=value_chunk, + indexes=tuple(index_chunks), + shuffle_axes=shuffle_axes, + dtype=tensor.dtype, + ) + inputs = filter_inputs([value_chunk] + list(index_chunks)) + map_chunk = 
map_op.new_chunk( + inputs, + shape=(np.nan,), + index=index_chunks[0].index, + order=TensorOrder.C_ORDER, + ) + map_chunks.append(map_chunk) + + proxy_chunk = TensorShuffleProxy(dtype=tensor.dtype).new_chunk( + map_chunks, shape=(), order=TensorOrder.C_ORDER + ) + + reducer_chunks = [] + offsets_on_axis = [np.cumsum([0] + list(split)) for split in input_nsplits] + for input_chunk in inp.chunks: + chunk_offsets = tuple( + offsets_on_axis[axis][input_chunk.index[axis]] + for axis in range(len(inp.shape)) + ) + reducer_op = TensorIndexSetValue( + stage=OperandStage.reduce, + n_reducers=len(inp.chunks), + dtype=input_chunk.dtype, + shuffle_axes=shuffle_axes, + chunk_offsets=chunk_offsets, + ) + reducer_chunk = reducer_op.new_chunk( + [input_chunk, proxy_chunk], + index=input_chunk.index, + shape=input_chunk.shape, + order=input_chunk.order, + ) + reducer_chunks.append(reducer_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=reducer_chunks, + nsplits=op.input.nsplits, + ) + + @classmethod + def _tile(cls, op: "TensorIndexSetValue"): + from ..base import broadcast_to + from .getitem import _getitem_nocheck + + tensor = op.outputs[0] + value = op.value + indexed = yield from recursive_tile( + _getitem_nocheck(op.input, op.indexes, convert_bool_to_fancy=False) + ) + is_value_tensor = isinstance(value, TENSOR_TYPE) + + if is_value_tensor and value.ndim > 0: + if has_unknown_shape(indexed, value): + exec_chunks = indexed.chunks + op.input.chunks + for c in indexed.chunks: + exec_chunks.extend(c.inputs) + yield exec_chunks + [indexed] + + nsplits = indexed.nsplits + value = yield from recursive_tile( + broadcast_to(value, indexed.shape) + .astype(op.input.dtype, copy=False) + .rechunk(nsplits) + ) + + chunk_mapping = {c.op.input.index: c for c in indexed.chunks} + out_chunks = [] + for chunk in indexed.op.input.chunks: + index_chunk = chunk_mapping.get(chunk.index) + if index_chunk is None: + out_chunks.append(chunk) + continue + + if is_value_tensor: + if value.ndim > 0: + value_chunk = value.cix[index_chunk.index] + else: + value_chunk = value.chunks[0] + else: + # non tensor + value_chunk = value + chunk_op = TensorIndexSetValue( + dtype=op.dtype, + sparse=op.sparse, + indexes=tuple(index_chunk.op.indexes), + value=value_chunk, + ) + chunk_inputs = filter_inputs( + [chunk] + index_chunk.op.indexes + [value_chunk] + ) + out_chunk = chunk_op.new_chunk( + chunk_inputs, shape=chunk.shape, index=chunk.index, order=tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=op.input.nsplits, + ) + + @classmethod + def tile(cls, op: "TensorIndexSetValue"): + if op.is_fancy_index: + return (yield from cls._tile_fancy_index(op)) + else: + return (yield from cls._tile(op)) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "TensorIndexSetValue"): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + return cls._execute_reduce(ctx, op) + else: + return cls._execute(ctx, op) + + @classmethod + def _execute(cls, ctx, op): + indexes = [ + ctx[index.key] if hasattr(index, "key") else index for index in op.indexes + ] + input_ = ctx[op.inputs[0].key].copy() + value = ctx[op.value.key] if hasattr(op.value, "key") else op.value + if hasattr(input_, "flags") and not input_.flags.writeable: + input_.setflags(write=True) + input_[tuple(indexes)] = value + 
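# A minimal, self-contained NumPy sketch (separate from the operand code around
# it) of how TensorIndexSetValue lands values into chunks. The plain path right
# here simply assigns into a writable copy of the chunk; the fancy-index path
# tiled in _tile_fancy_index above keeps, for each destination chunk, only the
# index/value pairs whose coordinates fall inside that chunk's range on every
# shuffled axis, then rebases them by the chunk offset before assigning (the
# map and reduce stages that follow). Names below are illustrative only.
import numpy as np

data = np.zeros(6, dtype=int)
nsplits = (3, 3)                          # two chunks along axis 0
offsets = np.cumsum((0,) + nsplits)       # array([0, 3, 6])
index = np.array([1, 4, 5])               # global fancy index
value = np.array([10, 40, 50])

chunks = [data[offsets[i]:offsets[i + 1]].copy() for i in range(len(nsplits))]
for i, chunk in enumerate(chunks):
    start, end = offsets[i], offsets[i + 1]
    mask = (index >= start) & (index < end)   # "map": filter to this chunk
    chunk[index[mask] - start] = value[mask]  # "reduce": rebase and assign

# np.concatenate(chunks) -> array([ 0, 10,  0,  0, 40, 50])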
ctx[op.outputs[0].key] = input_ + + @classmethod + def _execute_map(cls, ctx, op): + nsplits = op.input_nsplits + shuffle_axes = op.shuffle_axes + all_inputs = [ctx[inp.key] for inp in op.inputs] + if hasattr(op.value, "key"): + value = ctx[op.value.key] + indexes = all_inputs[1:] + else: + value = op.value + indexes = all_inputs + + offsets_on_axis = [np.cumsum([0] + list(split)) for split in nsplits] + for reducer_index in itertools.product( + *(map(range, [len(s) for s in nsplits])) + ): + chunk_filters = [] + indexes_iter = iter(indexes) + for axis, _ in enumerate(reducer_index): + start = offsets_on_axis[axis][reducer_index[axis]] + end = offsets_on_axis[axis][reducer_index[axis] + 1] + if axis in shuffle_axes: + index_on_axis = next(indexes_iter) + filtered = (index_on_axis >= start) & (index_on_axis < end) + chunk_filters.append(filtered) + combined_filter = functools.reduce(operator.and_, chunk_filters) + if hasattr(op.value, "key"): + ctx[op.outputs[0].key, reducer_index] = tuple( + inp[combined_filter] for inp in all_inputs + ) + else: + ctx[op.outputs[0].key, reducer_index] = tuple( + [value] + [inp[combined_filter] for inp in all_inputs] + ) + + @classmethod + def _execute_reduce(cls, ctx, op): + input_data = ctx[op.inputs[0].key].copy() + for index_value in op.iter_mapper_data(ctx, input_id=1): + value = index_value[0] + indexes_with_offset = index_value[1:] + indexes = [] + index_iter = iter(indexes_with_offset) + for axis in range(input_data.ndim): + if axis in op.shuffle_axes: + indexes.append(next(index_iter) - op.chunk_offsets[axis]) + input_data[tuple(indexes)] = value + + ctx[op.outputs[0].key] = input_data + + +def _check_support(indexes): + if all( + ( + isinstance(ix, (TENSOR_TYPE, np.ndarray)) + and ix.dtype != np.bool_ + or isinstance(ix, slice) + and ix == slice(None) + ) + for ix in indexes + ): + if any(isinstance(ix, (TENSOR_TYPE, np.ndarray)) for ix in indexes): + return True + for index in indexes: + if isinstance(index, (slice, Integral)): + pass + elif isinstance(index, (np.ndarray, TENSOR_TYPE)) and index.dtype == np.bool_: + pass + else: # pragma: no cover + raise NotImplementedError( + "Only slice, int, or bool indexing " + f"supported by now, got {type(index)}" + ) + return False + + +def _setitem(a, item, value): + index = process_index(a.ndim, item, convert_bool_to_fancy=False) + if not (np.isscalar(value) or (isinstance(value, tuple) and a.dtype.fields)): + # do not convert for tuple when dtype is record type. + value = astensor(value) + + is_fancy_index = _check_support(index) + if is_fancy_index: + index = [astensor(ind) if isinstance(ind, np.ndarray) else ind for ind in index] + + # __setitem__ on a view should be still a view, see GH #732. + op = TensorIndexSetValue( + dtype=a.dtype, + sparse=a.issparse(), + is_fancy_index=is_fancy_index, + indexes=tuple(index), + value=value, + create_view=a.op.create_view, + ) + ret = op(a, index, value) + a.data = ret.data diff --git a/python/xorbits/_mars/tensor/indexing/slice.py b/python/xorbits/_mars/tensor/indexing/slice.py new file mode 100644 index 000000000..cd8ba473c --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/slice.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ... import opcodes as OperandDef +from ...serialization.serializables import KeyField, ListField +from ..array_utils import get_array_module +from ..core import TensorOrder +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorSlice(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.SLICE + + _input = KeyField("input") + _slices = ListField("slices") + + def __init__(self, slices=None, **kw): + super().__init__(_slices=slices, **kw) + + @property + def slices(self): + return self._slices + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def _get_order(self, kw, i): + order = kw.pop("order", None) + if order is None: + inp = self.input + if inp is None or inp.order == TensorOrder.C_ORDER: + return TensorOrder.C_ORDER + + for shape, slc in zip(inp.shape, self._slices): + if slc is None: + continue + s = slc.indices(shape) + if s[0] == 0 and s[1] == shape and s[2] == 1: + continue + else: + return TensorOrder.C_ORDER + + return inp.order + + return order[i] if isinstance(order, (list, tuple)) else order + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.inputs[0].key] + if op.input.ndim == 0 and not hasattr(inp, "shape"): + # scalar, but organize it into an array + inp = get_array_module(inp).array(inp) + x = inp[tuple(op.slices)] + out = op.outputs[0] + ctx[out.key] = x.astype(x.dtype, order=out.order.value, copy=False) diff --git a/python/xorbits/_mars/tensor/indexing/take.py b/python/xorbits/_mars/tensor/indexing/take.py new file mode 100644 index 000000000..f2b2a5ba7 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/take.py @@ -0,0 +1,128 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from ..utils import check_out_param, validate_axis + + +def take(a, indices, axis=None, out=None): + """ + Take elements from a tensor along an axis. + + When axis is not None, this function does the same thing as "fancy" + indexing (indexing arrays using tensors); however, it can be easier to use + if you need elements along a given axis. A call such as + ``mt.take(arr, indices, axis=3)`` is equivalent to + ``arr[:,:,:,indices,...]``. 
+ + Explained without fancy indexing, this is equivalent to the following use + of `ndindex`, which sets each of ``ii``, ``jj``, and ``kk`` to a tuple of + indices:: + + Ni, Nk = a.shape[:axis], a.shape[axis+1:] + Nj = indices.shape + for ii in ndindex(Ni): + for jj in ndindex(Nj): + for kk in ndindex(Nk): + out[ii + jj + kk] = a[ii + (indices[jj],) + kk] + + Parameters + ---------- + a : array_like (Ni..., M, Nk...) + The source tensor. + indices : array_like (Nj...) + The indices of the values to extract. + + Also allow scalars for indices. + axis : int, optional + The axis over which to select values. By default, the flattened + input tensor is used. + out : Tensor, optional (Ni..., Nj..., Nk...) + If provided, the result will be placed in this tensor. It should + be of the appropriate shape and dtype. + mode : {'raise', 'wrap', 'clip'}, optional + Specifies how out-of-bounds indices will behave. + + * 'raise' -- raise an error (default) + * 'wrap' -- wrap around + * 'clip' -- clip to the range + + 'clip' mode means that all indices that are too large are replaced + by the index that addresses the last element along that axis. Note + that this disables indexing with negative numbers. + + Returns + ------- + out : Tensor (Ni..., Nj..., Nk...) + The returned tensor has the same type as `a`. + + See Also + -------- + compress : Take elements using a boolean mask + Tensor.take : equivalent method + + Notes + ----- + + By eliminating the inner loop in the description above, and using `s_` to + build simple slice objects, `take` can be expressed in terms of applying + fancy indexing to each 1-d slice:: + + Ni, Nk = a.shape[:axis], a.shape[axis+1:] + for ii in ndindex(Ni): + for kk in ndindex(Nj): + out[ii + s_[...,] + kk] = a[ii + s_[:,] + kk][indices] + + For this reason, it is equivalent to (but faster than) the following use + of `apply_along_axis`:: + + out = mt.apply_along_axis(lambda a_1d: a_1d[indices], axis, a) + + Examples + -------- + >>> import mars.tensor as mt + >>> a = [4, 3, 5, 7, 6, 8] + >>> indices = [0, 1, 4] + >>> mt.take(a, indices).execute() + array([4, 3, 6]) + + In this example if `a` is a tensor, "fancy" indexing can be used. + + >>> a = mt.array(a) + >>> a[indices].execute() + array([4, 3, 6]) + + If `indices` is not one dimensional, the output also has these dimensions. + + >>> mt.take(a, [[0, 1], [2, 3]]).execute() + array([[4, 3], + [5, 7]]) + """ + a = astensor(a) + if axis is None: + t = a.ravel()[indices] + else: + axis = validate_axis(a.ndim, axis) + t = a[(slice(None),) * axis + (indices,)] + + if out is None: + return t + + if out.shape != t.shape: + raise ValueError( + f"output tensor has wrong shape, expect: {t.shape}, got: {out.shape}" + ) + check_out_param(out, t, "unsafe") + out.data = t.data + return out diff --git a/python/xorbits/_mars/tensor/indexing/tests/__init__.py b/python/xorbits/_mars/tensor/indexing/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/indexing/tests/test_indexing.py b/python/xorbits/_mars/tensor/indexing/tests/test_indexing.py new file mode 100644 index 000000000..28217b953 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/tests/test_indexing.py @@ -0,0 +1,414 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ....config import option_context +from ....core import tile +from ...base.broadcast_to import TensorBroadcastTo +from ...datasource import array, empty, ones, tensor +from ...datasource.ones import TensorOnes +from ...merge.concatenate import TensorConcatenate +from .. import choose, compress, fill_diagonal, nonzero, unravel_index +from ..setitem import TensorIndexSetValue + + +def test_bool_indexing(): + t = ones((100, 200, 300)) + indexed = t[t < 2] + assert len(indexed.shape) == 1 + assert np.isnan(indexed.shape[0]) + + t2 = ones((100, 200)) + indexed = t[t2 < 2] + assert len(indexed.shape) == 2 + assert np.isnan(indexed.shape[0]) + assert indexed.shape[1] == 300 + + t2 = ones((100, 200)) + indexed = t[t2 < 2] + 1 + assert len(indexed.shape) == 2 + assert np.isnan(indexed.shape[0]) + assert indexed.shape[1] == 300 + + t2 = ones((10, 20)) + rs = np.random.RandomState(0) + i1 = np.zeros(10, dtype=bool) + i1[rs.permutation(np.arange(10))[:5]] = True + i2 = np.zeros(20, dtype=bool) + i2[rs.permutation(np.arange(20))[:5]] = True + indexed = t2[i1, i2] + assert len(indexed.shape) == 1 + assert indexed.shape[0] == 5 + + t2 = tile(indexed) + assert t2.chunks[0].index == (0,) + + t3 = ones((101, 200)) + with pytest.raises(IndexError) as cm: + _ = t[t3 < 2] # noqa: F841 + e = cm.value.args[0] + assert "along dimension 0" in e + assert "dimension is 100 but corresponding boolean dimension is 101" in e + + t4 = ones((100, 201)) + with pytest.raises(IndexError) as cm: + _ = t[t4 < 2] # noqa: F841 + e = cm.value.args[0] + assert "along dimension 1" in e + assert "dimension is 200 but corresponding boolean dimension is 201" in e + + +def test_slice(): + t = ones((100, 200, 300)) + t2 = t[10:30, 199:, -30:303] + assert t2.shape == (20, 1, 30) + + t3 = t[10:90:4, 20:80:5] + s1 = len(list(range(100))[10:90:4]) + s2 = len(list(range(200))[20:80:5]) + assert t3.shape == (s1, s2, 300) + + +def test_fancy_indexing(): + t = ones((100, 200, 300)) + t2 = t[[0, 1], [2, 3]] + assert t2.shape == (2, 300) + + t3 = t[[[0, 1], [2, 3]], [4, 5]] + assert t3.shape == (2, 2, 300) + + with pytest.raises(IndexError) as cm: + _ = t[[1, 2], [3, 4, 5]] # noqa: F841 + e = cm.value.args[0] + assert ( + e == "shape mismatch: indexing arrays could not be broadcast " + "together with shapes (2,) (3,)" + ) + + with pytest.raises(IndexError): + t[[100]] + + t = ones((100, 200, 300), chunk_size=10) + + # fancy index on numpy ndarrays + + t4 = tile(t[:10, -10:, [13, 244, 151, 242, 34]]) + assert t4.shape == (10, 10, 5) + assert 
t4.chunk_shape == (1, 1, 1) + + t5 = tile(t[:10, -10:, [1, 10, 20, 33, 34, 200]]) + assert t5.shape == (10, 10, 6) + assert t5.chunk_shape == (1, 1, 5) + + t6 = tile(t[[20, 1, 33, 22, 11], :15, [255, 211, 2, 11, 121]]) + assert t6.shape == (5, 15) + # need a concat, because the fancy indexes are not ascending according to chunk index + assert t6.chunk_shape == (1, 2) + assert t6.chunks[0].ndim == 2 + assert t6.nsplits == ((5,), (10, 5)) + + t7 = tile(t[[5, 6, 33, 66], :15, [0, 9, 2, 11]]) + assert t7.shape == (4, 15) + # not need a concat + assert t7.chunk_shape == (3, 2) + assert t7.chunks[0].ndim == 2 + assert t7.nsplits == ((2, 1, 1), (10, 5)) + + t8 = tile(t[[[5, 33], [66, 6]], :15, [255, 11]]) + assert t8.shape == (2, 2, 15) + assert t8.chunk_shape == (1, 1, 2) + assert t8.chunks[0].ndim == 3 + assert t8.nsplits == ((2,), (2,), (10, 5)) + + # fancy index on tensors + + t9 = tile(t[:10, -10:, tensor([13, 244, 151, 242, 34], chunk_size=2)]) + assert t9.shape == (10, 10, 5) + assert t9.chunk_shape == (1, 1, 3) + + t10 = tile(t[:10, -10:, tensor([1, 10, 20, 33, 34, 200], chunk_size=4)]) + assert t10.shape == (10, 10, 6) + assert t10.chunk_shape == (1, 1, 2) + + t11 = tile( + t[ + tensor([20, 1, 33, 22, 11], chunk_size=2), + :15, + tensor([255, 211, 2, 11, 121], chunk_size=3), + ] + ) + assert t11.shape == (5, 15) + # need a concat, because the fancy indexes are not ascending according to chunk index + assert t11.chunk_shape == (4, 2) + assert t11.chunks[0].ndim == 2 + assert t11.nsplits == ((2, 1, 1, 1), (10, 5)) + + t12 = tile(t[tensor([5, 6, 33, 66], chunk_size=2), :15, [0, 9, 2, 11]]) + assert t12.shape == (4, 15) + # not need a concat + assert t12.chunk_shape == (2, 2) + assert t12.chunks[0].ndim == 2 + assert t12.nsplits == ((2, 2), (10, 5)) + + t13 = tile(t[tensor([[5, 33], [66, 6]]), :15, tensor([255, 11])]) + assert t13.shape == (2, 2, 15) + assert t13.chunk_shape == (1, 1, 2) + assert t13.chunks[0].ndim == 3 + assert t13.nsplits == ((2,), (2,), (10, 5)) + + +def test_mixed_indexing(): + t = ones((100, 200, 300, 400)) + + with pytest.raises(IndexError): + _ = t[ones((100, 200), dtype=float)] # noqa: F841 + + t2 = t[ones(100) < 2, ..., 20::101, 2] + assert len(t2.shape) == 3 + assert np.isnan(t2.shape[0]) + + t3 = ones((2, 3, 4, 5)) + t4 = t3[1] + assert t4.flags["C_CONTIGUOUS"] == np.ones((2, 3, 4, 5))[1].flags["C_CONTIGUOUS"] + assert t4.flags["F_CONTIGUOUS"] == np.ones((2, 3, 4, 5))[1].flags["F_CONTIGUOUS"] + + +def test_bool_indexing_tiles(): + t = ones((100, 200, 300), chunk_size=30) + indexed = t[t < 2] + indexed, t = tile(indexed, t) + + assert len(indexed.chunks) == 280 + assert indexed.chunks[0].index == (0,) + assert indexed.chunks[20].index == (20,) + assert indexed.chunks[20].inputs[0] is t.cix[(0, 2, 0)].data + assert indexed.chunks[20].inputs[1] is indexed.op.indexes[0].cix[0, 2, 0].data + + t = ones((100, 200, 300), chunk_size=30) + t2 = ones((100, 200), chunk_size=30) + indexed2 = t[t2 < 2] + indexed2, t = tile(indexed2, t) + + assert len(indexed2.chunks) == 280 + assert len(indexed2.chunks[0].shape) == 2 + assert np.isnan(indexed2.chunks[0].shape[0]) + assert indexed2.chunks[0].shape[1] == 30 + assert indexed2.chunks[20].inputs[0] == t.cix[(0, 2, 0)].data + assert indexed2.chunks[20].inputs[1] == indexed2.op.indexes[0].cix[0, 2].data + + +def test_slice_tiles(): + t = ones((100, 200, 300), chunk_size=30) + t2 = t[10:40, 199:, -30:303] + t, t2 = tile(t, t2) + + assert t2.chunk_shape == (2, 1, 1) + assert t2.chunks[0].inputs[0] == t.cix[0, -1, -1].data + assert 
t2.chunks[0].op.indexes == [slice(10, 30, 1), slice(19, 20, 1), slice(None)] + assert t2.chunks[0].index == (0, 0, 0) + assert t2.chunks[1].inputs[0] == t.cix[1, -1, -1].data + assert t2.chunks[1].op.indexes == [slice(0, 10, 1), slice(19, 20, 1), slice(None)] + assert t2.chunks[1].index == (1, 0, 0) + + +def test_indices_indexing_tiles(): + t = ones((10, 20, 30), chunk_size=(2, 20, 30)) + t2 = t[3] + t, t2 = tile(t, t2) + + assert len(t2.chunks) == 1 + assert t2.chunks[0].inputs[0] is t.cix[1, 0, 0].data + assert t2.chunks[0].op.indexes[0] == 1 + + t = ones((10, 20, 30), chunk_size=(2, 20, 30)) + t3 = t[4] + t, t3 = tile(t, t3) + + assert len(t3.chunks) == 1 + assert t3.chunks[0].inputs[0] is t.cix[2, 0, 0].data + assert t3.chunks[0].op.indexes[0] == 0 + + +def test_mixed_indexing_tiles(): + t = ones((100, 200, 300, 400), chunk_size=24) + + cmp = ones(400, chunk_size=24) < 2 + t2 = t[10:90:3, 5, ..., None, cmp] + t2, cmp = tile(t2, cmp) + + assert t2.shape[:-1] == (27, 300, 1) + assert np.isnan(t2.shape[-1]) + assert t2.chunk_shape == (4, 13, 1, 17) + assert t2.chunks[0].op.indexes == [ + slice(10, 24, 3), + 5, + slice(None), + None, + cmp.cix[(0,)].data, + ] + + +def test_setitem(): + shape = (10, 20, 30, 40) + t = ones(shape, chunk_size=5, dtype="i4") + t[5:20:3, 5, ..., :-5] = 2.2 + + assert isinstance(t.op, TensorIndexSetValue) + assert t.shape == shape + assert isinstance(t.inputs[0].op.outputs[0].op, TensorOnes) + + t = tile(t) + assert isinstance(t.chunks[0].op, TensorOnes) + assert isinstance(t.cix[1, 1, 0, 0].op, TensorIndexSetValue) + assert t.cix[1, 1, 0, 0].op.value == 2.2 + + t2 = ones(shape, chunk_size=5, dtype="i4") + shape = t2[5:20:3, 5, ..., :-5].shape + t2[5:20:3, 5, ..., :-5] = ones(shape, chunk_size=4, dtype="i4") * 2 + + t2 = tile(t2) + assert isinstance(t2.chunks[0].op, TensorOnes) + assert isinstance(t2.cix[1, 1, 0, 0].op, TensorIndexSetValue) + assert isinstance(t2.cix[1, 1, 0, 0].op.value.op, TensorConcatenate) + + +def test_setitem_structured(): + # Check to value is properly broadcast for `setitem` on complex record dtype arrays. 
+ rec_type = np.dtype( + [ + ("a", np.int32), + ("b", np.double), + ("c", np.dtype([("a", np.int16), ("b", np.int64)])), + ] + ) + + t = ones((4, 5), dtype=rec_type, chunk_size=3) + + # assign tuple to record + t[1:4, 1] = (3, 4.0, (5, 6)) + tt = tile(t) + assert tt.cix[0, 0].op.value == (3, 4.0, (5, 6)) + + # assign scalar to record + t[1:4, 2] = 8 + tt = tile(t) + assert tt.cix[0, 0].op.value == 8 + + # assign scalar array to record array with broadcast + t[1:3] = np.arange(5) + tt = tile(t) + slices_op = tt.cix[0, 0].op.value.op + assert slices_op.slices == [slice(None, None, None), slice(0, 3, None)] + broadcast_op = slices_op.inputs[0].op.inputs[0].op + assert isinstance(broadcast_op, TensorBroadcastTo) + assert broadcast_op.shape == (2, 5) + np.testing.assert_array_equal(broadcast_op.inputs[0].op.data, np.arange(5)) + + # assign scalar array to record array of same shape, no broadcast + t[2:4] = np.arange(10).reshape(2, 5) + tt = tile(t) + slices_op = tt.cix[0, 0].op.value.op + assert slices_op.slices == [slice(0, 1, None), slice(0, 3, None)] + np.testing.assert_array_equal( + slices_op.inputs[0].op.inputs[0].op.data, np.arange(10).reshape(2, 5) + ) + + +def test_choose(): + with option_context() as options: + options.chunk_size = 2 + + choices = [[0, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33]] + a = choose([2, 3, 1, 0], choices) + + a = tile(a) + assert len(a.chunks) == 2 + assert isinstance(a.chunks[0].op, type(a.op)) + assert len(a.chunks[0].inputs) == 5 + + with pytest.raises(TypeError): + choose([2, 3, 1, 0], choices, out=1) + + with pytest.raises(ValueError): + choose([2, 3, 1, 0], choices, out=tensor(np.empty((1, 4)))) + + +def test_unravel_index(): + indices = tensor([22, 41, 37], chunk_size=1) + t = unravel_index(indices, (7, 6)) + + assert len(t) == 2 + + t = [tile(r) for r in t] + + assert len(t[0].chunks) == 3 + assert len(t[1].chunks) == 3 + + with pytest.raises(TypeError): + unravel_index([22, 41, 37], (7, 6), order="B") + + +def test_nonzero(): + x = tensor([[1, 0, 0], [0, 2, 0], [1, 1, 0]], chunk_size=2) + y = nonzero(x) + + assert len(y) == 2 + + tile(y[0]) + + +def test_compress(): + a = np.array([[1, 2], [3, 4], [5, 6]]) + + with pytest.raises(TypeError): + compress([0, 1], a, axis=0, out=1) + + with pytest.raises(TypeError): + compress( + [0, 1], + array([[1, 2], [3, 4], [5, 6]], dtype="i8"), + axis=0, + out=empty((1, 2), dtype="f8"), + ) + + +def test_operand_key(): + t = ones((10, 2), chunk_size=5) + t_slice1 = t[:5] + t_slice2 = t[5:] + + assert t_slice1.op.key != t_slice2.op.key + + +def test_fill_diagonal(): + a = tensor(np.random.rand(10, 13)) + fill_diagonal(a, 10) + + assert a.shape == (10, 13) + + # must be Tensor + with pytest.raises(TypeError): + fill_diagonal(np.random.rand(11, 10), 1) + + # at least 2-d required + with pytest.raises(ValueError): + a = tensor(np.random.rand(4)) + fill_diagonal(a, 1) + + # for more than 2-d, shape on each dimension should be equal + with pytest.raises(ValueError): + a = tensor(np.random.rand(11, 10, 11)) + fill_diagonal(a, 1) diff --git a/python/xorbits/_mars/tensor/indexing/tests/test_indexing_execution.py b/python/xorbits/_mars/tensor/indexing/tests/test_indexing_execution.py new file mode 100644 index 000000000..74d70d88c --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/tests/test_indexing_execution.py @@ -0,0 +1,897 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from ....config import options +from ... import hstack, mod, stack +from ...datasource import arange, tensor, zeros +from .. import ( + choose, + compress, + extract, + fill_diagonal, + flatnonzero, + nonzero, + take, + unravel_index, +) + + +def test_bool_indexing_execution(setup): + raw = np.random.random((11, 8, 12, 14)) + arr = tensor(raw, chunk_size=6) + + index = arr < 0.5 + arr2 = arr[index] + # size_res = self.executor.execute_tensor(arr2, mock=True) + res = arr2.execute().fetch() + + # assert sum(s[0] for s in size_res) == arr.nbytes + np.testing.assert_array_equal(np.sort(res), np.sort(raw[raw < 0.5])) + + index2 = tensor(raw[:, :, 0, 0], chunk_size=3) < 0.5 + arr3 = arr[index2] + res = arr3.execute().fetch() + + expected = raw[raw[:, :, 0, 0] < 0.5] + assert sum(it.size for it in res) == expected.size + assert res.shape == expected.shape + + raw = np.asfortranarray(np.random.random((11, 8, 12, 14))) + arr = tensor(raw, chunk_size=3) + + index = tensor(raw[:, :, 0, 0], chunk_size=3) < 0.5 + arr2 = arr[index] + res = arr2.execute().fetch() + expected = raw[raw[:, :, 0, 0] < 0.5].copy("A") + + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_fancy_indexing_numpy_execution(setup): + # test fancy index of type numpy ndarray + raw = np.random.random((11, 8, 12, 14)) + arr = tensor(raw, chunk_size=(6, 5, 7, 8)) + + index = [9, 10, 3, 1, 8, 10] + arr2 = arr[index] + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw[index]) + + index = np.random.permutation(8) + arr3 = arr[:2, ..., index] + + res = arr3.execute().fetch() + np.testing.assert_array_equal(res, raw[:2, ..., index]) + + index = [1, 3, 9, 10] + arr4 = arr[..., index, :5] + + res = arr4.execute().fetch() + np.testing.assert_array_equal(res, raw[..., index, :5]) + + index1 = [8, 10, 3, 1, 9, 10] + index2 = [1, 3, 9, 10, 2, 7] + arr5 = arr[index1, :, index2] + + res = arr5.execute().fetch() + np.testing.assert_array_equal(res, raw[index1, :, index2]) + + index1 = [1, 3, 5, 7, 9, 10] + index2 = [1, 9, 9, 10, 2, 7] + arr6 = arr[index1, :, index2] + + res = arr6.execute().fetch() + np.testing.assert_array_equal(res, raw[index1, :, index2]) + + index1 = [[8, 10, 3], [1, 9, 10]] + index2 = [[1, 3, 9], [10, 2, 7]] + arr7 = arr[index1, :, index2] + + res = arr7.execute().fetch() + np.testing.assert_array_equal(res, raw[index1, :, index2]) + + index1 = [[1, 3], [3, 7], [7, 7]] + index2 = [1, 9] + arr8 = arr[0, index1, :, index2] + + res = arr8.execute().fetch() + np.testing.assert_array_equal(res, raw[0, index1, :, index2]) + + +def test_fancy_indexing_tensor_execution(setup): + # test fancy index of type tensor + + raw = np.random.random((11, 8, 12, 14)) + arr = tensor(raw, chunk_size=(6, 5, 7, 8)) + + raw_index = [8, 10, 3, 1, 9, 10] + index = tensor(raw_index, chunk_size=4) + arr2 = arr[index] + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw[raw_index]) + + raw_index = np.random.permutation(8) + index = 
tensor(raw_index, chunk_size=3) + arr3 = arr[:2, ..., index] + + res = arr3.execute().fetch() + np.testing.assert_array_equal(res, raw[:2, ..., raw_index]) + + raw_index = [1, 3, 9, 10] + index = tensor(raw_index) + arr4 = arr[..., index, :5] + + res = arr4.execute().fetch() + np.testing.assert_array_equal(res, raw[..., raw_index, :5]) + + raw_index1 = [8, 10, 3, 1, 9, 10] + raw_index2 = [1, 3, 9, 10, 2, 7] + index1 = tensor(raw_index1, chunk_size=4) + index2 = tensor(raw_index2, chunk_size=3) + arr5 = arr[index1, :, index2] + + res = arr5.execute().fetch() + np.testing.assert_array_equal(res, raw[raw_index1, :, raw_index2]) + + raw_index1 = [1, 3, 5, 7, 9, 10] + raw_index2 = [1, 9, 9, 10, 2, 7] + index1 = tensor(raw_index1, chunk_size=3) + index2 = tensor(raw_index2, chunk_size=4) + arr6 = arr[index1, :, index2] + + res = arr6.execute().fetch() + np.testing.assert_array_equal(res, raw[raw_index1, :, raw_index2]) + + raw_index1 = [[8, 10, 3], [1, 9, 10]] + raw_index2 = [[1, 3, 9], [10, 2, 7]] + index1 = tensor(raw_index1) + index2 = tensor(raw_index2, chunk_size=2) + arr7 = arr[index1, :, index2] + + res = arr7.execute().fetch() + np.testing.assert_array_equal(res, raw[raw_index1, :, raw_index2]) + + raw_index1 = [[1, 3], [3, 7], [7, 7]] + raw_index2 = [1, 9] + index1 = tensor(raw_index1, chunk_size=(2, 1)) + index2 = tensor(raw_index2) + arr8 = arr[0, index1, :, index2] + + res = arr8.execute().fetch() + np.testing.assert_array_equal(res, raw[0, raw_index1, :, raw_index2]) + + raw_a = np.random.rand(30, 30) + a = tensor(raw_a, chunk_size=(13, 17)) + b = a.argmax(axis=0) + c = a[b, arange(30)] + res = c.execute().fetch() + + np.testing.assert_array_equal(res, raw_a[raw_a.argmax(axis=0), np.arange(30)]) + + # test one chunk + arr = tensor(raw, chunk_size=20) + + raw_index = [8, 10, 3, 1, 9, 10] + index = tensor(raw_index, chunk_size=20) + arr9 = arr[index] + + res = arr9.execute().fetch() + np.testing.assert_array_equal(res, raw[raw_index]) + + raw_index1 = [[1, 3], [3, 7], [7, 7]] + raw_index2 = [1, 9] + index1 = tensor(raw_index1) + index2 = tensor(raw_index2) + arr10 = arr[0, index1, :, index2] + + res = arr10.execute().fetch() + np.testing.assert_array_equal(res, raw[0, raw_index1, :, raw_index2]) + + # test order + raw = np.asfortranarray(np.random.random((11, 8, 12, 14))) + arr = tensor(raw, chunk_size=(6, 5, 7, 8)) + + raw_index = [8, 10, 3, 1, 9, 10] + index = tensor(raw_index, chunk_size=4) + arr11 = arr[index] + + res = arr11.execute().fetch() + expected = raw[raw_index].copy("A") + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_slice_execution(setup): + raw = np.random.random((11, 8, 12, 14)) + arr = tensor(raw, chunk_size=6) + + arr2 = arr[2:9:2, 3:7, -1:-9:-2, 12:-11:-4] + res = arr2.execute().fetch() + + np.testing.assert_array_equal(res, raw[2:9:2, 3:7, -1:-9:-2, 12:-11:-4]) + + arr3 = arr[-4, 2:] + res = arr3.execute().fetch() + np.testing.assert_equal(res, raw[-4, 2:]) + + raw = sps.random(12, 14, density=0.1) + arr = tensor(raw, chunk_size=6) + + arr2 = arr[-1:-9:-2, 12:-11:-4] + res = arr2.execute().fetch() + + np.testing.assert_equal(res.toarray(), raw.toarray()[-1:-9:-2, 12:-11:-4]) + + # test order + raw = np.asfortranarray(np.random.random((11, 8, 12, 14))) + arr = tensor(raw, chunk_size=6) + + arr2 = arr[2:9:2, 3:7, -1:-9:-2, 12:-11:-4] + res = arr2.execute().fetch() + expected = raw[2:9:2, 3:7, -1:-9:-2, 
12:-11:-4].copy("A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + arr3 = arr[0:13, :, None] + res = arr3.execute().fetch() + expected = raw[0:13, :, None].copy("A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_mixed_indexing_execution(setup): + rs = np.random.RandomState(0) + raw = rs.random((11, 8, 12, 13)) + arr = tensor(raw, chunk_size=6) + + raw_cond = raw[0, :, 0, 0] < 0.5 + cond = tensor(raw[0, :, 0, 0], chunk_size=3) < 0.5 + arr2 = arr[10::-2, cond, None, ..., :5] + # size_res = self.executor.execute_tensor(arr2, mock=True) + res = arr2.execute().fetch() + + new_shape = list(arr2.shape) + new_shape[1] = cond.shape[0] + # assert sum(s[0] for s in size_res) == int(np.prod(new_shape) * arr2.dtype.itemsize) + np.testing.assert_array_equal(res, raw[10::-2, raw_cond, None, ..., :5]) + + b_raw = np.random.random(8) + raw_cond = b_raw < 0.5 + conds = [raw_cond, tensor(b_raw, chunk_size=2) < 0.5] + for cond in conds: + arr3 = arr[-2::-3, cond, ...] + res = arr3.execute().fetch() + + np.testing.assert_array_equal(res, raw[-2::-3, raw_cond, ...]) + + # test multiple bool index and fancy index + cond1 = np.zeros(11, dtype=bool) + cond1[rs.permutation(11)[:5]] = True + cond2 = np.zeros(12, dtype=bool) + cond2[rs.permutation(12)[:5]] = True + f3 = np.random.randint(13, size=5) + + expected = raw[cond1, ..., cond2, f3] + + t = arr[cond1, ..., cond2, f3] + res = t.execute().fetch() + np.testing.assert_array_equal(res, expected) + + t = arr[tensor(cond1), ..., tensor(cond2), tensor(f3)] + res = t.execute().fetch() + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.ray_dag +def test_setitem_fancy_index_execution(setup): + rs = np.random.RandomState(0) + + raw = rs.randint(0, 10, size=(11, 12)) + + # index is a ndarray, value is a scalar + arr = tensor(raw.copy(), chunk_size=5) + idx = rs.randint(0, 11, (5,)) + arr[idx] = 20 + res = arr.execute().fetch() + expected = raw.copy() + expected[idx] = 20 + np.testing.assert_array_equal(res, expected) + + # index is a tensor, value is a scalar + arr = tensor(raw.copy(), chunk_size=5) + raw_index = rs.randint(0, 11, (8,)) + idx = tensor(raw_index.copy(), chunk_size=5) + arr[idx] = 2 + res = arr.execute().fetch() + expected = raw.copy() + expected[raw_index] = 2 + np.testing.assert_array_equal(res, expected) + + # indexes are all tensors + arr = tensor(raw.copy(), chunk_size=6) + raw_index1 = rs.randint(0, 11, (20,)) + idx1 = tensor(raw_index1.copy(), chunk_size=8) + raw_index2 = rs.randint(0, 12, (20,)) + idx2 = tensor(raw_index2.copy(), chunk_size=8) + arr[idx1, idx2] = 2 + res = arr.execute().fetch() + expected = raw.copy() + expected[raw_index1, raw_index2] = 2 + np.testing.assert_array_equal(res, expected) + + # indexes all tensors, value is also a tensor + arr = tensor(raw.copy(), chunk_size=6) + raw_index1 = rs.randint(0, 11, (20,)) + idx1 = tensor(raw_index1.copy(), chunk_size=8) + raw_index2 = rs.randint(0, 12, (20,)) + idx2 = tensor(raw_index2.copy(), chunk_size=8) + raw_value = rs.randint(0, 10, (20,)) + arr[idx1, idx2] = tensor(raw_value, chunk_size=4) + res = arr.execute().fetch() + expected = raw.copy() + expected[raw_index1, raw_index2] = raw_value + np.testing.assert_array_equal(res, expected) + + raw = rs.randint(0, 10, size=(20,)) + 
arr = tensor(raw.copy(), chunk_size=6) + raw_index = rs.randint(0, 11, (9,)) + raw_value = rs.randint(0, 10, (9,)) + index = tensor(raw_index, chunk_size=3) + arr[index] = tensor(raw_value, chunk_size=4) + res = arr.execute().fetch() + expected = raw.copy() + expected[raw_index] = raw_value + np.testing.assert_array_equal(res, expected) + + # input's nsplits is unknown + raw = rs.randint(0, 10, size=(11, 11)) + arr = tensor(raw.copy(), chunk_size=6) + arr1 = arr[arr[0] < 20, :] + raw_index1 = rs.randint(0, 11, (10,)) + idx1 = tensor(raw_index1.copy(), chunk_size=3) + raw_index2 = rs.randint(0, 11, (10,)) + idx2 = tensor(raw_index2.copy(), chunk_size=4) + raw_value = rs.randint(100, 110, (10,)) + arr1[idx1, idx2] = tensor(raw_value, chunk_size=4) + res = arr1.execute().fetch() + expected = raw.copy() + expected = expected[expected[0] < 20, :] + expected[raw_index1, raw_index2] = raw_value + np.testing.assert_array_equal(res, expected) + + +def test_setitem_execution(setup): + rs = np.random.RandomState(0) + + raw = data = rs.randint(0, 10, size=(11, 8, 12, 13)) + arr = tensor(raw.copy(), chunk_size=6) + raw = raw.copy() + + idx = slice(2, 9, 2), slice(3, 7), slice(-1, -9, -2), 2 + arr[idx] = 20 + res = arr.execute().fetch() + + raw[idx] = 20 + np.testing.assert_array_equal(res, raw) + assert res.flags["C_CONTIGUOUS"] == raw.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == raw.flags["F_CONTIGUOUS"] + + raw = data + shape = raw[idx].shape + + arr2 = tensor(raw.copy(), chunk_size=6) + raw = raw.copy() + + replace = rs.randint(10, 20, size=shape[:-1] + (1,)).astype("f4") + arr2[idx] = tensor(replace, chunk_size=7) + res = arr2.execute().fetch() + + raw[idx] = replace + np.testing.assert_array_equal(res, raw) + + raw = np.asfortranarray(np.random.randint(0, 10, size=(11, 8, 12, 13))) + arr = tensor(raw.copy("A"), chunk_size=6) + raw = raw.copy("A") + + idx = slice(2, 9, 2), slice(3, 7), slice(-1, -9, -2), 2 + arr[idx] = 20 + res = arr.execute().fetch() + + raw[idx] = 20 + np.testing.assert_array_equal(res, raw) + assert res.flags["C_CONTIGUOUS"] == raw.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == raw.flags["F_CONTIGUOUS"] + + # test bool indexing set + raw = data + + arr = tensor(raw.copy(), chunk_size=6) + raw1 = rs.rand(11) + arr[tensor(raw1, chunk_size=4) < 0.6, 2:7] = 3 + res = arr.execute().fetch() + + raw[raw1 < 0.6, 2:7] = 3 + np.testing.assert_array_equal(res, raw) + + raw = np.random.randint(3, size=10).astype(np.int64) + raw2 = np.arange(3) + + arr = zeros((10, 3)) + arr[tensor(raw) == 1, tensor(raw2) == 1] = 1 + res = arr.execute().fetch() + + expected = np.zeros((10, 3)) + expected[raw == 1, raw2 == 1] = 1 + np.testing.assert_array_equal(res, expected) + + raw = data + + arr = tensor(raw.copy(), chunk_size=6) + raw1 = rs.rand(11) + set_data = rs.rand((raw1 < 0.8).sum(), 8, 12, 13) + arr[tensor(raw1, chunk_size=4) < 0.8] = tensor(set_data) + + res = arr.execute().fetch() + + raw[raw1 < 0.8] = set_data + np.testing.assert_array_equal(res, raw) + + # test error + with pytest.raises(ValueError): + t = tensor(raw, chunk_size=3) + t[0, 0, 0, 0] = zeros(2, chunk_size=10) + t.execute() + + +def test_setitem_structured_execution(setup): + rec_type = np.dtype( + [ + ("a", np.int32), + ("b", np.double), + ("c", np.dtype([("a", np.int16), ("b", np.int64)])), + ] + ) + + raw = np.zeros((4, 5), dtype=rec_type) + arr = tensor(raw.copy(), chunk_size=3) + + arr[1:4, 1] = (3, 4.0, (5, 6)) + arr[1:4, 2] = 8 + arr[1:3] = np.arange(5) + arr[2:4] = np.arange(10).reshape(2, 5) + 
arr[0] = np.arange(5) + + raw[1:4, 1] = (3, 4.0, (5, 6)) + raw[1:4, 2] = 8 + raw[1:3] = np.arange(5) + raw[2:4] = np.arange(10).reshape(2, 5) + raw[0] = np.arange(5) + + res = arr.execute().fetch() + assert arr.dtype == raw.dtype + assert arr.shape == raw.shape + np.testing.assert_array_equal(res, raw) + + +def test_take_execution(setup): + data = np.random.rand(10, 20, 30) + t = tensor(data, chunk_size=10) + + a = t.take([4, 1, 2, 6, 200]) + + res = a.execute().fetch() + expected = np.take(data, [4, 1, 2, 6, 200]) + np.testing.assert_array_equal(res, expected) + + a = take(t, [5, 19, 2, 13], axis=1) + + res = a.execute().fetch() + expected = np.take(data, [5, 19, 2, 13], axis=1) + np.testing.assert_array_equal(res, expected) + + with pytest.raises(ValueError): + take(t, [1, 3, 4], out=tensor(np.random.rand(4))) + + out = tensor([1, 2, 3, 4]) + a = take(t, [4, 19, 2, 8], out=out) + + res = out.execute().fetch() + expected = np.take(data, [4, 19, 2, 8]) + np.testing.assert_array_equal(res, expected) + + +def test_compress_execution(setup): + data = np.array([[1, 2], [3, 4], [5, 6]]) + a = tensor(data, chunk_size=1) + + t = compress([0, 1], a, axis=0) + + res = t.execute().fetch() + expected = np.compress([0, 1], data, axis=0) + np.testing.assert_array_equal(res, expected) + + t = compress([0, 1], a, axis=1) + + res = t.execute().fetch() + expected = np.compress([0, 1], data, axis=1) + np.testing.assert_array_equal(res, expected) + + t = a.compress([0, 1, 1]) + + res = t.execute().fetch() + expected = np.compress([0, 1, 1], data) + np.testing.assert_array_equal(res, expected) + + t = compress([False, True, True], a, axis=0) + + res = t.execute().fetch() + expected = np.compress([False, True, True], data, axis=0) + np.testing.assert_array_equal(res, expected) + + t = compress([False, True], a, axis=1) + + res = t.execute().fetch() + expected = np.compress([False, True], data, axis=1) + np.testing.assert_array_equal(res, expected) + + with pytest.raises(np.AxisError): + compress([0, 1, 1], a, axis=1) + + # test order + data = np.asfortranarray([[1, 2], [3, 4], [5, 6]]) + a = tensor(data, chunk_size=1) + + t = compress([0, 1, 1], a, axis=0) + + res = t.execute().fetch() + expected = np.compress([0, 1, 1], data, axis=0) + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + t = compress( + [0, 1, 1], a, axis=0, out=tensor(np.empty((2, 2), order="F", dtype=int)) + ) + + res = t.execute().fetch() + expected = np.compress( + [0, 1, 1], data, axis=0, out=np.empty((2, 2), order="F", dtype=int) + ) + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_extract_execution(setup): + data = np.arange(12).reshape((3, 4)) + a = tensor(data, chunk_size=2) + condition = mod(a, 3) == 0 + + t = extract(condition, a) + + res = t.execute().fetch() + expected = np.extract(np.mod(data, 3) == 0, data) + np.testing.assert_array_equal(res, expected) + + +def test_choose_execution(setup): + options.chunk_size = 2 + + choices = [[0, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33]] + a = choose([2, 3, 1, 0], choices) + + res = a.execute().fetch() + expected = np.choose([2, 3, 1, 0], choices) + + np.testing.assert_array_equal(res, expected) + + a = choose([2, 4, 1, 0], choices, mode="clip") # 4 goes to 3 (4-1) + expected = 
np.choose([2, 4, 1, 0], choices, mode="clip") + + res = a.execute().fetch() + np.testing.assert_array_equal(res, expected) + + a = choose([2, 4, 1, 0], choices, mode="wrap") # 4 goes to (4 mod 4) + expected = np.choose([2, 4, 1, 0], choices, mode="wrap") # 4 goes to (4 mod 4) + + res = a.execute().fetch() + np.testing.assert_array_equal(res, expected) + + a = [[1, 0, 1], [0, 1, 0], [1, 0, 1]] + choices = [-10, 10] + + b = choose(a, choices) + expected = np.choose(a, choices) + + res = b.execute().fetch() + np.testing.assert_array_equal(res, expected) + + a = np.array([0, 1]).reshape((2, 1, 1)) + c1 = np.array([1, 2, 3]).reshape((1, 3, 1)) + c2 = np.array([-1, -2, -3, -4, -5]).reshape((1, 1, 5)) + + b = choose(a, (c1, c2)) + expected = np.choose(a, (c1, c2)) + + res = b.execute().fetch() + np.testing.assert_array_equal(res, expected) + + # test order + a = np.array([0, 1]).reshape((2, 1, 1), order="F") + c1 = np.array([1, 2, 3]).reshape((1, 3, 1), order="F") + c2 = np.array([-1, -2, -3, -4, -5]).reshape((1, 1, 5), order="F") + + b = choose(a, (c1, c2)) + expected = np.choose(a, (c1, c2)) + + res = b.execute().fetch() + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + b = choose(a, (c1, c2), out=tensor(np.empty(res.shape, order="F"))) + expected = np.choose(a, (c1, c2), out=np.empty(res.shape, order="F")) + + res = b.execute().fetch() + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_unravel_execution(setup): + a = tensor([22, 41, 37], chunk_size=1) + t = stack(unravel_index(a, (7, 6))) + + res = t.execute().fetch() + expected = np.stack(np.unravel_index([22, 41, 37], (7, 6))) + + np.testing.assert_array_equal(res, expected) + + +def test_nonzero_execution(setup): + data = np.array([[1, 0, 0], [0, 2, 0], [1, 1, 0]]) + x = tensor(data, chunk_size=2) + t = hstack(nonzero(x)) + + res = t.execute().fetch() + expected = np.hstack(np.nonzero(data)) + + np.testing.assert_array_equal(res, expected) + + t = hstack((x > 1).nonzero()) + + res = t.execute().fetch() + expected = np.hstack(np.nonzero(data > 1)) + + np.testing.assert_array_equal(res, expected) + + +def test_flatnonzero_execution(setup): + x = arange(-2, 3, chunk_size=2) + + t = flatnonzero(x) + + res = t.execute().fetch() + expected = np.flatnonzero(np.arange(-2, 3)) + + np.testing.assert_equal(res, expected) + + +def test_fill_diagonal_execution(setup): + # 2-d + raws = [ + np.random.rand(30, 11), + np.random.rand(15, 15), + np.random.rand(11, 30), + sps.random(30, 11, density=0.1, format="csr"), + ] + + def copy(x): + if hasattr(x, "nnz"): + # sparse + return x.A + else: + return x.copy() + + for raw in raws: + # test 1 chunk, wrap=False + t = tensor(raw, chunk_size=30) + fill_diagonal(t, 1) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, 1) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test 1 chunk, wrap=True + t = tensor(raw, chunk_size=30) + fill_diagonal(t, 1, wrap=True) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, 1, wrap=True) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunks, wrap=False + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, 1) + + res = t.execute().fetch() + expected = copy(raw) + 
np.fill_diagonal(expected, 1) + + np.testing.assert_array_equal(np.asarray(res), expected) + + t = tensor(raw, chunk_size=(4, 12)) + fill_diagonal(t, 1) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, 1) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunk, val with list type + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, [1, 2, 3]) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, [1, 2, 3]) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunk, val with tensor type + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, tensor([1, 2, 3])) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, [1, 2, 3]) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunks, wrap=True + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, 1, wrap=True) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, 1, wrap=True) + + np.testing.assert_array_equal(np.asarray(res), expected) + + t = tensor(raw, chunk_size=(4, 12)) + fill_diagonal(t, 1, wrap=True) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, 1, wrap=True) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunk, val with list type + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, [1, 2, 3], wrap=True) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, [1, 2, 3], wrap=True) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunk, val with tensor type + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, tensor([[1, 2], [3, 4]]), wrap=True) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, [1, 2, 3, 4], wrap=True) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # 3-d + raw = np.random.rand(11, 11, 11) + + expected = raw.copy() + np.fill_diagonal(expected, 1) + expected2 = raw.copy() + np.fill_diagonal(expected2, 1, wrap=True) + np.testing.assert_array_equal(expected, expected2) + + # test 1 chunk + t = tensor(raw, chunk_size=30) + fill_diagonal(t, 1) + + res = t.execute().fetch() + + np.testing.assert_array_equal(res, expected) + + t = tensor(raw, chunk_size=30) + # wrap = True does not take effect when ndim > 2 + fill_diagonal(t, 1, wrap=True) + + res = t.execute().fetch() + + np.testing.assert_array_equal(res, expected) + + # test multiple chunk + t = tensor(raw, chunk_size=(3, 4, 5)) + fill_diagonal(t, 1) + + res = t.execute().fetch() + + np.testing.assert_array_equal(res, expected) + + t = tensor(raw, chunk_size=(3, 4, 5)) + # wrap = True does not take effect when ndim > 2 + fill_diagonal(t, 1, wrap=True) + + res = t.execute().fetch() + + np.testing.assert_array_equal(res, expected) + + # test val with list type + t = tensor(raw, chunk_size=(3, 4, 5)) + fill_diagonal(t, [[1, 2], [3, 4]]) + + res = t.execute().fetch() + expected = raw.copy() + np.fill_diagonal(expected, [1, 2, 3, 4]) + + np.testing.assert_array_equal(res, expected) + + # test val with tensor type + t = tensor(raw, chunk_size=(3, 4, 5)) + fill_diagonal(t, tensor([1, 2, 3])) + + res = t.execute().fetch() + expected = raw.copy() + np.fill_diagonal(expected, [1, 2, 3]) + + np.testing.assert_array_equal(res, expected) + + # test val with tensor type which ndim == 0 + t = tensor(raw, chunk_size=(3, 4, 5)) + fill_diagonal(t, tensor([1, 2, 3]).sum()) + + res = 
t.execute().fetch() + expected = raw.copy() + np.fill_diagonal(expected, 6) + + np.testing.assert_array_equal(res, expected) + + # test val with ndarray type which size is too long + t = tensor(raw, chunk_size=(3, 4, 5)) + fill_diagonal(t, np.arange(20)) + + res = t.execute().fetch() + expected = raw.copy() + np.fill_diagonal(expected, np.arange(20)) + + np.testing.assert_array_equal(res, expected) diff --git a/python/xorbits/_mars/tensor/indexing/unravel_index.py b/python/xorbits/_mars/tensor/indexing/unravel_index.py new file mode 100644 index 000000000..a2773a63e --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/unravel_index.py @@ -0,0 +1,150 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ExecutableTuple +from ...serialization.serializables import FieldTypes, KeyField, StringField, TupleField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorUnravelIndex(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.UNRAVEL_INDEX + + _input = KeyField("input") + _dims = TupleField("dims", FieldTypes.int32) + _order = StringField("order") + + def __init__(self, dims=None, order=None, **kw): + super().__init__(_dims=dims, _order=order, **kw) + if self._order is None: + self._order = "C" + + @property + def dims(self): + return self._dims + + @property + def order(self): + return self._order + + @property + def output_limit(self): + return float("inf") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, indices): + order = TensorOrder.C_ORDER if self._order == "C" else TensorOrder.F_ORDER + kws = [{"pos": i, "order": order} for i in range(len(self._dims))] + return ExecutableTuple( + self.new_tensors([indices], indices.shape, kws=kws, output_limit=len(kws)) + ) + + @classmethod + def tile(cls, op): + indices = op.inputs[0] + dims = op.dims + order = op.outputs[0].order + + out_chunks = [list() for _ in range(len(dims))] + for in_chunk in indices.chunks: + chunk_op = op.copy().reset_key() + chunk_kws = [ + {"pos": i, "index": in_chunk.index, "order": order} + for i in range(len(dims)) + ] + chunks = chunk_op.new_chunks( + [in_chunk], shape=in_chunk.shape, kws=chunk_kws, output_limit=len(dims) + ) + for out_chunk, c in zip(out_chunks, chunks): + out_chunk.append(c) + + new_op = op.copy() + kws = [ + {"chunks": out_chunk, "nsplits": indices.nsplits, "shape": o.shape} + for out_chunk, o in zip(out_chunks, op.outputs) + ] + return new_op.new_tensors( + op.inputs, kws=kws, output_limit=len(dims), order=order + ) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + indices = inputs[0] + + with 
device(device_id): + outputs = xp.unravel_index(indices, op.dims, order=op.order) + for o, output in zip(op.outputs, outputs): + ctx[o.key] = output + + +def unravel_index(indices, dims, order="C"): + """ + Converts a flat index or tensor of flat indices into a tuple + of coordinate tensors. + + Parameters + ---------- + indices : array_like + An integer tensor whose elements are indices into the flattened + version of a tensor of dimensions ``dims``. + dims : tuple of ints + The shape of the tensor to use for unraveling ``indices``. + order : {'C', 'F'}, optional + Determines whether the indices should be viewed as indexing in + row-major (C-style) or column-major (Fortran-style) order. + + Returns + ------- + unraveled_coords : tuple of Tensor + Each tensor in the tuple has the same shape as the ``indices`` + tensor. + + See Also + -------- + ravel_multi_index + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.unravel_index([22, 41, 37], (7,6)).execute() + (array([3, 6, 6]), array([4, 5, 1])) + + >>> mt.unravel_index(1621, (6,7,8,9)).execute() + (3, 1, 4, 1) + """ + indices = astensor(indices) + if isinstance(dims, Iterable): + dims = tuple(dims) + else: + dims = (dims,) + + if order not in "CF": + raise TypeError("only 'C' or 'F' order is permitted") + + op = TensorUnravelIndex(dims=dims, dtype=np.dtype(np.intp), order=order) + return op(indices) diff --git a/python/xorbits/_mars/tensor/lib/__init__.py b/python/xorbits/_mars/tensor/lib/__init__.py new file mode 100644 index 000000000..01608ff0d --- /dev/null +++ b/python/xorbits/_mars/tensor/lib/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import index_tricks +from .index_tricks import nd_grid diff --git a/python/xorbits/_mars/tensor/lib/index_tricks.py b/python/xorbits/_mars/tensor/lib/index_tricks.py new file mode 100644 index 000000000..0d7a3305a --- /dev/null +++ b/python/xorbits/_mars/tensor/lib/index_tricks.py @@ -0,0 +1,497 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +from numpy.core.numeric import ScalarType +from numpy.core.numerictypes import find_common_type +from numpy.lib.index_tricks import ndindex + +from .. import datasource as _nx +from ..base import ndim +from ..core import Tensor +from ..merge import concatenate + + +class nd_grid(object): + """ + Construct a multi-dimensional "meshgrid". 
+ + ``grid = nd_grid()`` creates an instance which will return a mesh-grid + when indexed. The dimension and number of the output arrays are equal + to the number of indexing dimensions. If the step length is not a + complex number, then the stop is not inclusive. + + However, if the step length is a **complex number** (e.g. 5j), then the + integer part of its magnitude is interpreted as specifying the + number of points to create between the start and stop values, where + the stop value **is inclusive**. + + If instantiated with an argument of ``sparse=True``, the mesh-grid is + open (or not fleshed out) so that only one-dimension of each returned + argument is greater than 1. + + Parameters + ---------- + sparse : bool, optional + Whether the grid is sparse or not. Default is False. + + Notes + ----- + Two instances of `nd_grid` are made available in the Mars.tensor namespace, + `mgrid` and `ogrid`:: + + mgrid = nd_grid(sparse=False) + ogrid = nd_grid(sparse=True) + + Users should use these pre-defined instances instead of using `nd_grid` + directly. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mgrid = mt.lib.index_tricks.nd_grid() + >>> mgrid[0:5,0:5] + array([[[0, 0, 0, 0, 0], + [1, 1, 1, 1, 1], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, 3], + [4, 4, 4, 4, 4]], + [[0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4]]]) + >>> mgrid[-1:1:5j] + array([-1. , -0.5, 0. , 0.5, 1. ]) + + >>> ogrid = mt.lib.index_tricks.nd_grid(sparse=True) + >>> ogrid[0:5,0:5] + [array([[0], + [1], + [2], + [3], + [4]]), array([[0, 1, 2, 3, 4]])] + + """ + + def __init__(self, sparse=False): + self.sparse = sparse + + def __getitem__(self, key): + try: + size = [] + typ = int + for k in key: + step = k.step + start = k.start + if start is None: + start = 0 + if step is None: + step = 1 + if isinstance(step, complex): + size.append(int(abs(step))) + typ = float + else: + size.append(int(math.ceil((k.stop - start) / (step * 1.0)))) + if ( + isinstance(step, float) + or isinstance(start, float) + or isinstance(k.stop, float) + ): + typ = float + if self.sparse: + nn = [ + _nx.arange(_x, dtype=_t) for _x, _t in zip(size, (typ,) * len(size)) + ] + else: + nn = _nx.indices(size, typ) + for k in range(len(size)): + step = key[k].step + start = key[k].start + if start is None: + start = 0 + if step is None: + step = 1 + if isinstance(step, complex): + step = int(abs(step)) + if step != 1: + step = (key[k].stop - start) / float(step - 1) + nn[k] = nn[k] * step + start + if self.sparse: + slobj = [np.newaxis] * len(size) + for k in range(len(size)): + slobj[k] = slice(None, None) + nn[k] = nn[k][slobj] + slobj[k] = np.newaxis + return nn + except (IndexError, TypeError): # pragma: no cover + step = key.step + stop = key.stop + start = key.start + if start is None: + start = 0 + if isinstance(step, complex): + step = abs(step) + length = int(step) + if step != 1: + step = (key.stop - start) / float(step - 1) + stop = key.stop + step + return _nx.arange(0, length, 1, float) * step + start + else: + return _nx.arange(start, stop, step) + + def __len__(self): + return 0 + + +class MGridClass(nd_grid): + """ + `nd_grid` instance which returns a dense multi-dimensional "meshgrid". + + An instance of `numpy.lib.index_tricks.nd_grid` which returns an dense + (or fleshed out) mesh-grid when indexed, so that each returned argument + has the same shape. The dimensions and number of the output arrays are + equal to the number of indexing dimensions. 
If the step length is not a + complex number, then the stop is not inclusive. + + However, if the step length is a **complex number** (e.g. 5j), then + the integer part of its magnitude is interpreted as specifying the + number of points to create between the start and stop values, where + the stop value **is inclusive**. + + Returns + ------- + mesh-grid `ndarrays` all of the same dimensions + + See Also + -------- + lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects + ogrid : like mgrid but returns open (not fleshed out) mesh grids + meshgrid: return coordinate matrices from coordinate vectors + r_ : array concatenator + :ref:`how-to-partition` + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.mgrid[0:5, 0:5] + array([[[0, 0, 0, 0, 0], + [1, 1, 1, 1, 1], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, 3], + [4, 4, 4, 4, 4]], + [[0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4]]]) + >>> mt.mgrid[-1:1:5j] + array([-1. , -0.5, 0. , 0.5, 1. ]) + + """ + + def __init__(self): + super().__init__(sparse=False) + + +class OGridClass(nd_grid): + """ + `nd_grid` instance which returns an open multi-dimensional "meshgrid". + + An instance of `numpy.lib.index_tricks.nd_grid` which returns an open + (i.e. not fleshed out) mesh-grid when indexed, so that only one dimension + of each returned array is greater than 1. The dimension and number of the + output arrays are equal to the number of indexing dimensions. If the step + length is not a complex number, then the stop is not inclusive. + + However, if the step length is a **complex number** (e.g. 5j), then + the integer part of its magnitude is interpreted as specifying the + number of points to create between the start and stop values, where + the stop value **is inclusive**. + + Returns + ------- + mesh-grid + `ndarrays` with only one dimension not equal to 1 + + See Also + -------- + np.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects + mgrid : like `ogrid` but returns dense (or fleshed out) mesh grids + meshgrid: return coordinate matrices from coordinate vectors + r_ : array concatenator + :ref:`how-to-partition` + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.ogrid[-1:1:5j] + array([-1. , -0.5, 0. , 0.5, 1. 
]) + >>> mt.ogrid[0:5,0:5] + [array([[0], + [1], + [2], + [3], + [4]]), array([[0, 1, 2, 3, 4]])] + + """ + + def __init__(self): + super().__init__(sparse=True) + + +mgrid = MGridClass() +ogrid = OGridClass() + + +class AxisConcatenator: + def __init__(self, axis=0, matrix=False, ndmin=1, trans1d=-1): + self.axis = axis + self.matrix = matrix + self.trans1d = trans1d + self.ndmin = ndmin + + def __getitem__(self, key): + # handle matrix builder syntax + if isinstance(key, str): # pragma: no cover + raise NotImplementedError("Does not support operation on matrix") + + if not isinstance(key, tuple): + key = (key,) + + # copy attributes, since they can be overridden in the first argument + trans1d = self.trans1d + ndmin = self.ndmin + matrix = self.matrix + axis = self.axis + + objs = [] + scalars = [] + arraytypes = [] + scalartypes = [] + + for k, item in enumerate(key): + scalar = False + if isinstance(item, slice): + step = item.step + start = item.start + stop = item.stop + if start is None: + start = 0 + if step is None: + step = 1 + if isinstance(step, complex): + size = int(abs(step)) + newobj = _nx.linspace(start, stop, num=size) + else: + newobj = _nx.arange(start, stop, step) + if ndmin > 1: + newobj = _nx.array(newobj, copy=False, ndmin=ndmin) + if trans1d != -1: + newobj = newobj.swapaxes(-1, trans1d) + elif isinstance(item, str): + if k != 0: + raise ValueError("special directives must be the first entry.") + if item in ("r", "c"): # pragma: no cover + raise NotImplementedError("Does not support operation on matrix") + if "," in item: + vec = item.split(",") + try: + axis, ndmin = [int(x) for x in vec[:2]] + if len(vec) == 3: + trans1d = int(vec[2]) + continue + except Exception: # pragma: no cover + raise ValueError("unknown special directive") + try: + axis = int(item) + continue + except (ValueError, TypeError): # pragma: no cover# pragma: no cover + raise ValueError("unknown special directive") + elif type(item) in ScalarType: + newobj = np.array(item, ndmin=ndmin) + scalars.append(len(objs)) + scalar = True + scalartypes.append(newobj.dtype) + else: + item_ndim = ndim(item) + newobj = _nx.array(item, copy=False, ndmin=ndmin) + if trans1d != -1 and item_ndim < ndmin: + k2 = ndmin - item_ndim + k1 = trans1d + if k1 < 0: + k1 += k2 + 1 + defaxes = list(range(ndmin)) + axes = defaxes[:k1] + defaxes[k2:] + defaxes[k1:k2] + newobj = newobj.transpose(axes) + objs.append(newobj) + if not scalar and isinstance(newobj, Tensor): + arraytypes.append(newobj.dtype) + + # Ensure that scalars won't up-cast unless warranted + final_dtype = find_common_type(arraytypes, scalartypes) + if final_dtype is not None: + for k in scalars: + objs[k] = objs[k].astype(final_dtype) + + res = concatenate(tuple(objs), axis=axis) + + if matrix: # pragma: no cover + raise NotImplementedError("Does not support operation on matrix") + return res + + def __len__(self): + return 0 + + +# separate classes are used here instead of just making r_ = concatentor(0), +# etc. because otherwise we couldn't get the doc string to come out right +# in help(r_) + + +class RClass(AxisConcatenator): + """ + Translates slice objects to concatenation along the first axis. + + This is a simple way to build up tensor quickly. There are two use cases. + + 1. If the index expression contains comma separated tensors, then stack + them along their first axis. + 2. If the index expression contains slice notation or scalars then create + a 1-D tensor with a range indicated by the slice notation. 
+ + If slice notation is used, the syntax ``start:stop:step`` is equivalent + to ``mt.arange(start, stop, step)`` inside of the brackets. However, if + ``step`` is an imaginary number (i.e. 100j) then its integer portion is + interpreted as a number-of-points desired and the start and stop are + inclusive. In other words ``start:stop:stepj`` is interpreted as + ``mt.linspace(start, stop, step, endpoint=1)`` inside of the brackets. + After expansion of slice notation, all comma separated sequences are + concatenated together. + + Optional character strings placed as the first element of the index + expression can be used to change the output. The strings 'r' or 'c' result + in matrix output. If the result is 1-D and 'r' is specified a 1 x N (row) + matrix is produced. If the result is 1-D and 'c' is specified, then a N x 1 + (column) matrix is produced. If the result is 2-D then both provide the + same matrix result. + + A string integer specifies which axis to stack multiple comma separated + tensors along. A string of two comma-separated integers allows indication + of the minimum number of dimensions to force each entry into as the + second integer (the axis to concatenate along is still the first integer). + + A string with three comma-separated integers allows specification of the + axis to concatenate along, the minimum number of dimensions to force the + entries to, and which axis should contain the start of the tensors which + are less than the specified number of dimensions. In other words the third + integer allows you to specify where the 1's should be placed in the shape + of the tensors that have their shapes upgraded. By default, they are placed + in the front of the shape tuple. The third argument allows you to specify + where the start of the tensor should be instead. Thus, a third argument of + '0' would place the 1's at the end of the tensor shape. Negative integers + specify where in the new shape tuple the last dimension of upgraded tensors + should be placed, so the default is '-1'. + + Parameters + ---------- + Not a function, so takes no parameters + + + Returns + ------- + A concatenated tensor or matrix. + + See Also + -------- + concatenate : Join a sequence of tensors along an existing axis. + c_ : Translates slice objects to concatenation along the second axis. + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.r_[mt.array([1,2,3]), 0, 0, mt.array([4,5,6])].execute() + array([1, 2, 3, ..., 4, 5, 6]) + >>> mt.r_[-1:1:6j, [0]*3, 5, 6].execute() + array([-1. , -0.6, -0.2, 0.2, 0.6, 1. , 0. , 0. , 0. , 5. , 6. ]) + + String integers specify the axis to concatenate along or the minimum + number of dimensions to force entries into. + + >>> a = mt.array([[0, 1, 2], [3, 4, 5]]) + >>> mt.r_['-1', a, a].execute() # concatenate along last axis + array([[0, 1, 2, 0, 1, 2], + [3, 4, 5, 3, 4, 5]]) + >>> mt.r_['0,2', [1,2,3], [4,5,6]].execute() # concatenate along first axis, dim>=2 + array([[1, 2, 3], + [4, 5, 6]]) + + >>> mt.r_['0,2,0', [1,2,3], [4,5,6]].execute() + array([[1], + [2], + [3], + [4], + [5], + [6]]) + >>> mt.r_['1,2,0', [1,2,3], [4,5,6]].execute() + array([[1, 4], + [2, 5], + [3, 6]]) + """ + + def __init__(self): + AxisConcatenator.__init__(self, 0) + + +r_ = RClass() + + +class CClass(AxisConcatenator): + """ + Translates slice objects to concatenation along the second axis. + + This is short-hand for ``mt.r_['-1,2,0', index expression]``, which is + useful because of its common occurrence. 
In particular, tensors will be + stacked along their last axis after being upgraded to at least 2-D with + 1's post-pended to the shape (column vectors made out of 1-D tensors). + + See Also + -------- + column_stack : Stack 1-D tensors as columns into a 2-D tensor. + r_ : For more detailed documentation. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.c_[mt.array([1,2,3]), mt.array([4,5,6])].execute() + array([[1, 4], + [2, 5], + [3, 6]]) + >>> mt.c_[mt.array([[1,2,3]]), 0, 0, mt.array([[4,5,6]])].execute() + array([[1, 2, 3, ..., 4, 5, 6]]) + + """ + + def __init__(self): + AxisConcatenator.__init__(self, -1, ndmin=2, trans1d=0) + + +c_ = CClass() + + +__all__ = ["ndindex", "mgrid", "ogrid", "r_", "c_"] diff --git a/python/xorbits/_mars/tensor/lib/tests/__init__.py b/python/xorbits/_mars/tensor/lib/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/lib/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/lib/tests/test_index_tricks.py b/python/xorbits/_mars/tensor/lib/tests/test_index_tricks.py new file mode 100644 index 000000000..13b7fbc69 --- /dev/null +++ b/python/xorbits/_mars/tensor/lib/tests/test_index_tricks.py @@ -0,0 +1,105 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from .... 
import tensor as mt +from ....core import tile +from ...lib import nd_grid + + +def test_index_tricks(): + mgrid = nd_grid() + g = mgrid[0:5, 0:5] + tile(g) # tileable means no loop exists + + ogrid = nd_grid(sparse=True) + o = ogrid[0:5, 0:5] + tile(*o) # tilesable means no loop exists + + +def test_r_(setup): + r = mt.r_[mt.array([1, 2, 3]), 0, 0, mt.array([4, 5, 6])] + + result = r.execute().fetch() + expected = np.r_[np.array([1, 2, 3]), 0, 0, np.array([4, 5, 6])] + + np.testing.assert_array_equal(result, expected) + + r = mt.r_[-1:1:6j, [0] * 3, 5, 6] + + result = r.execute().fetch() + expected = np.r_[-1:1:6j, [0] * 3, 5, 6] + + np.testing.assert_array_equal(result, expected) + + r = mt.r_[-1:1:6j] + + result = r.execute().fetch() + expected = np.r_[-1:1:6j] + + np.testing.assert_array_equal(result, expected) + + raw = [[0, 1, 2], [3, 4, 5]] + a = mt.array(raw, chunk_size=2) + r = mt.r_["-1", a, a] + + result = r.execute().fetch() + expected = np.r_["-1", raw, raw] + + np.testing.assert_array_equal(result, expected) + + r = mt.r_["0,2", [1, 2, 3], [4, 5, 6]] + + result = r.execute().fetch() + expected = np.r_["0,2", [1, 2, 3], [4, 5, 6]] + + np.testing.assert_array_equal(result, expected) + + r = mt.r_["0,2,0", [1, 2, 3], [4, 5, 6]] + + result = r.execute().fetch() + expected = np.r_["0,2,0", [1, 2, 3], [4, 5, 6]] + np.testing.assert_array_equal(result, expected) + + r = mt.r_["1,2,0", [1, 2, 3], [4, 5, 6]] + + result = r.execute().fetch() + expected = np.r_["1,2,0", [1, 2, 3], [4, 5, 6]] + np.testing.assert_array_equal(result, expected) + + assert len(mt.r_) == 0 + + with pytest.raises(ValueError): + _ = mt.r_[:3, "wrong"] + + +def test_c_(setup): + r = mt.c_[mt.array([1, 2, 3]), mt.array([4, 5, 6])] + + result = r.execute().fetch() + expected = np.c_[np.array([1, 2, 3]), np.array([4, 5, 6])] + np.testing.assert_array_equal(result, expected) + + r = mt.c_[mt.array([[1, 2, 3]]), 0, 0, mt.array([[4, 5, 6]])] + + result = r.execute().fetch() + expected = np.c_[np.array([[1, 2, 3]]), 0, 0, np.array([[4, 5, 6]])] + np.testing.assert_array_equal(result, expected) + + r = mt.c_[:3, 1:4] + result = r.execute().fetch() + expected = np.c_[:3, 1:4] + np.testing.assert_array_equal(result, expected) diff --git a/python/xorbits/_mars/tensor/linalg/__init__.py b/python/xorbits/_mars/tensor/linalg/__init__.py new file mode 100644 index 000000000..4c62f4087 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/__init__.py @@ -0,0 +1,41 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
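As a quick cross-check of the complex-step convention described in the `nd_grid`/`mgrid` docstrings and exercised in the tests above, the same rule holds in NumPy itself: a step such as ``5j`` is read as "5 points, stop inclusive", i.e. the slice behaves like ``linspace``. A minimal illustration (plain NumPy only, not part of the modules added in this patch):

    import numpy as np

    # A complex step such as 5j is interpreted as a point count with the stop
    # included, so mgrid[-1:1:5j] is equivalent to linspace(-1, 1, 5).
    np.testing.assert_allclose(np.mgrid[-1:1:5j], np.linspace(-1.0, 1.0, 5))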
+ +from .cholesky import TensorCholesky, cholesky +from .dot import TensorDot, dot +from .inner import inner, innerproduct +from .inv import TensorInv, inv +from .lu import TensorLU, lu +from .matmul import TensorMatmul, matmul +from .norm import TensorNorm, norm +from .qr import TensorQR, qr +from .randomized_svd import randomized_svd +from .solve import solve +from .solve_triangular import TensorSolveTriangular, solve_triangular +from .svd import TensorSVD, svd +from .tensordot import TensorTensorDot, tensordot +from .vdot import vdot + + +def _install(): + from ..core import Tensor, TensorData + + setattr(Tensor, "__matmul__", matmul) + setattr(Tensor, "dot", dot) + setattr(TensorData, "__matmul__", matmul) + setattr(TensorData, "dot", dot) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/linalg/cholesky.py b/python/xorbits/_mars/tensor/linalg/cholesky.py new file mode 100644 index 000000000..7537cfeb7 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/cholesky.py @@ -0,0 +1,325 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import BoolField, KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperand, TensorOperandMixin + + +class TensorCholesky(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.CHOLESKY + + _input = KeyField("input") + _lower = BoolField("lower") + + def __init__(self, lower=None, **kw): + super().__init__(_lower=lower, **kw) + + @property + def lower(self): + return self._lower + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + return self.new_tensor([a], a.shape, order=TensorOrder.C_ORDER) + + @classmethod + def tile(cls, op): + from ..base import TensorTranspose + from ..datasource.zeros import TensorZeros + from ..utils import reverse_order + from .dot import TensorDot + from .solve_triangular import TensorSolveTriangular + + tensor = op.outputs[0] + in_tensor = op.input + if has_unknown_shape(in_tensor): + yield + if in_tensor.nsplits[0] != in_tensor.nsplits[1]: + # all chunks on diagonal should be square + nsplits = in_tensor.nsplits[0] + in_tensor = yield from recursive_tile(in_tensor.rechunk([nsplits, nsplits])) + + lower_chunks, upper_chunks = {}, {} + for i in range(in_tensor.chunk_shape[0]): + for j in range(in_tensor.chunk_shape[1]): + if i < j: + lower_shape = (in_tensor.nsplits[0][i], in_tensor.nsplits[1][j]) + lower_chunk = TensorZeros( + dtype=tensor.dtype, shape=lower_shape, order=tensor.order.value + ).new_chunk( + None, + shape=lower_shape, + index=(i, j), + order=tensor.order, + ) + upper_shape = (in_tensor.nsplits[1][j], in_tensor.nsplits[0][i]) + upper_chunk = 
TensorZeros( + dtype=tensor.dtype, shape=upper_shape, order=tensor.order.value + ).new_chunk( + None, + shape=upper_shape, + index=(j, i), + order=tensor.order, + ) + lower_chunks[lower_chunk.index] = lower_chunk + upper_chunks[upper_chunk.index] = upper_chunk + elif i == j: + target = in_tensor.cix[i, j] + if i > 0: + prev_chunks = [] + for p in range(i): + a, b = lower_chunks[i, p], upper_chunks[p, j] + prev_chunk = TensorDot(dtype=tensor.dtype).new_chunk( + [a, b], + shape=(a.shape[0], b.shape[1]), + order=tensor.order, + ) + prev_chunks.append(prev_chunk) + + cholesky_fuse_op = TensorCholeskyFuse() + lower_chunk = cholesky_fuse_op.new_chunk( + [target] + prev_chunks, + shape=target.shape, + index=(i, j), + order=tensor.order, + ) + else: + lower_chunk = TensorCholesky( + lower=True, dtype=tensor.dtype + ).new_chunk( + [target], + shape=target.shape, + index=(i, j), + order=tensor.order, + ) + + upper_chunk = TensorTranspose(dtype=lower_chunk.dtype).new_chunk( + [lower_chunk], + shape=lower_chunk.shape[::-1], + index=lower_chunk.index[::-1], + order=reverse_order(lower_chunk.order), + ) + lower_chunks[lower_chunk.index] = lower_chunk + upper_chunks[upper_chunk.index] = upper_chunk + else: + target = in_tensor.cix[j, i] + if j > 0: + prev_chunks = [] + for p in range(j): + a, b = lower_chunks[j, p], upper_chunks[p, i] + prev_chunk = TensorDot(dtype=tensor.dtype).new_chunk( + [a, b], + shape=(a.shape[0], b.shape[1]), + order=tensor.order, + ) + prev_chunks.append(prev_chunk) + cholesky_fuse_op = TensorCholeskyFuse(by_solve_triangular=True) + upper_chunk = cholesky_fuse_op.new_chunk( + [target] + [lower_chunks[j, j]] + prev_chunks, + shape=target.shape, + index=(j, i), + order=tensor.order, + ) + else: + upper_chunk = TensorSolveTriangular( + lower=True, dtype=tensor.dtype + ).new_chunk( + [lower_chunks[j, j], target], + shape=target.shape, + index=(j, i), + order=tensor.order, + ) + lower_chunk = TensorTranspose(dtype=upper_chunk.dtype).new_chunk( + [upper_chunk], + shape=upper_chunk.shape[::-1], + index=upper_chunk.index[::-1], + order=reverse_order(upper_chunk.order), + ) + lower_chunks[lower_chunk.index] = lower_chunk + upper_chunks[upper_chunk.index] = upper_chunk + + new_op = op.copy() + if op.lower: + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=list(lower_chunks.values()), + nsplits=in_tensor.nsplits, + ) + else: + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=list(upper_chunks.values()), + nsplits=in_tensor.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if xp is np: + import scipy.linalg + + ctx[chunk.key] = scipy.linalg.cholesky(a, lower=op.lower) + return + + r = xp.linalg.cholesky(a) + if not chunk.op.lower: + r = r.T.conj() + + ctx[chunk.key] = r + + +class TensorCholeskyFuse(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.CHOLESKY_FUSE + + _by_solve_triangular = BoolField("by_solve_triangular") + + def __init__(self, by_solve_triangular=None, **kw): + super().__init__(_by_solve_triangular=by_solve_triangular, **kw) + + @property + def by_solve_triangular(self): + return self._by_solve_triangular + + @classmethod + def _execute_by_cholesky(cls, inputs): + import scipy.linalg + + target = inputs[0] + return scipy.linalg.cholesky((target - sum(inputs[1:])), lower=True) + + @classmethod + def 
_execute_by_solve_striangular(cls, inputs): + import scipy.linalg + + target = inputs[0] + lower = inputs[1] + return scipy.linalg.solve_triangular( + lower, (target - sum(inputs[2:])), lower=True + ) + + @classmethod + def execute(cls, ctx, op): + inputs = [ctx[c.key] for c in op.inputs] + if op.by_solve_triangular: + ret = cls._execute_by_solve_striangular(inputs) + else: + ret = cls._execute_by_cholesky(inputs) + ctx[op.outputs[0].key] = ret + + +def cholesky(a, lower=False): + """ + Cholesky decomposition. + + Return the Cholesky decomposition, `L * L.H`, of the square matrix `a`, + where `L` is lower-triangular and .H is the conjugate transpose operator + (which is the ordinary transpose if `a` is real-valued). `a` must be + Hermitian (symmetric if real-valued) and positive-definite. Only `L` is + actually returned. + + Parameters + ---------- + a : (..., M, M) array_like + Hermitian (symmetric if all elements are real), positive-definite + input matrix. + lower : bool + Whether to compute the upper or lower triangular Cholesky + factorization. Default is upper-triangular. + + Returns + ------- + L : (..., M, M) array_like + Upper or lower-triangular Cholesky factor of `a`. + + Raises + ------ + LinAlgError + If the decomposition fails, for example, if `a` is not + positive-definite. + + Notes + ----- + + Broadcasting rules apply, see the `mt.linalg` documentation for + details. + + The Cholesky decomposition is often used as a fast way of solving + + .. math:: A \\mathbf{x} = \\mathbf{b} + + (when `A` is both Hermitian/symmetric and positive-definite). + + First, we solve for :math:`\\mathbf{y}` in + + .. math:: L \\mathbf{y} = \\mathbf{b}, + + and then for :math:`\\mathbf{x}` in + + .. math:: L.H \\mathbf{x} = \\mathbf{y}. + + Examples + -------- + >>> import mars.tensor as mt + + >>> A = mt.array([[1,-2j],[2j,5]]) + >>> A.execute() + array([[ 1.+0.j, 0.-2.j], + [ 0.+2.j, 5.+0.j]]) + >>> L = mt.linalg.cholesky(A, lower=True) + >>> L.execute() + array([[ 1.+0.j, 0.+0.j], + [ 0.+2.j, 1.+0.j]]) + >>> mt.dot(L, L.T.conj()).execute() # verify that L * L.H = A + array([[ 1.+0.j, 0.-2.j], + [ 0.+2.j, 5.+0.j]]) + >>> A = [[1,-2j],[2j,5]] # what happens if A is only array_like? + >>> mt.linalg.cholesky(A, lower=True).execute() + array([[ 1.+0.j, 0.+0.j], + [ 0.+2.j, 1.+0.j]]) + + """ + a = astensor(a) + + if a.ndim != 2: # pragma: no cover + raise LinAlgError( + f"{a.ndim}-dimensional array given. Tensor must be two-dimensional" + ) + if a.shape[0] != a.shape[1]: # pragma: no cover + raise LinAlgError("Input must be square") + + cho = np.linalg.cholesky(np.array([[1, 2], [2, 5]], dtype=a.dtype)) + + op = TensorCholesky(lower=lower, dtype=cho.dtype) + return op(a) diff --git a/python/xorbits/_mars/tensor/linalg/core.py b/python/xorbits/_mars/tensor/linalg/core.py new file mode 100644 index 000000000..f05f6f264 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/core.py @@ -0,0 +1,320 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
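For context on the tiling logic in ``cholesky.py`` above: ``TensorCholesky.tile`` assembles the standard right-looking block Cholesky recurrence chunk by chunk, and ``TensorCholeskyFuse`` fuses the ``target - sum(prev_chunks)`` update with the trailing ``cholesky`` / ``solve_triangular`` call. A minimal plain NumPy/SciPy sketch of that recurrence on a single 2x2 block partition (illustrative only; the matrix, block size and variable names here are made up, not taken from the patch):

    import numpy as np
    from scipy.linalg import cholesky, solve_triangular

    rng = np.random.default_rng(0)
    m = rng.standard_normal((6, 6))
    A = m @ m.T + 6 * np.eye(6)                   # symmetric positive-definite input
    A11, A12 = A[:3, :3], A[:3, 3:]
    A21, A22 = A[3:, :3], A[3:, 3:]

    L11 = cholesky(A11, lower=True)               # diagonal block: plain Cholesky
    U12 = solve_triangular(L11, A12, lower=True)  # off-diagonal block of U = L.T
    L21 = U12.T
    # Trailing diagonal block: Cholesky of the Schur complement, i.e. the
    # "target - sum(prev)" update performed by TensorCholeskyFuse.
    L22 = cholesky(A22 - L21 @ U12, lower=True)

    L = np.block([[L11, np.zeros((3, 3))], [L21, L22]])
    np.testing.assert_allclose(L @ L.T, A, atol=1e-10)   # L @ L.T reconstructs A

The distributed version differs only in that each block above is a chunk, the ``sum(prev)`` updates are expressed as ``TensorDot`` chunks over previously computed factors, and each upper chunk is obtained by transposing the corresponding lower chunk (and vice versa).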
+ +import numpy as np + +from ...core import recursive_tile +from ...utils import has_unknown_shape +from ..core import TensorOrder +from ..utils import decide_chunk_sizes +from .utils import calc_svd_shapes + + +class SFQR: + __slots__ = () + + @classmethod + def tile(cls, op): + """ + Short-and-Fat QR + + Q [R_1 R_2 ...] = [A_1 A_2 ...] + """ + from ..base import TensorTranspose + from .dot import TensorDot + from .qr import TensorQR + + a = op.input + q, r = op.outputs + + tinyq, tinyr = np.linalg.qr(np.ones((1, 1), dtype=a.dtype)) + q_dtype, r_dtype = tinyq.dtype, tinyr.dtype + + check_nan_shape = False + rechunk_size = dict() + if a.chunk_shape[0] != 1: + check_nan_shape = True + rechunk_size[0] = a.shape[0] + + if len(a.chunks) > 1: + check_nan_shape = True + if a.chunks[0].shape[0] > a.chunks[0].shape[1]: + rechunk_size[1] = a.shape[0] + + if check_nan_shape: + if has_unknown_shape(a): + yield + + if rechunk_size: + new_chunks = decide_chunk_sizes(a.shape, rechunk_size, a.dtype.itemsize) + a = yield from recursive_tile(a.rechunk(new_chunks)) + + # A_1's QR decomposition + r_chunks = [] + first_chunk = a.chunks[0] + x, y = first_chunk.shape + q_shape, r_shape = ( + (first_chunk.shape, (y, y)) if x > y else ((x, x), first_chunk.shape) + ) + qr_op = TensorQR() + q_chunk, r_chunk = qr_op.new_chunks( + [first_chunk], + index=(0, 0), + kws=[ + {"side": "q", "dtype": q_dtype, "shape": q_shape, "order": q.order}, + {"side": "r", "dtype": r_dtype, "shape": r_shape, "order": r.order}, + ], + ) + # q is an orthogonal matrix, so q.T and inverse of q is equal + trans_op = TensorTranspose() + q_transpose = trans_op.new_chunk([q_chunk], shape=q_chunk.shape) + r_chunks.append(r_chunk) + + r_rest = [ + TensorDot().new_chunk( + [q_transpose, c], + shape=(q_transpose.shape[0], c.shape[1]), + index=c.index, + order=q.order, + ) + for c in a.chunks[1:] + ] + r_chunks.extend(r_rest) + + new_op = op.copy() + q_nsplits = ((q_chunk.shape[0],), (q_chunk.shape[1],)) + r_nsplits = ((r_chunks[0].shape[0],), tuple(c.shape[1] for c in r_chunks)) + kws = [ + { + "chunks": [q_chunk], + "nsplits": q_nsplits, + "dtype": q.dtype, + "shape": q.shape, + }, + { + "chunks": r_chunks, + "nsplits": r_nsplits, + "dtype": r.dtype, + "shape": r.shape, + }, + ] + return new_op.new_tensors(op.inputs, kws=kws) + + +class TSQR: + __slots__ = () + + @classmethod + def tile(cls, op): + from ..indexing.slice import TensorSlice + from ..merge.concatenate import TensorConcatenate + from .dot import TensorDot + from .qr import TensorQR + from .svd import TensorSVD + + calc_svd = getattr(op, "_is_svd", lambda: None)() or False + + a = op.input + + tinyq, tinyr = np.linalg.qr(np.ones((1, 1), dtype=a.dtype)) + q_dtype, r_dtype = tinyq.dtype, tinyr.dtype + + if a.chunk_shape[1] != 1: + if has_unknown_shape(a): + yield + new_chunk_size = decide_chunk_sizes( + a.shape, {1: a.shape[1]}, a.dtype.itemsize + ) + a = yield from recursive_tile(a.rechunk(new_chunk_size)) + + # stage 1, map phase + stage1_q_chunks, stage1_r_chunks = stage1_chunks = [[], []] # Q and R chunks + for c in a.chunks: + x, y = c.shape + q_shape, r_shape = (c.shape, (y, y)) if x > y else ((x, x), c.shape) + qr_op = TensorQR() + qr_chunks = qr_op.new_chunks( + [c], + index=c.index, + kws=[ + {"side": "q", "dtype": q_dtype, "shape": q_shape}, + {"side": "r", "dtype": r_dtype, "shape": r_shape}, + ], + ) + stage1_chunks[0].append(qr_chunks[0]) + stage1_chunks[1].append(qr_chunks[1]) + + # stage 2, reduce phase + # concatenate all r chunks into one + shape = (sum(c.shape[0] for 
c in stage1_r_chunks), stage1_r_chunks[0].shape[1]) + concat_op = TensorConcatenate(axis=0, dtype=stage1_r_chunks[0].dtype) + concat_r_chunk = concat_op.new_chunk( + stage1_r_chunks, shape=shape, index=(0, 0), order=TensorOrder.C_ORDER + ) + qr_op = TensorQR() + qr_chunks = qr_op.new_chunks( + [concat_r_chunk], + index=concat_r_chunk.index, + kws=[ + { + "side": "q", + "dtype": q_dtype, + "order": TensorOrder.C_ORDER, + "shape": (concat_r_chunk.shape[0], min(concat_r_chunk.shape)), + }, + { + "side": "r", + "dtype": r_dtype, + "order": TensorOrder.C_ORDER, + "shape": (min(concat_r_chunk.shape), concat_r_chunk.shape[1]), + }, + ], + ) + stage2_q_chunk, stage2_r_chunk = qr_chunks + + # stage 3, map phase + # split stage2_q_chunk into the same size as stage1_q_chunks + q_splits = np.cumsum([c.shape[1] for c in stage1_q_chunks]).tolist() + q_slices = [ + slice(q_splits[i]) if i == 0 else slice(q_splits[i - 1], q_splits[i]) + for i in range(len(q_splits)) + ] + stage2_q_chunks = [] + for c, s in zip(stage1_q_chunks, q_slices): + slice_op = TensorSlice(slices=[s], dtype=c.dtype) + slice_length = s.stop - (s.start or 0) + stage2_q_chunks.append( + slice_op.new_chunk( + [stage2_q_chunk], + index=c.index, + order=TensorOrder.C_ORDER, + shape=(slice_length, stage2_q_chunk.shape[1]), + ) + ) + stage3_q_chunks = [] + for c1, c2 in zip(stage1_q_chunks, stage2_q_chunks): + dot_op = TensorDot(dtype=q_dtype) + shape = (c1.shape[0], c2.shape[1]) + stage3_q_chunks.append( + dot_op.new_chunk( + [c1, c2], shape=shape, index=c1.index, order=TensorOrder.C_ORDER + ) + ) + + if not calc_svd: + q, r = op.outputs + new_op = op.copy() + q_nsplits = ( + tuple(c.shape[0] for c in stage3_q_chunks), + (stage3_q_chunks[0].shape[1],), + ) + r_nsplits = ((stage2_r_chunk.shape[0],), (stage2_r_chunk.shape[1],)) + kws = [ + # Q + { + "chunks": stage3_q_chunks, + "nsplits": q_nsplits, + "dtype": q.dtype, + "shape": q.shape, + }, + # R, calculate from stage2 + { + "chunks": [stage2_r_chunk], + "nsplits": r_nsplits, + "dtype": r.dtype, + "shape": r.shape, + }, + ] + return new_op.new_tensors(op.inputs, kws=kws) + else: + U, s, V = op.outputs + U_dtype, s_dtype, V_dtype = U.dtype, s.dtype, V.dtype + U_shape, s_shape, V_shape = U.shape, s.shape, V.shape + + svd_op = TensorSVD() + u_shape, s_shape, v_shape = calc_svd_shapes(stage2_r_chunk) + stage2_usv_chunks = svd_op.new_chunks( + [stage2_r_chunk], + kws=[ + { + "side": "U", + "dtype": U_dtype, + "index": stage2_r_chunk.index, + "shape": u_shape, + "order": TensorOrder.C_ORDER, + }, + { + "side": "s", + "dtype": s_dtype, + "index": stage2_r_chunk.index[1:], + "shape": s_shape, + "order": TensorOrder.C_ORDER, + }, + { + "side": "V", + "dtype": V_dtype, + "index": stage2_r_chunk.index, + "shape": v_shape, + "order": TensorOrder.C_ORDER, + }, + ], + ) + stage2_u_chunk, stage2_s_chunk, stage2_v_chunk = stage2_usv_chunks + + # stage 4, U = Q @ u + stage4_u_chunks = [] + if U is not None: # U is not garbage collected + for c1 in stage3_q_chunks: + dot_op = TensorDot(dtype=U_dtype) + shape = (c1.shape[0], stage2_u_chunk.shape[1]) + stage4_u_chunks.append( + dot_op.new_chunk( + [c1, stage2_u_chunk], + shape=shape, + index=c1.index, + order=TensorOrder.C_ORDER, + ) + ) + + new_op = op.copy() + u_nsplits = ( + tuple(c.shape[0] for c in stage4_u_chunks), + (stage4_u_chunks[0].shape[1],), + ) + s_nsplits = ((stage2_s_chunk.shape[0],),) + v_nsplits = ((stage2_v_chunk.shape[0],), (stage2_v_chunk.shape[1],)) + kws = [ + { + "chunks": stage4_u_chunks, + "nsplits": u_nsplits, + "dtype": U_dtype, + 
"shape": U_shape, + "order": U.order, + }, # U + { + "chunks": [stage2_s_chunk], + "nsplits": s_nsplits, + "dtype": s_dtype, + "shape": s_shape, + "order": s.order, + }, # s + { + "chunks": [stage2_v_chunk], + "nsplits": v_nsplits, + "dtype": V_dtype, + "shape": V_shape, + "order": V.order, + }, # V + ] + return new_op.new_tensors(op.inputs, kws=kws) diff --git a/python/xorbits/_mars/tensor/linalg/dot.py b/python/xorbits/_mars/tensor/linalg/dot.py new file mode 100644 index 000000000..280289623 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/dot.py @@ -0,0 +1,164 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...serialization.serializables import KeyField +from ..array_utils import as_same_device, device, is_sparse_module +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from .tensordot import tensordot + + +class TensorDot(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.DOT + + _a = KeyField("a") + _b = KeyField("b") + + @property + def a(self): + return self._a + + @property + def b(self): + return self._b + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a, self._b = self._inputs + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + (a, b), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if not op.sparse and is_sparse_module(xp): + # tell sparse to do calculation on numpy or cupy dot + ctx[chunk.key] = xp.dot(a, b, sparse=False) + else: + ctx[chunk.key] = xp.dot(a, b) + + +def dot(a, b, out=None, sparse=None): + """ + Dot product of two arrays. Specifically, + + - If both `a` and `b` are 1-D arrays, it is inner product of vectors + (without complex conjugation). + + - If both `a` and `b` are 2-D arrays, it is matrix multiplication, + but using :func:`matmul` or ``a @ b`` is preferred. + + - If either `a` or `b` is 0-D (scalar), it is equivalent to :func:`multiply` + and using ``numpy.multiply(a, b)`` or ``a * b`` is preferred. + + - If `a` is an N-D array and `b` is a 1-D array, it is a sum product over + the last axis of `a` and `b`. + + - If `a` is an N-D array and `b` is an M-D array (where ``M>=2``), it is a + sum product over the last axis of `a` and the second-to-last axis of `b`:: + + dot(a, b)[i,j,k,m] = sum(a[i,j,:] * b[k,:,m]) + + Parameters + ---------- + a : array_like + First argument. + b : array_like + Second argument. + out : Tensor, optional + Output argument. This must have the exact kind that would be returned + if it was not used. In particular, it must have the right type, must be + C-contiguous, and its dtype must be the dtype that would be returned + for `dot(a,b)`. This is a performance feature. Therefore, if these + conditions are not met, an exception is raised, instead of attempting + to be flexible. 
+ + Returns + ------- + output : Tensor + Returns the dot product of `a` and `b`. If `a` and `b` are both + scalars or both 1-D arrays then a scalar is returned; otherwise + a tensor is returned. + If `out` is given, then it is returned. + + Raises + ------ + ValueError + If the last dimension of `a` is not the same size as + the second-to-last dimension of `b`. + + See Also + -------- + vdot : Complex-conjugating dot product. + tensordot : Sum products over arbitrary axes. + einsum : Einstein summation convention. + matmul : '@' operator as method with out parameter. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.dot(3, 4).execute() + 12 + + Neither argument is complex-conjugated: + + >>> mt.dot([2j, 3j], [2j, 3j]).execute() + (-13+0j) + + For 2-D arrays it is the matrix product: + + >>> a = [[1, 0], [0, 1]] + >>> b = [[4, 1], [2, 2]] + >>> mt.dot(a, b).execute() + array([[4, 1], + [2, 2]]) + + >>> a = mt.arange(3*4*5*6).reshape((3,4,5,6)) + >>> b = mt.arange(3*4*5*6)[::-1].reshape((5,4,6,3)) + >>> mt.dot(a, b)[2,3,2,1,2,2].execute() + 499128 + >>> mt.sum(a[2,3,2,:] * b[1,2,:,2]).execute() + 499128 + """ + a, b = astensor(a), astensor(b) + if a.isscalar() and b.isscalar(): + ret = a * b + else: + ret = tensordot(a, b, axes=((a.ndim - 1,), (b.ndim - 2,)), sparse=sparse) + + if out is None: + return ret + + # set to out + if not isinstance(out, Tensor): + raise TypeError(f"`out` must be a Tensor, got {type(out)} instead") + if out.shape != ret.shape: + raise ValueError("output tensor has wrong dimensions") + if not ( + out.dtype == ret.dtype + and out.ndim == ret.ndim + and out.order == TensorOrder.C_ORDER + ): + raise ValueError( + "output tensor is not acceptable " + "(must have the right datatype, number of dimensions and be a C-Tensor" + ) + out.data = ret.astype(out.dtype, order=out.order.value, copy=False).data + return out diff --git a/python/xorbits/_mars/tensor/linalg/inner.py b/python/xorbits/_mars/tensor/linalg/inner.py new file mode 100644 index 000000000..48f7ef361 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/inner.py @@ -0,0 +1,36 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .tensordot import tensordot + + +def inner(a, b, sparse=None): + """ + Returns the inner product of a and b for arrays of floating point types. + + Like the generic NumPy equivalent the product sum is over the last dimension + of a and b. The first argument is not conjugated. + + """ + a, b = astensor(a), astensor(b) + if a.isscalar() and b.isscalar(): + ret = a * b + else: + ret = tensordot(a, b, axes=(-1, -1), sparse=sparse) + + return ret + + +innerproduct = inner diff --git a/python/xorbits/_mars/tensor/linalg/inv.py b/python/xorbits/_mars/tensor/linalg/inv.py new file mode 100644 index 000000000..1b2898f11 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/inv.py @@ -0,0 +1,154 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import KeyField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorInv(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.INV + + _input = KeyField("input") + + def __call__(self, a): + a = astensor(a) + return self.new_tensor([a], a.shape, order=TensorOrder.C_ORDER) + + @classmethod + def _tile_one_chunk(cls, op): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_params = out.params + chunk_params["index"] = (0,) * out.ndim + out_chunk = chunk_op.new_chunk(op.inputs[0].chunks, kws=[chunk_params]) + + new_op = op.copy() + params = out.params + params["nsplits"] = tuple((s,) for s in out.shape) + params["chunks"] = [out_chunk] + return new_op.new_tensors(op.inputs, kws=[params]) + + @classmethod + def tile(cls, op): + """ + Use LU decomposition to compute inverse of matrix. + Given a square matrix A: + P, L, U = lu(A) + b_eye is an identity matrix with the same shape as matrix A, then, + (P * L * U) * A_inv = b_eye + L * (U * A_inv) = P.T * b_eye + use `solve_triangular` twice to compute the inverse of matrix A. + """ + from ..base.transpose import TensorTranspose + from ..datasource import eye + from .lu import lu + from .solve_triangular import solve_triangular + from .tensordot import tensordot + + in_tensor = op.input + is_sparse = in_tensor.is_sparse() + + if len(in_tensor.chunks) == 1: + return cls._tile_one_chunk(op) + + b_eye = eye(in_tensor.shape[0], chunk_size=in_tensor.nsplits, sparse=is_sparse) + + p, l, u = lu(in_tensor) + + # transposed p equals to inverse of p + p_transpose = TensorTranspose( + dtype=p.dtype, sparse=p.op.sparse, axes=list(range(in_tensor.ndim))[::-1] + ).new_tensor([p], p.shape) + + b = tensordot( + p_transpose, b_eye, axes=((p_transpose.ndim - 1,), (b_eye.ndim - 2,)) + ) + + # as `l` is a lower matrix, `lower=True` should be specified. + uy = solve_triangular(l, b, lower=True, sparse=op.sparse) + + a_inv = solve_triangular(u, uy, sparse=op.sparse) + a_inv = yield from recursive_tile(a_inv) + return [a_inv] + + @classmethod + def execute(cls, ctx, op): + (inp,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.linalg.inv(inp) + + +def inv(a, sparse=None): + """ + Compute the (multiplicative) inverse of a matrix. + Given a square matrix `a`, return the matrix `ainv` satisfying + ``dot(a, ainv) = dot(ainv, a) = eye(a.shape[0])``. + + Parameters + ---------- + a : (..., M, M) array_like + Matrix to be inverted. + sparse: bool, optional + Return sparse value or not. 
+ + Returns + ------- + ainv : (..., M, M) ndarray or matrix + (Multiplicative) inverse of the matrix `a`. + + Raises + ------ + LinAlgError + If `a` is not square or inversion fails. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = np.array([[1., 2.], [3., 4.]]) + >>> ainv = mt.linalg.inv(a) + >>> mt.allclose(mt.dot(a, ainv), mt.eye(2)).execute() + True + + >>> mt.allclose(mt.dot(ainv, a), mt.eye(2)).execute() + True + + >>> ainv.execute() + array([[ -2. , 1. ], + [ 1.5, -0.5]]) + """ + + # TODO: using some parallel algorithm for matrix inversion. + a = astensor(a) + if a.ndim != 2: + raise LinAlgError( + f"{a.ndim}-dimensional array given. Tensor must be two-dimensional" + ) + if a.shape[0] != a.shape[1]: + raise LinAlgError("Input must be square") + + tiny_inv = np.linalg.inv(np.array([[1, 2], [2, 5]], dtype=a.dtype)) + sparse = sparse if sparse is not None else a.issparse() + op = TensorInv(dtype=tiny_inv.dtype, sparse=sparse) + return op(a) diff --git a/python/xorbits/_mars/tensor/linalg/lu.py b/python/xorbits/_mars/tensor/linalg/lu.py new file mode 100644 index 000000000..eba59f767 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/lu.py @@ -0,0 +1,510 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... import opcodes as OperandDef +from ...core import ExecutableTuple, recursive_tile +from ...serialization.serializables import KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device, is_sparse_module +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorLU(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.LU + + _input = KeyField("input") + + @property + def output_limit(self): + return 3 + + def __call__(self, a): + import scipy.linalg + + a = astensor(a) + if a.ndim != 2: + raise LinAlgError( + f"{a.ndim}-dimensional array given. 
Tensor must be two-dimensional" + ) + + if a.shape[0] > a.shape[1]: + p_shape = (a.shape[0],) * 2 + l_shape = a.shape + u_shape = (a.shape[1],) * 2 + elif a.shape[0] < a.shape[1]: + p_shape = (a.shape[0],) * 2 + l_shape = (a.shape[0],) * 2 + u_shape = a.shape + else: + p_shape, l_shape, u_shape = (a.shape,) * 3 + + tiny_p, tiny_l, tiny_u = scipy.linalg.lu( + np.array([[1, 2], [2, 5]], dtype=a.dtype) + ) + + order = a.order + p, l, u = self.new_tensors( + [a], + kws=[ + {"side": "p", "dtype": tiny_p.dtype, "shape": p_shape, "order": order}, + {"side": "l", "dtype": tiny_l.dtype, "shape": l_shape, "order": order}, + {"side": "u", "dtype": tiny_u.dtype, "shape": u_shape, "order": order}, + ], + ) + return ExecutableTuple([p, l, u]) + + @classmethod + def _tile_one_chunk(cls, op): + p, l, u = op.outputs + chunk_op = op.copy().reset_key() + chunk_kws = [ + { + "side": "p", + "dtype": p.dtype, + "shape": p.shape, + "order": p.order, + "index": (0,) * p.ndim, + }, + { + "side": "l", + "dtype": l.dtype, + "shape": l.shape, + "order": l.order, + "index": (0,) * l.ndim, + }, + { + "side": "u", + "dtype": u.dtype, + "shape": u.shape, + "order": u.order, + "index": (0,) * u.ndim, + }, + ] + chunks = chunk_op.new_chunks(op.input.chunks, kws=chunk_kws) + + new_op = op.copy() + kws = [p.params, l.params, u.params] + for i, out in enumerate([p, l, u]): + kws[i]["nsplits"] = tuple((s,) for s in out.shape) + kws[i]["chunks"] = [chunks[i]] + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def tile(cls, op): + if len(op.input.chunks) == 1: + return cls._tile_one_chunk(op) + + from ..arithmetic.add import TensorTreeAdd + from ..arithmetic.subtract import TensorSubtract + from ..base.transpose import TensorTranspose + from ..datasource.zeros import TensorZeros, zeros + from ..merge.hstack import hstack + from ..merge.vstack import vstack + from .dot import TensorDot + from .solve_triangular import TensorSolveTriangular + + P, L, U = op.outputs + raw_in_tensor = in_tensor = op.input + out_tensor = op.outputs[0] + + if in_tensor.shape[0] > in_tensor.shape[1]: + zero_tensor = zeros( + (in_tensor.shape[0], in_tensor.shape[0] - in_tensor.shape[1]), + dtype=in_tensor.dtype, + sparse=in_tensor.issparse(), + gpu=in_tensor.op.gpu, + chunk_size=(in_tensor.nsplits[0], max(in_tensor.nsplits[1])), + order=in_tensor.order.value, + ) + in_tensor = yield from recursive_tile(hstack([in_tensor, zero_tensor])) + elif in_tensor.shape[0] < in_tensor.shape[1]: + zero_tensor = zeros( + (in_tensor.shape[1] - in_tensor.shape[0], in_tensor.shape[1]), + dtype=in_tensor.dtype, + sparse=in_tensor.issparse(), + gpu=in_tensor.op.gpu, + chunk_size=(max(in_tensor.nsplits[0]), in_tensor.nsplits[1]), + order=in_tensor.order.value, + ) + in_tensor = yield from recursive_tile(vstack([in_tensor, zero_tensor])) + + if has_unknown_shape(in_tensor): + yield + if in_tensor.nsplits[0] != in_tensor.nsplits[1]: + # all chunks on diagonal should be square + nsplits = in_tensor.nsplits[0] + in_tensor = yield from recursive_tile(in_tensor.rechunk([nsplits, nsplits])) + + p_chunks, p_invert_chunks, lower_chunks, l_permuted_chunks, upper_chunks = ( + {}, + {}, + {}, + {}, + {}, + ) + for i in range(in_tensor.chunk_shape[0]): + for j in range(in_tensor.chunk_shape[1]): + if i < j: + chunk_shape = (in_tensor.nsplits[0][i], in_tensor.nsplits[1][j]) + p_chunk = TensorZeros( + sparse=op.sparse, + order=out_tensor.order.value, + shape=chunk_shape, + ).new_chunk( + None, shape=chunk_shape, index=(i, j), order=out_tensor.order + ) + lower_chunk = 
TensorZeros( + sparse=op.sparse, + order=out_tensor.order.value, + shape=chunk_shape, + ).new_chunk( + None, shape=chunk_shape, index=(i, j), order=out_tensor.order + ) + p_chunks[p_chunk.index] = p_chunk + lower_chunks[lower_chunk.index] = lower_chunk + + target_u = in_tensor.cix[i, j] + p_invert = p_invert_chunks[i, i] + target = TensorDot(dtype=U.dtype, sparse=U.op.sparse).new_chunk( + [p_invert, target_u], + shape=(p_invert.shape[0], target_u.shape[1]), + order=out_tensor.order, + ) + if i > 0: + prev_chunks_u = [] + for p in range(i): + a, b = lower_chunks[i, p], upper_chunks[p, j] + prev_chunk = TensorDot( + dtype=U.dtype, sparse=U.op.sparse + ).new_chunk( + [a, b], + shape=(a.shape[0], b.shape[1]), + order=out_tensor.order, + ) + prev_chunks_u.append(prev_chunk) + if len(prev_chunks_u) == 1: + s = prev_chunks_u[0] + else: + tree_add_op = TensorTreeAdd( + args=prev_chunks_u, + dtype=prev_chunks_u[0].dtype, + sparse=op.sparse, + ) + s = tree_add_op.new_chunk( + prev_chunks_u, shape=prev_chunks_u[0].shape + ) + target = TensorSubtract( + dtype=U.dtype, + lhs=target, + rhs=s, + order=out_tensor.order.value, + ).new_chunk( + [target, s], shape=target.shape, order=out_tensor.order + ) + upper_chunk = TensorSolveTriangular( + lower=True, + dtype=U.dtype, + strict=False, + sparse=lower_chunks[i, i].op.sparse, + ).new_chunk( + [lower_chunks[i, i], target], + shape=target.shape, + index=(i, j), + order=out_tensor.order, + ) + upper_chunks[upper_chunk.index] = upper_chunk + elif i == j: + target = in_tensor.cix[i, j] + if i > 0: + prev_chunks = [] + for p in range(i): + a, b = l_permuted_chunks[i, p], upper_chunks[p, j] + prev_chunk = TensorDot( + dtype=a.dtype, sparse=op.sparse + ).new_chunk( + [a, b], + shape=(a.shape[0], b.shape[1]), + order=out_tensor.order, + ) + prev_chunks.append(prev_chunk) + if len(prev_chunks) == 1: + s = prev_chunks[0] + else: + tree_add_op = TensorTreeAdd( + args=prev_chunks, + dtype=prev_chunks[0].dtype, + sparse=op.sparse, + ) + s = tree_add_op.new_chunk( + prev_chunks, shape=prev_chunks[0].shape + ) + target = TensorSubtract( + dtype=L.dtype, + lhs=target, + rhs=s, + order=out_tensor.order.value, + ).new_chunk([target, s], shape=target.shape) + new_op = TensorLU(dtype=op.dtype, sparse=target.op.sparse) + lu_chunks = new_op.new_chunks( + [target], + index=(i, j), + order=out_tensor.order, + kws=[ + {"side": "p", "dtype": P.dtype, "shape": target.shape}, + {"side": "l", "dtype": L.dtype, "shape": target.shape}, + {"side": "u", "dtype": U.dtype, "shape": target.shape}, + ], + ) + p_chunk, lower_chunk, upper_chunk = lu_chunks + # transposed p equals to inverted p + p_chunk_invert = TensorTranspose( + dtype=p_chunk.dtype, sparse=op.sparse + ).new_chunk( + [p_chunk], + shape=p_chunk.shape, + index=p_chunk.index, + order=out_tensor.order, + ) + p_chunks[p_chunk.index] = p_chunk + p_invert_chunks[p_chunk_invert.index] = p_chunk_invert + lower_chunks[lower_chunk.index] = lower_chunk + upper_chunks[upper_chunk.index] = upper_chunk + + # l_permuted should be transferred to the final lower triangular + for p in range(i): + l_permuted_chunk = l_permuted_chunks[i, p] + l_chunk = TensorDot( + dtype=L.dtype, sparse=L.op.sparse + ).new_chunk( + [p_chunk_invert, l_permuted_chunk], + shape=(p_chunk_invert.shape[0], l_permuted_chunk.shape[1]), + index=l_permuted_chunk.index, + order=out_tensor.order, + ) + lower_chunks[l_permuted_chunk.index] = l_chunk + else: + chunk_shape = (in_tensor.nsplits[0][i], in_tensor.nsplits[1][j]) + p_chunk = TensorZeros( + sparse=op.sparse, + 
order=out_tensor.order.value, + shape=chunk_shape, + ).new_chunk( + None, shape=chunk_shape, index=(i, j), order=out_tensor.order + ) + upper_chunk = TensorZeros( + sparse=op.sparse, + order=out_tensor.order.value, + shape=chunk_shape, + ).new_chunk( + None, shape=chunk_shape, index=(i, j), order=out_tensor.order + ) + p_chunks[p_chunk.index] = p_chunk + upper_chunks[upper_chunk.index] = upper_chunk + target_l = in_tensor.cix[i, j] + if j > 0: + prev_chunks_l = [] + for p in range(j): + a, b = l_permuted_chunks[i, p], upper_chunks[p, j] + prev_chunk = TensorDot( + dtype=L.dtype, sparse=L.op.sparse + ).new_chunk( + [a, b], + shape=(a.shape[0], b.shape[1]), + order=out_tensor.order, + ) + prev_chunks_l.append(prev_chunk) + if len(prev_chunks_l) == 1: + s = prev_chunks_l[0] + else: + tree_add_op = TensorTreeAdd( + args=prev_chunks_l, + dtype=prev_chunks_l[0].dtype, + sparse=op.sparse, + ) + s = tree_add_op.new_chunk( + prev_chunks_l, shape=prev_chunks_l[0].shape + ) + target_l = TensorSubtract( + dtype=L.dtype, + lhs=target_l, + rhs=s, + order=out_tensor.order.value, + ).new_chunk( + [target_l, s], shape=target_l.shape, order=out_tensor.order + ) + u = upper_chunks[j, j] + a_transpose = TensorTranspose( + dtype=u.dtype, sparse=op.sparse + ).new_chunk([u], shape=u.shape) + target_transpose = TensorTranspose( + dtype=target_l.dtype, sparse=op.sparse + ).new_chunk([target_l], shape=target_l.shape) + lower_permuted_chunk = TensorSolveTriangular( + lower=True, dtype=L.dtype, strict=False, sparse=op.sparse + ).new_chunk( + [a_transpose, target_transpose], + shape=target_l.shape, + index=(i, j), + order=out_tensor.order, + ) + lower_transpose = TensorTranspose( + dtype=lower_permuted_chunk.dtype, sparse=op.sparse + ).new_chunk( + [lower_permuted_chunk], + shape=lower_permuted_chunk.shape, + index=lower_permuted_chunk.index, + ) + l_permuted_chunks[lower_permuted_chunk.index] = lower_transpose + + new_op = op.copy() + kws = [ + { + "chunks": list(p_chunks.values()), + "nsplits": in_tensor.nsplits, + "dtype": P.dtype, + "shape": P.shape, + "order": P.order, + }, + { + "chunks": list(lower_chunks.values()), + "nsplits": in_tensor.nsplits, + "dtype": L.dtype, + "shape": L.shape, + "order": L.order, + }, + { + "chunks": list(upper_chunks.values()), + "nsplits": in_tensor.nsplits, + "dtype": U.dtype, + "shape": U.shape, + "order": U.order, + }, + ] + if raw_in_tensor.shape[0] == raw_in_tensor.shape[1]: + return new_op.new_tensors(op.inputs, kws=kws) + + p, l_, u = new_op.new_tensors(op.inputs, kws=kws) + if raw_in_tensor.shape[0] > raw_in_tensor.shape[1]: + l_ = yield from recursive_tile(l_[:, : raw_in_tensor.shape[1]]) + u = yield from recursive_tile( + u[: raw_in_tensor.shape[1], : raw_in_tensor.shape[1]] + ) + else: + p = yield from recursive_tile( + p[: raw_in_tensor.shape[0], : raw_in_tensor.shape[0]] + ) + l_ = yield from recursive_tile( + l_[: raw_in_tensor.shape[0], : raw_in_tensor.shape[0]] + ) + u = yield from recursive_tile(u[: raw_in_tensor.shape[0], :]) + kws = [ + { + "chunks": p.chunks, + "nsplits": p.nsplits, + "dtype": P.dtype, + "shape": p.shape, + "order": p.order, + }, + { + "chunks": l_.chunks, + "nsplits": l_.nsplits, + "dtype": l_.dtype, + "shape": l_.shape, + "order": l_.order, + }, + { + "chunks": u.chunks, + "nsplits": u.nsplits, + "dtype": u.dtype, + "shape": u.shape, + "order": u.order, + }, + ] + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], 
device=op.device, ret_extra=True + ) + + with device(device_id): + if xp is np: + import scipy.linalg + + p, l, u = scipy.linalg.lu(a) + elif is_sparse_module(xp): + p, l, u = xp.lu(a) + else: + raise NotImplementedError + pc, lc, uc = op.outputs + + ctx[pc.key] = p + ctx[lc.key] = l + ctx[uc.key] = u + + +def lu(a): + """ + LU decomposition + + The decomposition is:: + A = P L U + where P is a permutation matrix, L lower triangular with unit diagonal elements, + and U upper triangular. + + Parameters + ---------- + a : (M, N) array_like + Array to decompose + + Returns + ------- + p : (M, M) ndarray + Permutation matrix + l : (M, K) ndarray + Lower triangular or trapezoidal matrix with unit diagonal. + K = min(M, N) + u : (K, N) ndarray + Upper triangular or trapezoidal matrix + + Examples + -------- + >>> import mars.tensor as mt + + >>> A = mt.array([[1,2],[2,3]]) + >>> A.execute() + array([[ 1, 2], + [ 2, 3]]) + >>> P, L, U = mt.linalg.lu(A) + >>> P.execute() + array([[ 0, 1], + [ 1, 0]]) + >>> L.execute() + array([[ 1, 0], + [ 0.5, 1]]) + >>> U.execute() + array([[ 2, 3], + [ 0, 0.5]]) + >>> mt.dot(P.dot(L), U).execute() # verify that PL * U = A + array([[ 1, 2], + [ 2, 3]]) + + """ + op = TensorLU(sparse=a.issparse()) + return op(a) diff --git a/python/xorbits/_mars/tensor/linalg/matmul.py b/python/xorbits/_mars/tensor/linalg/matmul.py new file mode 100644 index 000000000..6fcc74e99 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/matmul.py @@ -0,0 +1,336 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import KeyField, StringField +from ...utils import has_unknown_shape +from ..arithmetic.utils import chunk_tree_add +from ..array_utils import as_same_device, device, is_sparse_module +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import broadcast_shape, check_order, check_out_param, unify_chunks + + +class TensorMatmul(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.MATMUL + + _a = KeyField("a") + _b = KeyField("b") + _casting = StringField("casting") + _order = StringField("order") + + def __init__(self, casting=None, order=None, **kw): + super().__init__(_casting=casting, _order=order, **kw) + if self._casting is None: + self._casting = "same_kind" + if self._order is None: + self._order = "K" + check_order(self._order) + + @property + def a(self): + return self._a + + @property + def b(self): + return self._b + + @property + def casting(self): + return self._casting + + @property + def order(self): + return self._order + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a = self._inputs[0] + self._b = self._inputs[1] + + def _calc_order(self, a, b, out): + if out is not None: + return out.order + + if self._order in "A": + if a.order == TensorOrder.C_ORDER or b.order == TensorOrder.C_ORDER: + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + elif self._order in "CK": + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + + def __call__(self, a, b, out=None): + from ..base import broadcast_to + + if a.ndim == 0 or b.ndim == 0: + raise ValueError("Scalar operands are not allowed, use '*' instead") + if out is not None and not isinstance(out, Tensor): + raise TypeError(f"out must be a Tensor, got {type(out)} instead") + + a_is_1d = False + if a.ndim == 1: + a_is_1d = True + a = a[np.newaxis, :] + + b_is_1d = False + if b.ndim == 1: + b_is_1d = True + b = b[:, np.newaxis] + + if a.ndim < b.ndim: + a = a[(b.ndim - a.ndim) * (np.newaxis,)] + elif a.ndim > b.ndim: + b = b[(a.ndim - b.ndim) * (np.newaxis,)] + + if a.shape[-1] != b.shape[-2]: + raise ValueError( + f"shape {a.shape} and {b.shape} not aligned: " + f"{a.shape[-1]} (dim {a.ndim - 1}) != {b.shape[-2]} (dim {b.ndim - 2})" + ) + + shape = broadcast_shape(a.shape[:-2], b.shape[:-2]) + (a.shape[-2], b.shape[-1]) + order = self._calc_order(a, b, out) + t = self.new_tensor([a, b], shape, order=order) + + if a_is_1d: + t = t[..., 0, :] + if b_is_1d: + t = t[..., 0] + + if out is not None: + check_out_param(out, t, self._casting) + t = broadcast_to(t, out.shape) + out.data = t.data + return out + + return t + + @classmethod + def tile(cls, op): + a, b = op.inputs + tensor = op.outputs[0] + # the axes to align on + a_axes = list(range(a.ndim - 2))[::-1] + [tensor.ndim - 2, tensor.ndim - 1] + b_axes = list(range(b.ndim - 2))[::-1] + [tensor.ndim - 1, tensor.ndim] + if has_unknown_shape(a, b): + yield + a, b = yield from unify_chunks((a, a_axes), (b, b_axes)) + + get_nsplit = lambda i: a.nsplits[i] if a.nsplits[i] != (1,) else b.nsplits[i] + get_idx = lambda ch, idx: tuple( + 0 if ch.nsplits[j] == (1,) else ix for j, ix in enumerate(idx) + ) + + prefix_idxes = [range(len(get_nsplit(i))) for i in range(a.ndim - 2)] + out_idxes = prefix_idxes + [ + range(len(a.nsplits[-2])), + range(len(b.nsplits[-1])), + ] + + out_chunks = [] + for out_idx in itertools.product(*out_idxes): + chunks = [] + get_s = lambda x, 
idx: x[idx] if x != (1,) else x[0] + shape = tuple( + max(get_s(a_s, j), get_s(b_s, j)) + for a_s, b_s, j in zip(a.nsplits[:-2], b.nsplits[:-2], out_idx[:-2]) + ) + (get_s(a.nsplits[-2], out_idx[-2]), get_s(b.nsplits[-1], out_idx[-1])) + + for contract_idx in range(len(a.nsplits[-1])): + a_idx = get_idx(a, out_idx[: a.ndim - 1] + (contract_idx,)) + a_chunk = a.cix[a_idx] + b_idx = get_idx( + b, out_idx[: b.ndim - 2] + (contract_idx,) + out_idx[-1:] + ) + b_chunk = b.cix[b_idx] + chunk_op = op.copy().reset_key() + c = chunk_op.new_chunk( + [a_chunk, b_chunk], shape=shape, order=tensor.order + ) + chunks.append(c) + + if len(chunks) == 1: + c = chunks[0] + out_chunk_op = c.op.copy() + out_chunk = out_chunk_op.new_chunk( + out_chunk_op.inputs, + shape=c.shape, + index=out_idx, + order=tensor.order, + ) + else: + out_chunk = chunk_tree_add( + tensor.op.dtype, chunks, out_idx, shape, sparse=tensor.op.sparse + ) + + out_chunks.append(out_chunk) + + nsplits = tuple(get_nsplit(i) for i in range(a.ndim - 2)) + ( + a.nsplits[-2], + b.nsplits[-1], + ) + new_op = op.copy() + return new_op.new_tensors( + [a, b], tensor.shape, order=tensor.order, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + (a, b), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if not op.sparse and is_sparse_module(xp): + # tell sparse to do calculation on numpy or cupy matmul + ctx[op.outputs[0].key] = xp.matmul(a, b, sparse=False) + else: + try: + # `np.matmul` support `order` argument in version 1.16 + ctx[op.outputs[0].key] = xp.matmul( + a, b, casting=op.casting, order=op.order + ) + except TypeError: # pragma: no cover + ctx[op.outputs[0].key] = xp.matmul(a, b).astype( + dtype=op.dtype, casting=op.casting, order=op.order + ) + + +def matmul(a, b, sparse=None, out=None, **kw): + """ + Matrix product of two tensors. + + The behavior depends on the arguments in the following way. + + - If both arguments are 2-D they are multiplied like conventional + matrices. + - If either argument is N-D, N > 2, it is treated as a stack of + matrices residing in the last two indexes and broadcast accordingly. + - If the first argument is 1-D, it is promoted to a matrix by + prepending a 1 to its dimensions. After matrix multiplication + the prepended 1 is removed. + - If the second argument is 1-D, it is promoted to a matrix by + appending a 1 to its dimensions. After matrix multiplication + the appended 1 is removed. + + Multiplication by a scalar is not allowed, use ``*`` instead. Note that + multiplying a stack of matrices with a vector will result in a stack of + vectors, but matmul will not recognize it as such. + + ``matmul`` differs from ``dot`` in two important ways. + + - Multiplication by scalars is not allowed. + - Stacks of matrices are broadcast together as if the matrices + were elements. + + Parameters + ---------- + a : array_like + First argument. + b : array_like + Second argument. + out : Tensor, optional + Output argument. This must have the exact kind that would be returned + if it was not used. In particular, it must have the right type, + and its dtype must be the dtype that would be returned + for `dot(a,b)`. This is a performance feature. Therefore, if these + conditions are not met, an exception is raised, instead of attempting + to be flexible. + + Returns + ------- + output : Tensor + Returns the dot product of `a` and `b`. 
If `a` and `b` are both + 1-D arrays then a scalar is returned; otherwise an array is + returned. If `out` is given, then it is returned. + + Raises + ------ + ValueError + If the last dimension of `a` is not the same size as + the second-to-last dimension of `b`. + + If scalar value is passed. + + See Also + -------- + vdot : Complex-conjugating dot product. + tensordot : Sum products over arbitrary axes. + dot : alternative matrix product with different broadcasting rules. + + Notes + ----- + The matmul function implements the semantics of the `@` operator introduced + in Python 3.5 following PEP465. + + Examples + -------- + For 2-D arrays it is the matrix product: + + >>> import mars.tensor as mt + + >>> a = [[1, 0], [0, 1]] + >>> b = [[4, 1], [2, 2]] + >>> mt.matmul(a, b).execute() + array([[4, 1], + [2, 2]]) + + For 2-D mixed with 1-D, the result is the usual. + + >>> a = [[1, 0], [0, 1]] + >>> b = [1, 2] + >>> mt.matmul(a, b).execute() + array([1, 2]) + >>> mt.matmul(b, a).execute() + array([1, 2]) + + + Broadcasting is conventional for stacks of arrays + + >>> a = mt.arange(2*2*4).reshape((2,2,4)) + >>> b = mt.arange(2*2*4).reshape((2,4,2)) + >>> mt.matmul(a,b).shape + (2, 2, 2) + >>> mt.matmul(a,b)[0,1,1].execute() + 98 + >>> mt.sum(a[0,1,:] * b[0,:,1]).execute() + 98 + + Vector, vector returns the scalar inner product, but neither argument + is complex-conjugated: + + >>> mt.matmul([2j, 3j], [2j, 3j]).execute() + (-13+0j) + + Scalar multiplication raises an error. + + >>> mt.matmul([1,2], 3) + Traceback (most recent call last): + ... + ValueError: Scalar operands are not allowed, use '*' instead + """ + a = astensor(a) + b = astensor(b) + + sparse = sparse if sparse is not None else a.issparse() and b.issparse() + op = TensorMatmul(dtype=np.promote_types(a.dtype, b.dtype), sparse=sparse, **kw) + return op(a, b, out=out) diff --git a/python/xorbits/_mars/tensor/linalg/norm.py b/python/xorbits/_mars/tensor/linalg/norm.py new file mode 100644 index 000000000..44e8028df --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/norm.py @@ -0,0 +1,342 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections.abc import Iterable + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + KeyField, + TupleField, +) +from ..arithmetic import sqrt +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import validate_axis +from .svd import svd + + +class TensorNorm(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.NORM + + _input = KeyField("input") + _ord = AnyField("ord") + _axis = TupleField("axis", FieldTypes.int32) + _keepdims = BoolField("keepdims") + + def __init__(self, ord=None, axis=None, keepdims=None, **kw): + super().__init__(_ord=ord, _axis=axis, _keepdims=keepdims, **kw) + + @property + def ord(self): + return getattr(self, "_ord", None) + + @property + def axis(self): + return self._axis + + @property + def keepdims(self): + return self._keepdims + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, x): + r = x.astype(self.dtype) + shape = self._norm(r, self._ord, self._axis, self._keepdims).shape + return self.new_tensor([x], shape) + + @classmethod + def tile(cls, op): + x = astensor(op.input) + axis = op.axis + ord = op.ord + keepdims = op.keepdims + + axis_chunk_shapes = tuple(x.chunk_shape[i] for i in axis) + can_apply_norm = all(s == 1 for s in axis_chunk_shapes) + + if can_apply_norm: + axis_set = set(axis) + get_shape = lambda shape: tuple( + s if i not in axis_set else 1 + for i, s in enumerate(shape) + if i not in axis_set or keepdims + ) + + out_chunk_shape = get_shape(x.chunk_shape) + out_chunks = [] + for idx in itertools.product(*[range(s) for s in out_chunk_shape]): + idx_iter = iter(idx) + in_idx = tuple( + 0 if i in axis_set and not keepdims else next(idx_iter) + for i in range(x.ndim) + ) + + c = x.cix[in_idx] + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk([c], shape=get_shape(c.shape), index=idx) + out_chunks.append(out_chunk) + + nsplits = [ + tuple( + c.shape[i] + for c in out_chunks + if all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + for i in range(len(out_chunks[0].shape)) + ] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, op.outputs[0].shape, chunks=out_chunks, nsplits=nsplits + ) + + r = yield from recursive_tile( + cls._norm(x.astype(op.outputs[0].dtype), ord, axis, keepdims) + ) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, op.outputs[0].shape, chunks=r.chunks, nsplits=r.nsplits + ) + + @staticmethod + def _norm(r, ord, axis, keepdims): + if ord is None: + return sqrt((abs(r) ** 2).sum(axis=axis, keepdims=keepdims)) + elif ord == "nuc": + if len(axis) == 1: + raise ValueError("Invalid norm order for vectors.") + return svd(r)[1][np.newaxis].sum(keepdims=keepdims) + elif ord == np.inf: + if r.ndim > 2: + raise ValueError("Improper number of dimensions to norm.") + r = abs(r) + if len(axis) == 1: + return r.max(axis=axis, keepdims=keepdims) + else: + return r.sum(axis=axis[1], keepdims=keepdims).max(keepdims=keepdims) + elif ord == -np.inf: + if r.ndim > 2: + raise ValueError("Improper number of dimensions to norm.") + r = abs(r) + if len(axis) == 1: + return r.min(axis=axis, keepdims=keepdims) + else: + return r.sum(axis=axis[1], keepdims=keepdims).min(keepdims=keepdims) + elif ord == 0: + if r.ndim > 2: + raise ValueError("Improper number of dimensions to norm.") + if len(axis) == 2: + raise ValueError("Invalid norm order for 
matrices.") + return (r != 0).astype(r.dtype).sum(axis=axis, keepdims=keepdims) + elif ord == 1: + if r.ndim > 2: + raise ValueError("Improper number of dimensions to norm.") + r = abs(r) + if len(axis) == 1: + return r.sum(axis=axis, keepdims=keepdims) + else: + return r.sum(axis=axis[0], keepdims=keepdims).max(keepdims=keepdims) + elif ord == -1 and len(axis) == 2: + if r.ndim > 2: + raise ValueError("Improper number of dimensions to norm.") + return abs(r).sum(axis=axis[0], keepdims=keepdims).min(keepdims=keepdims) + elif ord == 2 and len(axis) == 2: + return svd(r)[1][np.newaxis].max(keepdims=keepdims) + elif ord == -2 and len(axis) == 2: + return svd(r)[1][np.newaxis].min(keepdims=keepdims) + else: + if len(axis) == 2: + raise ValueError("Invalid norm order for matrices.") + + return (abs(r) ** ord).sum(axis=axis, keepdims=keepdims) ** (1.0 / ord) + + @classmethod + def execute(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.linalg.norm( + x, ord=op.ord, axis=op.axis, keepdims=op.keepdims + ) + + +def norm(x, ord=None, axis=None, keepdims=False): + r""" + Matrix or vector norm. + + This function is able to return one of eight different matrix norms, + or one of an infinite number of vector norms (described below), depending + on the value of the ``ord`` parameter. + + Parameters + ---------- + x : array_like + Input tensor. If `axis` is None, `x` must be 1-D or 2-D. + ord : {non-zero int, inf, -inf, 'fro', 'nuc'}, optional + Order of the norm (see table under ``Notes``). inf means mars tensor's + `inf` object. + axis : {int, 2-tuple of ints, None}, optional + If `axis` is an integer, it specifies the axis of `x` along which to + compute the vector norms. If `axis` is a 2-tuple, it specifies the + axes that hold 2-D matrices, and the matrix norms of these matrices + are computed. If `axis` is None then either a vector norm (when `x` + is 1-D) or a matrix norm (when `x` is 2-D) is returned. + keepdims : bool, optional + If this is set to True, the axes which are normed over are left in the + result as dimensions with size one. With this option the result will + broadcast correctly against the original `x`. + + Returns + ------- + n : float or Tensor + Norm of the matrix or vector(s). + + Notes + ----- + For values of ``ord <= 0``, the result is, strictly speaking, not a + mathematical 'norm', but it may still be useful for various numerical + purposes. + + The following norms can be calculated: + + ===== ============================ ========================== + ord norm for matrices norm for vectors + ===== ============================ ========================== + None Frobenius norm 2-norm + 'fro' Frobenius norm -- + 'nuc' nuclear norm -- + inf max(sum(abs(x), axis=1)) max(abs(x)) + -inf min(sum(abs(x), axis=1)) min(abs(x)) + 0 -- sum(x != 0) + 1 max(sum(abs(x), axis=0)) as below + -1 min(sum(abs(x), axis=0)) as below + 2 2-norm (largest sing. value) as below + -2 smallest singular value as below + other -- sum(abs(x)**ord)**(1./ord) + ===== ============================ ========================== + + The Frobenius norm is given by [1]_: + + :math:`||A||_F = [\\sum_{i,j} abs(a_{i,j})^2]^{1/2}` + + The nuclear norm is the sum of the singular values. + + References + ---------- + .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*, + Baltimore, MD, Johns Hopkins University Press, 1985, pg. 
15 + + Examples + -------- + >>> from mars.tensor import linalg as LA + >>> import mars.tensor as mt + >>> a = mt.arange(9) - 4 + >>> a.execute() + array([-4, -3, -2, -1, 0, 1, 2, 3, 4]) + >>> b = a.reshape((3, 3)) + >>> b.execute() + array([[-4, -3, -2], + [-1, 0, 1], + [ 2, 3, 4]]) + + >>> LA.norm(a).execute() + 7.745966692414834 + >>> LA.norm(b).execute() + 7.745966692414834 + >>> LA.norm(b, 'fro').execute() + 7.745966692414834 + >>> LA.norm(a, mt.inf).execute() + 4.0 + >>> LA.norm(b, mt.inf).execute() + 9.0 + >>> LA.norm(a, -mt.inf).execute() + 0.0 + >>> LA.norm(b, -mt.inf).execute() + 2.0 + + >>> LA.norm(a, 1).execute() + 20.0 + >>> LA.norm(b, 1).execute() + 7.0 + >>> LA.norm(a, -1).execute() + 0.0 + >>> LA.norm(b, -1).execute() + 6.0 + >>> LA.norm(a, 2).execute() + 7.745966692414834 + >>> LA.norm(b, 2).execute() + 7.3484692283495345 + + >>> LA.norm(a, -2).execute() + 0.0 + >>> LA.norm(b, -2).execute() + 4.351066026358965e-18 + >>> LA.norm(a, 3).execute() + 5.8480354764257312 + >>> LA.norm(a, -3).execute() + 0.0 + + Using the `axis` argument to compute vector norms: + + >>> c = mt.array([[ 1, 2, 3], + ... [-1, 1, 4]]) + >>> LA.norm(c, axis=0).execute() + array([ 1.41421356, 2.23606798, 5. ]) + >>> LA.norm(c, axis=1).execute() + array([ 3.74165739, 4.24264069]) + >>> LA.norm(c, ord=1, axis=1).execute() + array([ 6., 6.]) + + Using the `axis` argument to compute matrix norms: + + >>> m = mt.arange(8).reshape(2,2,2) + >>> LA.norm(m, axis=(1,2)).execute() + array([ 3.74165739, 11.22497216]) + >>> LA.norm(m[0, :, :]).execute(), LA.norm(m[1, :, :]).execute() + (3.7416573867739413, 11.224972160321824) + + """ + x = astensor(x) + ndim = x.ndim + + if ord == "fro": + ord = None + if axis is not None: + if isinstance(axis, Iterable): + axis = tuple(validate_axis(ndim, a) for a in axis) + else: + axis = (validate_axis(ndim, axis),) + else: + axis = tuple(range(x.ndim)) + + op = TensorNorm( + ord=ord, + axis=axis, + keepdims=keepdims, + dtype=np.result_type(x.dtype, np.float_), + sparse=x.issparse(), + ) + return op(x) diff --git a/python/xorbits/_mars/tensor/linalg/qr.py b/python/xorbits/_mars/tensor/linalg/qr.py new file mode 100644 index 000000000..c0e338d7c --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/qr.py @@ -0,0 +1,193 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... 
import opcodes as OperandDef +from ...core import ExecutableTuple +from ...serialization.serializables import KeyField, StringField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from .core import SFQR, TSQR + + +class TensorQR(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.QR + + _input = KeyField("input") + _method = StringField("method") + + def __init__(self, method=None, **kw): + super().__init__(_method=method, **kw) + + @property + def method(self): + return self._method + + @property + def output_limit(self): + return 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + a = astensor(a) + + if a.ndim != 2: + raise LinAlgError( + f"{a.ndim}-dimensional tensor given. Tensor must be two-dimensional" + ) + + tiny_q, tiny_r = np.linalg.qr(np.ones((1, 1), dtype=a.dtype)) + + x, y = a.shape + q_shape, r_shape = (a.shape, (y, y)) if x > y else ((x, x), a.shape) + q, r = self.new_tensors( + [a], + kws=[ + { + "side": "q", + "dtype": tiny_q.dtype, + "shape": q_shape, + "order": TensorOrder.C_ORDER, + }, + { + "side": "r", + "dtype": tiny_r.dtype, + "shape": r_shape, + "order": TensorOrder.C_ORDER, + }, + ], + ) + return ExecutableTuple([q, r]) + + @classmethod + def tile(cls, op): + q, r = op.outputs + q_dtype, r_dtype = q.dtype, r.dtype + q_shape, r_shape = q.shape, r.shape + in_tensor = op.input + if in_tensor.chunk_shape == (1, 1): + in_chunk = in_tensor.chunks[0] + chunk_op = op.copy().reset_key() + qr_chunks = chunk_op.new_chunks( + [in_chunk], + kws=[ + {"side": "q", "shape": q_shape, "index": in_chunk.index}, + {"side": "r", "shape": r_shape, "index": in_chunk.index}, + ], + ) + q_chunk, r_chunk = qr_chunks + + new_op = op.copy() + kws = [ + { + "chunks": [q_chunk], + "nsplits": ((q_shape[0],), (q_shape[1],)), + "dtype": q_dtype, + "shape": q_shape, + "order": q.order, + }, + { + "chunks": [r_chunk], + "nsplits": ((r_shape[0],), (r_shape[1],)), + "dtype": r_dtype, + "shape": r_shape, + "order": r.order, + }, + ] + return new_op.new_tensors(op.inputs, kws=kws) + elif op.method == "tsqr": + return (yield from TSQR.tile(op)) + elif op.method == "sfqr": + return (yield from SFQR.tile(op)) + else: + raise NotImplementedError("Only tsqr method supported for now") + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + q, r = xp.linalg.qr(a) + qc, rc = op.outputs + ctx[qc.key] = q + ctx[rc.key] = r + + +def qr(a, method="tsqr"): + """ + Compute the qr factorization of a matrix. + + Factor the matrix `a` as *qr*, where `q` is orthonormal and `r` is + upper-triangular. + + Parameters + ---------- + a : array_like, shape (M, N) + Matrix to be factored. + method: {'tsqr', 'sfqr'}, optional + method to calculate qr factorization, tsqr as default + + TSQR is presented in: + + A. Benson, D. Gleich, and J. Demmel. + Direct QR factorizations for tall-and-skinny matrices in + MapReduce architectures. + IEEE International Conference on Big Data, 2013. + http://arxiv.org/abs/1301.1071 + + FSQR is a QR decomposition for fat and short matrix: + A = [A1, A2, A3, ...], A1 may be decomposed as A1 = Q1 * R1, + for A = Q * R, Q = Q1, R = [R1, R2, R3, ...] where A2 = Q1 * R2, A3 = Q1 * R3, ... 
+ + Returns + ------- + q : Tensor of float or complex, optional + A matrix with orthonormal columns. When mode = 'complete' the + result is an orthogonal/unitary matrix depending on whether or not + a is real/complex. The determinant may be either +/- 1 in that + case. + r : Tensor of float or complex, optional + The upper-triangular matrix. + + Raises + ------ + LinAlgError + If factoring fails. + + Notes + ----- + For more information on the qr factorization, see for example: + http://en.wikipedia.org/wiki/QR_factorization + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.random.randn(9, 6) + >>> q, r = mt.linalg.qr(a) + >>> mt.allclose(a, mt.dot(q, r)).execute() # a does equal qr + True + + """ + op = TensorQR(method=method) + return op(a) diff --git a/python/xorbits/_mars/tensor/linalg/randomized_svd.py b/python/xorbits/_mars/tensor/linalg/randomized_svd.py new file mode 100644 index 000000000..66e491646 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/randomized_svd.py @@ -0,0 +1,230 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from ..utils import check_random_state +from .lu import lu +from .qr import qr +from .svd import svd +from .utils import svd_flip + +# --------------------------------------------------------------------- +# Original implementation is in `sklearn.utils.extmath.randomized_svd`. +# --------------------------------------------------------------------- + + +def randomized_range_finder( + A, size, n_iter, power_iteration_normalizer="auto", random_state=None +): + r"""Computes an orthonormal matrix whose range approximates the range of A. + + .. versionadded:: 0.1.3 + + Parameters + ---------- + A : 2D tensor + The input data tensor + + size : integer + Size of the return tensor + + n_iter : integer + Number of power iterations used to stabilize the result + + power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none' + Whether the power iterations are normalized with step-by-step + QR factorization (the slowest but most accurate), 'none' + (the fastest but numerically unstable when `n_iter` is large, e.g. + typically 5 or larger), or 'LU' factorization (numerically stable + but can lose slightly in accuracy). The 'auto' mode applies no + normalization if `n_iter` <= 2 and switches to LU otherwise. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + + Returns + ------- + Q : 2D array + A (size x size) projection matrix, the range of which + approximates well the range of the input matrix A. 
+ + Notes + ----- + + Follows Algorithm 4.3 of + Finding structure with randomness: Stochastic algorithms for constructing + approximate matrix decompositions + Halko, et al., 2009 (arXiv:909) https://arxiv.org/pdf/0909.4061.pdf + + An implementation of a randomized algorithm for principal component + analysis + A. Szlam et al. 2014 + """ + random_state = check_random_state(random_state) + + # Generating normal random vectors with shape: (A.shape[1], size) + Q = random_state.normal(size=(A.shape[1], size)) + if A.dtype.kind == "f": + # Ensure f32 is preserved as f32 + Q = Q.astype(A.dtype, copy=False) + + # Deal with "auto" mode + if power_iteration_normalizer == "auto": + if n_iter <= 2: + power_iteration_normalizer = "none" + else: + power_iteration_normalizer = "LU" + + # Perform power iterations with Q to further 'imprint' the top + # singular vectors of A in Q + for _ in range(n_iter): + if power_iteration_normalizer == "none": + Q = A.dot(Q) + Q = A.T.dot(Q) + elif power_iteration_normalizer == "LU": + # TODO: directly get Q when lu supports `permute_l` + p, l, _ = lu(A.dot(Q)) + Q = p.dot(l) + p, l, _ = lu(A.T.dot(Q)) + Q = p.dot(l) + elif power_iteration_normalizer == "QR": + Q, _ = qr(A.dot(Q)) + Q, _ = qr(A.T.dot(Q)) + + # Sample the range of A using by linear projection of Q + # Extract an orthonormal basis + Q, _ = qr(A.dot(Q)) + return Q + + +def randomized_svd( + M, + n_components, + n_oversamples=10, + n_iter="auto", + power_iteration_normalizer="auto", + transpose="auto", + flip_sign=True, + random_state=0, +): + r""" + Computes a truncated randomized SVD + + .. versionadded:: 0.1.4 + + Parameters + ---------- + M : Tensor + tensor to decompose + n_components : int + Number of singular values and vectors to extract. + n_oversamples : int (default is 10) + Additional number of random vectors to sample the range of M so as + to ensure proper conditioning. The total number of random vectors + used to find the range of M is n_components + n_oversamples. Smaller + number can improve speed but can negatively impact the quality of + approximation of singular vectors and singular values. + n_iter : int or 'auto' (default is 'auto') + Number of power iterations. It can be used to deal with very noisy + problems. When 'auto', it is set to 4, unless `n_components` is small + (< .1 * min(X.shape)) `n_iter` in which case is set to 7. + This improves precision with few components. + power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none' + Whether the power iterations are normalized with step-by-step + QR factorization (the slowest but most accurate), 'none' + (the fastest but numerically unstable when `n_iter` is large, e.g. + typically 5 or larger), or 'LU' factorization (numerically stable + but can lose slightly in accuracy). The 'auto' mode applies no + normalization if `n_iter` <= 2 and switches to LU otherwise. + transpose : True, False or 'auto' (default) + Whether the algorithm should be applied to M.T instead of M. The + result should approximately be the same. The 'auto' mode will + trigger the transposition if M.shape[1] > M.shape[0] since this + implementation of randomized SVD tend to be a little faster in that + case. + flip_sign : boolean, (True by default) + The output of a singular value decomposition is only unique up to a + permutation of the signs of the singular vectors. If `flip_sign` is + set to `True`, the sign ambiguity is resolved by making the largest + loadings for each component in the left singular vectors positive. 
+ random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + Notes + ----- + This algorithm finds a (usually very good) approximate truncated + singular value decomposition using randomization to speed up the + computations. It is particularly fast on large matrices on which + you wish to extract only a small number of components. In order to + obtain further speed up, `n_iter` can be set <=2 (at the cost of + loss of precision). + References + ---------- + * Finding structure with randomness: Stochastic algorithms for constructing + approximate matrix decompositions + Halko, et al., 2009 https://arxiv.org/abs/0909.4061 + * A randomized algorithm for the decomposition of matrices + Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert + * An implementation of a randomized algorithm for principal component + analysis + A. Szlam et al. 2014 + """ + M = astensor(M) + random_state = check_random_state(random_state) + n_random = n_components + n_oversamples + n_samples, n_features = M.shape + + if n_iter == "auto": + # Check if the number of iterations is explicitly specified + # Adjust n_iter. 7 was found a good compromise for PCA. + # https://github.com/scikit-learn/scikit-learn/pull/5299 + n_iter = 7 if n_components < 0.1 * min(M.shape) else 4 + + if transpose == "auto": + transpose = n_samples < n_features + if transpose: + # this implementation is a bit faster with smaller shape[1] + M = M.T + + Q = randomized_range_finder( + M, n_random, n_iter, power_iteration_normalizer, random_state + ) + # project M to the (k + p) dimensional space using the basis vectors + B = Q.T.dot(M) + + # compute the SVD on the thin matrix: (k + p) wide + Uhat, s, V = svd(B) + + U = Q.dot(Uhat) + + if flip_sign: + if not transpose: + U, V = svd_flip(U, V) + else: + # In case of transpose u_based_decision=false + # to actually flip based on u and not v. + U, V = svd_flip(U, V, u_based_decision=False) + + if transpose: + # transpose back the results according to the input convention + return V[:n_components, :].T, s[:n_components], U[:, :n_components].T + else: + return U[:, :n_components], s[:n_components], V[:n_components, :] diff --git a/python/xorbits/_mars/tensor/linalg/solve.py b/python/xorbits/_mars/tensor/linalg/solve.py new file mode 100644 index 000000000..216ca57ea --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/solve.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .cholesky import cholesky +from .lu import lu +from .solve_triangular import solve_triangular + + +def solve(a, b, sym_pos=False, sparse=None): + """ + Solve the equation ``a x = b`` for ``x``. 
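A small usage sketch for the `randomized_svd` defined above. The import path and the implicit default session are assumptions based on the docstring and test conventions in this diff; the docstrings use `mars.tensor`, while in this repository the package is vendored under `xorbits._mars`.

    import numpy as np
    import mars.tensor as mt
    from mars.tensor.linalg import randomized_svd  # path assumed from the test imports

    rs = np.random.RandomState(0)
    data = rs.randn(200, 5) @ rs.randn(5, 80)      # exactly rank-5 matrix

    M = mt.tensor(data, chunk_size=50)
    U, s, V = randomized_svd(M, n_components=5, random_state=0)

    # The recovered singular values should closely match the exact ones,
    # since the matrix has rank 5 and power iterations are applied.
    s_exact = np.linalg.svd(data, compute_uv=False)[:5]
    np.testing.assert_allclose(s.execute().fetch(), s_exact, rtol=1e-6)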
+ + Parameters + ---------- + a : (M, M) array_like + A square matrix. + b : (M,) or (M, N) array_like + Right-hand side matrix in ``a x = b``. + sym_pos : bool + Assume `a` is symmetric and positive definite. If ``True``, use Cholesky + decomposition. + sparse: bool, optional + Return sparse value or not. + + Returns + ------- + x : (M,) or (M, N) ndarray + Solution to the system ``a x = b``. Shape of the return matches the + shape of `b`. + + Raises + ------ + LinAlgError + If `a` is singular. + + Examples + -------- + Given `a` and `b`, solve for `x`: + + >>> import mars.tensor as mt + >>> a = mt.array([[3, 2, 0], [1, -1, 0], [0, 5, 1]]) + >>> b = mt.array([2, 4, -1]) + >>> x = mt.linalg.solve(a, b) + >>> x.execute() + array([ 2., -2., 9.]) + + >>> mt.dot(a, x).execute() # Check the result + array([ 2., 4., -1.]) + """ + a = astensor(a) + b = astensor(b) + if sym_pos: + l_ = cholesky(a, lower=True) + u = l_.T + else: + p, l_, u = lu(a) + b = p.T.dot(b) + sparse = sparse if sparse is not None else a.issparse() + uy = solve_triangular(l_, b, lower=True, sparse=sparse) + return solve_triangular(u, uy, sparse=sparse) diff --git a/python/xorbits/_mars/tensor/linalg/solve_triangular.py b/python/xorbits/_mars/tensor/linalg/solve_triangular.py new file mode 100644 index 000000000..c1ce4e83a --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/solve_triangular.py @@ -0,0 +1,234 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... 
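For reference, the factor-then-substitute scheme that `solve` above builds lazily can be written directly with SciPy (Cholesky takes the place of LU when `sym_pos=True`); this is a sketch of the numerical idea, not the distributed implementation.

    import numpy as np
    import scipy.linalg

    rng = np.random.default_rng(0)
    a = rng.integers(1, 10, (5, 5)).astype(float)
    b = rng.integers(1, 10, 5).astype(float)

    p, l, u = scipy.linalg.lu(a)                               # a == p @ l @ u
    y = scipy.linalg.solve_triangular(l, p.T @ b, lower=True)  # forward substitution
    x = scipy.linalg.solve_triangular(u, y)                    # backward substitution

    np.testing.assert_allclose(a @ x, b)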
import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import BoolField, KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, cp, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import decide_unify_split + + +class TensorSolveTriangular(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.SOLVE_TRIANGULAR + + _a = KeyField("a") + _b = KeyField("b") + _lower = BoolField("lower") + _strict = BoolField("strict") + + def __init__(self, lower=None, strict=None, **kw): + super().__init__(_lower=lower, _strict=strict, **kw) + + @property + def a(self): + return self._a + + @property + def b(self): + return self._b + + @property + def lower(self): + return self._lower + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a, self._b = self._inputs + + def __call__(self, a, b): + shape = (a.shape[1],) if len(b.shape) == 1 else (a.shape[1], b.shape[1]) + return self.new_tensor([a, b], shape, order=TensorOrder.F_ORDER) + + @property + def strict(self): + return self._strict + + @classmethod + def tile(cls, op): + from ..arithmetic.subtract import TensorSubtract + from ..arithmetic.utils import chunk_tree_add + from .dot import TensorDot + + if has_unknown_shape(*op.inputs): + yield + + a, b = op.a, op.b + unified_nsplit = decide_unify_split(a.nsplits[0], a.nsplits[1], b.nsplits[0]) + a = yield from recursive_tile(a.rechunk((unified_nsplit, unified_nsplit))) + b = yield from recursive_tile(b.rechunk((unified_nsplit,) + b.nsplits[1:])) + + b_multi_dim = b.ndim > 1 + b_hsplits = b.chunk_shape[1] if b_multi_dim else 1 + + def _x_shape(a_shape, b_shape): + return (a_shape[1],) if len(b_shape) == 1 else (a_shape[1], b_shape[1]) + + def _dot_shape(a_shape, b_shape): + return (a_shape[0],) if len(b_shape) == 1 else (a_shape[0], b_shape[1]) + + lower = op.lower + out_chunks = {} + if lower: + i_range = range(a.chunk_shape[0]) + else: + i_range = range(a.chunk_shape[0] - 1, -1, -1) + for i in i_range: + target_a = a.cix[i, i] + for j in range(b_hsplits): + idx = (i, j) if b_multi_dim else (i,) + target_b = b.cix[idx] + if (lower and i > 0) or (not lower and i < a.chunk_shape[0] - 1): + prev_chunks = [] + if lower: + k_range = range(i) + else: + k_range = range(i + 1, a.chunk_shape[0]) + for k in k_range: + a_chunk, b_chunk = ( + a.cix[i, k], + out_chunks[(k, j) if b_multi_dim else (k,)], + ) + prev_chunk = TensorDot( + dtype=op.dtype, sparse=a_chunk.issparse() + ).new_chunk( + [a_chunk, b_chunk], + shape=_dot_shape(a_chunk.shape, b_chunk.shape), + ) + prev_chunks.append(prev_chunk) + if len(prev_chunks) == 1: + s = prev_chunks[0] + else: + s = chunk_tree_add( + prev_chunks[0].dtype, + prev_chunks, + None, + prev_chunks[0].shape, + sparse=op.sparse, + ) + target_b = TensorSubtract( + dtype=op.dtype, lhs=target_b, rhs=s + ).new_chunk([target_b, s], shape=target_b.shape) + out_chunk = TensorSolveTriangular( + lower=lower, sparse=op.sparse, dtype=op.dtype + ).new_chunk( + [target_a, target_b], + shape=_x_shape(target_a.shape, target_b.shape), + index=idx, + order=op.outputs[0].order, + ) + out_chunks[out_chunk.index] = out_chunk + + new_op = op.copy() + nsplits = (a.nsplits[0],) if b.ndim == 1 else (a.nsplits[0], b.nsplits[1]) + return new_op.new_tensors( + op.inputs, + op.outputs[0].shape, + chunks=list(out_chunks.values()), + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + 
(a, b), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + chunk = op.outputs[0] + with device(device_id): + if xp is np: + import scipy.linalg + + try: + ctx[chunk.key] = scipy.linalg.solve_triangular(a, b, lower=op.lower) + except np.linalg.LinAlgError: + if op.strict is not False: + raise + ctx[chunk.key] = np.linalg.lstsq(a, b, rcond=-1)[0] + elif xp is cp: + import cupyx + + ctx[chunk.key] = cupyx.scipy.linalg.solve_triangular( + a, b, lower=op.lower + ) + else: + ctx[chunk.key] = xp.solve_triangular( + a, b, lower=op.lower, sparse=op.sparse + ) + + +def solve_triangular(a, b, lower=False, sparse=None): + """ + Solve the equation `a x = b` for `x`, assuming a is a triangular matrix. + + Parameters + ---------- + a : (M, M) array_like + A triangular matrix + b : (M,) or (M, N) array_like + Right-hand side matrix in `a x = b` + lower : bool, optional + Use only data contained in the lower triangle of `a`. + Default is to use upper triangle. + sparse: bool, optional + Return sparse value or not. + + Returns + ------- + x : (M,) or (M, N) ndarray + Solution to the system `a x = b`. Shape of return matches `b`. + + Examples + -------- + Solve the lower triangular system a x = b, where:: + [3 0 0 0] [4] + a = [2 1 0 0] b = [2] + [1 0 1 0] [4] + [1 1 1 1] [2] + + >>> import mars.tensor as mt + >>> a = mt.array([[3, 0, 0, 0], [2, 1, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]]) + >>> b = mt.array([4, 2, 4, 2]) + >>> x = mt.linalg.solve_triangular(a, b, lower=True) + >>> x.execute() + array([ 1.33333333, -0.66666667, 2.66666667, -1.33333333]) + + >>> a.dot(x).execute() # Check the result + array([ 4., 2., 4., 2.]) + """ + import scipy.linalg + + a = astensor(a) + b = astensor(b) + + if a.ndim != 2: + raise LinAlgError("a must be 2 dimensional") + if b.ndim <= 2: + if a.shape[1] != b.shape[0]: + raise LinAlgError("a.shape[1] and b.shape[0] must be equal") + else: + raise LinAlgError("b must be 1 or 2 dimensional") + + tiny_x = scipy.linalg.solve_triangular( + np.array([[2, 0], [2, 1]], dtype=a.dtype), np.array([[2], [3]], dtype=b.dtype) + ) + sparse = sparse if sparse is not None else a.issparse() + op = TensorSolveTriangular(lower=lower, dtype=tiny_x.dtype, sparse=sparse) + return op(a, b) diff --git a/python/xorbits/_mars/tensor/linalg/svd.py b/python/xorbits/_mars/tensor/linalg/svd.py new file mode 100644 index 000000000..ec1919dca --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/svd.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... 
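The chunked forward/backward substitution that `TensorSolveTriangular.tile` expresses as an operator graph can be sketched serially in NumPy; below is a minimal lower-triangular, single-block-column version for intuition only.

    import numpy as np
    import scipy.linalg

    rng = np.random.default_rng(0)
    n, blk = 6, 3
    A = np.tril(rng.integers(1, 10, (n, n))).astype(float)     # lower triangular
    b = rng.integers(1, 10, n).astype(float)

    x = np.empty_like(b)
    for i in range(0, n, blk):
        # Subtract contributions of already-solved blocks, then solve the
        # diagonal block (the role played by the recursive chunk op above).
        rhs = b[i:i + blk] - A[i:i + blk, :i] @ x[:i]
        x[i:i + blk] = scipy.linalg.solve_triangular(
            A[i:i + blk, i:i + blk], rhs, lower=True
        )

    np.testing.assert_allclose(A @ x, b)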
import opcodes as OperandDef +from ...core import ExecutableTuple +from ...serialization.serializables import KeyField, StringField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from .core import TSQR +from .utils import calc_svd_shapes + + +class TensorSVD(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.SVD + + _input = KeyField("input") + _method = StringField("method") + + def __init__(self, method=None, **kw): + super().__init__(_method=method, **kw) + + @property + def method(self): + return self._method + + @property + def output_limit(self): + return 3 + + @classmethod + def _is_svd(cls): + return True + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + a = astensor(a) + + if a.ndim != 2: + raise LinAlgError( + f"{a.ndim}-dimensional tensor given. Tensor must be two-dimensional" + ) + + tiny_U, tiny_s, tiny_V = np.linalg.svd(np.ones((1, 1), dtype=a.dtype)) + + # if a's shape is (6, 18), U's shape is (6, 6), s's shape is (6,), V's shape is (6, 18) + # if a's shape is (18, 6), U's shape is (18, 6), s's shape is (6,), V's shape is (6, 6) + U_shape, s_shape, V_shape = calc_svd_shapes(a) + U, s, V = self.new_tensors( + [a], + order=TensorOrder.C_ORDER, + kws=[ + {"side": "U", "dtype": tiny_U.dtype, "shape": U_shape}, + {"side": "s", "dtype": tiny_s.dtype, "shape": s_shape}, + {"side": "V", "dtype": tiny_V.dtype, "shape": V_shape}, + ], + ) + return ExecutableTuple([U, s, V]) + + @classmethod + def tile(cls, op): + U, s, V = op.outputs + U_dtype, s_dtype, V_dtype = U.dtype, s.dtype, V.dtype + U_shape, s_shape, V_shape = U.shape, s.shape, V.shape + in_tensor = op.input + if in_tensor.chunk_shape == (1, 1): + in_chunk = in_tensor.chunks[0] + chunk_op = op.copy().reset_key() + svd_chunks = chunk_op.new_chunks( + [in_chunk], + kws=[ + { + "side": "U", + "dtype": U_dtype, + "index": in_chunk.index, + "shape": U_shape, + "order": U.order, + }, + { + "side": "s", + "dtype": s_dtype, + "index": in_chunk.index[1:], + "shape": s_shape, + "order": s.order, + }, + { + "side": "V", + "dtype": V_dtype, + "index": in_chunk.index, + "shape": V_shape, + "order": V.order, + }, + ], + ) + U_chunk, s_chunk, V_chunk = svd_chunks + + new_op = op.copy() + kws = [ + { + "chunks": [U_chunk], + "nsplits": tuple((s,) for s in U_shape), + "dtype": U_dtype, + "shape": U_shape, + }, + { + "chunks": [s_chunk], + "nsplits": tuple((s,) for s in s_shape), + "dtype": s_dtype, + "shape": s_shape, + }, + { + "chunks": [V_chunk], + "nsplits": tuple((s,) for s in V_shape), + "dtype": V_dtype, + "shape": V_shape, + }, + ] + return new_op.new_tensors(op.inputs, kws=kws) + elif op.method == "tsqr": + return (yield from TSQR.tile(op)) + else: + raise NotImplementedError("Only tsqr method supported for now") + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + u, s, v = xp.linalg.svd(a, full_matrices=False) + uc, sc, vc = op.outputs + ctx[uc.key] = u + ctx[sc.key] = s + ctx[vc.key] = v + + +def svd(a, method="tsqr"): + """ + Singular Value Decomposition. + + When `a` is a 2D tensor, it is factorized as ``u @ np.diag(s) @ vh + = (u * s) @ vh``, where `u` and `vh` are 2D unitary tensors and `s` is a 1D + tensor of `a`'s singular values. 
When `a` is higher-dimensional, SVD is + applied in stacked mode as explained below. + + Parameters + ---------- + a : (..., M, N) array_like + A real or complex tensor with ``a.ndim >= 2``. + method: {'tsqr'}, optional + method to calculate qr factorization, tsqr as default + + TSQR is presented in: + + A. Benson, D. Gleich, and J. Demmel. + Direct QR factorizations for tall-and-skinny matrices in + MapReduce architectures. + IEEE International Conference on Big Data, 2013. + http://arxiv.org/abs/1301.1071 + + + Returns + ------- + u : { (..., M, M), (..., M, K) } tensor + Unitary tensor(s). The first ``a.ndim - 2`` dimensions have the same + size as those of the input `a`. The size of the last two dimensions + depends on the value of `full_matrices`. Only returned when + `compute_uv` is True. + s : (..., K) tensor + Vector(s) with the singular values, within each vector sorted in + descending order. The first ``a.ndim - 2`` dimensions have the same + size as those of the input `a`. + vh : { (..., N, N), (..., K, N) } tensor + Unitary tensor(s). The first ``a.ndim - 2`` dimensions have the same + size as those of the input `a`. The size of the last two dimensions + depends on the value of `full_matrices`. Only returned when + `compute_uv` is True. + + Raises + ------ + LinAlgError + If SVD computation does not converge. + + Notes + ----- + + SVD is usually described for the factorization of a 2D matrix :math:`A`. + The higher-dimensional case will be discussed below. In the 2D case, SVD is + written as :math:`A = U S V^H`, where :math:`A = a`, :math:`U= u`, + :math:`S= \\mathtt{np.diag}(s)` and :math:`V^H = vh`. The 1D tensor `s` + contains the singular values of `a` and `u` and `vh` are unitary. The rows + of `vh` are the eigenvectors of :math:`A^H A` and the columns of `u` are + the eigenvectors of :math:`A A^H`. In both cases the corresponding + (possibly non-zero) eigenvalues are given by ``s**2``. + + If `a` has more than two dimensions, then broadcasting rules apply, as + explained in :ref:`routines.linalg-broadcasting`. This means that SVD is + working in "stacked" mode: it iterates over all indices of the first + ``a.ndim - 2`` dimensions and for each combination SVD is applied to the + last two indices. The matrix `a` can be reconstructed from the + decomposition with either ``(u * s[..., None, :]) @ vh`` or + ``u @ (s[..., None] * vh)``. (The ``@`` operator can be replaced by the + function ``mt.matmul`` for python versions below 3.5.) + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.random.randn(9, 6) + 1j*mt.random.randn(9, 6) + >>> b = mt.random.randn(2, 7, 8, 3) + 1j*mt.random.randn(2, 7, 8, 3) + + Reconstruction based on reduced SVD, 2D case: + + >>> u, s, vh = mt.linalg.svd(a) + >>> u.shape, s.shape, vh.shape + ((9, 6), (6,), (6, 6)) + >>> np.allclose(a, np.dot(u * s, vh)) + True + >>> smat = np.diag(s) + >>> np.allclose(a, np.dot(u, np.dot(smat, vh))) + True + + """ + op = TensorSVD(method=method) + return op(a) diff --git a/python/xorbits/_mars/tensor/linalg/tensordot.py b/python/xorbits/_mars/tensor/linalg/tensordot.py new file mode 100644 index 000000000..d31fedbad --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/tensordot.py @@ -0,0 +1,337 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
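The "tsqr" method referenced by the `svd` docstring above combines block QR factorizations of a tall-and-skinny matrix; here is a NumPy-only sketch of the idea (not the chunked implementation), showing how the reduced SVD falls out of the stacked R factors.

    import numpy as np

    rng = np.random.default_rng(0)
    a = rng.standard_normal((18, 6))
    blocks = np.split(a, 3, axis=0)                      # three (6, 6) row blocks

    # Local QR of every row block, then one more QR of the stacked R factors.
    qs, rs = zip(*(np.linalg.qr(blk) for blk in blocks))
    q2, r = np.linalg.qr(np.vstack(rs))
    q = np.vstack([qi @ q2[6 * i:6 * (i + 1)] for i, qi in enumerate(qs)])

    np.testing.assert_allclose(q @ r, a, atol=1e-10)     # q has orthonormal columns

    # SVD of the small R gives the reduced SVD of the full matrix.
    u_small, s, vh = np.linalg.svd(r, full_matrices=False)
    u = q @ u_small
    np.testing.assert_allclose(u @ np.diag(s) @ vh, a, atol=1e-10)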
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections.abc import Iterable + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, KeyField, TupleField +from ...utils import has_unknown_shape +from ..arithmetic.utils import chunk_tree_add +from ..array_utils import as_same_device, device, is_sparse_module +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import unify_chunks + + +class TensorTensorDot(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.TENSORDOT + + _a = KeyField("a") + _b = KeyField("b") + _a_axes = TupleField("a_axes", FieldTypes.int32) + _b_axes = TupleField("b_axes", FieldTypes.int32) + + def __init__(self, a_axes=None, b_axes=None, **kw): + super().__init__(_a_axes=a_axes, _b_axes=b_axes, **kw) + + @property + def a(self): + return self._a + + @property + def b(self): + return self._b + + @property + def a_axes(self): + return self._a_axes + + @property + def b_axes(self): + return self._b_axes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a = self._inputs[0] + self._b = self._inputs[1] + + def __call__(self, a, b): + shape = tuple( + s for i, s in enumerate(a.shape) if i not in set(self._a_axes) + ) + tuple(s for i, s in enumerate(b.shape) if i not in set(self._b_axes)) + return self.new_tensor([a, b], shape, order=TensorOrder.C_ORDER) + + @classmethod + def estimate_size(cls, ctx, op): + chunk = op.outputs[0] + if chunk.is_sparse(): + return super().estimate_size(ctx, op) + + # empirical value in real environments + calc_usage = chunk.nbytes + + # add input sizes when sparse-to-dense is needed + for inp in chunk.inputs: + if inp.is_sparse(): + calc_usage += inp.nbytes + + ctx[chunk.key] = (chunk.nbytes, calc_usage) + + @classmethod + def tile(cls, op): + a, b, a_axes, b_axes = op.a, op.b, op.a_axes, op.b_axes + + c = itertools.count(max(a.ndim, b.ndim)) + a_ax = tuple(a_axes.index(i) if i in a_axes else next(c) for i in range(a.ndim)) + b_ax = tuple(b_axes.index(i) if i in b_axes else next(c) for i in range(b.ndim)) + if has_unknown_shape(*op.inputs): + yield + a, b = yield from unify_chunks((a, a_ax), (b, b_ax)) + out = op.outputs[0] + + a_output_indexes = [ + range(len(a.nsplits[i])) for i in range(a.ndim) if i not in a_axes + ] + b_output_indexes = [ + range(len(b.nsplits[i])) for i in range(b.ndim) if i not in b_axes + ] + output_axes = [(0, i) for i in range(a.ndim) if i not in a_axes] + [ + (1, i) for i in range(b.ndim) if i not in b_axes + ] + + out_chunks = [] + for out_idx in itertools.product( + *itertools.chain(a_output_indexes, b_output_indexes) + ): + a_indexes = [None] * a.ndim + b_indexes = [None] * b.ndim + tensor_shape = [] + for i, idx in enumerate(out_idx): + t_idx, axis = output_axes[i] + t = (a, b)[t_idx] + (a_indexes if t_idx == 0 else b_indexes)[axis] = idx + tensor_shape.append(t.nsplits[axis][idx]) + tensor_shape = tuple(tensor_shape) + + tensordot_chunks = [] + for contract_indexes in itertools.product( + *[range(len(a.nsplits[ax])) for ax in a_axes] + ): 
+ a_indices, b_indices = list(a_indexes), list(b_indexes) + for a_axis, contract_index in zip(a_axes, contract_indexes): + a_indices[a_axis] = contract_index + for b_axis, contract_index in zip(b_axes, contract_indexes): + b_indices[b_axis] = contract_index + + tensordot_chunk_op = op.copy().reset_key() + tensordot_chunk = tensordot_chunk_op.new_chunk( + [a.cix[tuple(a_indices)], b.cix[tuple(b_indices)]], + shape=tensor_shape, + order=out.order, + ) + tensordot_chunks.append(tensordot_chunk) + + if len(tensordot_chunks) == 1: + c = tensordot_chunks[0] + chunk_op = c.op.copy() + chunk = chunk_op.new_chunk( + c.inputs, shape=c.shape, index=out_idx, order=out.order + ) + else: + chunk = chunk_tree_add( + op.dtype, tensordot_chunks, out_idx, tensor_shape, sparse=op.sparse + ) + out_chunks.append(chunk) + + get_nsplits = lambda t_idx, i: (a, b)[t_idx].nsplits[i] + nsplits = [get_nsplits(*it) for it in output_axes] + new_op = op.copy() + return new_op.new_tensors([a, b], out.shape, chunks=out_chunks, nsplits=nsplits) + + @classmethod + def execute(cls, ctx, op): + (a, b), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axes = op.a_axes, op.b_axes + with device(device_id): + if not op.sparse and is_sparse_module(xp): + # tell sparse to do calculation on numpy or cupy dot + ctx[op.outputs[0].key] = xp.tensordot(a, b, axes, sparse=False) + else: + ret = xp.tensordot(a, b, axes) + out = op.outputs[0] + ctx[out.key] = ret.astype(ret.dtype, order=out.order.value, copy=False) + + +def tensordot(a, b, axes=2, sparse=None): + """ + Compute tensor dot product along specified axes for tensors >= 1-D. + + Given two tensors (arrays of dimension greater than or equal to one), + `a` and `b`, and an array_like object containing two array_like + objects, ``(a_axes, b_axes)``, sum the products of `a`'s and `b`'s + elements (components) over the axes specified by ``a_axes`` and + ``b_axes``. The third argument can be a single non-negative + integer_like scalar, ``N``; if it is such, then the last ``N`` + dimensions of `a` and the first ``N`` dimensions of `b` are summed + over. + + Parameters + ---------- + a, b : array_like, len(shape) >= 1 + Tensors to "dot". + axes : int or (2,) array_like + * integer_like + If an int N, sum over the last N axes of `a` and the first N axes + of `b` in order. The sizes of the corresponding axes must match. + * (2,) array_like + Or, a list of axes to be summed over, first sequence applying to `a`, + second to `b`. Both elements array_like must be of the same length. + + See Also + -------- + dot, einsum + + Notes + ----- + Three common use cases are: + + * ``axes = 0`` : tensor product :math:`a\\otimes b` + * ``axes = 1`` : tensor dot product :math:`a\\cdot b` + * ``axes = 2`` : (default) tensor double contraction :math:`a:b` + + When `axes` is integer_like, the sequence for evaluation will be: first + the -Nth axis in `a` and 0th axis in `b`, and the -1th axis in `a` and + Nth axis in `b` last. + + When there is more than one axis to sum over - and they are not the last + (first) axes of `a` (`b`) - the argument `axes` should consist of + two sequences of the same length, with the first axis to sum over given + first in both sequences, the second axis second, and so forth. 
+ + Examples + -------- + >>> import mars.tensor as mt + + A "traditional" example: + + >>> a = mt.arange(60.).reshape(3,4,5) + >>> b = mt.arange(24.).reshape(4,3,2) + >>> c = mt.tensordot(a,b, axes=([1,0],[0,1])) + >>> c.shape + (5, 2) + + >>> r = c.execute() + >>> r + array([[ 4400., 4730.], + [ 4532., 4874.], + [ 4664., 5018.], + [ 4796., 5162.], + [ 4928., 5306.]]) + + >>> # A slower but equivalent way of computing the same... + >>> ra = np.arange(60.).reshape(3,4,5) + >>> rb = np.arange(24.).reshape(4,3,2) + >>> d = np.zeros((5,2)) + >>> for i in range(5): + ... for j in range(2): + ... for k in range(3): + ... for n in range(4): + ... d[i,j] += ra[k,n,i] * rb[n,k,j] + >>> r == d + array([[ True, True], + [ True, True], + [ True, True], + [ True, True], + [ True, True]], dtype=bool) + + An extended example taking advantage of the overloading of + and \\*: + + >>> a = mt.array(range(1, 9)) + >>> a.shape = (2, 2, 2) + >>> A = mt.array(('a', 'b', 'c', 'd'), dtype=object) + >>> A.shape = (2, 2) + >>> a.execute(); A.execute() + array([[[1, 2], + [3, 4]], + [[5, 6], + [7, 8]]]) + array([[a, b], + [c, d]], dtype=object) + + >>> mt.tensordot(a, A).execute() # third argument default is 2 for double-contraction + array([abbcccdddd, aaaaabbbbbbcccccccdddddddd], dtype=object) + + >>> mt.tensordot(a, A, 1).execute() + array([[[acc, bdd], + [aaacccc, bbbdddd]], + [[aaaaacccccc, bbbbbdddddd], + [aaaaaaacccccccc, bbbbbbbdddddddd]]], dtype=object) + + >>> mt.tensordot(a, A, 0).execute() # tensor product (result too long to incl.) + array([[[[[a, b], + [c, d]], + ... + + >>> mt.tensordot(a, A, (0, 1)).execute() + array([[[abbbbb, cddddd], + [aabbbbbb, ccdddddd]], + [[aaabbbbbbb, cccddddddd], + [aaaabbbbbbbb, ccccdddddddd]]], dtype=object) + + >>> mt.tensordot(a, A, (2, 1)).execute() + array([[[abb, cdd], + [aaabbbb, cccdddd]], + [[aaaaabbbbbb, cccccdddddd], + [aaaaaaabbbbbbbb, cccccccdddddddd]]], dtype=object) + + >>> mt.tensordot(a, A, ((0, 1), (0, 1))).execute() + array([abbbcccccddddddd, aabbbbccccccdddddddd], dtype=object) + + >>> mt.tensordot(a, A, ((2, 1), (1, 0))).execute() + array([acccbbdddd, aaaaacccccccbbbbbbdddddddd], dtype=object) + """ + a = astensor(a) + b = astensor(b) + + if isinstance(axes, Iterable): + a_axes, b_axes = axes + else: + a_axes = tuple(range(a.ndim - 1, a.ndim - axes - 1, -1)) + b_axes = tuple(range(0, axes)) + + if isinstance(a_axes, Iterable): + a_axes = tuple(a_axes) + else: + a_axes = (a_axes,) + a_axes = tuple(axis if axis >= 0 else a.ndim + axis for axis in a_axes) + if isinstance(b_axes, Iterable): + b_axes = tuple(b_axes) + else: + b_axes = (b_axes,) + b_axes = tuple(axis if axis >= 0 else b.ndim + axis for axis in b_axes) + + if ( + a.shape + and b.shape + and not np.array_equal( + np.array(a.shape)[list(a_axes)], np.array(b.shape)[list(b_axes)] + ) + ): + raise ValueError("shape-mismatch for sum") + + sparse = sparse if sparse is not None else a.issparse() and b.issparse() + op = TensorTensorDot( + a_axes=a_axes, + b_axes=b_axes, + dtype=np.promote_types(a.dtype, b.dtype), + sparse=sparse, + ) + return op(a, b) diff --git a/python/xorbits/_mars/tensor/linalg/tests/__init__.py b/python/xorbits/_mars/tensor/linalg/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
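A quick cross-check of the axes semantics documented for `tensordot` above against NumPy; the `mars.tensor` import and the implicit default session follow the docstring examples and are assumptions here (in this repository the package lives under `xorbits._mars`).

    import numpy as np
    import mars.tensor as mt

    ra = np.arange(60.).reshape(3, 4, 5)
    rb = np.arange(24.).reshape(4, 3, 2)

    # Sequence form: contract axes (1, 0) of `a` with axes (0, 1) of `b`.
    c = mt.tensordot(mt.tensor(ra, chunk_size=2), mt.tensor(rb, chunk_size=2),
                     axes=([1, 0], [0, 1]))
    np.testing.assert_allclose(c.execute().fetch(),
                               np.tensordot(ra, rb, axes=([1, 0], [0, 1])))

    # Integer form: axes=1 contracts the last axis of `a` with the first of `b`.
    m1, m2 = np.arange(12.).reshape(3, 4), np.arange(20.).reshape(4, 5)
    d = mt.tensordot(mt.tensor(m1), mt.tensor(m2), axes=1)
    np.testing.assert_allclose(d.execute().fetch(), m1 @ m2)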
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/linalg/tests/test_linalg.py b/python/xorbits/_mars/tensor/linalg/tests/test_linalg.py new file mode 100644 index 000000000..10fcea1ef --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/tests/test_linalg.py @@ -0,0 +1,481 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from .... import tensor as mt +from ....core import tile +from ... import dot, empty, ones, tensor +from ...core import SparseTensor, Tensor +from .. import matmul +from ..inv import TensorInv + + +def test_qr(): + a = mt.random.rand(9, 6, chunk_size=(3, 6)) + q, r = mt.linalg.qr(a) + + assert q.shape == (9, 6) + assert r.shape == (6, 6) + + q, r = tile(q, r) + + assert len(q.chunks) == 3 + assert len(r.chunks) == 1 + assert q.nsplits == ((3, 3, 3), (6,)) + assert r.nsplits == ((6,), (6,)) + + assert q.chunks[0].shape == (3, 6) + assert q.chunks[0].inputs[0].shape == (3, 3) + assert q.chunks[0].inputs[1].shape == (3, 6) + + a = mt.random.rand(18, 6, chunk_size=(9, 6)) + q, r = mt.linalg.qr(a) + + assert q.shape == (18, 6) + assert r.shape == (6, 6) + + q, r = tile(q, r) + + assert len(q.chunks) == 2 + assert len(r.chunks) == 1 + assert q.nsplits == ((9, 9), (6,)) + assert r.nsplits == ((6,), (6,)) + + assert q.chunks[0].shape == (9, 6) + assert q.chunks[0].inputs[0].shape == (9, 6) + assert q.chunks[0].inputs[1].shape == (6, 6) + + # for Short-and-Fat QR + a = mt.random.rand(6, 18, chunk_size=(6, 6)) + q, r = mt.linalg.qr(a, method="sfqr") + + assert q.shape == (6, 6) + assert r.shape == (6, 18) + + q, r = tile(q, r) + + assert len(q.chunks) == 1 + assert len(r.chunks) == 3 + assert q.nsplits == ((6,), (6,)) + assert r.nsplits == ((6,), (6, 6, 6)) + + # chunk width less than height + a = mt.random.rand(6, 9, chunk_size=(6, 3)) + q, r = mt.linalg.qr(a, method="sfqr") + + assert q.shape == (6, 6) + assert r.shape == (6, 9) + + q, r = tile(q, r) + + assert len(q.chunks) == 1 + assert len(r.chunks) == 2 + assert q.nsplits == ((6,), (6,)) + assert r.nsplits == ((6,), (6, 3)) + + a = mt.random.rand(9, 6, chunk_size=(9, 3)) + q, r = mt.linalg.qr(a, method="sfqr") + + assert q.shape == (9, 6) + assert r.shape == (6, 6) + + q, r = tile(q, r) + + assert len(q.chunks) == 1 + assert len(r.chunks) == 1 + assert q.nsplits == ((9,), (6,)) + assert r.nsplits == ((6,), (6,)) + + +def test_norm(): + data = np.random.rand(9, 6) + + a = mt.tensor(data, chunk_size=(2, 6)) + + for ord in (None, "nuc", np.inf, -np.inf, 0, 1, -1, 2, -2): + for axis in (0, 1, (0, 1)): + for keepdims in (True, 
False): + try: + res = mt.linalg.norm(a, ord=ord, axis=axis, keepdims=keepdims) + expect_shape = np.linalg.norm( + data, ord=ord, axis=axis, keepdims=keepdims + ).shape + assert res.shape == expect_shape + except ValueError: + continue + + +def test_svd(): + a = mt.random.rand(9, 6, chunk_size=(3, 6)) + U, s, V = mt.linalg.svd(a) + + assert U.shape == (9, 6) + assert s.shape == (6,) + assert V.shape == (6, 6) + + U, s, V = tile(U, s, V) + + assert len(U.chunks) == 3 + assert U.chunks[0].shape == (3, 6) + assert len(s.chunks) == 1 + assert s.chunks[0].shape == (6,) + assert len(V.chunks) == 1 + assert V.chunks[0].shape == (6, 6) + + assert U.chunks[0].inputs[0].shape == (3, 6) + assert U.chunks[0].inputs[0].inputs[0].shape == (3, 3) + assert U.chunks[0].inputs[0].inputs[1].shape == (3, 6) + + assert s.ndim == 1 + assert len(s.chunks[0].index) == 1 + + a = mt.random.rand(9, 6, chunk_size=(9, 6)) + U, s, V = mt.linalg.svd(a) + + assert U.shape == (9, 6) + assert s.shape == (6,) + assert V.shape == (6, 6) + + U, s, V = tile(U, s, V) + + assert len(U.chunks) == 1 + assert U.chunks[0].shape == (9, 6) + assert len(s.chunks) == 1 + assert s.chunks[0].shape == (6,) + assert len(V.chunks) == 1 + assert V.chunks[0].shape == (6, 6) + + assert s.ndim == 1 + assert len(s.chunks[0].index) == 1 + + a = mt.random.rand(6, 20, chunk_size=10) + U, s, V = mt.linalg.svd(a) + + assert U.shape == (6, 6) + assert s.shape == (6,) + assert V.shape == (6, 20) + + U, s, V = tile(U, s, V) + + assert len(U.chunks) == 1 + assert U.chunks[0].shape == (6, 6) + assert len(s.chunks) == 1 + assert s.chunks[0].shape == (6,) + assert len(V.chunks) == 1 + assert V.chunks[0].shape == (6, 20) + + a = mt.random.rand(6, 9, chunk_size=(6, 9)) + U, s, V = mt.linalg.svd(a) + + assert U.shape == (6, 6) + assert s.shape == (6,) + assert V.shape == (6, 9) + + rs = mt.random.RandomState(1) + + a = rs.rand(20, 10, chunk_size=10) + _, s, _ = mt.linalg.svd(a) + del _ + graph = s.build_graph() + assert len(graph) == 4 + + +def test_lu(): + a = mt.random.randint(1, 10, (6, 6), chunk_size=3) + p, l_, u = mt.linalg.lu(a) + + p, l_, u = tile(p, l_, u) + + assert l_.shape == (6, 6) + assert u.shape == (6, 6) + assert p.shape == (6, 6) + + a = mt.random.randint(1, 10, (6, 6), chunk_size=(3, 2)) + p, l_, u = mt.linalg.lu(a) + p, l_, u = tile(p, l_, u) + + assert l_.shape == (6, 6) + assert u.shape == (6, 6) + assert p.shape == (6, 6) + + assert p.nsplits == ((3, 3), (3, 3)) + assert l_.nsplits == ((3, 3), (3, 3)) + assert u.nsplits == ((3, 3), (3, 3)) + + a = mt.random.randint(1, 10, (7, 7), chunk_size=4) + p, l_, u = mt.linalg.lu(a) + p, l_, u = tile(p, l_, u) + + assert l_.shape == (7, 7) + assert u.shape == (7, 7) + assert p.shape == (7, 7) + + assert p.nsplits == ((4, 3), (4, 3)) + assert l_.nsplits == ((4, 3), (4, 3)) + assert u.nsplits == ((4, 3), (4, 3)) + + a = mt.random.randint(1, 10, (7, 5), chunk_size=4) + p, l_, u = mt.linalg.lu(a) + p, l_, u = tile(p, l_, u) + + assert l_.shape == (7, 5) + assert u.shape == (5, 5) + assert p.shape == (7, 7) + + a = mt.random.randint(1, 10, (5, 7), chunk_size=4) + p, l_, u = mt.linalg.lu(a) + p, l_, u = tile(p, l_, u) + + assert l_.shape == (5, 5) + assert u.shape == (5, 7) + assert p.shape == (5, 5) + + # test sparse + data = sps.csr_matrix( + [ + [2, 0, 0, 0, 5, 2], + [0, 6, 1, 0, 0, 6], + [8, 0, 9, 0, 0, 2], + [0, 6, 0, 8, 7, 3], + [7, 0, 6, 1, 7, 0], + [0, 0, 0, 7, 0, 8], + ] + ) + t = mt.tensor(data, chunk_size=3) + p, l_, u = mt.linalg.lu(t) + + assert p.op.sparse is True + assert isinstance(p, 
SparseTensor) + assert l_.op.sparse is True + assert isinstance(l_, SparseTensor) + assert u.op.sparse is True + assert isinstance(u, SparseTensor) + + p, l_, u = tile(p, l_, u) + + assert all(c.is_sparse() for c in p.chunks) is True + assert all(c.is_sparse() for c in l_.chunks) is True + assert all(c.is_sparse() for c in u.chunks) is True + + +def test_solve(): + a = mt.random.randint(1, 10, (20, 20)) + b = mt.random.randint(1, 10, (20,)) + x = tile(mt.linalg.solve(a, b)) + + assert x.shape == (20,) + + a = mt.random.randint(1, 10, (20, 20), chunk_size=5) + b = mt.random.randint(1, 10, (20, 3), chunk_size=5) + x = tile(mt.linalg.solve(a, b)) + + assert x.shape == (20, 3) + + a = mt.random.randint(1, 10, (20, 20), chunk_size=12) + b = mt.random.randint(1, 10, (20, 3)) + x = tile(mt.linalg.solve(a, b)) + + assert x.shape == (20, 3) + assert x.nsplits == ((12, 8), (3,)) + + # test sparse + a = sps.csr_matrix(np.random.randint(1, 10, (20, 20))) + b = mt.random.randint(1, 10, (20,), chunk_size=3) + x = tile(mt.linalg.solve(a, b)) + + assert x.shape == (20,) + assert x.op.sparse is True + assert x.chunks[0].op.sparse is True + + a = mt.tensor(a, chunk_size=7) + b = mt.random.randint(1, 10, (20,)) + x = tile(mt.linalg.solve(a, b)) + + assert x.shape == (20,) + assert x.nsplits == ((7, 7, 6),) + + x = tile(mt.linalg.solve(a, b, sparse=False)) + assert x.op.sparse is False + assert x.chunks[0].op.sparse is False + + +def test_inv(): + a = mt.random.randint(1, 10, (20, 20), chunk_size=8) + a_inv = tile(mt.linalg.inv(a)) + + assert a_inv.shape == (20, 20) + + # test 1 chunk + a = mt.random.randint(1, 10, (20, 20), chunk_size=20) + a_inv = tile(mt.linalg.inv(a)) + + assert a_inv.shape == (20, 20) + assert len(a_inv.chunks) == 1 + assert isinstance(a_inv.chunks[0].op, TensorInv) + + a = mt.random.randint(1, 10, (20, 20), chunk_size=11) + a_inv = tile(mt.linalg.inv(a)) + + assert a_inv.shape == (20, 20) + assert a_inv.nsplits == ((11, 9), (11, 9)) + + b = a.T.dot(a) + b_inv = tile(mt.linalg.inv(b)) + assert b_inv.shape == (20, 20) + + # test sparse + data = sps.csr_matrix(np.random.randint(1, 10, (20, 20))) + a = mt.tensor(data, chunk_size=10) + a_inv = tile(mt.linalg.inv(a)) + + assert a_inv.shape == (20, 20) + + assert a_inv.op.sparse is True + assert isinstance(a_inv, SparseTensor) + assert all(c.is_sparse() for c in a_inv.chunks) is True + + b = a.T.dot(a) + b_inv = tile(mt.linalg.inv(b)) + assert b_inv.shape == (20, 20) + + assert b_inv.op.sparse is True + assert isinstance(b_inv, SparseTensor) + assert all(c.is_sparse() for c in b_inv.chunks) is True + + b_inv = tile(mt.linalg.inv(b, sparse=False)) + assert b_inv.op.sparse is False + assert not all(c.is_sparse() for c in b_inv.chunks) is True + + +def test_tensordot(): + from .. 
import dot, inner, tensordot + + t1 = ones((3, 4, 6), chunk_size=2) + t2 = ones((4, 3, 5), chunk_size=2) + t3 = tensordot(t1, t2, axes=((0, 1), (1, 0))) + + assert t3.shape == (6, 5) + + t3 = tile(t3) + + assert t3.shape == (6, 5) + assert len(t3.chunks) == 9 + + a = ones((10000, 20000), chunk_size=5000) + b = ones((20000, 1000), chunk_size=5000) + + with pytest.raises(ValueError): + tensordot(a, b) + + a = ones(10, chunk_size=2) + b = ones((10, 20), chunk_size=2) + c = dot(a, b) + assert c.shape == (20,) + c = tile(c) + assert c.shape == tuple(sum(s) for s in c.nsplits) + + a = ones((10, 20), chunk_size=2) + b = ones(20, chunk_size=2) + c = dot(a, b) + assert c.shape == (10,) + c = tile(c) + assert c.shape == tuple(sum(s) for s in c.nsplits) + + v = ones((100, 100), chunk_size=10) + tv = v.dot(v) + assert tv.shape == (100, 100) + tv = tile(tv) + assert tv.shape == tuple(sum(s) for s in tv.nsplits) + + a = ones((10, 20), chunk_size=2) + b = ones((30, 20), chunk_size=2) + c = inner(a, b) + assert c.shape == (10, 30) + c = tile(c) + assert c.shape == tuple(sum(s) for s in c.nsplits) + + +def test_dot(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + t2 = t1.T + + assert t1.dot(t2).issparse() is True + assert type(t1.dot(t2)) is SparseTensor + assert t1.dot(t2, sparse=False).issparse() is False + assert type(t1.dot(t2, sparse=False)) is Tensor + + with pytest.raises(TypeError): + dot(t1, t2, out=1) + + with pytest.raises(ValueError): + dot(t1, t2, empty((3, 6))) + + with pytest.raises(ValueError): + dot(t1, t2, empty((3, 3), dtype="i4")) + + with pytest.raises(ValueError): + dot(t1, t2, empty((3, 3), order="F")) + + t1.dot(t2, out=empty((2, 2), dtype=t1.dtype)) + + +def test_matmul(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + t2 = t1.T + + t3 = matmul(t1, t2, out=empty((2, 2), dtype=t1.dtype, order="F")) + assert t3.order.value == "F" + + with pytest.raises(TypeError): + matmul(t1, t2, out=1) + + with pytest.raises(TypeError): + matmul(t1, t2, out=empty((2, 2), dtype="?")) + + with pytest.raises(ValueError): + matmul(t1, t2, out=empty((3, 2), dtype=t1.dtype)) + + raw1 = np.asfortranarray(np.random.rand(3, 3)) + raw2 = np.asfortranarray(np.random.rand(3, 3)) + raw3 = np.random.rand(3, 3) + + assert ( + matmul(tensor(raw1), tensor(raw2)).flags["C_CONTIGUOUS"] + == np.matmul(raw1, raw2).flags["C_CONTIGUOUS"] + ) + assert ( + matmul(tensor(raw1), tensor(raw2)).flags["F_CONTIGUOUS"] + == np.matmul(raw1, raw2).flags["F_CONTIGUOUS"] + ) + + assert ( + matmul(tensor(raw1), tensor(raw2), order="A").flags["C_CONTIGUOUS"] + == np.matmul(raw1, raw2, order="A").flags["C_CONTIGUOUS"] + ) + assert ( + matmul(tensor(raw1), tensor(raw2), order="A").flags["F_CONTIGUOUS"] + == np.matmul(raw1, raw2, order="A").flags["F_CONTIGUOUS"] + ) + + assert ( + matmul(tensor(raw1), tensor(raw3), order="A").flags["C_CONTIGUOUS"] + == np.matmul(raw1, raw3, order="A").flags["C_CONTIGUOUS"] + ) + assert ( + matmul(tensor(raw1), tensor(raw3), order="A").flags["F_CONTIGUOUS"] + == np.matmul(raw1, raw3, order="A").flags["F_CONTIGUOUS"] + ) diff --git a/python/xorbits/_mars/tensor/linalg/tests/test_linalg_execution.py b/python/xorbits/_mars/tensor/linalg/tests/test_linalg_execution.py new file mode 100644 index 000000000..e459c0a5a --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/tests/test_linalg_execution.py @@ -0,0 +1,991 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import scipy.sparse as sps + +from ....learn.datasets.samples_generator import make_low_rank_matrix +from ....lib.sparse import SparseNDArray, issparse +from ....utils import ignore_warning +from ...datasource import arange, diag, ones, tensor +from ...random import uniform +from .. import ( + cholesky, + dot, + inner, + inv, + lu, + matmul, + norm, + qr, + randomized_svd, + solve, + solve_triangular, + svd, + tensordot, + vdot, +) + + +def test_qr_execution(setup): + rs = np.random.RandomState(0) + data = rs.randn(18, 6) + + a = tensor(data, chunk_size=(3, 6)) + q, r = qr(a) + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=(9, 6)) + q, r = qr(a) + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=3) + q, r = qr(a) + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + # test for Short-and-Fat QR + data = rs.randn(6, 18) + + a = tensor(data, chunk_size=(6, 9)) + q, r = qr(a, method="sfqr") + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=(3, 3)) + q, r = qr(a, method="sfqr") + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=(6, 3)) + q, r = qr(a, method="sfqr") + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + +def test_svd_execution(setup): + rs = np.random.RandomState() + data = rs.randn(18, 6) + 1j * rs.randn(18, 6) + + a = tensor(data, chunk_size=(9, 6)) + U, s, V = svd(a) + t = U.dot(diag(s).dot(V)) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=(18, 6)) + U, s, V = svd(a) + t = U.dot(diag(s).dot(V)) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=(2, 6)) + U, s, V = svd(a) + t = U.dot(diag(s).dot(V)) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + data = rs.randn(6, 18) + 1j * rs.randn(6, 18) + + a = tensor(data) + U, s, V = svd(a) + t = U.dot(diag(s).dot(V)) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + # test for matrix of ones + data = np.ones((20, 10)) + + a = tensor(data, chunk_size=10) + s = svd(a)[1] + res = s.execute().fetch() + expected = np.linalg.svd(a)[1] + np.testing.assert_array_almost_equal(res, expected) + + +def test_randomized_svd_execution(setup): + n_samples = 100 + n_features = 500 + rank = 5 + k = 10 + for dtype in (np.int64, np.float64): + # generate a matrix X of approximate effective rank `rank` and no noise + # component (very structured signal): + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + effective_rank=rank, + tail_strength=0.0, + random_state=0, + ).astype(dtype, 
copy=False) + assert X.shape == (n_samples, n_features) + dtype = np.dtype(dtype) + decimal = 5 if dtype == np.float32 else 7 + + # compute the singular values of X using the slow exact method + X_res = X.execute().fetch() + U, s, V = np.linalg.svd(X_res, full_matrices=False) + + # Convert the singular values to the specific dtype + U = U.astype(dtype, copy=False) + s = s.astype(dtype, copy=False) + V = V.astype(dtype, copy=False) + + for normalizer in ["auto", "LU", "QR"]: # 'none' would not be stable + # compute the singular values of X using the fast approximate method + Ua, sa, Va = randomized_svd( + X, k, n_iter=1, power_iteration_normalizer=normalizer, random_state=0 + ) + + # If the input dtype is float, then the output dtype is float of the + # same bit size (f32 is not upcast to f64) + # But if the input dtype is int, the output dtype is float64 + if dtype.kind == "f": + assert Ua.dtype == dtype + assert sa.dtype == dtype + assert Va.dtype == dtype + else: + assert Ua.dtype == np.float64 + assert sa.dtype == np.float64 + assert Va.dtype == np.float64 + + assert Ua.shape == (n_samples, k) + assert sa.shape == (k,) + assert Va.shape == (k, n_features) + + # ensure that the singular values of both methods are equal up to the + # real rank of the matrix + sa_res = sa.execute().fetch() + np.testing.assert_almost_equal(s[:k], sa_res, decimal=decimal) + + # check the singular vectors too (while not checking the sign) + dot_res = dot(Ua, Va).execute().fetch() + np.testing.assert_almost_equal( + np.dot(U[:, :k], V[:k, :]), dot_res, decimal=decimal + ) + + +def test_cholesky_execution(setup): + rs = np.random.RandomState(0) + data = rs.randint(1, 10, (10, 10)) + symmetric_data = data.dot(data.T) + + a = tensor(symmetric_data, chunk_size=5) + + U = cholesky(a) + t = U.T.dot(U) + + res_u = U.execute().fetch() + np.testing.assert_allclose(np.triu(res_u), res_u) + + res = t.execute().fetch() + np.testing.assert_allclose(res, symmetric_data) + + L = cholesky(a, lower=True) + U = cholesky(a) + t = L.dot(U) + + res = t.execute().fetch() + np.testing.assert_allclose(res, symmetric_data) + + a = tensor(symmetric_data, chunk_size=5) + + L = cholesky(a, lower=True) + U = cholesky(a) + t = L.dot(U) + + res_u = U.execute().fetch() + np.testing.assert_allclose(np.triu(res_u), res_u) + + res = t.execute().fetch() + np.testing.assert_allclose(res, symmetric_data) + + a = tensor(symmetric_data, chunk_size=(2, 3)) + + L = cholesky(a, lower=True) + U = cholesky(a) + t = L.dot(U) + + res = t.execute().fetch() + np.testing.assert_allclose(res, symmetric_data) + + +def test_lu_execution(setup): + rs = np.random.RandomState(0) + + # square matrix + data = rs.randint(1, 10, (6, 6)) + + a = tensor(data) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + a = tensor(data, chunk_size=(3, 4)) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + # shape[0] > shape[1] + data = rs.randint(1, 10, (10, 6)) + + a = tensor(data) + P, L, U = lu(a) + + # 
check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + a = tensor(data, chunk_size=5) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + a = tensor(data, chunk_size=(4, 5)) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + # shape[0] < shape[1] + data = rs.randint(1, 10, (6, 10)) + + a = tensor(data) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + a = tensor(data, chunk_size=5) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + a = tensor(data, chunk_size=(4, 5)) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + # test for sparse + data = sps.csr_matrix( + [ + [2, 0, 0, 0, 5, 2], + [0, 6, 1, 0, 0, 6], + [8, 0, 9, 0, 0, 2], + [0, 6, 0, 8, 7, 3], + [7, 0, 6, 1, 7, 0], + [0, 0, 0, 7, 0, 8], + ] + ) + + a = tensor(data) + P, L, U = lu(a) + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + # check lower and upper triangular matrix + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + assert isinstance(result_l, SparseNDArray) + assert isinstance(result_u, SparseNDArray) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_array_almost_equal(data.A, res) + + a = tensor(data, chunk_size=5) + P, L, U = lu(a) + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + # check lower and upper triangular matrix + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + assert isinstance(result_l, SparseNDArray) + assert isinstance(result_u, SparseNDArray) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_array_almost_equal(data.A, res) + + +def test_solve_triangular(setup): + from ... 
import tril, triu + + rs = np.random.RandomState(0) + + data1 = rs.randint(1, 10, (20, 20)) + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=20) + b = tensor(data2, chunk_size=20) + + x = solve_triangular(A, b) + t = triu(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + x = solve_triangular(A, b, lower=True) + t = tril(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve_triangular(A, b) + t = triu(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + x = solve_triangular(A, b, lower=True) + t = tril(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + data1 = rs.randint(1, 10, (10, 10)) + data2 = rs.randint(1, 10, (10, 5)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve_triangular(A, b) + t = triu(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + x = solve_triangular(A, b, lower=True) + t = tril(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + # test sparse + data1 = sps.csr_matrix(np.triu(rs.randint(1, 10, (10, 10)))) + data2 = rs.random((10,)) + + A = tensor(data1, chunk_size=5) + b = tensor(data2, chunk_size=5) + + x = solve_triangular(A, b) + + result_x = x.execute().fetch() + result_b = data1.dot(result_x) + + assert isinstance(result_x, SparseNDArray) + np.testing.assert_allclose(result_b, data2) + + data1 = sps.csr_matrix(np.triu(rs.randint(1, 10, (10, 10)))) + data2 = rs.random((10, 2)) + + A = tensor(data1, chunk_size=5) + b = tensor(data2, chunk_size=5) + + x = solve_triangular(A, b) + + result_x = x.execute().fetch() + result_b = data1.dot(result_x) + + assert isinstance(result_x, SparseNDArray) + np.testing.assert_allclose(result_b, data2) + + +def test_solve(setup): + import scipy.linalg + + rs = np.random.RandomState(0) + + data1 = rs.randint(1, 10, (20, 20)) + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = x.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2)) + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + data2 = rs.randint(1, 10, (20, 5)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = x.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2)) + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + # test for not all chunks are square in matrix A + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = x.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2)) + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + A = tensor(data1, chunk_size=(10, 15)) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = x.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2)) + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + # test sparse + data1 = sps.csr_matrix(rs.randint(1, 10, (20, 20))) + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = x.execute().fetch() + assert isinstance(res, SparseNDArray) + 
np.testing.assert_allclose(data1.dot(res), data2) + + data2 = rs.randint(1, 10, (20, 5)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = A.dot(x).execute().fetch() + assert isinstance(res, SparseNDArray) + np.testing.assert_allclose(res, data2) + + # test for not all chunks are square in matrix A + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + +def test_solve_sym_pos(setup): + import scipy.linalg + + rs = np.random.RandomState(0) + + data = rs.randint(1, 10, (20, 20)) + data_l = np.tril(data) + data1 = data_l.dot(data_l.T) + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b, sym_pos=True) + + res = x.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2)) + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + +def test_inv(setup): + import scipy.linalg + + rs = np.random.RandomState(0) + + data = rs.randint(1, 10, (20, 20)) + + A = tensor(data) + inv_A = inv(A) + + res = inv_A.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + A = tensor(data, chunk_size=10) + inv_A = inv(A) + + res = inv_A.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + # test 1 chunk + A = tensor(data, chunk_size=20) + inv_A = inv(A) + + res = inv_A.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + B = A.T.dot(A) + inv_B = inv(B) + res = inv_B.execute().fetch() + np.testing.assert_array_almost_equal(res, scipy.linalg.inv(data.T.dot(data))) + res = B.dot(inv_B).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + # test for not all chunks are square in matrix A + A = tensor(data, chunk_size=8) + inv_A = inv(A) + + res = inv_A.execute().fetch() + np.testing.assert_array_almost_equal(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + # test sparse + data = rs.randint(1, 10, (20, 20)) + sp_data = sps.csr_matrix(data) + + A = tensor(sp_data, chunk_size=10) + inv_A = inv(A) + + res = inv_A.execute().fetch() + assert isinstance(res, SparseNDArray) + np.testing.assert_array_almost_equal(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + # test for not all chunks are square in matrix A + A = tensor(sp_data, chunk_size=12) + inv_A = inv(A) + + res = inv_A.execute().fetch() + assert isinstance(res, SparseNDArray) + np.testing.assert_array_almost_equal(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + +@ignore_warning +def test_norm_execution(setup): + d = np.arange(9) - 4 + d2 = d.reshape(3, 3) + + ma = [tensor(d, chunk_size=2), tensor(d2, chunk_size=(2, 3))] + + for i, a in enumerate(ma): + data = d if i < 1 
else d2 + for ord in (None, "nuc", np.inf, -np.inf, 0, 1, -1, 2, -2): + for axis in (0, 1, (0, 1), -1): + for keepdims in (True, False): + try: + expected = np.linalg.norm( + data, ord=ord, axis=axis, keepdims=keepdims + ) + t = norm(a, ord=ord, axis=axis, keepdims=keepdims) + res = t.execute().fetch() + + expected_shape = expected.shape + t_shape = t.shape + assert expected_shape == t_shape + + np.testing.assert_allclose(res, expected, atol=0.0001) + except ValueError: + continue + + m = norm(tensor(d)) + expected = m.execute().fetch() + res = np.linalg.norm(d) + assert expected == res + + d = uniform(-0.5, 0.5, size=(5000, 2), chunk_size=1000) + inside = (norm(d, axis=1) < 0.5).sum().astype(float) + t = inside / 5000 * 4 + res = t.execute().fetch() + np.testing.assert_almost_equal(3.14, res, decimal=1) + + raw = np.random.RandomState(0).rand(10, 10) + d = norm(tensor(raw, chunk_size=5)) + expected = d.execute().fetch() + result = np.linalg.norm(raw) + np.testing.assert_allclose(expected, result) + + +def test_tensordot_execution(setup): + rs = np.random.RandomState(0) + # size_executor = ExecutorForTest(sync_provider_type=ExecutorForTest.SyncProviderType.MOCK) + # + # a_data = np.arange(60).reshape(3, 4, 5) + # a = tensor(a_data, chunk_size=2) + # b_data = np.arange(24).reshape(4, 3, 2) + # b = tensor(b_data, chunk_size=2) + # + # axes = ([1, 0], [0, 1]) + # c = tensordot(a, b, axes=axes) + # size_res = size_executor.execute_tensor(c, mock=True) + # assert sum(s[0] for s in size_res) == c.nbytes + # assert sum(s[1] for s in size_res) == c.nbytes + + a = ones((100, 200), chunk_size=50) + b = ones((200, 10), chunk_size=50) + c = dot(a, b) + res = c.execute().fetch() + expected = np.dot(np.ones((100, 200)), np.ones((200, 10))) + np.testing.assert_array_equal(res, expected) + + a = ones((10, 8), chunk_size=4) + b = ones((8, 10), chunk_size=4) + c = a.dot(b) + res = c.execute().fetch() + np.testing.assert_array_equal(res, np.tile([8], [10, 10])) + + a = ones((500, 500), chunk_size=500) + b = ones((500, 100), chunk_size=500) + c = a.dot(b) + res = c.execute().fetch() + np.testing.assert_array_equal(res, np.tile([500], [500, 100])) + + raw_a = rs.random((100, 200, 50)) + raw_b = rs.random((200, 10, 100)) + a = tensor(raw_a, chunk_size=50) + b = tensor(raw_b, chunk_size=33) + c = tensordot(a, b, axes=((0, 1), (2, 0))) + res = c.execute().fetch() + expected = np.tensordot(raw_a, raw_b, axes=(c.op.a_axes, c.op.b_axes)) + np.testing.assert_array_almost_equal(res, expected) + + a = ones((100, 200), chunk_size=50) + b = ones((10, 200), chunk_size=50) + c = inner(a, b) + res = c.execute().fetch() + expected = np.inner(np.ones((100, 200)), np.ones((10, 200))) + np.testing.assert_array_equal(res, expected) + + a = ones((100, 100), chunk_size=30) + b = ones((100, 100), chunk_size=30) + c = a.dot(b) + res = c.execute().fetch() + np.testing.assert_array_equal(res, np.ones((100, 100)) * 100) + + +# def test_sparse_dot_size_execution(): +# from mars.tensor.linalg.tensordot import TensorTensorDot +# from mars.executor import register, register_default +# chunk_sizes = dict() +# chunk_nbytes = dict() +# chunk_input_sizes = dict() +# chunk_input_nbytes = dict() +# +# def execute_size(t): +# def _tensordot_size_recorder(ctx, op): +# TensorTensorDot.estimate_size(ctx, op) +# +# chunk_key = op.outputs[0].key +# chunk_sizes[chunk_key] = ctx[chunk_key] +# chunk_nbytes[chunk_key] = op.outputs[0].nbytes +# +# input_sizes = dict((inp.op.key, ctx[inp.key][0]) for inp in op.inputs) +# chunk_input_sizes[chunk_key] = 
sum(input_sizes.values()) +# input_nbytes = dict((inp.op.key, inp.nbytes) for inp in op.inputs) +# chunk_input_nbytes[chunk_key] = sum(input_nbytes.values()) +# +# size_executor = ExecutorForTest(sync_provider_type=ExecutorForTest.SyncProviderType.MOCK) +# try: +# chunk_sizes.clear() +# chunk_nbytes.clear() +# chunk_input_sizes.clear() +# chunk_input_nbytes.clear() +# register(TensorTensorDot, size_estimator=_tensordot_size_recorder) +# size_executor.execute_tensor(t, mock=True) +# finally: +# register_default(TensorTensorDot) +# +# a_data = sps.random(5, 9, density=.1) +# b_data = sps.random(9, 10, density=.2) +# a = tensor(a_data, chunk_size=2) +# b = tensor(b_data, chunk_size=3) +# +# c = dot(a, b) +# execute_size(c) +# +# for key in chunk_input_sizes.keys(): +# assert chunk_sizes[key][1] >= chunk_input_sizes[key] +# +# c2 = dot(a, b, sparse=False) +# execute_size(c2) +# +# for key in chunk_input_sizes.keys(): +# assert chunk_sizes[key][0] == chunk_nbytes[key] +# assert chunk_sizes[key][1] == chunk_input_nbytes[key] + chunk_nbytes[key] + + +def test_sparse_dot_execution(setup): + rs = np.random.RandomState(0) + + a_data = sps.random(5, 9, density=0.1) + b_data = sps.random(9, 10, density=0.2) + a = tensor(a_data, chunk_size=2) + b = tensor(b_data, chunk_size=3) + + c = dot(a, b) + + res = c.execute().fetch() + assert issparse(res) is True + np.testing.assert_allclose(res.toarray(), a_data.dot(b_data).toarray()) + + c2 = dot(a, b, sparse=False) + + res = c2.execute().fetch() + assert issparse(res) is False + np.testing.assert_allclose(res, a_data.dot(b_data).toarray()) + + c3 = tensordot(a, b.T, (-1, -1), sparse=False) + + res = c3.execute().fetch() + assert issparse(res) is False + np.testing.assert_allclose(res, a_data.dot(b_data).toarray()) + + c = inner(a, b.T) + + res = c.execute().fetch() + assert issparse(res) is True + np.testing.assert_allclose(res.toarray(), a_data.dot(b_data).toarray()) + + c = inner(a, b.T, sparse=False) + + res = c.execute().fetch() + assert issparse(res) is False + np.testing.assert_allclose(res, a_data.dot(b_data).toarray()) + + # test vector inner + a_data = rs.rand(5) + b_data = rs.rand(5) + a = tensor(a_data, chunk_size=2).tosparse() + b = tensor(b_data, chunk_size=2).tosparse() + + c = inner(a, b) + + res = c.execute().fetch() + assert np.isscalar(res) is True + np.testing.assert_allclose(res, np.inner(a_data, b_data)) + + +def test_vdot_execution(setup): + a_data = np.array([1 + 2j, 3 + 4j]) + b_data = np.array([5 + 6j, 7 + 8j]) + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=1) + + t = vdot(a, b) + + res = t.execute().fetch() + expected = np.vdot(a_data, b_data) + np.testing.assert_equal(res, expected) + + a_data = np.array([[1, 4], [5, 6]]) + b_data = np.array([[4, 1], [2, 2]]) + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=1) + + t = vdot(a, b) + + res = t.execute().fetch() + expected = np.vdot(a_data, b_data) + np.testing.assert_equal(res, expected) + + +def test_matmul_execution(setup): + rs = np.random.RandomState(0) + + data_a = rs.randn(10, 20) + data_b = rs.randn(20) + + a = tensor(data_a, chunk_size=5) + b = tensor(data_b, chunk_size=6) + c = matmul(a, b) + + res = c.execute().fetch() + expected = np.matmul(data_a, data_b) + np.testing.assert_allclose(res, expected) + + data_a = rs.randn(10, 20) + data_b = rs.randn(10) + + a = tensor(data_a, chunk_size=5) + b = tensor(data_b, chunk_size=6) + c = matmul(b, a) + + res = c.execute().fetch() + expected = np.matmul(data_b, data_a) + 
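+ # a 1-D first operand is promoted to a row vector (numpy.matmul semantics), so (10,) @ (10, 20) -> (20,)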
np.testing.assert_allclose(res, expected) + + data_a = rs.randn(15, 1, 20, 30) + data_b = rs.randn(1, 11, 30, 20) + + a = tensor(data_a, chunk_size=12) + b = tensor(data_b, chunk_size=13) + c = matmul(a, b) + + res = c.execute().fetch() + expected = np.matmul(data_a, data_b) + np.testing.assert_allclose(res, expected, atol=0.0001) + + a = arange(2 * 2 * 4, chunk_size=1).reshape((2, 2, 4)) + b = arange(2 * 2 * 4, chunk_size=1).reshape((2, 4, 2)) + c = matmul(a, b) + + res = c.execute().fetch() + expected = np.matmul( + np.arange(2 * 2 * 4).reshape(2, 2, 4), np.arange(2 * 2 * 4).reshape(2, 4, 2) + ) + np.testing.assert_allclose(res, expected, atol=0.0001) + + data_a = sps.random(10, 20) + data_b = sps.random(20, 5) + + a = tensor(data_a, chunk_size=5) + b = tensor(data_b, chunk_size=6) + c = matmul(a, b) + + res = c.execute().fetch() + expected = np.matmul(data_a.toarray(), data_b.toarray()) + np.testing.assert_allclose(res.toarray(), expected) + + # test order + data_a = np.asfortranarray(rs.randn(10, 20)) + data_b = np.asfortranarray(rs.randn(20, 30)) + + a = tensor(data_a, chunk_size=12) + b = tensor(data_b, chunk_size=13) + + c = matmul(a, b) + res = c.execute().fetch() + expected = np.matmul(data_a, data_b) + + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + c = matmul(a, b, order="A") + res = c.execute().fetch() + expected = np.matmul(data_a, data_b, order="A") + + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + c = matmul(a, b, order="C") + res = c.execute().fetch() + expected = np.matmul(data_a, data_b, order="C") + + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] diff --git a/python/xorbits/_mars/tensor/linalg/utils.py b/python/xorbits/_mars/tensor/linalg/utils.py new file mode 100644 index 000000000..8c0fcdda9 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/utils.py @@ -0,0 +1,78 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +def calc_svd_shapes(a): + """ + Calculate output shapes of singular value decomposition. + Follow the behavior of `numpy`: + if a's shape is (6, 18), U's shape is (6, 6), s's shape is (6,), V's shape is (6, 18) + if a's shape is (18, 6), U's shape is (18, 6), s's shape is (6,), V's shape is (6, 6) + :param a: input tensor + :return: (U.shape, s.shape, V.shape) + """ + x, y = a.shape + if x > y: + return (x, y), (y,), (y, y) + else: + return (x, x), (x,), (x, y) + + +def svd_flip(u, v, u_based_decision=True): + """ + Sign correction to ensure deterministic output from SVD. 
+ + Adjusts the columns of u and the rows of v such that the loadings in the + columns in u that are largest in absolute value are always positive. + + Parameters + ---------- + u : Tensor + u and v are the output of `linalg.svd` or + `randomized_svd`, with matching inner dimensions + so one can compute `mt.dot(u * s, v)`. + + v : Tensor + u and v are the output of `linalg.svd` or + `randomized_svd`, with matching inner dimensions + so one can compute `mt.dot(u * s, v)`. + + u_based_decision : boolean, (default=True) + If True, use the columns of u as the basis for sign flipping. + Otherwise, use the rows of v. The choice of which variable to base the + decision on is generally algorithm dependent. + + + Returns + ------- + u_adjusted, v_adjusted : arrays with the same dimensions as the input. + + """ + from ... import tensor as mt + + if u_based_decision: + # columns of u, rows of v + max_abs_cols = mt.argmax(mt.abs(u), axis=0) + signs = mt.sign(u[max_abs_cols, np.arange(u.shape[1])]) + u *= signs + v *= signs[:, mt.newaxis] + else: + # rows of v, columns of u + max_abs_rows = mt.argmax(mt.abs(v), axis=1) + signs = mt.sign(v[np.arange(v.shape[0]), max_abs_rows]) + u *= signs + v *= signs[:, mt.newaxis] + return u, v diff --git a/python/xorbits/_mars/tensor/linalg/vdot.py b/python/xorbits/_mars/tensor/linalg/vdot.py new file mode 100644 index 000000000..298812af8 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/vdot.py @@ -0,0 +1,73 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .dot import dot + + +def vdot(a, b): + """ + Return the dot product of two vectors. + + The vdot(`a`, `b`) function handles complex numbers differently than + dot(`a`, `b`). If the first argument is complex the complex conjugate + of the first argument is used for the calculation of the dot product. + + Note that `vdot` handles multidimensional tensors differently than `dot`: + it does *not* perform a matrix product, but flattens input arguments + to 1-D vectors first. Consequently, it should only be used for vectors. + + Parameters + ---------- + a : array_like + If `a` is complex the complex conjugate is taken before calculation + of the dot product. + b : array_like + Second argument to the dot product. + + Returns + ------- + output : Tensor + Dot product of `a` and `b`. Can be an int, float, or + complex depending on the types of `a` and `b`. + + See Also + -------- + dot : Return the dot product without using the complex conjugate of the + first argument. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([1+2j,3+4j]) + >>> b = mt.array([5+6j,7+8j]) + >>> mt.vdot(a, b).execute() + (70-8j) + >>> mt.vdot(b, a).execute() + (70+8j) + + Note that higher-dimensional arrays are flattened! 
+ + >>> a = mt.array([[1, 4], [5, 6]]) + >>> b = mt.array([[4, 1], [2, 2]]) + >>> mt.vdot(a, b).execute() + 30 + >>> mt.vdot(b, a).execute() + 30 + >>> 1*4 + 4*1 + 5*2 + 6*2 + 30 + """ + a, b = astensor(a), astensor(b) + return dot(a.conj().ravel(), b.ravel()) diff --git a/python/xorbits/_mars/tensor/merge/__init__.py b/python/xorbits/_mars/tensor/merge/__init__.py new file mode 100644 index 000000000..5c0d34f70 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .append import append +from .block import block +from .column_stack import column_stack +from .concatenate import TensorConcatenate, concatenate +from .dstack import dstack +from .hstack import hstack +from .stack import TensorStack, stack +from .union1d import union1d +from .vstack import vstack diff --git a/python/xorbits/_mars/tensor/merge/append.py b/python/xorbits/_mars/tensor/merge/append.py new file mode 100644 index 000000000..48844e150 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/append.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..base.ravel import ravel +from ..datasource.array import asarray +from .concatenate import concatenate + + +def append(arr, values, axis=None): + """ + Append values to the end of an array. + + Parameters + ---------- + arr : array_like + Values are appended to a copy of this array. + values : array_like + These values are appended to a copy of `arr`. It must be of the + correct shape (the same shape as `arr`, excluding `axis`). If + `axis` is not specified, `values` can be any shape and will be + flattened before use. + axis : int, optional + The axis along which `values` are appended. If `axis` is not + given, both `arr` and `values` are flattened before use. + + Returns + ------- + append : Tensor + A copy of `arr` with `values` appended to `axis`. Note that + `append` does not occur in-place: a new array is allocated and + filled. If `axis` is None, `out` is a flattened array. + + See Also + -------- + insert : Insert elements into an array. + delete : Delete elements from an array. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.append([1, 2, 3], [[4, 5, 6], [7, 8, 9]]).execute() + array([1, 2, 3, ..., 7, 8, 9]) + + When `axis` is specified, `values` must have the correct shape. 
+ + >>> mt.append([[1, 2, 3], [4, 5, 6]], [[7, 8, 9]], axis=0).execute() + array([[1, 2, 3], + [4, 5, 6], + [7, 8, 9]]) + >>> mt.append([[1, 2, 3], [4, 5, 6]], [7, 8, 9], axis=0) + Traceback (most recent call last): + ... + ValueError: all the input tensors must have same number of dimensions + + """ + arr = asarray(arr) + if axis is None: + if arr.ndim != 1: + arr = arr.ravel() + values = ravel(values) + axis = arr.ndim - 1 + return concatenate((arr, values), axis=axis) diff --git a/python/xorbits/_mars/tensor/merge/block.py b/python/xorbits/_mars/tensor/merge/block.py new file mode 100644 index 000000000..25e76ee4c --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/block.py @@ -0,0 +1,474 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import itertools +import operator + +import numpy as np + +from ..datasource.array import array +from ..datasource.empty import empty + +# Internal functions to eliminate the overhead of repeated dispatch in one of +# the two possible paths inside mt.block. +# Use getattr to protect against __array_function__ being disabled. +_size = getattr(np.size, "__wrapped__", np.size) +_ndim = getattr(np.ndim, "__wrapped__", np.ndim) + + +def _block_format_index(index): + """ + Convert a list of indices ``[0, 1, 2]`` into ``"arrays[0][1][2]"``. + """ + idx_str = "".join("[{}]".format(i) for i in index if i is not None) + return "arrays" + idx_str + + +def _block_check_depths_match(arrays, parent_index=[]): + """ + Recursive function checking that the depths of nested lists in `arrays` + all match. Mismatch raises a ValueError as described in the block + docstring below. + + The entire index (rather than just the depth) needs to be calculated + for each innermost list, in case an error needs to be raised, so that + the index of the offending list can be printed as part of the error. + + Parameters + ---------- + arrays : nested list of arrays + The arrays to check + parent_index : list of int + The full index of `arrays` within the nested lists passed to + `_block_check_depths_match` at the top of the recursion. + + Returns + ------- + first_index : list of int + The full index of an element from the bottom of the nesting in + `arrays`. If any element at the bottom is an empty list, this will + refer to it, and the last index along the empty axis will be None. + max_arr_ndim : int + The maximum of the ndims of the arrays nested in `arrays`. + final_size: int + The number of elements in the final array. This is used the motivate + the choice of algorithm used using benchmarking wisdom. + + """ + if type(arrays) is tuple: + # not strictly necessary, but saves us from: + # - more than one way to do things - no point treating tuples like + # lists + # - horribly confusing behaviour that results when tuples are + # treated like ndarray + raise TypeError( + "{} is a tuple. 
" + "Only lists can be used to arrange blocks, and mt.block does " + "not allow implicit conversion from tuple to ndarray.".format( + _block_format_index(parent_index) + ) + ) + elif type(arrays) is list and len(arrays) > 0: + idxs_ndims = ( + _block_check_depths_match(arr, parent_index + [i]) + for i, arr in enumerate(arrays) + ) + + first_index, max_arr_ndim, final_size = next(idxs_ndims) + for index, ndim, size in idxs_ndims: + final_size += size + if ndim > max_arr_ndim: + max_arr_ndim = ndim + if len(index) != len(first_index): + raise ValueError( + "List depths are mismatched. First element was at depth " + "{}, but there is an element at depth {} ({})".format( + len(first_index), len(index), _block_format_index(index) + ) + ) + # propagate our flag that indicates an empty list at the bottom + if index[-1] is None: + first_index = index + + return first_index, max_arr_ndim, final_size + elif type(arrays) is list and len(arrays) == 0: + # We've 'bottomed out' on an empty list + return parent_index + [None], 0, 0 + else: + # We've 'bottomed out' - arrays is either a scalar or an array + size = _size(arrays) + return parent_index, _ndim(arrays), size + + +def _atleast_nd(a, ndim): + # Ensures `a` has at least `ndim` dimensions by prepending + # ones to `a.shape` as necessary + return array(a, ndmin=ndim, copy=False) + + +def _accumulate(values): + return list(itertools.accumulate(values)) + + +def _concatenate_shapes(shapes, axis): + """Given array shapes, return the resulting shape and slices prefixes. + These help in nested concatenation. + + Returns + ------- + shape: tuple of int + This tuple satisfies: + ``` + shape, _ = _concatenate_shapes([arr.shape for shape in arrs], axis) + shape == concatenate(arrs, axis).shape + ``` + slice_prefixes: tuple of (slice(start, end), ) + For a list of arrays being concatenated, this returns the slice + in the larger array at axis that needs to be sliced into. + For example, the following holds: + ``` + ret = concatenate([a, b, c], axis) + _, (sl_a, sl_b, sl_c) = concatenate_slices([a, b, c], axis) + ret[(slice(None),) * axis + sl_a] == a + ret[(slice(None),) * axis + sl_b] == b + ret[(slice(None),) * axis + sl_c] == c + ``` + These are called slice prefixes since they are used in the recursive + blocking algorithm to compute the left-most slices during the + recursion. Therefore, they must be prepended to rest of the slice + that was computed deeper in the recursion. + These are returned as tuples to ensure that they can quickly be added + to existing slice tuple without creating a new tuple every time. + """ + # Cache a result that will be reused. 
+ shape_at_axis = [shape[axis] for shape in shapes] + + # Take a shape, any shape + first_shape = shapes[0] + first_shape_pre = first_shape[:axis] + first_shape_post = first_shape[axis + 1 :] + + if any( + shape[:axis] != first_shape_pre or shape[axis + 1 :] != first_shape_post + for shape in shapes + ): + raise ValueError("Mismatched array shapes in block along axis {}.".format(axis)) + + shape = first_shape_pre + (sum(shape_at_axis),) + first_shape[axis + 1 :] + + offsets_at_axis = _accumulate(shape_at_axis) + slice_prefixes = [ + (slice(start, end),) + for start, end in zip([0] + offsets_at_axis, offsets_at_axis) + ] + return shape, slice_prefixes + + +def _block_info_recursion(arrays, max_depth, result_ndim, depth=0): + """ + Returns the shape of the final array, along with a list + of slices and a list of arrays that can be used for assignment inside the + new array + + Parameters + ---------- + arrays : nested list of arrays + The arrays to check + max_depth : list of int + The number of nested lists + result_ndim: int + The number of dimensions in thefinal array. + + Returns + ------- + shape : tuple of int + The shape that the final array will take on. + slices: list of tuple of slices + The slices into the full array required for assignment. These are + required to be prepended with ``(Ellipsis, )`` to obtain to correct + final index. + arrays: list of ndarray + The data to assign to each slice of the full array + + """ + if depth < max_depth: + shapes, slices, arrays = zip( + *[ + _block_info_recursion(arr, max_depth, result_ndim, depth + 1) + for arr in arrays + ] + ) + + axis = result_ndim - max_depth + depth + shape, slice_prefixes = _concatenate_shapes(shapes, axis) + + # Prepend the slice prefix and flatten the slices + slices = [ + slice_prefix + the_slice + for slice_prefix, inner_slices in zip(slice_prefixes, slices) + for the_slice in inner_slices + ] + + # Flatten the array list + arrays = functools.reduce(operator.add, arrays) + + return shape, slices, arrays + else: + # We've 'bottomed out' - arrays is either a scalar or an array + # type(arrays) is not list + # Return the slice and the array inside a list to be consistent with + # the recursive case. + arr = _atleast_nd(arrays, result_ndim) + return arr.shape, [()], [arr] + + +def _block(arrays, max_depth, result_ndim, depth=0): + """ + Internal implementation of block based on repeated concatenation. + `arrays` is the argument passed to + block. `max_depth` is the depth of nested lists within `arrays` and + `result_ndim` is the greatest of the dimensions of the arrays in + `arrays` and the depth of the lists in `arrays` (see block docstring + for details). + """ + from ..merge.concatenate import concatenate + + if depth < max_depth: + arrs = [_block(arr, max_depth, result_ndim, depth + 1) for arr in arrays] + return concatenate(arrs, axis=-(max_depth - depth)) + else: + # We've 'bottomed out' - arrays is either a scalar or an array + # type(arrays) is not list + return _atleast_nd(arrays, result_ndim) + + +def block(arrays): + """ + Assemble an nd-array from nested lists of blocks. + + Blocks in the innermost lists are concatenated (see `concatenate`) along + the last dimension (-1), then these are concatenated along the + second-last dimension (-2), and so on until the outermost list is reached. + + Blocks can be of any dimension, but will not be broadcasted using the normal + rules. Instead, leading axes of size 1 are inserted, to make ``block.ndim`` + the same for all blocks. 
This is primarily useful for working with scalars, + and means that code like ``mt.block([v, 1])`` is valid, where + ``v.ndim == 1``. + + When the nested list is two levels deep, this allows block matrices to be + constructed from their components. + + .. versionadded:: 1.13.0 + + Parameters + ---------- + arrays : nested list of array_like or scalars (but not tuples) + If passed a single ndarray or scalar (a nested list of depth 0), this + is returned unmodified (and not copied). + + Elements shapes must match along the appropriate axes (without + broadcasting), but leading 1s will be prepended to the shape as + necessary to make the dimensions match. + + Returns + ------- + block_array : Tensor + The array assembled from the given blocks. + + The dimensionality of the output is equal to the greatest of: + * the dimensionality of all the inputs + * the depth to which the input list is nested + + Raises + ------ + ValueError + * If list depths are mismatched - for instance, ``[[a, b], c]`` is + illegal, and should be spelt ``[[a, b], [c]]`` + * If lists are empty - for instance, ``[[a, b], []]`` + + See Also + -------- + concatenate : Join a sequence of arrays along an existing axis. + stack : Join a sequence of arrays along a new axis. + vstack : Stack arrays in sequence vertically (row wise). + hstack : Stack arrays in sequence horizontally (column wise). + dstack : Stack arrays in sequence depth wise (along third axis). + column_stack : Stack 1-D arrays as columns into a 2-D array. + vsplit : Split an array into multiple sub-arrays vertically (row-wise). + + Notes + ----- + + When called with only scalars, ``mt.block`` is equivalent to an ndarray + call. So ``mt.block([[1, 2], [3, 4]])`` is equivalent to + ``mt.array([[1, 2], [3, 4]])``. + + This function does not enforce that the blocks lie on a fixed grid. + ``mt.block([[a, b], [c, d]])`` is not restricted to arrays of the form:: + + AAAbb + AAAbb + cccDD + + But is also allowed to produce, for some ``a, b, c, d``:: + + AAAbb + AAAbb + cDDDD + + Since concatenation happens along the last axis first, `block` is _not_ + capable of producing the following directly:: + + AAAbb + cccbb + cccDD + + Matlab's "square bracket stacking", ``[A, B, ...; p, q, ...]``, is + equivalent to ``mt.block([[A, B, ...], [p, q, ...]])``. + + Examples + -------- + The most common use of this function is to build a block matrix + + >>> import mars.tensor as mt + >>> A = mt.eye(2) * 2 + >>> B = mt.eye(3) * 3 + >>> mt.block([ + ... [A, mt.zeros((2, 3))], + ... [mt.ones((3, 2)), B ] + ... 
]).execute() + array([[2., 0., 0., 0., 0.], + [0., 2., 0., 0., 0.], + [1., 1., 3., 0., 0.], + [1., 1., 0., 3., 0.], + [1., 1., 0., 0., 3.]]) + + With a list of depth 1, `block` can be used as `hstack` + + >>> mt.block([1, 2, 3]).execute() # hstack([1, 2, 3]) + array([1, 2, 3]) + + >>> a = mt.array([1, 2, 3]) + >>> b = mt.array([2, 3, 4]) + >>> mt.block([a, b, 10]).execute() # hstack([a, b, 10]) + array([ 1, 2, 3, 2, 3, 4, 10]) + + >>> A = mt.ones((2, 2), int) + >>> B = 2 * A + >>> mt.block([A, B]).execute() # hstack([A, B]) + array([[1, 1, 2, 2], + [1, 1, 2, 2]]) + + With a list of depth 2, `block` can be used in place of `vstack`: + + >>> a = mt.array([1, 2, 3]) + >>> b = mt.array([2, 3, 4]) + >>> mt.block([[a], [b]]).execute() # vstack([a, b]) + array([[1, 2, 3], + [2, 3, 4]]) + + >>> A = mt.ones((2, 2), int) + >>> B = 2 * A + >>> mt.block([[A], [B]]).execute() # vstack([A, B]) + array([[1, 1], + [1, 1], + [2, 2], + [2, 2]]) + + It can also be used in places of `atleast_1d` and `atleast_2d` + + >>> a = mt.array(0) + >>> b = mt.array([1]) + >>> mt.block([a]).execute() # atleast_1d(a) + array([0]) + >>> mt.block([b]).execute() # atleast_1d(b) + array([1]) + + >>> mt.block([[a]]).execute() # atleast_2d(a) + array([[0]]) + >>> mt.block([[b]]).execute() # atleast_2d(b) + array([[1]]) + + + """ + arrays, list_ndim, result_ndim, final_size = _block_setup(arrays) + + # It was found through benchmarking that making an array of final size + # around 256x256 was faster by straight concatenation on a + # i7-7700HQ processor and dual channel ram 2400MHz. + # It didn't seem to matter heavily on the dtype used. + # + # A 2D array using repeated concatenation requires 2 copies of the array. + # + # The fastest algorithm will depend on the ratio of CPU power to memory + # speed. + # One can monitor the results of the benchmark + # https://pv.github.io/numpy-bench/#bench_shape_base.Block2D.time_block2d + # to tune this parameter until a C version of the `_block_info_recursion` + # algorithm is implemented which would likely be faster than the python + # version. + if list_ndim * final_size > (2 * 512 * 512): + return _block_slicing(arrays, list_ndim, result_ndim) + else: + return _block_concatenate(arrays, list_ndim, result_ndim) + + +# These helper functions are mostly used for testing. +# They allow us to write tests that directly call `_block_slicing` +# or `_block_concatenate` without blocking large arrays to force the wisdom +# to trigger the desired path. 
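A minimal usage sketch for the size-based dispatch above (illustrative only; it assumes the ``import mars.tensor as mt`` convention used by the docstring examples in this module, and the element counts quoted apply to this particular input):

def _block_usage_sketch():  # pragma: no cover - illustrative sketch
    import mars.tensor as mt

    a = mt.eye(2) * 2     # (2, 2) -> 4 elements
    b = mt.ones((2, 3))   # (2, 3) -> 6 elements
    c = mt.ones((3, 2))   # (3, 2) -> 6 elements
    d = mt.zeros((3, 3))  # (3, 3) -> 9 elements
    # list_ndim == 2 and final_size == 25, so list_ndim * final_size is far
    # below 2 * 512 * 512 and block() takes the repeated-concatenation path;
    # a result with hundreds of thousands of elements would instead be
    # assembled by slice assignment into one pre-allocated tensor.
    return mt.block([[a, b], [c, d]])  # assembles a (5, 5) block matrix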
+def _block_setup(arrays): + """ + Returns + (`arrays`, list_ndim, result_ndim, final_size) + """ + bottom_index, arr_ndim, final_size = _block_check_depths_match(arrays) + list_ndim = len(bottom_index) + if bottom_index and bottom_index[-1] is None: + raise ValueError( + "List at {} cannot be empty".format(_block_format_index(bottom_index)) + ) + result_ndim = max(arr_ndim, list_ndim) + return arrays, list_ndim, result_ndim, final_size + + +def _block_slicing(arrays, list_ndim, result_ndim): + shape, slices, arrays = _block_info_recursion(arrays, list_ndim, result_ndim) + dtype = np.result_type(*[arr.dtype for arr in arrays]) + + # Test preferring F only in the case that all input arrays are F + F_order = all(arr.flags["F_CONTIGUOUS"] for arr in arrays) + C_order = all(arr.flags["C_CONTIGUOUS"] for arr in arrays) + order = "F" if F_order and not C_order else "C" + result = empty(shape=shape, dtype=dtype, order=order) + # Note: In a c implementation, the function + # PyArray_CreateMultiSortedStridePerm could be used for more advanced + # guessing of the desired order. + + for the_slice, arr in zip(slices, arrays): + result[(Ellipsis,) + the_slice] = arr + return result + + +def _block_concatenate(arrays, list_ndim, result_ndim): + result = _block(arrays, list_ndim, result_ndim) + if list_ndim == 0: + # Catch an edge case where _block returns a view because + # `arrays` is a single mars array and not a list of mars arrays. + # This might copy scalars or lists twice, but this isn't a likely + # usecase for those interested in performance + result = result.copy() + return result diff --git a/python/xorbits/_mars/tensor/merge/column_stack.py b/python/xorbits/_mars/tensor/merge/column_stack.py new file mode 100644 index 000000000..e4e7c011c --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/column_stack.py @@ -0,0 +1,63 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .concatenate import concatenate + + +def column_stack(tup): + """ + Stack 1-D tensors as columns into a 2-D tensor. + + Take a sequence of 1-D tensors and stack them as columns + to make a single 2-D tensor. 2-D tensors are stacked as-is, + just like with `hstack`. 1-D tensors are turned into 2-D columns + first. + + Parameters + ---------- + tup : sequence of 1-D or 2-D tensors. + Tensors to stack. All of them must have the same first dimension. + + Returns + ------- + stacked : 2-D tensor + The tensor formed by stacking the given tensors. 
+ + See Also + -------- + stack, hstack, vstack, concatenate + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array((1,2,3)) + >>> b = mt.array((2,3,4)) + >>> mt.column_stack((a,b)).execute() + array([[1, 2], + [2, 3], + [3, 4]]) + + """ + from ..datasource import array + + arrays = [] + for a in tup: + a = astensor(a) + if a.ndim < 2: + a = array(a, ndmin=2).T + arrays.append(a) + + return concatenate(arrays, 1) diff --git a/python/xorbits/_mars/tensor/merge/concatenate.py b/python/xorbits/_mars/tensor/merge/concatenate.py new file mode 100644 index 000000000..512b11dad --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/concatenate.py @@ -0,0 +1,327 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import operator +import tempfile +from collections.abc import Iterable + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import ( + AnyField, + BoolField, + SliceField, + StringField, + TupleField, +) +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from ..indexing.slice import TensorSlice +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import unify_chunks, validate_axis + + +def _get_index(chunk): + try: + return chunk.index + except AttributeError: + if isinstance(chunk.op, TensorSlice): + return chunk.inputs[0].index + raise + + +def _norm_axis(axis): + if isinstance(axis, int): + return axis, True + if isinstance(axis, Iterable): + axis = sorted(tuple(axis)) + if len(axis) == 1: + return axis[0], True + return axis, False + + assert axis is None + return None, False + + +class TensorConcatenate(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.CONCATENATE + + _axis = AnyField("axis") + + # for mmap + _mmap = BoolField("mmap") + _file_prefix = StringField("file_prefix") + _create_mmap_file = BoolField("create_mmap_file") + _partition_slice = SliceField("partition_slice") + _total_shape = TupleField("total_shape") + + def __init__( + self, + axis=None, + mmap=None, + file_prefix=None, + create_mmap_file=None, + partition_slice=None, + total_shape=None, + **kw + ): + super().__init__( + _axis=axis, + _mmap=mmap, + _file_prefix=file_prefix, + _create_mmap_file=create_mmap_file, + _partition_slice=partition_slice, + _total_shape=total_shape, + **kw + ) + + @property + def axis(self): + return getattr(self, "_axis", None) + + @property + def mmap(self): + return self._mmap + + @property + def file_prefix(self): + return self._file_prefix + + @property + def create_mmap_file(self): + return self._create_mmap_file + + @property + def partition_slice(self): + return self._partition_slice + + @property + def total_shape(self): + return self._total_shape + + def __call__(self, tensors): + if len(set(t.ndim for t in tensors)) != 1: + raise ValueError( + "all the input tensors must have same number of dimensions" + ) + + axis = self._axis + shapes = [t.shape[:axis] + t.shape[axis + 1 :] for t in 
tensors] + if len(set(shapes)) != 1: + raise ValueError( + "all the input tensor dimensions " + "except for the concatenation axis must match exactly" + ) + + shape = [ + 0 if i == axis else tensors[0].shape[i] for i in range(tensors[0].ndim) + ] + shape[axis] = sum(t.shape[axis] for t in tensors) + + if any(np.isnan(s) for i, s in enumerate(shape) if i != axis): + raise ValueError("cannot concatenate tensor with unknown shape") + + return self.new_tensor(tensors, shape=tuple(shape)) + + @classmethod + def tile(cls, op): + from ..indexing.slice import TensorSlice + + inputs = op.inputs + output = op.outputs[0] + axis = op.axis + + c = itertools.count(inputs[0].ndim) + tensor_axes = [ + (t, tuple(i if i != axis else next(c) for i in range(t.ndim))) + for t in inputs + ] + inputs = yield from unify_chunks(*tensor_axes) + + out_chunk_shape = [ + 0 if i == axis else inputs[0].chunk_shape[i] for i in range(inputs[0].ndim) + ] + out_chunk_shape[axis] = sum(t.chunk_shape[axis] for t in inputs) + out_nsplits = [ + None if i == axis else inputs[0].nsplits[i] for i in range(inputs[0].ndim) + ] + out_nsplits[axis] = tuple(itertools.chain(*[t.nsplits[axis] for t in inputs])) + + out_chunks = [] + axis_cum_chunk_shape = np.cumsum([t.chunk_shape[axis] for t in inputs]) + for out_idx in itertools.product(*[range(s) for s in out_chunk_shape]): + axis_index = np.searchsorted( + axis_cum_chunk_shape, out_idx[axis], side="right" + ) + t = inputs[axis_index] + axis_inner_index = out_idx[axis] - ( + 0 if axis_index < 1 else axis_cum_chunk_shape[axis_index - 1] + ) + idx = out_idx[:axis] + (axis_inner_index,) + out_idx[axis + 1 :] + in_chunk = t.cix[idx] + if idx == out_idx: + # if index is the same, just use the input chunk + out_chunks.append(in_chunk) + else: + chunk_op = TensorSlice( + slices=[slice(None) for _ in range(in_chunk.ndim)], + dtype=in_chunk.dtype, + sparse=in_chunk.op.sparse, + ) + out_chunk = chunk_op.new_chunk( + [in_chunk], shape=in_chunk.shape, index=out_idx, order=output.order + ) + + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + output.shape, + order=output.order, + nsplits=out_nsplits, + chunks=out_chunks, + ) + + @staticmethod + def _ensure_order(result, order): + return result.astype(result.dtype, order=order.value, copy=False) + + @classmethod + def execute(cls, ctx, op): + if op.mmap: # pragma: no cover + cls._execute_with_mmap(ctx, op) + else: + cls._execute(ctx, op) + + @classmethod + def _execute(cls, ctx, op): + def _base_concatenate(chunk, inputs): + inputs, device_id, xp = as_same_device( + inputs, device=chunk.op.device, ret_extra=True + ) + + axis, single_axis = _norm_axis(chunk.op.axis) + if single_axis: + with device(device_id): + res = xp.concatenate(tuple(inputs), axis=axis) + else: + axes = axis or list(range(chunk.ndim)) + chunks = [ + (_get_index(input), data) + for input, data in zip(chunk.inputs, inputs) + ] + with device(device_id): + for i in range(len(axes) - 1): + new_chunks = [] + for idx, cs in itertools.groupby( + chunks, key=lambda t: t[0][:-1] + ): + cs = list(map(operator.itemgetter(1), cs)) + new_chunks.append( + (idx, xp.concatenate(cs, axis=len(axes) - i - 1)) + ) + chunks = new_chunks + res = xp.concatenate( + list(map(operator.itemgetter(1), chunks)), axis=axes[0] + ) + return res + + chunk = op.outputs[0] + inputs = [ctx[input.key] for input in op.inputs] + + if isinstance(inputs[0], tuple): + ctx[chunk.key] = tuple( + cls._ensure_order( + _base_concatenate(chunk, [input[i] for input in inputs]), + 
chunk.order, + ) + for i in range(len(inputs[0])) + ) + else: + ctx[chunk.key] = cls._ensure_order( + _base_concatenate(chunk, inputs), chunk.order + ) + + @classmethod + def _execute_with_mmap(cls, ctx, op): # pragma: no cover + if op.create_mmap_file: + path = tempfile.mkstemp(prefix=op.file_prefix, suffix=".dat")[1] + np.memmap(path, dtype=op.dtype, mode="w+", shape=op.total_shape) + ctx[op.outputs[0].key] = path + else: + path = ctx[op.inputs[0].key] + array = ctx[op.inputs[1].key] + fp = np.memmap(path, dtype=op.dtype, mode="r+", shape=op.total_shape) + fp[op.partition_slice] = array + ctx[op.outputs[0].key] = path + + +def concatenate(tensors, axis=0): + """ + Join a sequence of arrays along an existing axis. + + Parameters + ---------- + a1, a2, ... : sequence of array_like + The tensors must have the same shape, except in the dimension + corresponding to `axis` (the first, by default). + axis : int, optional + The axis along which the tensors will be joined. Default is 0. + + Returns + ------- + res : Tensor + The concatenated tensor. + + See Also + -------- + array_split : Split a tensor into multiple sub-arrays of equal or + near-equal size. + split : Split tensor into a list of multiple sub-tensors of equal size. + hsplit : Split tensor into multiple sub-tensors horizontally (column wise) + vsplit : Split tensor into multiple sub-tensors vertically (row wise) + dsplit : Split tensor into multiple sub-tensors along the 3rd axis (depth). + stack : Stack a sequence of tensors along a new axis. + hstack : Stack tensors in sequence horizontally (column wise) + vstack : Stack tensors in sequence vertically (row wise) + dstack : Stack tensors in sequence depth wise (along third dimension) + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4]]) + >>> b = mt.array([[5, 6]]) + >>> mt.concatenate((a, b), axis=0).execute() + array([[1, 2], + [3, 4], + [5, 6]]) + >>> mt.concatenate((a, b.T), axis=1).execute() + array([[1, 2, 5], + [3, 4, 6]]) + + """ + if axis is None: + axis = 0 + tensors = [astensor(t) for t in tensors] + + axis = validate_axis(tensors[0].ndim, axis) + dtype = np.result_type(*(t.dtype for t in tensors)) + sparse = all(t.issparse() for t in tensors) + + op = TensorConcatenate(axis=axis, dtype=dtype, sparse=sparse) + return op(tensors) diff --git a/python/xorbits/_mars/tensor/merge/dstack.py b/python/xorbits/_mars/tensor/merge/dstack.py new file mode 100644 index 000000000..2b9bf1da0 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/dstack.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..base import atleast_3d +from .concatenate import concatenate + + +def dstack(tup): + """ + Stack tensors in sequence depth wise (along third axis). + + This is equivalent to concatenation along the third axis after 2-D tensors + of shape `(M,N)` have been reshaped to `(M,N,1)` and 1-D arrays of shape + `(N,)` have been reshaped to `(1,N,1)`. Rebuilds arrays divided by + `dsplit`. 
+ + This function makes most sense for arrays with up to 3 dimensions. For + instance, for pixel-data with a height (first axis), width (second axis), + and r/g/b channels (third axis). The functions `concatenate`, `stack` and + `block` provide more general stacking and concatenation operations. + + Parameters + ---------- + tup : sequence of tensors + The tensors must have the same shape along all but the third axis. + 1-D or 2-D arrays must have the same shape. + + Returns + ------- + stacked : Tensor + The array formed by stacking the given tensors, will be at least 3-D. + + See Also + -------- + stack : Join a sequence of tensors along a new axis. + vstack : Stack along first axis. + hstack : Stack along second axis. + concatenate : Join a sequence of arrays along an existing axis. + dsplit : Split tensor along third axis. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array((1,2,3)) + >>> b = mt.array((2,3,4)) + >>> mt.dstack((a,b)).execute() + array([[[1, 2], + [2, 3], + [3, 4]]]) + + >>> a = mt.array([[1],[2],[3]]) + >>> b = mt.array([[2],[3],[4]]) + >>> mt.dstack((a,b)).execute() + array([[[1, 2]], + [[2, 3]], + [[3, 4]]]) + + """ + return concatenate([atleast_3d(t) for t in tup], axis=2) diff --git a/python/xorbits/_mars/tensor/merge/hstack.py b/python/xorbits/_mars/tensor/merge/hstack.py new file mode 100644 index 000000000..1e01f9745 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/hstack.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .concatenate import concatenate + + +def hstack(tup): + """ + Stack tensors in sequence horizontally (column wise). + + This is equivalent to concatenation along the second axis, except for 1-D + tensors where it concatenates along the first axis. Rebuilds tensors divided + by `hsplit`. + + This function makes most sense for tensors with up to 3 dimensions. For + instance, for pixel-data with a height (first axis), width (second axis), + and r/g/b channels (third axis). The functions `concatenate`, `stack` and + `block` provide more general stacking and concatenation operations. + + Parameters + ---------- + tup : sequence of tensors + The tensors must have the same shape along all but the second axis, + except 1-D tensors which can be any length. + + Returns + ------- + stacked : Tensor + The tensor formed by stacking the given tensors. + + See Also + -------- + stack : Join a sequence of tensors along a new axis. + vstack : Stack tensors in sequence vertically (row wise). + dstack : Stack tensors in sequence depth wise (along third axis). + concatenate : Join a sequence of tensors along an existing axis. + hsplit : Split tensor along second axis. + block : Assemble tensors from blocks. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array((1,2,3)) + >>> b = mt.array((2,3,4)) + >>> mt.hstack((a,b)).execute() + array([1, 2, 3, 2, 3, 4]) + >>> a = mt.array([[1],[2],[3]]) + >>> b = mt.array([[2],[3],[4]]) + >>> mt.hstack((a,b)).execute() + array([[1, 2], + [2, 3], + [3, 4]]) + + """ + if all(x.ndim == 1 for x in tup): + return concatenate(tup, axis=0) + else: + return concatenate(tup, axis=1) diff --git a/python/xorbits/_mars/tensor/merge/stack.py b/python/xorbits/_mars/tensor/merge/stack.py new file mode 100644 index 000000000..3bc88f711 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/stack.py @@ -0,0 +1,217 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import Int32Field +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import check_out_param, unify_chunks + + +class TensorStack(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.STACK + + _axis = Int32Field("axis") + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @property + def axis(self): + return self._axis + + def __call__(self, tensors, out=None): + if out is not None and not isinstance(out, Tensor): + raise TypeError(f"`out` must be a Tensor, got {type(out)} instead") + + shape = ( + tensors[0].shape[: self._axis] + + (len(tensors),) + + tensors[0].shape[self._axis :] + ) + tensor_order = TensorOrder.C_ORDER if out is None else out.order + t = self.new_tensor(tensors, shape, order=tensor_order) + + if out is None: + return t + + if out.shape != t.shape: + raise ValueError("Output tensor has wrong dimensionality") + check_out_param(out, t, "same_kind") + out.data = t.data + return out + + @classmethod + def tile(cls, op): + from ..indexing.slice import TensorSlice + + if has_unknown_shape(*op.inputs): + yield + + if len(set([inp.shape for inp in op.inputs])) != 1: + # check shape again when input has unknown shape + raise ValueError("all input tensors must have the same shape") + + inputs = yield from unify_chunks(*op.inputs) + output = op.outputs[0] + axis = op.axis + + output_nsplits = ( + inputs[0].nsplits[:axis] + ((1,) * len(inputs),) + inputs[0].nsplits[axis:] + ) + output_idxes = itertools.product( + *[range(len(nsplit)) for nsplit in output_nsplits] + ) + + out_chunks = [] + for idx in output_idxes: + input_idx = idx[:axis] + idx[axis + 1 :] + i = idx[axis] + input_chunk = inputs[i].cix[input_idx] + slices = ( + [slice(None)] * axis + + [np.newaxis] + + [slice(None)] * (len(input_idx) - axis) + ) + shape = input_chunk.shape[:axis] + (1,) + input_chunk.shape[axis:] + chunk_op = TensorSlice(slices=slices, dtype=op.dtype, sparse=op.sparse) + out_chunk = chunk_op.new_chunk( + [input_chunk], 
shape=shape, index=idx, order=output.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, output.shape, chunks=out_chunks, nsplits=output_nsplits + ) + + @classmethod + def execute(cls, ctx, op): + raw_inputs = [ctx[c.key] for c in op.inputs] + is_input_tuple = isinstance(raw_inputs[0], tuple) + input_tuple_len = len(raw_inputs[0]) if is_input_tuple else 1 + + if is_input_tuple: + # situation that stack is used during tiling, not created by user + inputs = list(itertools.chain.from_iterable(raw_inputs)) + else: + inputs = raw_inputs + # move all the data to the same device + inputs, device_id, xp = as_same_device(inputs, device=op.device, ret_extra=True) + if is_input_tuple: + inputs = [ + inputs[i * input_tuple_len : (i + 1) * input_tuple_len] + for i in range(len(raw_inputs)) + ] + else: + inputs = [[inp] for inp in inputs] + + axis = op.axis + out = op.outputs[0] + with device(device_id): + rets = [] + for i in range(input_tuple_len): + ret = xp.stack([inp[i] for inp in inputs], axis=axis) + # make sure order is identical to out's order + ret = ret.astype(ret.dtype, order=out.order.value, copy=False) + rets.append(ret) + ctx[out.key] = rets if is_input_tuple else rets[0] + + +def stack(tensors, axis=0, out=None): + """ + Join a sequence of tensors along a new axis. + + The `axis` parameter specifies the index of the new axis in the dimensions + of the result. For example, if ``axis=0`` it will be the first dimension + and if ``axis=-1`` it will be the last dimension. + + Parameters + ---------- + tensors : sequence of array_like + Each tensor must have the same shape. + axis : int, optional + The axis in the result tensor along which the input tensors are stacked. + out : Tensor, optional + If provided, the destination to place the result. The shape must be + correct, matching that of what stack would have returned if no + out argument were specified. + + Returns + ------- + stacked : Tensor + The stacked tensor has one more dimension than the input tensors. + + See Also + -------- + concatenate : Join a sequence of tensors along an existing axis. + split : Split tensor into a list of multiple sub-tensors of equal size. + block : Assemble tensors from blocks. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> arrays = [mt.random.randn(3, 4) for _ in range(10)] + >>> mt.stack(arrays, axis=0).shape + (10, 3, 4) + + >>> mt.stack(arrays, axis=1).shape + (3, 10, 4) + + >>> mt.stack(arrays, axis=2).shape + (3, 4, 10) + + >>> a = mt.array([1, 2, 3]) + >>> b = mt.array([2, 3, 4]) + >>> mt.stack((a, b)).execute() + array([[1, 2, 3], + [2, 3, 4]]) + + >>> mt.stack((a, b), axis=-1).execute() + array([[1, 2], + [2, 3], + [3, 4]]) + + """ + tensors = [astensor(t) for t in tensors] + + to_check_shapes = [] + for t in tensors: + if not any(np.isnan(s) for s in t.shape): + to_check_shapes.append(t.shape) + if to_check_shapes and len(set(to_check_shapes)) != 1: + raise ValueError("all input tensors must have the same shape") + + ndim = len(tensors[0].shape) + raw_axis = axis + if axis < 0: + axis = ndim + axis + 1 + if axis > ndim or axis < 0: + raise np.AxisError( + f"axis {raw_axis} is out of bounds for tensor of dimension {ndim}" + ) + + dtype = np.result_type(*[t.dtype for t in tensors]) + sparse = all(t.issparse() for t in tensors) + + op = TensorStack(axis=axis, dtype=dtype, sparse=sparse) + return op(tensors, out=out) diff --git a/python/xorbits/_mars/tensor/merge/tests/__init__.py b/python/xorbits/_mars/tensor/merge/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/merge/tests/test_merge.py b/python/xorbits/_mars/tensor/merge/tests/test_merge.py new file mode 100644 index 000000000..8c552a7fa --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/tests/test_merge.py @@ -0,0 +1,103 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ....core import tile +from ...datasource import empty, ones +from .. 
import concatenate, stack + + +def test_concatenate(): + a = ones((10, 20, 30), chunk_size=10) + b = ones((20, 20, 30), chunk_size=20) + + c = concatenate([a, b]) + assert c.shape == (30, 20, 30) + + a = ones((10, 20, 30), chunk_size=10) + b = ones((10, 20, 40), chunk_size=20) + + c = concatenate([a, b], axis=-1) + assert c.shape == (10, 20, 70) + + with pytest.raises(ValueError): + a = ones((10, 20, 30), chunk_size=10) + b = ones((20, 30, 30), chunk_size=20) + + concatenate([a, b]) + + with pytest.raises(ValueError): + a = ones((10, 20, 30), chunk_size=10) + b = ones((20, 20), chunk_size=20) + + concatenate([a, b]) + + a = ones((10, 20, 30), chunk_size=5) + b = ones((20, 20, 30), chunk_size=10) + + a, c = tile(a, concatenate([a, b])) + assert c.chunk_shape[0] == 4 + assert c.chunk_shape[1] == 4 + assert c.chunk_shape[2] == 6 + assert c.nsplits == ((5, 5, 10, 10), (5,) * 4, (5,) * 6) + assert c.cix[0, 0, 0].key == a.cix[0, 0, 0].key + assert c.cix[1, 0, 0].key == a.cix[1, 0, 0].key + + +def test_stack(): + raw_arrs = [ones((3, 4), chunk_size=2) for _ in range(10)] + arr2 = stack(raw_arrs, axis=0) + + assert arr2.shape == (10, 3, 4) + + arr2 = tile(arr2) + assert arr2.nsplits == ((1,) * 10, (2, 1), (2, 2)) + + arr3 = stack(raw_arrs, axis=1) + + assert arr3.shape == (3, 10, 4) + + arr3 = tile(arr3) + assert arr3.nsplits == ((2, 1), (1,) * 10, (2, 2)) + + arr4 = stack(raw_arrs, axis=2) + + assert arr4.shape == (3, 4, 10) + + arr4 = tile(arr4) + assert arr4.nsplits == ((2, 1), (2, 2), (1,) * 10) + + with pytest.raises(ValueError): + raw_arrs2 = [ones((3, 4), chunk_size=2), ones((4, 3), chunk_size=2)] + stack(raw_arrs2) + + with pytest.raises(np.AxisError): + stack(raw_arrs, axis=3) + + arr5 = tile(stack(raw_arrs, -1)) + assert arr5.nsplits == ((2, 1), (2, 2), (1,) * 10) + + arr6 = tile(stack(raw_arrs, -3)) + assert arr6.nsplits == ((1,) * 10, (2, 1), (2, 2)) + + with pytest.raises(np.AxisError): + stack(raw_arrs, axis=-4) + + with pytest.raises(TypeError): + stack(raw_arrs, out=1) + + with pytest.raises(ValueError): + stack(raw_arrs, empty((1, 10, 3, 4))) diff --git a/python/xorbits/_mars/tensor/merge/tests/test_merge_execution.py b/python/xorbits/_mars/tensor/merge/tests/test_merge_execution.py new file mode 100644 index 000000000..9e86ecd11 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/tests/test_merge_execution.py @@ -0,0 +1,371 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from ... 
import ( + append, + array, + block, + column_stack, + concatenate, + dstack, + hstack, + stack, + union1d, + vstack, +) +from ...datasource import empty, eye, ones, tensor, zeros + + +def test_concatenate_execution(setup): + a_data = np.random.rand(10, 20, 30) + b_data = np.random.rand(10, 20, 40) + c_data = np.random.rand(10, 20, 50) + + a = tensor(a_data, chunk_size=8) + b = tensor(b_data, chunk_size=10) + c = tensor(c_data, chunk_size=15) + + d = concatenate([a, b, c], axis=-1) + res = d.execute().fetch() + expected = np.concatenate([a_data, b_data, c_data], axis=-1) + np.testing.assert_array_equal(res, expected) + + a_data = sps.random(10, 30) + b_data = sps.rand(10, 40) + c_data = sps.rand(10, 50) + + a = tensor(a_data, chunk_size=8) + b = tensor(b_data, chunk_size=10) + c = tensor(c_data, chunk_size=15) + + d = concatenate([a, b, c], axis=-1) + res = d.execute().fetch() + expected = np.concatenate([a_data.A, b_data.A, c_data.A], axis=-1) + np.testing.assert_array_equal(res.toarray(), expected) + + +def test_stack_execution(setup): + raw = [np.random.randn(3, 4) for _ in range(10)] + arrs = [tensor(a, chunk_size=3) for a in raw] + + arr2 = stack(arrs) + res = arr2.execute().fetch() + assert np.array_equal(res, np.stack(raw)) is True + + arr3 = stack(arrs, axis=1) + res = arr3.execute().fetch() + assert np.array_equal(res, np.stack(raw, axis=1)) is True + + arr4 = stack(arrs, axis=2) + res = arr4.execute().fetch() + assert np.array_equal(res, np.stack(raw, axis=2)) is True + + raw2 = [np.asfortranarray(np.random.randn(3, 4)) for _ in range(10)] + arr5 = [tensor(a, chunk_size=3) for a in raw2] + + arr6 = stack(arr5) + res = arr6.execute().fetch() + expected = np.stack(raw2).copy("A") + np.testing.assert_array_equal(res, expected) + + arr7 = stack(arr5, out=empty((10, 3, 4), order="F")) + res = arr7.execute().fetch() + expected = np.stack(raw2, out=np.empty((10, 3, 4), order="F")).copy("A") + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + # test stack with unknown shapes + t = tensor(raw[0], chunk_size=3) + t2 = t[t[:, 0] > 0.0] + t3 = t2 + 1 + + arr8 = stack([t2, t3]) + result = arr8.execute().fetch() + e = raw[0] + e2 = e[e[:, 0] > 0.0] + e3 = e2 + 1 + np.testing.assert_array_equal(result, np.stack([e2, e3])) + + +def test_h_stack_execution(setup): + a_data = np.random.rand(10) + b_data = np.random.rand(20) + + a = tensor(a_data, chunk_size=8) + b = tensor(b_data, chunk_size=8) + + c = hstack([a, b]) + res = c.execute().fetch() + expected = np.hstack([a_data, b_data]) + assert np.array_equal(res, expected) is True + + a_data = np.random.rand(10, 20) + b_data = np.random.rand(10, 5) + + a = tensor(a_data, chunk_size=6) + b = tensor(b_data, chunk_size=8) + + c = hstack([a, b]) + res = c.execute().fetch() + expected = np.hstack([a_data, b_data]) + assert np.array_equal(res, expected) is True + + +def test_v_stack_execution(setup): + a_data = np.random.rand(10) + b_data = np.random.rand(10) + + a = tensor(a_data, chunk_size=8) + b = tensor(b_data, chunk_size=8) + + c = vstack([a, b]) + res = c.execute().fetch() + expected = np.vstack([a_data, b_data]) + assert np.array_equal(res, expected) is True + + a_data = np.random.rand(10, 20) + b_data = np.random.rand(5, 20) + + a = tensor(a_data, chunk_size=6) + b = tensor(b_data, chunk_size=8) + + c = vstack([a, b]) + res = c.execute().fetch() + expected = np.vstack([a_data, b_data]) + assert 
np.array_equal(res, expected) is True + + +def test_d_stack_execution(setup): + a_data = np.random.rand(10) + b_data = np.random.rand(10) + + a = tensor(a_data, chunk_size=8) + b = tensor(b_data, chunk_size=8) + + c = dstack([a, b]) + res = c.execute().fetch() + expected = np.dstack([a_data, b_data]) + assert np.array_equal(res, expected) is True + + a_data = np.random.rand(10, 20) + b_data = np.random.rand(10, 20) + + a = tensor(a_data, chunk_size=6) + b = tensor(b_data, chunk_size=8) + + c = dstack([a, b]) + res = c.execute().fetch() + expected = np.dstack([a_data, b_data]) + assert np.array_equal(res, expected) is True + + +def test_column_stack_execution(setup): + a_data = np.array((1, 2, 3)) + b_data = np.array((2, 3, 4)) + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=2) + + c = column_stack((a, b)) + res = c.execute().fetch() + expected = np.column_stack((a_data, b_data)) + np.testing.assert_equal(res, expected) + + a_data = np.random.rand(4, 2, 3) + b_data = np.random.rand(4, 2, 3) + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=2) + + c = column_stack((a, b)) + res = c.execute().fetch() + expected = np.column_stack((a_data, b_data)) + np.testing.assert_equal(res, expected) + + +def test_union1d_execution(setup): + rs = np.random.RandomState(0) + raw1 = rs.random(10) + raw2 = rs.random(9) + + t1 = tensor(raw1, chunk_size=3) + t2 = tensor(raw2, chunk_size=4) + + t = union1d(t1, t2, aggregate_size=1) + res = t.execute().fetch() + expected = np.union1d(raw1, raw2) + np.testing.assert_array_equal(res, expected) + + t = union1d(t1, t2) + res = t.execute().fetch() + expected = np.union1d(raw1, raw2) + np.testing.assert_array_equal(res, expected) + + +def test_block_execution(setup): + # arrays is a tuple. + with pytest.raises(TypeError): + block((1, 2, 3)) + + # List depths are mismatched. + with pytest.raises(ValueError): + block([[1, 2], [[3, 4]]]) + + # List at arrays cannot be empty. + with pytest.raises(ValueError): + block([]) + + # List at arrays[1] cannot be empty. + with pytest.raises(ValueError): + block([[1, 2], []]) + + # Mismatched array shapes. + with pytest.raises(ValueError): + block([eye(512), eye(512), ones((511, 1))]) + + # Test large block. + block([eye(512), eye(512), ones((512, 1))]) + + # Test block inputs a single array. 
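+    # (block on a bare array is expected to behave like atleast_1d and return
+    # the data unchanged, which the assertion below checks.)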
+ c = block(array([1, 2, 3])) + r = c.execute().fetch() + np.testing.assert_array_equal(r, array([1, 2, 3])) + + a = eye(2) * 2 + b = eye(3) * 3 + c = block([[a, zeros((2, 3))], [ones((3, 2)), b]]) + r = c.execute().fetch() + expected = array( + [ + [2.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 3.0, 0.0, 0.0], + [1.0, 1.0, 0.0, 3.0, 0.0], + [1.0, 1.0, 0.0, 0.0, 3.0], + ] + ) + np.testing.assert_array_equal(r, expected) + + # eye with different chunk sizes + a = eye(5, chunk_size=2) * 2 + b = eye(4, chunk_size=3) * 3 + c = block([[a, zeros((5, 4), chunk_size=4)], [ones((4, 5), chunk_size=5), b]]) + r = c.execute().fetch() + expected = array( + [ + [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 3.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 3.0], + ] + ) + np.testing.assert_array_equal(r, expected) + + # hstack([1, 2, 3]) + c = block([1, 2, 3]) + r = c.execute().fetch() + expected = array([1, 2, 3]) + np.testing.assert_array_equal(r, expected) + + # hstack([a, b, 10]) + a = array([1, 2, 3]) + b = array([2, 3, 4]) + c = block([a, b, 10]) + r = c.execute().fetch() + expected = array([1, 2, 3, 2, 3, 4, 10]) + np.testing.assert_array_equal(r, expected) + + # hstack([a, b, 10]) with different chunk sizes + a = array([1, 2, 3, 4, 5, 6, 7], chunk_size=3) + b = array([2, 3, 4, 5], chunk_size=4) + c = block([a, b, 10]) + r = c.execute().fetch() + expected = array([1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 10]) + np.testing.assert_array_equal(r, expected) + + # hstack([A, B]) + A = ones((2, 2), int) + B = 2 * A + c = block([A, B]) + r = c.execute().fetch() + expected = array([[1, 1, 2, 2], [1, 1, 2, 2]]) + np.testing.assert_array_equal(r, expected) + + # vstack([a, b]) + a = array([1, 2, 3]) + b = array([2, 3, 4]) + c = block([[a], [b]]) + r = c.execute().fetch() + expected = array([[1, 2, 3], [2, 3, 4]]) + np.testing.assert_array_equal(r, expected) + + # vstack([a, b]) with different chunk sizes + a = array([1, 2, 3, 4, 5, 6, 7], chunk_size=5) + b = array([2, 3, 4, 5, 6, 7, 8], chunk_size=6) + c = block([[a], [b]]) + r = c.execute().fetch() + expected = array([[1, 2, 3, 4, 5, 6, 7], [2, 3, 4, 5, 6, 7, 8]]) + np.testing.assert_array_equal(r, expected) + + # vstack([A, B]) + A = ones((2, 2), int) + B = 2 * A + c = block([[A], [B]]) + r = c.execute().fetch() + expected = array([[1, 1], [1, 1], [2, 2], [2, 2]]) + np.testing.assert_array_equal(r, expected) + + a = array(0) + b = array([1]) + # atleast_1d(a) + c = block([a]) + r = c.execute().fetch() + expected = array([0]) + np.testing.assert_array_equal(r, expected) + # atleast_1d(b) + c = block([b]) + r = c.execute().fetch() + expected = array([1]) + np.testing.assert_array_equal(r, expected) + # atleast_2d(a) + c = block([[a]]) + r = c.execute().fetch() + expected = array([[0]]) + np.testing.assert_array_equal(r, expected) + # atleast_2d(b) + c = block([[b]]) + r = c.execute().fetch() + expected = array([[1]]) + np.testing.assert_array_equal(r, expected) + + +@pytest.mark.parametrize("axis", [0, None]) +def test_append_execution(setup, axis): + raw1 = np.random.rand(10, 3) + raw2 = np.random.rand(6, 3) + + a1 = tensor(raw1, chunk_size=3) + a2 = tensor(raw2, chunk_size=4) + r = append(a1, a2, axis=axis) + 
result = r.execute().fetch() + expected = np.append(raw1, raw2, axis=axis) + np.testing.assert_array_equal(result, expected) diff --git a/python/xorbits/_mars/tensor/merge/union1d.py b/python/xorbits/_mars/tensor/merge/union1d.py new file mode 100644 index 000000000..275d7827c --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/union1d.py @@ -0,0 +1,55 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def union1d(ar1, ar2, aggregate_size=None): + """ + Find the union of two tensors. + + Return the unique, sorted tensor of values that are in either of the two + input tensors. + + Parameters + ---------- + ar1, ar2 : array_like + Input tensors. They are flattened if they are not already 1D. + + Returns + ------- + union1d : Tensor + Unique, sorted union of the input tensors. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.union1d([-1, 0, 1], [-2, 0, 2]).execute() + array([-2, -1, 0, 1, 2]) + + To find the union of more than two arrays, use functools.reduce: + + >>> from functools import reduce + >>> reduce(mt.union1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2])).execute() + array([1, 2, 3, 4, 6]) + """ + + from ..base import sort, unique + from .concatenate import concatenate + + result = unique(concatenate((ar1, ar2), axis=None), aggregate_size=aggregate_size) + if aggregate_size == 1: + return result + # make sure the result is sorted + # TODO(xuye.qin): remove when `mt.unique` supports sort shuffle + return sort(result) diff --git a/python/xorbits/_mars/tensor/merge/vstack.py b/python/xorbits/_mars/tensor/merge/vstack.py new file mode 100644 index 000000000..1d0764820 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/vstack.py @@ -0,0 +1,73 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..base import atleast_2d +from .concatenate import concatenate + + +def vstack(tup): + """ + Stack tensors in sequence vertically (row wise). + + This is equivalent to concatenation along the first axis after 1-D tensors + of shape `(N,)` have been reshaped to `(1,N)`. Rebuilds tensors divided by + `vsplit`. + + This function makes most sense for tensors with up to 3 dimensions. For + instance, for pixel-data with a height (first axis), width (second axis), + and r/g/b channels (third axis). The functions `concatenate`, `stack` and + `block` provide more general stacking and concatenation operations. 
+ + Parameters + ---------- + tup : sequence of tensors + The tensors must have the same shape along all but the first axis. + 1-D tensors must have the same length. + + Returns + ------- + stacked : Tensor + The tensor formed by stacking the given tensors, will be at least 2-D. + + See Also + -------- + stack : Join a sequence of tensors along a new axis. + hstack : Stack tensors in sequence horizontally (column wise). + dstack : Stack tensors in sequence depth wise (along third dimension). + concatenate : Join a sequence of tensors along an existing axis. + vsplit : Split tensor into a list of multiple sub-arrays vertically. + block : Assemble tensors from blocks. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([1, 2, 3]) + >>> b = mt.array([2, 3, 4]) + >>> mt.vstack((a,b)).execute() + array([[1, 2, 3], + [2, 3, 4]]) + + >>> a = mt.array([[1], [2], [3]]) + >>> b = mt.array([[2], [3], [4]]) + >>> mt.vstack((a,b)).execute() + array([[1], + [2], + [3], + [2], + [3], + [4]]) + + """ + return concatenate([atleast_2d(t) for t in tup], axis=0) diff --git a/python/xorbits/_mars/tensor/operands.py b/python/xorbits/_mars/tensor/operands.py new file mode 100644 index 000000000..df2934095 --- /dev/null +++ b/python/xorbits/_mars/tensor/operands.py @@ -0,0 +1,132 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
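+
+# This module defines the tensor-level operand base classes: ``TensorOperandMixin``
+# adds the ``new_tensor``/``new_tensors`` construction helpers and the chunk
+# concatenation/fusion hooks on top of ``TileableOperandMixin``, while
+# ``TensorOperand``, ``TensorHasInput``, ``TensorShuffleProxy``,
+# ``TensorMapReduceOperand`` and ``TensorFuse`` fix the output type to
+# ``OutputType.tensor`` and carry a ``dtype`` field.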
+ +from ..core import OutputType +from ..core.operand import ( + Fuse, + HasInput, + MapReduceOperand, + Operand, + ShuffleProxy, + TileableOperandMixin, +) +from ..serialization.serializables import DataTypeField +from ..utils import calc_nsplits + + +class TensorOperandMixin(TileableOperandMixin): + __slots__ = () + _op_module_ = "tensor" + _output_type_ = OutputType.tensor + + def new_tensors( + self, + inputs, + shape=None, + dtype=None, + order=None, + chunks=None, + nsplits=None, + output_limit=None, + kws=None, + **kw + ): + return self.new_tileables( + inputs, + shape=shape, + chunks=chunks, + nsplits=nsplits, + output_limit=output_limit, + kws=kws, + dtype=dtype, + order=order, + **kw + ) + + def new_tensor(self, inputs, shape, dtype=None, order=None, **kw): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new tensor with more than 1 outputs") + return self.new_tensors(inputs, shape=shape, dtype=dtype, order=order, **kw)[0] + + @classmethod + def concat_tileable_chunks(cls, tileable): + from .merge.concatenate import TensorConcatenate + + tensor = tileable + assert not tensor.is_coarse() + + op = TensorConcatenate(dtype=tensor.dtype) + chunk = TensorConcatenate(dtype=tensor.dtype).new_chunk( + tensor.chunks, shape=tensor.shape, index=(0,) * tileable.ndim + ) + return op.new_tensor( + [tensor], + tensor.shape, + chunks=[chunk], + nsplits=tuple((s,) for s in tensor.shape), + ) + + @classmethod + def create_tileable_from_chunks(cls, chunks, inputs=None, **kw): + chunk_idx_to_shape = {c.index: c.shape for c in chunks} + nsplits = calc_nsplits(chunk_idx_to_shape) + shape = tuple(sum(ns) for ns in nsplits) + op = chunks[0].op.copy().reset_key() + return op.new_tensor( + inputs, + shape=shape, + chunks=chunks, + nsplits=nsplits, + dtype=chunks[0].dtype, + **kw + ) + + def get_fuse_op_cls(self, _): + from .fuse import TensorFuseChunk + + return TensorFuseChunk + + +class TensorOperand(Operand): + _output_type_ = OutputType.tensor + + dtype = DataTypeField("dtype", default=None) + + +class TensorHasInput(HasInput): + _output_type_ = OutputType.tensor + + dtype = DataTypeField("dtype", default=None) + + +class TensorShuffleProxy(ShuffleProxy, TensorOperandMixin): + _output_type_ = OutputType.tensor + + dtype = DataTypeField("dtype", default=None) + + @classmethod + def execute(cls, ctx, op): + pass + + +class TensorMapReduceOperand(MapReduceOperand): + _output_type_ = OutputType.tensor + + dtype = DataTypeField("dtype", default=None) + + +class TensorFuse(Fuse): + _output_type_ = OutputType.tensor + + dtype = DataTypeField("dtype", default=None) diff --git a/python/xorbits/_mars/tensor/random/__init__.py b/python/xorbits/_mars/tensor/random/__init__.py new file mode 100644 index 000000000..ebd129073 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/__init__.py @@ -0,0 +1,166 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
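+
+# ``_install()`` below attaches every distribution function as a method of
+# ``RandomState``, and the module-level names (``rand``, ``randint``, ``normal``,
+# ...) are then bound to the shared ``_random_state`` instance so the package
+# mirrors the ``numpy.random`` API. A minimal usage sketch (parameter values
+# are illustrative only):
+#
+#     import mars.tensor as mt
+#
+#     mt.random.seed(0)
+#     t = mt.random.rand(1000, 1000, chunk_size=250)   # uniform over [0, 1)
+#     mt.random.normal(0.0, 1.0, size=(100,)).execute()
+#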
+ +from .beta import TensorRandBeta, beta +from .binomial import TensorBinomial, binomial +from .bytes import bytes +from .chisquare import TensorChisquareDist, chisquare +from .choice import TensorChoice, choice +from .core import RandomState, RandomStateField, _random_state +from .dirichlet import TensorDirichlet, dirichlet +from .exponential import TensorExponential, exponential +from .f import TensorF, f +from .gamma import TensorRandGamma, gamma +from .geometric import TensorGeometric, geometric +from .gumbel import TensorGumbel, gumbel +from .hypergeometric import TensorHypergeometric, hypergeometric +from .laplace import TensorLaplace, laplace +from .logistic import TensorLogistic, logistic +from .lognormal import TensorLognormal, lognormal +from .logseries import TensorLogseries, logseries +from .multinomial import TensorMultinomial, multinomial +from .multivariate_normal import TensorMultivariateNormal, multivariate_normal +from .negative_binomial import TensorNegativeBinomial, negative_binomial +from .noncentral_chisquare import TensorNoncentralChisquare, noncentral_chisquare +from .noncentral_f import TensorNoncentralF, noncentral_f +from .normal import TensorNormal, normal +from .pareto import TensorPareto, pareto +from .permutation import TensorPermutation, permutation +from .poisson import TensorPoisson, poisson +from .power import TensorRandomPower, power +from .rand import TensorRand, rand +from .randint import TensorRandint, randint +from .randn import TensorRandn, randn +from .random_integers import TensorRandomIntegers, random_integers +from .random_sample import TensorRandomSample, random_sample +from .rayleigh import TensorRayleigh, rayleigh +from .shuffle import shuffle +from .standard_cauchy import TensorStandardCauchy, standard_cauchy +from .standard_exponential import TensorStandardExponential, standard_exponential +from .standard_gamma import TensorStandardGamma, standard_gamma +from .standard_normal import TensorStandardNormal, standard_normal +from .standard_t import TensorStandardT, standard_t +from .triangular import TensorTriangular, triangular +from .uniform import TensorUniform, uniform +from .vonmises import TensorVonmises, vonmises +from .wald import TensorWald, wald +from .weibull import TensorWeibull, weibull +from .zipf import TensorZipf, zipf + + +def _install(): + setattr(RandomState, "rand", rand) + setattr(RandomState, "randn", randn) + setattr(RandomState, "randint", randint) + setattr(RandomState, "random_integers", random_integers) + setattr(RandomState, "random_sample", random_sample) + setattr(RandomState, "ranf", random_sample) + setattr(RandomState, "random", random_sample) + setattr(RandomState, "sample", random_sample) + setattr(RandomState, "choice", choice) + setattr(RandomState, "bytes", bytes) + setattr(RandomState, "beta", beta) + setattr(RandomState, "binomial", binomial) + setattr(RandomState, "chisquare", chisquare) + setattr(RandomState, "dirichlet", dirichlet) + setattr(RandomState, "exponential", exponential) + setattr(RandomState, "f", f) + setattr(RandomState, "gamma", gamma) + setattr(RandomState, "geometric", geometric) + setattr(RandomState, "gumbel", gumbel) + setattr(RandomState, "hypergeometric", hypergeometric) + setattr(RandomState, "laplace", laplace) + setattr(RandomState, "logistic", logistic) + setattr(RandomState, "lognormal", lognormal) + setattr(RandomState, "logseries", logseries) + setattr(RandomState, "multinomial", multinomial) + setattr(RandomState, "multivariate_normal", multivariate_normal) + 
setattr(RandomState, "negative_binomial", negative_binomial) + setattr(RandomState, "noncentral_chisquare", noncentral_chisquare) + setattr(RandomState, "noncentral_f", noncentral_f) + setattr(RandomState, "normal", normal) + setattr(RandomState, "pareto", pareto) + setattr(RandomState, "poisson", poisson) + setattr(RandomState, "power", power) + setattr(RandomState, "rayleigh", rayleigh) + setattr(RandomState, "standard_cauchy", standard_cauchy) + setattr(RandomState, "standard_exponential", standard_exponential) + setattr(RandomState, "standard_gamma", standard_gamma) + setattr(RandomState, "standard_normal", standard_normal) + setattr(RandomState, "standard_t", standard_t) + setattr(RandomState, "triangular", triangular) + setattr(RandomState, "uniform", uniform) + setattr(RandomState, "vonmises", vonmises) + setattr(RandomState, "wald", wald) + setattr(RandomState, "weibull", weibull) + setattr(RandomState, "zipf", zipf) + setattr(RandomState, "permutation", permutation) + setattr(RandomState, "shuffle", shuffle) + + +_install() +del _install + + +seed = _random_state.seed + +rand = _random_state.rand +randn = _random_state.randn +randint = _random_state.randint +random_integers = _random_state.random_integers +random_sample = _random_state.random_sample +random = _random_state.random +ranf = _random_state.ranf +sample = _random_state.sample +choice = _random_state.choice +bytes = _random_state.bytes + +permutation = _random_state.permutation +shuffle = _random_state.shuffle + +beta = _random_state.beta +binomial = _random_state.binomial +chisquare = _random_state.chisquare +dirichlet = _random_state.dirichlet +exponential = _random_state.exponential +f = _random_state.f +gamma = _random_state.gamma +geometric = _random_state.geometric +gumbel = _random_state.gumbel +hypergeometric = _random_state.hypergeometric +laplace = _random_state.laplace +logistic = _random_state.logistic +lognormal = _random_state.lognormal +logseries = _random_state.logseries +multinomial = _random_state.multinomial +multivariate_normal = _random_state.multivariate_normal +negative_binomial = _random_state.negative_binomial +noncentral_chisquare = _random_state.noncentral_chisquare +noncentral_f = _random_state.noncentral_f +normal = _random_state.normal +pareto = _random_state.pareto +poisson = _random_state.poisson +power = _random_state.power +rayleigh = _random_state.rayleigh +standard_cauchy = _random_state.standard_cauchy +standard_exponential = _random_state.standard_exponential +standard_gamma = _random_state.standard_gamma +standard_normal = _random_state.standard_normal +standard_t = _random_state.standard_t +triangular = _random_state.triangular +uniform = _random_state.uniform +vonmises = _random_state.vonmises +wald = _random_state.wald +weibull = _random_state.weibull +zipf = _random_state.zipf diff --git a/python/xorbits/_mars/tensor/random/beta.py b/python/xorbits/_mars/tensor/random/beta.py new file mode 100644 index 000000000..7cddc16a3 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/beta.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorRandBeta(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["a", "b"] + _op_type_ = OperandDef.RAND_BETA + + _fields_ = "a", "b", "size" + a = AnyField("a") + b = AnyField("b") + _func_name = "beta" + + def __call__(self, a, b, chunk_size=None): + return self.new_tensor([a, b], None, raw_chunk_size=chunk_size) + + +def beta(random_state, a, b, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Beta distribution. + + The Beta distribution is a special case of the Dirichlet distribution, + and is related to the Gamma distribution. It has the probability + distribution function + + .. math:: f(x; a,b) = \frac{1}{B(\alpha, \beta)} x^{\alpha - 1} + (1 - x)^{\beta - 1}, + + where the normalisation, B, is the beta function, + + .. math:: B(\alpha, \beta) = \int_0^1 t^{\alpha - 1} + (1 - t)^{\beta - 1} dt. + + It is often seen in Bayesian inference and order statistics. + + Parameters + ---------- + a : float or array_like of floats + Alpha, non-negative. + b : float or array_like of floats + Beta, non-negative. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``a`` and ``b`` are both scalars. + Otherwise, ``mt.broadcast(a, b).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized beta distribution. + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .beta(handle_array(a), handle_array(b), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandBeta(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(a, b, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/binomial.py b/python/xorbits/_mars/tensor/random/binomial.py new file mode 100644 index 000000000..e2dd5700e --- /dev/null +++ b/python/xorbits/_mars/tensor/random/binomial.py @@ -0,0 +1,135 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorBinomial(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["n", "p"] + _op_type_ = OperandDef.RAND_BINOMIAL + + _fields_ = "n", "p", "size" + n = AnyField("n") + p = AnyField("p") + _func_name = "binomial" + + def __call__(self, n, p, chunk_size=None): + return self.new_tensor([n, p], None, raw_chunk_size=chunk_size) + + +def binomial(random_state, n, p, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a binomial distribution. + + Samples are drawn from a binomial distribution with specified + parameters, n trials and p probability of success where + n an integer >= 0 and p is in the interval [0,1]. (n may be + input as a float, but it is truncated to an integer in use) + + Parameters + ---------- + n : int or array_like of ints + Parameter of the distribution, >= 0. Floats are also accepted, + but they will be truncated to integers. + p : float or array_like of floats + Parameter of the distribution, >= 0 and <=1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``n`` and ``p`` are both scalars. + Otherwise, ``mt.broadcast(n, p).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized binomial distribution, where + each sample is equal to the number of successes over the n trials. + + See Also + -------- + scipy.stats.binom : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the binomial distribution is + + .. math:: P(N) = \binom{n}{N}p^N(1-p)^{n-N}, + + where :math:`n` is the number of trials, :math:`p` is the probability + of success, and :math:`N` is the number of successes. + + When estimating the standard error of a proportion in a population by + using a random sample, the normal distribution works well unless the + product p*n <=5, where p = population proportion estimate, and n = + number of samples, in which case the binomial distribution is used + instead. For example, a sample of 15 people shows 4 who are left + handed, and 11 who are right handed. Then p = 4/15 = 27%. 0.27*15 = 4, + so the binomial distribution should be used in this case. + + References + ---------- + .. [1] Dalgaard, Peter, "Introductory Statistics with R", + Springer-Verlag, 2002. + .. [2] Glantz, Stanton A. "Primer of Biostatistics.", McGraw-Hill, + Fifth Edition, 2002. + .. [3] Lentner, Marvin, "Elementary Applied Statistics", Bogden + and Quigley, 1972. + .. [4] Weisstein, Eric W. "Binomial Distribution." From MathWorld--A + Wolfram Web Resource. + http://mathworld.wolfram.com/BinomialDistribution.html + .. 
[5] Wikipedia, "Binomial distribution", + http://en.wikipedia.org/wiki/Binomial_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> n, p = 10, .5 # number of trials, probability of each trial + >>> s = mt.random.binomial(n, p, 1000).execute() + # result of flipping a coin 10 times, tested 1000 times. + + A real world example. A company drills 9 wild-cat oil exploration + wells, each with an estimated probability of success of 0.1. All nine + wells fail. What is the probability of that happening? + + Let's do 20,000 trials of the model, and count the number that + generate zero positive results. + + >>> (mt.sum(mt.random.binomial(9, 0.1, 20000) == 0)/20000.).execute() + # answer = 0.38885, or 38%. + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .binomial(handle_array(n), handle_array(p), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorBinomial(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(n, p, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/bytes.py b/python/xorbits/_mars/tensor/random/bytes.py new file mode 100644 index 000000000..707f922bb --- /dev/null +++ b/python/xorbits/_mars/tensor/random/bytes.py @@ -0,0 +1,37 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def bytes(random_state, length): + """ + Return random bytes. + + Parameters + ---------- + length : int + Number of random bytes. + + Returns + ------- + out : str + String of length `length`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.bytes(10) + ' eh\x85\x022SZ\xbf\xa4' #random + """ + return random_state._random_state.bytes(length) diff --git a/python/xorbits/_mars/tensor/random/chisquare.py b/python/xorbits/_mars/tensor/random/chisquare.py new file mode 100644 index 000000000..2d1736b5b --- /dev/null +++ b/python/xorbits/_mars/tensor/random/chisquare.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorChisquareDist(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["df"] + _op_type_ = OperandDef.RAND_CHISQUARE + + _fields_ = "df", "size" + df = AnyField("df") + _func_name = "chisquare" + + def __call__(self, df, chunk_size=None): + return self.new_tensor([df], self.size, raw_chunk_size=chunk_size) + + +def chisquare(random_state, df, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a chi-square distribution. + + When `df` independent random variables, each with standard normal + distributions (mean 0, variance 1), are squared and summed, the + resulting distribution is chi-square (see Notes). This distribution + is often used in hypothesis testing. + + Parameters + ---------- + df : float or array_like of floats + Number of degrees of freedom, should be > 0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``df`` is a scalar. Otherwise, + ``mt.array(df).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized chi-square distribution. + + Raises + ------ + ValueError + When `df` <= 0 or when an inappropriate `size` (e.g. ``size=-1``) + is given. + + Notes + ----- + The variable obtained by summing the squares of `df` independent, + standard normally distributed random variables: + + .. math:: Q = \sum_{i=0}^{\mathtt{df}} X^2_i + + is chi-square distributed, denoted + + .. math:: Q \sim \chi^2_k. + + The probability density function of the chi-squared distribution is + + .. math:: p(x) = \frac{(1/2)^{k/2}}{\Gamma(k/2)} + x^{k/2 - 1} e^{-x/2}, + + where :math:`\Gamma` is the gamma function, + + .. math:: \Gamma(x) = \int_0^{-\infty} t^{x - 1} e^{-t} dt. + + References + ---------- + .. [1] NIST "Engineering Statistics Handbook" + http://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.chisquare(2,4).execute() + array([ 1.89920014, 9.00867716, 3.13710533, 5.62318272]) + """ + if dtype is None: + dtype = np.random.RandomState().chisquare(handle_array(df), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorChisquareDist(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(df, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/choice.py b/python/xorbits/_mars/tensor/random/choice.py new file mode 100644 index 000000000..7d009140a --- /dev/null +++ b/python/xorbits/_mars/tensor/random/choice.py @@ -0,0 +1,381 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Integral + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...core import recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + KeyField, + TupleField, +) +from ...utils import ceildiv, has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, TensorOrder +from ..datasource import arange, array +from ..operands import TensorOperandMixin +from ..utils import decide_chunk_sizes, gen_random_seeds, normalize_chunk_sizes +from .core import RandomState, TensorRandomOperand + + +class TensorChoice(TensorRandomOperand, TensorOperandMixin): + _op_type_ = OperandDef.RAND_CHOICE + + a = AnyField("a") + size = TupleField("size", FieldTypes.int64) + replace = BoolField("replace") + p = KeyField("p") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if isinstance(self.a, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self.a = self._inputs[0] + if isinstance(self.p, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self.p = self._inputs[-1] + + def __call__(self, a, p, chunk_size=None): + inputs = [] + if isinstance(a, TENSOR_TYPE): + inputs.append(a) + if isinstance(p, TENSOR_TYPE): + inputs.append(p) + return self.new_tensor( + inputs, + shape=self.size, + raw_chunk_size=chunk_size, + order=TensorOrder.C_ORDER, + ) + + @classmethod + def _tile_one_chunk(cls, op, a, p): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_op.seed = gen_random_seeds(1, np.random.RandomState(op.seed))[0] + chunk_inputs = [] + if isinstance(a, TENSOR_TYPE): + chunk_op.a = a.chunks[0] + chunk_inputs.append(chunk_op.a) + else: + chunk_op.a = a + if isinstance(p, TENSOR_TYPE): + chunk_op.p = p.chunks[0] + chunk_inputs.append(chunk_op.p) + else: + chunk_op.p = p + chunk = chunk_op.new_chunk( + chunk_inputs, shape=out.shape, index=(0,) * out.ndim, order=out.order + ) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=[chunk], + nsplits=tuple((s,) for s in out.shape), + ) + + @classmethod + def _tile_sample_with_replacement(cls, op, a, nsplits): + out_shape = tuple(sum(ns) for ns in nsplits) + out_size = np.prod(out_shape).item() + most_chunk_size = np.prod([max(ns) for ns in nsplits]).item() + + is_a_int = False + if isinstance(a, Integral): + is_a_int = True + a_size = a + else: + a = array(a) + a_size = a.size + + rs = RandomState.from_numpy(np.random.RandomState(op.seed)) + + if is_a_int: + # the indices is just the result + ret = rs.randint(a_size, size=out_shape, chunk_size=nsplits) + else: + # gen indices first, need to be flattened + indices = rs.randint(a_size, size=out_size, chunk_size=most_chunk_size) + # get result via fancy indexing + ret = a[indices] + if len(out_shape) > 1: + # reshape back if out's ndim > 1 + ret = ret.reshape(out_shape) + ret = ret.rechunk(nsplits) + + return [(yield from recursive_tile(ret))] + + @classmethod + def _tile_sample_without_replacement(cls, op, a, nsplits): + from ..base import searchsorted + from ..indexing.getitem import 
TensorIndex + from ..merge.stack import TensorStack + + out = op.outputs[0] + out_shape = tuple(sum(ns) for ns in nsplits) + # to sample count + m = np.prod(out_shape).item() + + if isinstance(a, Integral): + a_size = a + a = arange(a) + else: + a = array(a) + a_size = a.size + a = yield from recursive_tile(a) + + if any(cs < m for cs in a.nsplits[0]): + # make sure all chunk > m + n_chunk = min(max(a.size // (m + 1), 1), a.chunk_shape[0]) + chunk_size = ceildiv(a.size, n_chunk) + chunk_sizes = normalize_chunk_sizes(a.size, chunk_size)[0] + if chunk_sizes[-1] < m and len(chunk_sizes) > 1: + # the last chunk may still less than m + # merge it into previous one + chunk_sizes[-2] += chunk_sizes[-1] + chunk_sizes = chunk_sizes[:-1] + a = yield from recursive_tile(a.rechunk({0: chunk_sizes})) + if len(chunk_sizes) == 1: + return cls._tile_one_chunk(op, a, None) + + # for each chunk in a, do regular sampling + sampled_chunks = [] + sample_seeds = gen_random_seeds(len(a.chunks), np.random.RandomState(op.seed)) + for seed, chunk in zip(sample_seeds, a.chunks): + chunk_op = op.copy().reset_key() + chunk_op._a = chunk + chunk_op.size = (m,) + chunk_op.seed = seed + sampled_chunk = chunk_op.new_chunk( + [chunk], shape=(m,), order=out.order, index=chunk.index + ) + sampled_chunks.append(sampled_chunk) + + if len(sampled_chunks) == 1: + out_chunk = sampled_chunks[0] + else: + stacked_chunk = TensorStack( + axis=0, dtype=sampled_chunks[0].dtype + ).new_chunk( + sampled_chunks, shape=(len(a.chunks), m), order=TensorOrder.C_ORDER + ) + + # gen indices with length m from 0...a.size + state = RandomState.from_numpy(np.random.RandomState(op.seed)) + indices = state.randint(a_size, size=(m,)) + cum_offsets = np.cumsum(a.nsplits[0]) + ind = yield from recursive_tile( + searchsorted(cum_offsets, indices, side="right") + ) + ind_chunk = ind.chunks[0] + + # do fancy index to find result + arange_tensor = yield from recursive_tile(arange(m)) + indexes = [ind_chunk, arange_tensor.chunks[0]] + out_chunk = TensorIndex( + dtype=stacked_chunk.dtype, indexes=indexes + ).new_chunk( + [stacked_chunk] + list(indexes), shape=(m,), order=TensorOrder.C_ORDER + ) + + ret = op.copy().new_tensor( + op.inputs, shape=(m,), order=out.order, nsplits=((m,),), chunks=[out_chunk] + ) + if len(out_shape) > 0: + ret = yield from recursive_tile(ret.reshape(out_shape)) + ret = yield from recursive_tile(ret.rechunk(nsplits)) + return [ret] + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + + out = op.outputs[0] + chunk_size = out.extra_params.raw_chunk_size or options.chunk_size + nsplits = decide_chunk_sizes(out.shape, chunk_size, out.dtype.itemsize) + inputs = op.inputs + + a, p = op.a, op.p + if p is not None: + # we cannot handle p in a parallel fashion + inputs = [] + if isinstance(a, TENSOR_TYPE): + a = yield from recursive_tile(a.rechunk(a.shape)) + inputs.append(a) + p = yield from recursive_tile(p.rechunk(p.shape)) + inputs.append(p) + + # ignore nsplits if p is specified + nsplits = ((s,) for s in out.shape) + + # all inputs and outputs has 1 chunk + if all(len(inp.chunks) == 1 for inp in inputs) and all( + len(ns) == 1 for ns in nsplits + ): + return cls._tile_one_chunk(op, a, p) + + if op.replace: + return (yield from cls._tile_sample_with_replacement(op, a, nsplits)) + else: + return (yield from cls._tile_sample_without_replacement(op, a, nsplits)) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, 
ret_extra=True + ) + if isinstance(op.a, TENSOR_CHUNK_TYPE): + a = inputs[0] + else: + a = op.a + if isinstance(op.p, TENSOR_CHUNK_TYPE): + p = inputs[-1] + else: + p = op.p + + with device(device_id): + rs = xp.random.RandomState(op.seed) + ctx[op.outputs[0].key] = rs.choice(a, size=op.size, replace=op.replace, p=p) + + +def choice(random_state, a, size=None, replace=True, p=None, chunk_size=None, gpu=None): + """ + Generates a random sample from a given 1-D array + + Parameters + ----------- + a : 1-D array-like or int + If a tensor, a random sample is generated from its elements. + If an int, the random sample is generated as if a were mt.arange(a) + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + replace : boolean, optional + Whether the sample is with or without replacement + p : 1-D array-like, optional + The probabilities associated with each entry in a. + If not given the sample assumes a uniform distribution over all + entries in a. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + + Returns + -------- + samples : single item or tensor + The generated random samples + + Raises + ------- + ValueError + If a is an int and less than zero, if a or p are not 1-dimensional, + if a is an array-like of size 0, if p is not a vector of + probabilities, if a and p have different lengths, or if + replace=False and the sample size is greater than the population + size + + See Also + --------- + randint, shuffle, permutation + + Examples + --------- + Generate a uniform random sample from mt.arange(5) of size 3: + + >>> import mars.tensor as mt + + >>> mt.random.choice(5, 3).execute() + array([0, 3, 4]) + >>> #This is equivalent to mt.random.randint(0,5,3) + + Generate a non-uniform random sample from np.arange(5) of size 3: + + >>> mt.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0]).execute() + array([3, 3, 0]) + + Generate a uniform random sample from mt.arange(5) of size 3 without + replacement: + + >>> mt.random.choice(5, 3, replace=False).execute() + array([3,1,0]) + >>> #This is equivalent to np.random.permutation(np.arange(5))[:3] + + Generate a non-uniform random sample from mt.arange(5) of size + 3 without replacement: + + >>> mt.random.choice(5, 3, replace=False, p=[0.1, 0, 0.3, 0.6, 0]).execute() + array([2, 3, 0]) + + Any of the above can be repeated with an arbitrary array-like + instead of just integers. 
For instance: + + >>> aa_milne_arr = ['pooh', 'rabbit', 'piglet', 'Christopher'] + >>> np.random.choice(aa_milne_arr, 5, p=[0.5, 0.1, 0.1, 0.3]) + array(['pooh', 'pooh', 'pooh', 'Christopher', 'piglet'], + dtype='|S11') + """ + + if isinstance(a, Integral): + if a <= 0: + raise ValueError("a must be greater than 0") + a_size = a + dtype = np.random.choice( + 1, size=(), p=np.array([1]) if p is not None else p + ).dtype + else: + a = array(a) + if a.ndim != 1: + raise ValueError("a must be one dimensional") + a_size = a.size + dtype = a.dtype + + if p is not None: + if not isinstance(p, TENSOR_TYPE): + p = np.asarray(p) + if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0): + raise ValueError("probabilities do not sum to 1") + p = array(p, chunk_size=p.size) + if p.ndim != 1: + raise ValueError("p must be one dimensional") + + if size is None: + size = () + length = 1 + else: + try: + tuple(size) + length = np.prod(size) + except TypeError: + length = size + if replace is False and length > a_size: + raise ValueError( + "Cannot take a larger sample than population when 'replace=False'" + ) + + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorChoice( + a=a, p=p, seed=seed, replace=replace, size=size, dtype=dtype, gpu=gpu + ) + return op(a, p, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/core.py b/python/xorbits/_mars/tensor/random/core.py new file mode 100644 index 000000000..3623d6773 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/core.py @@ -0,0 +1,416 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import itertools +from collections.abc import Iterable +from contextlib import contextmanager + +import numpy as np + +from ...config import options +from ...core import recursive_tile +from ...serialization.serializables import FieldTypes, Int32Field, TupleField +from ..array_utils import array_module, device +from ..base import broadcast_to +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ..datasource import tensor as astensor +from ..operands import TensorMapReduceOperand, TensorOperand, TensorOperandMixin +from ..utils import broadcast_shape, decide_chunk_sizes, gen_random_seeds + + +class RandomState: + def __init__(self, seed=None): + self._random_state = np.random.RandomState(seed=seed) + + def seed(self, seed=None): + """ + Seed the generator. + + This method is called when `RandomState` is initialized. It can be + called again to re-seed the generator. For details, see `RandomState`. + + Parameters + ---------- + seed : int or 1-d array_like, optional + Seed for `RandomState`. + Must be convertible to 32 bit unsigned integers. 
+ + See Also + -------- + RandomState + """ + self._random_state.seed(seed=seed) + + def to_numpy(self): + return self._random_state + + @classmethod + def from_numpy(cls, np_random_state): + state = RandomState() + state._random_state = np_random_state + return state + + @classmethod + def _handle_size(cls, size): + if size is None: + return size + try: + return tuple(int(s) for s in size) + except TypeError: + return (size,) + + +_random_state = RandomState() + + +def handle_array(arg): + if not isinstance(arg, TENSOR_TYPE): + if not isinstance(arg, Iterable): + return arg + + arg = np.asarray(arg) + return arg[(0,) * max(1, arg.ndim)] + elif hasattr(arg, "op") and hasattr(arg.op, "data"): + return arg.op.data[(0,) * max(1, arg.ndim)] + + return np.empty((0,), dtype=arg.dtype) + + +class TensorRandomOperandMixin(TensorOperandMixin): + __slots__ = () + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + nsplits = decide_chunk_sizes(tensor.shape, chunk_size, tensor.dtype.itemsize) + fields = getattr(op, "_input_fields_", []) + to_one_chunk_fields = set(getattr(op, "_into_one_chunk_fields_", list())) + + new_inputs = [] + changed = False + for field in fields: + t = getattr(op, field) + if not isinstance(t, TENSOR_TYPE): + continue + + if field not in to_one_chunk_fields: + t_nsplits = nsplits + else: + t_nsplits = t.shape # into 1 chunk + rechunked = t.rechunk(t_nsplits) + if rechunked is not t: + yield from recursive_tile(rechunked) + changed = True + new_inputs.append(rechunked) + else: + new_inputs.append(t) + if changed: + op.inputs = new_inputs + + idxes = list(itertools.product(*[range(len(s)) for s in nsplits])) + seeds = gen_random_seeds(len(idxes), np.random.RandomState(op.seed)) + + out_chunks = [] + for seed, idx, shape in zip(seeds, idxes, itertools.product(*nsplits)): + inputs = [] + for inp in op.inputs: + if len(inp.chunks) == 1: + inputs.append(inp.chunks[0]) + else: + inputs.append(inp.cix[idx]) + try: + s = len(tuple(op.size)) + size = shape[:s] + except TypeError: + if op.size is None: + size = None + else: + size = shape[:1] + except AttributeError: + size = shape + + chunk_op = op.copy().reset_key() + chunk_op.seed = int(seed) + chunk_op.size = size + out_chunk = chunk_op.new_chunk( + inputs, shape=shape, index=idx, order=tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=nsplits, + **tensor.extra_params + ) + + @classmethod + def execute(cls, ctx, op): + xp = array_module(op.gpu) + if xp is np: + device_id = -1 + else: + device_id = op.device or 0 + get_val = lambda x: ctx[x.key] if isinstance(x, TENSOR_CHUNK_TYPE) else x + + with device(device_id): + rs = xp.random.RandomState(op.seed) + + method_name = getattr(cls, "_func_name") + try: + if method_name in ("rand", "randn"): + try: + res = getattr(rs, method_name)(*op.size, dtype=op.dtype) + except TypeError: + res = getattr(rs, method_name)(*op.size) + elif method_name == "randint": + try: + res = rs.randint( + get_val(op.low), + get_val(op.high), + size=op.size, + dtype=op.dtype, + ) + except TypeError: + res = rs.randint( + get_val(op.low), get_val(op.high), size=op.size + ) + else: + try: + res = getattr(rs, method_name)( + *(get_val(getattr(op, arg)) for arg in op.args), + dtype=op.dtype + ) + except TypeError: + res = getattr(rs, method_name)( + *(get_val(getattr(op, arg)) for arg in op.args) + ) + if 
hasattr(res, "dtype") and res.dtype != op.dtype: + res = res.astype(op.dtype, copy=False) + if xp is not np: + ctx[op.outputs[0].key] = xp.asarray(res) + else: + ctx[op.outputs[0].key] = res + except AttributeError: + if xp is not np: + # cupy cannot generate data, fallback to numpy + rs = np.random.RandomState(op.seed) + if method_name in ("rand", "randn"): + res = getattr(rs, method_name)(*op.size) + else: + res = getattr(rs, method_name)( + *(get_val(getattr(op, arg)) for arg in op.args) + ) + if res.dtype != op.dtype: + res = res.astype(op.dtype, copy=False) + ctx[op.outputs[0].key] = xp.asarray(res) + else: + raise + + def _calc_shape(self, shapes): + shapes = list(shapes) + if getattr(self, "size", None) is not None: + shapes.append(getattr(self, "size")) + return broadcast_shape(*shapes) + + @classmethod + def _handle_arg(cls, arg, chunk_size): + if isinstance(arg, (list, np.ndarray)): + arg = astensor(arg, chunk_size=chunk_size) + + return arg + + @contextmanager + def _get_inputs_shape_by_given_fields( + self, inputs, shape, raw_chunk_size=None, tensor=True + ): + fields = getattr(self, "_input_fields_", []) + to_one_chunk_fields = set(getattr(self, "_into_one_chunk_fields_", list())) + + field_to_obj = dict() + to_broadcast_shapes = [] + if fields: + if getattr(self, fields[0], None) is None: + # create from beginning + for field, val in zip(fields, inputs): + if field not in to_one_chunk_fields: + if isinstance(val, list): + val = np.asarray(val) + if tensor: + val = self._handle_arg(val, raw_chunk_size) + if isinstance(val, TENSOR_TYPE + TENSOR_CHUNK_TYPE): + field_to_obj[field] = val + if field not in to_one_chunk_fields: + to_broadcast_shapes.append(val.shape) + setattr(self, field, val) + else: + inputs_iter = iter(inputs) + for field in fields: + if isinstance( + getattr(self, field), TENSOR_TYPE + TENSOR_CHUNK_TYPE + ): + field_to_obj[field] = next(inputs_iter) + + if tensor: + if shape is None: + shape = self._calc_shape(to_broadcast_shapes) + + for field, inp in field_to_obj.items(): + if field not in to_one_chunk_fields: + field_to_obj[field] = broadcast_to(inp, shape) + + yield [field_to_obj[f] for f in fields if f in field_to_obj], shape + + inputs_iter = iter(getattr(self, "_inputs")) + for field in fields: + if field in field_to_obj: + setattr(self, field, next(inputs_iter)) + + @classmethod + def _get_shape(cls, kws, kw): + if kw.get("shape") is not None: + return kw.get("shape") + elif kws is not None and len(kws) > 0: + return kws[0].get("shape") + + def _new_tileables(self, inputs, kws=None, **kw): + raw_chunk_size = kw.get("chunk_size", None) + shape = self._get_shape(kws, kw) + with self._get_inputs_shape_by_given_fields( + inputs, shape, raw_chunk_size, True + ) as (inputs, shape): + kw["shape"] = shape + return super()._new_tileables(inputs, kws=kws, **kw) + + def _new_chunks(self, inputs, kws=None, **kw): + shape = self._get_shape(kws, kw) + with self._get_inputs_shape_by_given_fields(inputs, shape, None, False) as ( + inputs, + shape, + ): + kw["shape"] = shape + return super()._new_chunks(inputs, kws=kws, **kw) + + +def _on_serialize_random_state(rs): + return rs.get_state() if rs is not None else None + + +def _on_deserialize_random_state(tup): + if tup is None: + return None + + rs = np.random.RandomState() + rs.set_state(tup) + return rs + + +def RandomStateField(name, **kwargs): + kwargs.update( + dict( + on_serialize=_on_serialize_random_state, + on_deserialize=_on_deserialize_random_state, + ) + ) + return TupleField(name, **kwargs) + + +class 
TensorSeedOperandMixin(object): + @property + def seed(self): + return getattr(self, "seed", None) + + @property + def args(self): + if hasattr(self, "_fields_"): + return self._fields_ + else: + return [ + field + for field in self._FIELDS + if field not in TensorRandomOperand._FIELDS + ] + + +class TensorRandomOperand(TensorSeedOperandMixin, TensorOperand): + seed = Int32Field("seed") + + def __init__(self, dtype=None, **kw): + dtype = np.dtype(dtype) if dtype is not None else dtype + if "state" in kw: + kw["_state"] = kw.pop("state") + super().__init__(dtype=dtype, **kw) + + +class TensorRandomMapReduceOperand(TensorSeedOperandMixin, TensorMapReduceOperand): + seed = Int32Field("seed") + + def __init__(self, dtype=None, **kw): + dtype = np.dtype(dtype) if dtype is not None else dtype + if "state" in kw: + kw["_state"] = kw.pop("state") + super().__init__(dtype=dtype, **kw) + + +class TensorDistribution(TensorRandomOperand): + size = TupleField("size", FieldTypes.int64) + + @classmethod + def execute(cls, ctx, op): + xp = array_module(op.gpu) + if xp is np: + device_id = -1 + else: + device_id = op.device or 0 + + with device(device_id): + rs = xp.random.RandomState(op.seed) + + args = [] + for k in op.args: + val = getattr(op, k, None) + if isinstance(val, TENSOR_CHUNK_TYPE): + args.append(ctx[val.key]) + else: + args.append(val) + + method_name = getattr(cls, "_func_name") + try: + res = getattr(rs, method_name)(*args) + if xp is not np: + ctx[op.outputs[0].key] = xp.asarray(res) + else: + ctx[op.outputs[0].key] = res + except AttributeError: + if xp is not np: + # cupy cannot generate, fall back to numpy + rs = np.random.RandomState(op.seed) + res = getattr(rs, method_name)(*args) + ctx[op.outputs[0].key] = xp.asarray(res) + else: + raise + + +class TensorSimpleRandomData(TensorRandomOperand): + size = TupleField("size", FieldTypes.int64) + + def __init__(self, size=None, **kw): + if type(size) is int: + size = (size,) + super().__init__(size=size, **kw) diff --git a/python/xorbits/_mars/tensor/random/dirichlet.py b/python/xorbits/_mars/tensor/random/dirichlet.py new file mode 100644 index 000000000..60f532f97 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/dirichlet.py @@ -0,0 +1,152 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections.abc import Iterable + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...config import options +from ...serialization.serializables import TupleField +from ..utils import decide_chunk_sizes, gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin + + +class TensorDirichlet(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_DIRICHLET + + _fields_ = "alpha", "size" + alpha = TupleField("alpha", default=None) + _func_name = "dirichlet" + + def _calc_shape(self, shapes): + shape = super()._calc_shape(shapes) + return shape + (len(self.alpha),) + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + nsplits = decide_chunk_sizes( + tensor.shape[:-1], chunk_size, tensor.dtype.itemsize + ) + nsplits += ((len(op.alpha),),) + + idxes = list(itertools.product(*[range(len(s)) for s in nsplits])) + seeds = gen_random_seeds(len(idxes), np.random.RandomState(op.seed)) + + out_chunks = [] + for seed, idx, shape in zip(seeds, idxes, itertools.product(*nsplits)): + inputs = [inp.cix[idx] for inp in op.inputs] + size = shape[:-1] + + chunk_op = op.copy().reset_key() + chunk_op._state = None + chunk_op.seed = seed + chunk_op.size = size + out_chunk = chunk_op.new_chunk(inputs, shape=shape, index=idx) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, tensor.shape, chunks=out_chunks, nsplits=nsplits + ) + + +def dirichlet(random_state, alpha, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from the Dirichlet distribution. + + Draw `size` samples of dimension k from a Dirichlet distribution. A + Dirichlet-distributed random variable can be seen as a multivariate + generalization of a Beta distribution. Dirichlet pdf is the conjugate + prior of a multinomial in Bayesian inference. + + Parameters + ---------- + alpha : array + Parameter of the distribution (k dimension for sample of + dimension k). + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + samples : Tensor + The drawn samples, of shape (size, alpha.ndim). + + Raises + ------- + ValueError + If any value in alpha is less than or equal to zero + + Notes + ----- + .. math:: X \approx \prod_{i=1}^{k}{x^{\alpha_i-1}_i} + + Uses the following property for computation: for each dimension, + draw a random sample y_i from a standard gamma generator of shape + `alpha_i`, then + :math:`X = \frac{1}{\sum_{i=1}^k{y_i}} (y_1, \ldots, y_n)` is + Dirichlet distributed. + + References + ---------- + .. [1] David McKay, "Information Theory, Inference and Learning + Algorithms," chapter 23, + http://www.inference.phy.cam.ac.uk/mackay/ + .. 
[2] Wikipedia, "Dirichlet distribution", + http://en.wikipedia.org/wiki/Dirichlet_distribution + + Examples + -------- + Taking an example cited in Wikipedia, this distribution can be used if + one wanted to cut strings (each of initial length 1.0) into K pieces + with different lengths, where each piece had, on average, a designated + average length, but allowing some variation in the relative sizes of + the pieces. + + >>> import mars.tensor as mt + + >>> s = mt.random.dirichlet((10, 5, 3), 20).transpose() + + >>> import matplotlib.pyplot as plt + + >>> plt.barh(range(20), s[0].execute()) + >>> plt.barh(range(20), s[1].execute(), left=s[0].execute(), color='g') + >>> plt.barh(range(20), s[2].execute(), left=(s[0]+s[1]).execute(), color='r') + >>> plt.title("Lengths of Strings") + """ + if isinstance(alpha, Iterable): + alpha = tuple(alpha) + else: + raise TypeError("`alpha` should be an array") + if dtype is None: + dtype = np.random.RandomState().dirichlet(alpha, size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorDirichlet(seed=seed, alpha=alpha, size=size, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/exponential.py b/python/xorbits/_mars/tensor/random/exponential.py new file mode 100644 index 000000000..97de3eed4 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/exponential.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorExponential(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["scale"] + _op_type_ = OperandDef.RAND_EXPONENTIAL + + _fields_ = "scale", "size" + scale = AnyField("scale") + _func_name = "exponential" + + def __call__(self, scale, chunk_size=None): + return self.new_tensor([scale], self.size, raw_chunk_size=chunk_size) + + +def exponential( + random_state, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from an exponential distribution. + + Its probability density function is + + .. math:: f(x; \frac{1}{\beta}) = \frac{1}{\beta} \exp(-\frac{x}{\beta}), + + for ``x > 0`` and 0 elsewhere. :math:`\beta` is the scale parameter, + which is the inverse of the rate parameter :math:`\lambda = 1/\beta`. + The rate parameter is an alternative, widely used parameterization + of the exponential distribution [3]_. + + The exponential distribution is a continuous analogue of the + geometric distribution. It describes many common situations, such as + the size of raindrops measured over many rainstorms [1]_, or the time + between page requests to Wikipedia [2]_. + + Parameters + ---------- + scale : float or array_like of floats + The scale parameter, :math:`\beta = 1/\lambda`. 
+ size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``scale`` is a scalar. Otherwise, + ``np.array(scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized exponential distribution. + + References + ---------- + .. [1] Peyton Z. Peebles Jr., "Probability, Random Variables and + Random Signal Principles", 4th ed, 2001, p. 57. + .. [2] Wikipedia, "Poisson process", + http://en.wikipedia.org/wiki/Poisson_process + .. [3] Wikipedia, "Exponential distribution", + http://en.wikipedia.org/wiki/Exponential_distribution + """ + if dtype is None: + dtype = ( + np.random.RandomState().exponential(handle_array(scale), size=(0,)).dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorExponential(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/f.py b/python/xorbits/_mars/tensor/random/f.py new file mode 100644 index 000000000..b644a7af7 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/f.py @@ -0,0 +1,133 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorF(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["dfnum", "dfden"] + _op_type_ = OperandDef.RAND_F + + _fields_ = "dfnum", "dfden", "size" + dfnum = AnyField("dfnum") + dfden = AnyField("dfden") + _func_name = "f" + + def __call__(self, dfnum, dfden, chunk_size=None): + return self.new_tensor([dfnum, dfden], None, raw_chunk_size=chunk_size) + + +def f(random_state, dfnum, dfden, size=None, chunk_size=None, gpu=None, dtype=None): + """ + Draw samples from an F distribution. + + Samples are drawn from an F distribution with specified parameters, + `dfnum` (degrees of freedom in numerator) and `dfden` (degrees of + freedom in denominator), where both parameters should be greater than + zero. + + The random variate of the F distribution (also known as the + Fisher distribution) is a continuous probability distribution + that arises in ANOVA tests, and is the ratio of two chi-square + variates. + + Parameters + ---------- + dfnum : float or array_like of floats + Degrees of freedom in numerator, should be > 0. + dfden : float or array_like of float + Degrees of freedom in denominator, should be > 0. + size : int or tuple of ints, optional + Output shape. 
If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``dfnum`` and ``dfden`` are both scalars. + Otherwise, ``np.broadcast(dfnum, dfden).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Fisher distribution. + + See Also + -------- + scipy.stats.f : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The F statistic is used to compare in-group variances to between-group + variances. Calculating the distribution depends on the sampling, and + so it is a function of the respective degrees of freedom in the + problem. The variable `dfnum` is the number of samples minus one, the + between-groups degrees of freedom, while `dfden` is the within-groups + degrees of freedom, the sum of the number of samples in each group + minus the number of groups. + + References + ---------- + .. [1] Glantz, Stanton A. "Primer of Biostatistics.", McGraw-Hill, + Fifth Edition, 2002. + .. [2] Wikipedia, "F-distribution", + http://en.wikipedia.org/wiki/F-distribution + + Examples + -------- + An example from Glantz[1], pp 47-40: + + Two groups, children of diabetics (25 people) and children from people + without diabetes (25 controls). Fasting blood glucose was measured, + case group had a mean value of 86.1, controls had a mean value of + 82.2. Standard deviations were 2.09 and 2.49 respectively. Are these + data consistent with the null hypothesis that the parents diabetic + status does not affect their children's blood glucose levels? + Calculating the F statistic from the data gives a value of 36.01. + + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> dfnum = 1. # between group degrees of freedom + >>> dfden = 48. # within groups degrees of freedom + >>> s = mt.random.f(dfnum, dfden, 1000).execute() + + The lower bound for the top 1% of the samples is : + + >>> sorted(s)[-10] + 7.61988120985 + + So there is about a 1% chance that the F statistic will exceed 7.62, + the measured value is 36, so the null hypothesis is rejected at the 1% + level. + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .f(handle_array(dfnum), handle_array(dfden), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorF(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(dfnum, dfden, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/gamma.py b/python/xorbits/_mars/tensor/random/gamma.py new file mode 100644 index 000000000..490e82f74 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/gamma.py @@ -0,0 +1,126 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorRandGamma(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["shape", "scale"] + _op_type_ = OperandDef.RAND_GAMMA + + _fields_ = "shape", "scale", "size" + shape = AnyField("shape") + scale = AnyField("scale") + _func_name = "gamma" + + def __call__(self, shape, scale, chunk_size=None): + return self.new_tensor([shape, scale], None, raw_chunk_size=chunk_size) + + +def gamma( + random_state, shape, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a Gamma distribution. + + Samples are drawn from a Gamma distribution with specified parameters, + `shape` (sometimes designated "k") and `scale` (sometimes designated + "theta"), where both parameters are > 0. + + Parameters + ---------- + shape : float or array_like of floats + The shape of the gamma distribution. Should be greater than zero. + scale : float or array_like of floats, optional + The scale of the gamma distribution. Should be greater than zero. + Default is equal to 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``shape`` and ``scale`` are both scalars. + Otherwise, ``np.broadcast(shape, scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized gamma distribution. + + See Also + -------- + scipy.stats.gamma : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Gamma distribution is + + .. math:: p(x) = x^{k-1}\frac{e^{-x/\theta}}{\theta^k\Gamma(k)}, + + where :math:`k` is the shape and :math:`\theta` the scale, + and :math:`\Gamma` is the Gamma function. + + The Gamma distribution is often used to model the times to failure of + electronic components, and arises naturally in processes for which the + waiting times between Poisson distributed events are relevant. + + References + ---------- + .. [1] Weisstein, Eric W. "Gamma Distribution." From MathWorld--A + Wolfram Web Resource. + http://mathworld.wolfram.com/GammaDistribution.html + .. [2] Wikipedia, "Gamma distribution", + http://en.wikipedia.org/wiki/Gamma_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> shape, scale = 2., 2. # mean=4, std=2*sqrt(2) + >>> s = mt.random.gamma(shape, scale, 1000).execute() + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> import scipy.special as sps + >>> import numpy as np + >>> count, bins, ignored = plt.hist(s, 50, normed=True) + >>> y = bins**(shape-1)*(np.exp(-bins/scale) / + ... 
(sps.gamma(shape)*scale**shape)) + >>> plt.plot(bins, y, linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .gamma(handle_array(shape), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandGamma(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(shape, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/geometric.py b/python/xorbits/_mars/tensor/random/geometric.py new file mode 100644 index 000000000..3afcae356 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/geometric.py @@ -0,0 +1,91 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorGeometric(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["p"] + _op_type_ = OperandDef.RAND_GEOMETRIC + + _fields_ = "p", "size" + p = AnyField("p") + _func_name = "geometric" + + def __call__(self, p, chunk_size=None): + return self.new_tensor([p], None, raw_chunk_size=chunk_size) + + +def geometric(random_state, p, size=None, chunk_size=None, gpu=None, dtype=None): + """ + Draw samples from the geometric distribution. + + Bernoulli trials are experiments with one of two outcomes: + success or failure (an example of such an experiment is flipping + a coin). The geometric distribution models the number of trials + that must be run in order to achieve success. It is therefore + supported on the positive integers, ``k = 1, 2, ...``. + + The probability mass function of the geometric distribution is + + .. math:: f(k) = (1 - p)^{k - 1} p + + where `p` is the probability of success of an individual trial. + + Parameters + ---------- + p : float or array_like of floats + The probability of success of an individual trial. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``p`` is a scalar. Otherwise, + ``mt.array(p).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized geometric distribution. + + Examples + -------- + Draw ten thousand values from the geometric distribution, + with the probability of an individual success equal to 0.35: + + >>> import mars.tensor as mt + + >>> z = mt.random.geometric(p=0.35, size=10000) + + How many trials succeeded after a single run? 
+ + >>> ((z == 1).sum() / 10000.).execute() + 0.34889999999999999 #random + """ + if dtype is None: + dtype = np.random.RandomState().geometric(handle_array(p), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorGeometric(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(p, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/gumbel.py b/python/xorbits/_mars/tensor/random/gumbel.py new file mode 100644 index 000000000..d5b538117 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/gumbel.py @@ -0,0 +1,165 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorGumbel(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["loc", "scale"] + _op_type_ = OperandDef.RAND_GUMBEL + + _fields_ = "loc", "scale", "size" + loc = AnyField("loc") + scale = AnyField("scale") + _func_name = "gumbel" + + def __call__(self, loc, scale, chunk_size=None): + return self.new_tensor([loc, scale], None, raw_chunk_size=chunk_size) + + +def gumbel( + random_state, loc=0.0, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a Gumbel distribution. + + Draw samples from a Gumbel distribution with specified location and + scale. For more information on the Gumbel distribution, see + Notes and References below. + + Parameters + ---------- + loc : float or array_like of floats, optional + The location of the mode of the distribution. Default is 0. + scale : float or array_like of floats, optional + The scale parameter of the distribution. Default is 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``loc`` and ``scale`` are both scalars. + Otherwise, ``np.broadcast(loc, scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Gumbel distribution. + + See Also + -------- + scipy.stats.gumbel_l + scipy.stats.gumbel_r + scipy.stats.genextreme + weibull + + Notes + ----- + The Gumbel (or Smallest Extreme Value (SEV) or the Smallest Extreme + Value Type I) distribution is one of a class of Generalized Extreme + Value (GEV) distributions used in modeling extreme value problems. + The Gumbel is a special case of the Extreme Value Type I distribution + for maximums from distributions with "exponential-like" tails. 
+ + The probability density for the Gumbel distribution is + + .. math:: p(x) = \frac{e^{-(x - \mu)/ \beta}}{\beta} e^{ -e^{-(x - \mu)/ + \beta}}, + + where :math:`\mu` is the mode, a location parameter, and + :math:`\beta` is the scale parameter. + + The Gumbel (named for German mathematician Emil Julius Gumbel) was used + very early in the hydrology literature, for modeling the occurrence of + flood events. It is also used for modeling maximum wind speed and + rainfall rates. It is a "fat-tailed" distribution - the probability of + an event in the tail of the distribution is larger than if one used a + Gaussian, hence the surprisingly frequent occurrence of 100-year + floods. Floods were initially modeled as a Gaussian process, which + underestimated the frequency of extreme events. + + It is one of a class of extreme value distributions, the Generalized + Extreme Value (GEV) distributions, which also includes the Weibull and + Frechet. + + The function has a mean of :math:`\mu + 0.57721\beta` and a variance + of :math:`\frac{\pi^2}{6}\beta^2`. + + References + ---------- + .. [1] Gumbel, E. J., "Statistics of Extremes," + New York: Columbia University Press, 1958. + .. [2] Reiss, R.-D. and Thomas, M., "Statistical Analysis of Extreme + Values from Insurance, Finance, Hydrology and Other Fields," + Basel: Birkhauser Verlag, 2001. + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> mu, beta = 0, 0.1 # location and scale + >>> s = mt.random.gumbel(mu, beta, 1000).execute() + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> import numpy as np + >>> count, bins, ignored = plt.hist(s, 30, normed=True) + >>> plt.plot(bins, (1/beta)*np.exp(-(bins - mu)/beta) + ... * np.exp( -np.exp( -(bins - mu) /beta) ), + ... linewidth=2, color='r') + >>> plt.show() + + Show how an extreme value distribution can arise from a Gaussian process + and compare to a Gaussian: + + >>> means = [] + >>> maxima = [] + >>> for i in range(0,1000) : + ... a = mt.random.normal(mu, beta, 1000) + ... means.append(a.mean().execute()) + ... maxima.append(a.max().execute()) + >>> count, bins, ignored = plt.hist(maxima, 30, normed=True) + >>> beta = mt.std(maxima) * mt.sqrt(6) / mt.pi + >>> mu = mt.mean(maxima) - 0.57721*beta + >>> plt.plot(bins, ((1/beta)*mt.exp(-(bins - mu)/beta) + ... * mt.exp(-mt.exp(-(bins - mu)/beta))).execute(), + ... linewidth=2, color='r') + >>> plt.plot(bins, (1/(beta * mt.sqrt(2 * mt.pi)) + ... * mt.exp(-(bins - mu)**2 / (2 * beta**2))).execute(), + ... linewidth=2, color='g') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .gumbel(handle_array(loc), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorGumbel(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(loc, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/hypergeometric.py b/python/xorbits/_mars/tensor/random/hypergeometric.py new file mode 100644 index 000000000..378ddffb2 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/hypergeometric.py @@ -0,0 +1,146 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorHypergeometric(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["ngood", "nbad", "nsample"] + _op_type_ = OperandDef.RAND_HYPERGEOMETRIC + + _fields_ = "ngood", "nbad", "nsample", "size" + ngood = AnyField("ngood") + nbad = AnyField("nbad") + nsample = AnyField("nsample") + _func_name = "hypergeometric" + + def __call__(self, ngood, nbad, nsample, chunk_size=None): + return self.new_tensor([ngood, nbad, nsample], None, raw_chunk_size=chunk_size) + + +def hypergeometric( + random_state, ngood, nbad, nsample, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a Hypergeometric distribution. + + Samples are drawn from a hypergeometric distribution with specified + parameters, ngood (ways to make a good selection), nbad (ways to make + a bad selection), and nsample = number of items sampled, which is less + than or equal to the sum ngood + nbad. + + Parameters + ---------- + ngood : int or array_like of ints + Number of ways to make a good selection. Must be nonnegative. + nbad : int or array_like of ints + Number of ways to make a bad selection. Must be nonnegative. + nsample : int or array_like of ints + Number of items sampled. Must be at least 1 and at most + ``ngood + nbad``. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``ngood``, ``nbad``, and ``nsample`` + are all scalars. Otherwise, ``np.broadcast(ngood, nbad, nsample).size`` + samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized hypergeometric distribution. + + See Also + -------- + scipy.stats.hypergeom : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Hypergeometric distribution is + + .. math:: P(x) = \frac{\binom{m}{n}\binom{N-m}{n-x}}{\binom{N}{n}}, + + where :math:`0 \le x \le m` and :math:`n+m-N \le x \le n` + + for P(x) the probability of x successes, n = ngood, m = nbad, and + N = number of samples. + + Consider an urn with black and white marbles in it, ngood of them + black and nbad are white. If you draw nsample balls without + replacement, then the hypergeometric distribution describes the + distribution of black balls in the drawn sample. + + Note that this distribution is very similar to the binomial + distribution, except that in this case, samples are drawn without + replacement, whereas in the Binomial case samples are drawn with + replacement (or the sample space is infinite). 
As the sample space + becomes large, this distribution approaches the binomial. + + References + ---------- + .. [1] Lentner, Marvin, "Elementary Applied Statistics", Bogden + and Quigley, 1972. + .. [2] Weisstein, Eric W. "Hypergeometric Distribution." From + MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/HypergeometricDistribution.html + .. [3] Wikipedia, "Hypergeometric distribution", + http://en.wikipedia.org/wiki/Hypergeometric_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> ngood, nbad, nsamp = 100, 2, 10 + # number of good, number of bad, and number of samples + >>> s = mt.random.hypergeometric(ngood, nbad, nsamp, 1000) + >>> hist(s) + # note that it is very unlikely to grab both bad items + + Suppose you have an urn with 15 white and 15 black marbles. + If you pull 15 marbles at random, how likely is it that + 12 or more of them are one color? + + >>> s = mt.random.hypergeometric(15, 15, 15, 100000) + >>> (mt.sum(s>=12)/100000. + mt.sum(s<=3)/100000.).execute() + # answer = 0.003 ... pretty unlikely! + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .hypergeometric( + handle_array(ngood), + handle_array(nbad), + handle_array(nsample), + size=(0,), + ) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorHypergeometric(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(ngood, nbad, nsample, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/laplace.py b/python/xorbits/_mars/tensor/random/laplace.py new file mode 100644 index 000000000..1290264da --- /dev/null +++ b/python/xorbits/_mars/tensor/random/laplace.py @@ -0,0 +1,131 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorLaplace(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["loc", "scale"] + _op_type_ = OperandDef.RAND_LAPLACE + + _fields_ = "loc", "scale", "size" + loc = AnyField("loc") + scale = AnyField("scale") + _func_name = "laplace" + + def __call__(self, loc, scale, chunk_size=None): + return self.new_tensor([loc, scale], None, raw_chunk_size=chunk_size) + + +def laplace( + random_state, loc=0.0, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from the Laplace or double exponential distribution with + specified location (or mean) and scale (decay). + + The Laplace distribution is similar to the Gaussian/normal distribution, + but is sharper at the peak and has fatter tails. It represents the + difference between two independent, identically distributed exponential + random variables. 
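+
+    As a quick illustrative check of that property (a sketch only; the sample
+    size and the 0.05 tolerance below are arbitrary), the difference of two
+    independent exponential samples is centred on zero, as a Laplace variable
+    should be:
+
+    >>> import mars.tensor as mt
+
+    >>> d = mt.random.exponential(1., 100000) - mt.random.exponential(1., 100000)
+    >>> bool(abs(d.mean().execute()) < 0.05)  # tolerance chosen loosely
+    True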
+ + Parameters + ---------- + loc : float or array_like of floats, optional + The position, :math:`\mu`, of the distribution peak. Default is 0. + scale : float or array_like of floats, optional + :math:`\lambda`, the exponential decay. Default is 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``loc`` and ``scale`` are both scalars. + Otherwise, ``np.broadcast(loc, scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Laplace distribution. + + Notes + ----- + It has the probability density function + + .. math:: f(x; \mu, \lambda) = \frac{1}{2\lambda} + \exp\left(-\frac{|x - \mu|}{\lambda}\right). + + The first law of Laplace, from 1774, states that the frequency + of an error can be expressed as an exponential function of the + absolute magnitude of the error, which leads to the Laplace + distribution. For many problems in economics and health + sciences, this distribution seems to model the data better + than the standard Gaussian distribution. + + References + ---------- + .. [1] Abramowitz, M. and Stegun, I. A. (Eds.). "Handbook of + Mathematical Functions with Formulas, Graphs, and Mathematical + Tables, 9th printing," New York: Dover, 1972. + .. [2] Kotz, Samuel, et. al. "The Laplace Distribution and + Generalizations, " Birkhauser, 2001. + .. [3] Weisstein, Eric W. "Laplace Distribution." + From MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/LaplaceDistribution.html + .. [4] Wikipedia, "Laplace distribution", + http://en.wikipedia.org/wiki/Laplace_distribution + + Examples + -------- + Draw samples from the distribution + + >>> import mars.tensor as mt + + >>> loc, scale = 0., 1. + >>> s = mt.random.laplace(loc, scale, 1000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), 30, normed=True) + >>> x = mt.arange(-8., 8., .01) + >>> pdf = mt.exp(-abs(x-loc)/scale)/(2.*scale) + >>> plt.plot(x.execute(), pdf.execute()) + + Plot Gaussian for comparison: + + >>> g = (1/(scale * mt.sqrt(2 * np.pi)) * + ... mt.exp(-(x - loc)**2 / (2 * scale**2))) + >>> plt.plot(x.execute(),g.execute()) + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .laplace(handle_array(loc), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorLaplace(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(loc, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/logistic.py b/python/xorbits/_mars/tensor/random/logistic.py new file mode 100644 index 000000000..1184fce3f --- /dev/null +++ b/python/xorbits/_mars/tensor/random/logistic.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorLogistic(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["loc", "scale"] + _op_type_ = OperandDef.RAND_LOGISTIC + + _fields_ = "loc", "scale", "size" + loc = AnyField("loc") + scale = AnyField("scale") + _func_name = "logistic" + + def __call__(self, loc, scale, chunk_size=None): + return self.new_tensor([loc, scale], None, raw_chunk_size=chunk_size) + + +def logistic( + random_state, loc=0.0, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a logistic distribution. + + Samples are drawn from a logistic distribution with specified + parameters, loc (location or mean, also median), and scale (>0). + + Parameters + ---------- + loc : float or array_like of floats, optional + Parameter of the distribution. Default is 0. + scale : float or array_like of floats, optional + Parameter of the distribution. Should be greater than zero. + Default is 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``loc`` and ``scale`` are both scalars. + Otherwise, ``np.broadcast(loc, scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized logistic distribution. + + See Also + -------- + scipy.stats.logistic : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Logistic distribution is + + .. math:: P(x) = \frac{e^{-(x-\mu)/s}}{s(1+e^{-(x-\mu)/s})^2}, + + where :math:`\mu` = location and :math:`s` = scale. + + The Logistic distribution is used in Extreme Value problems where it + can act as a mixture of Gumbel distributions, in Epidemiology, and by + the World Chess Federation (FIDE) where it is used in the Elo ranking + system, assuming the performance of each player is a logistically + distributed random variable. + + References + ---------- + .. [1] Reiss, R.-D. and Thomas M. (2001), "Statistical Analysis of + Extreme Values, from Insurance, Finance, Hydrology and Other + Fields," Birkhauser Verlag, Basel, pp 132-133. + .. [2] Weisstein, Eric W. "Logistic Distribution." From + MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/LogisticDistribution.html + ..
[3] Wikipedia, "Logistic-distribution", + http://en.wikipedia.org/wiki/Logistic_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + >>> import matplotlib.pyplot as plt + + >>> loc, scale = 10, 1 + >>> s = mt.random.logistic(loc, scale, 10000) + >>> count, bins, ignored = plt.hist(s.execute(), bins=50) + + # plot against distribution + + >>> def logist(x, loc, scale): + ... return mt.exp((loc-x)/scale)/(scale*(1+mt.exp((loc-x)/scale))**2) + >>> plt.plot(bins, logist(bins, loc, scale).execute()*count.max()/\ + ... logist(bins, loc, scale).max().execute()) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .logistic(handle_array(loc), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorLogistic(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(loc, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/lognormal.py b/python/xorbits/_mars/tensor/random/lognormal.py new file mode 100644 index 000000000..c0788d299 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/lognormal.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorLognormal(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["mean", "sigma"] + _op_type_ = OperandDef.RAND_LOGNORMAL + + _fields_ = "mean", "sigma", "size" + mean = AnyField("mean") + sigma = AnyField("sigma") + _func_name = "lognormal" + + def __call__(self, mean, sigma, chunk_size=None): + return self.new_tensor([mean, sigma], None, raw_chunk_size=chunk_size) + + +def lognormal( + random_state, mean=0.0, sigma=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a log-normal distribution. + + Draw samples from a log-normal distribution with specified mean, + standard deviation, and array shape. Note that the mean and standard + deviation are not the values for the distribution itself, but of the + underlying normal distribution it is derived from. + + Parameters + ---------- + mean : float or array_like of floats, optional + Mean value of the underlying normal distribution. Default is 0. + sigma : float or array_like of floats, optional + Standard deviation of the underlying normal distribution. Should + be greater than zero. Default is 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``mean`` and ``sigma`` are both scalars. + Otherwise, ``np.broadcast(mean, sigma).size`` samples are drawn. 
+ chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized log-normal distribution. + + See Also + -------- + scipy.stats.lognorm : probability density function, distribution, + cumulative density function, etc. + + Notes + ----- + A variable `x` has a log-normal distribution if `log(x)` is normally + distributed. The probability density function for the log-normal + distribution is: + + .. math:: p(x) = \frac{1}{\sigma x \sqrt{2\pi}} + e^{(-\frac{(ln(x)-\mu)^2}{2\sigma^2})} + + where :math:`\mu` is the mean and :math:`\sigma` is the standard + deviation of the normally distributed logarithm of the variable. + A log-normal distribution results if a random variable is the *product* + of a large number of independent, identically-distributed variables in + the same way that a normal distribution results if the variable is the + *sum* of a large number of independent, identically-distributed + variables. + + References + ---------- + .. [1] Limpert, E., Stahel, W. A., and Abbt, M., "Log-normal + Distributions across the Sciences: Keys and Clues," + BioScience, Vol. 51, No. 5, May, 2001. + http://stat.ethz.ch/~stahel/lognormal/bioscience.pdf + .. [2] Reiss, R.D. and Thomas, M., "Statistical Analysis of Extreme + Values," Basel: Birkhauser Verlag, 2001, pp. 31-32. + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> mu, sigma = 3., 1. # mean and standard deviation + >>> s = mt.random.lognormal(mu, sigma, 1000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), 100, normed=True, align='mid') + + >>> x = mt.linspace(min(bins), max(bins), 10000) + >>> pdf = (mt.exp(-(mt.log(x) - mu)**2 / (2 * sigma**2)) + ... / (x * sigma * mt.sqrt(2 * mt.pi))) + + >>> plt.plot(x.execute(), pdf.execute(), linewidth=2, color='r') + >>> plt.axis('tight') + >>> plt.show() + + Demonstrate that taking the products of random samples from a uniform + distribution can be fit well by a log-normal probability density + function. + + >>> # Generate a thousand samples: each is the product of 100 random + >>> # values, drawn from a normal distribution. + >>> b = [] + >>> for i in range(1000): + ... a = 10. + mt.random.random(100) + ... b.append(mt.product(a).execute()) + + >>> b = mt.array(b) / mt.min(b) # scale values to be positive + >>> count, bins, ignored = plt.hist(b.execute(), 100, normed=True, align='mid') + >>> sigma = mt.std(mt.log(b)) + >>> mu = mt.mean(mt.log(b)) + + >>> x = mt.linspace(min(bins), max(bins), 10000) + >>> pdf = (mt.exp(-(mt.log(x) - mu)**2 / (2 * sigma**2)) + ... 
/ (x * sigma * mt.sqrt(2 * mt.pi))) + + >>> plt.plot(x.execute(), pdf.execute(), color='r', linewidth=2) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .lognormal(handle_array(mean), handle_array(sigma), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorLognormal(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(mean, sigma, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/logseries.py b/python/xorbits/_mars/tensor/random/logseries.py new file mode 100644 index 000000000..2f7444ab8 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/logseries.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorLogseries(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["p"] + _op_type_ = OperandDef.RAND_LOGSERIES + + _fields_ = "p", "size" + p = AnyField("p") + _func_name = "logseries" + + def __call__(self, p, chunk_size=None): + return self.new_tensor([p], None, raw_chunk_size=chunk_size) + + +def logseries(random_state, p, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a logarithmic series distribution. + + Samples are drawn from a log series distribution with specified + shape parameter, 0 < ``p`` < 1. + + Parameters + ---------- + p : float or array_like of floats + Shape parameter for the distribution. Must be in the range (0, 1). + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``p`` is a scalar. Otherwise, + ``np.array(p).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized logarithmic series distribution. + + See Also + -------- + scipy.stats.logser : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Log Series distribution is + + .. math:: P(k) = \frac{-p^k}{k \ln(1-p)}, + + where p = probability. + + The log series distribution is frequently used to represent species + richness and occurrence, first proposed by Fisher, Corbet, and + Williams in 1943 [2]. It may also be used to model the numbers of + occupants seen in cars [3]. + + References + ---------- + .. 
[1] Buzas, Martin A.; Culver, Stephen J., Understanding regional + species diversity through the log series distribution of + occurrences: BIODIVERSITY RESEARCH Diversity & Distributions, + Volume 5, Number 5, September 1999 , pp. 187-195(9). + .. [2] Fisher, R.A,, A.S. Corbet, and C.B. Williams. 1943. The + relation between the number of species and the number of + individuals in a random sample of an animal population. + Journal of Animal Ecology, 12:42-58. + .. [3] D. J. Hand, F. Daly, D. Lunn, E. Ostrowski, A Handbook of Small + Data Sets, CRC Press, 1994. + .. [4] Wikipedia, "Logarithmic distribution", + http://en.wikipedia.org/wiki/Logarithmic_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + >>> import matplotlib.pyplot as plt + + >>> a = .6 + >>> s = mt.random.logseries(a, 10000) + >>> count, bins, ignored = plt.hist(s.execute()) + + # plot against distribution + + >>> def logseries(k, p): + ... return -p**k/(k*mt.log(1-p)) + >>> plt.plot(bins, (logseries(bins, a)*count.max()/ + ... logseries(bins, a).max()).execute(), 'r') + >>> plt.show() + """ + if dtype is None: + dtype = np.random.RandomState().logseries(handle_array(p), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorLogseries(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(p, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/multinomial.py b/python/xorbits/_mars/tensor/random/multinomial.py new file mode 100644 index 000000000..d0c488053 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/multinomial.py @@ -0,0 +1,131 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, Int64Field, TupleField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin + + +class TensorMultinomial(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_MULTINOMIAL + + _fields_ = "n", "pvals", "size" + n = Int64Field("n") + pvals = TupleField("pvals", FieldTypes.float64) + _func_name = "multinomial" + + def __call__(self, chunk_size=None): + if self.size is None: + shape = (len(self.pvals),) + else: + try: + shape = tuple(self.size) + (len(self.pvals),) + except TypeError: + shape = (self.size, len(self.pvals)) + return self.new_tensor(None, shape, raw_chunk_size=chunk_size) + + +def multinomial( + random_state, n, pvals, size=None, chunk_size=None, gpu=None, dtype=None +): + """ + Draw samples from a multinomial distribution. + + The multinomial distribution is a multivariate generalisation of the + binomial distribution. Take an experiment with one of ``p`` + possible outcomes. An example of such an experiment is throwing a dice, + where the outcome can be 1 through 6. Each sample drawn from the + distribution represents `n` such experiments. 
Its values, + ``X_i = [X_0, X_1, ..., X_p]``, represent the number of times the + outcome was ``i``. + + Parameters + ---------- + n : int + Number of experiments. + pvals : sequence of floats, length p + Probabilities of each of the ``p`` different outcomes. These + should sum to 1 (however, the last element is always assumed to + account for the remaining probability, as long as + ``sum(pvals[:-1]) <= 1)``. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor + The drawn samples, of shape *size*, if that was provided. If not, + the shape is ``(N,)``. + + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional + value drawn from the distribution. + + Examples + -------- + Throw a dice 20 times: + + >>> import mars.tensor as mt + + >>> mt.random.multinomial(20, [1/6.]*6, size=1).execute() + array([[4, 1, 7, 5, 2, 1]]) + + It landed 4 times on 1, once on 2, etc. + + Now, throw the dice 20 times, and 20 times again: + + >>> mt.random.multinomial(20, [1/6.]*6, size=2).execute() + array([[3, 4, 3, 3, 4, 3], + [2, 4, 3, 4, 0, 7]]) + + For the first run, we threw 3 times 1, 4 times 2, etc. For the second, + we threw 2 times 1, 4 times 2, etc. + + A loaded die is more likely to land on number 6: + + >>> mt.random.multinomial(100, [1/7.]*5 + [2/7.]).execute() + array([11, 16, 14, 17, 16, 26]) + + The probability inputs should be normalized. As an implementation + detail, the value of the last entry is ignored and assumed to take + up any leftover probability mass, but this should not be relied on. + A biased coin which has twice as much weight on one side as on the + other should be sampled like so: + + >>> mt.random.multinomial(100, [1.0 / 3, 2.0 / 3]).execute() # RIGHT + array([38, 62]) + + not like: + + >>> mt.random.multinomial(100, [1.0, 2.0]).execute() # WRONG + array([100, 0]) + """ + n = int(n) + pvals = tuple(pvals) + if dtype is None: + dtype = np.random.RandomState().multinomial(n, pvals, size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorMultinomial(n=n, pvals=pvals, seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/multivariate_normal.py b/python/xorbits/_mars/tensor/random/multivariate_normal.py new file mode 100644 index 000000000..47a3ec5c3 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/multivariate_normal.py @@ -0,0 +1,266 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...config import options +from ...serialization.serializables import Float64Field, NDArrayField, StringField +from ..array_utils import array_module, device +from ..utils import decide_chunk_sizes, gen_random_seeds +from .core import TENSOR_CHUNK_TYPE, TensorDistribution, TensorRandomOperandMixin + + +class TensorMultivariateNormal(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_MULTIVARIATE_NORMAL + + _fields_ = "mean", "cov", "size", "check_valid", "tol" + mean = NDArrayField("mean") + cov = NDArrayField("cov") + check_valid = StringField("check_valid") + tol = Float64Field("tol") + _func_name = "multivariate_normal" + + def __call__(self, chunk_size=None): + N = self.mean.size + if self.size is None: + shape = (N,) + else: + try: + shape = tuple(self.size) + (N,) + except TypeError: + shape = (self.size, N) + + return self.new_tensor(None, shape, raw_chunk_size=chunk_size) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + nsplits = decide_chunk_sizes( + tensor.shape[:-1], chunk_size, tensor.dtype.itemsize + ) + ((tensor.shape[-1],),) + + mean_chunk = op.mean.chunks[0] if hasattr(op.mean, "chunks") else op.mean + cov_chunk = op.cov.chunks[0] if hasattr(op.cov, "chunks") else op.cov + + idxes = list(itertools.product(*[range(len(s)) for s in nsplits])) + seeds = gen_random_seeds(len(idxes), np.random.RandomState(op.seed)) + + out_chunks = [] + for seed, out_idx, shape in zip(seeds, idxes, itertools.product(*nsplits)): + chunk_op = op.copy().reset_key() + chunk_op._state = None + chunk_op.seed = seed + chunk_op.size = shape[:-1] + out_chunk = chunk_op.new_chunk( + [mean_chunk, cov_chunk], shape=shape, index=out_idx + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, tensor.shape, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + xp = array_module(op.gpu) + if xp is np: + device_id = -1 + else: + device_id = op.device or 0 + + with device(device_id): + rs = xp.random.RandomState(op.seed) + + args = [] + for k in op.args: + val = getattr(op, k, None) + if isinstance(val, TENSOR_CHUNK_TYPE): + args.append(ctx[val.key]) + else: + args.append(val) + mean, cov = args[:2] + kw = {} + if args[2] is not None: + kw["size"] = args[2] + if args[3] is not None: + kw["check_valid"] = args[3] + if args[4] is not None: + kw["tol"] = args[4] + + try: + res = rs.multivariate_normal(mean, cov, **kw) + if xp is not np: + ctx[op.outputs[0].key] = xp.asarray(res) + else: + ctx[op.outputs[0].key] = res + except AttributeError: + if xp is not np: + # cupy cannot generate data, fallback to numpy first + rs = np.random.RandomState(op.seed) + res = rs.multivariate_normal(mean, cov, **kw) + ctx[op.outputs[0].key] = xp.asarray(res) + else: + raise + + +def multivariate_normal( + random_state, + mean, + cov, + size=None, + check_valid=None, + tol=None, + chunk_size=None, + gpu=None, + dtype=None, +): + """ + Draw random samples from a multivariate normal distribution. + + The multivariate normal, multinormal or Gaussian distribution is a + generalization of the one-dimensional normal distribution to higher + dimensions. Such a distribution is specified by its mean and + covariance matrix. These parameters are analogous to the mean + (average or "center") and variance (standard deviation, or "width," + squared) of the one-dimensional normal distribution. 
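+
+    As a minimal usage sketch (illustrative only: the drawn values are
+    random, and calling ``execute()`` assumes a Mars session is available):
+
+    >>> import mars.tensor as mt
+    >>> s = mt.random.multivariate_normal([0, 0], [[1, 0], [0, 1]], size=500)
+    >>> s.shape
+    (500, 2)
+    >>> m = s.mean(axis=0).execute()  # each component is close to 0 for large samples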
+ + Parameters + ---------- + mean : 1-D array_like, of length N + Mean of the N-dimensional distribution. + cov : 2-D array_like, of shape (N, N) + Covariance matrix of the distribution. It must be symmetric and + positive-semidefinite for proper sampling. + size : int or tuple of ints, optional + Given a shape of, for example, ``(m,n,k)``, ``m*n*k`` samples are + generated, and packed in an `m`-by-`n`-by-`k` arrangement. Because + each sample is `N`-dimensional, the output shape is ``(m,n,k,N)``. + If no shape is specified, a single (`N`-D) sample is returned. + check_valid : { 'warn', 'raise', 'ignore' }, optional + Behavior when the covariance matrix is not positive semidefinite. + tol : float, optional + Tolerance when checking the singular values in covariance matrix. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor + The drawn samples, of shape *size*, if that was provided. If not, + the shape is ``(N,)``. + + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional + value drawn from the distribution. + + Notes + ----- + The mean is a coordinate in N-dimensional space, which represents the + location where samples are most likely to be generated. This is + analogous to the peak of the bell curve for the one-dimensional or + univariate normal distribution. + + Covariance indicates the level to which two variables vary together. + From the multivariate normal distribution, we draw N-dimensional + samples, :math:`X = [x_1, x_2, ... x_N]`. The covariance matrix + element :math:`C_{ij}` is the covariance of :math:`x_i` and :math:`x_j`. + The element :math:`C_{ii}` is the variance of :math:`x_i` (i.e. its + "spread"). + + Instead of specifying the full covariance matrix, popular + approximations include: + + - Spherical covariance (`cov` is a multiple of the identity matrix) + - Diagonal covariance (`cov` has non-negative elements, and only on + the diagonal) + + This geometrical property can be seen in two dimensions by plotting + generated data-points: + + >>> mean = [0, 0] + >>> cov = [[1, 0], [0, 100]] # diagonal covariance + + Diagonal covariance means that points are oriented along x or y-axis: + + >>> import matplotlib.pyplot as plt + >>> import mars.tensor as mt + >>> x, y = mt.random.multivariate_normal(mean, cov, 5000).T + >>> plt.plot(x.execute(), y.execute(), 'x') + >>> plt.axis('equal') + >>> plt.show() + + Note that the covariance matrix must be positive semidefinite (a.k.a. + nonnegative-definite). Otherwise, the behavior of this method is + undefined and backwards compatibility is not guaranteed. + + References + ---------- + .. [1] Papoulis, A., "Probability, Random Variables, and Stochastic + Processes," 3rd ed., New York: McGraw-Hill, 1991. + .. [2] Duda, R. O., Hart, P. E., and Stork, D. G., "Pattern + Classification," 2nd ed., New York: Wiley, 2001. 
+
+    Examples
+    --------
+    >>> mean = (1, 2)
+    >>> cov = [[1, 0], [0, 1]]
+    >>> x = mt.random.multivariate_normal(mean, cov, (3, 3))
+    >>> x.shape
+    (3, 3, 2)
+
+    The following is probably true, given that 0.6 is roughly twice the
+    standard deviation:
+
+    >>> list(((x[0,0,:] - mean) < 0.6).execute())
+    [True, True]
+    """
+    mean = np.asarray(mean)
+    cov = np.asarray(cov)
+
+    if mean.ndim != 1:
+        raise ValueError("mean must be 1 dimensional")
+    if cov.ndim != 2:
+        raise ValueError("cov must be 2 dimensional")
+    if len(set(mean.shape + cov.shape)) != 1:
+        raise ValueError("mean and cov must have same length")
+
+    if dtype is None:
+        small_kw = {}
+        if check_valid:
+            small_kw["check_valid"] = check_valid
+        if tol:
+            small_kw["tol"] = tol
+        dtype = np.random.multivariate_normal(mean, cov, size=(0,), **small_kw).dtype
+
+    size = random_state._handle_size(size)
+    seed = gen_random_seeds(1, random_state.to_numpy())[0]
+    op = TensorMultivariateNormal(
+        mean=mean,
+        cov=cov,
+        size=size,
+        check_valid=check_valid,
+        tol=tol,
+        seed=seed,
+        gpu=gpu,
+        dtype=dtype,
+    )
+    return op(chunk_size=chunk_size)
diff --git a/python/xorbits/_mars/tensor/random/negative_binomial.py b/python/xorbits/_mars/tensor/random/negative_binomial.py
new file mode 100644
index 000000000..4d6a547cc
--- /dev/null
+++ b/python/xorbits/_mars/tensor/random/negative_binomial.py
@@ -0,0 +1,123 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ... import opcodes as OperandDef
+from ...serialization.serializables import AnyField
+from ..utils import gen_random_seeds
+from .core import TensorDistribution, TensorRandomOperandMixin, handle_array
+
+
+class TensorNegativeBinomial(TensorDistribution, TensorRandomOperandMixin):
+    _input_fields_ = ["n", "p"]
+    _op_type_ = OperandDef.RAND_NEGATIVE_BINOMIAL
+
+    _fields_ = "n", "p", "size"
+    n = AnyField("n")
+    p = AnyField("p")
+    _func_name = "negative_binomial"
+
+    def __call__(self, n, p, chunk_size=None):
+        return self.new_tensor([n, p], None, raw_chunk_size=chunk_size)
+
+
+def negative_binomial(
+    random_state, n, p, size=None, chunk_size=None, gpu=None, dtype=None
+):
+    r"""
+    Draw samples from a negative binomial distribution.
+
+    Samples are drawn from a negative binomial distribution with specified
+    parameters, `n` trials and `p` probability of success where `n` is an
+    integer > 0 and `p` is in the interval [0, 1].
+
+    Parameters
+    ----------
+    n : int or array_like of ints
+        Parameter of the distribution, > 0. Floats are also accepted,
+        but they will be truncated to integers.
+    p : float or array_like of floats
+        Parameter of the distribution, >= 0 and <=1.
+    size : int or tuple of ints, optional
+        Output shape. If the given shape is, e.g., ``(m, n, k)``, then
+        ``m * n * k`` samples are drawn. If size is ``None`` (default),
+        a single value is returned if ``n`` and ``p`` are both scalars.
+        Otherwise, ``np.broadcast(n, p).size`` samples are drawn.
+ chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized negative binomial distribution, + where each sample is equal to N, the number of trials it took to + achieve n - 1 successes, N - (n - 1) failures, and a success on the, + (N + n)th trial. + + Notes + ----- + The probability density for the negative binomial distribution is + + .. math:: P(N;n,p) = \binom{N+n-1}{n-1}p^{n}(1-p)^{N}, + + where :math:`n-1` is the number of successes, :math:`p` is the + probability of success, and :math:`N+n-1` is the number of trials. + The negative binomial distribution gives the probability of n-1 + successes and N failures in N+n-1 trials, and success on the (N+n)th + trial. + + If one throws a die repeatedly until the third time a "1" appears, + then the probability distribution of the number of non-"1"s that + appear before the third "1" is a negative binomial distribution. + + References + ---------- + .. [1] Weisstein, Eric W. "Negative Binomial Distribution." From + MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/NegativeBinomialDistribution.html + .. [2] Wikipedia, "Negative binomial distribution", + http://en.wikipedia.org/wiki/Negative_binomial_distribution + + Examples + -------- + Draw samples from the distribution: + + A real world example. A company drills wild-cat oil + exploration wells, each with an estimated probability of + success of 0.1. What is the probability of having one success + for each successive well, that is what is the probability of a + single success after drilling 5 wells, after 6 wells, etc.? + + >>> import mars.tensor as mt + + >>> s = mt.random.negative_binomial(1, 0.1, 100000) + >>> for i in range(1, 11): + ... probability = (mt.sum(s 0. + nonc : float or array_like of floats + Non-centrality, should be non-negative. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``df`` and ``nonc`` are both scalars. + Otherwise, ``mt.broadcast(df, nonc).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized noncentral chi-square distribution. + + Notes + ----- + The probability density function for the noncentral Chi-square + distribution is + + .. math:: P(x;df,nonc) = \sum^{\infty}_{i=0} + \frac{e^{-nonc/2}(nonc/2)^{i}}{i!} + \P_{Y_{df+2i}}(x), + + where :math:`Y_{q}` is the Chi-square with q degrees of freedom. + + In Delhi (2007), it is noted that the noncentral chi-square is + useful in bombing and coverage problems, the probability of + killing the point target given by the noncentral chi-squared + distribution. + + References + ---------- + .. [1] Delhi, M.S. Holla, "On a noncentral chi-square distribution in + the analysis of weapon systems effectiveness", Metrika, + Volume 15, Number 1 / December, 1970. + .. 
[2] Wikipedia, "Noncentral chi-square distribution" + http://en.wikipedia.org/wiki/Noncentral_chi-square_distribution + + Examples + -------- + Draw values from the distribution and plot the histogram + + >>> import matplotlib.pyplot as plt + >>> import mars.tensor as mt + >>> values = plt.hist(mt.random.noncentral_chisquare(3, 20, 100000).execute(), + ... bins=200, normed=True) + >>> plt.show() + + Draw values from a noncentral chisquare with very small noncentrality, + and compare to a chisquare. + + >>> plt.figure() + >>> values = plt.hist(mt.random.noncentral_chisquare(3, .0000001, 100000).execute(), + ... bins=mt.arange(0., 25, .1).execute(), normed=True) + >>> values2 = plt.hist(mt.random.chisquare(3, 100000).execute(), + ... bins=mt.arange(0., 25, .1).execute(), normed=True) + >>> plt.plot(values[1][0:-1], values[0]-values2[0], 'ob') + >>> plt.show() + + Demonstrate how large values of non-centrality lead to a more symmetric + distribution. + + >>> plt.figure() + >>> values = plt.hist(mt.random.noncentral_chisquare(3, 20, 100000).execute(), + ... bins=200, normed=True) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .noncentral_chisquare(handle_array(df), handle_array(nonc), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorNoncentralChisquare(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(df, nonc, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/noncentral_f.py b/python/xorbits/_mars/tensor/random/noncentral_f.py new file mode 100644 index 000000000..f8923c372 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/noncentral_f.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorNoncentralF(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["dfnum", "dfden", "nonc"] + _op_type_ = OperandDef.RAND_NONCENTRAL_F + + _fields_ = "dfnum", "dfden", "nonc", "size" + dfnum = AnyField("dfnum") + dfden = AnyField("dfden") + nonc = AnyField("nonc") + _func_name = "noncentral_f" + + def __call__(self, dfnum, dfden, nonc, chunk_size=None): + return self.new_tensor([dfnum, dfden, nonc], None, raw_chunk_size=chunk_size) + + +def noncentral_f( + random_state, dfnum, dfden, nonc, size=None, chunk_size=None, gpu=None, dtype=None +): + """ + Draw samples from the noncentral F distribution. + + Samples are drawn from an F distribution with specified parameters, + `dfnum` (degrees of freedom in numerator) and `dfden` (degrees of + freedom in denominator), where both parameters > 1. + `nonc` is the non-centrality parameter. + + Parameters + ---------- + dfnum : float or array_like of floats + Numerator degrees of freedom, should be > 0. 
+ dfden : float or array_like of floats + Denominator degrees of freedom, should be > 0. + nonc : float or array_like of floats + Non-centrality parameter, the sum of the squares of the numerator + means, should be >= 0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``dfnum``, ``dfden``, and ``nonc`` + are all scalars. Otherwise, ``np.broadcast(dfnum, dfden, nonc).size`` + samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized noncentral Fisher distribution. + + Notes + ----- + When calculating the power of an experiment (power = probability of + rejecting the null hypothesis when a specific alternative is true) the + non-central F statistic becomes important. When the null hypothesis is + true, the F statistic follows a central F distribution. When the null + hypothesis is not true, then it follows a non-central F statistic. + + References + ---------- + .. [1] Weisstein, Eric W. "Noncentral F-Distribution." + From MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/NoncentralF-Distribution.html + .. [2] Wikipedia, "Noncentral F-distribution", + http://en.wikipedia.org/wiki/Noncentral_F-distribution + + Examples + -------- + In a study, testing for a specific alternative to the null hypothesis + requires use of the Noncentral F distribution. We need to calculate the + area in the tail of the distribution that exceeds the value of the F + distribution for the null hypothesis. We'll plot the two probability + distributions for comparison. + + >>> import mars.tensor as mt + >>> import matplotlib.pyplot as plt + + >>> dfnum = 3 # between group deg of freedom + >>> dfden = 20 # within groups degrees of freedom + >>> nonc = 3.0 + >>> nc_vals = mt.random.noncentral_f(dfnum, dfden, nonc, 1000000) + >>> NF = np.histogram(nc_vals.execute(), bins=50, normed=True) # TODO(jisheng): implement mt.histogram + >>> c_vals = mt.random.f(dfnum, dfden, 1000000) + >>> F = np.histogram(c_vals.execute(), bins=50, normed=True) + >>> plt.plot(F[1][1:], F[0]) + >>> plt.plot(NF[1][1:], NF[0]) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .noncentral_f( + handle_array(dfnum), handle_array(dfden), handle_array(nonc), size=(0,) + ) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorNoncentralF(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(dfnum, dfden, nonc, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/normal.py b/python/xorbits/_mars/tensor/random/normal.py new file mode 100644 index 000000000..8c3f74485 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/normal.py @@ -0,0 +1,141 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorNormal(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["loc", "scale"] + _op_type_ = OperandDef.RAND_NORMAL + + _fields_ = "loc", "scale", "size" + loc = AnyField("loc") + scale = AnyField("scale") + _func_name = "normal" + + def __call__(self, loc, scale, chunk_size=None): + return self.new_tensor([loc, scale], None, raw_chunk_size=chunk_size) + + +def normal( + random_state, loc=0.0, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw random samples from a normal (Gaussian) distribution. + + The probability density function of the normal distribution, first + derived by De Moivre and 200 years later by both Gauss and Laplace + independently [2]_, is often called the bell curve because of + its characteristic shape (see the example below). + + The normal distributions occurs often in nature. For example, it + describes the commonly occurring distribution of samples influenced + by a large number of tiny, random disturbances, each with its own + unique distribution [2]_. + + Parameters + ---------- + loc : float or array_like of floats + Mean ("centre") of the distribution. + scale : float or array_like of floats + Standard deviation (spread or "width") of the distribution. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``loc`` and ``scale`` are both scalars. + Otherwise, ``mt.broadcast(loc, scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized normal distribution. + + See Also + -------- + scipy.stats.norm : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Gaussian distribution is + + .. math:: p(x) = \frac{1}{\sqrt{ 2 \pi \sigma^2 }} + e^{ - \frac{ (x - \mu)^2 } {2 \sigma^2} }, + + where :math:`\mu` is the mean and :math:`\sigma` the standard + deviation. The square of the standard deviation, :math:`\sigma^2`, + is called the variance. + + The function has its peak at the mean, and its "spread" increases with + the standard deviation (the function reaches 0.607 times its maximum at + :math:`x + \sigma` and :math:`x - \sigma` [2]_). This implies that + `numpy.random.normal` is more likely to return samples lying close to + the mean, rather than those far away. + + References + ---------- + .. [1] Wikipedia, "Normal distribution", + http://en.wikipedia.org/wiki/Normal_distribution + .. [2] P. R. 
Peebles Jr., "Central Limit Theorem" in "Probability, + Random Variables and Random Signal Principles", 4th ed., 2001, + pp. 51, 51, 125. + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> mu, sigma = 0, 0.1 # mean and standard deviation + >>> s = mt.random.normal(mu, sigma, 1000) + + Verify the mean and the variance: + + >>> (abs(mu - mt.mean(s)) < 0.01).execute() + True + + >>> (abs(sigma - mt.std(s, ddof=1)) < 0.01).execute() + True + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), 30, normed=True) + >>> plt.plot(bins, (1/(sigma * mt.sqrt(2 * mt.pi)) * + ... mt.exp( - (bins - mu)**2 / (2 * sigma**2) )).execute(), + ... linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .normal(handle_array(loc), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorNormal(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(loc, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/pareto.py b/python/xorbits/_mars/tensor/random/pareto.py new file mode 100644 index 000000000..526659f66 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/pareto.py @@ -0,0 +1,138 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorPareto(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["a"] + _op_type_ = OperandDef.RAND_PARETO + + _fields_ = "a", "size" + a = AnyField("a") + _func_name = "pareto" + + def __call__(self, a, chunk_size=None): + return self.new_tensor([a], None, raw_chunk_size=chunk_size) + + +def pareto(random_state, a, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Pareto II or Lomax distribution with + specified shape. + + The Lomax or Pareto II distribution is a shifted Pareto + distribution. The classical Pareto distribution can be + obtained from the Lomax distribution by adding 1 and + multiplying by the scale parameter ``m`` (see Notes). The + smallest value of the Lomax distribution is zero while for the + classical Pareto distribution it is ``mu``, where the standard + Pareto distribution has location ``mu = 1``. Lomax can also + be considered as a simplified version of the Generalized + Pareto distribution (available in SciPy), with the scale set + to one and the location set to zero. + + The Pareto distribution must be greater than zero, and is + unbounded above. It is also known as the "80-20 rule". 
In + this distribution, 80 percent of the weights are in the lowest + 20 percent of the range, while the other 20 percent fill the + remaining 80 percent of the range. + + Parameters + ---------- + a : float or array_like of floats + Shape of the distribution. Should be greater than zero. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``a`` is a scalar. Otherwise, + ``mt.array(a).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Pareto distribution. + + See Also + -------- + scipy.stats.lomax : probability density function, distribution or + cumulative density function, etc. + scipy.stats.genpareto : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Pareto distribution is + + .. math:: p(x) = \frac{am^a}{x^{a+1}} + + where :math:`a` is the shape and :math:`m` the scale. + + The Pareto distribution, named after the Italian economist + Vilfredo Pareto, is a power law probability distribution + useful in many real world problems. Outside the field of + economics it is generally referred to as the Bradford + distribution. Pareto developed the distribution to describe + the distribution of wealth in an economy. It has also found + use in insurance, web page access statistics, oil field sizes, + and many other problems, including the download frequency for + projects in Sourceforge [1]_. It is one of the so-called + "fat-tailed" distributions. + + + References + ---------- + .. [1] Francis Hunt and Paul Johnson, On the Pareto Distribution of + Sourceforge projects. + .. [2] Pareto, V. (1896). Course of Political Economy. Lausanne. + .. [3] Reiss, R.D., Thomas, M.(2001), Statistical Analysis of Extreme + Values, Birkhauser Verlag, Basel, pp 23-30. + .. [4] Wikipedia, "Pareto distribution", + http://en.wikipedia.org/wiki/Pareto_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> a, m = 3., 2. # shape and mode + >>> s = (mt.random.pareto(a, 1000) + 1) * m + + Display the histogram of the samples, along with the probability + density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, _ = plt.hist(s.execute(), 100, normed=True) + >>> fit = a*m**a / bins**(a+1) + >>> plt.plot(bins, max(count)*fit/max(fit), linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = np.random.RandomState().pareto(handle_array(a), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorPareto(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(a, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/permutation.py b/python/xorbits/_mars/tensor/random/permutation.py new file mode 100644 index 000000000..78bc63133 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/permutation.py @@ -0,0 +1,239 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from numbers import Integral + +import numpy as np + +from ... import opcodes as OperandDef +from ...core.operand import OperandStage +from ...serialization.serializables import Int32Field, KeyField +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from ..operands import TensorOperandMixin, TensorShuffleProxy +from ..utils import gen_random_seeds, validate_axis +from .core import TensorRandomMapReduceOperand + + +def _permutation_on_axis(ar, axis, rs, xp): + try: + return rs.permutation(ar, axis=axis) + except TypeError: + # numpy starts to support axis from 1.18 + if axis == 0: + return rs.permutation(ar) + indices = xp.arange(ar.shape[axis]) + rs.shuffle(indices) + slc = (slice(None),) * axis + (indices,) + return ar[slc] + + +class TensorPermutation(TensorRandomMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.PERMUTATION + + input = KeyField("input") + axis = Int32Field("axis") + + reduce_size = Int32Field("reduce_size") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.input = self._inputs[0] + + def __call__(self, x): + return self.new_tensor([x], x.shape, order=x.order) + + @classmethod + def tile(cls, op): + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + + state = np.random.RandomState(op.seed) + if len(op.input.chunks) == 1: + chunk_op = op.copy().reset_key() + chunk_op._state = None + chunk_op.seed = gen_random_seeds(1, state)[0] + c = op.input.chunks[0] + chunk = chunk_op.new_chunk([c], shape=c.shape, index=c.index, order=c.order) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + nsplits=op.input.nsplits, + chunks=[chunk], + ) + + chunk_size = in_tensor.chunk_shape[op.axis] + map_seeds = gen_random_seeds(chunk_size, state) + reduce_seeds = gen_random_seeds(chunk_size, state) + reduce_chunks = [] + if in_tensor.ndim > 1: + cs = in_tensor.chunk_shape + left_chunk_shape = cs[: op.axis] + cs[op.axis + 1 :] + idx_iter = itertools.product(*[range(s) for s in left_chunk_shape]) + else: + idx_iter = [()] + for idx in idx_iter: + map_chunks = [] + for j in range(chunk_size): + in_idx = list(idx) + in_idx.insert(op.axis, j) + c = in_tensor.cix[tuple(in_idx)] + chunk_op = TensorPermutation( + stage=OperandStage.map, + seed=map_seeds[c.index[op.axis]], + axis=op.axis, + reduce_size=chunk_size, + dtype=c.dtype, + gpu=c.op.gpu, + ) + map_chunk = chunk_op.new_chunk( + [c], shape=c.shape, index=c.index, order=out_tensor.order + ) + map_chunks.append(map_chunk) + + proxy_chunk = TensorShuffleProxy( + dtype=out_tensor.dtype, _tensor_keys=[in_tensor.key] + ).new_chunk(map_chunks, shape=()) + + for c in map_chunks: + chunk_op = TensorPermutation( + stage=OperandStage.reduce, + n_reducers=len(map_chunks), + seed=reduce_seeds[c.index[op.axis]], + axis=op.axis, + ) + chunk_shape = list(c.shape) + chunk_shape[op.axis] = np.nan + reduce_chunk = chunk_op.new_chunk( + [proxy_chunk], + shape=tuple(chunk_shape), + order=out_tensor.order, + index=c.index, + dtype=out_tensor.dtype, + ) + reduce_chunks.append(reduce_chunk) + + 
new_op = op.copy() + nsplits = list(in_tensor.nsplits) + nsplits[op.axis] = [np.nan] * len(nsplits[op.axis]) + return new_op.new_tensors( + op.inputs, + out_tensor.shape, + order=out_tensor.order, + chunks=reduce_chunks, + nsplits=nsplits, + ) + + @classmethod + def _execute_map(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + out_chunk = op.outputs[0] + reduce_size = op.reduce_size + with device(device_id): + rs = xp.random.RandomState(op.seed) + to_reduce_idxes = rs.randint(reduce_size, size=x.shape[op.axis]) + for to_reduce_idx in range(reduce_size): + reduce_idx = ( + out_chunk.index[: op.axis] + + (to_reduce_idx,) + + out_chunk.index[op.axis + 1 :] + ) + slc = (slice(None),) * op.axis + (to_reduce_idxes == to_reduce_idx,) + ctx[out_chunk.key, reduce_idx] = x[slc] + + @classmethod + def _execute_reduce(cls, ctx, op: "TensorPermutation"): + inputs = list(op.iter_mapper_data(ctx)) + inputs, device_id, xp = as_same_device(inputs, device=op.device, ret_extra=True) + + with device(device_id): + rs = xp.random.RandomState(op.seed) + data = xp.concatenate(inputs, axis=op.axis) + if op.axis == 0: + rs.shuffle(data) + else: + data[...] = _permutation_on_axis(data, op.axis, rs, xp) + ctx[op.outputs[0].key] = data + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + cls._execute_reduce(ctx, op) + else: + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + rs = xp.random.RandomState(op.seed) + ctx[op.outputs[0].key] = _permutation_on_axis(x, op.axis, rs, xp) + + +def permutation(random_state, x, axis=0, chunk_size=None): + r""" + Randomly permute a sequence, or return a permuted range. + + Parameters + ---------- + x : int or array_like + If `x` is an integer, randomly permute ``mt.arange(x)``. + If `x` is an array, make a copy and shuffle the elements + randomly. + axis : int, optional + The axis which `x` is shuffled along. Default is 0. + chunk_size : : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + Returns + ------- + out : Tensor + Permuted sequence or tensor range. + Examples + -------- + >>> import mars.tensor as mt + >>> rng = mt.random.RandomState() + >>> rng.permutation(10).execute() + array([1, 2, 3, 7, 9, 8, 0, 6, 4, 5]) # random + >>> rng.permutation([1, 4, 9, 12, 15]).execute() + array([ 9, 4, 12, 1, 15]) # random + >>> arr = mt.arange(9).reshape((3, 3)) + >>> rng.permutation(arr).execute() + array([[3, 4, 5], # random + [6, 7, 8], + [0, 1, 2]]) + >>> rng.permutation("abc") + Traceback (most recent call last): + ... 
+ numpy.AxisError: x must be an integer or at least 1-dimensional + """ + if isinstance(x, (Integral, np.integer)): + from ..datasource import arange + + x = arange(x, chunk_size=chunk_size) + else: + x = astensor(x, chunk_size=chunk_size) + if x.ndim < 1: + raise np.AxisError("x must be an integer or at least 1-dimensional") + + axis = validate_axis(x.ndim, axis) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorPermutation(seed=seed, axis=axis, dtype=x.dtype, gpu=x.op.gpu) + return op(x) diff --git a/python/xorbits/_mars/tensor/random/poisson.py b/python/xorbits/_mars/tensor/random/poisson.py new file mode 100644 index 000000000..b63d62724 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/poisson.py @@ -0,0 +1,109 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorPoisson(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["lam"] + _op_type_ = OperandDef.RAND_POSSION + + _fields_ = "lam", "size" + lam = AnyField("lam") + _func_name = "poisson" + + def __call__(self, lam, chunk_size=None): + return self.new_tensor([lam], None, raw_chunk_size=chunk_size) + + +def poisson(random_state, lam=1.0, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Poisson distribution. + + The Poisson distribution is the limit of the binomial distribution + for large N. + + Parameters + ---------- + lam : float or array_like of floats + Expectation of interval, should be >= 0. A sequence of expectation + intervals must be broadcastable over the requested size. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``lam`` is a scalar. Otherwise, + ``mt.array(lam).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Poisson distribution. + + Notes + ----- + The Poisson distribution + + .. math:: f(k; \lambda)=\frac{\lambda^k e^{-\lambda}}{k!} + + For events with an expected separation :math:`\lambda` the Poisson + distribution :math:`f(k; \lambda)` describes the probability of + :math:`k` events occurring within the observed + interval :math:`\lambda`. + + Because the output is limited to the range of the C long type, a + ValueError is raised when `lam` is within 10 sigma of the maximum + representable value. + + References + ---------- + .. [1] Weisstein, Eric W. "Poisson Distribution." 
+ From MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/PoissonDistribution.html + .. [2] Wikipedia, "Poisson distribution", + http://en.wikipedia.org/wiki/Poisson_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + >>> s = mt.random.poisson(5, 10000) + + Display histogram of the sample: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), 14, normed=True) + >>> plt.show() + + Draw each 100 values for lambda 100 and 500: + + >>> s = mt.random.poisson(lam=(100., 500.), size=(100, 2)) + """ + if dtype is None: + dtype = np.random.RandomState().poisson(handle_array(lam), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorPoisson(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(lam, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/power.py b/python/xorbits/_mars/tensor/random/power.py new file mode 100644 index 000000000..685cd2d71 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/power.py @@ -0,0 +1,140 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorRandomPower(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["a"] + _op_type_ = OperandDef.RAND_POWER + + _fields_ = "a", "size" + a = AnyField("a") + _func_name = "power" + + def __call__(self, a, chunk_size=None): + return self.new_tensor([a], None, raw_chunk_size=chunk_size) + + +def power(random_state, a, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draws samples in [0, 1] from a power distribution with positive + exponent a - 1. + + Also known as the power function distribution. + + Parameters + ---------- + a : float or array_like of floats + Parameter of the distribution. Should be greater than zero. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``a`` is a scalar. Otherwise, + ``mt.array(a).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized power distribution. + + Raises + ------ + ValueError + If a < 1. + + Notes + ----- + The probability density function is + + .. math:: P(x; a) = ax^{a-1}, 0 \le x \le 1, a>0. + + The power function distribution is just the inverse of the Pareto + distribution. 
It may also be seen as a special case of the Beta + distribution. + + It is used, for example, in modeling the over-reporting of insurance + claims. + + References + ---------- + .. [1] Christian Kleiber, Samuel Kotz, "Statistical size distributions + in economics and actuarial sciences", Wiley, 2003. + .. [2] Heckert, N. A. and Filliben, James J. "NIST Handbook 148: + Dataplot Reference Manual, Volume 2: Let Subcommands and Library + Functions", National Institute of Standards and Technology + Handbook Series, June 2003. + http://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/powpdf.pdf + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> a = 5. # shape + >>> samples = 1000 + >>> s = mt.random.power(a, samples) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), bins=30) + >>> x = mt.linspace(0, 1, 100) + >>> y = a*x**(a-1.) + >>> normed_y = samples*mt.diff(bins)[0]*y + >>> plt.plot(x.execute(), normed_y.execute()) + >>> plt.show() + + Compare the power function distribution to the inverse of the Pareto. + + >>> from scipy import stats + >>> rvs = mt.random.power(5, 1000000) + >>> rvsp = mt.random.pareto(5, 1000000) + >>> xx = mt.linspace(0,1,100) + >>> powpdf = stats.powerlaw.pdf(xx.execute(),5) + + >>> plt.figure() + >>> plt.hist(rvs.execute(), bins=50, normed=True) + >>> plt.plot(xx.execute(),powpdf,'r-') + >>> plt.title('np.random.power(5)') + + >>> plt.figure() + >>> plt.hist((1./(1.+rvsp)).execute(), bins=50, normed=True) + >>> plt.plot(xx.execute(),powpdf,'r-') + >>> plt.title('inverse of 1 + np.random.pareto(5)') + + >>> plt.figure() + >>> plt.hist((1./(1.+rvsp)).execute(), bins=50, normed=True) + >>> plt.plot(xx.execute(),powpdf,'r-') + >>> plt.title('inverse of stats.pareto(5)') + """ + if dtype is None: + dtype = np.random.RandomState().power(handle_array(a), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandomPower(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(a, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/rand.py b/python/xorbits/_mars/tensor/random/rand.py new file mode 100644 index 000000000..fed83c38a --- /dev/null +++ b/python/xorbits/_mars/tensor/random/rand.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorRandomOperandMixin, TensorSimpleRandomData + + +class TensorRand(TensorSimpleRandomData, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_RAND + _func_name = "rand" + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def rand(random_state, *dn, **kw): + """ + Random values in a given shape. 
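+
+    As a minimal sketch (the values are random; calling ``execute()``
+    assumes a Mars session is available):
+
+    >>> import mars.tensor as mt
+    >>> t = mt.random.rand(2, 3)
+    >>> t.shape
+    (2, 3)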
+ + Create a tensor of the given shape and populate it with + random samples from a uniform distribution + over ``[0, 1)``. + + Parameters + ---------- + d0, d1, ..., dn : int, optional + The dimensions of the returned tensor, should all be positive. + If no argument is given a single Python float is returned. + + Returns + ------- + out : Tensor, shape ``(d0, d1, ..., dn)`` + Random values. + + See Also + -------- + random + + Notes + ----- + This is a convenience function. If you want an interface that + takes a shape-tuple as the first argument, refer to + mt.random.random_sample . + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.rand(3, 2).execute() + array([[ 0.14022471, 0.96360618], #random + [ 0.37601032, 0.25528411], #random + [ 0.49313049, 0.94909878]]) #random + """ + if len(dn) == 1 and isinstance(dn[0], (tuple, list)): + raise TypeError("'tuple' object cannot be interpreted as an integer") + if "dtype" not in kw: + kw["dtype"] = np.dtype("f8") + chunk_size = kw.pop("chunk_size", None) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRand(seed=seed, size=dn, **kw) + + for key in op.extra_params: + if not key.startswith("_"): + raise ValueError(f"rand got unexpected key arguments {key}") + + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/randint.py b/python/xorbits/_mars/tensor/random/randint.py new file mode 100644 index 000000000..7dd0def25 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/randint.py @@ -0,0 +1,173 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...
import opcodes as OperandDef +from ...serialization.serializables import Float64Field, Int64Field +from ..array_utils import array_module +from ..utils import gen_random_seeds +from .core import TensorRandomOperandMixin, TensorSimpleRandomData + + +class TensorRandint(TensorSimpleRandomData, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_RANDINT + + _fields_ = "low", "high", "density", "size" + low = Int64Field("low") + high = Int64Field("high") + density = Float64Field("density") + _func_name = "randint" + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + @classmethod + def execute(cls, ctx, op): + if op.sparse: + cls.execute_sparse(ctx, op) + else: + super().execute(ctx, op) + + @classmethod + def execute_sparse(cls, ctx, op): + from ...lib.sparse import SparseNDArray + from ...lib.sparse.core import cps, sps + + xp = array_module(op.gpu) + if op.seed: + rs = np.random.RandomState(op.seed) + else: + rs = None + + chunk = op.outputs[0] + if chunk.ndim > 2: + raise NotImplementedError + + low = 1 if op.low == 0 else op.low + + rs = rs or xp.random + size = int(np.ceil(np.prod(chunk.shape) * op.density)) + xps = cps if op.gpu else sps + ij = xp.empty((2, size)) + ij[0] = rs.randint(chunk.shape[0], size=size) + ij[1] = rs.randint(chunk.shape[1], size=size) + data = rs.randint(low, op.high, size=size).astype(op.dtype) + m = xps.coo_matrix((data, ij), chunk.shape).tocsr() + m.data[m.data >= op.high] = op.high - 1 + + # scipy.sparse is too slow, we remove the precise version due to the performance + # m = sps.random(*chunk.shape, density=op.density, format='csr') + # m.data = (rs or xp.random).randint(low, op.high, size=m.data.size)\ + # .astype(op.dtype) + + ctx[chunk.key] = SparseNDArray(m) + + @classmethod + def estimate_size(cls, ctx, op): + chunk = op.outputs[0] + if not op.sparse or not getattr(op, "_density", None): + super().estimate_size(ctx, op) + else: + # use density to estimate real memory usage + nbytes = int(chunk.nbytes * getattr(chunk.op, "_density")) + ctx[chunk.key] = (nbytes, nbytes) + + +def randint( + random_state, + low, + high=None, + size=None, + dtype="l", + density=None, + chunk_size=None, + gpu=None, +): + """ + Return random integers from `low` (inclusive) to `high` (exclusive). + + Return random integers from the "discrete uniform" distribution of + the specified dtype in the "half-open" interval [`low`, `high`). If + `high` is None (the default), then results are from [0, `low`). + + Parameters + ---------- + low : int + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is one above the + *highest* such integer). + high : int, optional + If provided, one above the largest (signed) integer to be drawn + from the distribution (see above for behavior if ``high=None``). + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + dtype : dtype, optional + Desired dtype of the result. All dtypes are determined by their + name, i.e., 'int64', 'int', etc, so byteorder is not available + and a specific precision may have different C types depending + on the platform. The default value is 'np.int'. 
+ density: float, optional + if density specified, a sparse tensor will be created + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : int or Tensor of ints + `size`-shaped tensor of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + See Also + -------- + random.random_integers : similar to `randint`, only for the closed + interval [`low`, `high`], and 1 is the lowest value if `high` is + omitted. In particular, this other one is the one to use to generate + uniformly distributed discrete non-integers. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.randint(2, size=10).execute() + array([1, 0, 0, 0, 1, 1, 0, 0, 1, 0]) + >>> mt.random.randint(1, size=10).execute() + array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + + Generate a 2 x 4 tensor of ints between 0 and 4, inclusive: + + >>> mt.random.randint(5, size=(2, 4)).execute() + array([[4, 0, 2, 1], + [3, 2, 2, 0]]) + """ + sparse = bool(density) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandint( + seed=seed, + low=low, + high=high, + size=size, + dtype=dtype, + gpu=gpu, + sparse=sparse, + density=density, + ) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/randn.py b/python/xorbits/_mars/tensor/random/randn.py new file mode 100644 index 000000000..790992e3b --- /dev/null +++ b/python/xorbits/_mars/tensor/random/randn.py @@ -0,0 +1,94 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorRandomOperandMixin, TensorSimpleRandomData + + +class TensorRandn(TensorSimpleRandomData, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_RANDN + _func_name = "randn" + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def randn(random_state, *dn, **kw): + r""" + Return a sample (or samples) from the "standard normal" distribution. + + If positive, int_like or int-convertible arguments are provided, + `randn` generates an array of shape ``(d0, d1, ..., dn)``, filled + with random floats sampled from a univariate "normal" (Gaussian) + distribution of mean 0 and variance 1 (if any of the :math:`d_i` are + floats, they are first converted to integers by truncation). A single + float randomly sampled from the distribution is returned if no + argument is provided. + + This is a convenience function. If you want an interface that takes a + tuple as the first argument, use `numpy.random.standard_normal` instead. + + Parameters + ---------- + d0, d1, ..., dn : int, optional + The dimensions of the returned tensor, should be all positive. 
+ If no argument is given a single Python float is returned. + + Returns + ------- + Z : Tensor or float + A ``(d0, d1, ..., dn)``-shaped array of floating-point samples from + the standard normal distribution, or a single such float if + no parameters were supplied. + + See Also + -------- + random.standard_normal : Similar, but takes a tuple as its argument. + + Notes + ----- + For random samples from :math:`N(\mu, \sigma^2)`, use: + + ``sigma * mt.random.randn(...) + mu`` + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.randn().execute() + 2.1923875335537315 #random + + Two-by-four tensor of samples from N(3, 6.25): + + >>> (2.5 * mt.random.randn(2, 4) + 3).execute() + array([[-4.49401501, 4.00950034, -1.81814867, 7.29718677], #random + [ 0.39924804, 4.68456316, 4.99394529, 4.84057254]]) #random + """ + if len(dn) == 1 and isinstance(dn[0], (tuple, list)): + raise TypeError("'tuple' object cannot be interpreted as an integer") + if "dtype" not in kw: + kw["dtype"] = np.dtype("f8") + chunk_size = kw.pop("chunk_size", None) + + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandn(seed=seed, size=dn, **kw) + + for key in op.extra_params: + if not key.startswith("_"): + raise ValueError(f"randn got unexpected key arguments {key}") + + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/random_integers.py b/python/xorbits/_mars/tensor/random/random_integers.py new file mode 100644 index 000000000..e986a0f81 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/random_integers.py @@ -0,0 +1,121 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import Int64Field +from ..utils import gen_random_seeds +from .core import TensorRandomOperandMixin, TensorSimpleRandomData + + +class TensorRandomIntegers(TensorSimpleRandomData, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_RANDOM_INTEGERS + + _fields_ = "low", "high", "size" + low = Int64Field("low") + high = Int64Field("high") + _func_name = "random_integers" + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def random_integers(random_state, low, high=None, size=None, chunk_size=None, gpu=None): + """ + Random integers of type mt.int between `low` and `high`, inclusive. + + Return random integers of type mt.int from the "discrete uniform" + distribution in the closed interval [`low`, `high`]. If `high` is + None (the default), then results are from [1, `low`]. The np.int + type translates to the C long type used by Python 2 for "short" + integers and its precision is platform dependent. + + This function has been deprecated. Use randint instead. + + Parameters + ---------- + low : int + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is the *highest* such + integer). 
+ high : int, optional + If provided, the largest (signed) integer to be drawn from the + distribution (see above for behavior if ``high=None``). + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + + Returns + ------- + out : int or Tensor of ints + `size`-shaped array of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + See Also + -------- + random.randint : Similar to `random_integers`, only for the half-open + interval [`low`, `high`), and 0 is the lowest value if `high` is + omitted. + + Notes + ----- + To sample from N evenly spaced floating-point numbers between a and b, + use:: + + a + (b - a) * (np.random.random_integers(N) - 1) / (N - 1.) + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.random_integers(5).execute() + 4 + >>> type(mt.random.random_integers(5).execute()) + + >>> mt.random.random_integers(5, size=(3,2)).execute() + array([[5, 4], + [3, 3], + [4, 5]]) + + Choose five random numbers from the set of five evenly-spaced + numbers between 0 and 2.5, inclusive (*i.e.*, from the set + :math:`{0, 5/8, 10/8, 15/8, 20/8}`): + + >>> (2.5 * (mt.random.random_integers(5, size=(5,)) - 1) / 4.).execute() + array([ 0.625, 1.25 , 0.625, 0.625, 2.5 ]) + + Roll two six sided dice 1000 times and sum the results: + + >>> d1 = mt.random.random_integers(1, 6, 1000) + >>> d2 = mt.random.random_integers(1, 6, 1000) + >>> dsums = d1 + d2 + + Display results as a histogram: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(dsums.execute(), 11, normed=True) + >>> plt.show() + """ + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandomIntegers( + seed=seed, size=size, dtype=np.dtype(int), low=low, high=high, gpu=gpu + ) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/random_sample.py b/python/xorbits/_mars/tensor/random/random_sample.py new file mode 100644 index 000000000..7e13dd802 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/random_sample.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorRandomOperandMixin, TensorSimpleRandomData + + +class TensorRandomSample(TensorSimpleRandomData, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_RANDOM_SAMPLE + + _fields_ = ("size",) + _func_name = "random_sample" + + def __call__(self, chunk_size): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def random_sample(random_state, size=None, chunk_size=None, gpu=None, dtype=None): + """ + Return random floats in the half-open interval [0.0, 1.0). + + Results are from the "continuous uniform" distribution over the + stated interval. To sample :math:`Unif[a, b), b > a` multiply + the output of `random_sample` by `(b-a)` and add `a`:: + + (b - a) * random_sample() + a + + Parameters + ---------- + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : float or Tensor of floats + Array of random floats of shape `size` (unless ``size=None``, in which + case a single float is returned). + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.random_sample().execute() + 0.47108547995356098 + >>> type(mt.random.random_sample().execute()) + + >>> mt.random.random_sample((5,)).execute() + array([ 0.30220482, 0.86820401, 0.1654503 , 0.11659149, 0.54323428]) + + Three-by-two array of random numbers from [-5, 0): + + >>> (5 * mt.random.random_sample((3, 2)) - 5).execute() + array([[-3.99149989, -0.52338984], + [-2.99091858, -0.79479508], + [-1.23204345, -1.75224494]]) + """ + if dtype is None: + dtype = np.dtype("f8") + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandomSample(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/rayleigh.py b/python/xorbits/_mars/tensor/random/rayleigh.py new file mode 100644 index 000000000..fbfdc5bba --- /dev/null +++ b/python/xorbits/_mars/tensor/random/rayleigh.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorRayleigh(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["scale"] + _op_type_ = OperandDef.RAND_RAYLEIGH + + _fields_ = "scale", "size" + scale = AnyField("scale") + _func_name = "rayleigh" + + def __call__(self, scale, chunk_size=None): + return self.new_tensor([scale], None, raw_chunk_size=chunk_size) + + +def rayleigh(random_state, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Rayleigh distribution. + + The :math:`\chi` and Weibull distributions are generalizations of the + Rayleigh. + + Parameters + ---------- + scale : float or array_like of floats, optional + Scale, also equals the mode. Should be >= 0. Default is 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``scale`` is a scalar. Otherwise, + ``mt.array(scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Rayleigh distribution. + + Notes + ----- + The probability density function for the Rayleigh distribution is + + .. math:: P(x;scale) = \frac{x}{scale^2}e^{\frac{-x^2}{2 \cdotp scale^2}} + + The Rayleigh distribution would arise, for example, if the East + and North components of the wind velocity had identical zero-mean + Gaussian distributions. Then the wind speed would have a Rayleigh + distribution. + + References + ---------- + .. [1] Brighton Webs Ltd., "Rayleigh Distribution," + http://www.brighton-webs.co.uk/distributions/rayleigh.asp + .. [2] Wikipedia, "Rayleigh distribution" + http://en.wikipedia.org/wiki/Rayleigh_distribution + + Examples + -------- + Draw values from the distribution and plot the histogram + + >>> import matplotlib.pyplot as plt + >>> import mars.tensor as mt + + >>> values = plt.hist(mt.random.rayleigh(3, 100000).execute(), bins=200, normed=True) + + Wave heights tend to follow a Rayleigh distribution. If the mean wave + height is 1 meter, what fraction of waves are likely to be larger than 3 + meters? + + >>> meanvalue = 1 + >>> modevalue = mt.sqrt(2 / mt.pi) * meanvalue + >>> s = mt.random.rayleigh(modevalue, 1000000) + + The percentage of waves larger than 3 meters is: + + >>> (100.*mt.sum(s>3)/1000000.).execute() + 0.087300000000000003 + """ + if dtype is None: + dtype = np.random.RandomState().rayleigh(handle_array(scale), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRayleigh(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/shuffle.py b/python/xorbits/_mars/tensor/random/shuffle.py new file mode 100644 index 000000000..0db013543 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/shuffle.py @@ -0,0 +1,61 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..core import TENSOR_TYPE +from ..datasource import tensor as astensor + + +def shuffle(random_state, x, axis=0): + r""" + Modify a sequence in-place by shuffling its contents. + The order of sub-arrays is changed but their contents remains the same. + + Parameters + ---------- + x : array_like + The array or list to be shuffled. + axis : int, optional + The axis which `x` is shuffled along. Default is 0. + + Returns + ------- + None + + Examples + -------- + >>> import mars.tensor as mt + >>> rng = mt.random.RandomState() + >>> arr = mt.arange(10) + >>> rng.shuffle(arr) + >>> arr.execute() + array([0, 1, 4, 2, 8, 6, 5, 9, 3, 7]) # random + + >>> arr = mt.arange(9).reshape((3, 3)) + >>> rng.shuffle(arr) + >>> arr.execute() + array([[6, 7, 8], # random + [0, 1, 2], + [3, 4, 5]]) + """ + from .permutation import permutation + + if isinstance(x, (list, np.ndarray, TENSOR_TYPE)): + x = astensor(x) + else: + raise TypeError("x should be list, numpy ndarray or tensor") + + ret = permutation(random_state, x, axis=axis) + x.data = ret.data diff --git a/python/xorbits/_mars/tensor/random/standard_cauchy.py b/python/xorbits/_mars/tensor/random/standard_cauchy.py new file mode 100644 index 000000000..9a35029c7 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/standard_cauchy.py @@ -0,0 +1,103 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin + + +class TensorStandardCauchy(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_STANDARD_CAUCHY + _func_name = "standard_cauchy" + _fields_ = ("size",) + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def standard_cauchy(random_state, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a standard Cauchy distribution with mode = 0. + + Also known as the Lorentz distribution. + + Parameters + ---------- + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + samples : Tensor or scalar + The drawn samples. 
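# [Editor's note -- illustrative sketch, not part of the patch; NumPy only.]
# The heavy tails discussed in the Notes below mean the sample mean of Cauchy
# draws never settles down as the sample grows, unlike the Gaussian:
import numpy as np

rng = np.random.RandomState(42)
for n in (1_000, 100_000, 1_000_000):
    print(n, rng.standard_cauchy(n).mean(), rng.standard_normal(n).mean())
# The Cauchy column keeps jumping around; the normal column shrinks toward 0.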
+ + Notes + ----- + The probability density function for the full Cauchy distribution is + + .. math:: P(x; x_0, \gamma) = \frac{1}{\pi \gamma \bigl[ 1+ + (\frac{x-x_0}{\gamma})^2 \bigr] } + + and the Standard Cauchy distribution just sets :math:`x_0=0` and + :math:`\gamma=1` + + The Cauchy distribution arises in the solution to the driven harmonic + oscillator problem, and also describes spectral line broadening. It + also describes the distribution of values at which a line tilted at + a random angle will cut the x axis. + + When studying hypothesis tests that assume normality, seeing how the + tests perform on data from a Cauchy distribution is a good indicator of + their sensitivity to a heavy-tailed distribution, since the Cauchy looks + very much like a Gaussian distribution, but with heavier tails. + + References + ---------- + .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "Cauchy + Distribution", + http://www.itl.nist.gov/div898/handbook/eda/section3/eda3663.htm + .. [2] Weisstein, Eric W. "Cauchy Distribution." From MathWorld--A + Wolfram Web Resource. + http://mathworld.wolfram.com/CauchyDistribution.html + .. [3] Wikipedia, "Cauchy distribution" + http://en.wikipedia.org/wiki/Cauchy_distribution + + Examples + -------- + Draw samples and plot the distribution: + + >>> import mars.tensor as mt + >>> import matplotlib.pyplot as plt + + >>> s = mt.random.standard_cauchy(1000000) + >>> s = s[(s>-25) & (s<25)] # truncate distribution so it plots well + >>> plt.hist(s.execute(), bins=100) + >>> plt.show() + """ + if dtype is None: + dtype = np.random.RandomState().standard_cauchy(size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorStandardCauchy(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/standard_exponential.py b/python/xorbits/_mars/tensor/random/standard_exponential.py new file mode 100644 index 000000000..248d37586 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/standard_exponential.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin + + +class TensorStandardExponential(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_STANDARD_EXPONENTIAL + _func_name = "standard_exponential" + _fields_ = ("size",) + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def standard_exponential( + random_state, size=None, chunk_size=None, gpu=None, dtype=None +): + """ + Draw samples from the standard exponential distribution. + + `standard_exponential` is identical to the exponential distribution + with a scale parameter of 1. + + Parameters + ---------- + size : int or tuple of ints, optional + Output shape. 
If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : float or Tensor + Drawn samples. + + Examples + -------- + Output a 3x8000 tensor: + + >>> import mars.tensor as mt + >>> n = mt.random.standard_exponential((3, 8000)) + """ + if dtype is None: + dtype = np.random.RandomState().standard_exponential(size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorStandardExponential(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/standard_gamma.py b/python/xorbits/_mars/tensor/random/standard_gamma.py new file mode 100644 index 000000000..30a0032f5 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/standard_gamma.py @@ -0,0 +1,118 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorStandardGamma(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["shape"] + _op_type_ = OperandDef.RAND_STANDARD_GAMMMA + + _fields_ = "shape", "size" + shape = AnyField("shape") + _func_name = "standard_gamma" + + def __call__(self, shape, chunk_size=None): + return self.new_tensor([shape], None, raw_chunk_size=chunk_size) + + +def standard_gamma( + random_state, shape, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a standard Gamma distribution. + + Samples are drawn from a Gamma distribution with specified parameters, + shape (sometimes designated "k") and scale=1. + + Parameters + ---------- + shape : float or array_like of floats + Parameter, should be > 0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``shape`` is a scalar. Otherwise, + ``mt.array(shape).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized standard gamma distribution. + + See Also + -------- + scipy.stats.gamma : probability density function, distribution or + cumulative density function, etc. 
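# [Editor's note -- illustrative sketch, not part of the patch; NumPy only.]
# standard_gamma draws from Gamma(shape=k, scale=1), so the sample mean and
# variance should both sit near k:
import numpy as np

k = 2.0
draws = np.random.RandomState(0).standard_gamma(k, 1_000_000)
print(draws.mean(), draws.var())  # both approximately k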
+ + Notes + ----- + The probability density for the Gamma distribution is + + .. math:: p(x) = x^{k-1}\frac{e^{-x/\theta}}{\theta^k\Gamma(k)}, + + where :math:`k` is the shape and :math:`\theta` the scale, + and :math:`\Gamma` is the Gamma function. + + The Gamma distribution is often used to model the times to failure of + electronic components, and arises naturally in processes for which the + waiting times between Poisson distributed events are relevant. + + References + ---------- + .. [1] Weisstein, Eric W. "Gamma Distribution." From MathWorld--A + Wolfram Web Resource. + http://mathworld.wolfram.com/GammaDistribution.html + .. [2] Wikipedia, "Gamma distribution", + http://en.wikipedia.org/wiki/Gamma_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> shape, scale = 2., 1. # mean and width + >>> s = mt.random.standard_gamma(shape, 1000000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> import scipy.special as sps + >>> count, bins, ignored = plt.hist(s.execute(), 50, normed=True) + >>> y = bins**(shape-1) * ((mt.exp(-bins/scale))/ \ + ... (sps.gamma(shape) * scale**shape)) + >>> plt.plot(bins, y.execute(), linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState().standard_gamma(handle_array(shape), size=(0,)).dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorStandardGamma(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(shape, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/standard_normal.py b/python/xorbits/_mars/tensor/random/standard_normal.py new file mode 100644 index 000000000..f80603363 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/standard_normal.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin + + +class TensorStandardNormal(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_STANDARD_NORMAL + _func_name = "standard_normal" + _fields_ = ("size",) + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def standard_normal(random_state, size=None, chunk_size=None, gpu=None, dtype=None): + """ + Draw samples from a standard Normal distribution (mean=0, stdev=1). + + Parameters + ---------- + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. 
+ chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : float or Tensor + Drawn samples. + + Examples + -------- + >>> import mars.tensor as mt + + >>> s = mt.random.standard_normal(8000) + >>> s.execute() + array([ 0.6888893 , 0.78096262, -0.89086505, ..., 0.49876311, #random + -0.38672696, -0.4685006 ]) #random + >>> s.shape + (8000,) + >>> s = mt.random.standard_normal(size=(3, 4, 2)) + >>> s.shape + (3, 4, 2) + """ + if dtype is None: + dtype = np.random.RandomState().standard_normal(size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorStandardNormal(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/standard_t.py b/python/xorbits/_mars/tensor/random/standard_t.py new file mode 100644 index 000000000..6b4006fa7 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/standard_t.py @@ -0,0 +1,133 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorStandardT(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["df"] + _op_type_ = OperandDef.RAND_STANDARD_T + + _fields_ = "df", "size" + df = AnyField("df") + _func_name = "standard_t" + + def __call__(self, df, chunk_size=None): + return self.new_tensor([df], None, raw_chunk_size=chunk_size) + + +def standard_t(random_state, df, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a standard Student's t distribution with `df` degrees + of freedom. + + A special case of the hyperbolic distribution. As `df` gets + large, the result resembles that of the standard normal + distribution (`standard_normal`). + + Parameters + ---------- + df : float or array_like of floats + Degrees of freedom, should be > 0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``df`` is a scalar. Otherwise, + ``mt.array(df).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized standard Student's t distribution. + + Notes + ----- + The probability density function for the t distribution is + + .. 
math:: P(x, df) = \frac{\Gamma(\frac{df+1}{2})}{\sqrt{\pi df} + \Gamma(\frac{df}{2})}\Bigl( 1+\frac{x^2}{df} \Bigr)^{-(df+1)/2} + + The t test is based on an assumption that the data come from a + Normal distribution. The t test provides a way to test whether + the sample mean (that is the mean calculated from the data) is + a good estimate of the true mean. + + The derivation of the t-distribution was first published in + 1908 by William Gosset while working for the Guinness Brewery + in Dublin. Due to proprietary issues, he had to publish under + a pseudonym, and so he used the name Student. + + References + ---------- + .. [1] Dalgaard, Peter, "Introductory Statistics With R", + Springer, 2002. + .. [2] Wikipedia, "Student's t-distribution" + http://en.wikipedia.org/wiki/Student's_t-distribution + + Examples + -------- + From Dalgaard page 83 [1]_, suppose the daily energy intake for 11 + women in Kj is: + + >>> import mars.tensor as mt + + >>> intake = mt.array([5260., 5470, 5640, 6180, 6390, 6515, 6805, 7515, \ + ... 7515, 8230, 8770]) + + Does their energy intake deviate systematically from the recommended + value of 7725 kJ? + + We have 10 degrees of freedom, so is the sample mean within 95% of the + recommended value? + + >>> s = mt.random.standard_t(10, size=100000) + >>> mt.mean(intake).execute() + 6753.636363636364 + >>> intake.std(ddof=1).execute() + 1142.1232221373727 + + Calculate the t statistic, setting the ddof parameter to the unbiased + value so the divisor in the standard deviation will be degrees of + freedom, N-1. + + >>> t = (mt.mean(intake)-7725)/(intake.std(ddof=1)/mt.sqrt(len(intake))) + >>> import matplotlib.pyplot as plt + >>> h = plt.hist(s.execute(), bins=100, normed=True) + + For a one-sided t-test, how far out in the distribution does the t + statistic appear? 
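# [Editor's note -- illustrative sketch, not part of the patch; assumes SciPy
# is available.] The question above can also be answered analytically: the
# one-sided tail probability of the observed t statistic comes straight from
# scipy.stats.t rather than from the simulated samples.
import numpy as np
from scipy import stats

intake = np.array([5260., 5470, 5640, 6180, 6390, 6515, 6805, 7515, 7515, 8230, 8770])
t_stat = (intake.mean() - 7725) / (intake.std(ddof=1) / np.sqrt(len(intake)))
print(t_stat, stats.t.cdf(t_stat, df=len(intake) - 1))  # roughly -2.8 and 0.009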
+ + >>> (mt.sum(s= 1).toarray().sum() == pytest.approx(30 * 50 * 0.1, abs=20) + + +random_test_options = namedtuple("random_test_options", ["func_name", "args", "kwargs"]) + +random_params = [ + random_test_options("beta", ([1, 2], [3, 4]), dict(chunk_size=2)), + random_test_options("binomial", (10, 0.5, 100), dict(chunk_size=50)), + random_test_options("chisquare", (2, 100), dict(chunk_size=50)), + random_test_options("dirichlet", ((10, 5, 3), 100), dict(chunk_size=50)), + random_test_options("exponential", (1.0, 100), dict(chunk_size=50)), + random_test_options("f", (1.0, 2.0, 100), dict(chunk_size=50)), + random_test_options("gamma", (1.0, 2.0, 100), dict(chunk_size=50)), + random_test_options("geometric", (1.0, 100), dict(chunk_size=50)), + random_test_options("gumbel", (0.5, 1.0, 100), dict(chunk_size=50)), + random_test_options("hypergeometric", (10, 20, 15, 100), dict(chunk_size=50)), + random_test_options("laplace", (0.5, 1.0, 100), dict(chunk_size=50)), + random_test_options("logistic", (0.5, 1.0, 100), dict(chunk_size=50)), + random_test_options("lognormal", (0.5, 1.0, 100), dict(chunk_size=50)), + random_test_options("logseries", (0.5, 100), dict(chunk_size=50)), + random_test_options("multinomial", (10, [0.2, 0.5, 0.3], 100), dict(chunk_size=50)), + random_test_options( + "multivariate_normal", ([1, 2], [[1, 0], [0, 1]], 100), dict(chunk_size=50) + ), + random_test_options("negative_binomial", (5, 1.0, 100), dict(chunk_size=50)), + random_test_options("noncentral_chisquare", (0.5, 1.0, 100), dict(chunk_size=50)), + random_test_options("noncentral_f", (1.5, 1.0, 1.1, 100), dict(chunk_size=50)), + random_test_options("pareto", (1.0, 100), dict(chunk_size=50)), + random_test_options("poisson", (1.0, 100), dict(chunk_size=50)), + random_test_options("power", (1.0, 100), dict(chunk_size=50)), + random_test_options("rayleigh", (1.0, 100), dict(chunk_size=50)), + random_test_options("standard_cauchy", (100,), dict(chunk_size=50)), + random_test_options("standard_exponential", (100,), dict(chunk_size=50)), + random_test_options("standard_gamma", (1.0, 100), dict(chunk_size=50)), + random_test_options("standard_normal", (100,), dict(chunk_size=50)), + random_test_options("standard_t", (1.0, 100), dict(chunk_size=50)), + random_test_options("triangular", (0.1, 0.2, 0.3, 100), dict(chunk_size=50)), + random_test_options("uniform", (0.1, 0.2, 100), dict(chunk_size=50)), + random_test_options("vonmises", (0.1, 0.2, 100), dict(chunk_size=50)), + random_test_options("wald", (0.1, 0.2, 100), dict(chunk_size=50)), + random_test_options("weibull", (0.1, 100), dict(chunk_size=50)), + random_test_options("zipf", (1.1, 100), dict(chunk_size=50)), +] + + +@pytest.mark.parametrize("test_opts", random_params) +def test_random_execute(setup, test_opts): + rs = tensor.random.RandomState(0) + arr1 = getattr(rs, test_opts.func_name)(*test_opts.args, **test_opts.kwargs) + rs = tensor.random.RandomState(0) + arr2 = getattr(rs, test_opts.func_name)(*test_opts.args, **test_opts.kwargs) + assert np.array_equal(arr1.execute().fetch(), arr2.execute().fetch()) + + +def test_permutation_execute(setup): + rs = tensor.random.RandomState(0) + x = rs.permutation(10) + res = x.execute().fetch() + assert not np.all(res[:-1] < res[1:]) + np.testing.assert_array_equal(np.sort(res), np.arange(10)) + + arr = from_ndarray([1, 4, 9, 12, 15], chunk_size=2) + x = rs.permutation(arr) + res = x.execute().fetch() + assert not np.all(res[:-1] < res[1:]) + np.testing.assert_array_equal(np.sort(res), np.asarray([1, 4, 9, 12, 15])) + 
+ arr = from_ndarray(np.arange(48).reshape(12, 4), chunk_size=2) + # axis = 0 + x = rs.permutation(arr) + res = x.execute().fetch() + assert not np.all(res[:-1] < res[1:]) + np.testing.assert_array_equal(np.sort(res, axis=0), np.arange(48).reshape(12, 4)) + # axis != 0 + x2 = rs.permutation(arr, axis=1) + res = x2.execute().fetch() + assert not np.all(res[:, :-1] < res[:, 1:]) + np.testing.assert_array_equal(np.sort(res, axis=1), np.arange(48).reshape(12, 4)) diff --git a/python/xorbits/_mars/tensor/random/triangular.py b/python/xorbits/_mars/tensor/random/triangular.py new file mode 100644 index 000000000..a923495e6 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/triangular.py @@ -0,0 +1,117 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorTriangular(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["left", "mode", "right"] + _op_type_ = OperandDef.RAND_TRIANGULAR + + _fields_ = "left", "mode", "right", "size" + left = AnyField("left") + mode = AnyField("mode") + right = AnyField("right") + _func_name = "triangular" + + def __call__(self, left, mode, right, chunk_size=None): + return self.new_tensor([left, mode, right], None, raw_chunk_size=chunk_size) + + +def triangular( + random_state, left, mode, right, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from the triangular distribution over the + interval ``[left, right]``. + + The triangular distribution is a continuous probability + distribution with lower limit left, peak at mode, and upper + limit right. Unlike the other distributions, these parameters + directly define the shape of the pdf. + + Parameters + ---------- + left : float or array_like of floats + Lower limit. + mode : float or array_like of floats + The value where the peak of the distribution occurs. + The value should fulfill the condition ``left <= mode <= right``. + right : float or array_like of floats + Upper limit, should be larger than `left`. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``left``, ``mode``, and ``right`` + are all scalars. Otherwise, ``mt.broadcast(left, mode, right).size`` + samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized triangular distribution. + + Notes + ----- + The probability density function for the triangular distribution is + + .. 
math:: P(x;l, m, r) = \begin{cases} + \frac{2(x-l)}{(r-l)(m-l)}& \text{for $l \leq x \leq m$},\\ + \frac{2(r-x)}{(r-l)(r-m)}& \text{for $m \leq x \leq r$},\\ + 0& \text{otherwise}. + \end{cases} + + The triangular distribution is often used in ill-defined + problems where the underlying distribution is not known, but + some knowledge of the limits and mode exists. Often it is used + in simulations. + + References + ---------- + .. [1] Wikipedia, "Triangular distribution" + http://en.wikipedia.org/wiki/Triangular_distribution + + Examples + -------- + Draw values from the distribution and plot the histogram: + + >>> import matplotlib.pyplot as plt + >>> import mars.tensor as mt + >>> h = plt.hist(mt.random.triangular(-3, 0, 8, 100000).execute(), bins=200, + ... normed=True) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .triangular( + handle_array(left), handle_array(mode), handle_array(right), size=(0,) + ) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorTriangular(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(left, mode, right, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/uniform.py b/python/xorbits/_mars/tensor/random/uniform.py new file mode 100644 index 000000000..ac7225bcf --- /dev/null +++ b/python/xorbits/_mars/tensor/random/uniform.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorUniform(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["low", "high"] + _op_type_ = OperandDef.RAND_UNIFORM + + _fields_ = "low", "high", "size" + low = AnyField("low") + high = AnyField("high") + _func_name = "uniform" + + def __call__(self, low, high, chunk_size=None): + return self.new_tensor([low, high], None, raw_chunk_size=chunk_size) + + +def uniform( + random_state, low=0.0, high=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a uniform distribution. + + Samples are uniformly distributed over the half-open interval + ``[low, high)`` (includes low, but excludes high). In other words, + any value within the given interval is equally likely to be drawn + by `uniform`. + + Parameters + ---------- + low : float or array_like of floats, optional + Lower boundary of the output interval. All values generated will be + greater than or equal to low. The default value is 0. + high : float or array_like of floats + Upper boundary of the output interval. All values generated will be + less than high. The default value is 1.0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. 
If size is ``None`` (default), + a single value is returned if ``low`` and ``high`` are both scalars. + Otherwise, ``mt.broadcast(low, high).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized uniform distribution. + + See Also + -------- + randint : Discrete uniform distribution, yielding integers. + random_integers : Discrete uniform distribution over the closed + interval ``[low, high]``. + random_sample : Floats uniformly distributed over ``[0, 1)``. + random : Alias for `random_sample`. + rand : Convenience function that accepts dimensions as input, e.g., + ``rand(2,2)`` would generate a 2-by-2 array of floats, + uniformly distributed over ``[0, 1)``. + + Notes + ----- + The probability density function of the uniform distribution is + + .. math:: p(x) = \frac{1}{b - a} + + anywhere within the interval ``[a, b)``, and zero elsewhere. + + When ``high`` == ``low``, values of ``low`` will be returned. + If ``high`` < ``low``, the results are officially undefined + and may eventually raise an error, i.e. do not rely on this + function to behave when passed arguments satisfying that + inequality condition. + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> s = mt.random.uniform(-1,0,1000) + + All values are within the given interval: + + >>> mt.all(s >= -1).execute() + True + >>> mt.all(s < 0).execute() + True + + Display the histogram of the samples, along with the + probability density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), 15, normed=True) + >>> plt.plot(bins, mt.ones_like(bins).execute(), linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .uniform(handle_array(low), handle_array(high), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorUniform(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(low, high, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/vonmises.py b/python/xorbits/_mars/tensor/random/vonmises.py new file mode 100644 index 000000000..083ac29b1 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/vonmises.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorVonmises(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["mu", "kappa"] + _op_type_ = OperandDef.RAND_VONMISES + + _fields_ = "mu", "kappa", "size" + mu = AnyField("mu") + kappa = AnyField("kappa") + _func_name = "vonmises" + + def __call__(self, mu, kappa, chunk_size=None): + return self.new_tensor([mu, kappa], None, raw_chunk_size=chunk_size) + + +def vonmises(random_state, mu, kappa, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a von Mises distribution. + + Samples are drawn from a von Mises distribution with specified mode + (mu) and dispersion (kappa), on the interval [-pi, pi]. + + The von Mises distribution (also known as the circular normal + distribution) is a continuous probability distribution on the unit + circle. It may be thought of as the circular analogue of the normal + distribution. + + Parameters + ---------- + mu : float or array_like of floats + Mode ("center") of the distribution. + kappa : float or array_like of floats + Dispersion of the distribution, has to be >=0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``mu`` and ``kappa`` are both scalars. + Otherwise, ``np.broadcast(mu, kappa).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized von Mises distribution. + + See Also + -------- + scipy.stats.vonmises : probability density function, distribution, or + cumulative density function, etc. + + Notes + ----- + The probability density for the von Mises distribution is + + .. math:: p(x) = \frac{e^{\kappa cos(x-\mu)}}{2\pi I_0(\kappa)}, + + where :math:`\mu` is the mode and :math:`\kappa` the dispersion, + and :math:`I_0(\kappa)` is the modified Bessel function of order 0. + + The von Mises is named for Richard Edler von Mises, who was born in + Austria-Hungary, in what is now the Ukraine. He fled to the United + States in 1939 and became a professor at Harvard. He worked in + probability theory, aerodynamics, fluid mechanics, and philosophy of + science. + + References + ---------- + .. [1] Abramowitz, M. and Stegun, I. A. (Eds.). "Handbook of + Mathematical Functions with Formulas, Graphs, and Mathematical + Tables, 9th printing," New York: Dover, 1972. + .. [2] von Mises, R., "Mathematical Theory of Probability + and Statistics", New York: Academic Press, 1964. 
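# [Editor's note -- illustrative sketch, not part of the patch; NumPy only.]
# Von Mises draws stay inside [-pi, pi] and concentrate around mu as kappa
# (the dispersion) grows, as described above:
import numpy as np

rng = np.random.RandomState(1)
for kappa in (0.5, 4.0, 50.0):
    s = rng.vonmises(0.0, kappa, 100_000)
    print(kappa, s.min() >= -np.pi, s.max() <= np.pi, s.std())
# The min/max checks hold for every kappa; the spread shrinks as kappa grows.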
+ + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> mu, kappa = 0.0, 4.0 # mean and dispersion + >>> s = mt.random.vonmises(mu, kappa, 1000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> from scipy.special import i0 + >>> plt.hist(s.execute(), 50, normed=True) + >>> x = mt.linspace(-mt.pi, mt.pi, num=51) + >>> y = mt.exp(kappa*mt.cos(x-mu))/(2*mt.pi*i0(kappa)) + >>> plt.plot(x.execute(), y.execute(), linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .vonmises(handle_array(mu), handle_array(kappa), size=(0,)) + .dtype + ) + + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorVonmises(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(mu, kappa, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/wald.py b/python/xorbits/_mars/tensor/random/wald.py new file mode 100644 index 000000000..08e195d12 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/wald.py @@ -0,0 +1,112 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorWald(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["mean", "scale"] + _op_type_ = OperandDef.RAND_WALD + + _fields_ = "mean", "scale", "size" + mean = AnyField("mean") + scale = AnyField("scale") + _func_name = "wald" + + def __call__(self, mean, scale, chunk_size=None): + return self.new_tensor([mean, scale], None, raw_chunk_size=chunk_size) + + +def wald(random_state, mean, scale, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Wald, or inverse Gaussian, distribution. + + As the scale approaches infinity, the distribution becomes more like a + Gaussian. Some references claim that the Wald is an inverse Gaussian + with mean equal to 1, but this is by no means universal. + + The inverse Gaussian distribution was first studied in relationship to + Brownian motion. In 1956 M.C.K. Tweedie used the name inverse Gaussian + because there is an inverse relationship between the time to cover a + unit distance and distance covered in unit time. + + Parameters + ---------- + mean : float or array_like of floats + Distribution mean, should be > 0. + scale : float or array_like of floats + Scale parameter, should be >= 0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``mean`` and ``scale`` are both scalars. + Otherwise, ``np.broadcast(mean, scale).size`` samples are drawn. 
+ chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Wald distribution. + + Notes + ----- + The probability density function for the Wald distribution is + + .. math:: P(x;mean,scale) = \sqrt{\frac{scale}{2\pi x^3}}e^ + \frac{-scale(x-mean)^2}{2\cdotp mean^2x} + + As noted above the inverse Gaussian distribution first arise + from attempts to model Brownian motion. It is also a + competitor to the Weibull for use in reliability modeling and + modeling stock returns and interest rate processes. + + References + ---------- + .. [1] Brighton Webs Ltd., Wald Distribution, + http://www.brighton-webs.co.uk/distributions/wald.asp + .. [2] Chhikara, Raj S., and Folks, J. Leroy, "The Inverse Gaussian + Distribution: Theory : Methodology, and Applications", CRC Press, + 1988. + .. [3] Wikipedia, "Wald distribution" + http://en.wikipedia.org/wiki/Wald_distribution + + Examples + -------- + Draw values from the distribution and plot the histogram: + + >>> import matplotlib.pyplot as plt + >>> import mars.tensor as mt + >>> h = plt.hist(mt.random.wald(3, 2, 100000).execute(), bins=200, normed=True) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .wald(handle_array(mean), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorWald(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(mean, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/weibull.py b/python/xorbits/_mars/tensor/random/weibull.py new file mode 100644 index 000000000..2dc93c895 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/weibull.py @@ -0,0 +1,138 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorWeibull(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["a"] + _op_type_ = OperandDef.RAND_WEIBULL + + _fields_ = "a", "size" + a = AnyField("a") + _func_name = "weibull" + + def __call__(self, a, chunk_size=None): + return self.new_tensor([a], None, raw_chunk_size=chunk_size) + + +def weibull(random_state, a, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Weibull distribution. + + Draw samples from a 1-parameter Weibull distribution with the given + shape parameter `a`. + + .. math:: X = (-ln(U))^{1/a} + + Here, U is drawn from the uniform distribution over (0,1]. 
+ + The more common 2-parameter Weibull, including a scale parameter + :math:`\lambda` is just :math:`X = \lambda(-ln(U))^{1/a}`. + + Parameters + ---------- + a : float or array_like of floats + Shape of the distribution. Should be greater than zero. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``a`` is a scalar. Otherwise, + ``mt.array(a).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Weibull distribution. + + See Also + -------- + scipy.stats.weibull_max + scipy.stats.weibull_min + scipy.stats.genextreme + gumbel + + Notes + ----- + The Weibull (or Type III asymptotic extreme value distribution + for smallest values, SEV Type III, or Rosin-Rammler + distribution) is one of a class of Generalized Extreme Value + (GEV) distributions used in modeling extreme value problems. + This class includes the Gumbel and Frechet distributions. + + The probability density for the Weibull distribution is + + .. math:: p(x) = \frac{a} + {\lambda}(\frac{x}{\lambda})^{a-1}e^{-(x/\lambda)^a}, + + where :math:`a` is the shape and :math:`\lambda` the scale. + + The function has its peak (the mode) at + :math:`\lambda(\frac{a-1}{a})^{1/a}`. + + When ``a = 1``, the Weibull distribution reduces to the exponential + distribution. + + References + ---------- + .. [1] Waloddi Weibull, Royal Technical University, Stockholm, + 1939 "A Statistical Theory Of The Strength Of Materials", + Ingeniorsvetenskapsakademiens Handlingar Nr 151, 1939, + Generalstabens Litografiska Anstalts Forlag, Stockholm. + .. [2] Waloddi Weibull, "A Statistical Distribution Function of + Wide Applicability", Journal Of Applied Mechanics ASME Paper + 1951. + .. [3] Wikipedia, "Weibull distribution", + http://en.wikipedia.org/wiki/Weibull_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> a = 5. # shape + >>> s = mt.random.weibull(a, 1000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> x = mt.arange(1,100.)/50. + >>> def weib(x,n,a): + ... return (a / n) * (x / n)**(a - 1) * mt.exp(-(x / n)**a) + + >>> count, bins, ignored = plt.hist(mt.random.weibull(5.,1000).execute()) + >>> x = mt.arange(1,100.)/50. + >>> scale = count.max()/weib(x, 1., 5.).max() + >>> plt.plot(x.execute(), (weib(x, 1., 5.)*scale).execute()) + >>> plt.show() + """ + if dtype is None: + dtype = np.random.RandomState().weibull(handle_array(a), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorWeibull(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(a, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/zipf.py b/python/xorbits/_mars/tensor/random/zipf.py new file mode 100644 index 000000000..cef646748 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/zipf.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorZipf(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["a"] + _op_type_ = OperandDef.RAND_ZIPF + + _fields_ = "a", "size" + a = AnyField("a") + _func_name = "zipf" + + def __call__(self, a, chunk_size=None): + return self.new_tensor([a], None, raw_chunk_size=chunk_size) + + +def zipf(random_state, a, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Zipf distribution. + + Samples are drawn from a Zipf distribution with specified parameter + `a` > 1. + + The Zipf distribution (also known as the zeta distribution) is a + continuous probability distribution that satisfies Zipf's law: the + frequency of an item is inversely proportional to its rank in a + frequency table. + + Parameters + ---------- + a : float or array_like of floats + Distribution parameter. Should be greater than 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``a`` is a scalar. Otherwise, + ``mt.array(a).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Zipf distribution. + + See Also + -------- + scipy.stats.zipf : probability density function, distribution, or + cumulative density function, etc. + + Notes + ----- + The probability density for the Zipf distribution is + + .. math:: p(x) = \frac{x^{-a}}{\zeta(a)}, + + where :math:`\zeta` is the Riemann Zeta function. + + It is named for the American linguist George Kingsley Zipf, who noted + that the frequency of any word in a sample of a language is inversely + proportional to its rank in the frequency table. + + References + ---------- + .. [1] Zipf, G. K., "Selected Studies of the Principle of Relative + Frequency in Language," Cambridge, MA: Harvard Univ. Press, + 1932. + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> a = 2. # parameter + >>> s = mt.random.zipf(a, 1000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> from scipy import special + + Truncate s values at 50 so plot is interesting: + + >>> count, bins, ignored = plt.hist(s[s<50].execute(), 50, normed=True) + >>> x = mt.arange(1., 50.) 
+ >>> y = x**(-a) / special.zetac(a) + >>> plt.plot(x.execute(), (y/mt.max(y)).execute(), linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = np.random.RandomState().zipf(handle_array(a), size=(0,)).dtype + + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorZipf(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(a, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/rechunk/__init__.py b/python/xorbits/_mars/tensor/rechunk/__init__.py new file mode 100644 index 000000000..1f747bf42 --- /dev/null +++ b/python/xorbits/_mars/tensor/rechunk/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .rechunk import rechunk + + +def _install(): + from ..core import Tensor, TensorData + + setattr(Tensor, "rechunk", rechunk) + setattr(TensorData, "rechunk", rechunk) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/rechunk/core.py b/python/xorbits/_mars/tensor/rechunk/core.py new file mode 100644 index 000000000..cb21458f7 --- /dev/null +++ b/python/xorbits/_mars/tensor/rechunk/core.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from dataclasses import dataclass +from typing import List, Tuple, Union + +import numpy as np + +from ...typing import ChunkType, TileableType +from ..utils import decide_chunk_sizes + +chunk_size_type = Union[int, Tuple[int], Tuple[Tuple[int], ...]] + + +def get_nsplits( + tileable: TileableType, new_chunk_size: chunk_size_type, itemsize: int +) -> Tuple[Tuple[int], ...]: + if isinstance(new_chunk_size, dict): + chunk_size = list(tileable.nsplits) + for idx, c in new_chunk_size.items(): + chunk_size[idx] = c + else: + chunk_size = new_chunk_size + + return decide_chunk_sizes(tileable.shape, chunk_size, itemsize) + + +@dataclass +class RechunkInfo: + out_index: Tuple[int] + shape: Tuple[int] + input_chunks: List[ChunkType] + input_slices: List[Tuple[slice]] + input_chunk_shape: List[int] + + +def gen_rechunk_infos( + inp: TileableType, chunk_size: Tuple[Tuple[int], ...] 
+) -> List[RechunkInfo]: + cum_in_nsplits = [np.cumsum(ns) for ns in inp.nsplits] + cum_out_nsplits = [np.cumsum(ns) for ns in chunk_size] + out_starts = [[0] + cum_ns[:-1].tolist() for cum_ns in cum_out_nsplits] + out_ends = cum_out_nsplits + out_start_indexes = [ + np.searchsorted(cum_ns, starts) + for cum_ns, starts in zip(cum_in_nsplits, out_starts) + ] + out_end_indexes = [ + np.searchsorted(cum_ns, ends) for cum_ns, ends in zip(cum_in_nsplits, out_ends) + ] + + chunk_index_iter = itertools.product(*(range(len(s)) for s in chunk_size)) + rechunk_infos = [] + for chunk_index in chunk_index_iter: + shape = tuple(chunk_size[dim][i] for dim, i in enumerate(chunk_index)) + inp_chunk_slices = [list() for _ in range(len(chunk_index))] + inp_chunk_indexes = [list() for _ in range(len(chunk_index))] + for dim, i in enumerate(chunk_index): + size_start = out_starts[dim][i] + size_end = out_ends[dim][i] + start_index = out_start_indexes[dim][i] + end_index = out_end_indexes[dim][i] + for inp_i in range(start_index, end_index + 1): + inp_start = cum_in_nsplits[dim][inp_i - 1] if inp_i > 0 else 0 + inp_end = cum_in_nsplits[dim][inp_i] + slice_start = max(inp_start, size_start) - inp_start + slice_end = min(inp_end, size_end) - inp_start + if slice_start == 0 and slice_end == inp_end - inp_start: + # slice all + slc = slice(None) + elif slice_start == slice_end and size_start != size_end: + continue + else: + slc = slice(slice_start, slice_end) + inp_chunk_slices[dim].append(slc) + inp_chunk_indexes[dim].append(inp_i) + + inp_chunks = [] + inp_slices = [] + rechunk_info = RechunkInfo( + out_index=chunk_index, + shape=shape, + input_chunks=inp_chunks, + input_slices=inp_slices, + input_chunk_shape=list(len(s) for s in inp_chunk_indexes), + ) + for inp_chunk_index, inp_chunk_slice in zip( + itertools.product(*inp_chunk_indexes), + itertools.product(*inp_chunk_slices), + ): + inp_chunk = inp.cix[tuple(inp_chunk_index)] + inp_chunks.append(inp_chunk) + inp_slices.append(inp_chunk_slice) + rechunk_infos.append(rechunk_info) + + return rechunk_infos diff --git a/python/xorbits/_mars/tensor/rechunk/rechunk.py b/python/xorbits/_mars/tensor/rechunk/rechunk.py new file mode 100644 index 000000000..ab25afdd8 --- /dev/null +++ b/python/xorbits/_mars/tensor/rechunk/rechunk.py @@ -0,0 +1,115 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ...utils import has_unknown_shape +from ..core import Tensor +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import calc_sliced_size +from .core import chunk_size_type, gen_rechunk_infos, get_nsplits + + +class TensorRechunk(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.RECHUNK + + chunk_size = AnyField("chunk_size") + + def __call__(self, tensor: Tensor): + return self.new_tensor([tensor], tensor.shape, order=tensor.order) + + @classmethod + def tile(cls, op: "TensorRechunk"): + from ..indexing.slice import TensorSlice + from ..merge.concatenate import TensorConcatenate + + if has_unknown_shape(*op.inputs): + yield + + out = op.outputs[0] + tensor = astensor(op.inputs[0]) + chunk_size = get_nsplits(tensor, op.chunk_size, tensor.dtype.itemsize) + if chunk_size == tensor.nsplits: + return [tensor] + + rechunk_infos = gen_rechunk_infos(tensor, chunk_size) + out_chunks = [] + for rechunk_info in rechunk_infos: + chunk_index = rechunk_info.out_index + shape = rechunk_info.shape + inp_chunks = rechunk_info.input_chunks + inp_chunk_slices = rechunk_info.input_slices + inp_slice_chunks = [] + for inp_chunk, inp_chunk_slice in zip(inp_chunks, inp_chunk_slices): + if all(slc == slice(None) for slc in inp_chunk_slice): + inp_slice_chunks.append(inp_chunk) + else: + slc_chunk = TensorSlice(slices=list(inp_chunk_slice)).new_chunk( + [inp_chunk], + dtype=inp_chunk.dtype, + shape=tuple( + calc_sliced_size(s, slc) + for s, slc in zip(inp_chunk.shape, inp_chunk_slice) + ), + index=inp_chunk.index, + ) + inp_slice_chunks.append(slc_chunk) + + if len(inp_slice_chunks) > 1 or inp_slice_chunks[0].index != chunk_index: + chunk_op = TensorConcatenate() + out_chunk = chunk_op.new_chunk( + inp_slice_chunks, + shape=shape, + index=chunk_index, + dtype=out.dtype, + order=out.order, + ) + out_chunks.append(out_chunk) + else: + out_chunks.append(inp_slice_chunks[0]) + + new_op = op.copy() + params = out.params + params["nsplits"] = chunk_size + params["chunks"] = out_chunks + tensor = new_op.new_tileable(op.inputs, kws=[params]) + + if op.reassign_worker: + for c in tensor.chunks: + c.op.reassign_worker = True + + return [tensor] + + +def rechunk( + tensor: Tensor, chunk_size: chunk_size_type, reassign_worker=False +) -> Tensor: + if not any(np.isnan(s) for s in tensor.shape) and not tensor.is_coarse(): + if not has_unknown_shape(tensor): + # do client check only when tensor has no unknown shape, + # otherwise, recalculate chunk_size in `tile` + chunk_size = get_nsplits(tensor, chunk_size, tensor.dtype.itemsize) + if chunk_size == tensor.nsplits: + return tensor + + op = TensorRechunk( + chunk_size=chunk_size, + reassign_worker=reassign_worker, + dtype=tensor.dtype, + sparse=tensor.issparse(), + ) + return op(tensor) diff --git a/python/xorbits/_mars/tensor/rechunk/tests/__init__.py b/python/xorbits/_mars/tensor/rechunk/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/rechunk/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/rechunk/tests/test_rechunk.py b/python/xorbits/_mars/tensor/rechunk/tests/test_rechunk.py new file mode 100644 index 000000000..88c985267 --- /dev/null +++ b/python/xorbits/_mars/tensor/rechunk/tests/test_rechunk.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from .... import tensor as mt + +# dense +raw = np.random.RandomState(0).rand(12, 9) +raw2 = raw.copy() +raw2.ravel()[::2] = 0 +# dense, F-order +raw3 = np.asfortranarray(raw) +# sparse +raw_s = sps.csr_matrix(raw2) + + +@pytest.mark.parametrize("data", [raw, raw3, raw_s]) +@pytest.mark.parametrize("chunk_size", [3, (12, 9), (4, 8)]) +def test_rechunk_execute(setup, data, chunk_size): + tensor = mt.tensor(data, chunk_size=4) + new_tensor = tensor.rechunk(chunk_size) + result = new_tensor.execute().fetch() + if hasattr(result, "toarray"): + # sparse + result = result.toarray() + data = data.toarray() + assert result.flags["C_CONTIGUOUS"] == data.flags["C_CONTIGUOUS"] + np.testing.assert_allclose(result, data) diff --git a/python/xorbits/_mars/tensor/reduction/__init__.py b/python/xorbits/_mars/tensor/reduction/__init__.py new file mode 100644 index 000000000..9dd96d7c7 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/__init__.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
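(Editorial note, not part of the diff: a minimal usage sketch for the rechunk operand added above, mirroring the test case; it assumes a default local session is created implicitly on execute(), as in the docstring examples elsewhere in this diff.)

import numpy as np
from xorbits._mars import tensor as mt

data = np.random.RandomState(0).rand(12, 9)
t = mt.tensor(data, chunk_size=4)     # initially split into 4x4 chunks
r = t.rechunk((6, 9))                 # re-partition into two 6x9 chunks
np.testing.assert_allclose(r.execute().fetch(), data)  # same values, new chunking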
+ +from .all import TensorAll, all +from .allclose import allclose +from .any import TensorAny, any +from .argmax import TensorArgmax, argmax +from .argmin import TensorArgmin, argmin +from .array_equal import array_equal +from .count_nonzero import TensorCountNonzero, count_nonzero +from .cumprod import TensorCumprod, cumprod +from .cumsum import TensorCumsum, cumsum +from .max import TensorMax, max +from .mean import TensorMean, mean +from .min import TensorMin, min +from .nanargmax import TensorNanArgmax, nanargmax +from .nanargmin import TensorNanArgmin, nanargmin +from .nancumprod import TensorNanCumprod, nancumprod +from .nancumsum import TensorNanCumsum, nancumsum +from .nanmax import TensorNanMax, nanmax +from .nanmean import TensorNanMean, nanmean +from .nanmin import TensorNanMin, nanmin +from .nanprod import TensorNanProd, nanprod +from .nanstd import nanstd +from .nansum import TensorNanSum, nansum +from .nanvar import TensorNanMoment, TensorNanVar, nanvar +from .prod import TensorProd, prod +from .std import std +from .sum import TensorSum, sum +from .var import TensorMoment, TensorVar, var + + +def _install(): + from ..core import Tensor, TensorData + + for cls in (Tensor, TensorData): + setattr(cls, "sum", sum) + setattr(cls, "prod", prod) + setattr(cls, "max", max) + setattr(cls, "min", min) + setattr(cls, "all", all) + setattr(cls, "any", any) + setattr(cls, "mean", mean) + setattr(cls, "argmax", argmax) + setattr(cls, "argmin", argmin) + setattr(cls, "cumsum", cumsum) + setattr(cls, "cumprod", cumprod) + setattr(cls, "var", var) + setattr(cls, "std", std) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/reduction/all.py b/python/xorbits/_mars/tensor/reduction/all.py new file mode 100644 index 000000000..1005b0c63 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/all.py @@ -0,0 +1,113 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorAll(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.ALL + _func_name = "all" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def all(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Test whether all array elements along a given axis evaluate to True. + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to a tensor. + axis : None or int or tuple of ints, optional + Axis or axes along which a logical AND reduction is performed. + The default (`axis` = `None`) is to perform a logical AND over all + the dimensions of the input array. `axis` may be negative, in + which case it counts from the last to the first axis. 
+ + If this is a tuple of ints, a reduction is performed on multiple + axes, instead of a single axis or all the axes as before. + out : Tensor, optional + Alternate output tensor in which to place the result. + It must have the same shape as the expected output and its + type is preserved (e.g., if ``dtype(out)`` is float, the result + will consist of 0.0's and 1.0's). See `doc.ufuncs` (Section + "Output arguments") for more details. + + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `all` method of sub-classes of + `ndarray`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + all : Tensor, bool + A new boolean or tensor is returned unless `out` is specified, + in which case a reference to `out` is returned. + + See Also + -------- + Tensor.all : equivalent method + + any : Test whether any element along a given axis evaluates to True. + + Notes + ----- + Not a Number (NaN), positive infinity and negative infinity + evaluate to `True` because these are not equal to zero. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.all([[True,False],[True,True]]).execute() + False + + >>> mt.all([[True,False],[True,True]], axis=0).execute() + array([ True, False]) + + >>> mt.all([-1, 4, 5]).execute() + True + + >>> mt.all([1.0, mt.nan]).execute() + True + + """ + a = astensor(a) + if a.dtype == object: + dtype = a.dtype + else: + dtype = np.dtype(bool) + op = TensorAll(axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/allclose.py b/python/xorbits/_mars/tensor/reduction/allclose.py new file mode 100644 index 000000000..c8f1db1ff --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/allclose.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): + """ + Returns True if two tensors are element-wise equal within a tolerance. + + The tolerance values are positive, typically very small numbers. The + relative difference (`rtol` * abs(`b`)) and the absolute difference + `atol` are added together to compare against the absolute difference + between `a` and `b`. + + If either array contains one or more NaNs, False is returned. + Infs are treated as equal if they are in the same place and of the same + sign in both tensors. + + Parameters + ---------- + a, b : array_like + Input tensors to compare. + rtol : float + The relative tolerance parameter (see Notes). + atol : float + The absolute tolerance parameter (see Notes). 
+ equal_nan : bool + Whether to compare NaN's as equal. If True, NaN's in `a` will be + considered equal to NaN's in `b` in the output tensor. + + Returns + ------- + allclose : bool + Returns True if the two tensors are equal within the given + tolerance; False otherwise. + + See Also + -------- + isclose, all, any, equal + + Notes + ----- + If the following equation is element-wise True, then allclose returns + True. + + absolute(`a` - `b`) <= (`atol` + `rtol` * absolute(`b`)) + + The above equation is not symmetric in `a` and `b`, so that + ``allclose(a, b)`` might be different from ``allclose(b, a)`` in + some rare cases. + + The comparison of `a` and `b` uses standard broadcasting, which + means that `a` and `b` need not have the same shape in order for + ``allclose(a, b)`` to evaluate to True. The same is true for + `equal` but not `array_equal`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.allclose([1e10,1e-7], [1.00001e10,1e-8]).execute() + False + >>> mt.allclose([1e10,1e-8], [1.00001e10,1e-9]).execute() + True + >>> mt.allclose([1e10,1e-8], [1.0001e10,1e-9]).execute() + False + >>> mt.allclose([1.0, mt.nan], [1.0, mt.nan]).execute() + False + >>> mt.allclose([1.0, mt.nan], [1.0, mt.nan], equal_nan=True).execute() + True + + """ + from ..arithmetic.isclose import isclose + from .all import all + + return all(isclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan)) diff --git a/python/xorbits/_mars/tensor/reduction/any.py b/python/xorbits/_mars/tensor/reduction/any.py new file mode 100644 index 000000000..658db4627 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/any.py @@ -0,0 +1,115 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorAny(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.ANY + _func_name = "any" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def any(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Test whether any tensor element along a given axis evaluates to True. + + Returns single boolean unless `axis` is not ``None`` + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to an array. + axis : None or int or tuple of ints, optional + Axis or axes along which a logical OR reduction is performed. + The default (`axis` = `None`) is to perform a logical OR over all + the dimensions of the input array. `axis` may be negative, in + which case it counts from the last to the first axis. + + If this is a tuple of ints, a reduction is performed on multiple + axes, instead of a single axis or all the axes as before. 
+ out : Tensor, optional + Alternate output tensor in which to place the result. It must have + the same shape as the expected output and its type is preserved + (e.g., if it is of type float, then it will remain so, returning + 1.0 for True and 0.0 for False, regardless of the type of `a`). + See `doc.ufuncs` (Section "Output arguments") for details. + + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `any` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + any : bool or Tensor + A new boolean or `Tensor` is returned unless `out` is specified, + in which case a reference to `out` is returned. + + See Also + -------- + Tensor.any : equivalent method + + all : Test whether all elements along a given axis evaluate to True. + + Notes + ----- + Not a Number (NaN), positive infinity and negative infinity evaluate + to `True` because these are not equal to zero. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.any([[True, False], [True, True]]).execute() + True + + >>> mt.any([[True, False], [False, False]], axis=0).execute() + array([ True, False]) + + >>> mt.any([-1, 0, 5]).execute() + True + + >>> mt.any(mt.nan).execute() + True + + """ + a = astensor(a) + if a.dtype == object: + dtype = a.dtype + else: + dtype = np.dtype(bool) + op = TensorAny(axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/argmax.py b/python/xorbits/_mars/tensor/reduction/argmax.py new file mode 100644 index 000000000..d01a45cb1 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/argmax.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField, TupleField +from .core import TensorArgReductionMixin, TensorReduction + + +class TensorArgmax(TensorReduction, TensorArgReductionMixin): + _op_type_ = OperandDef.ARGMAX + _func_name = "argmax" + _agg_func_name = "max" + + _offset = AnyField("offset") + _total_shape = TupleField("total_shape") + + def __init__( + self, + axis=None, + dtype=None, + combine_size=None, + offset=None, + total_shape=None, + stage=None, + **kw + ): + if dtype is None: + dtype = np.dtype(int) + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _combine_size=combine_size, + _offset=offset, + _total_shape=total_shape, + dtype=dtype, + stage=stage, + **kw + ) + + @property + def offset(self): + return getattr(self, "_offset", None) + + @property + def total_shape(self): + return getattr(self, "_total_shape", None) + + +def argmax(a, axis=None, out=None, combine_size=None): + """ + Returns the indices of the maximum values along an axis. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + By default, the index is into the flattened tensor, otherwise + along the specified axis. + out : Tensor, optional + If provided, the result will be inserted into this tensor. It should + be of the appropriate shape and dtype. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + index_array : Tensor of ints + Tensor of indices into the tensor. It has the same shape as `a.shape` + with the dimension along `axis` removed. + + See Also + -------- + Tensor.argmax, argmin + amax : The maximum value along a given axis. + unravel_index : Convert a flat index into an index tuple. + + Notes + ----- + In case of multiple occurrences of the maximum values, the indices + corresponding to the first occurrence are returned. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.arange(6).reshape(2,3) + >>> a.execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.argmax(a).execute() + 5 + >>> mt.argmax(a, axis=0).execute() + array([1, 1, 1]) + >>> mt.argmax(a, axis=1).execute() + array([2, 2]) + + Indexes of the maximal elements of a N-dimensional tensor: + + >>> ind = mt.unravel_index(mt.argmax(a, axis=None), a.shape) + >>> ind.execute() + (1, 2) + >>> a[ind].execute() # TODO(jisheng): accomplish when fancy index on tensor is supported + + >>> b = mt.arange(6) + >>> b[1] = 5 + >>> b.execute() + array([0, 5, 2, 3, 4, 5]) + >>> mt.argmax(b).execute() # Only the first occurrence is returned. + 1 + + """ + op = TensorArgmax(axis=axis, dtype=np.dtype(int), combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/argmin.py b/python/xorbits/_mars/tensor/reduction/argmin.py new file mode 100644 index 000000000..d5ae03129 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/argmin.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField, TupleField +from .core import TensorArgReductionMixin, TensorReduction + + +class TensorArgmin(TensorReduction, TensorArgReductionMixin): + _op_type_ = OperandDef.ARGMIN + _func_name = "argmin" + _agg_func_name = "min" + + _offset = AnyField("offset") + _total_shape = TupleField("total_shape") + + def __init__( + self, + axis=None, + dtype=None, + combine_size=None, + offset=None, + total_shape=None, + stage=None, + **kw + ): + if dtype is None: + dtype = np.dtype(int) + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _combine_size=combine_size, + _offset=offset, + _total_shape=total_shape, + dtype=dtype, + stage=stage, + **kw + ) + + @property + def offset(self): + return getattr(self, "_offset", None) + + @property + def total_shape(self): + return getattr(self, "_total_shape", None) + + +def argmin(a, axis=None, out=None, combine_size=None): + """ + Returns the indices of the minimum values along an axis. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + By default, the index is into the flattened tensor, otherwise + along the specified axis. + out : Tensor, optional + If provided, the result will be inserted into this tensor. It should + be of the appropriate shape and dtype. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + index_array : Tensor of ints + Tensor of indices into the tensor. It has the same shape as `a.shape` + with the dimension along `axis` removed. + + See Also + -------- + Tensor.argmin, argmax + amin : The minimum value along a given axis. + unravel_index : Convert a flat index into an index tuple. + + Notes + ----- + In case of multiple occurrences of the minimum values, the indices + corresponding to the first occurrence are returned. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.arange(6).reshape(2,3) + >>> a.execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.argmin(a).execute() + 0 + >>> mt.argmin(a, axis=0).execute() + array([0, 0, 0]) + >>> mt.argmin(a, axis=1).execute() + array([0, 0]) + + Indices of the minimum elements of a N-dimensional tensor: + + >>> ind = mt.unravel_index(mt.argmin(a, axis=None), a.shape) + >>> ind.execute() + (0, 0) + >>> a[ind] # TODO(jisheng): accomplish when fancy index on tensor is supported + + >>> b = mt.arange(6) + >>> b[4] = 0 + >>> b.execute() + array([0, 1, 2, 3, 0, 5]) + >>> mt.argmin(b).execute() # Only the first occurrence is returned. + 0 + + """ + op = TensorArgmin(axis=axis, dtype=np.dtype(int), combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/array_equal.py b/python/xorbits/_mars/tensor/reduction/array_equal.py new file mode 100644 index 000000000..0d8967619 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/array_equal.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
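(Editorial note, not part of the diff: a hedged usage sketch for the argmax/argmin reductions defined above; it assumes an implicit default session as in the docstring examples.)

import numpy as np
from xorbits._mars import tensor as mt

a = mt.tensor(np.array([[3, 7, 1], [9, 2, 5]]), chunk_size=2)
print(mt.argmax(a).execute())           # 3 -> flat index of the maximum value 9
print(mt.argmin(a, axis=1).execute())   # [2 1] -> per-row positions of the minima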
+ + +def array_equal(a1, a2): + """ + True if two tensors have the same shape and elements, False otherwise. + + Parameters + ---------- + a1, a2 : array_like + Input arrays. + + Returns + ------- + b : bool + Returns True if the tensors are equal. + + See Also + -------- + allclose: Returns True if two tensors are element-wise equal within a + tolerance. + array_equiv: Returns True if input tensors are shape consistent and all + elements equal. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.array_equal([1, 2], [1, 2]).execute() + True + >>> mt.array_equal(mt.array([1, 2]), mt.array([1, 2])).execute() + True + >>> mt.array_equal([1, 2], [1, 2, 3]).execute() + False + >>> mt.array_equal([1, 2], [1, 4]).execute() + False + + """ + from ..datasource import tensor as astensor + from ..datasource.scalar import scalar + from .all import all + + try: + a1, a2 = astensor(a1), astensor(a2) + except Exception: + return scalar(False) + + if a1.shape != a2.shape: + return scalar(False) + return all(astensor(a1 == a2)) diff --git a/python/xorbits/_mars/tensor/reduction/core.py b/python/xorbits/_mars/tensor/reduction/core.py new file mode 100644 index 000000000..54b5b0a9c --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/core.py @@ -0,0 +1,659 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
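(Editorial note, not part of the diff: array_equal above demands identical shape and exact element equality, while allclose tolerates small numeric differences; a sketch assuming an implicit default session.)

from xorbits._mars import tensor as mt

x = mt.tensor([1.0, 2.0, 3.0])
y = x + 1e-9
print(mt.array_equal(x, y).execute())   # False: values differ, if only slightly
print(mt.allclose(x, y).execute())      # True: within the default rtol/atol
print(mt.array_equal(x, mt.tensor([1.0, 2.0])).execute())  # False: shapes differ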
+ +import builtins +import copy +import inspect +import itertools +import operator +from collections.abc import Iterable +from functools import reduce +from math import ceil, log + +import numpy as np + +from ...config import options +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, BoolField, Int32Field, KeyField +from ..array_utils import as_same_device, cp, device, get_array_module +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import check_out_param, validate_axis + + +def numel(x, **kwargs): + xp = get_array_module(x) + return xp.sum(xp.ones_like(x), **kwargs) + + +def nannumel(x, **kwargs): + x_size = reduce(operator.mul, x.shape) + xp = get_array_module(x) + return x_size - xp.sum(xp.isnan(x), **kwargs) + + +class TensorReductionMixin(TensorOperandMixin): + __slots__ = () + + @classmethod + def _is_cum(cls): + return False + + @classmethod + def _calc_order(cls, a, out): + return out.order if out is not None else a.order + + @classmethod + def _is_sparse(cls, input_sparse, shape): + return False + + def _call(self, a, out): + a = astensor(a) + if out is not None and not isinstance(out, Tensor): + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + + axis = getattr(self, "axis", None) + keepdims = getattr(self, "keepdims", None) + order = self._calc_order(a, out) + + if self._is_cum(): + if axis is None: + a, axis = a.ravel(), 0 + setattr(self, "_axis", axis) + shape = a.shape + else: + axis = list(range(len(a.shape))) if axis is None else axis + if not isinstance(axis, Iterable): + axis = (validate_axis(a.ndim, axis),) + axis = set(axis) + + shape = tuple( + s if i not in axis else 1 + for i, s in enumerate(a.shape) + if keepdims or i not in axis + ) + + self.sparse = self._is_sparse(a.issparse(), shape) + t = self.new_tensor([a], shape, order=order) + + if out is None: + return t + + check_out_param(out, t, "same_kind") + out_shape, out_dtype = out.shape, out.dtype + # if `out` is specified, use out's dtype and shape + if out_shape != t.shape: + if out.ndim > t.ndim: + raise ValueError("output has too many dimensions") + raise ValueError(f"output shape should be {t.shape}, got {out_shape}") + + setattr(self, "dtype", out_dtype) + + out.data = t.data + return out + + def _new_chunks(self, inputs, kws=None, **kw): + chunks = super()._new_chunks(inputs, kws=kws, **kw) + setattr(self, "_input", getattr(self, "_inputs")[0]) + return chunks + + def _new_tileables(self, inputs, kws=None, **kw): + tensors = super()._new_tileables(inputs, kws=kws, **kw) + setattr(self, "_input", getattr(self, "_inputs")[0]) + return tensors + + def __call__(self, a, out=None): + return self._call(a, out=out) + + @staticmethod + def _reduced_shape(shape, axes): + return tuple(1 if i in axes else s for i, s in enumerate(shape)) + + @staticmethod + def _reduced_nsplits(nsplits, axes): + return tuple((1,) * len(c) if i in axes else c for i, c in enumerate(nsplits)) + + @staticmethod + def _concatenate_shape(tensor, combine_block): + return tuple( + builtins.sum(nsplit[i] for i in cb) + for nsplit, cb in zip(tensor.nsplits, combine_block) + ) + + @staticmethod + def _combine_split(ax, combine_size, chunk_shape): + if ax not in combine_size: + return tuple((i,) for i in range(chunk_shape[ax])) + else: + size = combine_size[ax] + shape = chunk_shape[ax] + index = tuple(range(shape)) + return tuple(index[i : i + size] for i in range(0, 
shape, size)) + + def _get_op_kw(self): + return None + + @classmethod + def get_axis(cls, axis): + return tuple(axis) if axis is not None else axis + + @classmethod + def get_arg_axis(cls, axis, ndim): + return None if len(axis) == ndim or ndim == 1 else axis[0] + + @classmethod + def _tree_reduction(cls, tensor, axis): + op = tensor.op + kw = getattr(op, "_get_op_kw")() or {} + keepdims = op.keepdims + combine_size = op.combine_size or options.combine_size + if isinstance(combine_size, dict): + combine_size = dict((ax, combine_size.get(ax)) for ax in axis) + else: + assert isinstance(combine_size, int) + n = builtins.max(int(combine_size ** (1.0 / (len(axis) or 1))), 2) + combine_size = dict((ax, n) for ax in axis) + + times = 1 + for i, n in enumerate(tensor.chunk_shape): + if i in combine_size and combine_size[i] != 1: + times = int(builtins.max(times, ceil(log(n, combine_size[i])))) + + for i in range(times - 1): + [tensor] = cls._partial_reduction( + tensor, axis, op.dtype, True, combine_size, OperandStage.combine + ) + + return cls._partial_reduction( + tensor, axis, op.dtype, keepdims, combine_size, OperandStage.agg, kw + ) + + @classmethod + def _partial_reduction( + cls, tensor, axis, dtype, keepdims, combine_size, stage, kw=None + ): + from ..merge.concatenate import TensorConcatenate + + kw = kw or {} + axes = sorted(combine_size.keys()) + op_type = type(tensor.op) + + combine_blocks = [ + cls._combine_split(i, combine_size, tensor.chunk_shape) + for i in range(tensor.ndim) + ] + combine_blocks_idxes = [range(len(blocks)) for blocks in combine_blocks] + + chunks = [] + for combine_block_idx, combine_block in zip( + itertools.product(*combine_blocks_idxes), itertools.product(*combine_blocks) + ): + chks = [tensor.cix[idx] for idx in itertools.product(*combine_block)] + if len(chks) > 1: + op = TensorConcatenate(axis=axes, dtype=chks[0].dtype) + chk = op.new_chunk( + chks, + shape=cls._concatenate_shape(tensor, combine_block), + order=tensor.order, + ) + else: + chk = chks[0] + shape = tuple( + s if i not in combine_size else 1 + for i, s in enumerate(chk.shape) + if keepdims or i not in combine_size + ) + agg_op = op_type( + stage=stage, axis=axis, dtype=dtype, keepdims=keepdims, **kw + ) + chunk = agg_op.new_chunk( + [chk], + shape=shape, + index=tuple( + idx + for i, idx in enumerate(combine_block_idx) + if keepdims or i not in combine_size + ), + order=tensor.order, + ) + chunks.append(chunk) + + nsplits = [ + tuple( + c.shape[i] + for c in chunks + if builtins.all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + for i in range(len(chunks[0].shape)) + ] + shape = tuple(builtins.sum(nsplit) for nsplit in nsplits) + agg_op = op_type( + stage=stage, + axis=axis, + dtype=dtype, + keepdims=keepdims, + combine_size=combine_size, + **kw, + ) + return agg_op.new_tensors( + [tensor], shape, order=tensor.order, chunks=chunks, nsplits=nsplits + ) + + @classmethod + def tile(cls, op): + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + axis = tuple(range(in_tensor.ndim)) if op.axis is None else op.axis + if isinstance(axis, int): + axis = (axis,) + axis = tuple(validate_axis(in_tensor.ndim, ax) for ax in axis) + + if len(in_tensor.chunks) == 1: + c = in_tensor.chunks[0] + new_op = op.copy().reset_key() + setattr(new_op, "_axis", axis) + shape = list(cls._reduced_shape(c.shape, axis)) + nsplits = list(cls._reduced_nsplits(in_tensor.nsplits, axis)) + chunk_index = list(c.index) + if not op.keepdims and axis: + for ax in axis: + shape[ax] = None + nsplits[ax] = None + 
chunk_index[ax] = None + shape = tuple(s for s in shape if s is not None) + nsplits = tuple(ns for ns in nsplits if ns is not None) + chunk_index = tuple(i for i in chunk_index if i is not None) + + chunks = new_op.new_chunks( + [c], shape=shape, index=chunk_index, order=out_tensor.order + ) + return op.copy().new_tensors( + op.inputs, + op.outputs[0].shape, + order=out_tensor.order, + chunks=chunks, + nsplits=nsplits, + ) + + chunks = [] + kw = getattr(op, "_get_op_kw")() or {} + for c in in_tensor.chunks: + chunk_op = type(op)( + stage=OperandStage.map, + axis=axis, + dtype=op.dtype, + keepdims=True, + combine_size=op.combine_size, + **kw, + ) + chunks.append( + chunk_op.new_chunk( + [c], + shape=cls._reduced_shape(c.shape, axis), + order=out_tensor.order, + index=c.index, + ) + ) + + new_op = op.copy() + tensor = new_op.new_tensor( + op.inputs, + cls._reduced_shape(in_tensor.shape, axis), + order=out_tensor.order, + nsplits=cls._reduced_nsplits(in_tensor.nsplits, axis), + chunks=chunks, + ) + return cls._tree_reduction(tensor, axis) + + @classmethod + def execute_agg(cls, ctx, op): + (input_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + axis = cls.get_axis(op.axis) + func_name = getattr(cls, "_func_name", None) + reduce_func = getattr(xp, func_name) + out = op.outputs[0] + with device(device_id): + if input_chunk.size == 0 and op.keepdims: + # input chunk is empty, when keepdims is True, return itself + ret = input_chunk + elif "dtype" in inspect.getfullargspec(reduce_func).args: + ret = reduce_func( + input_chunk, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + else: + ret = reduce_func(input_chunk, axis=axis, keepdims=bool(op.keepdims)) + + if hasattr(ret, "astype"): + # for non-object dtype + ret = ret.astype(op.dtype, order=out.order.value, copy=False) + ctx[out.key] = ret + + @classmethod + def execute_one_chunk(cls, ctx, op): + cls.execute_agg(ctx, op) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + return cls.execute_map(ctx, op) + elif op.stage == OperandStage.combine: + return cls.execute_combine(ctx, op) + elif op.stage == OperandStage.agg: + return cls.execute_agg(ctx, op) + else: + return cls.execute_one_chunk(ctx, op) + + +class TensorArgReductionMixin(TensorReductionMixin): + __slots__ = () + + @staticmethod + def _get_arg_axis(axis, ndim): + if axis is None: + axis = tuple(range(ndim)) + ravel = True + elif isinstance(axis, int): + axis = validate_axis(ndim, axis) + axis = (axis,) + ravel = ndim == 1 + else: + raise TypeError(f"axis must be either `None` or int, got '{axis}'") + return axis, ravel + + @staticmethod + def _get_offset(tensor, axis, chunk, ravel): + nsplits = tensor.nsplits + offset = tuple( + builtins.sum(split[:idx]) for split, idx in zip(nsplits, chunk.index) + ) + if not ravel: + offset = offset[axis[0]] + return offset + + @classmethod + def _calc_order(cls, a, out): + return out.order if out is not None else TensorOrder.C_ORDER + + @classmethod + def tile(cls, op): + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + axis, ravel = cls._get_arg_axis(op.axis, in_tensor.ndim) + + chunks = [] + for c in in_tensor.chunks: + offset = cls._get_offset(in_tensor, axis, c, ravel) + chunk_op = type(op)( + stage=OperandStage.map, + axis=axis, + dtype=op.dtype, + offset=offset, + total_shape=in_tensor.shape, + combine_size=op.combine_size, + ) + chunk = chunk_op.new_chunk( + [c], + shape=cls._reduced_shape(c.shape, axis), + index=c.index, + 
order=out_tensor.order, + ) + chunks.append(chunk) + new_op = op.copy() + tensor = new_op.new_tensor( + op.inputs, + cls._reduced_shape(in_tensor.shape, axis), + order=out_tensor.order, + nsplits=cls._reduced_nsplits(in_tensor.nsplits, axis), + chunks=chunks, + ) + return cls._tree_reduction(tensor, axis) + + @classmethod + def execute_agg(cls, ctx, op): + axis = cls.get_arg_axis(op.axis, op.inputs[0].ndim) + (vals, arg), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + + func_name = getattr(cls, "_func_name") + arg_func = getattr(xp, func_name) + + with device(device_id): + if xp.any(xp.isnan(vals)) and "nan" in func_name: + raise ValueError("All NaN slice encountered") + if axis is None: + local_args = arg_func(vals, axis=axis) + arg = arg.ravel()[local_args] + else: + local_args = arg_func(vals, axis=axis) + inds = np.ogrid[tuple(map(slice, local_args.shape))] + if xp != np: + inds = [xp.asarray(it) for it in inds] + inds.insert(axis, local_args) + arg = arg[tuple(inds)] + ctx[op.outputs[0].key] = arg + + @classmethod + def execute_map(cls, ctx, op): + arg_axis = cls.get_arg_axis(op.axis, op.inputs[0].ndim) + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + func_name = getattr(cls, "_func_name") + agg_func_name = getattr(cls, "_agg_func_name") + arg_func = getattr(xp, func_name) + agg_func_name = getattr(xp, agg_func_name) + + offset = op.offset + chunk = op.outputs[0] + with device(device_id): + vals = agg_func_name(in_chunk, axis=arg_axis) + if hasattr(vals, "reshape"): + vals = vals.reshape(chunk.shape) + try: + arg = arg_func(in_chunk, axis=arg_axis) + if hasattr(arg, "reshape"): + arg = arg.reshape(chunk.shape) + except ValueError: + # handle all NaN + arg = arg_func( + xp.where(xp.isnan(in_chunk), np.inf, in_chunk), axis=arg_axis + ).reshape(chunk.shape) + + if arg_axis is None: + if xp == cp: + # we need to copy to do cpu computation, then copy back to gpu + # cuz unravel_index and ravel_multi_index are not implemented in cupy + in_chunk = in_chunk.get() + + total_shape = op.total_shape + ind = np.unravel_index(arg.ravel()[0], in_chunk.shape) + total_ind = tuple(o + i for (o, i) in zip(offset, ind)) + res = np.ravel_multi_index(total_ind, total_shape) + + if xp == cp: + # copy back + with xp.cuda.Device(in_chunk.device.id): + arg[:] = xp.asarray(res) + else: + arg[:] = res + else: + arg += offset + ctx[op.outputs[0].key] = (vals, arg) + + @classmethod + def execute_combine(cls, ctx, op): + axis = cls.get_arg_axis(op.axis, op.inputs[0].ndim) + (vals, arg), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + + func_name = getattr(cls, "_func_name") + arg_func = getattr(xp, func_name) + with device(device_id): + if axis is None: + local_args = arg_func(vals, axis=axis).reshape(op.outputs[0].shape) + vals = vals.ravel()[local_args] + arg = arg.ravel()[local_args] + else: + local_args = arg_func(vals, axis=axis) + inds = np.ogrid[tuple(map(slice, local_args.shape))] + if xp != np: + inds = [xp.asarray(it) for it in inds] + inds.insert(axis, local_args) + inds_tuple = tuple(inds) + vals = vals[inds_tuple].reshape(op.outputs[0].shape) + arg = arg[inds_tuple].reshape(op.outputs[0].shape) + ctx[op.outputs[0].key] = (vals, arg) + + +class TensorCumReductionMixin(TensorReductionMixin): + __slots__ = () + + @classmethod + def _is_cum(cls): + return True + + @staticmethod + def _get_op_types(): + raise NotImplementedError + + 
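+    # A rough sketch of what ``tile`` below does (illustrative NumPy analogy
+    # only, not code the operator executes): each chunk first applies the
+    # cumulative op locally, then for a chunk at index ``i`` along ``axis`` the
+    # trailing slice of every preceding chunk is folded in with the tree-wise
+    # binary op returned by ``_get_op_types``. For cumsum over two chunks:
+    #
+    #     import numpy as np
+    #     a, b = np.array([1, 2, 3]), np.array([4, 5, 6])  # two chunks
+    #     local_a, local_b = np.cumsum(a), np.cumsum(b)    # per-chunk pass
+    #     combined_b = local_b + local_a[-1:]              # add last slice of chunk 0
+    #     assert (np.concatenate([local_a, combined_b])
+    #             == np.cumsum(np.concatenate([a, b]))).all()
+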
@classmethod + def tile(cls, op): + from ..indexing.slice import TensorSlice + + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + axis = op.axis + if not isinstance(axis, int): + raise ValueError("axis must be a integer") + axis = validate_axis(in_tensor.ndim, axis) + if axis is None: + raise NotImplementedError + + op_type, bin_op_type = getattr(op, "_get_op_types")() + + chunks = [] + for c in in_tensor.chunks: + chunk_op = op_type(axis=op.axis, dtype=op.dtype) + chunks.append( + chunk_op.new_chunk( + [c], shape=c.shape, index=c.index, order=out_tensor.order + ) + ) + inter_tensor = copy.copy(in_tensor) + inter_tensor._chunks = chunks + + slc = [ + slice(None) if i != axis else slice(-1, None) for i in range(in_tensor.ndim) + ] + + output_chunks = [] + for chunk in chunks: + if chunk.index[axis] == 0: + output_chunks.append(chunk) + continue + + to_cum_chunks = [] + for i in range(chunk.index[axis]): + to_cum_index = chunk.index[:axis] + (i,) + chunk.index[axis + 1 :] + shape = chunk.shape[:axis] + (1,) + chunk.shape[axis + 1 :] + to_cum_chunk = inter_tensor.cix[to_cum_index] + slice_op = TensorSlice(slices=slc, dtype=chunk.dtype) + sliced_chunk = slice_op.new_chunk( + [to_cum_chunk], + shape=shape, + index=to_cum_index, + order=out_tensor.order, + ) + to_cum_chunks.append(sliced_chunk) + to_cum_chunks.append(chunk) + + # GH#3132: some chunks of to_cum_chunks may be empty, + # so we tell tree_add&tree_multiply to ignore them + bin_op = bin_op_type( + args=to_cum_chunks, dtype=chunk.dtype, ignore_empty_input=True + ) + output_chunk = bin_op.new_chunk( + to_cum_chunks, + shape=chunk.shape, + index=chunk.index, + order=out_tensor.order, + ) + output_chunks.append(output_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + in_tensor.shape, + order=out_tensor.order, + chunks=output_chunks, + nsplits=in_tensor.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + func_name = getattr(cls, "_func_name") + cum_func = getattr(xp, func_name) + if xp != np: + func = getattr(xp, cum_func.__name__) + else: + func = cum_func + + with device(device_id): + ctx[op.outputs[0].key] = func(x, axis=op.axis, dtype=op.dtype) + + +class TensorReduction(TensorHasInput): + _input = KeyField("input") + _out = KeyField("out") + _axis = AnyField("axis") # can be None or int or tuple of ints, just infer the data + _keepdims = BoolField("keepdims") + _combine_size = AnyField("combine_size") + + @property + def axis(self): + return getattr(self, "_axis", None) + + @property + def keepdims(self): + return getattr(self, "_keepdims", None) + + @property + def combine_size(self): + return getattr(self, "_combine_size", None) + + def _rewrite_stage(self, stage): + if stage == OperandStage.map and not hasattr(self, "execute_map"): + return OperandStage.agg + elif stage == OperandStage.combine and not hasattr(self, "execute_combine"): + return OperandStage.agg + return stage + + +class TensorCumReduction(TensorHasInput): + _input = KeyField("input") + _axis = Int32Field("axis") + + @property + def axis(self): + return getattr(self, "_axis", None) diff --git a/python/xorbits/_mars/tensor/reduction/count_nonzero.py b/python/xorbits/_mars/tensor/reduction/count_nonzero.py new file mode 100644 index 000000000..de04b084f --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/count_nonzero.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..array_utils import as_same_device, device +from .core import TensorReduction, TensorReductionMixin +from .sum import TensorSum + + +class TensorCountNonzero(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.COUNT_NONZERO + + def __init__( + self, axis=None, dtype=None, keepdims=None, combine_size=None, stage=None, **kw + ): + if dtype is None: + dtype = np.dtype(np.intp) + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + dtype=dtype, + stage=stage, + **kw + ) + + @classmethod + def execute_map(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], op.device, ret_extra=True + ) + + axis = cls.get_arg_axis(op.axis, op.inputs[0].ndim) + keepdims = op.keepdims + with device(device_id): + nz = xp.count_nonzero(x, axis=axis) + if keepdims: + slcs = [slice(None)] * op.inputs[0].ndim + for ax in op.axis: + slcs[ax] = np.newaxis + nz = xp.asarray(nz)[tuple(slcs)] + + ctx[op.outputs[0].key] = nz + + @classmethod + def execute_agg(cls, ctx, op): + return TensorSum.execute_agg(ctx, op) + + @classmethod + def execute_one_chunk(cls, ctx, op): + a = ctx[op.inputs[0].key] + (inp,), device_id, xp = as_same_device([a], device=op.device, ret_extra=True) + with device(device_id): + ctx[op.outputs[0].key] = xp.count_nonzero(inp, axis=op.axis) + + +def count_nonzero(a, axis=None, combine_size=None): + """ + Counts the number of non-zero values in the tensor ``a``. + + The word "non-zero" is in reference to the Python 2.x + built-in method ``__nonzero__()`` (renamed ``__bool__()`` + in Python 3.x) of Python objects that tests an object's + "truthfulness". For example, any number is considered + truthful if it is nonzero, whereas any string is considered + truthful if it is not the empty string. Thus, this function + (recursively) counts how many elements in ``a`` (and in + sub-tensors thereof) have their ``__nonzero__()`` or ``__bool__()`` + method evaluated to ``True``. + + Parameters + ---------- + a : array_like + The tensor for which to count non-zeros. + axis : int or tuple, optional + Axis or tuple of axes along which to count non-zeros. + Default is None, meaning that non-zeros will be counted + along a flattened version of ``a``. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + count : int or tensor of int + Number of non-zero values in the array along a given axis. + Otherwise, the total number of non-zero values in the tensor + is returned. + + See Also + -------- + nonzero : Return the coordinates of all the non-zero values. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.count_nonzero(mt.eye(4)).execute() + 4 + >>> mt.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]]).execute() + 5 + >>> mt.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]], axis=0).execute() + array([1, 1, 1, 1, 1]) + >>> mt.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]], axis=1).execute() + array([2, 3]) + + """ + op = TensorCountNonzero( + axis=axis, dtype=np.dtype(np.int_), keepdims=None, combine_size=combine_size + ) + return op(a) diff --git a/python/xorbits/_mars/tensor/reduction/cumprod.py b/python/xorbits/_mars/tensor/reduction/cumprod.py new file mode 100644 index 000000000..d2a5072bd --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/cumprod.py @@ -0,0 +1,101 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..arithmetic.multiply import TensorTreeMultiply +from ..datasource import tensor as astensor +from .core import TensorCumReduction, TensorCumReductionMixin + + +class TensorCumprod(TensorCumReduction, TensorCumReductionMixin): + _op_type_ = OperandDef.CUMPROD + _func_name = "cumprod" + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @staticmethod + def _get_op_types(): + return TensorCumprod, TensorTreeMultiply + + +def cumprod(a, axis=None, dtype=None, out=None): + """ + Return the cumulative product of elements along a given axis. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + Axis along which the cumulative product is computed. By default + the input is flattened. + dtype : dtype, optional + Type of the returned tensor, as well as of the accumulator in which + the elements are multiplied. If *dtype* is not specified, it + defaults to the dtype of `a`, unless `a` has an integer dtype with + a precision less than that of the default platform integer. In + that case, the default platform integer is used instead. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output + but the type of the resulting values will be cast if necessary. + + Returns + ------- + cumprod : Tensor + A new tensor holding the result is returned unless `out` is + specified, in which case a reference to out is returned. + + See Also + -------- + numpy.doc.ufuncs : Section "Output arguments" + + Notes + ----- + Arithmetic is modular when using integer types, and no error is + raised on overflow. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([1,2,3]) + >>> mt.cumprod(a).execute() # intermediate results 1, 1*2 + ... 
# total product 1*2*3 = 6 + array([1, 2, 6]) + >>> a = mt.array([[1, 2, 3], [4, 5, 6]]) + >>> mt.cumprod(a, dtype=float).execute() # specify type of output + array([ 1., 2., 6., 24., 120., 720.]) + + The cumulative product for each column (i.e., over the rows) of `a`: + + >>> mt.cumprod(a, axis=0).execute() + array([[ 1, 2, 3], + [ 4, 10, 18]]) + + The cumulative product for each row (i.e. over the columns) of `a`: + + >>> mt.cumprod(a,axis=1).execute() + array([[ 1, 2, 6], + [ 4, 20, 120]]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.empty((1,), dtype=a.dtype).cumprod().dtype + op = TensorCumprod(axis=axis, dtype=dtype) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/cumsum.py b/python/xorbits/_mars/tensor/reduction/cumsum.py new file mode 100644 index 000000000..f7fb4d0f0 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/cumsum.py @@ -0,0 +1,105 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..arithmetic.add import TensorTreeAdd +from ..datasource import tensor as astensor +from .core import TensorCumReduction, TensorCumReductionMixin + + +class TensorCumsum(TensorCumReduction, TensorCumReductionMixin): + _op_type_ = OperandDef.CUMSUM + _func_name = "cumsum" + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @staticmethod + def _get_op_types(): + return TensorCumsum, TensorTreeAdd + + +def cumsum(a, axis=None, dtype=None, out=None): + """ + Return the cumulative sum of the elements along a given axis. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + Axis along which the cumulative sum is computed. The default + (None) is to compute the cumsum over the flattened tensor. + dtype : dtype, optional + Type of the returned tensor and of the accumulator in which the + elements are summed. If `dtype` is not specified, it defaults + to the dtype of `a`, unless `a` has an integer dtype with a + precision less than that of the default platform integer. In + that case, the default platform integer is used. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output + but the type will be cast if necessary. See `doc.ufuncs` + (Section "Output arguments") for more details. + + Returns + ------- + cumsum_along_axis : Tensor. + A new tensor holding the result is returned unless `out` is + specified, in which case a reference to `out` is returned. The + result has the same size as `a`, and the same shape as `a` if + `axis` is not None or `a` is a 1-d tensor. + + + See Also + -------- + sum : Sum tensor elements. + + trapz : Integration of tensor values using the composite trapezoidal rule. + + diff : Calculate the n-th discrete difference along given axis. + + Notes + ----- + Arithmetic is modular when using integer types, and no error is + raised on overflow. 
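+
+    When the input tensor is split into chunks, the cumulative sum is first
+    computed within each chunk, and the trailing value of every preceding
+    chunk along ``axis`` is then added in, so the result matches a cumulative
+    sum over the whole tensor.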
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1,2,3], [4,5,6]]) + >>> a.execute() + array([[1, 2, 3], + [4, 5, 6]]) + >>> mt.cumsum(a).execute() + array([ 1, 3, 6, 10, 15, 21]) + >>> mt.cumsum(a, dtype=float).execute() # specifies type of output value(s) + array([ 1., 3., 6., 10., 15., 21.]) + + >>> mt.cumsum(a,axis=0).execute() # sum over rows for each of the 3 columns + array([[1, 2, 3], + [5, 7, 9]]) + >>> mt.cumsum(a,axis=1).execute() # sum over columns for each of the 2 rows + array([[ 1, 3, 6], + [ 4, 9, 15]]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.empty((1,), dtype=a.dtype).cumsum().dtype + op = TensorCumsum(axis=axis, dtype=dtype) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/max.py b/python/xorbits/_mars/tensor/reduction/max.py new file mode 100644 index 000000000..b7de28f6b --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/max.py @@ -0,0 +1,132 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorMax(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.MAX + _func_name = "max" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @classmethod + def _is_sparse(cls, input_sparse, shape): + if input_sparse and len(shape) > 0: + return True + return False + + +def max(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Return the maximum of an array or maximum along an axis. + + Parameters + ---------- + a : array_like + Input data. + axis : None or int or tuple of ints, optional + Axis or axes along which to operate. By default, flattened input is + used. + + If this is a tuple of ints, the maximum is selected over multiple axes, + instead of a single axis or all the axes as before. + out : Tensor, optional + Alternative output tensor in which to place the result. Must + be of the same shape and buffer length as the expected output. + See `doc.ufuncs` (Section "Output arguments") for more details. + + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input array. + + If the default value is passed, then `keepdims` will not be + passed through to the `amax` method of sub-classes of + `ndarray`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + amax : Tensor or scalar + Maximum of `a`. If `axis` is None, the result is a scalar value. 
+ If `axis` is given, the result is a tensor of dimension + ``a.ndim - 1``. + + See Also + -------- + amin : + The minimum value of a tensor along a given axis, propagating any NaNs. + nanmax : + The maximum value of a tensor along a given axis, ignoring any NaNs. + maximum : + Element-wise maximum of two tensors, propagating any NaNs. + fmax : + Element-wise maximum of two tensors, ignoring any NaNs. + argmax : + Return the indices of the maximum values. + + nanmin, minimum, fmin + + Notes + ----- + NaN values are propagated, that is if at least one item is NaN, the + corresponding max value will be NaN as well. To ignore NaN values + (MATLAB behavior), please use nanmax. + + Don't use `amax` for element-wise comparison of 2 arrays; when + ``a.shape[0]`` is 2, ``maximum(a[0], a[1])`` is faster than + ``amax(a, axis=0)``. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.arange(4).reshape((2,2)) + >>> a.execute() + array([[0, 1], + [2, 3]]) + >>> mt.amax(a).execute() # Maximum of the flattened array + 3 + >>> mt.amax(a, axis=0).execute() # Maxima along the first axis + array([2, 3]) + >>> mt.amax(a, axis=1).execute() # Maxima along the second axis + array([1, 3]) + + >>> b = mt.arange(5, dtype=float) + >>> b[2] = mt.NaN + >>> mt.amax(b).execute() + nan + >>> mt.nanmax(b).execute() + 4.0 + + """ + a = astensor(a) + op = TensorMax( + axis=axis, dtype=a.dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/mean.py b/python/xorbits/_mars/tensor/reduction/mean.py new file mode 100644 index 000000000..0c4c941c9 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/mean.py @@ -0,0 +1,199 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin, numel + + +class TensorMean(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.MEAN + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @classmethod + def execute_agg(cls, ctx, op): + axis = cls.get_axis(op.axis) + + a = ctx[op.inputs[0].key] + if not isinstance(a, (list, tuple)): + (inp,), device_id, xp = as_same_device( + [a], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.mean( + inp, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + else: + (_data, _count), device_id, xp = as_same_device( + a, device=op.device, ret_extra=True + ) + + with device(device_id): + chunk_count = xp.sum( + _count, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + _data, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + ctx[op.outputs[0].key] = xp.true_divide( + chunk_sum, chunk_count, dtype=op.dtype + ) + + @classmethod + def execute_map(cls, ctx, op): + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axis = cls.get_axis(op.axis) + + with device(device_id): + chunk_count = numel( + in_chunk, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + in_chunk, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + ctx[op.outputs[0].key] = (chunk_sum, chunk_count) + + @classmethod + def execute_combine(cls, ctx, op): + axis = cls.get_axis(op.axis) + (_data, _count), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + + with device(device_id): + chunk_count = xp.sum( + _count, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + _data, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + ctx[op.outputs[0].key] = (chunk_sum, chunk_count) + + +def mean(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Compute the arithmetic mean along the specified axis. + + Returns the average of the array elements. The average is taken over + the flattened tensor by default, otherwise over the specified axis. + `float64` intermediate and return values are used for integer inputs. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose mean is desired. If `a` is not an + tensor, a conversion is attempted. + axis : None or int or tuple of ints, optional + Axis or axes along which the means are computed. The default is to + compute the mean of the flattened array. + + If this is a tuple of ints, a mean is performed over multiple axes, + instead of a single axis or all the axes as before. + dtype : data-type, optional + Type to use in computing the mean. For integer inputs, the default + is `float64`; for floating point inputs, it is the same as the + input dtype. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. + See `doc.ufuncs` for details. 
+ + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `mean` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + m : Tensor, see dtype parameter above + If `out=None`, returns a new tensor containing the mean values, + otherwise a reference to the output array is returned. + + See Also + -------- + average : Weighted average + std, var, nanmean, nanstd, nanvar + + Notes + ----- + The arithmetic mean is the sum of the elements along the axis divided + by the number of elements. + + Note that for floating-point input, the mean is computed using the + same precision the input has. Depending on the input data, this can + cause the results to be inaccurate, especially for `float32` (see + example below). Specifying a higher-precision accumulator using the + `dtype` keyword can alleviate this issue. + + By default, `float16` results are computed using `float32` intermediates + for extra precision. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4]]) + >>> mt.mean(a).execute() + 2.5 + >>> mt.mean(a, axis=0).execute() + array([ 2., 3.]) + >>> mt.mean(a, axis=1).execute() + array([ 1.5, 3.5]) + + In single precision, `mean` can be inaccurate: + + >>> a = mt.zeros((2, 512*512), dtype=mt.float32) + >>> a[0, :] = 1.0 + >>> a[1, :] = 0.1 + >>> mt.mean(a).execute() + 0.54999924 + + Computing the mean in float64 is more accurate: + + >>> mt.mean(a, dtype=mt.float64).execute() + 0.55000000074505806 + + """ + a = astensor(a) + if dtype is None: + dtype = np.mean(np.empty((1,), dtype=a.dtype)).dtype + op = TensorMean( + axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/min.py b/python/xorbits/_mars/tensor/reduction/min.py new file mode 100644 index 000000000..5c4ef9c0a --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/min.py @@ -0,0 +1,132 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorMin(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.MIN + _func_name = "min" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @classmethod + def _is_sparse(cls, input_sparse, shape): + if input_sparse and len(shape) > 0: + return True + return False + + +def min(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Return the minimum of a tensor or minimum along an axis. + + Parameters + ---------- + a : array_like + Input data. + axis : None or int or tuple of ints, optional + Axis or axes along which to operate. By default, flattened input is + used. + + If this is a tuple of ints, the minimum is selected over multiple axes, + instead of a single axis or all the axes as before. + out : Tensor, optional + Alternative output tensor in which to place the result. Must + be of the same shape and buffer length as the expected output. + See `doc.ufuncs` (Section "Output arguments") for more details. + + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `amin` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + amin : Tensor or scalar + Minimum of `a`. If `axis` is None, the result is a scalar value. + If `axis` is given, the result is an array of dimension + ``a.ndim - 1``. + + See Also + -------- + amax : + The maximum value of a tensor along a given axis, propagating any NaNs. + nanmin : + The minimum value of a tensor along a given axis, ignoring any NaNs. + minimum : + Element-wise minimum of two tensors, propagating any NaNs. + fmin : + Element-wise minimum of two tensors, ignoring any NaNs. + argmin : + Return the indices of the minimum values. + + nanmax, maximum, fmax + + Notes + ----- + NaN values are propagated, that is if at least one item is NaN, the + corresponding min value will be NaN as well. To ignore NaN values + (MATLAB behavior), please use nanmin. + + Don't use `amin` for element-wise comparison of 2 tensors; when + ``a.shape[0]`` is 2, ``minimum(a[0], a[1])`` is faster than + ``amin(a, axis=0)``. 
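+
+    When the input is split into multiple chunks, the minimum is computed per
+    chunk and the partial results are merged in a reduction tree;
+    ``combine_size`` controls how many partial results are merged at each step.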
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.arange(4).reshape((2,2)) + >>> a.execute() + array([[0, 1], + [2, 3]]) + >>> mt.amin(a).execute() # Minimum of the flattened array + 0 + >>> mt.amin(a, axis=0).execute() # Minima along the first axis + array([0, 1]) + >>> mt.amin(a, axis=1).execute() # Minima along the second axis + array([0, 2]) + + >>> b = mt.arange(5, dtype=float) + >>> b[2] = mt.NaN + >>> mt.amin(b).execute() + nan + >>> mt.nanmin(b).execute() + 0.0 + + """ + a = astensor(a) + op = TensorMin( + axis=axis, dtype=a.dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanargmax.py b/python/xorbits/_mars/tensor/reduction/nanargmax.py new file mode 100644 index 000000000..df8e9a673 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanargmax.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField, TupleField +from .core import TensorArgReductionMixin, TensorReduction + + +class TensorNanArgmax(TensorReduction, TensorArgReductionMixin): + _op_type_ = OperandDef.NANARGMAX + _func_name = "nanargmax" + _agg_func_name = "nanmax" + + _offset = AnyField("offset") + _total_shape = TupleField("total_shape") + + def __init__( + self, + axis=None, + dtype=None, + combine_size=None, + offset=None, + total_shape=None, + stage=None, + **kw + ): + if dtype is None: + dtype = np.dtype(int) + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _combine_size=combine_size, + _offset=offset, + _total_shape=total_shape, + dtype=dtype, + stage=stage, + **kw + ) + + @property + def offset(self): + return getattr(self, "_offset", None) + + @property + def total_shape(self): + return getattr(self, "_total_shape", None) + + +def nanargmax(a, axis=None, out=None, combine_size=None): + """ + Return the indices of the maximum values in the specified axis ignoring + NaNs. For all-NaN slices ``ValueError`` is raised. Warning: the + results cannot be trusted if a slice contains only NaNs and -Infs. + + + Parameters + ---------- + a : array_like + Input data. + axis : int, optional + Axis along which to operate. By default flattened input is used. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. + See `doc.ufuncs` for details. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + index_array : Tensor + An tensor of indices or a single index value. 
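+        For chunked input, per-chunk results are shifted by each chunk's
+        offset, so the returned indices always refer to positions in the
+        original tensor.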
+ + See Also + -------- + argmax, nanargmin + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[mt.nan, 4], [2, 3]]) + >>> mt.argmax(a).execute() + 0 + >>> mt.nanargmax(a).execute() + 1 + >>> mt.nanargmax(a, axis=0).execute() + array([1, 0]) + >>> mt.nanargmax(a, axis=1).execute() + array([1, 1]) + + """ + op = TensorNanArgmax(axis=axis, dtype=np.dtype(int), combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanargmin.py b/python/xorbits/_mars/tensor/reduction/nanargmin.py new file mode 100644 index 000000000..2189d87d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanargmin.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField, TupleField +from .core import TensorArgReductionMixin, TensorReduction + + +class TensorNanArgmin(TensorReduction, TensorArgReductionMixin): + _op_type_ = OperandDef.NANARGMIN + _func_name = "nanargmin" + _agg_func_name = "nanmin" + + _offset = AnyField("offset") + _total_shape = TupleField("total_shape") + + def __init__( + self, + axis=None, + dtype=None, + combine_size=None, + offset=None, + total_shape=None, + stage=None, + **kw + ): + if dtype is None: + dtype = np.dtype(int) + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _combine_size=combine_size, + _offset=offset, + _total_shape=total_shape, + dtype=dtype, + stage=stage, + **kw + ) + + @property + def offset(self): + return getattr(self, "_offset", None) + + @property + def total_shape(self): + return getattr(self, "_total_shape", None) + + +def nanargmin(a, axis=None, out=None, combine_size=None): + """ + Return the indices of the minimum values in the specified axis ignoring + NaNs. For all-NaN slices ``ValueError`` is raised. Warning: the results + cannot be trusted if a slice contains only NaNs and Infs. + + Parameters + ---------- + a : array_like + Input data. + axis : int, optional + Axis along which to operate. By default flattened input is used. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + index_array : Tensor + A tensor of indices or a single index value. + + See Also + -------- + argmin, nanargmax + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[mt.nan, 4], [2, 3]]) + >>> mt.argmin(a).execute() + 0 + >>> mt.nanargmin(a).execute() + 2 + >>> mt.nanargmin(a, axis=0).execute() + array([1, 1]) + >>> mt.nanargmin(a, axis=1).execute() + array([1, 0]) + + """ + op = TensorNanArgmin(axis=axis, dtype=np.dtype(int), combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nancumprod.py b/python/xorbits/_mars/tensor/reduction/nancumprod.py new file mode 100644 index 000000000..3159ee3a5 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nancumprod.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..arithmetic.multiply import TensorTreeMultiply +from ..datasource import tensor as astensor +from .core import TensorCumReduction, TensorCumReductionMixin + + +class TensorNanCumprod(TensorCumReduction, TensorCumReductionMixin): + _op_type_ = OperandDef.NANCUMPROD + _func_name = "nancumprod" + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @staticmethod + def _get_op_types(): + return TensorNanCumprod, TensorTreeMultiply + + +def nancumprod(a, axis=None, dtype=None, out=None): + """ + Return the cumulative product of tensor elements over a given axis treating Not a + Numbers (NaNs) as one. The cumulative product does not change when NaNs are + encountered and leading NaNs are replaced by ones. + + Ones are returned for slices that are all-NaN or empty. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + Axis along which the cumulative product is computed. By default + the input is flattened. + dtype : dtype, optional + Type of the returned tensor, as well as of the accumulator in which + the elements are multiplied. If *dtype* is not specified, it + defaults to the dtype of `a`, unless `a` has an integer dtype with + a precision less than that of the default platform integer. In + that case, the default platform integer is used instead. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output + but the type of the resulting values will be cast if necessary. + + Returns + ------- + nancumprod : Tensor + A new array holding the result is returned unless `out` is + specified, in which case it is returned. + + See Also + -------- + mt.cumprod : Cumulative product across array propagating NaNs. + isnan : Show which elements are NaN. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.nancumprod(1).execute() + array([1]) + >>> mt.nancumprod([1]).execute() + array([1]) + >>> mt.nancumprod([1, mt.nan]).execute() + array([ 1., 1.]) + >>> a = mt.array([[1, 2], [3, mt.nan]]) + >>> mt.nancumprod(a).execute() + array([ 1., 2., 6., 6.]) + >>> mt.nancumprod(a, axis=0).execute() + array([[ 1., 2.], + [ 3., 2.]]) + >>> mt.nancumprod(a, axis=1).execute() + array([[ 1., 2.], + [ 3., 3.]]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.nancumprod(np.empty((1,), dtype=a.dtype)).dtype + op = TensorNanCumprod(axis=axis, dtype=dtype) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nancumsum.py b/python/xorbits/_mars/tensor/reduction/nancumsum.py new file mode 100644 index 000000000..521292629 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nancumsum.py @@ -0,0 +1,100 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..arithmetic.add import TensorTreeAdd +from ..datasource import tensor as astensor +from .core import TensorCumReduction, TensorCumReductionMixin + + +class TensorNanCumsum(TensorCumReduction, TensorCumReductionMixin): + _op_type_ = OperandDef.NANCUMSUM + _func_name = "nancumsum" + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @staticmethod + def _get_op_types(): + return TensorNanCumsum, TensorTreeAdd + + +def nancumsum(a, axis=None, dtype=None, out=None): + """ + Return the cumulative sum of tensor elements over a given axis treating Not a + Numbers (NaNs) as zero. The cumulative sum does not change when NaNs are + encountered and leading NaNs are replaced by zeros. + + Zeros are returned for slices that are all-NaN or empty. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + Axis along which the cumulative sum is computed. The default + (None) is to compute the cumsum over the flattened tensor. + dtype : dtype, optional + Type of the returned tensor and of the accumulator in which the + elements are summed. If `dtype` is not specified, it defaults + to the dtype of `a`, unless `a` has an integer dtype with a + precision less than that of the default platform integer. In + that case, the default platform integer is used. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output + but the type will be cast if necessary. See `doc.ufuncs` + (Section "Output arguments") for more details. + + Returns + ------- + nancumsum : Tensor. + A new tensor holding the result is returned unless `out` is + specified, in which it is returned. The result has the same + size as `a`, and the same shape as `a` if `axis` is not None + or `a` is a 1-d tensor. + + See Also + -------- + numpy.cumsum : Cumulative sum across tensor propagating NaNs. + isnan : Show which elements are NaN. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.nancumsum(1).execute() + array([1]) + >>> mt.nancumsum([1]).execute() + array([1]) + >>> mt.nancumsum([1, mt.nan]).execute() + array([ 1., 1.]) + >>> a = mt.array([[1, 2], [3, mt.nan]]) + >>> mt.nancumsum(a).execute() + array([ 1., 3., 6., 6.]) + >>> mt.nancumsum(a, axis=0).execute() + array([[ 1., 2.], + [ 4., 2.]]) + >>> mt.nancumsum(a, axis=1).execute() + array([[ 1., 3.], + [ 3., 3.]]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.nancumsum(np.empty((1,), dtype=a.dtype)).dtype + op = TensorNanCumsum(axis=axis, dtype=dtype) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanmax.py b/python/xorbits/_mars/tensor/reduction/nanmax.py new file mode 100644 index 000000000..4730e1e98 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanmax.py @@ -0,0 +1,123 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorNanMax(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANMAX + _func_name = "nanmax" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def nanmax(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Return the maximum of an array or maximum along an axis, ignoring any + NaNs. When all-NaN slices are encountered a ``RuntimeWarning`` is + raised and NaN is returned for that slice. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose maximum is desired. If `a` is not a + tensor, a conversion is attempted. + axis : int, optional + Axis along which the maximum is computed. The default is to compute + the maximum of the flattened tensor. + out : ndarray, optional + Alternate output array in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + + If the value is anything but the default, then + `keepdims` will be passed through to the `max` method + of sub-classes of `Tensor`. If the sub-classes methods + does not implement `keepdims` any exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + nanmax : Tensor + A tensor with the same shape as `a`, with the specified axis removed. + If `a` is a 0-d tensor, or if axis is None, a Tensor scalar is + returned. The same dtype as `a` is returned. + + See Also + -------- + nanmin : + The minimum value of a tensor along a given axis, ignoring any NaNs. + amax : + The maximum value of a tensor along a given axis, propagating any NaNs. + fmax : + Element-wise maximum of two tensors, ignoring any NaNs. + maximum : + Element-wise maximum of two tensors, propagating any NaNs. + isnan : + Shows which elements are Not a Number (NaN). + isfinite: + Shows which elements are neither NaN nor infinity. + + amin, fmin, minimum + + Notes + ----- + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + Positive infinity is treated as a very large number and negative + infinity is treated as a very small (i.e. negative) number. + + If the input has a integer type the function is equivalent to np.max. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, mt.nan]]) + >>> mt.nanmax(a).execute() + 3.0 + >>> mt.nanmax(a, axis=0).execute() + array([ 3., 2.]) + >>> mt.nanmax(a, axis=1).execute() + array([ 2., 3.]) + + When positive infinity and negative infinity are present: + + >>> mt.nanmax([1, 2, mt.nan, mt.NINF]).execute() + 2.0 + >>> mt.nanmax([1, 2, mt.nan, mt.inf]).execute() + inf + + """ + a = astensor(a) + op = TensorNanMax( + axis=axis, dtype=a.dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanmean.py b/python/xorbits/_mars/tensor/reduction/nanmean.py new file mode 100644 index 000000000..f9d5d6314 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanmean.py @@ -0,0 +1,171 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin, nannumel +from .mean import TensorMean + + +class TensorNanMean(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANMEAN + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @classmethod + def execute_map(cls, ctx, op): + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axis = cls.get_axis(op.axis) + + with device(device_id): + chunk_count = nannumel( + in_chunk, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.nansum( + in_chunk, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + ctx[op.outputs[0].key] = (chunk_sum, chunk_count) + + @classmethod + def execute_agg(cls, ctx, op): + axis = cls.get_axis(op.axis) + + a = ctx[op.inputs[0].key] + if not isinstance(a, (list, tuple)): + (inp,), device_id, xp = as_same_device( + [a], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.nanmean( + inp, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + else: + (_data, _count), device_id, xp = as_same_device( + a, device=op.device, ret_extra=True + ) + + with device(device_id): + chunk_count = xp.sum( + _count, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + _data, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + ctx[op.outputs[0].key] = xp.true_divide( + chunk_sum, chunk_count, dtype=op.dtype + ) + + @classmethod + def execute_combine(cls, ctx, op): + TensorMean.execute_combine(ctx, op) + + +def nanmean(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Compute the arithmetic mean along the specified axis, ignoring NaNs. + + Returns the average of the tensor elements. 
The average is taken over + the flattened tensor by default, otherwise over the specified axis. + `float64` intermediate and return values are used for integer inputs. + + For all-NaN slices, NaN is returned and a `RuntimeWarning` is raised. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose mean is desired. If `a` is not an + tensor, a conversion is attempted. + axis : int, optional + Axis along which the means are computed. The default is to compute + the mean of the flattened tensor. + dtype : data-type, optional + Type to use in computing the mean. For integer inputs, the default + is `float64`; for inexact inputs, it is the same as the input + dtype. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + + If the value is anything but the default, then + `keepdims` will be passed through to the `mean` or `sum` methods + of sub-classes of `Tensor`. If the sub-classes methods + does not implement `keepdims` any exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + m : Tensor, see dtype parameter above + If `out=None`, returns a new array containing the mean values, + otherwise a reference to the output array is returned. Nan is + returned for slices that contain only NaNs. + + See Also + -------- + average : Weighted average + mean : Arithmetic mean taken while not ignoring NaNs + var, nanvar + + Notes + ----- + The arithmetic mean is the sum of the non-NaN elements along the axis + divided by the number of non-NaN elements. + + Note that for floating-point input, the mean is computed using the same + precision the input has. Depending on the input data, this can cause + the results to be inaccurate, especially for `float32`. Specifying a + higher-precision accumulator using the `dtype` keyword can alleviate + this issue. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, mt.nan], [3, 4]]) + >>> mt.nanmean(a).execute() + 2.6666666666666665 + >>> mt.nanmean(a, axis=0).execute() + array([ 2., 4.]) + >>> mt.nanmean(a, axis=1).execute() + array([ 1., 3.5]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.nanmean(np.empty((1,), dtype=a.dtype)).dtype + op = TensorNanMean( + axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanmin.py b/python/xorbits/_mars/tensor/reduction/nanmin.py new file mode 100644 index 000000000..ff6572ee1 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanmin.py @@ -0,0 +1,123 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorNanMin(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANMIN + _func_name = "nanmin" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def nanmin(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Return minimum of a tensor or minimum along an axis, ignoring any NaNs. + When all-NaN slices are encountered a ``RuntimeWarning`` is raised and + Nan is returned for that slice. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose minimum is desired. If `a` is not an + tensor, a conversion is attempted. + axis : int, optional + Axis along which the minimum is computed. The default is to compute + the minimum of the flattened tensor. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + + If the value is anything but the default, then + `keepdims` will be passed through to the `min` method + of sub-classes of `Tensor`. If the sub-classes methods + does not implement `keepdims` any exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + nanmin : Tensor + An tensor with the same shape as `a`, with the specified axis + removed. If `a` is a 0-d tensor, or if axis is None, a tensor + scalar is returned. The same dtype as `a` is returned. + + See Also + -------- + nanmax : + The maximum value of an array along a given axis, ignoring any NaNs. + amin : + The minimum value of an array along a given axis, propagating any NaNs. + fmin : + Element-wise minimum of two arrays, ignoring any NaNs. + minimum : + Element-wise minimum of two arrays, propagating any NaNs. + isnan : + Shows which elements are Not a Number (NaN). + isfinite: + Shows which elements are neither NaN nor infinity. + + amax, fmax, maximum + + Notes + ----- + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + Positive infinity is treated as a very large number and negative + infinity is treated as a very small (i.e. negative) number. + + If the input has a integer type the function is equivalent to mt.min. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, mt.nan]]) + >>> mt.nanmin(a).execute() + 1.0 + >>> mt.nanmin(a, axis=0).execute() + array([ 1., 2.]) + >>> mt.nanmin(a, axis=1).execute() + array([ 1., 3.]) + + When positive infinity and negative infinity are present: + + >>> mt.nanmin([1, 2, mt.nan, mt.inf]).execute() + 1.0 + >>> mt.nanmin([1, 2, mt.nan, mt.NINF]).execute() + -inf + + """ + a = astensor(a) + op = TensorNanMin( + axis=axis, dtype=a.dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanprod.py b/python/xorbits/_mars/tensor/reduction/nanprod.py new file mode 100644 index 000000000..237ecfd97 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanprod.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorNanProd(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANPROD + _func_name = "nanprod" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def nanprod(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Return the product of array elements over a given axis treating Not a + Numbers (NaNs) as ones. + + One is returned for slices that are all-NaN or empty. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose product is desired. If `a` is not an + tensor, a conversion is attempted. + axis : int, optional + Axis along which the product is computed. The default is to compute + the product of the flattened tensor. + dtype : data-type, optional + The type of the returned tensor and of the accumulator in which the + elements are summed. By default, the dtype of `a` is used. An + exception is when `a` has an integer type with less precision than + the platform (u)intp. In that case, the default will be either + (u)int32 or (u)int64 depending on whether the platform is 32 or 64 + bits. For inexact inputs, dtype must be inexact. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``. If provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. The casting of NaN to integer can yield + unexpected results. + keepdims : bool, optional + If True, the axes which are reduced are left in the result as + dimensions with size one. With this option, the result will + broadcast correctly against the original `arr`. + combine_size: int, optional + The number of chunks to combine. 
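# Rough numpy-only sketch (chunking done by hand; a simplified view of the
# map/combine/agg reduction that Mars performs, whose real implementation
# lives in the reduction core) of why a chunked, combine_size-driven tree
# reduction is valid for nanprod: per-chunk partial products can simply be
# multiplied again to give the global result.
import numpy as np

data = np.array([1.0, 2.0, np.nan, 3.0, np.nan, 4.0])
chunks = np.array_split(data, 3)            # stand-in for Mars chunking
partials = [np.nanprod(c) for c in chunks]  # "map" stage
combined = np.prod(partials)                # "combine"/"agg" stage
assert combined == np.nanprod(data) == 24.0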
+ + Returns + ------- + nanprod : Tensor + A new tensor holding the result is returned unless `out` is + specified, in which case it is returned. + + See Also + -------- + mt.prod : Product across array propagating NaNs. + isnan : Show which elements are NaN. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.nanprod(1).execute() + 1 + >>> mt.nanprod([1]).execute() + 1 + >>> mt.nanprod([1, mt.nan]).execute() + 1.0 + >>> a = mt.array([[1, 2], [3, mt.nan]]) + >>> mt.nanprod(a).execute() + 6.0 + >>> mt.nanprod(a, axis=0).execute() + array([ 3., 2.]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.nanprod(np.empty((1,), dtype=a.dtype)).dtype + op = TensorNanProd( + axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanstd.py b/python/xorbits/_mars/tensor/reduction/nanstd.py new file mode 100644 index 000000000..f5828b1d0 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanstd.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..arithmetic.sqrt import sqrt +from .nanvar import nanvar + + +def nanstd( + a, axis=None, dtype=None, out=None, ddof=0, keepdims=None, combine_size=None +): + """ + Compute the standard deviation along the specified axis, while + ignoring NaNs. + + Returns the standard deviation, a measure of the spread of a + distribution, of the non-NaN tensor elements. The standard deviation is + computed for the flattened tensor by default, otherwise over the + specified axis. + + For all-NaN slices or slices with zero degrees of freedom, NaN is + returned and a `RuntimeWarning` is raised. + + Parameters + ---------- + a : array_like + Calculate the standard deviation of the non-NaN values. + axis : int, optional + Axis along which the standard deviation is computed. The default is + to compute the standard deviation of the flattened tensor. + dtype : dtype, optional + Type to use in computing the standard deviation. For tensors of + integer type the default is float64, for tensors of float types it + is the same as the tensor type. + out : Tensor, optional + Alternative output tensor in which to place the result. It must have + the same shape as the expected output but the type (of the + calculated values) will be cast if necessary. + ddof : int, optional + Means Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of non-NaN + elements. By default `ddof` is zero. + + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + + If this value is anything but the default it is passed through + as-is to the relevant functions of the sub-classes. If these + functions do not have a `keepdims` kwarg, a RuntimeError will + be raised. 
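# Small numpy-only check of the relationship the nanstd body below builds on:
# nanstd is just the square root of nanvar, including the ddof handling, so
# delegating to nanvar and wrapping the result in sqrt is sufficient.
import numpy as np

a = np.array([[1.0, np.nan], [3.0, 4.0]])
for ddof in (0, 1):
    assert np.isclose(np.nanstd(a, ddof=ddof), np.sqrt(np.nanvar(a, ddof=ddof)))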
+ combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + standard_deviation : ndarray, see dtype parameter above. + If `out` is None, return a new array containing the standard + deviation, otherwise return a reference to the output tensor. If + ddof is >= the number of non-NaN elements in a slice or the slice + contains only NaNs, then the result for that slice is NaN. + + See Also + -------- + var, mean, std + nanvar, nanmean + + Notes + ----- + The standard deviation is the square root of the average of the squared + deviations from the mean: ``std = sqrt(mean(abs(x - x.mean())**2))``. + + The average squared deviation is normally calculated as + ``x.sum() / N``, where ``N = len(x)``. If, however, `ddof` is + specified, the divisor ``N - ddof`` is used instead. In standard + statistical practice, ``ddof=1`` provides an unbiased estimator of the + variance of the infinite population. ``ddof=0`` provides a maximum + likelihood estimate of the variance for normally distributed variables. + The standard deviation computed in this function is the square root of + the estimated variance, so even with ``ddof=1``, it will not be an + unbiased estimate of the standard deviation per se. + + Note that, for complex numbers, `std` takes the absolute value before + squaring, so that the result is always real and nonnegative. + + For floating-point input, the *std* is computed using the same + precision the input has. Depending on the input data, this can cause + the results to be inaccurate, especially for float32 (see example + below). Specifying a higher-accuracy accumulator using the `dtype` + keyword can alleviate this issue. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, mt.nan], [3, 4]]) + >>> mt.nanstd(a).execute() + 1.247219128924647 + >>> mt.nanstd(a, axis=0).execute() + array([ 1., 0.]) + >>> mt.nanstd(a, axis=1).execute() + array([ 0., 0.5]) + + """ + ret = sqrt( + nanvar( + a, + axis=axis, + dtype=dtype, + out=out, + ddof=ddof, + keepdims=keepdims, + combine_size=combine_size, + ) + ) + if dtype is not None and ret.dtype != dtype: + ret = ret.astype(dtype) + return ret diff --git a/python/xorbits/_mars/tensor/reduction/nansum.py b/python/xorbits/_mars/tensor/reduction/nansum.py new file mode 100644 index 000000000..32bf13a3b --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nansum.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorNanSum(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANSUM + _func_name = "nansum" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def nansum(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Return the sum of array elements over a given axis treating Not a + Numbers (NaNs) as zero. + + Zero is returned for slices that are all-NaN or + empty. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose sum is desired. If `a` is not an + tensor, a conversion is attempted. + axis : int, optional + Axis along which the sum is computed. The default is to compute the + sum of the flattened array. + dtype : data-type, optional + The type of the returned tensor and of the accumulator in which the + elements are summed. By default, the dtype of `a` is used. An + exception is when `a` has an integer type with less precision than + the platform (u)intp. In that case, the default will be either + (u)int32 or (u)int64 depending on whether the platform is 32 or 64 + bits. For inexact inputs, dtype must be inexact. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``. If provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. The casting of NaN to integer can yield + unexpected results. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + + + If the value is anything but the default, then + `keepdims` will be passed through to the `mean` or `sum` methods + of sub-classes of `Tensor`. If the sub-classes methods + does not implement `keepdims` any exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + nansum : Tensor. + A new tensor holding the result is returned unless `out` is + specified, in which it is returned. The result has the same + size as `a`, and the same shape as `a` if `axis` is not None + or `a` is a 1-d array. + + See Also + -------- + mt.sum : Sum across tensor propagating NaNs. + isnan : Show which elements are NaN. + isfinite: Show which elements are not NaN or +/-inf. + + Notes + ----- + If both positive and negative infinity are present, the sum will be Not + A Number (NaN). 
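# Numpy-only illustration of the Notes above: NaNs are treated as zero, a
# single infinity propagates, and opposite infinities give inf - inf = NaN.
import numpy as np

assert np.nansum([1.0, np.nan]) == 1.0
assert np.nansum([1.0, np.nan, np.inf]) == np.inf
assert np.isnan(np.nansum([1.0, np.nan, np.inf, -np.inf]))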
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.nansum(1).execute() + 1 + >>> mt.nansum([1]).execute() + 1 + >>> mt.nansum([1, mt.nan]).execute() + 1.0 + >>> a = mt.array([[1, 1], [1, mt.nan]]) + >>> mt.nansum(a).execute() + 3.0 + >>> mt.nansum(a, axis=0).execute() + array([ 2., 1.]) + >>> mt.nansum([1, mt.nan, mt.inf]).execute() + inf + >>> mt.nansum([1, mt.nan, mt.NINF]).execute() + -inf + >>> mt.nansum([1, mt.nan, mt.inf, -mt.inf]).execute() # both +/- infinity present + nan + + """ + a = astensor(a) + if dtype is None: + dtype = np.nansum(np.empty((1,), dtype=a.dtype)).dtype + op = TensorNanSum( + axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanvar.py b/python/xorbits/_mars/tensor/reduction/nanvar.py new file mode 100644 index 000000000..77e8a879c --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanvar.py @@ -0,0 +1,294 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import Int32Field +from ..array_utils import as_same_device, device, get_array_module +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin, nannumel +from .var import reduce_var_square + + +class TensorNanMoment(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANMOMENT + + _moment = Int32Field("moment", default=2) + _ddof = Int32Field("ddof") + + def __init__( + self, + axis=None, + keepdims=None, + moment=None, + ddof=None, + combine_size=None, + stage=None, + **kw + ): + stage = self._rewrite_stage(stage) + if moment is not None: + kw["_moment"] = moment + super().__init__( + _axis=axis, + _keepdims=keepdims, + _ddof=ddof, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @property + def moment(self): + return getattr(self, "_moment", 2) + + @property + def ddof(self): + return self._ddof + + @classmethod + def execute_agg(cls, ctx, op): + axis = cls.get_axis(op.axis) + dtype = op.dtype + + (_data, _count, _var_square), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + + with device(device_id): + chunk_count = xp.nansum(_count, axis=axis, dtype=np.int64, keepdims=True) + chunk_sum = xp.nansum(_data, axis=axis, dtype=dtype, keepdims=True) + avg = xp.true_divide(chunk_sum, chunk_count, dtype=dtype) + avg_diff = xp.true_divide(_data, _count, dtype=dtype) - avg + var_square = reduce_var_square( + _var_square, avg_diff, _count, op, axis, xp.nansum + ) + + ctx[op.outputs[0].key] = xp.true_divide( + var_square, + xp.nansum( + chunk_count, axis=axis, dtype=dtype, keepdims=bool(op.keepdims) + ) + - op.ddof, + dtype=dtype, + ) + + @classmethod + def execute_map(cls, ctx, op): + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axis = cls.get_axis(op.axis) + moment = op.moment + 
dtype = op.dtype + empty = get_array_module(in_chunk, nosparse=True).empty + + with device(device_id): + chunk_count = nannumel( + in_chunk, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.nansum( + in_chunk, axis=axis, dtype=dtype, keepdims=bool(op.keepdims) + ) + avg = xp.true_divide(chunk_sum, chunk_count) + var_square = empty(chunk_count.shape + (moment - 1,), dtype=dtype) + for i in range(2, moment + 1): + var_square[..., i - 2] = xp.nansum( + (in_chunk - avg) ** i, + axis=axis, + dtype=dtype, + keepdims=bool(op.keepdims), + ) + ctx[op.outputs[0].key] = (chunk_sum, chunk_count, var_square) + + @classmethod + def execute_combine(cls, ctx, op): + axis = cls.get_axis(op.axis) + moment = op.moment + dtype = op.dtype + + (_data, _count, _var_square), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + empty = get_array_module(_data, nosparse=True).empty + + with device(device_id): + chunk_count = xp.nansum( + _count, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.nansum( + _data, axis=axis, dtype=dtype, keepdims=bool(op.keepdims) + ) + avg = xp.true_divide(chunk_sum, chunk_count, dtype=dtype) + avg_diff = xp.true_divide(_data, _count, dtype=dtype) - avg + var_square = empty(chunk_count.shape + (moment - 1,), dtype=dtype) + + for m in range(2, moment + 1): + var_square[..., m - 2] = reduce_var_square( + _var_square, avg_diff, _count, op, axis, xp.nansum + ) + + ctx[op.outputs[0].key] = (chunk_sum, chunk_count, var_square) + + +class TensorNanVar(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANVAR + + _ddof = Int32Field("ddof") + + def __new__(cls, *args, **kwargs): + if kwargs.get("stage") is not None: + return TensorNanMoment(*args, **kwargs) + return super().__new__(cls) + + def __init__( + self, axis=None, dtype=None, keepdims=None, ddof=0, combine_size=None, **kw + ): + super().__init__( + _axis=axis, + dtype=dtype, + _keepdims=keepdims, + _ddof=ddof, + _combine_size=combine_size, + **kw + ) + + @property + def ddof(self): + return self._ddof + + def _get_op_kw(self): + kw = dict() + kw["ddof"] = self.ddof + return kw + + @classmethod + def execute(cls, ctx, op): + axis = cls.get_axis(op.axis) + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.nanvar( + in_chunk, + axis=axis, + dtype=op.dtype, + ddof=op.ddof, + keepdims=bool(op.keepdims), + ) + + +def nanvar( + a, axis=None, dtype=None, out=None, ddof=0, keepdims=None, combine_size=None +): + """ + Compute the variance along the specified axis, while ignoring NaNs. + + Returns the variance of the tensor elements, a measure of the spread of + a distribution. The variance is computed for the flattened tensor by + default, otherwise over the specified axis. + + For all-NaN slices or slices with zero degrees of freedom, NaN is + returned and a `RuntimeWarning` is raised. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose variance is desired. If `a` is not a + tensor, a conversion is attempted. + axis : int, optional + Axis along which the variance is computed. The default is to compute + the variance of the flattened array. + dtype : data-type, optional + Type to use in computing the variance. For tensors of integer type + the default is `float32`; for tensors of float types it is the same as + the tensor type. 
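# Hand-rolled, numpy-only sketch of the parallel-variance merge that the
# map/combine/agg stages above implement in a more general (NaN-aware,
# higher-moment) form. The helper names below are illustrative only: each
# chunk contributes (count, sum, sum of squared deviations from its own
# mean), and those partials can be merged exactly into the global variance.
import numpy as np

def chunk_stats(x):
    m = x.mean()
    return x.size, x.sum(), ((x - m) ** 2).sum()

def merge_variance(stats, ddof=0):
    counts = np.array([n for n, _, _ in stats], dtype=float)
    sums = np.array([s for _, s, _ in stats])
    m2s = np.array([m2 for _, _, m2 in stats])
    total_n, total_sum = counts.sum(), sums.sum()
    grand_mean = total_sum / total_n
    chunk_means = sums / counts
    # per-chunk M2 plus the between-chunk correction term
    total_m2 = m2s.sum() + (counts * (chunk_means - grand_mean) ** 2).sum()
    return total_m2 / (total_n - ddof)

data = np.random.rand(1000)
chunks = np.array_split(data, 7)
assert np.isclose(
    merge_variance([chunk_stats(c) for c in chunks], ddof=1), data.var(ddof=1)
)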
+ out : Tensor, optional + Alternate output tensor in which to place the result. It must have + the same shape as the expected output, but the type is cast if + necessary. + ddof : int, optional + "Delta Degrees of Freedom": the divisor used in the calculation is + ``N - ddof``, where ``N`` represents the number of non-NaN + elements. By default `ddof` is zero. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + combine_size: int, optional + The number of chunks to combine. + + + Returns + ------- + variance : Tensor, see dtype parameter above + If `out` is None, return a new tensor containing the variance, + otherwise return a reference to the output tensor. If ddof is >= the + number of non-NaN elements in a slice or the slice contains only + NaNs, then the result for that slice is NaN. + + See Also + -------- + std : Standard deviation + mean : Average + var : Variance while not ignoring NaNs + nanstd, nanmean + + Notes + ----- + The variance is the average of the squared deviations from the mean, + i.e., ``var = mean(abs(x - x.mean())**2)``. + + The mean is normally calculated as ``x.sum() / N``, where ``N = len(x)``. + If, however, `ddof` is specified, the divisor ``N - ddof`` is used + instead. In standard statistical practice, ``ddof=1`` provides an + unbiased estimator of the variance of a hypothetical infinite + population. ``ddof=0`` provides a maximum likelihood estimate of the + variance for normally distributed variables. + + Note that for complex numbers, the absolute value is taken before + squaring, so that the result is always real and nonnegative. + + For floating-point input, the variance is computed using the same + precision the input has. Depending on the input data, this can cause + the results to be inaccurate, especially for `float32` (see example + below). Specifying a higher-accuracy accumulator using the ``dtype`` + keyword can alleviate this issue. + + For this function to work on sub-classes of Tensor, they must define + `sum` with the kwarg `keepdims` + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, mt.nan], [3, 4]]) + >>> mt.nanvar(a).execute() + 1.5555555555555554 + >>> mt.nanvar(a, axis=0).execute() + array([ 1., 0.]) + >>> mt.nanvar(a, axis=1).execute() + array([ 0., 0.25]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.nanvar(np.ones((1,), dtype=a.dtype)).dtype + op = TensorNanVar( + axis=axis, dtype=dtype, keepdims=keepdims, ddof=ddof, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/prod.py b/python/xorbits/_mars/tensor/reduction/prod.py new file mode 100644 index 000000000..eeb692807 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/prod.py @@ -0,0 +1,142 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorProd(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.PROD + _func_name = "prod" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def prod(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Return the product of tensor elements over a given axis. + + Parameters + ---------- + a : array_like + Input data. + axis : None or int or tuple of ints, optional + Axis or axes along which a product is performed. The default, + axis=None, will calculate the product of all the elements in the + input tensor. If axis is negative it counts from the last to the + first axis. + + If axis is a tuple of ints, a product is performed on all of the + axes specified in the tuple instead of a single axis or all the + axes as before. + dtype : dtype, optional + The type of the returned tensor, as well as of the accumulator in + which the elements are multiplied. The dtype of `a` is used by + default unless `a` has an integer dtype of less precision than the + default platform integer. In that case, if `a` is signed then the + platform integer is used while if `a` is unsigned then an unsigned + integer of the same precision as the platform integer is used. + out : Tensor, optional + Alternative output tensor in which to place the result. It must have + the same shape as the expected output, but the type of the output + values will be cast if necessary. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in the + result as dimensions with size one. With this option, the result + will broadcast correctly against the input array. + + If the default value is passed, then `keepdims` will not be + passed through to the `prod` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + product_along_axis : Tensor, see `dtype` parameter above. + An tensor shaped as `a` but with the specified axis removed. + Returns a reference to `out` if specified. + + See Also + -------- + Tensor.prod : equivalent method + + Notes + ----- + Arithmetic is modular when using integer types, and no error is + raised on overflow. 
That means that, on a 32-bit platform: + + >>> import mars.tensor as mt + + >>> x = mt.array([536870910, 536870910, 536870910, 536870910]) + >>> mt.prod(x).execute() # random + 16 + + The product of an empty array is the neutral element 1: + + >>> mt.prod([]).execute() + 1.0 + + Examples + -------- + By default, calculate the product of all elements: + + >>> mt.prod([1.,2.]).execute() + 2.0 + + Even when the input array is two-dimensional: + + >>> mt.prod([[1.,2.],[3.,4.]]).execute() + 24.0 + + But we can also specify the axis over which to multiply: + + >>> mt.prod([[1.,2.],[3.,4.]], axis=1).execute() + array([ 2., 12.]) + + If the type of `x` is unsigned, then the output type is + the unsigned platform integer: + + >>> x = mt.array([1, 2, 3], dtype=mt.uint8) + >>> mt.prod(x).dtype == mt.uint + True + + If `x` is of a signed integer type, then the output type + is the default platform integer: + + >>> x = mt.array([1, 2, 3], dtype=mt.int8) + >>> mt.prod(x).dtype == int + True + + """ + a = astensor(a) + if dtype is None: + dtype = np.empty((1,), dtype=a.dtype).prod().dtype + op = TensorProd( + axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/std.py b/python/xorbits/_mars/tensor/reduction/std.py new file mode 100644 index 000000000..10dc803a3 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/std.py @@ -0,0 +1,135 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..arithmetic.sqrt import sqrt +from .var import var + + +def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=None, combine_size=None): + """ + Compute the standard deviation along the specified axis. + + Returns the standard deviation, a measure of the spread of a distribution, + of the tensor elements. The standard deviation is computed for the + flattened tensor by default, otherwise over the specified axis. + + Parameters + ---------- + a : array_like + Calculate the standard deviation of these values. + axis : None or int or tuple of ints, optional + Axis or axes along which the standard deviation is computed. The + default is to compute the standard deviation of the flattened tensor. + + If this is a tuple of ints, a standard deviation is performed over + multiple axes, instead of a single axis or all the axes as before. + dtype : dtype, optional + Type to use in computing the standard deviation. For tensors of + integer type the default is float64, for tensors of float types it is + the same as the array type. + out : Tensor, optional + Alternative output tensor in which to place the result. It must have + the same shape as the expected output but the type (of the calculated + values) will be cast if necessary. + ddof : int, optional + Means Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + By default `ddof` is zero. 
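# Numpy-only illustration of the ddof semantics just described (variable
# names are local to this sketch): the divisor is N - ddof, so ddof=1 gives
# the Bessel-corrected sample estimate.
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
n = x.size
m2 = ((x - x.mean()) ** 2).sum()
assert np.isclose(np.std(x, ddof=0), np.sqrt(m2 / n))
assert np.isclose(np.std(x, ddof=1), np.sqrt(m2 / (n - 1)))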
+ keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `std` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + standard_deviation : Tensor, see dtype parameter above. + If `out` is None, return a new tensor containing the standard deviation, + otherwise return a reference to the output array. + + See Also + -------- + var, mean, nanmean, nanstd, nanvar + + Notes + ----- + The standard deviation is the square root of the average of the squared + deviations from the mean, i.e., ``std = sqrt(mean(abs(x - x.mean())**2))``. + + The average squared deviation is normally calculated as + ``x.sum() / N``, where ``N = len(x)``. If, however, `ddof` is specified, + the divisor ``N - ddof`` is used instead. In standard statistical + practice, ``ddof=1`` provides an unbiased estimator of the variance + of the infinite population. ``ddof=0`` provides a maximum likelihood + estimate of the variance for normally distributed variables. The + standard deviation computed in this function is the square root of + the estimated variance, so even with ``ddof=1``, it will not be an + unbiased estimate of the standard deviation per se. + + Note that, for complex numbers, `std` takes the absolute + value before squaring, so that the result is always real and nonnegative. + + For floating-point input, the *std* is computed using the same + precision the input has. Depending on the input data, this can cause + the results to be inaccurate, especially for float32 (see example below). + Specifying a higher-accuracy accumulator using the `dtype` keyword can + alleviate this issue. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4]]) + >>> mt.std(a).execute() + 1.1180339887498949 + >>> mt.std(a, axis=0).execute() + array([ 1., 1.]) + >>> mt.std(a, axis=1).execute() + array([ 0.5, 0.5]) + + In single precision, std() can be inaccurate: + + >>> a = mt.zeros((2, 512*512), dtype=mt.float32) + >>> a[0, :] = 1.0 + >>> a[1, :] = 0.1 + >>> mt.std(a).execute() + 0.45000005 + + Computing the standard deviation in float64 is more accurate: + + >>> mt.std(a, dtype=mt.float64).execute() + 0.44999999925494177 + + """ + ret = sqrt( + var( + a, + axis=axis, + dtype=dtype, + out=out, + ddof=ddof, + keepdims=keepdims, + combine_size=combine_size, + ) + ) + if dtype is not None and ret.dtype != dtype: + ret = ret.astype(dtype) + return ret diff --git a/python/xorbits/_mars/tensor/reduction/sum.py b/python/xorbits/_mars/tensor/reduction/sum.py new file mode 100644 index 000000000..d50e6b584 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/sum.py @@ -0,0 +1,135 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
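# NumPy-only version of the float32 precision caveat shown in the std()
# examples above (the printed digits are illustrative and may vary slightly
# by platform): accumulating in float32 drifts, while requesting a float64
# accumulator stays close to the true value of 0.45.
import numpy as np

a = np.zeros((2, 512 * 512), dtype=np.float32)
a[0, :] = 1.0
a[1, :] = 0.1
print(np.std(a))                    # roughly 0.45000005
print(np.std(a, dtype=np.float64))  # roughly 0.4499999992549418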
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorSum(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.SUM + _func_name = "sum" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def sum(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Sum of tensor elements over a given axis. + + Parameters + ---------- + a : array_like + Elements to sum. + axis : None or int or tuple of ints, optional + Axis or axes along which a sum is performed. The default, + axis=None, will sum all of the elements of the input tensor. If + axis is negative it counts from the last to the first axis. + + If axis is a tuple of ints, a sum is performed on all of the axes + specified in the tuple instead of a single axis or all the axes as + before. + dtype : dtype, optional + The type of the returned tensor and of the accumulator in which the + elements are summed. The dtype of `a` is used by default unless `a` + has an integer dtype of less precision than the default platform + integer. In that case, if `a` is signed then the platform integer + is used while if `a` is unsigned then an unsigned integer of the + same precision as the platform integer is used. + out : Tensor, optional + Alternative output tensor in which to place the result. It must have + the same shape as the expected output, but the type of the output + values will be cast if necessary. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `sum` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + sum_along_axis : Tensor + An array with the same shape as `a`, with the specified + axis removed. If `a` is a 0-d tensor, or if `axis` is None, a scalar + is returned. If an output array is specified, a reference to + `out` is returned. + + See Also + -------- + Tensor.sum : Equivalent method. + + cumsum : Cumulative sum of tensor elements. + + trapz : Integration of tensor values using the composite trapezoidal rule. + + mean, average + + Notes + ----- + Arithmetic is modular when using integer types, and no error is + raised on overflow. 
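# Numpy-only look at the default accumulator behaviour described above: small
# integer dtypes are promoted to the platform integer, so wrap-around only
# bites when a narrow dtype is requested explicitly (compare the int8 example
# further below). The exact promoted dtype is typically int64, but can be
# int32 on some platforms.
import numpy as np

small = np.ones(128, dtype=np.int8)
print(small.sum().dtype)         # platform integer, usually int64
print(small.sum(dtype=np.int8))  # -128: the 8-bit accumulator wraps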
+ + The sum of an empty array is the neutral element 0: + + >>> import mars.tensor as mt + + >>> mt.sum([]).execute() + 0.0 + + Examples + -------- + >>> mt.sum([0.5, 1.5]).execute() + 2.0 + >>> mt.sum([0.5, 0.7, 0.2, 1.5], dtype=mt.int32).execute() + 1 + >>> mt.sum([[0, 1], [0, 5]]).execute() + 6 + >>> mt.sum([[0, 1], [0, 5]], axis=0).execute() + array([0, 6]) + >>> mt.sum([[0, 1], [0, 5]], axis=1).execute() + array([1, 5]) + + If the accumulator is too small, overflow occurs: + + >>> mt.ones(128, dtype=mt.int8).sum(dtype=mt.int8).execute() + -128 + + """ + a = astensor(a) + if dtype is None: + if a.dtype == object: + dtype = a.dtype + else: + dtype = np.empty((1,), dtype=a.dtype).sum().dtype + else: + dtype = np.dtype(dtype) + op = TensorSum(axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/tests/__init__.py b/python/xorbits/_mars/tensor/reduction/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/reduction/tests/test_reduction.py b/python/xorbits/_mars/tensor/reduction/tests/test_reduction.py new file mode 100644 index 000000000..85133206c --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/tests/test_reduction.py @@ -0,0 +1,211 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ....core import tile +from ....core.operand import OperandStage +from ...datasource import ones, tensor +from ...merge import TensorConcatenate +from .. 
import TensorArgmax, TensorArgmin, TensorMean, all + + +def test_base_reduction(): + sum = lambda x, *args, **kwargs: tile(x.sum(*args, **kwargs)) + prod = lambda x, *args, **kwargs: tile(x.prod(*args, **kwargs)) + max = lambda x, *args, **kwargs: tile(x.max(*args, **kwargs)) + min = lambda x, *args, **kwargs: tile(x.min(*args, **kwargs)) + all = lambda x, *args, **kwargs: tile(x.all(*args, **kwargs)) + any = lambda x, *args, **kwargs: tile(x.any(*args, **kwargs)) + + for f in [sum, prod, max, min, all, any]: + res = f(ones((8, 8), chunk_size=8)) + assert res.shape == () + + res = f(ones((10, 8), chunk_size=3)) + assert res.dtype is not None + assert res.shape == () + + res = f(ones((10, 8), chunk_size=3), axis=0) + assert res.shape == (8,) + + res = f(ones((10, 8), chunk_size=3), axis=1) + assert res.shape == (10,) + + with pytest.raises(np.AxisError): + f(ones((10, 8), chunk_size=3), axis=2) + + res = f(ones((10, 8), chunk_size=3), axis=-1) + assert res.shape == (10,) + + with pytest.raises(np.AxisError): + f(ones((10, 8), chunk_size=3), axis=-3) + + res = f(ones((10, 8), chunk_size=3), keepdims=True) + assert res.shape == (1, 1) + + res = f(ones((10, 8), chunk_size=3), axis=0, keepdims=True) + assert res.shape == (1, 8) + + res = f(ones((10, 8), chunk_size=3), axis=1, keepdims=True) + assert res.shape == (10, 1) + + res = f(ones((10, 8, 10), chunk_size=3), axis=1) + assert res.shape == (10, 10) + + res = f(ones((10, 8, 10), chunk_size=3), axis=1, keepdims=True) + assert res.shape == (10, 1, 10) + + res = f(ones((10, 8, 10), chunk_size=3), axis=(0, 2)) + assert res.shape == (8,) + + res = f(ones((10, 8, 10), chunk_size=3), axis=(0, 2), keepdims=True) + assert res.shape == (1, 8, 1) + + +def test_mean_reduction(): + mean = lambda x, *args, **kwargs: tile(x.mean(*args, **kwargs)) + + res = mean(ones((10, 8), chunk_size=3)) + assert res.shape == () + assert res.dtype is not None + assert isinstance(res.chunks[0].op, TensorMean) + assert isinstance(res.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res.chunks[0].inputs[0].inputs[0].op, TensorMean) + assert res.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.combine + + res = mean(ones((8, 8), chunk_size=8)) + assert res.shape == () + + res = mean(ones((10, 8), chunk_size=3), axis=0) + assert res.shape == (8,) + + res = mean(ones((10, 8), chunk_size=3), axis=1) + assert res.shape == (10,) + + with pytest.raises(np.AxisError): + mean(ones((10, 8), chunk_size=3), axis=2) + + res = mean(ones((10, 8), chunk_size=3), axis=-1) + assert res.shape == (10,) + + with pytest.raises(np.AxisError): + mean(ones((10, 8), chunk_size=3), axis=-3) + + res = mean(ones((10, 8), chunk_size=3), keepdims=True) + assert res.shape == (1, 1) + + res = mean(ones((10, 8), chunk_size=3), axis=0, keepdims=True) + assert res.shape == (1, 8) + + res = mean(ones((10, 8), chunk_size=3), axis=1, keepdims=True) + assert res.shape == (10, 1) + assert isinstance(res.chunks[0].op, TensorMean) + assert res.chunks[0].op.stage == OperandStage.agg + assert isinstance(res.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res.chunks[0].inputs[0].inputs[0].op, TensorMean) + assert res.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.map + + +def test_arg_reduction(): + argmax = lambda x, *args, **kwargs: tile(x.argmax(*args, **kwargs)) + argmin = lambda x, *args, **kwargs: tile(x.argmin(*args, **kwargs)) + + res1 = argmax(ones((10, 8, 10), chunk_size=3)) + res2 = argmin(ones((10, 8, 10), chunk_size=3)) + assert res1.shape == () + assert 
res1.dtype is not None + assert res2.shape == () + assert isinstance(res1.chunks[0].op, TensorArgmax) + assert res1.chunks[0].op.stage == OperandStage.agg + assert isinstance(res2.chunks[0].op, TensorArgmin) + assert res2.chunks[0].op.stage == OperandStage.agg + assert isinstance(res1.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res2.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res1.chunks[0].inputs[0].inputs[0].op, TensorArgmax) + assert res1.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.combine + assert isinstance(res2.chunks[0].inputs[0].inputs[0].op, TensorArgmin) + assert res2.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.combine + + res1 = argmax(ones((10, 8), chunk_size=3), axis=1) + res2 = argmin(ones((10, 8), chunk_size=3), axis=1) + assert res1.shape == (10,) + assert res2.shape == (10,) + assert isinstance(res1.chunks[0].op, TensorArgmax) + assert res1.chunks[0].op.stage == OperandStage.agg + assert isinstance(res2.chunks[0].op, TensorArgmin) + assert res2.chunks[0].op.stage == OperandStage.agg + assert isinstance(res1.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res2.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res1.chunks[0].inputs[0].inputs[0].op, TensorArgmax) + assert res1.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.map + assert isinstance(res2.chunks[0].inputs[0].inputs[0].op, TensorArgmin) + assert res2.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.map + + pytest.raises( + TypeError, lambda: argmax(ones((10, 8, 10), chunk_size=3), axis=(0, 1)) + ) + pytest.raises( + TypeError, lambda: argmin(ones((10, 8, 10), chunk_size=3), axis=(0, 1)) + ) + pytest.raises(np.AxisError, lambda: argmin(ones((10, 8, 10), chunk_size=3), axis=3)) + pytest.raises( + np.AxisError, lambda: argmin(ones((10, 8, 10), chunk_size=3), axis=-4) + ) + + +def test_cum_reduction(): + cumsum = lambda x, *args, **kwargs: tile(x.cumsum(*args, **kwargs)) + cumprod = lambda x, *args, **kwargs: tile(x.cumprod(*args, **kwargs)) + + res1 = cumsum(ones((10, 8), chunk_size=3), axis=0) + res2 = cumprod(ones((10, 8), chunk_size=3), axis=0) + assert res1.shape == (10, 8) + assert res1.dtype is not None + assert res2.shape == (10, 8) + assert res2.dtype is not None + + res1 = cumsum(ones((10, 8, 8), chunk_size=3), axis=1) + res2 = cumprod(ones((10, 8, 8), chunk_size=3), axis=1) + assert res1.shape == (10, 8, 8) + assert res2.shape == (10, 8, 8) + + res1 = cumsum(ones((10, 8, 8), chunk_size=3), axis=-2) + res2 = cumprod(ones((10, 8, 8), chunk_size=3), axis=-2) + assert res1.shape == (10, 8, 8) + assert res2.shape == (10, 8, 8) + + with pytest.raises(np.AxisError): + cumsum(ones((10, 8), chunk_size=3), axis=2) + with pytest.raises(np.AxisError): + cumsum(ones((10, 8), chunk_size=3), axis=-3) + + +def test_all_reduction(): + o = tensor([False]) + + with pytest.raises(ValueError): + all([-1, 4, 5], out=o) + + +def test_var_reduction(): + var = lambda x, *args, **kwargs: tile(x.var(*args, **kwargs)) + + res1 = var(ones((10, 8), chunk_size=3), ddof=2) + assert res1.shape == () + assert res1.op.ddof == 2 + + res1 = var(ones((10, 8, 8), chunk_size=3), axis=1) + assert res1.shape == (10, 8) diff --git a/python/xorbits/_mars/tensor/reduction/tests/test_reduction_execution.py b/python/xorbits/_mars/tensor/reduction/tests/test_reduction_execution.py new file mode 100644 index 000000000..4d5924618 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/tests/test_reduction_execution.py @@ -0,0 +1,662 @@ +# Copyright 
2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from ....utils import ignore_warning +from ...datasource import ones, tensor +from .. import ( + allclose, + array_equal, + count_nonzero, + mean, + nanargmax, + nanargmin, + nancumprod, + nancumsum, + nanmax, + nanmean, + nanmin, + nanprod, + nanstd, + nansum, + nanvar, + std, + var, +) + + +def test_sum_prod_execution(setup): + arr = ones((10, 8), chunk_size=6) + assert 80 == arr.sum().execute().fetch() + np.testing.assert_array_equal( + arr.sum(axis=0).execute().fetch(), np.full((8,), fill_value=10) + ) + + arr = ones((3, 3), chunk_size=2) + assert 512 == (arr * 2).prod().execute().fetch() + np.testing.assert_array_equal( + (arr * 2).prod(axis=0).execute().fetch(), np.full((3,), fill_value=8) + ) + + raw = sps.random(10, 20, density=0.1) + arr = tensor(raw, chunk_size=3) + res = arr.sum().execute().fetch() + + assert pytest.approx(res) == raw.sum() + + # test order + raw = np.asfortranarray(np.random.rand(10, 20, 30)) + arr = tensor(raw, chunk_size=13) + arr2 = arr.sum(axis=-1) + + res = arr2.execute().fetch() + expected = raw.sum(axis=-1) + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + # test string dtype + a = tensor(list("abcdefghi"), dtype=object) + assert a.sum().execute().fetch() == "abcdefghi" + a = tensor(list("abcdefghi"), dtype=object, chunk_size=2) + assert a.sum().execute().fetch() == "abcdefghi" + + +def test_max_min_execution(setup): + raw = np.random.randint(10000, size=(10, 10, 10)) + + arr = tensor(raw, chunk_size=3) + + assert raw.max() == arr.max().execute().fetch() + assert raw.min() == arr.min().execute().fetch() + + np.testing.assert_array_equal(raw.max(axis=0), arr.max(axis=0).execute().fetch()) + assert arr.max(axis=0).issparse() is False + np.testing.assert_array_equal(raw.min(axis=0), arr.min(axis=0).execute().fetch()) + assert arr.min(axis=0).issparse() is False + + np.testing.assert_array_equal( + raw.max(axis=(1, 2)), arr.max(axis=(1, 2)).execute().fetch() + ) + np.testing.assert_array_equal( + raw.min(axis=(1, 2)), arr.min(axis=(1, 2)).execute().fetch() + ) + + raw = sps.random(10, 10, density=0.5) + + arr = tensor(raw, chunk_size=3) + + assert raw.max() == arr.max().execute().fetch() + assert raw.min() == arr.min().execute().fetch() + + np.testing.assert_almost_equal( + raw.max(axis=1).A.ravel(), arr.max(axis=1).execute().fetch().toarray() + ) + assert arr.max(axis=1).issparse() is True + np.testing.assert_almost_equal( + raw.min(axis=1).A.ravel(), arr.min(axis=1).execute().fetch().toarray() + ) + assert arr.min(axis=1).issparse() is True + + # test string dtype + a = tensor(list("abcdefghi"), dtype=object) + assert a.max().execute().fetch() == "i" + a = tensor(list("abcdefghi"), dtype=object, chunk_size=2) + assert a.max().execute().fetch() == "i" + + # test empty chunks + raw = 
np.arange(3, 10) + arr = tensor(np.arange(0, 10), chunk_size=3) + arr = arr[arr >= 3] + assert raw.max() == arr.max().execute().fetch() + assert raw.min() == arr.min().execute().fetch() + + +def test_all_any_execution(setup): + raw1 = np.zeros((10, 15)) + raw2 = np.ones((10, 15)) + raw3 = np.array( + [ + [True, False, True, False], + [True, True, True, True], + [False, False, False, False], + [False, True, False, True], + ] + ) + + arr1 = tensor(raw1, chunk_size=3) + arr2 = tensor(raw2, chunk_size=3) + arr3 = tensor(raw3, chunk_size=4) + + assert not arr1.all().execute().fetch() + assert arr2.all().execute().fetch() + assert not arr1.any().execute().fetch() + np.testing.assert_array_equal(raw3.all(axis=1), arr3.all(axis=1).execute().fetch()) + np.testing.assert_array_equal(raw3.any(axis=0), arr3.any(axis=0).execute().fetch()) + + raw = sps.random(10, 10, density=0.5) > 0.5 + + arr = tensor(raw, chunk_size=3) + + assert raw.A.all() == arr.all().execute().fetch() + assert raw.A.any() == arr.any().execute().fetch() + + # test string dtype + a = tensor(list("abcdefghi"), dtype=object) + assert a.all().execute().fetch() == "i" + a = tensor(list("abcdefghi"), dtype=object, chunk_size=2) + assert a.any().execute().fetch() == "a" + + +def test_mean_execution(setup): + raw1 = np.random.random((20, 25)) + raw2 = np.random.randint(10, size=(20, 25)) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.mean().execute().fetch() + expected1 = raw1.mean() + np.testing.assert_allclose(res1, expected1) + + res2 = arr1.mean(axis=0).execute().fetch() + expected2 = raw1.mean(axis=0) + assert np.allclose(res2, expected2) is True + + res3 = arr1.mean(axis=1, keepdims=True).execute().fetch() + expected3 = raw1.mean(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + arr2 = tensor(raw2, chunk_size=6) + + res1 = arr2.mean().execute().fetch() + expected1 = raw2.mean() + assert res1 == expected1 + + res2 = arr2.mean(axis=0).execute().fetch() + expected2 = raw2.mean(axis=0) + np.testing.assert_allclose(res2, expected2) + + res3 = arr2.mean(axis=1, keepdims=True).execute().fetch() + expected3 = raw2.mean(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + raw1 = sps.random(20, 25, density=0.1) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.mean().execute().fetch() + expected1 = raw1.mean() + np.testing.assert_allclose(res1, expected1) + + arr2 = tensor(raw1, chunk_size=30) + + res1 = arr2.mean().execute().fetch() + expected1 = raw1.mean() + np.testing.assert_allclose(res1, expected1) + + arr = mean(1) + assert arr.execute().fetch() == 1 + + with pytest.raises(TypeError): + tensor(list("abcdefghi"), dtype=object).mean().execute() + + +def test_var_execution(setup): + raw1 = np.random.random((20, 25)) + raw2 = np.random.randint(10, size=(20, 25)) + + arr0 = tensor(raw1, chunk_size=25) + + res1 = arr0.var().execute().fetch() + expected1 = raw1.var() + np.testing.assert_allclose(res1, expected1) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.var().execute().fetch() + expected1 = raw1.var() + np.testing.assert_allclose(res1, expected1) + + res2 = arr1.var(axis=0).execute().fetch() + expected2 = raw1.var(axis=0) + np.testing.assert_allclose(res2, expected2) + + res3 = arr1.var(axis=1, keepdims=True).execute().fetch() + expected3 = raw1.var(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + arr2 = tensor(raw2, chunk_size=6) + + res1 = arr2.var().execute().fetch() + expected1 = raw2.var() + assert pytest.approx(res1) == expected1 + + res2 = 
arr2.var(axis=0).execute().fetch() + expected2 = raw2.var(axis=0) + np.testing.assert_allclose(res2, expected2) + + res3 = arr2.var(axis=1, keepdims=True).execute().fetch() + expected3 = raw2.var(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + res4 = arr2.var(ddof=1).execute().fetch() + expected4 = raw2.var(ddof=1) + assert pytest.approx(res4) == expected4 + + raw1 = sps.random(20, 25, density=0.1) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.var().execute().fetch() + expected1 = raw1.toarray().var() + np.testing.assert_allclose(res1, expected1) + + arr2 = tensor(raw1, chunk_size=30) + + res1 = arr2.var().execute().fetch() + expected1 = raw1.toarray().var() + np.testing.assert_allclose(res1, expected1) + + arr = var(1) + assert arr.execute().fetch() == 0 + + +def test_std_execution(setup): + raw1 = np.random.random((20, 25)) + raw2 = np.random.randint(10, size=(20, 25)) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.std().execute().fetch() + expected1 = raw1.std() + np.testing.assert_allclose(res1, expected1) + + res2 = arr1.std(axis=0).execute().fetch() + expected2 = raw1.std(axis=0) + np.testing.assert_allclose(res2, expected2) + + res3 = arr1.std(axis=1, keepdims=True).execute().fetch() + expected3 = raw1.std(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + arr2 = tensor(raw2, chunk_size=6) + + res1 = arr2.std().execute().fetch() + expected1 = raw2.std() + assert pytest.approx(res1) == expected1 + + res2 = arr2.std(axis=0).execute().fetch() + expected2 = raw2.std(axis=0) + np.testing.assert_allclose(res2, expected2) + + res3 = arr2.std(axis=1, keepdims=True).execute().fetch() + expected3 = raw2.std(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + res4 = arr2.std(ddof=1).execute().fetch() + expected4 = raw2.std(ddof=1) + assert pytest.approx(res4) == expected4 + + raw1 = sps.random(20, 25, density=0.1) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.std().execute().fetch() + expected1 = raw1.toarray().std() + np.testing.assert_allclose(res1, expected1) + + arr2 = tensor(raw1, chunk_size=30) + + res1 = arr2.std().execute().fetch() + expected1 = raw1.toarray().std() + np.testing.assert_allclose(res1, expected1) + + arr = std(1) + assert arr.execute().fetch() == 0 + + +def test_arg_reduction(setup): + raw = np.random.random((20, 20, 20)) + + arr = tensor(raw, chunk_size=6) + + assert raw.argmax() == arr.argmax().execute().fetch() + assert raw.argmin() == arr.argmin().execute().fetch() + + np.testing.assert_array_equal( + raw.argmax(axis=0), arr.argmax(axis=0).execute().fetch() + ) + np.testing.assert_array_equal( + raw.argmin(axis=0), arr.argmin(axis=0).execute().fetch() + ) + + raw_format = sps.random(20, 20, density=0.1, format="lil") + + random_min = np.random.randint(0, 200) + random_max = np.random.randint(200, 400) + raw_format[np.unravel_index(random_min, raw_format.shape)] = -1 + raw_format[np.unravel_index(random_max, raw_format.shape)] = 2 + + raw = raw_format.tocoo() + arr = tensor(raw, chunk_size=6) + + assert raw.argmax() == arr.argmax().execute().fetch() + assert raw.argmin() == arr.argmin().execute().fetch() + + # test order + raw = np.asfortranarray(np.random.rand(10, 20, 30)) + arr = tensor(raw, chunk_size=13) + arr2 = arr.argmax(axis=-1) + + res = arr2.execute().fetch() + expected = raw.argmax(axis=-1) + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == 
expected.flags["F_CONTIGUOUS"] + + with pytest.raises(TypeError): + tensor(list("abcdefghi"), dtype=object).argmax().execute() + + +@ignore_warning +def test_nan_reduction(setup): + raw = np.random.choice(a=[0, 1, np.nan], size=(10, 10), p=[0.3, 0.4, 0.3]) + + arr = tensor(raw, chunk_size=6) + + assert np.nansum(raw) == nansum(arr).execute().fetch() + assert np.nanprod(raw) == nanprod(arr).execute().fetch() + assert np.nanmax(raw) == nanmax(arr).execute().fetch() + assert np.nanmin(raw) == nanmin(arr).execute().fetch() + assert np.nanmean(raw) == nanmean(arr).execute().fetch() + assert pytest.approx(np.nanvar(raw)) == nanvar(arr).execute().fetch() + assert ( + pytest.approx(np.nanvar(raw, ddof=1)) == nanvar(arr, ddof=1).execute().fetch() + ) + assert pytest.approx(np.nanstd(raw)) == nanstd(arr).execute().fetch() + assert ( + pytest.approx(np.nanstd(raw, ddof=1)) == nanstd(arr, ddof=1).execute().fetch() + ) + + arr = tensor(raw, chunk_size=10) + + assert np.nansum(raw) == nansum(arr).execute().fetch() + assert np.nanprod(raw) == nanprod(arr).execute().fetch() + assert np.nanmax(raw) == nanmax(arr).execute().fetch() + assert np.nanmin(raw) == nanmin(arr).execute().fetch() + assert np.nanmean(raw) == nanmean(arr).execute().fetch() + assert pytest.approx(np.nanvar(raw)) == nanvar(arr).execute().fetch() + assert ( + pytest.approx(np.nanvar(raw, ddof=1)) == nanvar(arr, ddof=1).execute().fetch() + ) + assert pytest.approx(np.nanstd(raw)) == nanstd(arr).execute().fetch() + assert ( + pytest.approx(np.nanstd(raw, ddof=1)) == nanstd(arr, ddof=1).execute().fetch() + ) + + raw = np.random.random((10, 10)) + raw[:3, :3] = np.nan + arr = tensor(raw, chunk_size=6) + assert np.nanargmin(raw) == nanargmin(arr).execute().fetch() + assert np.nanargmax(raw) == nanargmax(arr).execute().fetch() + + raw = np.full((10, 10), np.nan) + arr = tensor(raw, chunk_size=6) + + assert 0 == nansum(arr).execute().fetch() + assert 1 == nanprod(arr).execute().fetch() + assert np.isnan(nanmax(arr).execute().fetch()) + assert np.isnan(nanmin(arr).execute().fetch()) + assert np.isnan(nanmean(arr).execute().fetch()) + with pytest.raises(ValueError): + _ = nanargmin(arr).execute() # noqa: F841 + with pytest.raises(ValueError): + _ = nanargmax(arr).execute() # noqa: F841 + + raw = sps.random(10, 10, density=0.1, format="csr") + raw[:3, :3] = np.nan + arr = tensor(raw, chunk_size=6) + + assert pytest.approx(np.nansum(raw.A)) == nansum(arr).execute().fetch() + assert pytest.approx(np.nanprod(raw.A)) == nanprod(arr).execute().fetch() + assert pytest.approx(np.nanmax(raw.A)) == nanmax(arr).execute().fetch() + assert pytest.approx(np.nanmin(raw.A)) == nanmin(arr).execute().fetch() + assert pytest.approx(np.nanmean(raw.A)) == nanmean(arr).execute().fetch() + assert pytest.approx(np.nanvar(raw.A)) == nanvar(arr).execute().fetch() + assert ( + pytest.approx(np.nanvar(raw.A, ddof=1)) == nanvar(arr, ddof=1).execute().fetch() + ) + assert pytest.approx(np.nanstd(raw.A)) == nanstd(arr).execute().fetch() + assert ( + pytest.approx(np.nanstd(raw.A, ddof=1)) == nanstd(arr, ddof=1).execute().fetch() + ) + + arr = nansum(1) + assert arr.execute().fetch() == 1 + + +def test_cum_reduction(setup): + raw = np.random.randint(5, size=(8, 8, 8)) + + arr = tensor(raw, chunk_size=6) + + res1 = arr.cumsum(axis=1).execute().fetch() + res2 = arr.cumprod(axis=1).execute().fetch() + expected1 = raw.cumsum(axis=1) + expected2 = raw.cumprod(axis=1) + np.testing.assert_array_equal(res1, expected1) + np.testing.assert_array_equal(res2, expected2) + + raw = 
sps.random(8, 8, density=0.1) + + arr = tensor(raw, chunk_size=6) + + res1 = arr.cumsum(axis=1).execute().fetch() + res2 = arr.cumprod(axis=1).execute().fetch() + expected1 = raw.A.cumsum(axis=1) + expected2 = raw.A.cumprod(axis=1) + assert np.allclose(res1, expected1) + assert np.allclose(res2, expected2) + + # test order + raw = np.asfortranarray(np.random.rand(10, 20, 30)) + arr = tensor(raw, chunk_size=13) + arr2 = arr.cumsum(axis=-1) + + res = arr2.execute().fetch() + expected = raw.cumsum(axis=-1) + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + # test string dtype + a = tensor(list("abcdefghi"), dtype=object) + np.testing.assert_array_equal( + a.cumsum().execute().fetch(), + np.cumsum(np.array(list("abcdefghi"), dtype=object)), + ) + a = tensor(list("abcdefghi"), dtype=object, chunk_size=2) + np.testing.assert_array_equal( + a.cumsum().execute().fetch(), + np.cumsum(np.array(list("abcdefghi"), dtype=object)), + ) + + # test empty chunks + raw = np.random.rand(100) + arr = tensor(raw, chunk_size=((0, 100),)) + res = arr.cumsum().execute().fetch() + expected = raw.cumsum() + np.testing.assert_allclose(res, expected) + res = arr.cumprod().execute().fetch() + expected = raw.cumprod() + np.testing.assert_allclose(res, expected) + + +def test_nan_cum_reduction(setup): + raw = np.random.randint(5, size=(8, 8, 8)).astype(float) + raw[:2, 2:4, 4:6] = np.nan + + arr = tensor(raw, chunk_size=6) + + res1 = nancumsum(arr, axis=1).execute().fetch() + res2 = nancumprod(arr, axis=1).execute().fetch() + expected1 = np.nancumsum(raw, axis=1) + expected2 = np.nancumprod(raw, axis=1) + np.testing.assert_array_equal(res1, expected1) + np.testing.assert_array_equal(res2, expected2) + + raw = sps.random(8, 8, density=0.1, format="lil") + raw[:2, 2:4] = np.nan + + arr = tensor(raw, chunk_size=6) + + res1 = nancumsum(arr, axis=1).execute().fetch() + res2 = nancumprod(arr, axis=1).execute().fetch() + expected1 = np.nancumsum(raw.A, axis=1) + expected2 = np.nancumprod(raw.A, axis=1) + assert np.allclose(res1, expected1) is True + assert np.allclose(res2, expected2) is True + + +def test_out_reduction_execution(setup): + raw = np.random.randint(5, size=(8, 8, 8)) + + arr = tensor(raw, chunk_size=6) + arr2 = ones((8, 8), dtype="i8", chunk_size=6) + arr.sum(axis=1, out=arr2) + + res = arr2.execute().fetch() + expected = raw.sum(axis=1) + + np.testing.assert_array_equal(res, expected) + + +def test_out_cum_reduction_execution(setup): + raw = np.random.randint(5, size=(8, 8, 8)) + + arr = tensor(raw, chunk_size=6) + arr.cumsum(axis=0, out=arr) + + res = arr.execute().fetch() + expected = raw.cumsum(axis=0) + + np.testing.assert_array_equal(res, expected) + + +def test_count_nonzero_execution(setup): + raw = [[0, 1, 7, 0, 0], [3, 0, 0, 2, 19]] + + arr = tensor(raw, chunk_size=5) + t = count_nonzero(arr) + + res = t.execute().fetch() + expected = np.count_nonzero(raw) + np.testing.assert_equal(res, expected) + + arr = tensor(raw, chunk_size=2) + t = count_nonzero(arr) + + res = t.execute().fetch() + expected = np.count_nonzero(raw) + np.testing.assert_equal(res, expected) + + t = count_nonzero(arr, axis=0) + + res = t.execute().fetch() + expected = np.count_nonzero(raw, axis=0) + np.testing.assert_equal(res, expected) + + t = count_nonzero(arr, axis=1) + + res = t.execute().fetch() + expected = np.count_nonzero(raw, axis=1) + np.testing.assert_equal(res, expected) + + raw = 
sps.csr_matrix(raw) + + arr = tensor(raw, chunk_size=2) + t = count_nonzero(arr) + + res = t.execute().fetch() + expected = np.count_nonzero(raw.A) + np.testing.assert_equal(res, expected) + + t = count_nonzero(arr, axis=0) + + res = t.execute().fetch() + expected = np.count_nonzero(raw.A, axis=0) + np.testing.assert_equal(res, expected) + + t = count_nonzero(arr, axis=1) + + res = t.execute().fetch() + expected = np.count_nonzero(raw.A, axis=1) + np.testing.assert_equal(res, expected) + + # test string dtype + a = tensor(list("abcdefghi"), dtype=object) + assert count_nonzero(a).execute().fetch() == 9 + a = tensor(list("abcdefghi"), dtype=object, chunk_size=2) + assert count_nonzero(a).execute().fetch() == 9 + + +def test_allclose_execution(setup): + a = tensor([1e10, 1e-7], chunk_size=1) + b = tensor([1.00001e10, 1e-8], chunk_size=1) + + t = allclose(a, b) + + res = t.execute().fetch() + assert res is False + + a = tensor([1e10, 1e-8], chunk_size=1) + b = tensor([1.00001e10, 1e-9], chunk_size=1) + + t = allclose(a, b) + + res = t.execute().fetch() + assert res is True + + a = tensor([1.0, np.nan], chunk_size=1) + b = tensor([1.0, np.nan], chunk_size=1) + + t = allclose(a, b, equal_nan=True) + + res = t.execute().fetch() + assert res is True + + a = tensor(sps.csr_matrix([[1e10, 1e-7], [0, 0]]), chunk_size=1) + b = tensor(sps.csr_matrix([[1.00001e10, 1e-8], [0, 0]]), chunk_size=1) + + t = allclose(a, b) + + res = t.execute().fetch() + assert res is False + + # test string dtype + with pytest.raises(TypeError): + a = tensor(list("abcdefghi"), dtype=object) + allclose(a, a).execute() + + +def test_array_equal(setup): + a = ones((10, 5), chunk_size=4) + b = ones((10, 5), chunk_size=5) + + c = array_equal(a, b) + + assert c.execute().fetch() diff --git a/python/xorbits/_mars/tensor/reduction/var.py b/python/xorbits/_mars/tensor/reduction/var.py new file mode 100644 index 000000000..2d763bcc0 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/var.py @@ -0,0 +1,312 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import factorial + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import Int32Field +from ..array_utils import as_same_device, device, get_array_module +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin, numel + + +def reduce_var_square(var_square, avg_diff, count, op, axis, sum_func): + moment = op.moment + dtype = op.dtype + kw = dict(axis=axis, dtype=dtype, keepdims=bool(op.keepdims)) + + reduced_var_square = var_square[..., moment - 2].sum(**kw) + sum_func( + count * avg_diff**moment, **kw + ) + for i in range(1, moment - 1): + coeff = factorial(moment) / float(factorial(i) * factorial(moment - i)) + reduced_var_square += coeff * sum_func( + var_square[..., moment - i - 2] * avg_diff**moment, **kw + ) + return reduced_var_square + + +class TensorMoment(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.MOMENT + + _moment = Int32Field("moment", default=2) + _ddof = Int32Field("ddof") + + def __init__( + self, + axis=None, + keepdims=None, + moment=None, + ddof=None, + combine_size=None, + stage=None, + **kw + ): + stage = self._rewrite_stage(stage) + if moment is not None: + kw["_moment"] = moment + super().__init__( + _axis=axis, + _keepdims=keepdims, + _ddof=ddof, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @property + def moment(self): + return getattr(self, "_moment", 2) + + @property + def ddof(self): + return self._ddof + + @classmethod + def execute_agg(cls, ctx, op): + axis = cls.get_axis(op.axis) + dtype = op.dtype + + (_data, _count, _var_square), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + + with device(device_id): + chunk_count = xp.sum(_count, axis=axis, dtype=np.int64, keepdims=True) + chunk_sum = xp.sum(_data, axis=axis, dtype=dtype, keepdims=True) + avg = xp.true_divide(chunk_sum, chunk_count, dtype=dtype) + avg_diff = xp.true_divide(_data, _count, dtype=dtype) - avg + var_square = reduce_var_square( + _var_square, avg_diff, _count, op, axis, xp.sum + ) + + ctx[op.outputs[0].key] = xp.true_divide( + var_square, + xp.sum(chunk_count, axis=axis, dtype=dtype, keepdims=bool(op.keepdims)) + - op.ddof, + dtype=dtype, + ) + + @classmethod + def execute_map(cls, ctx, op): + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axis = cls.get_axis(op.axis) + moment = op.moment + dtype = op.dtype + empty = get_array_module(in_chunk, nosparse=True).empty + + with device(device_id): + chunk_count = numel( + in_chunk, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + in_chunk, axis=axis, dtype=dtype, keepdims=bool(op.keepdims) + ) + avg = xp.true_divide(chunk_sum, chunk_count) + var_square = empty(chunk_count.shape + (moment - 1,), dtype=dtype) + for i in range(2, moment + 1): + var_square[..., i - 2] = xp.sum( + (in_chunk - avg) ** i, + axis=axis, + dtype=dtype, + keepdims=bool(op.keepdims), + ) + ctx[op.outputs[0].key] = (chunk_sum, chunk_count, var_square) + + @classmethod + def execute_combine(cls, ctx, op): + axis = cls.get_axis(op.axis) + moment = op.moment + dtype = op.dtype + + (_data, _count, _var_square), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + empty = get_array_module(_data, nosparse=True).empty + + with device(device_id): + chunk_count = xp.sum( + _count, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + _data, axis=axis, dtype=dtype, 
keepdims=bool(op.keepdims) + ) + avg = xp.true_divide(chunk_sum, chunk_count, dtype=dtype) + avg_diff = xp.true_divide(_data, _count, dtype=dtype) - avg + var_square = empty(chunk_count.shape + (moment - 1,), dtype=dtype) + + for m in range(2, moment + 1): + var_square[..., m - 2] = reduce_var_square( + _var_square, avg_diff, _count, op, axis, xp.sum + ) + + ctx[op.outputs[0].key] = (chunk_sum, chunk_count, var_square) + + +class TensorVar(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.VAR + + _ddof = Int32Field("ddof") + + def __new__(cls, *args, **kwargs): + if kwargs.get("stage") is not None: + return TensorMoment(*args, **kwargs) + return super().__new__(cls) + + def __init__(self, axis=None, keepdims=None, ddof=0, combine_size=None, **kw): + super().__init__( + _axis=axis, _keepdims=keepdims, _ddof=ddof, _combine_size=combine_size, **kw + ) + + @property + def ddof(self): + return self._ddof + + def _get_op_kw(self): + kw = dict() + kw["ddof"] = self.ddof + return kw + + @classmethod + def execute(cls, ctx, op): + axis = cls.get_axis(op.axis) + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.var( + in_chunk, + axis=axis, + dtype=op.dtype, + ddof=op.ddof, + keepdims=bool(op.keepdims), + ) + + +def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=None, combine_size=None): + """ + Compute the variance along the specified axis. + + Returns the variance of the tensor elements, a measure of the spread of a + distribution. The variance is computed for the flattened tensor by + default, otherwise over the specified axis. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose variance is desired. If `a` is not a + tensor, a conversion is attempted. + axis : None or int or tuple of ints, optional + Axis or axes along which the variance is computed. The default is to + compute the variance of the flattened array. + + If this is a tuple of ints, a variance is performed over multiple axes, + instead of a single axis or all the axes as before. + dtype : data-type, optional + Type to use in computing the variance. For arrays of integer type + the default is `float32`; for tensors of float types it is the same as + the tensor type. + out : Tensor, optional + Alternate output array in which to place the result. It must have + the same shape as the expected output, but the type is cast if + necessary. + ddof : int, optional + "Delta Degrees of Freedom": the divisor used in the calculation is + ``N - ddof``, where ``N`` represents the number of elements. By + default `ddof` is zero. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `var` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + variance : Tensor, see dtype parameter above + If ``out=None``, returns a new tensor containing the variance; + otherwise, a reference to the output tensor is returned. 
+ + See Also + -------- + std , mean, nanmean, nanstd, nanvar + + Notes + ----- + The variance is the average of the squared deviations from the mean, + i.e., ``var = mean(abs(x - x.mean())**2)``. + + The mean is normally calculated as ``x.sum() / N``, where ``N = len(x)``. + If, however, `ddof` is specified, the divisor ``N - ddof`` is used + instead. In standard statistical practice, ``ddof=1`` provides an + unbiased estimator of the variance of a hypothetical infinite population. + ``ddof=0`` provides a maximum likelihood estimate of the variance for + normally distributed variables. + + Note that for complex numbers, the absolute value is taken before + squaring, so that the result is always real and nonnegative. + + For floating-point input, the variance is computed using the same + precision the input has. Depending on the input data, this can cause + the results to be inaccurate, especially for `float32` (see example + below). Specifying a higher-accuracy accumulator using the ``dtype`` + keyword can alleviate this issue. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4]]) + >>> mt.var(a).execute() + 1.25 + >>> mt.var(a, axis=0).execute() + array([ 1., 1.]) + >>> mt.var(a, axis=1).execute() + array([ 0.25, 0.25]) + + In single precision, var() can be inaccurate: + + >>> a = mt.zeros((2, 512*512), dtype=mt.float32) + >>> a[0, :] = 1.0 + >>> a[1, :] = 0.1 + >>> mt.var(a).execute() + 0.20250003 + + Computing the variance in float64 is more accurate: + + >>> mt.var(a, dtype=mt.float64).execute() + 0.20249999932944759 + >>> ((1-0.55)**2 + (0.1-0.55)**2)/2 + 0.2025 + + """ + a = astensor(a) + if dtype is None: + dtype = np.var(np.ones((1,), dtype=a.dtype)).dtype + op = TensorVar( + axis=axis, dtype=dtype, keepdims=keepdims, ddof=ddof, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reshape/__init__.py b/python/xorbits/_mars/tensor/reshape/__init__.py new file mode 100644 index 000000000..e5a45443f --- /dev/null +++ b/python/xorbits/_mars/tensor/reshape/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .reshape import reshape diff --git a/python/xorbits/_mars/tensor/reshape/reshape.py b/python/xorbits/_mars/tensor/reshape/reshape.py new file mode 100644 index 000000000..c38d5fa34 --- /dev/null +++ b/python/xorbits/_mars/tensor/reshape/reshape.py @@ -0,0 +1,634 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import FieldTypes, KeyField, StringField, TupleField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from ..operands import TensorMapReduceOperand, TensorOperandMixin, TensorShuffleProxy +from ..utils import decide_chunk_sizes, get_order + +logger = logging.getLogger(__name__) + + +class TensorReshape(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.RESHAPE + + _input = KeyField("input") + _newshape = TupleField("newshape", FieldTypes.int64) + _order = StringField("order") + + _axis_offsets = TupleField("axis_offsets", FieldTypes.uint64) + _oldshape = TupleField("oldshape", FieldTypes.uint64) + _new_chunk_size = TupleField("new_chunk_size", FieldTypes.uint64) + + def __init__( + self, + newshape=None, + order=None, + axis_offsets=None, + oldshape=None, + new_chunk_size=None, + **kw, + ): + super().__init__( + _newshape=newshape, + _order=order, + _axis_offsets=axis_offsets, + _oldshape=oldshape, + _new_chunk_size=new_chunk_size, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def newshape(self): + return self._newshape + + @property + def axis_offsets(self): + return self._axis_offsets + + @property + def oldshape(self): + return self._oldshape + + @property + def new_chunk_size(self): + return self._new_chunk_size + + @property + def order(self): + return self._order + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def on_output_modify(self, new_output): + return reshape(new_output, self._input.shape) + + def on_input_modify(self, new_input): + op = self.copy().reset_key() + return op(new_input) + + def __call__(self, a, order, out_shape): + return self.new_tensor([a], out_shape, order=order) + + @staticmethod + def _gen_reshape_rechunk_nsplits(old_shape, new_shape, nsplits): + old_idx = len(old_shape) - 1 + new_idx = len(new_shape) - 1 + rechunk_nsplists = [None for _ in old_shape] + reshape_nsplists = [None for _ in new_shape] + + while old_idx >= 0 or new_idx >= 0: + old_dim_size = old_shape[old_idx] + new_dim_size = new_shape[new_idx] + + if old_dim_size == new_dim_size: + # nothing need to do + rechunk_nsplists[old_idx] = nsplits[old_idx] + reshape_nsplists[new_idx] = nsplits[old_idx] + old_idx -= 1 + new_idx -= 1 + continue + + if old_dim_size == 1: + rechunk_nsplists[old_idx] = (1,) + old_idx -= 1 + elif new_dim_size == 1: + reshape_nsplists[new_idx] = (1,) + new_idx -= 1 + elif old_dim_size < new_dim_size: + left_old_idx = old_idx - 1 + while ( + left_old_idx >= 0 + and np.prod(old_shape[left_old_idx : old_idx + 1]) < new_dim_size + ): + left_old_idx -= 1 + if np.prod(old_shape[left_old_idx : old_idx + 1]) != new_dim_size: + raise ValueError("shapes not compatible") + + for i in range(left_old_idx + 1, old_idx + 1): + # rechunk the higher dimension into 1 chunk + # e.g. 
((2, 2, 2), [(3, 3), (4, 4))] -> [6, 8] + rechunk_nsplists[i] = (old_shape[i],) + + chunk_reduce = np.prod( + [len(c) for c in nsplits[left_old_idx + 1 : old_idx + 1]] + ).item() + # cause the higher dimension has been concatenated, + # the lowest dimension should be expanded to reduce size + rechunk_nsplists[left_old_idx] = TensorReshape._expand_nsplit_by_reduce( + nsplits[left_old_idx], chunk_reduce + ) + + size_reduce = np.prod(old_shape[left_old_idx + 1 : old_idx + 1]).item() + reshape_nsplists[new_idx] = tuple( + size_reduce * c for c in rechunk_nsplists[left_old_idx] + ) + + old_idx = left_old_idx - 1 + new_idx -= 1 + else: + assert old_dim_size > new_dim_size + lef_new_idx = new_idx - 1 + while ( + lef_new_idx >= 0 + and np.prod(new_shape[lef_new_idx : new_idx + 1]) < old_dim_size + ): + lef_new_idx -= 1 + if np.prod(new_shape[lef_new_idx : new_idx + 1]) != old_dim_size: + raise ValueError("shapes not compatible") + + chunk_expand = np.prod(new_shape[lef_new_idx + 1 : new_idx + 1]).item() + rechunk_nsplists[old_idx] = TensorReshape._reduce_nsplit_by_expand( + nsplits[old_idx], chunk_expand + ) + + for i in range(lef_new_idx + 1, new_idx + 1): + reshape_nsplists[i] = (new_shape[i],) + reshape_nsplists[lef_new_idx] = tuple( + c // chunk_expand for c in rechunk_nsplists[old_idx] + ) + + old_idx -= 1 + new_idx = lef_new_idx - 1 + + assert np.prod([len(s) for s in rechunk_nsplists]) == np.prod( + [len(s) for s in reshape_nsplists] + ) + return rechunk_nsplists, reshape_nsplists + + @staticmethod + def _expand_nsplit_by_reduce(splits, reduced): + if reduced == 1: + return splits + + out = [] + for s in splits: + x = s + part = max(x / reduced, 1) + while x >= 2 * part: + out.append(int(part)) + x -= int(part) + if x: + out.append(x) + assert sum(splits) == sum(out) + return tuple(out) + + @staticmethod + def _reduce_nsplit_by_expand(splits, expand): + assert sum(splits) % expand == 0 + + out = [] + residual = 0 + for chunk in splits: + chunk += residual + div = chunk // expand + residual = chunk % expand + good = expand * div + if good: + out.append(good) + return tuple(out) + + @staticmethod + def _tile_as_shuffle(op): + in_tensor = op.input + tensor = op.outputs[0] + new_shape = op.newshape + shuffle_inputs, shuffle_outputs = [], [] + axis_offsets = [[0] + np.cumsum(ns)[:-1].tolist() for ns in in_tensor.nsplits] + + max_chunk_size = max(max(tp) for tp in in_tensor.nsplits) + out_nsplits = decide_chunk_sizes( + new_shape, max_chunk_size, tensor.dtype.itemsize + ) + chunk_size_idxes = (range(len(size)) for size in out_nsplits) + + for inp in in_tensor.chunks: + offset = tuple( + axis_offsets[axis][idx] for axis, idx in enumerate(inp.index) + ) + chunk_op = TensorReshape( + stage=OperandStage.map, + axis_offsets=offset, + oldshape=in_tensor.shape, + newshape=new_shape, + new_chunk_size=(max_chunk_size,) * len(new_shape), + dtype=inp.dtype, + ) + shuffle_inputs.append( + chunk_op.new_chunk([inp], shape=(np.nan,), index=inp.index) + ) + + proxy_chunk = TensorShuffleProxy( + dtype=in_tensor.dtype, _tensor_keys=[in_tensor.op.key] + ).new_chunk(shuffle_inputs, shape=()) + + out_indices = list( + zip(itertools.product(*out_nsplits), itertools.product(*chunk_size_idxes)) + ) + for chunk_shape, chunk_idx in out_indices: + chunk_op = TensorReshape( + stage=OperandStage.reduce, + dtype=tensor.dtype, + n_reducers=len(out_indices), + ) + shuffle_outputs.append( + chunk_op.new_chunk( + [proxy_chunk], + shape=chunk_shape, + order=tensor.order, + index=chunk_idx, + ) + ) + + new_op = op.copy() + return 
new_op.new_tensors( + op.inputs, + new_shape, + order=tensor.order, + chunks=shuffle_outputs, + nsplits=out_nsplits, + ) + + @classmethod + def tile(cls, op): + in_tensor = op.input + tensor = op.outputs[0] + + # check unknown shape + if has_unknown_shape(*op.inputs): + yield + + if any(np.isnan(s) for s in tensor.shape): + # -1 exists in newshape and input tensor has unknown shape + # recalculate new shape + shape = tuple(-1 if np.isnan(s) else s for s in tensor.shape) + op._newshape = newshape = calc_shape(in_tensor.size, shape) + tensor._shape = newshape + + if op.order == "F": + # do transpose first, then do regular reshape, then transpose back + result = in_tensor.transpose().reshape(op.newshape[::-1]) + if getattr(op, "_reshape_with_shuffle", True): + result.op.extra_params["_reshape_with_shuffle"] = True + result = result.transpose() + return [(yield from recursive_tile(result))] + + if len(in_tensor.chunks) == 1: + # 1 chunk + chunk_op = op.copy().reset_key() + chunk = chunk_op.new_chunk( + in_tensor.chunks, + shape=tensor.shape, + order=tensor.order, + index=(0,) * tensor.ndim, + ) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=tensor.shape, + order=tensor.order, + chunks=[chunk], + nsplits=tuple((s,) for s in tensor.shape), + ) + try: + rechunk_nsplits, reshape_nsplits = cls._gen_reshape_rechunk_nsplits( + in_tensor.shape, tensor.shape, in_tensor.nsplits + ) + rechunked_tensor = yield from recursive_tile( + in_tensor.rechunk(rechunk_nsplits) + ) + in_idxes = itertools.product(*[range(len(s)) for s in rechunk_nsplits]) + out_idxes = itertools.product(*[range(len(s)) for s in reshape_nsplits]) + out_shape = itertools.product(*[s for s in reshape_nsplits]) + out_chunks = [] + for input_idx, out_idx, out_shape in zip(in_idxes, out_idxes, out_shape): + in_chunk = rechunked_tensor.cix[input_idx] + chunk_op = op.copy().reset_key() + chunk_op._newshape = out_shape + out_chunk = chunk_op.new_chunk( + [in_chunk], shape=out_shape, order=tensor.order, index=out_idx + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=reshape_nsplits, + ) + except ValueError: + # TODO: make this as default when shuffle is mature + if getattr(op.extra_params, "_reshape_with_shuffle", False): + return cls._tile_as_shuffle(op) + + # shape incompatible, we will first do flatten, then reshape to the new shape + return [ + ( + yield from recursive_tile( + in_tensor.reshape(-1, order=tensor.op.order).reshape( + tensor.shape, order=tensor.op.order + ) + ) + ) + ] + + @classmethod + def estimate_size(cls, ctx, op): + chunk = op.outputs[0] + if op.stage == OperandStage.map: + inp_chunk = chunk.inputs[0] + inp_size, inp_calc = ctx[inp_chunk.key] + store_overhead = np.int64().itemsize * inp_chunk.ndim + calc_overhead = np.int64().itemsize * (inp_chunk.ndim + 2) + ctx[chunk.key] = (store_overhead + inp_size, calc_overhead + inp_calc) + elif op.stage == OperandStage.reduce: + sum_size = 0 + for shuffle_input in chunk.inputs[0].inputs or (): + key = (shuffle_input.key, chunk.index) + if ctx.get(key) is not None: + sum_size += ctx[key][0] + else: + ctx[key] = None + ctx[chunk.key] = (chunk.nbytes, max(sum_size, chunk.nbytes)) + else: + super().estimate_size(ctx, op) + + @classmethod + def _execute_map(cls, ctx, op): + chunk = op.outputs[0] + # todo this function is an experimental one making shuffle runnable. + # try elevate performance when needed. 
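+        # Outline of the map step: only the non-zero elements of this input
+        # chunk are shuffled.  Their local indices are shifted by the chunk's
+        # global offsets, flattened against the old shape, unravelled against
+        # the new shape, and grouped by the output chunk that owns each
+        # element; the reduce step then scatters every group into a
+        # zero-filled array of the output chunk's shape.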
+ old_shape = op.oldshape + new_shape = op.newshape + new_chunk_size = op.new_chunk_size + axis_offset = op.axis_offsets + + logger.debug("Reshape mapper: Start mapping step for %s", chunk.key) + + data = ctx[op.inputs[0].key] + indices = list(np.nonzero(data)) + nz_data = data[tuple(indices)] + + for idx in range(len(old_shape)): + indices[idx] = np.add(indices[idx], axis_offset[idx], out=indices[idx]) + rest_indices = indices[0] + indices[0] = None + for idx in range(1, len(old_shape)): + rest_indices = np.multiply(rest_indices, old_shape[idx], out=rest_indices) + rest_indices = np.add(rest_indices, indices[idx], out=rest_indices) + indices[idx] = None + del indices + + new_indices = [] + for dim_size in reversed(new_shape[1:]): + new_index = rest_indices % dim_size + new_indices.append(new_index) + rest_indices = np.floor_divide(rest_indices, dim_size, out=rest_indices) + new_indices.append(rest_indices) + new_indices.reverse() + del rest_indices + + logger.debug("Reshape mapper: remapping to new locations for %s", chunk.key) + + dim_chunk_counts = [ + int(np.ceil(dim_size * 1.0 / chunk_size)) + for dim_size, chunk_size in zip(new_shape, new_chunk_size) + ] + target = new_indices[0] // new_chunk_size[0] + for new_index, chunk_size, dim_chunk_count in zip( + new_indices[1:], new_chunk_size[1:], dim_chunk_counts[1:] + ): + target = np.multiply(target, dim_chunk_count, out=target) + target = np.add(target, new_index // chunk_size, out=target) + + for idx, chunk_size in enumerate(new_chunk_size): + new_indices[idx] = np.mod( + new_indices[idx], chunk_size, out=new_indices[idx] + ) + + logger.debug("Reshape mapper: sorting for %s", chunk.key) + + sort_idx = np.argsort(target) + target = target[sort_idx] + nz_data = nz_data[sort_idx] + for idx in range(len(new_indices)): + new_indices[idx] = new_indices[idx][sort_idx] + del sort_idx + + logger.debug("Reshape mapper: splitting for %s", chunk.key) + + mapper_outputs = {} + for t in np.unique(target): + data_slice = slice( + np.searchsorted(target, t), np.searchsorted(target, t, "right") + ) + group_indices = tuple( + new_indices[idx][data_slice] for idx in range(len(new_shape)) + ) + group_data = nz_data[data_slice] + + target_chunk_idx = [None] * len(dim_chunk_counts) + for idx, dim_chunk_count in enumerate(reversed(dim_chunk_counts)): + t, target_chunk_idx[idx] = divmod(t, dim_chunk_count) + target_chunk_idx.reverse() + + mapper_outputs[chunk.key, tuple(target_chunk_idx)] = group_indices + ( + group_data, + ) + + # ensure all mapper data are inserted context and fill missing partition with None + for target_chunk_idx in itertools.product( + *(range(dim_chunk_cnt) for dim_chunk_cnt in dim_chunk_counts) + ): + data_key = chunk.key, tuple(target_chunk_idx) + ctx[data_key] = mapper_outputs.get(data_key) + + @classmethod + def _execute_reduce(cls, ctx, op: "TensorReshape"): + chunk = op.outputs[0] + try: + result_array = ctx[chunk.key] + except KeyError: + result_array = np.zeros( + chunk.shape, dtype=chunk.dtype, order=chunk.order.value + ) + for data_tuple in op.iter_mapper_data(ctx, skip_none=True): + if data_tuple is None: + # skip missing partition data + continue + result_array[data_tuple[:-1]] = data_tuple[-1] + ctx[chunk.key] = result_array + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + cls._execute_reduce(ctx, op) + else: + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + 
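+            # regular (non-shuffle) path: the chunk data is reshaped in a
+            # single call on whatever device it currently lives on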
with device(device_id): + ctx[op.outputs[0].key] = x.reshape(op.newshape, order=op.order) + + +def calc_shape(size, newshape): + if isinstance(newshape, int): + newshape = (newshape,) + else: + newshape = tuple(int(s) for s in newshape) + + known_shape = [s for s in newshape if s >= 0] + missing_dim = len(newshape) - len(known_shape) + if missing_dim > 1: + raise ValueError("can only specify one unknown dimension") + if missing_dim == 1: + known_size = np.prod(known_shape) + newshape = tuple( + int(size / known_size) if s < 0 and known_size > 0 else s for s in newshape + ) + + return newshape + + +def reshape(a, newshape, order="C"): + """ + Gives a new shape to a tensor without changing its data. + + Parameters + ---------- + a : array_like + Tensor to be reshaped. + newshape : int or tuple of ints + The new shape should be compatible with the original shape. If + an integer, then the result will be a 1-D tensor of that length. + One shape dimension can be -1. In this case, the value is + inferred from the length of the tensor and remaining dimensions. + order : {'C', 'F', 'A'}, optional + Read the elements of `a` using this index order, and place the + elements into the reshaped array using this index order. 'C' + means to read / write the elements using C-like index order, + with the last axis index changing fastest, back to the first + axis index changing slowest. 'F' means to read / write the + elements using Fortran-like index order, with the first index + changing fastest, and the last index changing slowest. Note that + the 'C' and 'F' options take no account of the memory layout of + the underlying array, and only refer to the order of indexing. + 'A' means to read / write the elements in Fortran-like index + order if `a` is Fortran *contiguous* in memory, C-like order + otherwise. + + Returns + ------- + reshaped_array : Tensor + This will be a new view object if possible; otherwise, it will + be a copy. + + See Also + -------- + Tensor.reshape : Equivalent method. + + Notes + ----- + It is not always possible to change the shape of a tensor without + copying the data. If you want an error to be raised when the data is copied, + you should assign the new shape to the shape attribute of the array:: + + >>> import mars.tensor as mt + + >>> a = mt.arange(6).reshape((3, 2)) + >>> a.execute() + array([[0, 1], + [2, 3], + [4, 5]]) + + You can think of reshaping as first raveling the tensor (using the given + index order), then inserting the elements from the raveled tensor into the + new tensor using the same kind of index ordering as was used for the + raveling. 
+ + >>> mt.reshape(a, (2, 3)).execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.reshape(mt.ravel(a), (2, 3)).execute() + array([[0, 1, 2], + [3, 4, 5]]) + + Examples + -------- + >>> a = mt.array([[1,2,3], [4,5,6]]) + >>> mt.reshape(a, 6).execute() + array([1, 2, 3, 4, 5, 6]) + + >>> mt.reshape(a, (3,-1)).execute() # the unspecified value is inferred to be 2 + array([[1, 2], + [3, 4], + [5, 6]]) + """ + a = astensor(a) + + if np.isnan(sum(a.shape)): + # some shape is nan + new_shape = [newshape] if isinstance(newshape, int) else list(newshape) + # if -1 exists in newshape, just treat it as unknown shape + new_shape = [s if s != -1 else np.nan for s in new_shape] + out_shape = tuple(new_shape) + else: + out_shape = newshape = calc_shape(a.size, newshape) + if a.size != np.prod(newshape): + raise ValueError( + f"cannot reshape array of size {a.size} into shape {newshape}" + ) + + tensor_order = get_order(order, a.order, available_options="CFA") + + if a.shape == newshape and ( + a.ndim <= 1 or (a.ndim > 1 and tensor_order == a.order) + ): + # does not need to reshape + return a + return _reshape( + a, newshape, order=order, tensor_order=tensor_order, out_shape=out_shape + ) + + +def _reshape(a, newshape, order="C", tensor_order=None, out_shape=None): + if tensor_order is None: + tensor_order = get_order(order, a.order, available_options="CFA") + op = TensorReshape( + newshape, order, dtype=a.dtype, create_view=tensor_order == a.order + ) + if out_shape is None: + out_shape = newshape + return op(a, tensor_order, out_shape) diff --git a/python/xorbits/_mars/tensor/reshape/tests/__init__.py b/python/xorbits/_mars/tensor/reshape/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/reshape/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/reshape/tests/test_reshape.py b/python/xorbits/_mars/tensor/reshape/tests/test_reshape.py new file mode 100644 index 000000000..94447fc47 --- /dev/null +++ b/python/xorbits/_mars/tensor/reshape/tests/test_reshape.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
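+# Tiling tests for ``TensorReshape``.  When the old and new nsplits can be
+# made compatible, ``tile`` only rechunks the input so that every output
+# chunk comes from exactly one input chunk; for instance, reshaping a (4, 6)
+# tensor chunked as ((2, 2), (3, 3)) into (4, 3, 2) only needs the last axis
+# rechunked to (2, 4), after which each chunk reshapes locally.  When the
+# shapes cannot be aligned and ``_reshape_with_shuffle`` is set, a map/reduce
+# shuffle is generated instead, which is what ``test_shuffle_reshape`` checks.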
+ +import numpy as np +import pytest + +from ....core import tile +from ....core.operand import OperandStage +from ...datasource import ones +from ..reshape import TensorReshape + + +def test_reshape(): + a = ones((10, 20, 30), chunk_size=5) + b = a.reshape(10, 600) + + b = tile(b) + + assert tuple(sum(s) for s in b.nsplits) == (10, 600) + + a = ones((10, 600), chunk_size=5) + b = a.reshape(10, 30, 20) + + b = tile(b) + + assert tuple(sum(s) for s in b.nsplits) == (10, 30, 20) + + a = ones((10, 600), chunk_size=5) + a.shape = [10, 30, 20] + + a = tile(a) + + assert tuple(sum(s) for s in a.nsplits) == (10, 30, 20) + + # test reshape unknown shape + c = a[a > 0] + d = c.reshape(10, 600) + assert d.shape == (10, 600) + d = c.reshape(-1, 10) + assert len(d.shape) == 2 + assert np.isnan(d.shape[0]) + assert d.shape[1] + + with pytest.raises(TypeError): + a.reshape((10, 30, 20), other_argument=True) + + +def test_shuffle_reshape(): + a = ones((31, 27), chunk_size=10) + b = a.reshape(27, 31) + b.op.extra_params["_reshape_with_shuffle"] = True + + b = tile(b) + + assert tuple(sum(s) for s in b.nsplits) == (27, 31) + assert isinstance(b.chunks[0].op, TensorReshape) + assert b.chunks[0].op.stage == OperandStage.reduce + + shuffle_map_sample = b.chunks[0].inputs[0].inputs[0] + assert isinstance(shuffle_map_sample.op, TensorReshape) + assert shuffle_map_sample.op.stage == OperandStage.map diff --git a/python/xorbits/_mars/tensor/reshape/tests/test_reshape_execution.py b/python/xorbits/_mars/tensor/reshape/tests/test_reshape_execution.py new file mode 100644 index 000000000..34b9360dc --- /dev/null +++ b/python/xorbits/_mars/tensor/reshape/tests/test_reshape_execution.py @@ -0,0 +1,98 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
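+# Execution tests for reshape: besides the resulting values, they check the
+# memory order of the output (C versus Fortran contiguity), reshaping of
+# tensors whose shape is only known at run time (after boolean indexing), and
+# the shuffle-based implementation enabled through ``_reshape_with_shuffle``.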
+ +import numpy as np + +from ...datasource import ones, tensor + + +def test_reshape_execution(setup): + x = ones((1, 2, 3), chunk_size=[4, 3, 5]) + y = x.reshape(3, 2) + res = y.execute().fetch() + assert y.shape == (3, 2) + np.testing.assert_equal(res, np.ones((3, 2))) + + data = np.random.rand(6, 4) + x2 = tensor(data, chunk_size=2) + y2 = x2.reshape(3, 8, order="F") + res = y2.execute().fetch() + expected = data.reshape((3, 8), order="F") + np.testing.assert_array_equal(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + data2 = np.asfortranarray(np.random.rand(6, 4)) + x3 = tensor(data2, chunk_size=2) + y3 = x3.reshape(3, 8) + res = y3.execute().fetch() + expected = data2.reshape((3, 8)) + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] is True + assert res.flags["F_CONTIGUOUS"] is False + + data2 = np.asfortranarray(np.random.rand(6, 4)) + x3 = tensor(data2, chunk_size=2) + y3 = x3.reshape(3, 8, order="F") + res = y3.execute().fetch() + expected = data2.reshape((3, 8), order="F") + np.testing.assert_array_equal(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + for chunk_size in [None, 3]: + rs = np.random.RandomState(0) + data = rs.rand(3, 4, 5) + x = tensor(data, chunk_size=chunk_size) + x = x[x[:, 0, 0] < 0.7] + y = x.reshape(-1, 20) + assert np.isnan(y.shape[0]) + res = y.execute().fetch() + expected = data[data[:, 0, 0] < 0.7].reshape(-1, 20) + np.testing.assert_array_equal(res, expected) + + +def test_shuffle_reshape_execution(setup): + a = ones((31, 27), chunk_size=10) + b = a.reshape(27, 31) + b.op.extra_params["_reshape_with_shuffle"] = True + + res = b.execute().fetch() + np.testing.assert_array_equal(res, np.ones((27, 31))) + + b2 = a.reshape(27, 31, order="F") + b.op.extra_params["_reshape_with_shuffle"] = True + res = b2.execute().fetch() + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + data = np.random.rand(6, 4) + x2 = tensor(data, chunk_size=2) + y2 = x2.reshape(4, 6, order="F") + y2.op.extra_params["_reshape_with_shuffle"] = True + res = y2.execute().fetch() + expected = data.reshape((4, 6), order="F") + np.testing.assert_array_equal(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + data2 = np.asfortranarray(np.random.rand(6, 4)) + x3 = tensor(data2, chunk_size=2) + y3 = x3.reshape(4, 6) + y3.op.extra_params["_reshape_with_shuffle"] = True + res = y3.execute().fetch() + expected = data2.reshape((4, 6)) + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] is True + assert res.flags["F_CONTIGUOUS"] is False diff --git a/python/xorbits/_mars/tensor/spatial/__init__.py b/python/xorbits/_mars/tensor/spatial/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/spatial/distance/__init__.py b/python/xorbits/_mars/tensor/spatial/distance/__init__.py new file mode 100644 index 000000000..0e31d9a3f --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .cdist import cdist +from .pdist import pdist +from .squareform import squareform diff --git a/python/xorbits/_mars/tensor/spatial/distance/cdist.py b/python/xorbits/_mars/tensor/spatial/distance/cdist.py new file mode 100644 index 000000000..5b975b441 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/cdist.py @@ -0,0 +1,564 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from typing import Tuple + +import numpy as np + +from .... 
import opcodes as OperandDef +from ....core import recursive_tile +from ....serialization.serializables import AnyField, Float16Field, KeyField +from ....utils import ensure_own_data, has_unknown_shape, require_module +from ...array_utils import as_same_device, cp, device +from ...core import TensorOrder +from ...datasource import tensor as astensor +from ...operands import TensorOperand, TensorOperandMixin + + +class TensorCdist(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.CDIST + + _xa = KeyField("XA") + _xb = KeyField("XB") + _metric = AnyField("metric") + _p = Float16Field("p", on_serialize=lambda x: float(x) if x is not None else x) + _w = KeyField("w") + _v = KeyField("V") + _vi = KeyField("VI") + + def __init__(self, metric=None, p=None, w=None, v=None, vi=None, **kw): + super().__init__(_metric=metric, _p=p, _w=w, _v=v, _vi=vi, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._xa = next(inputs_iter) + self._xb = next(inputs_iter) + if self._w is not None: + self._w = next(inputs_iter) + if self._v is not None: + self._v = next(inputs_iter) + if self._vi is not None: + self._vi = next(inputs_iter) + + @property + def xa(self): + return self._xa + + @property + def xb(self): + return self._xb + + @property + def metric(self): + return self._metric + + @property + def p(self): + return self._p + + @property + def w(self): + return self._w + + @property + def v(self): + return self._v + + @property + def vi(self): + return self._vi + + def __call__(self, xa, xb, shape: Tuple): + inputs = [xa, xb] + for val in [self._w, self._v, self._vi]: + if val is not None: + inputs.append(val) + return self.new_tensor(inputs, shape=shape, order=TensorOrder.C_ORDER) + + @classmethod + def _tile_one_chunk(cls, op, xa, xb, w, v, vi): + out_tensor = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_inputs = [xa.chunks[0], xb.chunks[0]] + for val in [w, v, vi]: + if val is not None: + chunk_inputs.append(val.chunks[0]) + chunk = chunk_op.new_chunk( + chunk_inputs, + shape=out_tensor.shape, + order=out_tensor.order, + index=(0,) * out_tensor.ndim, + ) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + nsplits=tuple((s,) for s in out_tensor.shape), + chunks=[chunk], + ) + + @classmethod + def _tile_chunks(cls, op, xa, xb, w, v, vi): + out_tensor = op.outputs[0] + acs, bcs = xa.chunk_shape[0], xb.chunk_shape[0] + + out_chunks = [] + for idx in itertools.product(range(acs), range(bcs)): + ixa, ixb = idx + chunk_op = op.copy().reset_key() + + chunk_inputs = [] + xa_chunk = xa.cix[ixa, 0] + xb_chunk = xb.cix[ixb, 0] + chunk_inputs.extend([xa_chunk, xb_chunk]) + if w is not None: + w_chunk = chunk_op._w = w.chunks[0] + chunk_inputs.append(w_chunk) + if v is not None: + v_chunk = chunk_op._v = v.chunks[0] + chunk_inputs.append(v_chunk) + if vi is not None: + vi_chunk = chunk_op._vi = vi.chunks[0] + chunk_inputs.append(vi_chunk) + chunk = chunk_op.new_chunk( + chunk_inputs, + shape=(xa_chunk.shape[0], xb_chunk.shape[0]), + order=out_tensor.order, + index=idx, + ) + out_chunks.append(chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + chunks=out_chunks, + nsplits=(xa.nsplits[0], xb.nsplits[0]), + ) + + @classmethod + def tile(cls, op): + # make sure every inputs have known shape + if has_unknown_shape(*op.inputs): + yield + + xa = op.xa.rechunk({1: op.xa.shape[1]}) + xb = 
op.xb.rechunk({1: op.xb.shape[1]}) + xa, xb = yield from recursive_tile(xa, xb) + + # rechunk w, v, vi into one chunk if any of them has value + extra_inputs = [None] * 3 + for i, ei in enumerate([op.w, op.v, op.vi]): + if ei is None: + continue + new_ei = yield from recursive_tile(ei.rechunk(ei.shape)) + extra_inputs[i] = new_ei + w, v, vi = extra_inputs + + if len(xa.chunks) == 1 and len(xb.chunks) == 1: + # only 1 chunk + return cls._tile_one_chunk(op, xa, xb, w, v, vi) + else: + return cls._tile_chunks(op, xa, xb, w, v, vi) + + @classmethod + def execute(cls, ctx, op): + from scipy.spatial.distance import cdist + + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + if xp is cp: # pragma: no cover + raise NotImplementedError("`cdist` does not support running on GPU yet") + + with device(device_id): + inputs_iter = iter(inputs) + xa = next(inputs_iter) + xb = next(inputs_iter) + kw = dict() + if op.p is not None: + kw["p"] = op.p + if op.w is not None: + kw["w"] = next(inputs_iter) + if op.v is not None: + kw["V"] = next(inputs_iter) + if op.vi is not None: + kw["VI"] = next(inputs_iter) + + ctx[op.outputs[0].key] = cdist( + ensure_own_data(xa), ensure_own_data(xb), metric=op.metric, **kw + ) + + +@require_module("scipy.spatial.distance") +def cdist(XA, XB, metric="euclidean", **kwargs): + """ + Compute distance between each pair of the two collections of inputs. + + See Notes for common calling conventions. + + Parameters + ---------- + XA : Tensor + An :math:`m_A` by :math:`n` tensor of :math:`m_A` + original observations in an :math:`n`-dimensional space. + Inputs are converted to float type. + XB : Tensor + An :math:`m_B` by :math:`n` tensor of :math:`m_B` + original observations in an :math:`n`-dimensional space. + Inputs are converted to float type. + metric : str or callable, optional + The distance metric to use. If a string, the distance function can be + 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', + 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon', + 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', + 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'wminkowski', 'yule'. + **kwargs : dict, optional + Extra arguments to `metric`: refer to each metric documentation for a + list of all possible arguments. + + Some possible arguments: + + p : scalar + The p-norm to apply for Minkowski, weighted and unweighted. + Default: 2. + + w : Tensor + The weight vector for metrics that support weights (e.g., Minkowski). + + V : Tensor + The variance vector for standardized Euclidean. + Default: var(vstack([XA, XB]), axis=0, ddof=1) + + VI : Tensor + The inverse of the covariance matrix for Mahalanobis. + Default: inv(cov(vstack([XA, XB].T))).T + + out : Tensor + The output tensor + If not None, the distance matrix Y is stored in this tensor. + Note: metric independent, it will become a regular keyword arg in a + future scipy version + + Returns + ------- + Y : Tensor + A :math:`m_A` by :math:`m_B` distance matrix is returned. + For each :math:`i` and :math:`j`, the metric + ``dist(u=XA[i], v=XB[j])`` is computed and stored in the + :math:`ij` th entry. + + Raises + ------ + ValueError + An exception is thrown if `XA` and `XB` do not have + the same number of columns. + + Notes + ----- + The following are common calling conventions: + + 1. 
``Y = cdist(XA, XB, 'euclidean')`` + + Computes the distance between :math:`m` points using + Euclidean distance (2-norm) as the distance metric between the + points. The points are arranged as :math:`m` + :math:`n`-dimensional row vectors in the matrix X. + + 2. ``Y = cdist(XA, XB, 'minkowski', p=2.)`` + + Computes the distances using the Minkowski distance + :math:`||u-v||_p` (:math:`p`-norm) where :math:`p \\geq 1`. + + 3. ``Y = cdist(XA, XB, 'cityblock')`` + + Computes the city block or Manhattan distance between the + points. + + 4. ``Y = cdist(XA, XB, 'seuclidean', V=None)`` + + Computes the standardized Euclidean distance. The standardized + Euclidean distance between two n-vectors ``u`` and ``v`` is + + .. math:: + + \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}. + + V is the variance vector; V[i] is the variance computed over all + the i'th components of the points. If not passed, it is + automatically computed. + + 5. ``Y = cdist(XA, XB, 'sqeuclidean')`` + + Computes the squared Euclidean distance :math:`||u-v||_2^2` between + the vectors. + + 6. ``Y = cdist(XA, XB, 'cosine')`` + + Computes the cosine distance between vectors u and v, + + .. math:: + + 1 - \\frac{u \\cdot v} + {{||u||}_2 {||v||}_2} + + where :math:`||*||_2` is the 2-norm of its argument ``*``, and + :math:`u \\cdot v` is the dot product of :math:`u` and :math:`v`. + + 7. ``Y = cdist(XA, XB, 'correlation')`` + + Computes the correlation distance between vectors u and v. This is + + .. math:: + + 1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})} + {{||(u - \\bar{u})||}_2 {||(v - \\bar{v})||}_2} + + where :math:`\\bar{v}` is the mean of the elements of vector v, + and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`. + + + 8. ``Y = cdist(XA, XB, 'hamming')`` + + Computes the normalized Hamming distance, or the proportion of + those vector elements between two n-vectors ``u`` and ``v`` + which disagree. To save memory, the matrix ``X`` can be of type + boolean. + + 9. ``Y = cdist(XA, XB, 'jaccard')`` + + Computes the Jaccard distance between the points. Given two + vectors, ``u`` and ``v``, the Jaccard distance is the + proportion of those elements ``u[i]`` and ``v[i]`` that + disagree where at least one of them is non-zero. + + 10. ``Y = cdist(XA, XB, 'chebyshev')`` + + Computes the Chebyshev distance between the points. The + Chebyshev distance between two n-vectors ``u`` and ``v`` is the + maximum norm-1 distance between their respective elements. More + precisely, the distance is given by + + .. math:: + + d(u,v) = \\max_i {|u_i-v_i|}. + + 11. ``Y = cdist(XA, XB, 'canberra')`` + + Computes the Canberra distance between the points. The + Canberra distance between two points ``u`` and ``v`` is + + .. math:: + + d(u,v) = \\sum_i \\frac{|u_i-v_i|} + {|u_i|+|v_i|}. + + 12. ``Y = cdist(XA, XB, 'braycurtis')`` + + Computes the Bray-Curtis distance between the points. The + Bray-Curtis distance between two points ``u`` and ``v`` is + + + .. math:: + + d(u,v) = \\frac{\\sum_i (|u_i-v_i|)} + {\\sum_i (|u_i+v_i|)} + + 13. ``Y = cdist(XA, XB, 'mahalanobis', VI=None)`` + + Computes the Mahalanobis distance between the points. The + Mahalanobis distance between two points ``u`` and ``v`` is + :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI`` + variable) is the inverse covariance. If ``VI`` is not None, + ``VI`` will be used as the inverse covariance matrix. + + 14. ``Y = cdist(XA, XB, 'yule')`` + + Computes the Yule distance between the boolean + vectors. (see `yule` function documentation) + + 15. 
``Y = cdist(XA, XB, 'matching')`` + + Synonym for 'hamming'. + + 16. ``Y = cdist(XA, XB, 'dice')`` + + Computes the Dice distance between the boolean vectors. (see + `dice` function documentation) + + 17. ``Y = cdist(XA, XB, 'kulsinski')`` + + Computes the Kulsinski distance between the boolean + vectors. (see `kulsinski` function documentation) + + 18. ``Y = cdist(XA, XB, 'rogerstanimoto')`` + + Computes the Rogers-Tanimoto distance between the boolean + vectors. (see `rogerstanimoto` function documentation) + + 19. ``Y = cdist(XA, XB, 'russellrao')`` + + Computes the Russell-Rao distance between the boolean + vectors. (see `russellrao` function documentation) + + 20. ``Y = cdist(XA, XB, 'sokalmichener')`` + + Computes the Sokal-Michener distance between the boolean + vectors. (see `sokalmichener` function documentation) + + 21. ``Y = cdist(XA, XB, 'sokalsneath')`` + + Computes the Sokal-Sneath distance between the vectors. (see + `sokalsneath` function documentation) + + + 22. ``Y = cdist(XA, XB, 'wminkowski', p=2., w=w)`` + + Computes the weighted Minkowski distance between the + vectors. (see `wminkowski` function documentation) + + 23. ``Y = cdist(XA, XB, f)`` + + Computes the distance between all pairs of vectors in X + using the user supplied 2-arity function f. For example, + Euclidean distance between the vectors could be computed + as follows:: + + dm = cdist(XA, XB, lambda u, v: np.sqrt(((u-v)**2).sum())) + + Note that you should avoid passing a reference to one of + the distance functions defined in this library. For example,:: + + dm = cdist(XA, XB, sokalsneath) + + would calculate the pair-wise distances between the vectors in + X using the Python function `sokalsneath`. This would result in + sokalsneath being called :math:`{n \\choose 2}` times, which + is inefficient. Instead, the optimized C version is more + efficient, and we call it using the following syntax:: + + dm = cdist(XA, XB, 'sokalsneath') + + Examples + -------- + Find the Euclidean distances between four 2-D coordinates: + + >>> from mars.tensor.spatial import distance + >>> coords = [(35.0456, -85.2672), + ... (35.1174, -89.9711), + ... (35.9728, -83.9422), + ... (36.1667, -86.7833)] + >>> distance.cdist(coords, coords, 'euclidean').execute() + array([[ 0. , 4.7044, 1.6172, 1.8856], + [ 4.7044, 0. , 6.0893, 3.3561], + [ 1.6172, 6.0893, 0. , 2.8477], + [ 1.8856, 3.3561, 2.8477, 0. ]]) + + + Find the Manhattan distance from a 3-D point to the corners of the unit + cube: + + >>> import mars.tensor as mt + >>> a = mt.array([[0, 0, 0], + ... [0, 0, 1], + ... [0, 1, 0], + ... [0, 1, 1], + ... [1, 0, 0], + ... [1, 0, 1], + ... [1, 1, 0], + ... [1, 1, 1]]) + >>> b = mt.array([[ 0.1, 0.2, 0.4]]) + >>> distance.cdist(a, b, 'cityblock').execute() + array([[ 0.7], + [ 0.9], + [ 1.3], + [ 1.5], + [ 1.5], + [ 1.7], + [ 2.1], + [ 2.3]]) + + """ + XA = astensor(XA, order="C") + XB = astensor(XB, order="C") + + if XA.issparse() or XB.issparse(): + raise ValueError("Sparse tensors are not supported by this function.") + + s = XA.shape + sB = XB.shape + + if len(s) != 2: + raise ValueError("XA must be a 2-dimensional array.") + if len(sB) != 2: + raise ValueError("XB must be a 2-dimensional array.") + if s[1] != sB[1]: + raise ValueError( + "XA and XB must have the same number of columns " + "(i.e. 
feature dimension.)" + ) + + mA = s[0] + mB = sB[0] + out = kwargs.pop("out", None) + if out is not None: + if not hasattr(out, "shape"): + raise TypeError("return arrays must be a tensor") + if out.shape != (mA, mB): + raise ValueError("Output tensor has incorrect shape.") + if out.dtype != np.double: + raise ValueError("Output tensor must be double type.") + + if not isinstance(metric, str) and not callable(metric): + raise TypeError( + "3rd argument metric must be a string identifier or a function." + ) + + # scipy remove "wminkowski" since v1.8.0, use "minkowski" with `w=` + # keyword-argument for the given weight. + if metric == "wminkowski": + metric = "minkowski" + + p = kwargs.pop("p", None) + w = kwargs.pop("w", None) + if w is not None: + w = astensor(w) + v = kwargs.pop("V", None) + if v is not None: + v = astensor(v) + vi = kwargs.pop("VI", None) + if vi is not None: + vi = astensor(vi) + + if len(kwargs) > 0: + raise TypeError( + f"`cdist` got an unexpected keyword argument '{next(iter(kwargs))}'" + ) + + op = TensorCdist(metric=metric, p=p, w=w, v=v, vi=vi, dtype=np.dtype(float)) + shape = (XA.shape[0], XB.shape[0]) + ret = op(XA, XB, shape) + + if out is None: + return ret + else: + out.data = ret.data + return out diff --git a/python/xorbits/_mars/tensor/spatial/distance/pdist.py b/python/xorbits/_mars/tensor/spatial/distance/pdist.py new file mode 100644 index 000000000..ea630b907 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/pdist.py @@ -0,0 +1,740 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Tuple + +import numpy as np + +from .... 
import opcodes as OperandDef +from ....config import options +from ....core import recursive_tile +from ....core.operand import OperandStage +from ....serialization.serializables import ( + AnyField, + FieldTypes, + Float16Field, + Int32Field, + KeyField, + TupleField, +) +from ....utils import ensure_own_data, has_unknown_shape, require_module +from ...array_utils import as_same_device, cp, device +from ...core import TensorOrder +from ...datasource.array import tensor as astensor +from ...operands import TensorMapReduceOperand, TensorOperandMixin, TensorShuffleProxy + + +class TensorPdist(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.PDIST + + _input = KeyField("input") + _metric = AnyField("metric") + _p = Float16Field("p", on_serialize=lambda x: float(x) if x is not None else x) + _w = KeyField("w") + _v = KeyField("V") + _vi = KeyField("VI") + _aggregate_size = Int32Field("aggregate_size") + + _a = KeyField("a") + _a_offset = Int32Field("a_offset") + _b = KeyField("b") + _b_offset = Int32Field("b_offset") + _out_sizes = TupleField("out_sizes", FieldTypes.int32) + _n = Int32Field("n") + + def __init__( + self, + metric=None, + p=None, + w=None, + v=None, + vi=None, + a=None, + a_offset=None, + b=None, + b_offset=None, + out_sizes=None, + n=None, + aggregate_size=None, + **kw, + ): + super().__init__( + _metric=metric, + _p=p, + _w=w, + _v=v, + _vi=vi, + _a=a, + _a_offset=a_offset, + _b=b, + _b_offset=b_offset, + _out_sizes=out_sizes, + _n=n, + _aggregate_size=aggregate_size, + **kw, + ) + + def _set_inputs(self, inputs: List) -> None: + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + + if self.stage == OperandStage.map: + self._a = next(inputs_iter) + if self._b is not None: + self._b = next(inputs_iter) + else: + self._input = next(inputs_iter) + + if self._w is not None: + self._w = next(inputs_iter) + if self._v is not None: + self._v = next(inputs_iter) + if self._vi is not None: + self._vi = next(inputs_iter) + + @property + def input(self): + return self._input + + @property + def metric(self): + return self._metric + + @property + def p(self): + return self._p + + @property + def w(self): + return self._w + + @property + def v(self): + return self._v + + @property + def vi(self): + return self._vi + + @property + def aggregate_size(self): + return self._aggregate_size + + @property + def a(self): + return self._a + + @property + def a_offset(self): + return self._a_offset + + @property + def b(self): + return self._b + + @property + def b_offset(self): + return self._b_offset + + @property + def out_sizes(self): + return self._out_sizes + + @property + def n(self): + return self._n + + def __call__(self, x, shape: Tuple): + inputs = [x] + for val in [self._w, self._v, self._vi]: + if val is not None: + inputs.append(val) + return self.new_tensor(inputs, shape=shape, order=TensorOrder.C_ORDER) + + @classmethod + def _tile_one_chunk(cls, op, in_tensor, w, v, vi): + out_tensor = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_inputs = [in_tensor.chunks[0]] + for val in [w, v, vi]: + if val is not None: + chunk_inputs.append(val.chunks[0]) + chunk = chunk_op.new_chunk( + chunk_inputs, + shape=out_tensor.shape, + order=out_tensor.order, + index=(0,) * out_tensor.ndim, + ) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + nsplits=tuple((s,) for s in out_tensor.shape), + chunks=[chunk], + ) + + @classmethod + def _tile_chunks(cls, op, in_tensor, w, v, vi): + out_tensor 
= op.outputs[0] + extra_inputs = [] + for val in [w, v, vi]: + if val is not None: + extra_inputs.append(val.chunks[0]) + + n = in_tensor.shape[0] + aggregate_size = op.aggregate_size + if aggregate_size is None: + aggregate_size = ( + np.ceil( + out_tensor.size + * out_tensor.dtype.itemsize + / options.chunk_store_limit + ) + .astype(int) + .item() + ) + out_sizes = [out_tensor.size // aggregate_size for _ in range(aggregate_size)] + for i in range(out_tensor.size % aggregate_size): + out_sizes[i] += 1 + + chunk_size = in_tensor.chunk_shape[0] + map_chunks = [] + axis_0_cum_size = np.cumsum(in_tensor.nsplits[0]).tolist() + for i in range(chunk_size): + for j in range(i, chunk_size): + kw = { + "stage": OperandStage.map, + "a": in_tensor.cix[i, 0], + "a_offset": axis_0_cum_size[i - 1] if i > 0 else 0, + "out_sizes": tuple(out_sizes), + "n": n, + "metric": op.metric, + "p": op.p, + "w": w.chunks[0] if w is not None else None, + "v": v.chunks[0] if v is not None else None, + "vi": vi.chunks[0] if vi is not None else None, + "dtype": out_tensor.dtype, + } + if i != j: + kw["b"] = in_tensor.cix[j, 0] + kw["b_offset"] = axis_0_cum_size[j - 1] if j > 0 else 0 + map_op = TensorPdist(**kw) + map_chunk_inputs = [kw["a"]] + if "b" in kw: + map_chunk_inputs.append(kw["b"]) + if kw["w"] is not None: + map_chunk_inputs.append(kw["w"]) + if kw["v"] is not None: + map_chunk_inputs.append(kw["v"]) + if kw["vi"] is not None: + map_chunk_inputs.append(kw["vi"]) + # calc chunk shape + if i == j: + a_axis_0_size = kw["a"].shape[0] + chunk_shape = (a_axis_0_size * (a_axis_0_size - 1) // 2,) + else: + chunk_shape = (kw["a"].shape[0] * kw["b"].shape[0],) + map_chunk = map_op.new_chunk( + map_chunk_inputs, + shape=chunk_shape, + order=out_tensor.order, + index=(i * chunk_size + j,), + ) + map_chunks.append(map_chunk) + + proxy_chunk = TensorShuffleProxy(dtype=out_tensor.dtype).new_chunk( + map_chunks, shape=() + ) + + reduce_chunks = [] + for p in range(aggregate_size): + reduce_chunk_op = TensorPdist( + stage=OperandStage.reduce, + dtype=out_tensor.dtype, + n_reducers=aggregate_size, + ) + reduce_chunk = reduce_chunk_op.new_chunk( + [proxy_chunk], shape=(out_sizes[p],), order=out_tensor.order, index=(p,) + ) + reduce_chunks.append(reduce_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + nsplits=(tuple(out_sizes),), + chunks=reduce_chunks, + ) + + @classmethod + def tile(cls, op): + # make sure every inputs have known shape + if has_unknown_shape(*op.inputs): + yield + + in_tensor = yield from recursive_tile(op.input.rechunk({1: op.input.shape[1]})) + # rechunk w, v, vi into one chunk if any of them has value + extra_inputs = [None] * 3 + for i, ei in enumerate([op.w, op.v, op.vi]): + if ei is None: + continue + new_ei = yield from recursive_tile(ei.rechunk(ei.shape)) + extra_inputs[i] = new_ei + w, v, vi = extra_inputs + + if len(in_tensor.chunks) == 1: + # only 1 chunk + return cls._tile_one_chunk(op, in_tensor, w, v, vi) + else: + return cls._tile_chunks(op, in_tensor, w, v, vi) + + @classmethod + def _execute_map(cls, ctx, op): + from scipy.spatial.distance import cdist, pdist + + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + if xp is cp: # pragma: no cover + raise NotImplementedError("`pdist` does not support running on GPU yet") + + with device(device_id): + inputs_iter = iter(inputs) + a = next(inputs_iter) + if op.b is not None: + b = next(inputs_iter) + 
else: + b = None + kw = dict() + if op.p is not None: + kw["p"] = op.p + if op.w is not None: + kw["w"] = next(inputs_iter) + if op.v is not None: + kw["V"] = next(inputs_iter) + if op.vi is not None: + kw["VI"] = next(inputs_iter) + metric = op.metric + + if b is None: + # one input, pdist on same chunk + dists = pdist(ensure_own_data(a), metric=metric, **kw) + i_indices, j_indices = xp.triu_indices(a.shape[0], k=1) + i_indices += op.a_offset + j_indices += op.a_offset + else: + # two inputs, pdist on different chunks + dists = cdist( + ensure_own_data(a), ensure_own_data(b), metric=metric, **kw + ).ravel() + mgrid = xp.mgrid[ + op.a_offset : op.a_offset + a.shape[0], + op.b_offset : op.b_offset + b.shape[0], + ] + i_indices, j_indices = mgrid[0].ravel(), mgrid[1].ravel() + + out_row_sizes = xp.arange(op.n - 1, -1, -1) + out_row_cum_sizes = xp.empty((op.n + 1,), dtype=int) + out_row_cum_sizes[0] = 0 + xp.cumsum(out_row_sizes, out=out_row_cum_sizes[1:]) + indices = ( + out_row_cum_sizes[i_indices] + + j_indices + - (op.n - out_row_sizes[i_indices]) + ) + + # save as much memory as possible + del i_indices, j_indices, out_row_sizes, out_row_cum_sizes + + out_cum_size = xp.cumsum(op.out_sizes) + out = op.outputs[0] + for i in range(len(op.out_sizes)): + start_index = out_cum_size[i - 1] if i > 0 else 0 + end_index = out_cum_size[i] + to_filter = (indices >= start_index) & (indices < end_index) + downside_indices = indices[to_filter] - start_index + downside_dists = dists[to_filter] + ctx[out.key, (i,)] = (downside_indices, downside_dists) + + @classmethod + def _execute_single(cls, ctx, op): + from scipy.spatial.distance import pdist + + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + if xp is cp: # pragma: no cover + raise NotImplementedError("`pdist` does not support running on GPU yet") + + with device(device_id): + inputs_iter = iter(inputs) + x = next(inputs_iter) + kw = dict() + if op.p is not None: + kw["p"] = op.p + if op.w is not None: + kw["w"] = next(inputs_iter) + if op.v is not None: + kw["V"] = next(inputs_iter) + if op.vi is not None: + kw["VI"] = next(inputs_iter) + + ctx[op.outputs[0].key] = pdist(ensure_own_data(x), metric=op.metric, **kw) + + @classmethod + def _execute_reduce(cls, ctx, op: "TensorPdist"): + raw_inputs = list(op.iter_mapper_data(ctx)) + raw_indices = [inp[0] for inp in raw_inputs] + raw_dists = [inp[1] for inp in raw_inputs] + inputs, device_id, xp = as_same_device( + raw_indices + raw_dists, op.device, ret_extra=True + ) + raw_indices = inputs[: len(raw_indices)] + raw_dists = inputs[len(raw_indices) :] + output = op.outputs[0] + + with device(device_id): + indices = xp.concatenate(raw_indices) + dists = xp.concatenate(raw_dists) + out_dists = xp.empty(output.shape, dtype=float) + out_dists[indices] = dists + ctx[output.key] = out_dists + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + cls._execute_reduce(ctx, op) + else: + cls._execute_single(ctx, op) + + +@require_module("scipy.spatial.distance") +def pdist(X, metric="euclidean", **kwargs): + """ + Pairwise distances between observations in n-dimensional space. + + See Notes for common calling conventions. + + Parameters + ---------- + X : Tensor + An m by n tensor of m original observations in an + n-dimensional space. + metric : str or function, optional + The distance metric to use. 
The distance function can + be 'braycurtis', 'canberra', 'chebyshev', 'cityblock', + 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', + 'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', 'matching', + 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', + 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'. + **kwargs : dict, optional + Extra arguments to `metric`: refer to each metric documentation for a + list of all possible arguments. + + Some possible arguments: + + p : scalar + The p-norm to apply for Minkowski, weighted and unweighted. + Default: 2. + + w : Tensor + The weight vector for metrics that support weights (e.g., Minkowski). + + V : Tensor + The variance vector for standardized Euclidean. + Default: var(X, axis=0, ddof=1) + + VI : Tensor + The inverse of the covariance matrix for Mahalanobis. + Default: inv(cov(X.T)).T + + out : Tensor. + The output tensor + If not None, condensed distance matrix Y is stored in this tensor. + Note: metric independent, it will become a regular keyword arg in a + future scipy version + + Returns + ------- + Y : Tensor + Returns a condensed distance matrix Y. For + each :math:`i` and :math:`j` (where :math:`i 0: + raise TypeError( + f"`pdist` got an unexpected keyword argument '{next(iter(kwargs))}'" + ) + + op = TensorPdist( + metric=metric, + p=p, + w=w, + v=v, + vi=vi, + aggregate_size=aggregate_size, + dtype=np.dtype(float), + ) + shape = (m * (m - 1) // 2,) + ret = op(X, shape) + + if out is None: + return ret + else: + out.data = ret.data + return out diff --git a/python/xorbits/_mars/tensor/spatial/distance/squareform.py b/python/xorbits/_mars/tensor/spatial/distance/squareform.py new file mode 100644 index 000000000..b55473fa7 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/squareform.py @@ -0,0 +1,452 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from .... 
import opcodes as OperandDef +from ....config import options +from ....core import recursive_tile +from ....core.operand import OperandStage +from ....serialization.serializables import BoolField, FieldTypes, KeyField, TupleField +from ....utils import has_unknown_shape, require_module +from ...arithmetic import equal +from ...array_utils import as_same_device, cp, device +from ...core import TensorOrder +from ...datasource import array, ascontiguousarray, zeros +from ...operands import TensorMapReduceOperand, TensorOperandMixin, TensorShuffleProxy +from ...utils import decide_chunk_sizes + + +class TensorSquareform(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.SQUAREFORM + + _input = KeyField("input") + _checks = BoolField("checks") + + _checks_input = KeyField("checks_input") + _x_shape = TupleField("x_shape", FieldTypes.int32) + _reduce_sizes = TupleField("reduce_sizes", FieldTypes.tuple) + _start_positions = TupleField("start_positions", FieldTypes.int32) + + def __init__( + self, + checks=None, + checks_input=None, + x_shape=None, + reduce_sizes=None, + start_positions=None, + **kw + ): + super().__init__( + _checks=checks, + _checks_input=checks_input, + _x_shape=x_shape, + _reduce_sizes=reduce_sizes, + _start_positions=start_positions, + **kw + ) + + @property + def input(self): + return self._input + + @property + def checks(self): + return self._checks + + @property + def checks_input(self): + return self._checks_input + + @property + def x_shape(self): + return self._x_shape + + @property + def reduce_sizes(self): + return self._reduce_sizes + + @property + def start_positions(self): + return self._start_positions + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if self._checks_input is not None: + self._checks_input = self._inputs[-1] + + def __call__(self, X, force="no", chunk_size=None): + s = X.shape + + if force.lower() == "tomatrix": + if len(s) != 1: + raise ValueError( + "Forcing 'tomatrix' but input X is not a distance vector." + ) + elif force.lower() == "tovector": + if len(s) != 2: + raise ValueError( + "Forcing 'tovector' but input X is not a distance matrix." + ) + + # X = squareform(v) + if len(s) == 1: + if s[0] == 0: + return zeros((1, 1), dtype=X.dtype) + + # Grab the closest value to the square root of the number + # of elements times 2 to see if the number of elements + # is indeed a binomial coefficient. + d = int(np.ceil(np.sqrt(s[0] * 2))) + + # Check that v is of valid dimensions. + if d * (d - 1) != s[0] * 2: + raise ValueError( + "Incompatible vector size. It must be a binomial " + "coefficient n choose 2 for some integer n >= 2." + ) + + shape = (d, d) + elif len(s) == 2: + if s[0] != s[1]: + raise ValueError("The matrix argument must be square.") + + # One-side of the dimensions is set here. + d = s[0] + + if d <= 1: + return array([], dtype=X.dtype) + + shape = ((d * (d - 1)) // 2,) + else: + raise ValueError( + ( + "The first argument must be one or two dimensional " + "tensor. 
A %d-dimensional tensor is not " + "permitted" + ) + % len(s) + ) + + return self.new_tensor( + [X], shape=shape, order=TensorOrder.C_ORDER, raw_chunk_size=chunk_size + ) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + chunk_size = decide_chunk_sizes(tensor.shape, chunk_size, tensor.dtype.itemsize) + n_chunk = np.product([len(cs) for cs in chunk_size]) + + if len(op.input.chunks) == 1 and n_chunk == 1: + return cls._tile_one_chunk(op) + else: + return (yield from cls._tile_chunks(op, chunk_size)) + + @classmethod + def _tile_one_chunk(cls, op): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk = chunk_op.new_chunk( + op.input.chunks, shape=out.shape, order=out.order, index=(0,) * out.ndim + ) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=[chunk], + nsplits=tuple((s,) for s in out.shape), + ) + + @classmethod + def _gen_checks_input(cls, op): + if op.input.ndim != 2 or not op.checks: + return + + x = op.input + ret = yield from recursive_tile(equal(x, x.T).all()) + return ret.chunks[0] + + @classmethod + def _tile_chunks(cls, op, chunk_size): + if has_unknown_shape(*op.inputs): + yield + out = op.outputs[0] + + checks_input = yield from cls._gen_checks_input(op) + + map_chunks = [] + cum_sizes = [[0] + np.cumsum(ns).tolist() for ns in op.input.nsplits] + to_vec = op.input.ndim == 2 + for in_chunk in op.input.chunks: + if to_vec and in_chunk.index[0] > in_chunk.index[1]: + # if apply squareform to 2-d tensor which is symmetric, + # we don't need to calculate for lower triangle chunks + continue + map_chunk_op = TensorSquareform( + stage=OperandStage.map, + checks_input=checks_input, + reduce_sizes=chunk_size, + x_shape=op.input.shape, + start_positions=tuple( + cum_sizes[ax][j] for ax, j in enumerate(in_chunk.index) + ), + dtype=out.dtype, + gpu=out.op.gpu, + ) + chunk_inputs = [in_chunk] + if checks_input is not None: + chunk_inputs.append(checks_input) + map_chunk = map_chunk_op.new_chunk( + chunk_inputs, shape=(2, np.nan), index=in_chunk.index, order=out.order + ) + map_chunks.append(map_chunk) + + proxy_chunk = TensorShuffleProxy(dtype=out.dtype).new_chunk( + map_chunks, shape=() + ) + + reduce_chunks = [] + out_shape_iter = itertools.product(*chunk_size) + out_indices = list(itertools.product(*(range(len(cs)) for cs in chunk_size))) + for out_idx, out_shape in zip(out_indices, out_shape_iter): + reduce_chunk_op = TensorSquareform( + stage=OperandStage.reduce, + dtype=out.dtype, + n_reducers=len(out_indices), + ) + reduce_chunk = reduce_chunk_op.new_chunk( + [proxy_chunk], shape=out_shape, index=out_idx, order=out.order + ) + reduce_chunks.append(reduce_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + nsplits=chunk_size, + chunks=reduce_chunks, + ) + + @classmethod + def _to_matrix(cls, ctx, xp, x, op): + assert x.ndim == 1 + out_chunk_size = op.reduce_sizes + out_shape = tuple(sum(ns) for ns in out_chunk_size) + d = out_shape[0] + + # calculate the index for the 1-d chunk + index = xp.arange(x.shape[0]) + index = xp.add(index, op.start_positions[0], out=index) + + # input length for each row + row_sizes = xp.arange(d, -1, -1) + row_sizes[0] = 0 + xp.cumsum(row_sizes[1:], out=row_sizes[1:]) + # calculate row for each element + rows = xp.searchsorted(row_sizes, index, side="right") + xp.subtract(rows, 1, out=rows) + # calculate col for each element + # offsets 
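# after the in-place cumsum above, row_sizes[i] holds the condensed-vector
# position where row i starts, so the searchsorted call maps each condensed
# index k back to its row i; the column is then recovered as
#   j = (k - row_sizes[i]) + (i + 1)
# because the first entry of row i corresponds to the pair (i, i + 1).
# cols_offsets[i] = i + 1 supplies that "+ (i + 1)" term element-wise.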
+ cols_offsets = xp.arange(1, d + 1) + cols = xp.empty(x.shape, dtype=np.int32) + xp.add( + xp.subtract(index, row_sizes[rows], out=cols), cols_offsets[rows], out=cols + ) + + cum_sizes = [[0] + np.cumsum(cs).tolist() for cs in out_chunk_size] + for idx in itertools.product(*(range(len(ns)) for ns in out_chunk_size)): + i, j = idx + row_range = cum_sizes[0][i], cum_sizes[0][i + 1] + col_range = cum_sizes[1][j], cum_sizes[1][j + 1] + # for upper + filtered = ( + (rows >= row_range[0]) + & (rows < row_range[1]) + & (cols >= col_range[0]) + & (cols < col_range[1]) + ) + inds_tup = rows[filtered] - row_range[0], cols[filtered] - col_range[0] + upper_inds = xp.ravel_multi_index( + inds_tup, (out_chunk_size[0][i], out_chunk_size[1][j]) + ) + upper_values = x[filtered] + # for lower + filtered = ( + (rows >= col_range[0]) + & (rows < col_range[1]) + & (cols >= row_range[0]) + & (cols < row_range[1]) + ) + inds_tup = cols[filtered] - row_range[0], rows[filtered] - col_range[0] + lower_inds = xp.ravel_multi_index( + inds_tup, (out_chunk_size[0][i], out_chunk_size[1][j]) + ) + lower_values = x[filtered] + + inds = xp.concatenate([upper_inds, lower_inds]) + values = xp.concatenate([upper_values, lower_values]) + + ctx[op.outputs[0].key, idx] = inds, values + + @classmethod + def _to_vector(cls, ctx, xp, x, op): + out_chunk_size = op.reduce_sizes + start_poses = op.start_positions + + i_indices, j_indices = xp.mgrid[ + start_poses[0] : start_poses[0] + x.shape[0], + start_poses[1] : start_poses[1] + x.shape[1], + ] + filtered = i_indices < j_indices + i_indices, j_indices, x = i_indices[filtered], j_indices[filtered], x[filtered] + + d = op.x_shape[0] + row_sizes = xp.arange(d - 1, -1, -1) + row_cum_sizes = xp.empty((d + 1,), dtype=int) + row_cum_sizes[0] = 0 + xp.cumsum(row_sizes, out=row_cum_sizes[1:]) + to_indices = row_cum_sizes[i_indices] + j_indices - (d - row_sizes[i_indices]) + + cum_chunk_size = [0] + np.cumsum(out_chunk_size).tolist() + for i in range(len(out_chunk_size[0])): + index_range = cum_chunk_size[i], cum_chunk_size[i + 1] + filtered = (to_indices >= index_range[0]) & (to_indices < index_range[1]) + out_indices = to_indices[filtered] - cum_chunk_size[i] + ctx[op.outputs[0].key, (i,)] = out_indices, x[filtered] + + @classmethod + def _execute_map(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + if len(inputs) == 2 and not inputs[1]: + # check fail + raise ValueError("Distance matrix X must be symmetric.") + + if xp is cp: # pragma: no cover + raise NotImplementedError( + "`squareform` does not support running on GPU yet" + ) + + with device(device_id): + x = inputs[0] + if x.ndim == 1: + cls._to_matrix(ctx, xp, x, op) + else: + cls._to_vector(ctx, xp, x, op) + + @classmethod + def _execute_reduce(cls, ctx, op: "TensorSquareform"): + raw_inputs = list(op.iter_mapper_data(ctx)) + raw_indices = [inp[0] for inp in raw_inputs] + raw_dists = [inp[1] for inp in raw_inputs] + inputs, device_id, xp = as_same_device( + raw_indices + raw_dists, op.device, ret_extra=True + ) + raw_indices = inputs[: len(raw_indices)] + raw_dists = inputs[len(raw_indices) :] + output = op.outputs[0] + + with device(device_id): + out_dists = xp.zeros(output.shape, dtype=output.dtype) + indices = xp.concatenate(raw_indices) + dists = xp.concatenate(raw_dists) + out_dists.flat[indices] = dists + ctx[output.key] = out_dists + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) 
+ elif op.stage == OperandStage.reduce: + cls._execute_reduce(ctx, op) + else: + from scipy.spatial.distance import squareform + + (x,), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + if xp is cp: # pragma: no cover + raise NotImplementedError( + "`squareform` does not support running on GPU yet" + ) + + with device(device_id): + ctx[op.outputs[0].key] = squareform(x, checks=op.checks) + + +@require_module("scipy.spatial.distance") +def squareform(X, force="no", checks=True, chunk_size=None): + """ + Convert a vector-form distance vector to a square-form distance + matrix, and vice-versa. + + Parameters + ---------- + X : Tensor + Either a condensed or redundant distance matrix. + force : str, optional + As with MATLAB(TM), if force is equal to ``'tovector'`` or + ``'tomatrix'``, the input will be treated as a distance matrix or + distance vector respectively. + checks : bool, optional + If set to False, no checks will be made for matrix + symmetry nor zero diagonals. This is useful if it is known that + ``X - X.T1`` is small and ``diag(X)`` is close to zero. + These values are ignored any way so they do not disrupt the + squareform transformation. + + Returns + ------- + Y : Tensor + If a condensed distance matrix is passed, a redundant one is + returned, or if a redundant one is passed, a condensed distance + matrix is returned. + + Notes + ----- + 1. v = squareform(X) + + Given a square d-by-d symmetric distance matrix X, + ``v = squareform(X)`` returns a ``d * (d-1) / 2`` (or + :math:`{n \\choose 2}`) sized vector v. + + :math:`v[{n \\choose 2}-{n-i \\choose 2} + (j-i-1)]` is the distance + between points i and j. If X is non-square or asymmetric, an error + is returned. + + 2. X = squareform(v) + + Given a ``d*(d-1)/2`` sized v for some integer ``d >= 2`` encoding + distances as described, ``X = squareform(v)`` returns a d by d distance + matrix X. The ``X[i, j]`` and ``X[j, i]`` values are set to + :math:`v[{n \\choose 2}-{n-i \\choose 2} + (j-i-1)]` and all + diagonal elements are zero. + + """ + + X = ascontiguousarray(X) + + op = TensorSquareform(checks=checks, dtype=X.dtype, gpu=X.op.gpu) + return op(X, force=force, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/spatial/distance/tests/__init__.py b/python/xorbits/_mars/tensor/spatial/distance/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance.py b/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance.py new file mode 100644 index 000000000..a71e130a1 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance.py @@ -0,0 +1,160 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from .....core import tile +from ....datasource import tensor +from ... import distance + + +def test_pdist(): + raw = np.random.rand(100, 10) + + # test 1 chunk + a = tensor(raw, chunk_size=100) + dist = distance.pdist(a) + assert dist.shape == (100 * 99 // 2,) + + dist = tile(dist) + assert len(dist.chunks) == 1 + for c in dist.chunks: + assert c.shape == (dist.shape[0],) + + # test multiple chunks + a = tensor(raw, chunk_size=15) + dist = distance.pdist(a, aggregate_size=2) + assert dist.shape == (100 * 99 // 2,) + + dist = tile(dist) + assert len(dist.chunks) == 2 + for c in dist.chunks: + assert c.shape == (dist.shape[0] // 2,) + + # X cannot be sparse + raw = sps.csr_matrix(np.zeros((4, 3))) + a = tensor(raw) + with pytest.raises(ValueError): + distance.pdist(a) + + # X can only be 2-d + with pytest.raises(ValueError): + distance.pdist(np.random.rand(3, 3, 3)) + + # out type wrong + with pytest.raises(TypeError): + distance.pdist(np.random.rand(3, 3), out=2) + + # out shape wrong + with pytest.raises(ValueError): + distance.pdist(np.random.rand(3, 3), out=tensor(np.random.rand(2))) + + # out dtype wrong + with pytest.raises(ValueError): + distance.pdist( + np.random.rand(3, 3), out=tensor(np.random.randint(2, size=(3,))) + ) + + # test extra param + with pytest.raises(TypeError): + distance.pdist(np.random.rand(3, 3), unknown_kw="unknown_kw") + + +def test_cdist(): + raw_a = np.random.rand(100, 10) + raw_b = np.random.rand(90, 10) + + # test 1 chunk + a = tensor(raw_a, chunk_size=100) + b = tensor(raw_b, chunk_size=100) + dist = distance.cdist(a, b) + assert dist.shape == (100, 90) + + dist = tile(dist) + assert len(dist.chunks) == 1 + for c in dist.chunks: + assert c.shape == dist.shape + + # test multiple chunks + a = tensor(raw_a, chunk_size=15) + b = tensor(raw_b, chunk_size=16) + dist = distance.cdist(a, b) + assert dist.shape == (100, 90) + + ta, tb, dist = tile(a, b, dist) + assert len(dist.chunks) == (100 // 15 + 1) * (90 // 16 + 1) + assert dist.nsplits == (ta.nsplits[0], tb.nsplits[0]) + for c in dist.chunks: + assert c.shape == ( + ta.cix[c.index[0], 0].shape[0], + tb.cix[c.index[1], 0].shape[0], + ) + + # XA can only be 2-d + with pytest.raises(ValueError): + distance.cdist(np.random.rand(3, 3, 3), np.random.rand(3, 3)) + + # XB can only be 2-d + with pytest.raises(ValueError): + distance.cdist(np.random.rand(3, 3), np.random.rand(3, 3, 3)) + + # XA cannot be sparse + raw = sps.csr_matrix(np.zeros((4, 3))) + a = tensor(raw) + with pytest.raises(ValueError): + distance.cdist(a, np.random.rand(10, 3)) + + # XB cannot be sparse + raw = sps.csr_matrix(np.zeros((4, 3))) + b = tensor(raw) + with pytest.raises(ValueError): + distance.cdist(np.random.rand(10, 3), b) + + # out type wrong + with pytest.raises(TypeError): + distance.cdist(raw_a, raw_b, out=2) + + # out shape wrong + with pytest.raises(ValueError): + distance.cdist(raw_a, raw_b, 
out=tensor(np.random.rand(100, 91))) + + # out dtype wrong + with pytest.raises(ValueError): + distance.cdist(raw_a, raw_b, out=tensor(np.random.randint(2, size=(100, 90)))) + + # test extra param + with pytest.raises(TypeError): + distance.cdist(raw_a, raw_b, unknown_kw="unknown_kw") + + +def test_squareform(): + assert distance.squareform(np.array([], dtype=float)).shape == (1, 1) + assert distance.squareform(np.atleast_2d(np.random.rand())).shape == (0,) + + with pytest.raises(ValueError): + distance.squareform(np.random.rand(3, 3), force="tomatrix") + + with pytest.raises(ValueError): + distance.squareform(np.random.rand(3), force="tovector") + + with pytest.raises(ValueError): + distance.squareform(np.random.rand(3, 3, 3)) + + with pytest.raises(ValueError): + distance.squareform(np.random.rand(2, 4)) + + with pytest.raises(ValueError): + distance.squareform(np.random.rand(7)) diff --git a/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance_execution.py b/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance_execution.py new file mode 100644 index 000000000..51ccfd0e1 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance_execution.py @@ -0,0 +1,238 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from .....core import tile +from ....datasource import tensor +from ... 
import distance + + +@pytest.mark.skipif(distance.pdist is None, reason="scipy not installed") +def test_pdist_execution(setup): + from scipy.spatial.distance import pdist as sp_pdist + + raw = np.random.rand(100, 10) + + # test 1 chunk + x = tensor(raw, chunk_size=100) + + dist = distance.pdist(x) + result = dist.execute().fetch() + expected = sp_pdist(raw) + np.testing.assert_array_equal(result, expected) + + dist = distance.pdist(x, metric="hamming") + result = dist.execute().fetch() + expected = sp_pdist(raw, metric="hamming") + np.testing.assert_array_equal(result, expected) + + f = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + dist = distance.pdist(x, metric=f) + result = dist.execute().fetch() + expected = sp_pdist(raw, metric=f) + np.testing.assert_array_equal(result, expected) + + # test more than 1 chunk + x = tensor(raw, chunk_size=12) + + dist = distance.pdist(x) + tdist = tile(dist) + assert len(tdist.chunks) == 1 + result = dist.execute().fetch() + expected = sp_pdist(raw) + np.testing.assert_array_equal(result, expected) + + dist = distance.pdist(x, aggregate_size=3) + tdist = tile(dist) + assert len(tdist.chunks) == 3 + result = dist.execute().fetch() + expected = sp_pdist(raw) + np.testing.assert_array_equal(result, expected) + + dist = distance.pdist(x, metric="hamming", aggregate_size=2) + tdist = tile(dist) + assert len(tdist.chunks) == 2 + result = dist.execute().fetch() + expected = sp_pdist(raw, metric="hamming") + np.testing.assert_array_equal(result, expected) + + f = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + dist = distance.pdist(x, metric=f, aggregate_size=2) + result = dist.execute().fetch() + expected = sp_pdist(raw, metric=f) + np.testing.assert_array_equal(result, expected) + + for x in [tensor(raw), tensor(raw, chunk_size=12)]: + # test w + weight = np.random.rand(10) + w = tensor(weight, chunk_size=7) + dist = distance.pdist(x, metric="wminkowski", p=3, w=w) + result = dist.execute().fetch() + expected = sp_pdist(raw, metric="minkowski", p=3, w=weight) + np.testing.assert_array_equal(result, expected) + + # test V + v = np.random.rand(10) + V = tensor(v, chunk_size=7) + dist = distance.pdist(x, metric="seuclidean", V=V) + result = dist.execute().fetch() + expected = sp_pdist(raw, metric="seuclidean", V=v) + np.testing.assert_array_equal(result, expected) + + # test VI + vi = np.random.rand(10, 10) + VI = tensor(vi, chunk_size=8) + dist = distance.pdist(x, metric="mahalanobis", VI=VI) + result = dist.execute().fetch() + expected = sp_pdist(raw, metric="mahalanobis", VI=vi) + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.skipif(distance.cdist is None, reason="scipy not installed") +def test_cdist_execution(setup): + from scipy.spatial.distance import cdist as sp_cdist + + raw_a = np.random.rand(100, 10) + raw_b = np.random.rand(89, 10) + + # test 1 chunk + xa = tensor(raw_a, chunk_size=100) + xb = tensor(raw_b, chunk_size=100) + + dist = distance.cdist(xa, xb) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b) + np.testing.assert_array_equal(result, expected) + + dist = distance.cdist(xa, xb, metric="hamming") + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric="hamming") + np.testing.assert_array_equal(result, expected) + + f = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + dist = distance.cdist(xa, xb, metric=f) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric=f) + np.testing.assert_array_equal(result, expected) + + # test more than 1 chunk + xa = tensor(raw_a, 
chunk_size=12) + xb = tensor(raw_b, chunk_size=13) + + dist = distance.cdist(xa, xb) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b) + np.testing.assert_array_equal(result, expected) + + dist = distance.cdist(xa, xb, metric="hamming") + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric="hamming") + np.testing.assert_array_equal(result, expected) + + f = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + dist = distance.cdist(xa, xb, metric=f) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric=f) + np.testing.assert_array_equal(result, expected) + + for xa, xb in [ + (tensor(raw_a), tensor(raw_b)), + (tensor(raw_a, chunk_size=12), tensor(raw_b, chunk_size=13)), + ]: + # test w + weight = np.random.rand(10) + w = tensor(weight, chunk_size=7) + dist = distance.cdist(xa, xb, metric="wminkowski", p=3, w=w) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric="minkowski", p=3, w=weight) + np.testing.assert_array_equal(result, expected) + + # test V + v = np.random.rand(10) + V = tensor(v, chunk_size=7) + dist = distance.cdist(xa, xb, metric="seuclidean", V=V) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric="seuclidean", V=v) + np.testing.assert_array_equal(result, expected) + + # test VI + vi = np.random.rand(10, 10) + VI = tensor(vi, chunk_size=8) + dist = distance.cdist(xa, xb, metric="mahalanobis", VI=VI) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric="mahalanobis", VI=vi) + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.skipif(distance.cdist is None, reason="scipy not installed") +def test_squareform_execution(setup): + from scipy.spatial.distance import pdist as sp_pdist + from scipy.spatial.distance import squareform as sp_squareform + + raw_a = np.random.rand(80, 10) + raw_pdsit = sp_pdist(raw_a) + raw_square = sp_squareform(raw_pdsit) + + # tomatrix, test 1 chunk + vec = tensor(raw_pdsit, chunk_size=raw_pdsit.shape[0]) + mat = distance.squareform(vec, chunk_size=100) + result = mat.execute().fetch() + np.testing.assert_array_equal(result, raw_square) + + # tomatrix, test more than 1 chunk + vec = tensor(raw_pdsit, chunk_size=33) + assert len(tile(vec).chunks) > 1 + mat = distance.squareform(vec, chunk_size=34) + result = mat.execute().fetch() + np.testing.assert_array_equal(result, raw_square) + + # tovec, test 1 chunk + mat = tensor(raw_square) + vec = distance.squareform(mat, chunk_size=raw_pdsit.shape[0]) + assert len(tile(mat).chunks) == 1 + assert len(tile(vec).chunks) == 1 + result = vec.execute().fetch() + np.testing.assert_array_equal(result, raw_pdsit) + + # tovec, test more than 1 chunk + mat = tensor(raw_square, chunk_size=31) + vec = distance.squareform(mat, chunk_size=40) + assert len(tile(vec).chunks) > 1 + result = vec.execute().fetch() + np.testing.assert_array_equal(result, raw_pdsit) + + # test checks + # generate non-symmetric matrix + non_sym_arr = np.random.RandomState(0).rand(10, 10) + + # 1 chunk + mat = tensor(non_sym_arr) + vec = distance.squareform(mat, checks=True, chunk_size=100) + with pytest.raises(ValueError): + _ = vec.execute().fetch() + # force checks=False + vec = distance.squareform(mat, checks=False, chunk_size=100) + _ = vec.execute().fetch() + + # more than 1 chunk + mat = tensor(non_sym_arr, chunk_size=6) + vec = distance.squareform(mat, checks=True, chunk_size=8) + assert len(tile(vec).chunks) > 1 + with pytest.raises(ValueError): + _ = vec.execute().fetch() + # force checks=False 
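# with checks=False no symmetry-check chunk (equal(mat, mat.T).all()) is
# attached to the graph, so execution succeeds even though non_sym_arr is
# not symmetric; this mirrors scipy.spatial.distance.squareform(..., checks=False)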
+ vec = distance.squareform(mat, checks=False, chunk_size=100) + _ = vec.execute().fetch() diff --git a/python/xorbits/_mars/tensor/special/__init__.py b/python/xorbits/_mars/tensor/special/__init__.py new file mode 100644 index 000000000..e1644e19b --- /dev/null +++ b/python/xorbits/_mars/tensor/special/__init__.py @@ -0,0 +1,167 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + import scipy + + from .airy import TensorAiry, TensorAirye, TensorItairy, airy, airye, itairy + from .bessel import ( + TensorHankel1, + TensorHankel1e, + TensorHankel2, + TensorHankel2e, + TensorIV, + TensorIVE, + TensorJV, + TensorJVE, + TensorKN, + TensorKV, + TensorKVE, + TensorYN, + TensorYV, + TensorYVE, + hankel1, + hankel1e, + hankel2, + hankel2e, + iv, + ive, + jv, + jve, + kn, + kv, + kve, + yn, + yv, + yve, + ) + from .convenience import TensorXLogY, xlogy + from .ellip_func_integrals import ( + TensorEllipe, + TensorEllipeinc, + TensorEllipk, + TensorEllipkinc, + TensorEllipkm1, + TensorElliprc, + TensorElliprd, + TensorElliprf, + TensorElliprg, + TensorElliprj, + ellipe, + ellipeinc, + ellipk, + ellipkinc, + ellipkm1, + elliprc, + elliprd, + elliprf, + elliprg, + elliprj, + ) + from .ellip_harm import ( + TensorEllipHarm, + TensorEllipHarm2, + TensorEllipNormal, + ellip_harm, + ellip_harm_2, + ellip_normal, + ) + from .err_fresnel import ( + TensorDawsn, + TensorErf, + TensorErfc, + TensorErfcinv, + TensorErfcx, + TensorErfi, + TensorErfinv, + TensorFresnel, + TensorModFresnelM, + TensorModFresnelP, + TensorVoigtProfile, + TensorWofz, + dawsn, + erf, + erfc, + erfcinv, + erfcx, + erfi, + erfinv, + fresnel, + modfresnelm, + modfresnelp, + voigt_profile, + wofz, + ) + from .gamma_funcs import ( + TensorBeta, + TensorBetaInc, + TensorBetaIncInv, + TensorBetaLn, + TensorDiGamma, + TensorGamma, + TensorGammaInc, + TensorGammaIncc, + TensorGammaInccInv, + TensorGammaIncInv, + TensorGammaln, + TensorGammaSgn, + TensorLogGamma, + TensorMultiGammaLn, + TensorPoch, + TensorPolyGamma, + TensorPsi, + TensorRGamma, + beta, + betainc, + betaincinv, + betaln, + digamma, + gamma, + gammainc, + gammaincc, + gammainccinv, + gammaincinv, + gammaln, + gammasgn, + loggamma, + multigammaln, + poch, + polygamma, + psi, + rgamma, + ) + from .hypergeometric_funcs import ( + TensorHYP0F1, + TensorHYP1F1, + TensorHYP2F1, + TensorHYPERU, + hyp0f1, + hyp1f1, + hyp2f1, + hyperu, + ) + from .info_theory import ( + TensorEntr, + TensorKlDiv, + TensorRelEntr, + entr, + kl_div, + rel_entr, + ) +except ImportError: # pragma: no cover + pass + +_names_to_del = [_name for _name, _val in globals().items() if _val is None] +[globals().pop(_name) for _name in _names_to_del] +del _names_to_del diff --git a/python/xorbits/_mars/tensor/special/airy.py b/python/xorbits/_mars/tensor/special/airy.py new file mode 100644 index 000000000..6b9d916b7 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/airy.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..utils import implement_scipy, infer_dtype +from .core import TensorTupleOp, _register_special_op + + +@_register_special_op +class TensorAiry(TensorTupleOp): + _func_name = "airy" + _n_outputs = 4 + + +@implement_scipy(spspecial.airy) +@infer_dtype(spspecial.airy, multi_outputs=True) +def airy(z, out=None, **kwargs): + op = TensorAiry(**kwargs) + return op(z, out=out) + + +@_register_special_op +class TensorAirye(TensorTupleOp): + _func_name = "airye" + _n_outputs = 4 + + +@implement_scipy(spspecial.airye) +@infer_dtype(spspecial.airye, multi_outputs=True) +def airye(z, out=None, **kwargs): + op = TensorAirye(**kwargs) + return op(z, out=out) + + +@_register_special_op +class TensorItairy(TensorTupleOp): + _func_name = "itairy" + _n_outputs = 4 + + +@implement_scipy(spspecial.itairy) +@infer_dtype(spspecial.itairy, multi_outputs=True) +def itairy(x, out=None, **kwargs): + op = TensorItairy(**kwargs) + return op(x, out=out) diff --git a/python/xorbits/_mars/tensor/special/bessel.py b/python/xorbits/_mars/tensor/special/bessel.py new file mode 100644 index 000000000..edf64890e --- /dev/null +++ b/python/xorbits/_mars/tensor/special/bessel.py @@ -0,0 +1,201 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
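The Bessel wrappers in this module, like the Airy ones above, do no math themselves: each operand only stores a ``_func_name`` that the special-op mixin resolves to the same-named ``scipy.special`` routine at execution time (or to the ``cupyx.scipy.special`` / sparse variant on other devices). A minimal sketch of the eager SciPy calls these operands reproduce lazily, assuming only that NumPy and SciPy are installed::

    import numpy as np
    import scipy.special as spspecial

    v = np.array([0.0, 1.0, 2.0])
    z = np.array([0.5, 1.5, 2.5])

    # element-wise Bessel function of the first kind; TensorJV evaluates
    # exactly this, chunk by chunk, on tensor inputs
    jv_vals = spspecial.jv(v, z)

    # airy returns a 4-tuple (Ai, Ai', Bi, Bi'), which is why TensorAiry
    # declares _n_outputs = 4 and yields four output tensors
    ai, aip, bi, bip = spspecial.airy(z)
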
+ +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import TensorSpecialBinOp, _register_special_op + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorJV(TensorSpecialBinOp): + _func_name = "jv" + + +@implement_scipy(spspecial.jv) +@infer_dtype(spspecial.jv) +def jv(v, z, **kwargs): + op = TensorJV(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorJVE(TensorSpecialBinOp): + _func_name = "jve" + + +@implement_scipy(spspecial.jve) +@infer_dtype(spspecial.jve) +def jve(v, z, **kwargs): + op = TensorJVE(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorYN(TensorSpecialBinOp): + _func_name = "yn" + + +@implement_scipy(spspecial.yn) +@infer_dtype(spspecial.yn) +def yn(n, x, **kwargs): + op = TensorYN(**kwargs) + return op(n, x) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorYV(TensorSpecialBinOp): + _func_name = "yv" + + +@implement_scipy(spspecial.yv) +@infer_dtype(spspecial.yv) +def yv(v, z, **kwargs): + op = TensorYV(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorYVE(TensorSpecialBinOp): + _func_name = "yve" + + +@implement_scipy(spspecial.yve) +@infer_dtype(spspecial.yve) +def yve(v, z, **kwargs): + op = TensorYVE(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorKN(TensorSpecialBinOp): + _func_name = "kn" + + +@implement_scipy(spspecial.kn) +@infer_dtype(spspecial.kn) +def kn(n, x, **kwargs): + op = TensorKN(**kwargs) + return op(n, x) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorKV(TensorSpecialBinOp): + _func_name = "kv" + + +@implement_scipy(spspecial.kv) +@infer_dtype(spspecial.kv) +def kv(v, z, **kwargs): + op = TensorKV(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorKVE(TensorSpecialBinOp): + _func_name = "kve" + + +@implement_scipy(spspecial.kve) +@infer_dtype(spspecial.kve) +def kve(v, z, **kwargs): + op = TensorKVE(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorIV(TensorSpecialBinOp): + _func_name = "iv" + + +@implement_scipy(spspecial.iv) +@infer_dtype(spspecial.iv) +def iv(v, z, **kwargs): + op = TensorIV(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorIVE(TensorSpecialBinOp): + _func_name = "ive" + + +@implement_scipy(spspecial.ive) +@infer_dtype(spspecial.ive) +def ive(v, z, **kwargs): + op = TensorIVE(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorHankel1(TensorSpecialBinOp): + _func_name = "hankel1" + + +@implement_scipy(spspecial.hankel1) +@infer_dtype(spspecial.hankel1) +def hankel1(v, z, **kwargs): + op = TensorHankel1(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorHankel1e(TensorSpecialBinOp): + _func_name = "hankel1e" + + +@implement_scipy(spspecial.hankel1e) +@infer_dtype(spspecial.hankel1e) +def hankel1e(v, z, **kwargs): + op = TensorHankel1e(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class 
TensorHankel2(TensorSpecialBinOp): + _func_name = "hankel2" + + +@implement_scipy(spspecial.hankel2) +@infer_dtype(spspecial.hankel2) +def hankel2(v, z, **kwargs): + op = TensorHankel2(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorHankel2e(TensorSpecialBinOp): + _func_name = "hankel2e" + + +@implement_scipy(spspecial.hankel2e) +@infer_dtype(spspecial.hankel2e) +def hankel2e(v, z, **kwargs): + op = TensorHankel2e(**kwargs) + return op(v, z) diff --git a/python/xorbits/_mars/tensor/special/convenience.py b/python/xorbits/_mars/tensor/special/convenience.py new file mode 100644 index 000000000..a5ad51d01 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/convenience.py @@ -0,0 +1,36 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..utils import implement_scipy, infer_dtype +from .core import TensorSpecialBinOp, _register_special_op + + +@_register_special_op +class TensorXLogY(TensorSpecialBinOp): + _func_name = "xlogy" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@implement_scipy(spspecial.xlogy) +@infer_dtype(spspecial.xlogy) +def xlogy(x1, x2, out=None, where=None, **kwargs): + op = TensorXLogY(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/special/core.py b/python/xorbits/_mars/tensor/special/core.py new file mode 100644 index 000000000..3f19a0f52 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/core.py @@ -0,0 +1,172 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ... 
import opcodes +from ...core import ExecutableTuple +from ..arithmetic.core import TensorBinOp, TensorMultiOp, TensorUnaryOp +from ..array_utils import ( + as_same_device, + convert_order, + cp, + device, + issparse, + np, + sparse, +) +from ..datasource import tensor as astensor + +_func_name_to_special_cls = {} + + +def _register_special_op(cls): + if cls._func_name is not None: + _func_name_to_special_cls[cls._func_name] = cls + return cls + + +class TensorSpecialOperandMixin: + _op_code_ = opcodes.SPECIAL + _func_name = None + + def __new__(cls, *args, **kwargs): + if cls._func_name is not None: + return object.__new__(_func_name_to_special_cls[cls._func_name]) + return super().__new__(cls, *args, **kwargs) + + @classmethod + def _get_func(cls, xp): + if xp is np: + from scipy import special + + return getattr(special, cls._func_name) + elif cp is not None and xp is cp: + from cupyx.scipy import special + + return getattr(special, cls._func_name) + else: + assert xp is sparse + return getattr(sparse, cls._func_name) + + +class TensorSpecialUnaryOp(TensorSpecialOperandMixin, TensorUnaryOp): + pass + + +class TensorSpecialBinOp(TensorSpecialOperandMixin, TensorBinOp): + pass + + +class TensorSpecialMultiOp(TensorSpecialOperandMixin, TensorMultiOp): + @classmethod + def _execute_gpu(cls, op, xp, *args, **kw): + if kw.get("out") is not None: + kw["out"] = xp.asarray(kw["out"]) + r = cls._get_func(xp)(*args, **kw) + return convert_order(r, op.outputs[0].order.value) + + @classmethod + def _execute_cpu(cls, op, xp, *args, **kw): + kw["order"] = op.order + if kw.get("out") is not None: + kw["out"] = np.asarray(kw["out"]) + try: + return cls._get_func(xp)(*args, **kw) + except TypeError: + kw.pop("order") + r = cls._get_func(xp)(*args, **kw) + if issparse(r): + return r + return convert_order(r, op.outputs[0].order.value) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = {"casting": op.casting} if op.out is not None else {} + + inputs_iter = iter(inputs) + args = [a if np.isscalar(a) else next(inputs_iter) for a in op.args] + if op.out is not None: + kw["out"] = next(inputs_iter).copy() + + with np.errstate(**op.err): + if op.is_gpu(): + ret = cls._execute_gpu(op, xp, *args, **kw) + else: + ret = cls._execute_cpu(op, xp, *args, **kw) + + if ret.dtype != op.dtype: + ret = ret.astype(op.dtype) + ctx[op.outputs[0].key] = ret + + +class TensorTupleOp(TensorSpecialUnaryOp): + @property + def output_limit(self): + return self._n_outputs + + def __call__(self, x, out=None): + x = astensor(x) + + if out is not None: + if not isinstance(out, ExecutableTuple): + raise TypeError( + f"out should be ExecutableTuple object, got {type(out)} instead" + ) + if len(out) != self._n_outputs: + raise TypeError( + f"out should be an ExecutableTuple object with {self._n_outputs} elements, got {len(out)} instead" + ) + + func = getattr(spspecial, self._func_name) + res = func(np.ones(x.shape, dtype=x.dtype)) + res_tensors = self.new_tensors( + [x], + kws=[ + { + "side": f"{self._func_name}[{i}]", + "dtype": output.dtype, + "shape": output.shape, + } + for i, output in enumerate(res) + ], + ) + + if out is None: + return ExecutableTuple(res_tensors) + + for res_tensor, out_tensor in zip(res_tensors, out): + out_tensor.data = res_tensor.data + return out + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], 
device=op.device, ret_extra=True + ) + + with device(device_id): + with np.errstate(**op.err): + if op.is_gpu(): + ret = cls._execute_gpu(op, xp, inputs[0]) + else: + ret = cls._execute_cpu(op, xp, inputs[0]) + + for output, ret_element in zip(op.outputs, ret): + ctx[output.key] = ret_element diff --git a/python/xorbits/_mars/tensor/special/ellip_func_integrals.py b/python/xorbits/_mars/tensor/special/ellip_func_integrals.py new file mode 100644 index 000000000..761a20cf8 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/ellip_func_integrals.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import ( + TensorSpecialBinOp, + TensorSpecialMultiOp, + TensorSpecialUnaryOp, + _register_special_op, +) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorEllipk(TensorSpecialUnaryOp): + _func_name = "ellipk" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorEllipkm1(TensorSpecialUnaryOp): + _func_name = "ellipkm1" + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorEllipkinc(TensorSpecialBinOp): + _func_name = "ellipkinc" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorEllipe(TensorSpecialUnaryOp): + _func_name = "ellipe" + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorEllipeinc(TensorSpecialBinOp): + _func_name = "ellipeinc" + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorElliprc(TensorSpecialBinOp): + _func_name = "elliprc" + + +@_register_special_op +class TensorElliprd(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "elliprd" + + +@_register_special_op +class TensorElliprf(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "elliprf" + + +@_register_special_op +class TensorElliprg(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "elliprg" + + +@_register_special_op +class TensorElliprj(TensorSpecialMultiOp): + _ARG_COUNT = 4 + _func_name = "elliprj" + + +@implement_scipy(spspecial.ellipk) +@infer_dtype(spspecial.ellipk) +def ellipk(x, **kwargs): + op = TensorEllipk(**kwargs) + return op(x) + + +@implement_scipy(spspecial.ellipkm1) +@infer_dtype(spspecial.ellipkm1) +def ellipkm1(x, **kwargs): + op = TensorEllipkm1(**kwargs) + return op(x) + + +@implement_scipy(spspecial.ellipkinc) +@infer_dtype(spspecial.ellipkinc) +def ellipkinc(phi, m, **kwargs): + op = TensorEllipkinc(**kwargs) + return op(phi, m) + + +@implement_scipy(spspecial.ellipe) +@infer_dtype(spspecial.ellipe) +def ellipe(x, **kwargs): + op = TensorEllipe(**kwargs) + return op(x) + + +@implement_scipy(spspecial.ellipeinc) +@infer_dtype(spspecial.ellipeinc) +def ellipeinc(phi, m, **kwargs): + op = TensorEllipeinc(**kwargs) + return op(phi, m) + + +try: + + @implement_scipy(spspecial.elliprc) + 
@infer_dtype(spspecial.elliprc) + def elliprc(x, y, **kwargs): + op = TensorElliprc(**kwargs) + return op(x, y) + + @implement_scipy(spspecial.elliprd) + @infer_dtype(spspecial.elliprd) + def elliprd(x, y, z, **kwargs): + op = TensorElliprd(**kwargs) + return op(x, y, z) + + @implement_scipy(spspecial.elliprf) + @infer_dtype(spspecial.elliprf) + def elliprf(x, y, z, **kwargs): + op = TensorElliprf(**kwargs) + return op(x, y, z) + + @implement_scipy(spspecial.elliprg) + @infer_dtype(spspecial.elliprg) + def elliprg(x, y, z, **kwargs): + op = TensorElliprg(**kwargs) + return op(x, y, z) + + @implement_scipy(spspecial.elliprj) + @infer_dtype(spspecial.elliprj) + def elliprj(x, y, z, p, **kwargs): + op = TensorElliprj(**kwargs) + return op(x, y, z, p) + +except AttributeError: + # These functions are not implemented before scipy v1.8 so + # spsecial.func may cause AttributeError + elliprc = elliprd = elliprf = elliprg = elliprj = None diff --git a/python/xorbits/_mars/tensor/special/ellip_harm.py b/python/xorbits/_mars/tensor/special/ellip_harm.py new file mode 100644 index 000000000..e53e2bbf6 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/ellip_harm.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..utils import implement_scipy, infer_dtype +from .core import TensorSpecialMultiOp, _register_special_op + + +@_register_special_op +class TensorEllipHarm(TensorSpecialMultiOp): + _ARG_COUNT = 5 + _func_name = "ellip_harm" + + +@implement_scipy(spspecial.ellip_harm) +@infer_dtype(spspecial.ellip_harm) +def ellip_harm(h2, k2, n, p, s, signm=1, signn=1, **kwargs): + op = TensorEllipHarm(**kwargs) + return op(h2, k2, n, p, s, signm, signn) + + +@_register_special_op +class TensorEllipHarm2(TensorSpecialMultiOp): + _ARG_COUNT = 5 + _func_name = "ellip_harm_2" + + +@implement_scipy(spspecial.ellip_harm_2) +@infer_dtype(spspecial.ellip_harm_2) +def ellip_harm_2(h2, k2, n, p, s, **kwargs): + op = TensorEllipHarm2(**kwargs) + return op(h2, k2, n, p, s) + + +@_register_special_op +class TensorEllipNormal(TensorSpecialMultiOp): + _ARG_COUNT = 4 + _func_name = "ellip_normal" + + +@implement_scipy(spspecial.ellip_normal) +@infer_dtype(spspecial.ellip_normal) +def ellip_normal(h2, k2, n, p, **kwargs): + op = TensorEllipNormal(**kwargs) + return op(h2, k2, n, p) diff --git a/python/xorbits/_mars/tensor/special/err_fresnel.py b/python/xorbits/_mars/tensor/special/err_fresnel.py new file mode 100644 index 000000000..53ca312eb --- /dev/null +++ b/python/xorbits/_mars/tensor/special/err_fresnel.py @@ -0,0 +1,225 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import ( + TensorSpecialMultiOp, + TensorSpecialUnaryOp, + TensorTupleOp, + _register_special_op, +) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErf(TensorSpecialUnaryOp): + _func_name = "erf" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErfc(TensorSpecialUnaryOp): + _func_name = "erfc" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErfcx(TensorSpecialUnaryOp): + _func_name = "erfcx" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErfi(TensorSpecialUnaryOp): + _func_name = "erfi" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErfinv(TensorSpecialUnaryOp): + _func_name = "erfinv" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErfcinv(TensorSpecialUnaryOp): + _func_name = "erfcinv" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorWofz(TensorSpecialUnaryOp): + _func_name = "wofz" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorDawsn(TensorSpecialUnaryOp): + _func_name = "dawsn" + + +@_register_special_op +class TensorFresnel(TensorTupleOp): + _func_name = "fresnel" + _n_outputs = 2 + + +@_register_special_op +class TensorModFresnelP(TensorTupleOp): + _func_name = "modfresnelp" + _n_outputs = 2 + + +@_register_special_op +class TensorModFresnelM(TensorTupleOp): + _func_name = "modfresnelm" + _n_outputs = 2 + + +@_register_special_op +class TensorVoigtProfile(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "voigt_profile" + + +@implement_scipy(spspecial.erf) +@infer_dtype(spspecial.erf) +def erf(x, out=None, where=None, **kwargs): + """ + Returns the error function of complex argument. + + It is defined as ``2/sqrt(pi)*integral(exp(-t**2), t=0..z)``. + + Parameters + ---------- + x : Tensor + Input tensor. + + Returns + ------- + res : Tensor + The values of the error function at the given points `x`. + + See Also + -------- + erfc, erfinv, erfcinv, wofz, erfcx, erfi + + Notes + ----- + The cumulative of the unit normal distribution is given by + ``Phi(z) = 1/2[1 + erf(z/sqrt(2))]``. + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/Error_function + .. [2] Milton Abramowitz and Irene A. Stegun, eds. + Handbook of Mathematical Functions with Formulas, + Graphs, and Mathematical Tables. New York: Dover, + 1972. http://www.math.sfu.ca/~cbm/aands/page_297.htm + .. [3] Steven G. Johnson, Faddeeva W function implementation. 
+ http://ab-initio.mit.edu/Faddeeva + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.tensor import special + >>> import matplotlib.pyplot as plt + >>> x = mt.linspace(-3, 3) + >>> plt.plot(x, special.erf(x)) + >>> plt.xlabel('$x$') + >>> plt.ylabel('$erf(x)$') + >>> plt.show() + """ + op = TensorErf(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.erfc) +@infer_dtype(spspecial.erfc) +def erfc(x, out=None, where=None, **kwargs): + op = TensorErfc(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.erfcx) +@infer_dtype(spspecial.erfcx) +def erfcx(x, out=None, where=None, **kwargs): + op = TensorErfcx(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.erfi) +@infer_dtype(spspecial.erfi) +def erfi(x, out=None, where=None, **kwargs): + op = TensorErfi(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.erfinv) +@infer_dtype(spspecial.erfinv) +def erfinv(x, out=None, where=None, **kwargs): + op = TensorErfinv(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.erfcinv) +@infer_dtype(spspecial.erfcinv) +def erfcinv(x, out=None, where=None, **kwargs): + op = TensorErfcinv(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.wofz) +@infer_dtype(spspecial.wofz) +def wofz(x, out=None, where=None, **kwargs): + op = TensorWofz(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.dawsn) +@infer_dtype(spspecial.dawsn) +def dawsn(x, out=None, where=None, **kwargs): + op = TensorDawsn(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.fresnel) +@infer_dtype(spspecial.fresnel, multi_outputs=True) +def fresnel(x, out=None, **kwargs): + op = TensorFresnel(**kwargs) + return op(x, out=out) + + +@implement_scipy(spspecial.modfresnelp) +@infer_dtype(spspecial.modfresnelp, multi_outputs=True) +def modfresnelp(x, out=None, **kwargs): + op = TensorModFresnelP(**kwargs) + return op(x, out=out) + + +@implement_scipy(spspecial.modfresnelm) +@infer_dtype(spspecial.modfresnelm, multi_outputs=True) +def modfresnelm(x, out=None, **kwargs): + op = TensorModFresnelM(**kwargs) + return op(x, out=out) + + +@implement_scipy(spspecial.voigt_profile) +@infer_dtype(spspecial.voigt_profile) +def voigt_profile(x, sigma, gamma, **kwargs): + op = TensorVoigtProfile(**kwargs) + return op(x, sigma, gamma) diff --git a/python/xorbits/_mars/tensor/special/gamma_funcs.py b/python/xorbits/_mars/tensor/special/gamma_funcs.py new file mode 100644 index 000000000..2ebe19efa --- /dev/null +++ b/python/xorbits/_mars/tensor/special/gamma_funcs.py @@ -0,0 +1,305 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
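+# Illustrative overview (added commentary; names marked "hypothetical" are not
+# part of this module): every scipy.special routine wrapped below follows the
+# same pattern -- an operand class registered via ``_register_special_op`` plus
+# a thin user-facing function whose ``implement_scipy``/``infer_dtype``
+# decorators reuse the scipy routine for documentation and output-dtype
+# inference. Roughly:
+#
+#     @_register_special_op
+#     @arithmetic_operand(sparse_mode="unary")
+#     class TensorMyFunc(TensorSpecialUnaryOp):
+#         _func_name = "myfunc"              # hypothetical scipy.special name
+#
+#     @implement_scipy(spspecial.myfunc)     # hypothetical
+#     @infer_dtype(spspecial.myfunc)
+#     def myfunc(x, **kwargs):
+#         return TensorMyFunc(**kwargs)(x)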
+ +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import ( + TensorSpecialBinOp, + TensorSpecialMultiOp, + TensorSpecialUnaryOp, + _register_special_op, +) + + +class NoOrderSpecialMixin: + @classmethod + def _get_func(cls, xp): + func = super()._get_func(xp) + + def _wrapped(*args, **kw): + kw.pop("order", None) + return func(*args, **kw) + + return _wrapped + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorGamma(TensorSpecialUnaryOp): + _func_name = "gamma" + + +@implement_scipy(spspecial.gamma) +@infer_dtype(spspecial.gamma) +def gamma(x, **kwargs): + op = TensorGamma(**kwargs) + return op(x) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorGammaln(TensorSpecialUnaryOp): + _func_name = "gammaln" + + +@implement_scipy(spspecial.gammaln) +@infer_dtype(spspecial.gammaln) +def gammaln(x, out=None, where=None, **kwargs): + """ + Logarithm of the absolute value of the Gamma function. + + Parameters + ---------- + x : array-like + Values on the real line at which to compute ``gammaln`` + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + gammaln : Tensor + Values of ``gammaln`` at x. + + See Also + -------- + gammasgn : sign of the gamma function + loggamma : principal branch of the logarithm of the gamma function + + Notes + ----- + When used in conjunction with `gammasgn`, this function is useful + for working in logspace on the real axis without having to deal with + complex numbers, via the relation ``exp(gammaln(x)) = gammasgn(x)*gamma(x)``. + + For complex-valued log-gamma, use `loggamma` instead of `gammaln`. 
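+
+    Examples
+    --------
+    A small illustrative example (``gammaln(1) = 0`` and
+    ``gammaln(0.5) = log(sqrt(pi))``):
+
+    >>> import mars.tensor as mt
+    >>> from mars.tensor import special
+    >>> special.gammaln(mt.tensor([1.0, 0.5, 10.0])).execute()  # approx. [0., 0.5724, 12.8018]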
+ """ + op = TensorGammaln(**kwargs) + return op(x, out=out, where=where) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorLogGamma(TensorSpecialUnaryOp): + _func_name = "loggamma" + + +@implement_scipy(spspecial.loggamma) +@infer_dtype(spspecial.loggamma) +def loggamma(x, **kwargs): + op = TensorLogGamma(**kwargs) + return op(x) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorGammaSgn(TensorSpecialUnaryOp): + _func_name = "gammasgn" + + +@implement_scipy(spspecial.gammasgn) +@infer_dtype(spspecial.gammasgn) +def gammasgn(x, **kwargs): + op = TensorGammaSgn(**kwargs) + return op(x) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorGammaInc(TensorSpecialBinOp): + _func_name = "gammainc" + + +@implement_scipy(spspecial.gammainc) +@infer_dtype(spspecial.gammainc) +def gammainc(a, b, **kwargs): + op = TensorGammaInc(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorGammaIncInv(TensorSpecialBinOp): + _func_name = "gammaincinv" + + +@implement_scipy(spspecial.gammaincinv) +@infer_dtype(spspecial.gammaincinv) +def gammaincinv(a, b, **kwargs): + op = TensorGammaIncInv(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorGammaIncc(TensorSpecialBinOp): + _func_name = "gammaincc" + + +@implement_scipy(spspecial.gammainc) +@infer_dtype(spspecial.gammainc) +def gammaincc(a, b, **kwargs): + op = TensorGammaIncc(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorGammaInccInv(TensorSpecialBinOp): + _func_name = "gammainccinv" + + +@implement_scipy(spspecial.gammainccinv) +@infer_dtype(spspecial.gammainccinv) +def gammainccinv(a, b, **kwargs): + op = TensorGammaInccInv(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorBeta(TensorSpecialBinOp): + _func_name = "beta" + + +@implement_scipy(spspecial.beta) +@infer_dtype(spspecial.beta) +def beta(a, b, out=None, **kwargs): + op = TensorBeta(**kwargs) + return op(a, b, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorBetaLn(TensorSpecialBinOp): + _func_name = "betaln" + + +@implement_scipy(spspecial.betaln) +@infer_dtype(spspecial.betaln) +def betaln(a, b, out=None, **kwargs): + op = TensorBetaLn(**kwargs) + return op(a, b, out=out) + + +@_register_special_op +class TensorBetaInc(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "betainc" + + +@implement_scipy(spspecial.betainc) +@infer_dtype(spspecial.betainc) +def betainc(a, b, x, out=None, **kwargs): + op = TensorBetaInc(**kwargs) + return op(a, b, x, out=out) + + +@_register_special_op +class TensorBetaIncInv(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "betaincinv" + + +@implement_scipy(spspecial.betaincinv) +@infer_dtype(spspecial.betaincinv) +def betaincinv(a, b, y, out=None, **kwargs): + op = TensorBetaIncInv(**kwargs) + return op(a, b, y, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorPsi(TensorSpecialUnaryOp): + _func_name = "psi" + + +@implement_scipy(spspecial.psi) +@infer_dtype(spspecial.psi) +def psi(x, out=None, **kwargs): + op = TensorPsi(**kwargs) + return op(x, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorRGamma(TensorSpecialUnaryOp): + _func_name = "rgamma" + + 
+@implement_scipy(spspecial.rgamma) +@infer_dtype(spspecial.rgamma) +def rgamma(x, out=None, **kwargs): + op = TensorRGamma(**kwargs) + return op(x, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorPolyGamma(NoOrderSpecialMixin, TensorSpecialBinOp): + _func_name = "polygamma" + + +@implement_scipy(spspecial.polygamma) +@infer_dtype(spspecial.polygamma) +def polygamma(a, b, **kwargs): + op = TensorPolyGamma(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorMultiGammaLn(NoOrderSpecialMixin, TensorSpecialBinOp): + _func_name = "multigammaln" + + +@implement_scipy(spspecial.multigammaln) +@infer_dtype(spspecial.multigammaln) +def multigammaln(a, b, **kwargs): + op = TensorMultiGammaLn(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorDiGamma(TensorSpecialUnaryOp): + _func_name = "digamma" + + +@implement_scipy(spspecial.digamma) +@infer_dtype(spspecial.digamma) +def digamma(x, out=None, **kwargs): + op = TensorDiGamma(**kwargs) + return op(x, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorPoch(TensorSpecialBinOp): + _func_name = "poch" + + +@implement_scipy(spspecial.poch) +@infer_dtype(spspecial.poch) +def poch(a, b, **kwargs): + op = TensorPoch(**kwargs) + return op(a, b) diff --git a/python/xorbits/_mars/tensor/special/hypergeometric_funcs.py b/python/xorbits/_mars/tensor/special/hypergeometric_funcs.py new file mode 100644 index 000000000..af9774b94 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/hypergeometric_funcs.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
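+# Illustrative note (added commentary): hyp2f1, hyp1f1 and hyperu take more than
+# two arguments, so they are modeled on ``TensorSpecialMultiOp`` with an explicit
+# ``_ARG_COUNT``, while the two-argument hyp0f1 reuses the binary-operand
+# machinery. A minimal usage sketch, assuming a running Mars session:
+#
+#     >>> import mars.tensor as mt
+#     >>> from mars.tensor import special
+#     >>> a, b, x = mt.tensor([0.5]), mt.tensor([1.5]), mt.tensor([0.25])
+#     >>> special.hyp1f1(a, b, x).execute()   # matches scipy.special.hyp1f1(0.5, 1.5, 0.25)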
+ +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import TensorSpecialBinOp, TensorSpecialMultiOp, _register_special_op + + +@_register_special_op +class TensorHYP2F1(TensorSpecialMultiOp): + _ARG_COUNT = 4 + _func_name = "hyp2f1" + + +@implement_scipy(spspecial.hyp2f1) +@infer_dtype(spspecial.hyp2f1) +def hyp2f1(a, b, c, z, **kwargs): + op = TensorHYP2F1(**kwargs) + return op(a, b, c, z) + + +@_register_special_op +class TensorHYP1F1(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "hyp1f1" + + +@implement_scipy(spspecial.hyp1f1) +@infer_dtype(spspecial.hyp1f1) +def hyp1f1(a, b, x, out=None, **kwargs): + op = TensorHYP1F1(**kwargs) + return op(a, b, x, out=out) + + +@_register_special_op +class TensorHYPERU(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "hyperu" + + +@implement_scipy(spspecial.hyperu) +@infer_dtype(spspecial.hyperu) +def hyperu(a, b, x, out=None, **kwargs): + op = TensorHYPERU(**kwargs) + return op(a, b, x, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorHYP0F1(TensorSpecialBinOp): + _func_name = "hyp0f1" + + +@implement_scipy(spspecial.hyp0f1) +@infer_dtype(spspecial.hyp0f1) +def hyp0f1(v, z, out=None, **kwargs): + op = TensorHYP0F1(**kwargs) + return op(v, z, out=out) diff --git a/python/xorbits/_mars/tensor/special/info_theory.py b/python/xorbits/_mars/tensor/special/info_theory.py new file mode 100644 index 000000000..5cc67b1a1 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/info_theory.py @@ -0,0 +1,191 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import TensorSpecialBinOp, TensorSpecialUnaryOp, _register_special_op + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorEntr(TensorSpecialUnaryOp): + _func_name = "entr" + + +@implement_scipy(spspecial.entr) +@infer_dtype(spspecial.entr) +def entr(x, out=None, where=None, **kwargs): + r""" + Elementwise function for computing entropy. + + .. math:: \text{entr}(x) = \begin{cases} - x \log(x) & x > 0 \\ 0 & x = 0 \\ -\infty & \text{otherwise} \end{cases} + + Parameters + ---------- + x : Tensor + Input tensor. + + Returns + ------- + res : Tensor + The value of the elementwise entropy function at the given points `x`. + + See Also + -------- + kl_div, rel_entr + + Notes + ----- + This function is concave. 
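+
+    Examples
+    --------
+    A small illustrative example (values follow from ``-x*log(x)``):
+
+    >>> import mars.tensor as mt
+    >>> from mars.tensor import special
+    >>> special.entr(mt.tensor([0.5, 1.0, 2.0])).execute()  # approx. [0.3466, 0., -1.3863]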
+ """ + op = TensorEntr(**kwargs) + return op(x, out=out, where=where) + + +@_register_special_op +class TensorRelEntr(TensorSpecialBinOp): + _func_name = "rel_entr" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@implement_scipy(spspecial.rel_entr) +@infer_dtype(spspecial.rel_entr) +def rel_entr(x, y, out=None, where=None, **kwargs): + r""" + Elementwise function for computing relative entropy. + + .. math:: + + \mathrm{rel\_entr}(x, y) = + \begin{cases} + x \log(x / y) & x > 0, y > 0 \\ + 0 & x = 0, y \ge 0 \\ + \infty & \text{otherwise} + \end{cases} + + Parameters + ---------- + x, y : array_like + Input arrays + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + scalar or ndarray + Relative entropy of the inputs + + See Also + -------- + entr, kl_div + + Notes + ----- + This function is jointly convex in x and y. + + The origin of this function is in convex programming; see + [1]_. Given two discrete probability distributions :math:`p_1, + \ldots, p_n` and :math:`q_1, \ldots, q_n`, to get the relative + entropy of statistics compute the sum + + .. math:: + + \sum_{i = 1}^n \mathrm{rel\_entr}(p_i, q_i). + + See [2]_ for details. + + References + ---------- + .. [1] Grant, Boyd, and Ye, "CVX: Matlab Software for Disciplined Convex + Programming", http://cvxr.com/cvx/ + .. [2] Kullback-Leibler divergence, + https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence + """ + op = TensorRelEntr(**kwargs) + return op(x, y, out=out, where=where) + + +@_register_special_op +class TensorKlDiv(TensorSpecialBinOp): + _func_name = "kl_div" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@implement_scipy(spspecial.kl_div) +@infer_dtype(spspecial.kl_div) +def kl_div(x, y, out=None, where=None, **kwargs): + r""" + Elementwise function for computing relative entropy. + + .. math:: + + \mathrm{rel\_entr}(x, y) = + \begin{cases} + x \log(x / y) & x > 0, y > 0 \\ + 0 & x = 0, y \ge 0 \\ + \infty & \text{otherwise} + \end{cases} + + Parameters + ---------- + x, y : array_like + Input arrays + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + scalar or ndarray + Relative entropy of the inputs + + See Also + -------- + entr, kl_div + + Notes + ----- + This function is jointly convex in x and y. + + The origin of this function is in convex programming; see + [1]_. Given two discrete probability distributions :math:`p_1, + \ldots, p_n` and :math:`q_1, \ldots, q_n`, to get the relative + entropy of statistics compute the sum + + .. math:: + + \sum_{i = 1}^n \mathrm{rel\_entr}(p_i, q_i). + + See [2]_ for details. + + References + ---------- + .. [1] Grant, Boyd, and Ye, "CVX: Matlab Software for Disciplined Convex + Programming", http://cvxr.com/cvx/ + .. [2] Kullback-Leibler divergence, + https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence + """ + op = TensorKlDiv(**kwargs) + return op(x, y, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/special/tests/__init__.py b/python/xorbits/_mars/tensor/special/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/special/tests/test_special.py b/python/xorbits/_mars/tensor/special/tests/test_special.py new file mode 100644 index 000000000..a75f7bd44 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/tests/test_special.py @@ -0,0 +1,321 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy +import scipy.special as spsecial + +from ....core import ExecutableTuple, tile +from ....lib.version import parse as parse_version +from ... import special as mt_special +from ... import tensor +from ..airy import TensorAiry, TensorAirye, TensorItairy +from ..ellip_func_integrals import ( + TensorEllipe, + TensorEllipeinc, + TensorEllipk, + TensorEllipkinc, + TensorEllipkm1, + TensorElliprc, + TensorElliprd, + TensorElliprf, + TensorElliprg, + TensorElliprj, +) +from ..err_fresnel import ( + TensorDawsn, + TensorErf, + TensorErfc, + TensorErfcinv, + TensorErfcx, + TensorErfi, + TensorErfinv, + TensorFresnel, + TensorModFresnelM, + TensorModFresnelP, + TensorVoigtProfile, + TensorWofz, +) +from ..gamma_funcs import TensorBetaInc, TensorGammaln + + +@pytest.mark.parametrize( + "func,tensor_cls", + [ + ("gammaln", TensorGammaln), + ("erf", TensorErf), + ("erfinv", TensorErfinv), + ("erfcinv", TensorErfcinv), + ("wofz", TensorWofz), + ("dawsn", TensorDawsn), + ("ellipk", TensorEllipk), + ("ellipkm1", TensorEllipkm1), + ("ellipe", TensorEllipe), + ("erfc", TensorErfc), + ("erfcx", TensorErfcx), + ("erfi", TensorErfi), + ], +) +def test_unary_operand_no_out(func, tensor_cls): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw = np.random.rand(10, 8, 5) + t = tensor(raw, chunk_size=3) + + r = mt_func(t) + expect = sp_func(raw) + + assert r.shape == raw.shape + assert r.dtype == expect.dtype + + t, r = tile(t, r) + + assert r.nsplits == t.nsplits + for c in r.chunks: + assert isinstance(c.op, tensor_cls) + assert c.index == c.inputs[0].index + assert c.shape == c.inputs[0].shape + + +@pytest.mark.parametrize( + "func,tensor_cls", + [ + ("erfc", TensorErfc), + ("erfcx", TensorErfcx), + ("erfi", TensorErfi), + ], +) +def test_unary_operand_out(func, tensor_cls): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw = np.random.rand(10, 8, 5) + t = tensor(raw, chunk_size=3) + + out = tensor(raw, chunk_size=3) + r_with_optional = mt_func(t, out) + expect = sp_func(raw) + + assert out.shape == raw.shape 
+ assert out.dtype == expect.dtype + + assert r_with_optional.shape == raw.shape + assert r_with_optional.dtype == expect.dtype + + t_optional_out, out = tile(t, out) + + assert out.nsplits == t_optional_out.nsplits + for c in out.chunks: + assert isinstance(c.op, tensor_cls) + assert c.index == c.inputs[0].index + assert c.shape == c.inputs[0].shape + + t_optional_r, r_with_optional = tile(t, r_with_optional) + + assert r_with_optional.nsplits == t_optional_r.nsplits + for c in r_with_optional.chunks: + assert isinstance(c.op, tensor_cls) + assert c.index == c.inputs[0].index + assert c.shape == c.inputs[0].shape + + +@pytest.mark.parametrize( + "func,tensor_cls,n_outputs", + [ + ("fresnel", TensorFresnel, 2), + ("modfresnelp", TensorModFresnelP, 2), + ("modfresnelm", TensorModFresnelM, 2), + ("airy", TensorAiry, 4), + ("airye", TensorAirye, 4), + ("itairy", TensorItairy, 4), + ], +) +def test_unary_tuple_operand(func, tensor_cls, n_outputs): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw = np.random.rand(10, 8, 5) + t = tensor(raw, chunk_size=3) + + r = mt_func(t) + expect = sp_func(raw) + + assert isinstance(r, ExecutableTuple) + + for r_i, expect_i in zip(r, expect): + assert r_i.shape == expect_i.shape + assert r_i.dtype == expect_i.dtype + assert isinstance(r_i.op, tensor_cls) + + non_tuple_out = tensor(raw, chunk_size=3) + with pytest.raises(TypeError): + r = mt_func(t, non_tuple_out) + + mismatch_size_tuple = ExecutableTuple([t]) + with pytest.raises(TypeError): + r = mt_func(t, mismatch_size_tuple) + + out = ExecutableTuple([t] * n_outputs) + r_out = mt_func(t, out=out) + + assert isinstance(out, ExecutableTuple) + assert isinstance(r_out, ExecutableTuple) + + for r_output, expected_output, out_output in zip(r, expect, out): + assert r_output.shape == expected_output.shape + assert r_output.dtype == expected_output.dtype + assert isinstance(r_output.op, tensor_cls) + + assert out_output.shape == expected_output.shape + assert out_output.dtype == expected_output.dtype + assert isinstance(out_output.op, tensor_cls) + + +@pytest.mark.parametrize( + "func,tensor_cls", + [ + ("betainc", TensorBetaInc), + ("voigt_profile", TensorVoigtProfile), + pytest.param( + "elliprd", + TensorElliprd, + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + pytest.param( + "elliprf", + TensorElliprf, + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + pytest.param( + "elliprg", + TensorElliprg, + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_triple_operand(func, tensor_cls): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + raw3 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + c = tensor(raw3, chunk_size=3) + + r = mt_func(a, b, c) + expect = sp_func(raw1, raw2, raw3) + + assert r.shape == raw1.shape + assert r.dtype == expect.dtype + + tiled_a, r = tile(a, r) + + assert r.nsplits == tiled_a.nsplits + for chunk in r.chunks: + assert isinstance(chunk.op, tensor_cls) + assert chunk.index == chunk.inputs[0].index + assert chunk.shape == chunk.inputs[0].shape + + +@pytest.mark.parametrize( + "func,tensor_cls", + [ + ("ellipkinc", 
TensorEllipkinc), + ("ellipeinc", TensorEllipeinc), + pytest.param( + "elliprc", + TensorElliprc, + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_binary_operand(func, tensor_cls): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + + r = mt_func(a, b) + expect = sp_func(raw1, raw2) + + assert r.shape == raw1.shape + assert r.dtype == expect.dtype + + tiled_a, r = tile(a, r) + + assert r.nsplits == tiled_a.nsplits + for chunk in r.chunks: + assert isinstance(chunk.op, tensor_cls) + assert chunk.index == chunk.inputs[0].index + assert chunk.shape == chunk.inputs[0].shape + + +@pytest.mark.parametrize( + "func,tensor_cls", + [ + pytest.param( + "elliprj", + TensorElliprj, + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_quadruple_operand(func, tensor_cls): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + raw3 = np.random.rand(4, 3, 2) + raw4 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + c = tensor(raw3, chunk_size=3) + d = tensor(raw4, chunk_size=3) + + r = mt_func(a, b, c, d) + expect = sp_func(raw1, raw2, raw3, raw4) + + assert r.shape == raw1.shape + assert r.dtype == expect.dtype + + tiled_a, r = tile(a, r) + + assert r.nsplits == tiled_a.nsplits + for chunk in r.chunks: + assert isinstance(chunk.op, tensor_cls) + assert chunk.index == chunk.inputs[0].index + assert chunk.shape == chunk.inputs[0].shape diff --git a/python/xorbits/_mars/tensor/special/tests/test_special_execution.py b/python/xorbits/_mars/tensor/special/tests/test_special_execution.py new file mode 100644 index 000000000..afb1fb274 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/tests/test_special_execution.py @@ -0,0 +1,325 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy +import scipy.sparse as sps +import scipy.special as spspecial + +from ....lib.version import parse as parse_version +from ... import special as mt_special +from ... 
import tensor + + +@pytest.mark.parametrize( + "func", + [ + "gamma", + "gammaln", + "loggamma", + "gammasgn", + "psi", + "rgamma", + "digamma", + "erf", + "erfc", + "erfcx", + "erfi", + "erfinv", + "erfcinv", + "wofz", + "dawsn", + "entr", + "ellipk", + "ellipkm1", + "ellipe", + ], +) +def test_unary_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw = np.random.rand(10, 8, 6) + a = tensor(raw, chunk_size=3) + + r = mt_func(a) + + result = r.execute().fetch() + expected = sp_func(raw) + + np.testing.assert_array_equal(result, expected) + + # test sparse + raw = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan])) + a = tensor(raw, chunk_size=3) + + r = mt_func(a) + + result = r.execute().fetch() + + data = sp_func(raw.data) + expected = sps.csr_matrix((data, raw.indices, raw.indptr), raw.shape) + + np.testing.assert_array_equal(result.toarray(), expected.toarray()) + + +@pytest.mark.parametrize( + "func", + [ + "gammainc", + "gammaincinv", + "gammaincc", + "gammainccinv", + "beta", + "betaln", + "polygamma", + "poch", + "rel_entr", + "kl_div", + "xlogy", + "jv", + "jve", + "yn", + "yv", + "yve", + "kn", + "kv", + "kve", + "iv", + "ive", + "hankel1", + "hankel1e", + "hankel2", + "hankel2e", + "hyp0f1", + "ellipkinc", + "ellipeinc", + pytest.param( + "elliprc", + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_binary_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + + r = mt_func(a, b) + + result = r.execute().fetch() + expected = sp_func(raw1, raw2) + + np.testing.assert_array_equal(result, expected) + + # test sparse + raw1 = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan] * 3).reshape(4, 3)) + a = tensor(raw1, chunk_size=3) + raw2 = np.random.rand(4, 3) + b = tensor(raw2, chunk_size=3) + + r = mt_func(a, b) + + result = r.execute().fetch() + + expected = sp_func(raw1.toarray(), raw2) + np.testing.assert_array_equal(result.toarray(), expected) + + +@pytest.mark.parametrize( + "func", + [ + "betainc", + "betaincinv", + "hyp1f1", + "hyperu", + "voigt_profile", + pytest.param( + "elliprd", + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + pytest.param( + "elliprf", + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + pytest.param( + "elliprg", + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_triple_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + raw3 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + c = tensor(raw3, chunk_size=3) + + r = mt_func(a, b, c) + + result = r.execute().fetch() + expected = sp_func(raw1, raw2, raw3) + + np.testing.assert_array_equal(result, expected) + + # test sparse + raw1 = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan] * 3).reshape(4, 3)) + a = tensor(raw1, chunk_size=3) + raw2 = np.random.rand(4, 3) + b = tensor(raw2, chunk_size=3) + raw3 = 
np.random.rand(4, 3) + c = tensor(raw3, chunk_size=3) + + r = mt_func(a, b, c) + + result = r.execute().fetch() + + expected = sp_func(raw1.toarray(), raw2, raw3) + np.testing.assert_array_equal(result.toarray(), expected) + + +@pytest.mark.parametrize( + "func", + [ + "hyp2f1", + "ellip_normal", + pytest.param( + "elliprj", + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_quadruple_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + raw3 = np.random.rand(4, 3, 2) + raw4 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + c = tensor(raw3, chunk_size=3) + d = tensor(raw4, chunk_size=3) + + r = mt_func(a, b, c, d) + + result = r.execute().fetch() + expected = sp_func(raw1, raw2, raw3, raw4) + + np.testing.assert_array_equal(result, expected) + + # test sparse + raw1 = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan] * 3).reshape(4, 3)) + a = tensor(raw1, chunk_size=3) + raw2 = np.random.rand(4, 3) + b = tensor(raw2, chunk_size=3) + raw3 = np.random.rand(4, 3) + c = tensor(raw3, chunk_size=3) + raw4 = np.random.rand(4, 3) + d = tensor(raw4, chunk_size=3) + + r = mt_func(a, b, c, d) + + result = r.execute().fetch() + + expected = sp_func(raw1.toarray(), raw2, raw3, raw4) + np.testing.assert_array_equal(result.toarray(), expected) + + +@pytest.mark.parametrize("func", ["ellip_harm", "ellip_harm_2"]) +def test_quintuple_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + raw3 = np.random.rand(4, 3, 2) + raw4 = np.random.rand(4, 3, 2) + raw5 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + c = tensor(raw3, chunk_size=3) + d = tensor(raw4, chunk_size=3) + e = tensor(raw5, chunk_size=3) + + r = mt_func(a, b, c, d, e) + + result = r.execute().fetch() + expected = sp_func(raw1, raw2, raw3, raw4, raw5) + + np.testing.assert_array_equal(result, expected) + + # test sparse + raw1 = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan] * 3).reshape(4, 3)) + a = tensor(raw1, chunk_size=3) + raw2 = np.random.rand(4, 3) + b = tensor(raw2, chunk_size=3) + raw3 = np.random.rand(4, 3) + c = tensor(raw3, chunk_size=3) + raw4 = np.random.rand(4, 3) + d = tensor(raw4, chunk_size=3) + raw5 = np.random.rand(4, 3) + e = tensor(raw5, chunk_size=3) + + r = mt_func(a, b, c, d, e) + + result = r.execute().fetch() + + expected = sp_func(raw1.toarray(), raw2, raw3, raw4, raw5) + np.testing.assert_array_equal(result.toarray(), expected) + + +@pytest.mark.parametrize( + "func", + ["fresnel", "modfresnelp", "modfresnelm", "airy", "airye", "itairy"], +) +def test_unary_tuple_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw = np.random.rand(10, 8, 6) + a = tensor(raw, chunk_size=3) + + r = mt_func(a) + + result = r.execute().fetch() + expected = sp_func(raw) + + for actual_output, expected_output in zip(result, expected): + np.testing.assert_array_equal(actual_output, expected_output) diff --git a/python/xorbits/_mars/tensor/statistics/__init__.py b/python/xorbits/_mars/tensor/statistics/__init__.py new file mode 100644 index 000000000..950de82c9 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/__init__.py @@ -0,0 +1,40 @@ +# Copyright 
2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .average import average +from .bincount import bincount +from .corrcoef import corrcoef +from .cov import cov +from .digitize import TensorDigitize, digitize +from .histogram import ( + TensorHistogram, + TensorHistogramBinEdges, + histogram, + histogram_bin_edges, +) +from .median import median +from .percentile import percentile +from .ptp import ptp +from .quantile import quantile + + +def _install(): + from ..core import Tensor, TensorData + + for cls in (Tensor, TensorData): + setattr(cls, "ptp", ptp) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/statistics/average.py b/python/xorbits/_mars/tensor/statistics/average.py new file mode 100644 index 000000000..b4700d8cb --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/average.py @@ -0,0 +1,143 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..base.broadcast_to import broadcast_to +from ..base.swapaxes import swapaxes +from ..datasource import tensor as astensor + + +def average(a, axis=None, weights=None, returned=False): + """ + Compute the weighted average along the specified axis. + + Parameters + ---------- + a : array_like + Tensor containing data to be averaged. If `a` is not a tensor, a + conversion is attempted. + axis : None or int or tuple of ints, optional + Axis or axes along which to average `a`. The default, + axis=None, will average over all of the elements of the input tensor. + If axis is negative it counts from the last to the first axis. + + If axis is a tuple of ints, averaging is performed on all of the axes + specified in the tuple instead of a single axis or all the axes as + before. + weights : array_like, optional + A tensor of weights associated with the values in `a`. Each value in + `a` contributes to the average according to its associated weight. + The weights tensor can either be 1-D (in which case its length must be + the size of `a` along the given axis) or of the same shape as `a`. + If `weights=None`, then all data in `a` are assumed to have a + weight equal to one. + returned : bool, optional + Default is `False`. If `True`, the tuple (`average`, `sum_of_weights`) + is returned, otherwise only the average is returned. + If `weights=None`, `sum_of_weights` is equivalent to the number of + elements over which the average is taken. 
+ + + Returns + ------- + average, [sum_of_weights] : tensor_type or double + Return the average along the specified axis. When returned is `True`, + return a tuple with the average as the first element and the sum + of the weights as the second element. The return type is `Float` + if `a` is of integer type, otherwise it is of the same type as `a`. + `sum_of_weights` is of the same type as `average`. + + Raises + ------ + ZeroDivisionError + When all weights along axis are zero. See `numpy.ma.average` for a + version robust to this type of error. + TypeError + When the length of 1D `weights` is not the same as the shape of `a` + along axis. + + See Also + -------- + mean + + Examples + -------- + >>> import mars.tensor as mt + + >>> data = list(range(1,5)) + >>> data + [1, 2, 3, 4] + >>> mt.average(data).execute() + 2.5 + >>> mt.average(range(1,11), weights=range(10,0,-1)).execute() + 4.0 + + >>> data = mt.arange(6).reshape((3,2)) + >>> data.execute() + array([[0, 1], + [2, 3], + [4, 5]]) + >>> mt.average(data, axis=1, weights=[1./4, 3./4]).execute() + array([ 0.75, 2.75, 4.75]) + >>> mt.average(data, weights=[1./4, 3./4]).execute() + Traceback (most recent call last): + ... + TypeError: Axis must be specified when shapes of a and weights differ. + + """ + from ..arithmetic import multiply, truediv + + a = astensor(a) + + if weights is None: + avg = a.mean(axis) + scl = avg.dtype.type(a.size / avg.size) + else: + wgt = astensor(weights) + + if issubclass(a.dtype.type, (np.integer, np.bool_)): + result_dtype = np.result_type(a.dtype, wgt.dtype, "f8") + else: + result_dtype = np.result_type(a.dtype, wgt.dtype) + + # sanity checks + if a.shape != wgt.shape: + if axis is None: + raise TypeError( + "Axis must be specified when shapes of a and weights differ." + ) + if wgt.ndim != 1: + raise TypeError( + "1D weights expected when shapes of a and weights differ." + ) + if wgt.shape[0] != a.shape[axis]: + raise ValueError( + "Length of weights not compatible with specified axis." + ) + + # setup wgt to broadcast along axis + wgt = broadcast_to(wgt, (a.ndim - 1) * (1,) + wgt.shape) + wgt = swapaxes(wgt, -1, axis) + + scl = wgt.sum(axis=axis, dtype=result_dtype) + with np.errstate(divide="raise"): + avg = truediv(multiply(a, wgt, dtype=result_dtype).sum(axis), scl) + + if returned: + if scl.shape != avg.shape: + scl = broadcast_to(scl, avg.shape) + return avg, scl + else: + return avg diff --git a/python/xorbits/_mars/tensor/statistics/bincount.py b/python/xorbits/_mars/tensor/statistics/bincount.py new file mode 100644 index 000000000..257fb24de --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/bincount.py @@ -0,0 +1,301 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import itertools +from typing import Optional + +import numpy as np +import pandas as pd + +from ... 
import get_context, opcodes, options +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import Int64Field, ReferenceField +from ...utils import ceildiv, has_unknown_shape +from ..datasource import tensor as astensor +from ..operands import TensorMapReduceOperand, TensorOperandMixin + + +class TensorBinCount(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = opcodes.BINCOUNT + + weights = ReferenceField("weights", default=None) + minlength: Optional[int] = Int64Field("minlength", default=0) + chunk_size_limit: int = Int64Field("chunk_size_limit") + + chunk_count: Optional[int] = Int64Field("chunk_count") + tileable_right_bound: Optional[int] = Int64Field("tileable_right_bound") + + def __call__(self, x, weights=None): + inputs = [x] + self.weights = weights + dtype = np.dtype(np.int_) + if weights is not None: + inputs.append(weights) + dtype = weights.dtype + return self.new_tensor(inputs, dtype=dtype, shape=(np.nan,)) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if len(inputs) > 1: + self.weights = inputs[1] + + @classmethod + def _tile_single(cls, op: "TensorBinCount"): + out = op.outputs[0] + new_chunk_op = op.copy().reset_key() + chunk_inputs = [op.inputs[0].chunks[0]] + if op.weights is not None: + chunk_inputs.append(op.weights.chunks[0]) + new_chunk = new_chunk_op.new_chunk(chunk_inputs, index=(0,), **out.params) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, chunks=[new_chunk], nsplits=((np.nan,),), **out.params + ) + + @classmethod + def tile(cls, op: "TensorBinCount"): + from ...dataframe.operands import DataFrameShuffleProxy + from ...dataframe.utils import parse_index + + if has_unknown_shape(*op.inputs): + yield + + ctx = get_context() + a = op.inputs[0] + out = op.outputs[0] + + if op.weights is not None and a.shape != op.weights.shape: + raise ValueError("The weights and list don't have the same length.") + + input_max = yield from recursive_tile(a.max()) + yield input_max.chunks + [c for inp in op.inputs for c in inp.chunks] + [max_val] = ctx.get_chunks_result([input_max.chunks[0].key]) + tileable_right_bound = max(op.minlength, int(max_val) + 1) + + chunk_count = max(1, ceildiv(tileable_right_bound, op.chunk_size_limit)) + + if ( + len(op.inputs[0].chunks) == 1 + and (op.weights is None or len(op.weights.chunks) == 1) + and chunk_count == 1 + ): + return cls._tile_single(op) + + if op.weights is not None: + weights = yield from recursive_tile(op.weights.rechunk(a.nsplits)) + weights_chunks = weights.chunks + else: + weights_chunks = itertools.repeat(None) + + map_chunks = [] + for a_chunk, weights_chunk in zip(a.chunks, weights_chunks): + new_op = op.copy().reset_key() + new_op.chunk_count = chunk_count + new_op.tileable_right_bound = tileable_right_bound + new_op.stage = OperandStage.map + new_op._output_types = [OutputType.series] + + inputs = [a_chunk] + if weights_chunk is not None: + inputs.append(weights_chunk) + map_chunks.append( + new_op.new_chunk( + inputs, + dtype=out.dtype, + shape=(np.nan,), + index=a_chunk.index, + index_value=parse_index(pd.Index([0], dtype=np.int64), a_chunk.key), + ) + ) + + shuffle_op = DataFrameShuffleProxy(output_types=[OutputType.tensor]).new_chunk( + map_chunks, dtype=out.dtype, shape=() + ) + + reduce_chunks = [] + reduce_nsplits = [] + left_offset = 0 + for chunk_idx in range(chunk_count): + right_offset = min(tileable_right_bound, left_offset + op.chunk_size_limit) + + new_op = 
op.copy().reset_key() + new_op.stage = OperandStage.reduce + new_op.reducer_ordinal = chunk_idx + new_op.n_reducers = chunk_count + new_op.chunk_count = chunk_count + new_op.tileable_right_bound = tileable_right_bound + + reduce_chunks.append( + new_op.new_chunk( + [shuffle_op], + dtype=out.dtype, + shape=(right_offset - left_offset,), + index=(chunk_idx,), + ) + ) + reduce_nsplits.append(right_offset - left_offset) + left_offset = right_offset + + new_op = op.copy().reset_key() + params = out.params.copy() + params["shape"] = (tileable_right_bound,) + return new_op.new_tileables( + op.inputs, + chunks=reduce_chunks, + nsplits=(tuple(reduce_nsplits),), + **params, + ) + + @classmethod + def _execute_map(cls, ctx, op: "TensorBinCount"): + input_val = ctx[op.inputs[0].key] + if op.weights is not None: + weights_val = ctx[op.weights.key] + df = pd.DataFrame({"data": input_val, "weights": weights_val}) + res = df.groupby("data")["weights"].sum() + else: + res = pd.Series(input_val).groupby(input_val).count() + + if res.index.min() < 0: + raise ValueError("'list' argument must have no negative elements") + + left_bound = 0 + for target_idx in range(op.chunk_count): + right_bound = res.index.searchsorted( + (1 + target_idx) * op.chunk_size_limit, "left" + ) + sliced = res.iloc[left_bound:right_bound] + if len(sliced) > 0: + ctx[op.outputs[0].key, (target_idx,)] = sliced + else: + # ensure all mapper data are inserted context + ctx[op.outputs[0].key, (target_idx,)] = None + left_bound = right_bound + + @classmethod + def _execute_reduce(cls, ctx, op: "TensorBinCount"): + out = op.outputs[0] + input_list = list( + d for d in op.iter_mapper_data(ctx, skip_none=True) if d is not None + ) + left_bound = op.chunk_size_limit * out.index[0] + right_bound = min(left_bound + op.chunk_size_limit, op.tileable_right_bound) + if not input_list: + ctx[op.outputs[0].key] = np.zeros(right_bound - left_bound) + else: + res = functools.reduce( + lambda a, b: a.add(b, fill_value=0), input_list + ).astype(out.dtype) + res = res.reindex(pd.RangeIndex(left_bound, right_bound), fill_value=0) + ctx[op.outputs[0].key] = res.values + + @classmethod + def execute(cls, ctx, op: "TensorBinCount"): + if op.stage == OperandStage.map: + op._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + op._execute_reduce(ctx, op) + else: + input_val = ctx[op.inputs[0].key] + weights_val = ctx[op.weights.key] if op.weights is not None else None + ctx[op.outputs[0].key] = np.bincount( + input_val, weights=weights_val, minlength=op.minlength + ) + + +def bincount(x, weights=None, minlength=0, chunk_size_limit=None): + """ + Count number of occurrences of each value in array of non-negative ints. + + The number of bins (of size 1) is one larger than the largest value in + `x`. If `minlength` is specified, there will be at least this number + of bins in the output array (though it will be longer if necessary, + depending on the contents of `x`). + Each bin gives the number of occurrences of its index value in `x`. + If `weights` is specified the input array is weighted by it, i.e. if a + value ``n`` is found at position ``i``, ``out[n] += weight[i]`` instead + of ``out[n] += 1``. + + Parameters + ---------- + x : tensor or array_like, 1 dimension, nonnegative ints + Input array. + weights : tensor or array_like, optional + Weights, array of the same shape as `x`. + minlength : int, optional + A minimum number of bins for the output array. + + Returns + ------- + out : tensor of ints + The result of binning the input array. 
+ The length of `out` is equal to ``np.amax(x)+1``. + + Raises + ------ + ValueError + If the input is not 1-dimensional, or contains elements with negative + values, or if `minlength` is negative. + TypeError + If the type of the input is float or complex. + + See Also + -------- + histogram, digitize, unique + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.bincount(mt.arange(5)).execute() + array([1, 1, 1, 1, 1]) + >>> mt.bincount(mt.tensor([0, 1, 1, 3, 2, 1, 7])).execute() + array([1, 3, 1, 1, 0, 0, 0, 1]) + + The input array needs to be of integer dtype, otherwise a + TypeError is raised: + + >>> mt.bincount(mt.arange(5, dtype=float)).execute() + Traceback (most recent call last): + ....execute() + TypeError: Cannot cast array data from dtype('float64') to dtype('int64') + according to the rule 'safe' + + A possible use of ``bincount`` is to perform sums over + variable-size chunks of an array, using the ``weights`` keyword. + + >>> w = mt.array([0.3, 0.5, 0.2, 0.7, 1., -0.6]) # weights + >>> x = mt.array([0, 1, 1, 2, 2, 2]) + >>> mt.bincount(x, weights=w).execute() + array([ 0.3, 0.7, 1.1]) + """ + x = astensor(x) + weights = astensor(weights) if weights is not None else None + + if not np.issubdtype(x.dtype, np.int_): + raise TypeError(f"Cannot cast array data from {x.dtype} to {np.dtype(np.int_)}") + if x.ndim != 1: + raise ValueError("'x' must be 1 dimension") + if minlength < 0: + raise ValueError("'minlength' must not be negative") + + chunk_size_limit = ( + chunk_size_limit + if chunk_size_limit is not None + else options.bincount.chunk_size_limit + ) + op = TensorBinCount(minlength=minlength, chunk_size_limit=chunk_size_limit) + return op(x, weights=weights) diff --git a/python/xorbits/_mars/tensor/statistics/core.py b/python/xorbits/_mars/tensor/statistics/core.py new file mode 100644 index 000000000..d512ede9f --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/core.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import normalize_axis_tuple + + +def _ureduce(a, func, **kwargs): + """ + Internal Function. + Call `func` with `a` as first argument swapping the axes to use extended + axis on functions that don't support it natively. + + Returns result and a.shape with axis dims set to 1. + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to a tensor. + func : callable + Reduction function capable of receiving a single axis argument. + It is called with `a` as first argument followed by `kwargs`. + kwargs : keyword arguments + additional keyword arguments to pass to `func`. + + Returns + ------- + result : tuple + Result of func(a, **kwargs) and a.shape with axis dims set to 1 + which can be used to reshape the result to the same shape a ufunc with + keepdims=True would produce. 
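A plain-numpy illustration of the axis handling described above, assuming the same swap-then-reshape strategy that `_ureduce` uses; this is only a sketch, not the module's helper itself:

```python
# Kept axes are moved to the front, the reduced axes are folded into one
# trailing axis, and `keepdim` records the broadcastable output shape.
import numpy as np

a = np.arange(24).reshape(2, 3, 4)
axis = (0, 2)                                   # reduce over axes 0 and 2, keep axis 1

keep = sorted(set(range(a.ndim)) - set(axis))
b = a
for i, s in enumerate(keep):
    b = np.swapaxes(b, i, s)                    # kept axes first
b = b.reshape(b.shape[:len(keep)] + (-1,))      # merge all reduced axes into one

keepdim = tuple(1 if ax in axis else s for ax, s in enumerate(a.shape))
r = np.median(b, axis=-1)                       # reduction now needs only a single axis
assert np.allclose(r, np.median(a, axis=axis))
assert r.reshape(keepdim).shape == np.median(a, axis=axis, keepdims=True).shape
```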
+ + """ + axis = kwargs.get("axis", None) + if axis is not None: + keepdim = list(a.shape) + nd = a.ndim + axis = normalize_axis_tuple(axis, nd) + + for ax in axis: + keepdim[ax] = 1 + + if len(axis) == 1: + kwargs["axis"] = axis[0] + else: + keep = set(range(nd)) - set(axis) + nkeep = len(keep) + # swap axis that should not be reduced to front + for i, s in enumerate(sorted(keep)): + a = a.swapaxes(i, s) + # merge reduced axis + a = a.reshape(a.shape[:nkeep] + (-1,)) + kwargs["axis"] = -1 + keepdim = tuple(keepdim) + else: + keepdim = (1,) * a.ndim + + r = func(a, **kwargs) + return r, keepdim diff --git a/python/xorbits/_mars/tensor/statistics/corrcoef.py b/python/xorbits/_mars/tensor/statistics/corrcoef.py new file mode 100644 index 000000000..461fd204e --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/corrcoef.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .cov import cov + + +def corrcoef(x, y=None, rowvar=True): + r""" + Return Pearson product-moment correlation coefficients. + + Please refer to the documentation for `cov` for more detail. The + relationship between the correlation coefficient matrix, `R`, and the + covariance matrix, `C`, is + + .. math:: R_{ij} = \frac{ C_{ij} } { \sqrt{ C_{ii} * C_{jj} } } + + The values of `R` are between -1 and 1, inclusive. + + Parameters + ---------- + x : array_like + A 1-D or 2-D array containing multiple variables and observations. + Each row of `x` represents a variable, and each column a single + observation of all those variables. Also see `rowvar` below. + y : array_like, optional + An additional set of variables and observations. `y` has the same + shape as `x`. + rowvar : bool, optional + If `rowvar` is True (default), then each row represents a + variable, with observations in the columns. Otherwise, the relationship + is transposed: each column represents a variable, while the rows + contain observations. + + Returns + ------- + R : Tensor + The correlation coefficient matrix of the variables. + + See Also + -------- + cov : Covariance matrix + + Notes + ----- + Due to floating point rounding the resulting tensor may not be Hermitian, + the diagonal elements may not be 1, and the elements may not satisfy the + inequality abs(a) <= 1. The real and imaginary parts are clipped to the + interval [-1, 1] in an attempt to improve on that situation but is not + much help in the complex case. + + This function accepts but discards arguments `bias` and `ddof`. This is + for backwards compatibility with previous versions of this function. These + arguments had no effect on the return values of the function and can be + safely ignored in this and previous versions of numpy. 
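For concreteness, a plain-numpy rendering of the normalization described in these notes, including the clip that absorbs floating-point rounding; this is a sketch checked against `np.corrcoef`, not the tensor implementation itself:

```python
# R[i, j] = C[i, j] / sqrt(C[i, i] * C[j, j]), then clip to [-1, 1].
import numpy as np

x = np.array([[0.0, 1.0, 2.0], [2.0, 1.0, 0.0]])
c = np.cov(x)                      # covariance matrix C
d = np.sqrt(np.diag(c))            # sqrt(C_ii)
r = c / d[:, None] / d[None, :]    # divide by sqrt_d as a column, then as a row
r = np.clip(r, -1.0, 1.0)
assert np.allclose(r, np.corrcoef(x))
```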
+ + """ + from ..arithmetic import sqrt + from ..datasource import diag + + c = cov(x, y, rowvar) + if c.ndim == 0: + return c / c + d = diag(c) + d = d.reshape(d.shape[0], 1) + sqrt_d = sqrt(d) + return (c / sqrt_d) / sqrt_d.T diff --git a/python/xorbits/_mars/tensor/statistics/cov.py b/python/xorbits/_mars/tensor/statistics/cov.py new file mode 100644 index 000000000..7a838c095 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/cov.py @@ -0,0 +1,222 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import numpy as np + +from ..base.squeeze import squeeze +from ..base.where import where +from ..core import Tensor +from ..datasource import array +from ..datasource import tensor as astensor +from .average import average + + +def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None): + """ + Estimate a covariance matrix, given data and weights. + + Covariance indicates the level to which two variables vary together. + If we examine N-dimensional samples, :math:`X = [x_1, x_2, ... x_N]^T`, + then the covariance matrix element :math:`C_{ij}` is the covariance of + :math:`x_i` and :math:`x_j`. The element :math:`C_{ii}` is the variance + of :math:`x_i`. + + See the notes for an outline of the algorithm. + + Parameters + ---------- + m : array_like + A 1-D or 2-D array containing multiple variables and observations. + Each row of `m` represents a variable, and each column a single + observation of all those variables. Also see `rowvar` below. + y : array_like, optional + An additional set of variables and observations. `y` has the same form + as that of `m`. + rowvar : bool, optional + If `rowvar` is True (default), then each row represents a + variable, with observations in the columns. Otherwise, the relationship + is transposed: each column represents a variable, while the rows + contain observations. + bias : bool, optional + Default normalization (False) is by ``(N - 1)``, where ``N`` is the + number of observations given (unbiased estimate). If `bias` is True, + then normalization is by ``N``. These values can be overridden by using + the keyword ``ddof`` in numpy versions >= 1.5. + ddof : int, optional + If not ``None`` the default value implied by `bias` is overridden. + Note that ``ddof=1`` will return the unbiased estimate, even if both + `fweights` and `aweights` are specified, and ``ddof=0`` will return + the simple average. See the notes for the details. The default value + is ``None``. + fweights : array_like, int, optional + 1-D tensor of integer freguency weights; the number of times each + observation vector should be repeated. + aweights : array_like, optional + 1-D tensor of observation vector weights. These relative weights are + typically large for observations considered "important" and smaller for + observations considered less "important". If ``ddof=0`` the array of + weights can be used to assign probabilities to observation vectors. 
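The weighted normalization that `fweights` and `aweights` imply is spelled out in the Notes further below; as a sanity check, here is a hedged plain-numpy sketch of those steps compared against NumPy's own `cov` (the sample data and weights are arbitrary):

```python
# w = f * a, v1 = sum(w), v2 = sum(w * a); center by the weighted mean and
# normalize by v1 / (v1**2 - ddof * v2).
import numpy as np

m = np.array([[0.0, 1.0, 2.0, 3.0], [3.0, 1.0, 2.0, 0.0]])
f = np.array([1, 2, 1, 1])            # integer frequency weights
a = np.array([0.5, 1.0, 1.0, 2.0])    # observation (reliability) weights
ddof = 1

w = f * a
v1 = w.sum()
v2 = (w * a).sum()
centered = m - (m * w).sum(axis=1, keepdims=True) / v1
c = (centered * w) @ centered.T * v1 / (v1 ** 2 - ddof * v2)
assert np.allclose(c, np.cov(m, fweights=f, aweights=a, ddof=ddof))
```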
+ + Returns + ------- + out : Tensor + The covariance matrix of the variables. + + See Also + -------- + corrcoef : Normalized covariance matrix + + Notes + ----- + Assume that the observations are in the columns of the observation + array `m` and let ``f = fweights`` and ``a = aweights`` for brevity. The + steps to compute the weighted covariance are as follows:: + + >>> w = f * a + >>> v1 = mt.sum(w) + >>> v2 = mt.sum(w * a) + >>> m -= mt.sum(m * w, axis=1, keepdims=True) / v1 + >>> cov = mt.dot(m * w, m.T) * v1 / (v1**2 - ddof * v2) + + Note that when ``a == 1``, the normalization factor + ``v1 / (v1**2 - ddof * v2)`` goes over to ``1 / (np.sum(f) - ddof)`` + as it should. + + Examples + -------- + Consider two variables, :math:`x_0` and :math:`x_1`, which + correlate perfectly, but in opposite directions: + + >>> import mars.tensor as mt + + >>> x = mt.array([[0, 2], [1, 1], [2, 0]]).T + >>> x.execute() + array([[0, 1, 2], + [2, 1, 0]]) + + Note how :math:`x_0` increases while :math:`x_1` decreases. The covariance + matrix shows this clearly: + + >>> mt.cov(x).execute() + array([[ 1., -1.], + [-1., 1.]]) + + Note that element :math:`C_{0,1}`, which shows the correlation between + :math:`x_0` and :math:`x_1`, is negative. + + Further, note how `x` and `y` are combined: + + >>> x = [-2.1, -1, 4.3] + >>> y = [3, 1.1, 0.12] + >>> X = mt.stack((x, y), axis=0) + >>> print(mt.cov(X).execute()) + [[ 11.71 -4.286 ] + [ -4.286 2.14413333]] + >>> print(mt.cov(x, y).execute()) + [[ 11.71 -4.286 ] + [ -4.286 2.14413333]] + >>> print(mt.cov(x).execute()) + 11.71 + + """ + from ..linalg import dot + from ..merge import vstack + + if ddof is not None and ddof != int(ddof): + raise ValueError("ddof must be integer") + + m = astensor(m) + if m.ndim > 2: + raise ValueError("m has more than 2 dimensions") + + if y is None: + dtype = np.result_type(m.dtype, np.float64) + else: + y = astensor(y) + if y.ndim > 2: + raise ValueError("y has more than 2 dimensions") + dtype = np.result_type(m.dtype, y.dtype, np.float64) + + X = array(m, ndmin=2, dtype=dtype) + if not rowvar and X.shape[0] != 1: + X = X.T + if y is not None: + y = array(y, copy=False, ndmin=2, dtype=dtype) + if not rowvar and y.shape[0] != 1: + y = y.T + X = vstack((X, y)) + + if ddof is None: + if bias == 0: + ddof = 1 + else: + ddof = 0 + + # Get the product of frequencies and weights + w = None + if fweights is not None: + fweights = astensor(fweights, dtype=float) + if fweights.ndim > 1: + raise RuntimeError("cannot handle multidimensional fweights") + if fweights.shape[0] != X.shape[1]: + raise RuntimeError("incompatible numbers of samples and fweights") + if any(fweights < 0): + raise ValueError("fweights cannot be negative") + w = fweights + if aweights is not None: + aweights = astensor(aweights, dtype=float) + if aweights.ndim > 1: + raise RuntimeError("cannot handle multidimensional aweights") + if aweights.shape[0] != X.shape[1]: + raise RuntimeError("incompatible numbers of samples and aweights") + if any(aweights < 0): + raise ValueError("aweights cannot be negative") + if w is None: + w = aweights + else: + w *= aweights + + avg, w_sum = average(X, axis=1, weights=w, returned=True) + w_sum = w_sum[0] + + # Determine the normalization + if w is None: + fact = X.shape[1] - ddof + elif ddof == 0: + fact = w_sum + elif aweights is None: + fact = w_sum - ddof + else: + fact = w_sum - ddof * sum(w * aweights) / w_sum + + X -= avg[:, None] + if w is None: + X_T = X.T + else: + X_T = (X * w).T + c = dot(X, X_T.conj()) + if isinstance(fact, 
Tensor): + fact = where(fact <= 0, 0.0, fact) + fact = fact.astype(float) + else: + if fact <= 0: + warnings.warn( + "Degrees of freedom <= 0 for slice", RuntimeWarning, stacklevel=2 + ) + fact = 0.0 + fact = np.float64(fact) + c = c * (1.0 / fact) + return squeeze(c) diff --git a/python/xorbits/_mars/tensor/statistics/digitize.py b/python/xorbits/_mars/tensor/statistics/digitize.py new file mode 100644 index 000000000..39407a977 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/digitize.py @@ -0,0 +1,186 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...lib.sparse.core import get_array_module +from ...serialization.serializables import AnyField, BoolField, KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorDigitize(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.DIGITIZE + + _input = KeyField("input") + _bins = AnyField("bins") + _right = BoolField("right") + + def __init__(self, right=False, **kw): + super().__init__(_right=right, **kw) + + @property + def bins(self): + return self._bins + + @property + def right(self): + return self._right + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(inputs) > 1: + self._bins = self._inputs[1] + + def __call__(self, x, bins): + x = astensor(x) + inputs = [x] + if not isinstance(bins, Tensor): + bins = get_array_module(bins).asarray(bins) + self._bins = bins + else: + inputs.append(bins) + self.dtype = np.digitize( + [0], np.empty(1, dtype=bins.dtype), right=self._right + ).dtype + + return self.new_tensor(inputs, x.shape, order=TensorOrder.C_ORDER) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + in_tensor = op.input + bins = op.bins + if len(op.inputs) == 2: + # bins is TensorData + if has_unknown_shape(bins): + yield + bins = (yield from recursive_tile(bins.rechunk(tensor.shape))).chunks[0] + + out_chunks = [] + for c in in_tensor.chunks: + input_chunks = [c] + if len(op.inputs) == 2: + input_chunks.append(bins) + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + input_chunks, shape=c.shape, index=c.index, order=tensor.order + ) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=in_tensor.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + x = inputs[0] + if len(inputs) > 1: + bins = inputs[1] + else: + bins = op.bins + + with device(device_id): + ctx[op.outputs[0].key] = xp.digitize(x, bins=bins, right=op.right) + + +def digitize(x, bins, right=False): + """ + 
Return the indices of the bins to which each value in input tensor belongs. + + Each index ``i`` returned is such that ``bins[i-1] <= x < bins[i]`` if + `bins` is monotonically increasing, or ``bins[i-1] > x >= bins[i]`` if + `bins` is monotonically decreasing. If values in `x` are beyond the + bounds of `bins`, 0 or ``len(bins)`` is returned as appropriate. If right + is True, then the right bin is closed so that the index ``i`` is such + that ``bins[i-1] < x <= bins[i]`` or ``bins[i-1] >= x > bins[i]`` if `bins` + is monotonically increasing or decreasing, respectively. + + Parameters + ---------- + x : array_like + Input tensor to be binned. + bins : array_like + Array of bins. It has to be 1-dimensional and monotonic. + right : bool, optional + Indicating whether the intervals include the right or the left bin + edge. Default behavior is (right==False) indicating that the interval + does not include the right edge. The left bin end is open in this + case, i.e., bins[i-1] <= x < bins[i] is the default behavior for + monotonically increasing bins. + + Returns + ------- + out : Tensor of ints + Output tensor of indices, of same shape as `x`. + + Raises + ------ + ValueError + If `bins` is not monotonic. + TypeError + If the type of the input is complex. + + See Also + -------- + bincount, histogram, unique, searchsorted + + Notes + ----- + If values in `x` are such that they fall outside the bin range, + attempting to index `bins` with the indices that `digitize` returns + will result in an IndexError. + + `mt.digitize` is implemented in terms of `mt.searchsorted`. This means + that a binary search is used to bin the values, which scales much better + for larger number of bins than the previous linear search. It also removes + the requirement for the input array to be 1-dimensional. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([0.2, 6.4, 3.0, 1.6]) + >>> bins = mt.array([0.0, 1.0, 2.5, 4.0, 10.0]) + >>> inds = mt.digitize(x, bins) + >>> inds.execute() + array([1, 4, 3, 2]) + + >>> x = mt.array([1.2, 10.0, 12.4, 15.5, 20.]) + >>> bins = mt.array([0, 5, 10, 15, 20]) + >>> mt.digitize(x,bins,right=True).execute() + array([1, 2, 3, 4, 4]) + >>> mt.digitize(x,bins,right=False).execute() + array([1, 3, 3, 4, 5]) + """ + op = TensorDigitize(right=right) + return op(x, bins) diff --git a/python/xorbits/_mars/tensor/statistics/histogram.py b/python/xorbits/_mars/tensor/statistics/histogram.py new file mode 100644 index 000000000..b990c46b1 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/histogram.py @@ -0,0 +1,1006 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +import warnings + +import numpy as np + +from ... import opcodes as OperandDef +from ... 
import tensor as mt +from ...core import recursive_tile +from ...core.context import get_context +from ...serialization.serializables import AnyField, BoolField, KeyField, TupleField +from ...utils import has_unknown_shape +from ..arithmetic.utils import chunk_tree_add +from ..array_utils import as_same_device, device +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import is_asc_sorted + +# note: some logic of this file were adopted from `numpy/lib/histograms` + + +def _ptp(range_): + """Peak-to-peak value of x. + + This implementation avoids the problem of signed integer arrays having a + peak-to-peak value that cannot be represented with the array's data type. + This function returns an unsigned value for signed integer arrays. + """ + return _unsigned_subtract(*range_[::-1]) + + +class HistBinSelector: + def __init__(self, histogram_bin_edges_op, x, range, raw_range): + self._op = histogram_bin_edges_op + self._x = x + self._range = range + self._raw_range = raw_range + self._width = None + + def check(self): + # not checked before + width = self() + if width is None: + return + self._width = width = yield from recursive_tile(width) + yield [c.data for c in width.chunks] + + def __call__(self): + return + + def get_result(self): + ctx = get_context() + width = ctx.get_chunks_result([self._width.chunks[0].key])[0] + return width + + +class HistBinSqrtSelector(HistBinSelector): + """ + Square root histogram bin estimator. + + Bin width is inversely proportional to the data size. Used by many + programs for its simplicity. + """ + + def get_result(self): + return _ptp(self._raw_range) / np.sqrt(self._x.size) + + +class HistBinSturgesSelector(HistBinSelector): + """ + Sturges histogram bin estimator. + + A very simplistic estimator based on the assumption of normality of + the data. This estimator has poor performance for non-normal data, + which becomes especially obvious for large data sets. The estimate + depends only on size of the data. + """ + + def get_result(self): + return _ptp(self._raw_range) / (np.log2(self._x.size) + 1.0) + + +class HistBinRiceSelector(HistBinSelector): + """ + Rice histogram bin estimator. + + Another simple estimator with no normality assumption. It has better + performance for large data than Sturges, but tends to overestimate + the number of bins. The number of bins is proportional to the cube + root of data size (asymptotically optimal). The estimate depends + only on size of the data. + """ + + def get_result(self): + return _ptp(self._raw_range) / (2.0 * self._x.size ** (1.0 / 3)) + + +class HistBinScottSelector(HistBinSelector): + """ + Scott histogram bin estimator. + + The binwidth is proportional to the standard deviation of the data + and inversely proportional to the cube root of data size + (asymptotically optimal). + """ + + def __call__(self): + return (24.0 * np.pi**0.5 / self._x.size) ** (1.0 / 3.0) * mt.std(self._x) + + +class HistBinStoneSelector(HistBinSelector): + """ + Histogram bin estimator based on minimizing the estimated integrated squared error (ISE). + + The number of bins is chosen by minimizing the estimated ISE against the unknown true distribution. + The ISE is estimated using cross-validation and can be regarded as a generalization of Scott's rule. + https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule + + This paper by Stone appears to be the origination of this rule. 
+ http://digitalassets.lib.berkeley.edu/sdtr/ucb/text/34.pdf + """ + + def __call__(self): + n = self._x.size + ptp_x = _ptp(self._raw_range) + + if n <= 1 or ptp_x == 0: + return + + nbins_upper_bound = max(100, int(np.sqrt(n))) + candidates = [] + for nbins in range(1, nbins_upper_bound + 1): + hh = ptp_x / nbins + p_k = histogram(self._x, bins=nbins, range=self._range)[0] / n + candidate = (2 - (n + 1) * p_k.dot(p_k)) / hh + candidates.append(candidate) + nbins = mt.stack(candidates).argmin() + 1 + return ptp_x / nbins + + def get_result(self): + ptp_x = _ptp(self._raw_range) + if self._x.size <= 1 or ptp_x == 0: + return 0.0 + else: + return super().get_result() + + +class HistBinDoaneSelector(HistBinSelector): + """ + Doane's histogram bin estimator. + + Improved version of Sturges' formula which works better for + non-normal data. See + stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning + """ + + def __call__(self): + x = self._x + if x.size <= 2: + return + + sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3))) + sigma = mt.std(x) + g1 = mt.mean(((x - mt.mean(x)) / sigma) ** 3) + ret = _ptp(self._raw_range) / ( + 1.0 + np.log2(x.size) + mt.log2(1.0 + mt.absolute(g1) / sg1) + ) + return mt.where(sigma > 0.0, ret, 0.0) + + def get_result(self): + if self._x.size <= 2: + return 0.0 + else: + return super().get_result() + + +class HistBinFdSelector(HistBinSelector): + """ + The Freedman-Diaconis histogram bin estimator. + + The Freedman-Diaconis rule uses interquartile range (IQR) to + estimate binwidth. It is considered a variation of the Scott rule + with more robustness as the IQR is less affected by outliers than + the standard deviation. However, the IQR depends on fewer points + than the standard deviation, so it is less accurate, especially for + long tailed distributions. + + If the IQR is 0, this function returns 1 for the number of bins. + Binwidth is inversely proportional to the cube root of data size + (asymptotically optimal). + """ + + def __call__(self): + iqr = mt.subtract(*mt.percentile(self._x, [75, 25])) + return 2.0 * iqr * self._x.size ** (-1.0 / 3.0) + + +class HistBinAutoSelector(HistBinSelector): + """ + Histogram bin estimator that uses the minimum width of the + Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero + and the Sturges estimator if the FD bandwidth is 0. + + The FD estimator is usually the most robust method, but its width + estimate tends to be too large for small `x` and bad for data with limited + variance. The Sturges estimator is quite good for small (<1000) datasets + and is the default in the R language. This method gives good off the shelf + behaviour. + + If there is limited variance the IQR can be 0, which results in the + FD bin width being 0 too. This is not a valid bin width, so + ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal. + If the IQR is 0, it's unlikely any variance based estimators will be of + use, so we revert to the sturges estimator, which only uses the size of the + dataset in its calculation. 
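Assuming the formulas quoted in these docstrings, the 'fd', 'sturges' and 'auto' rules can be sketched directly with plain numpy; this only illustrates the bin widths they would propose for a bimodal sample, not the estimator classes above:

```python
import numpy as np

rng = np.random.RandomState(10)
x = np.concatenate([rng.normal(size=1000), rng.normal(loc=5, scale=2, size=1000)])
n = x.size
data_range = x.max() - x.min()

iqr = np.subtract(*np.percentile(x, [75, 25]))
fd_width = 2.0 * iqr * n ** (-1.0 / 3.0)             # Freedman-Diaconis
sturges_width = data_range / (np.log2(n) + 1.0)      # Sturges
auto_width = min(fd_width, sturges_width) if fd_width else sturges_width

for name, width in [("fd", fd_width), ("sturges", sturges_width), ("auto", auto_width)]:
    print(name, int(np.ceil(data_range / width)), "bins")
```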
+ """ + + def __init__(self, histogram_bin_edges_op, x, range, raw_range): + super().__init__(histogram_bin_edges_op, x, range, raw_range) + self._bin_fd = HistBinFdSelector(histogram_bin_edges_op, x, range, raw_range) + self._bin_sturges = HistBinSturgesSelector( + histogram_bin_edges_op, x, range, raw_range + ) + + def __call__(self): + return self._bin_fd() + + def get_result(self): + fd_bw = super().get_result() + sturges_bw = self._bin_sturges.get_result() + if fd_bw: + return min(fd_bw, sturges_bw) + else: + # limited variance, so we return a len dependent bw estimator + return sturges_bw + + +# Private dict initialized at module load time +_hist_bin_selectors = { + "stone": HistBinStoneSelector, + "auto": HistBinAutoSelector, + "doane": HistBinDoaneSelector, + "fd": HistBinFdSelector, + "rice": HistBinRiceSelector, + "scott": HistBinScottSelector, + "sqrt": HistBinSqrtSelector, + "sturges": HistBinSturgesSelector, +} + + +def _ravel_and_check_weights(a, weights): + """Check a and weights have matching shapes, and ravel both""" + a = astensor(a) + + # Ensure that the array is a "subtractable" dtype + if a.dtype == np.bool_: + warnings.warn( + f"Converting input from {a.dtype} to {np.uint8} for compatibility.", + RuntimeWarning, + stacklevel=3, + ) + a = a.astype(np.uint8) + + if weights is not None: + weights = astensor(weights) + if weights.shape != a.shape: + raise ValueError("weights should have the same shape as a.") + weights = weights.ravel() + a = a.ravel() + return a, weights + + +def _check_range(range): + first_edge, last_edge = range + if first_edge > last_edge: + raise ValueError("max must be larger than min in range parameter.") + if not (np.isfinite(first_edge) and np.isfinite(last_edge)): + raise ValueError(f"supplied range of [{first_edge}, {last_edge}] is not finite") + return first_edge, last_edge + + +def _get_outer_edges(a, range): + """ + Determine the outer bin edges to use, from either the data or the range + argument + """ + if range is not None: + first_edge, last_edge = _check_range(range) + else: + assert a.size == 0 + # handle empty arrays. Can't determine range, so use 0-1. 
+ first_edge, last_edge = 0, 1 + + # expand empty range to avoid divide by zero + if first_edge == last_edge: + first_edge = first_edge - 0.5 + last_edge = last_edge + 0.5 + + return first_edge, last_edge + + +def _unsigned_subtract(a, b): + """ + Subtract two values where a >= b, and produce an unsigned result + + This is needed when finding the difference between the upper and lower + bound of an int16 histogram + """ + # coerce to a single type + signed_to_unsigned = { + np.byte: np.ubyte, + np.short: np.ushort, + np.intc: np.uintc, + np.int_: np.uint, + np.longlong: np.ulonglong, + } + dt = np.result_type(a, b) + try: + dt = signed_to_unsigned[dt.type] + except KeyError: # pragma: no cover + return np.subtract(a, b, dtype=dt) + else: + # we know the inputs are integers, and we are deliberately casting + # signed to unsigned + return np.subtract(a, b, casting="unsafe", dtype=dt) + + +def _get_bin_edges(op, a, bins, range, weights): + # parse the overloaded bins argument + n_equal_bins = None + bin_edges = None + first_edge = None + last_edge = None + + if isinstance(bins, str): + # when `bins` is str, x.min() and x.max() + # will be calculated in advance + bin_name = bins + if a.size > 0: + assert range is not None + + raw_range = range + first_edge, last_edge = _get_outer_edges(a, range) + + if a.size == 0: + n_equal_bins = 1 + else: + # Do not call selectors on empty arrays + selector = _hist_bin_selectors[bin_name]( + op, a, (first_edge, last_edge), raw_range + ) + yield from selector.check() + width = selector.get_result() + if width: + n_equal_bins = int( + np.ceil(_unsigned_subtract(last_edge, first_edge) / width) + ) + else: + # Width can be zero for some estimators, e.g. FD when + # the IQR of the data is zero. + n_equal_bins = 1 + + elif mt.ndim(bins) == 0: + first_edge, last_edge = _get_outer_edges(a, range) + n_equal_bins = bins + + else: + # cannot be Tensor, must be calculated first + assert mt.ndim(bins) == 1 and not isinstance(bins, TENSOR_TYPE) + bin_edges = np.asarray(bins) + if not is_asc_sorted(bin_edges): + raise ValueError("`bins` must increase monotonically, when an array") + + if n_equal_bins is not None: + # numpy gh-10322 means that type resolution rules are dependent on array + # shapes. To avoid this causing problems, we pick a type now and stick + # with it throughout. 
+ bin_type = np.result_type(first_edge, last_edge, a) + if np.issubdtype(bin_type, np.integer): + bin_type = np.result_type(bin_type, float) + + # bin edges must be computed + bin_edges = mt.linspace( + first_edge, + last_edge, + n_equal_bins + 1, + endpoint=True, + dtype=bin_type, + gpu=op.gpu, + ) + return bin_edges, (first_edge, last_edge, n_equal_bins) + else: + return mt.tensor(bin_edges), None + + +class TensorHistogramBinEdges(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.HISTOGRAM_BIN_EDGES + + _input = KeyField("input") + _bins = AnyField("bins") + _range = TupleField("range") + _weights = KeyField("weights") + _uniform_bins = TupleField("uniform_bins") + + def __init__( + self, + input=None, + bins=None, + range=None, + weights=None, + **kw, + ): + super().__init__(_input=input, _bins=bins, _range=range, _weights=weights, **kw) + + @property + def input(self): + return self._input + + @property + def bins(self): + return self._bins + + @property + def range(self): + return self._range + + @property + def weights(self): + return self._weights + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + if isinstance(self._bins, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self._bins = next(inputs_iter) + if self._weights is not None: + self._weights = next(inputs_iter) + + def __call__(self, a, bins, range, weights): + if range is not None: + _check_range(range) + if isinstance(bins, str): + # string, 'auto', 'stone', ... + # shape is unknown + bin_name = bins + # if `bins` is a string for an automatic method, + # this will replace it with the number of bins calculated + if bin_name not in _hist_bin_selectors: + raise ValueError(f"{bin_name!r} is not a valid estimator for `bins`") + if weights is not None: + raise TypeError( + "Automated estimation of the number of " + "bins is not supported for weighted data" + ) + if isinstance(range, tuple) and len(range) == 2: + # if `bins` is a string, e.g. 
'auto', 'stone'..., + # and `range` provided as well, + # `a` should be trimmed first + first_edge, last_edge = _get_outer_edges(a, range) + a = a[(a >= first_edge) & (a <= last_edge)] + shape = (np.nan,) + elif mt.ndim(bins) == 0: + try: + n_equal_bins = operator.index(bins) + except TypeError: # pragma: no cover + raise TypeError("`bins` must be an integer, a string, or an array") + if n_equal_bins < 1: + raise ValueError("`bins` must be positive, when an integer") + shape = (bins + 1,) + elif mt.ndim(bins) == 1: + if not isinstance(bins, TENSOR_TYPE): + bins = np.asarray(bins) + if not is_asc_sorted(bins): + raise ValueError( + "`bins` must increase monotonically, when an array" + ) + shape = astensor(bins).shape + else: + raise ValueError("`bins` must be 1d, when an array") + + inputs = [a] + if isinstance(bins, TENSOR_TYPE): + inputs.append(bins) + if weights is not None: + inputs.append(weights) + + return self.new_tensor(inputs, shape=shape, order=TensorOrder.C_ORDER) + + @classmethod + def tile(cls, op): + ctx = get_context() + a = op.input + range_ = op.range + bins = op.bins + + if isinstance(bins, str): + if has_unknown_shape(a): + yield + if ( + (a.size > 0 or np.isnan(a.size)) + and (isinstance(bins, str) or mt.ndim(bins) == 0) + and not range_ + ): + input_min = a.min(keepdims=True) + input_max = a.max(keepdims=True) + input_min, input_max = yield from recursive_tile(input_min, input_max) + chunks = [input_min.chunks[0], input_max.chunks[0]] + yield chunks + a.chunks + range_results = ctx.get_chunks_result([c.key for c in chunks]) + # make sure returned bounds are valid + if all(x.size > 0 for x in range_results): + range_ = tuple(x[0] for x in range_results) + if isinstance(bins, TENSOR_TYPE): + # `bins` is a Tensor, needs to be calculated first + yield + bin_datas = ctx.get_chunks_result([c.key for c in bins.chunks]) + bins = np.concatenate(bin_datas) + else: + bins = op.bins + + bin_edges, _ = yield from _get_bin_edges(op, op.input, bins, range_, op.weights) + bin_edges = yield from recursive_tile(bin_edges) + return [bin_edges] + + +def histogram_bin_edges(a, bins=10, range=None, weights=None): + r""" + Function to calculate only the edges of the bins used by the `histogram` + function. + + Parameters + ---------- + a : array_like + Input data. The histogram is computed over the flattened tensor. + bins : int or sequence of scalars or str, optional + If `bins` is an int, it defines the number of equal-width + bins in the given range (10, by default). If `bins` is a + sequence, it defines the bin edges, including the rightmost + edge, allowing for non-uniform bin widths. + + If `bins` is a string from the list below, `histogram_bin_edges` will use + the method chosen to calculate the optimal bin width and + consequently the number of bins (see `Notes` for more detail on + the estimators) from the data that falls within the requested + range. While the bin width will be optimal for the actual data + in the range, the number of bins will be computed to fill the + entire range, including the empty portions. For visualisation, + using the 'auto' option is suggested. Weighted data is not + supported for automated bin size selection. + + 'auto' + Maximum of the 'sturges' and 'fd' estimators. Provides good + all around performance. + + 'fd' (Freedman Diaconis Estimator) + Robust (resilient to outliers) estimator that takes into + account data variability and data size. + + 'doane' + An improved version of Sturges' estimator that works better + with non-normal datasets. 
+ + 'scott' + Less robust estimator that that takes into account data + variability and data size. + + 'stone' + Estimator based on leave-one-out cross-validation estimate of + the integrated squared error. Can be regarded as a generalization + of Scott's rule. + + 'rice' + Estimator does not take variability into account, only data + size. Commonly overestimates number of bins required. + + 'sturges' + R's default method, only accounts for data size. Only + optimal for gaussian data and underestimates number of bins + for large non-gaussian datasets. + + 'sqrt' + Square root (of data size) estimator, used by Excel and + other programs for its speed and simplicity. + + range : (float, float), optional + The lower and upper range of the bins. If not provided, range + is simply ``(a.min(), a.max())``. Values outside the range are + ignored. The first element of the range must be less than or + equal to the second. `range` affects the automatic bin + computation as well. While bin width is computed to be optimal + based on the actual data within `range`, the bin count will fill + the entire range including portions containing no data. + + weights : array_like, optional + A tensor of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). This is currently not used by any of the bin estimators, + but may be in the future. + + Returns + ------- + bin_edges : tensor of dtype float + The edges to pass into `histogram` + + See Also + -------- + histogram + + Notes + ----- + The methods to estimate the optimal number of bins are well founded + in literature, and are inspired by the choices R provides for + histogram visualisation. Note that having the number of bins + proportional to :math:`n^{1/3}` is asymptotically optimal, which is + why it appears in most estimators. These are simply plug-in methods + that give good starting points for number of bins. In the equations + below, :math:`h` is the binwidth and :math:`n_h` is the number of + bins. All estimators that compute bin counts are recast to bin width + using the `ptp` of the data. The final bin count is obtained from + ``np.round(np.ceil(range / h))``. + + 'auto' (maximum of the 'sturges' and 'fd' estimators) + A compromise to get a good value. For small datasets the Sturges + value will usually be chosen, while larger datasets will usually + default to FD. Avoids the overly conservative behaviour of FD + and Sturges for small and large datasets respectively. + Switchover point is usually :math:`a.size \approx 1000`. + + 'fd' (Freedman Diaconis Estimator) + .. math:: h = 2 \frac{IQR}{n^{1/3}} + + The binwidth is proportional to the interquartile range (IQR) + and inversely proportional to cube root of a.size. Can be too + conservative for small datasets, but is quite good for large + datasets. The IQR is very robust to outliers. + + 'scott' + .. math:: h = \sigma \sqrt[3]{\frac{24 * \sqrt{\pi}}{n}} + + The binwidth is proportional to the standard deviation of the + data and inversely proportional to cube root of ``x.size``. Can + be too conservative for small datasets, but is quite good for + large datasets. The standard deviation is not very robust to + outliers. Values are very similar to the Freedman-Diaconis + estimator in the absence of outliers. + + 'rice' + .. math:: n_h = 2n^{1/3} + + The number of bins is only proportional to cube root of + ``a.size``. It tends to overestimate the number of bins and it + does not take into account data variability. 
+ + 'sturges' + .. math:: n_h = \log _{2}n+1 + + The number of bins is the base 2 log of ``a.size``. This + estimator assumes normality of data and is too conservative for + larger, non-normal datasets. This is the default method in R's + ``hist`` method. + + 'doane' + .. math:: n_h = 1 + \log_{2}(n) + + \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1}}) + + g_1 = mean[(\frac{x - \mu}{\sigma})^3] + + \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}} + + An improved version of Sturges' formula that produces better + estimates for non-normal datasets. This estimator attempts to + account for the skew of the data. + + 'sqrt' + .. math:: n_h = \sqrt n + + The simplest and fastest estimator. Only takes into account the + data size. + + Examples + -------- + >>> import mars.tensor as mt + >>> arr = mt.array([0, 0, 0, 1, 2, 3, 3, 4, 5]) + >>> mt.histogram_bin_edges(arr, bins='auto', range=(0, 1)).execute() + array([0. , 0.25, 0.5 , 0.75, 1. ]) + >>> mt.histogram_bin_edges(arr, bins=2).execute() + array([0. , 2.5, 5. ]) + + For consistency with histogram, a tensor of pre-computed bins is + passed through unmodified: + + >>> mt.histogram_bin_edges(arr, [1, 2]).execute() + array([1, 2]) + + This function allows one set of bins to be computed, and reused across + multiple histograms: + + >>> shared_bins = mt.histogram_bin_edges(arr, bins='auto') + >>> shared_bins.execute() + array([0., 1., 2., 3., 4., 5.]) + + >>> group_id = mt.array([0, 1, 1, 0, 1, 1, 0, 1, 1]) + >>> a = arr[group_id == 0] + >>> a.execute() + array([0, 1, 3]) + >>> hist_0, _ = mt.histogram(a, bins=shared_bins).execute() + >>> b = arr[group_id == 1] + >>> b.execute() + array([0, 0, 2, 3, 4, 5]) + >>> hist_1, _ = mt.histogram(b, bins=shared_bins).execute() + + >>> hist_0; hist_1 + array([1, 1, 0, 1, 0]) + array([2, 0, 1, 1, 2]) + + Which gives more easily comparable results than using separate bins for + each histogram: + + >>> hist_0, bins_0 = mt.histogram(a, bins='auto').execute() + >>> hist_1, bins_1 = mt.histogram(b, bins='auto').execute() + >>> hist_0; hist_1 + array([1, 1, 1]) + array([2, 1, 1, 2]) + >>> bins_0; bins_1 + array([0., 1., 2., 3.]) + array([0. , 1.25, 2.5 , 3.75, 5. 
]) + + """ + a, weights = _ravel_and_check_weights(a, weights) + op = TensorHistogramBinEdges( + input=a, bins=bins, range=range, weights=weights, dtype=a.dtype + ) + return op(a, bins, range, weights) + + +class TensorHistogram(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.HISTOGRAM + + _input = KeyField("input") + _bins = AnyField("bins") + _range = TupleField("range") + _weights = KeyField("weights") + _density = BoolField("density") + _ret_bins = BoolField("ret_bins") + + def __init__( + self, + input=None, + bins=None, + range=None, + weights=None, + density=None, + ret_bins=None, + **kw, + ): + super().__init__( + _input=input, + _bins=bins, + _range=range, + _weights=weights, + _density=density, + _ret_bins=ret_bins, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def bins(self): + return self._bins + + @property + def range(self): + return self._range + + @property + def weights(self): + return self._weights + + @property + def density(self): + return self._density + + @property + def ret_bins(self): + return self._ret_bins + + @property + def output_limit(self): + return 1 if not self._ret_bins else 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + if isinstance(self._bins, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self._bins = next(inputs_iter) + if self._weights is not None: + self._weights = next(inputs_iter) + + def __call__(self, a, bins, range, weights): + a, weights = _ravel_and_check_weights(a, weights) + histogram_bin_edges_op = TensorHistogramBinEdges( + input=a, bins=bins, range=range, weights=weights, dtype=np.dtype(np.float64) + ) + bins = self._bins = histogram_bin_edges_op(a, bins, range, weights) + + inputs = [histogram_bin_edges_op.input] + if isinstance(bins, TENSOR_TYPE): + inputs.append(bins) + # Histogram is an integer or a float array depending on the weights. 
+ if weights is None: + dtype = np.dtype(np.intp) + else: + inputs.append(weights) + dtype = weights.dtype + self.dtype = dtype + + hist = self.new_tensor( + inputs, shape=(bins.size - 1,), order=TensorOrder.C_ORDER + ) + return mt.ExecutableTuple([hist, bins]) + + @classmethod + def tile(cls, op): + bins = op.bins.rechunk(op.bins.shape) + shape = (bins.size - 1,) + out = op.outputs[0] + weights = None + if op.weights is not None: + # make input and weights have the same nsplits + weights = yield from recursive_tile(op.weights.rechunk(op.input.nsplits)) + + out_chunks = [] + for chunk in op.input.chunks: + chunk_op = op.copy().reset_key() + chunk_op._range = None + chunk_op._ret_bins = False + chunk_op._density = False + chunk_inputs = [chunk, bins.chunks[0]] + if weights is not None: + weights_chunk = weights.cix[chunk.index] + chunk_inputs.append(weights_chunk) + out_chunk = chunk_op.new_chunk( + chunk_inputs, shape=shape, index=chunk.index, order=out.order + ) + out_chunks.append(out_chunk) + + # merge chunks together + chunk = chunk_tree_add(out.dtype, out_chunks, (0,), shape) + new_op = op.copy() + n = new_op.new_tensor( + op.inputs, + shape=shape, + order=out.order, + chunks=[chunk], + nsplits=((shape[0],),), + ) + if op.density: + db = mt.array(mt.diff(bins), float) + hist = n / db / n.sum() + hist = yield from recursive_tile(hist) + return [hist] + else: + return [n] + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + a = inputs[0] + bins = inputs[1] if isinstance(op.bins, TENSOR_CHUNK_TYPE) else op.bins + weights = None + if op.weights is not None: + weights = inputs[-1] + with device(device_id): + hist, bin_edges = xp.histogram( + a, bins=bins, range=op.range, weights=weights, density=op.density + ) + ctx[op.outputs[0].key] = hist + if op.ret_bins: + ctx[op.outputs[1].key] = bin_edges + + +def histogram(a, bins=10, range=None, weights=None, density=None): + r""" + Compute the histogram of a set of data. + + Parameters + ---------- + a : array_like + Input data. The histogram is computed over the flattened tensor. + bins : int or sequence of scalars or str, optional + If `bins` is an int, it defines the number of equal-width + bins in the given range (10, by default). If `bins` is a + sequence, it defines a monotonically increasing tensor of bin edges, + including the rightmost edge, allowing for non-uniform bin widths. + + If `bins` is a string, it defines the method used to calculate the + optimal bin width, as defined by `histogram_bin_edges`. + + range : (float, float), optional + The lower and upper range of the bins. If not provided, range + is simply ``(a.min(), a.max())``. Values outside the range are + ignored. The first element of the range must be less than or + equal to the second. `range` affects the automatic bin + computation as well. While bin width is computed to be optimal + based on the actual data within `range`, the bin count will fill + the entire range including portions containing no data. + + weights : array_like, optional + A tensor of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). If `density` is True, the weights are + normalized, so that the integral of the density over the range + remains 1. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. 
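The tiling above relies on a simple property: with one shared set of bin edges, per-chunk counts can simply be added, and density normalization becomes a post-processing step on the merged counts. A plain-numpy sketch under a hypothetical 4-way chunking:

```python
import numpy as np

a = np.random.RandomState(0).normal(size=10_000)
edges = np.histogram_bin_edges(a, bins=10)

# per-chunk histograms against the shared edges, then summed together
counts = sum(np.histogram(chunk, bins=edges)[0] for chunk in np.array_split(a, 4))
assert (counts == np.histogram(a, bins=edges)[0]).all()

# density: divide by bin width and by the total count, as the tiled graph does
density = counts / np.diff(edges) / counts.sum()
assert np.allclose(density, np.histogram(a, bins=edges, density=True)[0])
```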
If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unity + width are chosen; it is not a probability *mass* function. + + Overrides the ``normed`` keyword if given. + + Returns + ------- + hist : tensor + The values of the histogram. See `density` and `weights` for a + description of the possible semantics. + bin_edges : tensor of dtype float + Return the bin edges ``(length(hist)+1)``. + + + See Also + -------- + histogramdd, bincount, searchsorted, digitize, histogram_bin_edges + + Notes + ----- + All but the last (righthand-most) bin is half-open. In other words, + if `bins` is:: + + [1, 2, 3, 4] + + then the first bin is ``[1, 2)`` (including 1, but excluding 2) and + the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which + *includes* 4. + + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.histogram([1, 2, 1], bins=[0, 1, 2, 3]).execute() + (array([0, 2, 1]), array([0, 1, 2, 3])) + >>> mt.histogram(mt.arange(4), bins=mt.arange(5), density=True).execute() + (array([0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) + >>> mt.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3]).execute() + (array([1, 4, 1]), array([0, 1, 2, 3])) + + >>> a = mt.arange(5) + >>> hist, bin_edges = mt.histogram(a, density=True) + >>> hist.execute() + array([0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) + >>> hist.sum().execute() + 2.4999999999999996 + >>> mt.sum(hist * mt.diff(bin_edges)).execute() + 1.0 + + Automated Bin Selection Methods example, using 2 peak random data + with 2000 points: + + >>> import matplotlib.pyplot as plt + >>> rng = mt.random.RandomState(10) # deterministic random data + >>> a = mt.hstack((rng.normal(size=1000), + ... rng.normal(loc=5, scale=2, size=1000))) + >>> _ = plt.hist(np.asarray(a), bins='auto') # arguments are passed to np.histogram + >>> plt.title("Histogram with 'auto' bins") + Text(0.5, 1.0, "Histogram with 'auto' bins") + >>> plt.show() + + """ + a, weights = _ravel_and_check_weights(a, weights) + op = TensorHistogram( + input=a, bins=bins, range=range, weights=weights, density=density + ) + return op(a, bins, range, weights) diff --git a/python/xorbits/_mars/tensor/statistics/median.py b/python/xorbits/_mars/tensor/statistics/median.py new file mode 100644 index 000000000..a2d6789c6 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/median.py @@ -0,0 +1,85 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .quantile import quantile + + +def median(a, axis=None, out=None, overwrite_input=False, keepdims=False): + """ + Compute the median along the specified axis. + + Returns the median of the tensor elements. + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to a tensor. + axis : {int, sequence of int, None}, optional + Axis or axes along which the medians are computed. 
The default + is to compute the median along a flattened version of the tensor. + A sequence of axes is supported since version 1.9.0. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output, + but the type (of the output) will be cast if necessary. + overwrite_input : bool, optional + Just for compatibility with Numpy, would not take effect. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `arr`. + + Returns + ------- + median : Tensor + A new tensor holding the result. If the input contains integers + or floats smaller than ``float64``, then the output data-type is + ``np.float64``. Otherwise, the data-type of the output is the + same as that of the input. If `out` is specified, that tensor is + returned instead. + + See Also + -------- + mean, percentile + + Notes + ----- + Given a vector ``V`` of length ``N``, the median of ``V`` is the + middle value of a sorted copy of ``V``, ``V_sorted`` - i + e., ``V_sorted[(N-1)/2]``, when ``N`` is odd, and the average of the + two middle values of ``V_sorted`` when ``N`` is even. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[10, 7, 4], [3, 2, 1]]) + >>> a.execute() + array([[10, 7, 4], + [ 3, 2, 1]]) + >>> mt.median(a).execute() + 3.5 + >>> mt.median(a, axis=0).execute() + array([6.5, 4.5, 2.5]) + >>> mt.median(a, axis=1).execute() + array([7., 2.]) + >>> m = mt.median(a, axis=0) + >>> out = mt.zeros_like(m) + >>> mt.median(a, axis=0, out=m).execute() + array([6.5, 4.5, 2.5]) + >>> m.execute() + array([6.5, 4.5, 2.5]) + """ + return quantile( + a, 0.5, axis=axis, out=out, overwrite_input=overwrite_input, keepdims=keepdims + ) diff --git a/python/xorbits/_mars/tensor/statistics/percentile.py b/python/xorbits/_mars/tensor/statistics/percentile.py new file mode 100644 index 000000000..aefcdf348 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/percentile.py @@ -0,0 +1,175 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...core import ENTITY_TYPE +from ..arithmetic import truediv +from .quantile import _quantile_is_valid, _quantile_unchecked + +q_error_msg = "Percentiles must be in the range [0, 100]" + + +def percentile( + a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False, +): + """ + Compute the q-th percentile of the data along the specified axis. + + Returns the q-th percentile(s) of the array elements. + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to a tensor. + q : array_like of float + Percentile or sequence of percentiles to compute, which must be between + 0 and 100 inclusive. + axis : {int, tuple of int, None}, optional + Axis or axes along which the percentiles are computed. 
The + default is to compute the percentile(s) along a flattened + version of the tensor. + out : ndarray, optional + Alternative output array in which to place the result. It must + have the same shape and buffer length as the expected output, + but the type (of the output) will be cast if necessary. + overwrite_input : bool, optional + Just for compatibility with Numpy, would not take effect. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to + use when the desired percentile lies between two data points + ``i < j``: + + * 'linear': ``i + (j - i) * fraction``, where ``fraction`` + is the fractional part of the index surrounded by ``i`` + and ``j``. + * 'lower': ``i``. + * 'higher': ``j``. + * 'nearest': ``i`` or ``j``, whichever is nearest. + * 'midpoint': ``(i + j) / 2``. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in + the result as dimensions with size one. With this option, the + result will broadcast correctly against the original array `a`. + + Returns + ------- + percentile : scalar or ndarray + If `q` is a single percentile and `axis=None`, then the result + is a scalar. If multiple percentiles are given, first axis of + the result corresponds to the percentiles. The other axes are + the axes that remain after the reduction of `a`. If the input + contains integers or floats smaller than ``float64``, the output + data-type is ``float64``. Otherwise, the output data-type is the + same as that of the input. If `out` is specified, that array is + returned instead. + + See Also + -------- + mean + median : equivalent to ``percentile(..., 50)`` + nanpercentile + quantile : equivalent to percentile, except with q in the range [0, 1]. + + Notes + ----- + Given a vector ``V`` of length ``N``, the q-th percentile of + ``V`` is the value ``q/100`` of the way from the minimum to the + maximum in a sorted copy of ``V``. The values and distances of + the two nearest neighbors as well as the `interpolation` parameter + will determine the percentile if the normalized ranking does not + match the location of ``q`` exactly. This function is the same as + the median if ``q=50``, the same as the minimum if ``q=0`` and the + same as the maximum if ``q=100``. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[10, 7, 4], [3, 2, 1]]) + >>> a.execute() + array([[10, 7, 4], + [ 3, 2, 1]]) + >>> mt.percentile(a, 50).execute() + 3.5 + >>> mt.percentile(a, 50, axis=0).execute() + array([6.5, 4.5, 2.5]) + >>> mt.percentile(a, 50, axis=1).execute() + array([7., 2.]) + >>> mt.percentile(a, 50, axis=1, keepdims=True).execute() + array([[7.], + [2.]]) + + >>> m = mt.percentile(a, 50, axis=0) + >>> out = mt.zeros_like(m) + >>> mt.percentile(a, 50, axis=0, out=out).execute() + array([6.5, 4.5, 2.5]) + >>> m.execute() + array([6.5, 4.5, 2.5]) + + The different types of interpolation can be visualized graphically: + + .. 
plot:: + + import matplotlib.pyplot as plt + import mars.tensor as mt + import numpy as np + + a = mt.arange(4) + p = mt.linspace(0, 100, 6001) + ax = plt.gca() + lines = [ + ('linear', None), + ('higher', '--'), + ('lower', '--'), + ('nearest', '-.'), + ('midpoint', '-.'), + ] + for interpolation, style in lines: + ax.plot( + np.asarray(p), np.asarray(mt.percentile(a, p, interpolation=interpolation)), + label=interpolation, linestyle=style) + ax.set( + title='Interpolation methods for list: ' + str(a), + xlabel='Percentile', + ylabel='List item returned', + yticks=np.asarray(a)) + ax.legend() + plt.show() + + """ + if not isinstance(q, ENTITY_TYPE): + q = np.asanyarray(q) + q = np.true_divide(q, 100) + # do check instantly if q is not a tensor + if not _quantile_is_valid(q): + raise ValueError(q_error_msg) + else: + q = truediv(q, 100) + + return _quantile_unchecked( + a, + q, + axis=axis, + out=out, + overwrite_input=overwrite_input, + interpolation=interpolation, + keepdims=keepdims, + q_error_msg=q_error_msg, + ) diff --git a/python/xorbits/_mars/tensor/statistics/ptp.py b/python/xorbits/_mars/tensor/statistics/ptp.py new file mode 100644 index 000000000..08484397b --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/ptp.py @@ -0,0 +1,88 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..base.ravel import ravel +from ..core import Tensor +from ..datasource import tensor as astensor +from ..utils import check_out_param, validate_axis + + +def ptp(a, axis=None, out=None, keepdims=None): + """ + Range of values (maximum - minimum) along an axis. + + The name of the function comes from the acronym for 'peak to peak'. + + Parameters + ---------- + a : array_like + Input values. + axis : int, optional + Axis along which to find the peaks. By default, flatten the + array. + out : array_like + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output, + but the type of the output values will be cast if necessary. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input array. + + If the default value is passed, then `keepdims` will not be + passed through to the `ptp` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-class' method does not implement `keepdims` any + exceptions will be raised. + + Returns + ------- + ptp : Tensor + A new tensor holding the result, unless `out` was + specified, in which case a reference to `out` is returned. 
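+ The result is computed lazily as ``a.max(axis=axis) - a.min(axis=axis)``.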
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(4).reshape((2,2)) + >>> x.execute() + array([[0, 1], + [2, 3]]) + + >>> mt.ptp(x, axis=0).execute() + array([2, 2]) + + >>> mt.ptp(x, axis=1).execute() + array([1, 1]) + + """ + a = astensor(a) + + if axis is None: + a = ravel(a) + else: + validate_axis(a.ndim, axis) + + t = a.max(axis=axis, keepdims=keepdims) - a.min(axis=axis, keepdims=keepdims) + + if out is not None: + if not isinstance(out, Tensor): + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + + check_out_param(out, t, "same_kind") + out.data = t.data + return out + + return t diff --git a/python/xorbits/_mars/tensor/statistics/quantile.py b/python/xorbits/_mars/tensor/statistics/quantile.py new file mode 100644 index 000000000..fa9f18f0b --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/quantile.py @@ -0,0 +1,566 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...core.context import get_context +from ...serialization.serializables import AnyField, BoolField, KeyField, StringField +from ...utils import has_unknown_shape +from ..arithmetic import add, isnan +from ..array_utils import as_same_device, device +from ..base import moveaxis, where +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, TensorOrder +from ..datasource import tensor as astensor +from ..indexing import take +from ..operands import TensorOperand, TensorOperandMixin +from ..reduction import any as tensor_any +from ..utils import check_out_param +from .core import _ureduce + + +def _quantile_is_valid(q): + # avoid expensive reductions, relevant for arrays with < O(1000) elements + if q.ndim == 1 and q.size < 10: + for i in range(q.size): + if q[i] < 0.0 or q[i] > 1.0: + return False + else: + # faster than any() + if np.count_nonzero(q < 0.0) or np.count_nonzero(q > 1.0): + return False + return True + + +def _quantile_ureduce_func( + a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False, +): + a = astensor(a) + out = astensor(out) if out is not None else None + + if q.ndim == 0: + # Do not allow 0-d arrays because following code fails for scalar + zerod = True + q = q[None] + else: + zerod = False + + # prepare a for partitioning + if overwrite_input: + if axis is None: + ap = a.ravel() + else: + ap = a + else: + if axis is None: + ap = a.flatten() + else: + ap = a.copy() + + if axis is None: + axis = 0 + + Nx = ap.shape[axis] + indices = q * (Nx - 1) + + # round fractional indices according to interpolation method + if interpolation == "lower": + indices = np.floor(indices).astype(np.intp) + elif interpolation == "higher": + indices = np.ceil(indices).astype(np.intp) + elif interpolation == "midpoint": + indices = 0.5 * (np.floor(indices) + np.ceil(indices)) + elif interpolation == "nearest": + indices = 
np.around(indices).astype(np.intp) + else: + assert interpolation == "linear" + # keep index as fraction and interpolate + + n = np.array(False, dtype=bool) # check for nan's flag + if indices.dtype == np.intp: # take the points along axis + # Check if the array contains any nan's + if np.issubdtype(a.dtype, np.inexact): + indices = np.concatenate((indices, [-1])) + + ap.partition(indices, axis=axis, need_align=True) + # ensure axis with q-th is first + ap = moveaxis(ap, axis, 0) + axis = 0 + + # Check if the array contains any nan's + if np.issubdtype(a.dtype, np.inexact): + indices = indices[:-1] + n = isnan(ap[-1:, ...]) + + if zerod: + indices = indices[0] + r = take(ap, indices, axis=axis, out=out) + + else: # weight the points above and below the indices + indices_below = np.floor(indices).astype(np.intp) + indices_above = indices_below + 1 + indices_above[indices_above > Nx - 1] = Nx - 1 + + # Check if the array contains any nan's + if np.issubdtype(a.dtype, np.inexact): + indices_above = np.concatenate((indices_above, [-1])) + + weights_above = indices - indices_below + weights_below = 1 - weights_above + + weights_shape = [1] * ap.ndim + weights_shape[axis] = len(indices) + weights_below.shape = weights_shape + weights_above.shape = weights_shape + + ap.partition( + np.concatenate((indices_below, indices_above)), axis=axis, need_align=True + ) + + # ensure axis with q-th is first + ap = moveaxis(ap, axis, 0) + weights_below = np.moveaxis(weights_below, axis, 0) + weights_above = np.moveaxis(weights_above, axis, 0) + axis = 0 + + # Check if the array contains any nan's + if np.issubdtype(a.dtype, np.inexact): + indices_above = indices_above[:-1] + n = isnan(ap[-1:, ...]) + + x1 = take(ap, indices_below, axis=axis) * weights_below + x2 = take(ap, indices_above, axis=axis) * weights_above + + # ensure axis with q-th is first + x1 = moveaxis(x1, axis, 0) + x2 = moveaxis(x2, axis, 0) + + if zerod: + x1 = x1.squeeze(0) + x2 = x2.squeeze(0) + + if out is not None: + r = add(x1, x2, out=out) + else: + r = add(x1, x2) + + if isinstance(n, TENSOR_TYPE): + if zerod: + if ap.ndim == 1: + r.data = where(tensor_any(n), a.dtype.type(np.nan), r).data + if out is not None: + out.data = r.data + else: + r[:] = where( + tensor_any(n), where(n.squeeze(0), a.dtype.type(np.nan), r), r + ) + else: + if r.ndim == 1: + r[:] = where(tensor_any(n), np.full(r.shape, a.dtype.type(np.nan)), r) + else: + r[:] = where( + tensor_any(n), + where(n.repeat(q.size, 0), a.dtype.type(np.nan), r), + r, + ) + + return r + + +q_error_msg = "Quantiles must be in the range [0, 1]" + + +class TensorQuantile(TensorOperand, TensorOperandMixin): + __slots__ = ("q_error_msg",) + _op_type_ = OperandDef.QUANTILE + + _a = KeyField("a") + _q = AnyField("q") + _axis = AnyField("axis") + _out = KeyField("out") + _overwrite_input = BoolField("overwrite_input") + _interpolation = StringField("interpolation") + _keepdims = BoolField("keepdims") + + def __init__( + self, + q=None, + axis=None, + out=None, + overwrite_input=None, + interpolation=None, + keepdims=None, + **kw, + ): + self.q_error_msg = kw.pop("q_error_msg", q_error_msg) + super().__init__( + _q=q, + _axis=axis, + _interpolation=interpolation, + _out=out, + _overwrite_input=overwrite_input, + _keepdims=keepdims, + **kw, + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a = self._inputs[0] + if isinstance(self._q, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self._q = self._inputs[1] + if isinstance(self._out, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self._out 
= self._inputs[-1] + + @property + def a(self): + return self._a + + @property + def q(self): + return self._q + + @property + def axis(self): + return self._axis + + @property + def out(self): + return self._out + + @property + def overwrite_input(self): + return self._overwrite_input + + @property + def interpolation(self): + return self._interpolation + + @property + def keepdims(self): + return self._keepdims + + def __call__(self, a, q=None, out=None): + shape = [self._q.size] if self._q.ndim > 0 else [] + if self._axis is None: + exclude_axes = set(range(a.ndim)) + elif isinstance(self._axis, tuple): + exclude_axes = set(self._axis) + else: + exclude_axes = {self._axis} + for ax, s in enumerate(a.shape): + if ax not in exclude_axes: + shape.append(s) + elif self._keepdims: + shape.append(1) + inputs = [a] if q is None else [a, q] + order = TensorOrder.C_ORDER + if out is not None: + inputs.append(out) + order = out.order + shape = out.shape + t = self.new_tensor(inputs, shape=tuple(shape), order=order) + if out is not None: + check_out_param(out, t, "same_kind") + out.data = t.data + return out + else: + return t + + @classmethod + def _tile(cls, op, q): + r, k = _ureduce( + op.a, + func=_quantile_ureduce_func, + q=q, + axis=op.axis, + out=op.out, + overwrite_input=op.overwrite_input, + interpolation=op.interpolation, + ) + if op.keepdims: + return r.reshape(q.shape + k) + else: + return r + + @classmethod + def _tile_one_chunk(cls, op, q): + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_op._q = q + chunk_inputs = [in_tensor.chunks[0]] + if op.out is not None: + chunk_inputs.append(op.out.chunks[0]) + chunk = chunk_op.new_chunk( + chunk_inputs, + shape=out_tensor.shape, + index=(0,) * out_tensor.ndim, + order=out_tensor.order, + ) + op = op.copy() + return op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + nsplits=tuple((s,) for s in out_tensor.shape), + chunks=[chunk], + ) + + @classmethod + def tile(cls, op): + if isinstance(op.q, TENSOR_TYPE): + ctx = get_context() + # get q's data + q_chunk_keys = [c.key for c in op.q.chunks] + try: + q_data = ctx.get_chunks_result(q_chunk_keys) + except KeyError: + # trigger execution of `q` + yield op.q.chunks + q_data = ctx.get_chunks_result(q_chunk_keys) + op._q = q = np.concatenate(q_data) + if not _quantile_is_valid(q): + raise ValueError(op.q_error_msg) + else: + if has_unknown_shape(*op.inputs): + yield + q = np.asarray(op.q) + + if len(op.a.chunks) == 1 and (op.out is None or len(op.out.chunks) == 1): + return cls._tile_one_chunk(op, q) + else: + tiled = yield from recursive_tile(cls._tile(op, q)) + return [tiled] + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + a = inputs[0] + out = inputs[-1].copy() if op.out is not None else None + + with device(device_id): + ctx[op.outputs[0].key] = xp.quantile( + a, + q=op.q, + axis=op.axis, + out=out, + interpolation=op.interpolation, + keepdims=op.keepdims, + ) + + +INTERPOLATION_TYPES = {"linear", "lower", "higher", "midpoint", "nearest"} + + +def _quantile_unchecked( + a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False, + q_error_msg=None, + handle_non_numeric=None, +): + a = astensor(a) + raw_dtype = a.dtype + need_view_back = False + if handle_non_numeric and not np.issubdtype(a.dtype, np.number): + # enable handle_non_numeric is often used + # to 
handle the datetime-like dtype + a = a.astype("i8") + need_view_back = True + if isinstance(q, ENTITY_TYPE): + q = astensor(q) + # do check in tile + q_input = q + else: + q_input = None + + if isinstance(axis, Iterable): + axis = tuple(axis) + + if q.ndim > 1: + raise ValueError("`q` should be a scalar or array of float") + + if out is not None and not isinstance(out, TENSOR_TYPE): + raise TypeError(f"`out` should be a tensor, got {type(out)}") + + if interpolation not in INTERPOLATION_TYPES: + raise ValueError( + "interpolation can only be 'linear', 'lower' " + "'higher', 'midpoint', or 'nearest'" + ) + + # infer dtype + q_tiny = np.random.rand(2 if q.size % 2 == 0 else 1).astype(q.dtype) + if handle_non_numeric and not np.issubdtype(a.dtype, np.number): + dtype = a.dtype + else: + dtype = np.quantile( + np.empty(1, dtype=a.dtype), q_tiny, interpolation=interpolation + ).dtype + op = TensorQuantile( + q=q, + axis=axis, + out=out, + overwrite_input=overwrite_input, + interpolation=interpolation, + keepdims=keepdims, + handle_non_numeric=handle_non_numeric, + q_error_msg=q_error_msg, + dtype=dtype, + gpu=a.op.gpu, + ) + ret = op(a, q=q_input, out=out) + if need_view_back: + ret = ret.astype(raw_dtype) + return ret + + +def quantile( + a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False, + **kw, +): + """ + Compute the q-th quantile of the data along the specified axis. + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to a tensor. + q : array_like of float + Quantile or sequence of quantiles to compute, which must be between + 0 and 1 inclusive. + axis : {int, tuple of int, None}, optional + Axis or axes along which the quantiles are computed. The + default is to compute the quantile(s) along a flattened + version of the tensor. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output, + but the type (of the output) will be cast if necessary. + overwrite_input : bool, optional + Just for compatibility with Numpy, would not take effect. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to + use when the desired quantile lies between two data points + ``i < j``: + + * linear: ``i + (j - i) * fraction``, where ``fraction`` + is the fractional part of the index surrounded by ``i`` + and ``j``. + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in + the result as dimensions with size one. With this option, the + result will broadcast correctly against the original tensor `a`. + + Returns + ------- + quantile : scalar or Tensor + If `q` is a single quantile and `axis=None`, then the result + is a scalar. If multiple quantiles are given, first axis of + the result corresponds to the quantiles. The other axes are + the axes that remain after the reduction of `a`. If the input + contains integers or floats smaller than ``float64``, the output + data-type is ``float64``. Otherwise, the output data-type is the + same as that of the input. If `out` is specified, that tensor is + returned instead. + + See Also + -------- + mean + percentile : equivalent to quantile, but with q in the range [0, 100]. 
+ median : equivalent to ``quantile(..., 0.5)`` + nanquantile + + Notes + ----- + Given a vector ``V`` of length ``N``, the q-th quantile of + ``V`` is the value ``q`` of the way from the minimum to the + maximum in a sorted copy of ``V``. The values and distances of + the two nearest neighbors as well as the `interpolation` parameter + will determine the quantile if the normalized ranking does not + match the location of ``q`` exactly. This function is the same as + the median if ``q=0.5``, the same as the minimum if ``q=0.0`` and the + same as the maximum if ``q=1.0``. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[10, 7, 4], [3, 2, 1]]) + >>> a.execute() + array([[10, 7, 4], + [ 3, 2, 1]]) + >>> mt.quantile(a, 0.5).execute() + 3.5 + >>> mt.quantile(a, 0.5, axis=0).execute() + array([6.5, 4.5, 2.5]) + >>> mt.quantile(a, 0.5, axis=1).execute() + array([7., 2.]) + >>> mt.quantile(a, 0.5, axis=1, keepdims=True).execute() + array([[7.], + [2.]]) + >>> m = mt.quantile(a, 0.5, axis=0) + >>> out = mt.zeros_like(m) + >>> mt.quantile(a, 0.5, axis=0, out=out).execute() + array([6.5, 4.5, 2.5]) + >>> m.execute() + array([6.5, 4.5, 2.5]) + """ + + handle_non_numeric = kw.pop("handle_non_numeric", None) + if len(kw) > 0: # pragma: no cover + raise TypeError( + f"quantile() got an unexpected keyword argument '{next(iter(kw))}'" + ) + + if not isinstance(q, ENTITY_TYPE): + q = np.asanyarray(q) + # do check instantly if q is not a tensor + if not _quantile_is_valid(q): + raise ValueError(q_error_msg) + + return _quantile_unchecked( + a, + q, + axis=axis, + out=out, + overwrite_input=overwrite_input, + interpolation=interpolation, + keepdims=keepdims, + handle_non_numeric=handle_non_numeric, + ) diff --git a/python/xorbits/_mars/tensor/statistics/tests/__init__.py b/python/xorbits/_mars/tensor/statistics/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/statistics/tests/test_statistics.py b/python/xorbits/_mars/tensor/statistics/tests/test_statistics.py new file mode 100644 index 000000000..aaf08cf4a --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/tests/test_statistics.py @@ -0,0 +1,159 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
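+
+# These tests only build the tensor graphs and check the inferred metadata
+# (shape, dtype, chunking) and argument validation; the execution-level
+# checks live in test_statistics_execution.py.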
+ +import numpy as np +import pytest + +from ....core import tile +from ...datasource import array, tensor +from .. import digitize, histogram_bin_edges, percentile, quantile +from ..quantile import INTERPOLATION_TYPES + + +def test_digitize(): + x = tensor(np.array([0.2, 6.4, 3.0, 1.6]), chunk_size=2) + bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0]) + inds = digitize(x, bins) + + assert inds.shape == (4,) + assert inds.dtype is not None + + inds = tile(inds) + + assert len(inds.chunks) == 2 + + +def test_histogram_bin_edges(): + a = array([0, 0, 0, 1, 2, 3, 3, 4, 5], chunk_size=3) + + with pytest.raises(ValueError): + histogram_bin_edges(a, bins="unknown") + + with pytest.raises(TypeError): + # bins is str, weights cannot be provided + histogram_bin_edges(a, bins="scott", weights=a) + + with pytest.raises(ValueError): + histogram_bin_edges(a, bins=-1) + + with pytest.raises(ValueError): + # not asc + histogram_bin_edges(a, bins=[3, 2, 1]) + + with pytest.raises(ValueError): + # bins cannot be 2d + histogram_bin_edges(a, bins=np.random.rand(2, 3)) + + with pytest.raises(ValueError): + histogram_bin_edges(a, range=(5, 0)) + + with pytest.raises(ValueError): + histogram_bin_edges(a, range=(np.nan, np.nan)) + + bins = histogram_bin_edges(a, bins=3, range=(0, 5)) + # if range specified, no error will occur + tile(bins) + + +def test_quantile(): + raw = np.random.rand(100) + q = np.random.rand(10) + + for dtype in [np.float32, np.int64, np.complex128]: + raw2 = raw.astype(dtype) + a = tensor(raw2, chunk_size=100) + + b = quantile(a, q, overwrite_input=True) + assert b.shape == (10,) + assert b.dtype == np.quantile(raw2, q).dtype + + b = tile(b) + assert len(b.chunks) == 1 + + raw = np.random.rand(20, 10) + q = np.random.rand(10) + + for dtype in [np.float32, np.int64, np.complex128]: + for axis in (None, 0, 1, [0, 1]): + for interpolation in INTERPOLATION_TYPES: + for keepdims in [True, False]: + raw2 = raw.astype(dtype) + a = tensor(raw2, chunk_size=(8, 6)) + + b = quantile( + a, q, axis=axis, interpolation=interpolation, keepdims=keepdims + ) + expected = np.quantile( + raw2, + q, + axis=axis, + interpolation=interpolation, + keepdims=keepdims, + ) + assert b.shape == expected.shape + assert b.dtype == expected.dtype + + a = tensor(raw, chunk_size=10) + b = quantile(a, q) + + b = tile(b) + assert b.shape == (10,) + + b = quantile(a, 0.3) + assert b.ndim == 0 + + raw2 = np.random.rand(3, 4, 5) + a2 = tensor(raw2, chunk_size=3) + b2 = quantile(a2, q, axis=(0, 2)) + expected = np.quantile(raw2, q, axis=(0, 2)) + assert b2.shape == expected.shape + + b2 = tile(b2) + assert b2.shape == expected.shape + + # q has to be 1-d + with pytest.raises(ValueError): + quantile(a, q.reshape(5, 2)) + + # wrong out type + with pytest.raises(TypeError): + quantile(a, q, out=2) + + # wrong q + with pytest.raises(ValueError): + q2 = q.copy() + q2[0] = 1.1 + quantile(a, q2) + + # wrong q, with size < 10 + with pytest.raises(ValueError): + q2 = np.random.rand(5) + q2[0] = 1.1 + quantile(a, q2) + + # wrong interpolation + with pytest.raises(ValueError): + quantile(a, q, interpolation="unknown") + + +def test_percentile(): + raw = np.random.rand(100) + q = [101] + + a = tensor(raw, chunk_size=100) + + with pytest.raises(ValueError) as cm: + percentile(a, q) + the_exception = cm.value.args[0] + assert "Percentiles" in the_exception diff --git a/python/xorbits/_mars/tensor/statistics/tests/test_statistics_execution.py b/python/xorbits/_mars/tensor/statistics/tests/test_statistics_execution.py new file mode 100644 index 
000000000..1290fbe7c --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/tests/test_statistics_execution.py @@ -0,0 +1,563 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from ....utils import ignore_warning +from ...base import sort +from ...datasource import arange, empty, tensor +from ...merge import stack +from ...reduction import all as tall +from .. import ( + average, + bincount, + corrcoef, + cov, + digitize, + histogram, + histogram_bin_edges, + median, + percentile, + ptp, + quantile, +) +from ..quantile import INTERPOLATION_TYPES + + +def test_average_execution(setup): + data = arange(1, 5, chunk_size=1) + t = average(data) + + res = t.execute().fetch() + expected = np.average(np.arange(1, 5)) + assert res == expected + + t = average(arange(1, 11, chunk_size=2), weights=arange(10, 0, -1, chunk_size=2)) + + res = t.execute().fetch() + expected = np.average(range(1, 11), weights=range(10, 0, -1)) + assert res == expected + + data = arange(6, chunk_size=2).reshape((3, 2)) + t = average(data, axis=1, weights=tensor([1.0 / 4, 3.0 / 4], chunk_size=2)) + + res = t.execute().fetch() + expected = np.average( + np.arange(6).reshape(3, 2), axis=1, weights=(1.0 / 4, 3.0 / 4) + ) + np.testing.assert_equal(res, expected) + + with pytest.raises(TypeError): + average(data, weights=tensor([1.0 / 4, 3.0 / 4], chunk_size=2)) + + +def test_cov_execution(setup): + data = np.array([[0, 2], [1, 1], [2, 0]]).T + x = tensor(data, chunk_size=1) + + t = cov(x) + + res = t.execute().fetch() + expected = np.cov(data) + np.testing.assert_equal(res, expected) + + data_x = [-2.1, -1, 4.3] + data_y = [3, 1.1, 0.12] + x = tensor(data_x, chunk_size=1) + y = tensor(data_y, chunk_size=1) + + X = stack((x, y), axis=0) + t = cov(x, y) + r = tall(t == cov(X)) + assert r.execute().fetch() + + +def test_corrcoef_execution(setup): + data_x = [-2.1, -1, 4.3] + data_y = [3, 1.1, 0.12] + x = tensor(data_x, chunk_size=1) + y = tensor(data_y, chunk_size=1) + + t = corrcoef(x, y) + + res = t.execute().fetch() + expected = np.corrcoef(data_x, data_y) + np.testing.assert_equal(res, expected) + + +def test_ptp_execution(setup): + x = arange(4, chunk_size=1).reshape(2, 2) + + t = ptp(x, axis=0) + + res = t.execute().fetch() + expected = np.ptp(np.arange(4).reshape(2, 2), axis=0) + np.testing.assert_equal(res, expected) + + t = ptp(x, axis=1) + + res = t.execute().fetch() + expected = np.ptp(np.arange(4).reshape(2, 2), axis=1) + np.testing.assert_equal(res, expected) + + t = ptp(x) + + res = t.execute().fetch() + expected = np.ptp(np.arange(4).reshape(2, 2)) + np.testing.assert_equal(res, expected) + + +def test_digitize_execution(setup): + data = np.array([0.2, 6.4, 3.0, 1.6]) + x = tensor(data, chunk_size=2) + bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0]) + inds = digitize(x, bins) + + res = inds.execute().fetch() + expected = np.digitize(data, bins) + np.testing.assert_equal(res, expected) + + b = tensor(bins, chunk_size=2) 
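+ # bins passed as a tensor should give the same result as the ndarray bins above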
+ inds = digitize(x, b) + + res = inds.execute().fetch() + expected = np.digitize(data, bins) + np.testing.assert_equal(res, expected) + + data = np.array([1.2, 10.0, 12.4, 15.5, 20.0]) + x = tensor(data, chunk_size=2) + bins = np.array([0, 5, 10, 15, 20]) + inds = digitize(x, bins, right=True) + + res = inds.execute().fetch() + expected = np.digitize(data, bins, right=True) + np.testing.assert_equal(res, expected) + + inds = digitize(x, bins, right=False) + + res = inds.execute().fetch() + expected = np.digitize(data, bins, right=False) + np.testing.assert_equal(res, expected) + + data = sps.random(10, 1, density=0.1) * 12 + x = tensor(data, chunk_size=2) + bins = np.array([1.0, 2.0, 2.5, 4.0, 10.0]) + inds = digitize(x, bins) + + res = inds.execute().fetch() + expected = np.digitize(data.toarray(), bins, right=False) + np.testing.assert_equal(res.toarray(), expected) + + +@ignore_warning +def test_histogram_bin_edges_execution(setup): + rs = np.random.RandomState(0) + + raw = rs.randint(10, size=(20,)) + a = tensor(raw, chunk_size=6) + + # range provided + for range_ in [(0, 10), (3, 11), (3, 7)]: + bin_edges = histogram_bin_edges(a, range=range_) + result = bin_edges.execute().fetch() + expected = np.histogram_bin_edges(raw, range=range_) + np.testing.assert_array_equal(result, expected) + + raw2 = rs.randint(10, size=(1,)) + b = tensor(raw2) + raw3 = rs.randint(10, size=(0,)) + c = tensor(raw3) + for t, r in [(a, raw), (b, raw2), (c, raw3), (sort(a), raw)]: + test_bins = [ + 10, + "stone", + "auto", + "doane", + "fd", + "rice", + "scott", + "sqrt", + "sturges", + ] + for bins in test_bins: + bin_edges = histogram_bin_edges(t, bins=bins) + result = bin_edges.execute().fetch() + expected = np.histogram_bin_edges(r, bins=bins) + np.testing.assert_array_equal(result, expected) + + test_bins = [[0, 4, 8], tensor([0, 4, 8], chunk_size=2)] + for bins in test_bins: + bin_edges = histogram_bin_edges(t, bins=bins) + result = bin_edges.execute().fetch() + expected = np.histogram_bin_edges(r, bins=[0, 4, 8]) + np.testing.assert_array_equal(result, expected) + + raw = np.arange(5) + a = tensor(raw, chunk_size=3) + bin_edges = histogram_bin_edges(a) + result = bin_edges.execute().fetch() + expected = np.histogram_bin_edges(raw) + assert bin_edges.shape == expected.shape + np.testing.assert_array_equal(result, expected) + + +@ignore_warning +def test_histogram_execution(setup): + rs = np.random.RandomState(0) + + raw = rs.randint(10, size=(20,)) + a = tensor(raw, chunk_size=6) + raw_weights = rs.random(20) + weights = tensor(raw_weights, chunk_size=8) + + # range provided + for range_ in [(0, 10), (3, 11), (3, 7)]: + bin_edges = histogram(a, range=range_)[0] + result = bin_edges.execute().fetch() + expected = np.histogram(raw, range=range_)[0] + np.testing.assert_array_equal(result, expected) + + for wt in (raw_weights, weights): + for density in (True, False): + bins = [1, 4, 6, 9] + bin_edges = histogram(a, bins=bins, weights=wt, density=density)[0] + result = bin_edges.execute().fetch() + expected = np.histogram( + raw, bins=bins, weights=raw_weights, density=density + )[0] + np.testing.assert_almost_equal(result, expected) + + raw2 = rs.randint(10, size=(1,)) + b = tensor(raw2) + raw3 = rs.randint(10, size=(0,)) + c = tensor(raw3) + for t, r in [(a, raw), (b, raw2), (c, raw3), (sort(a), raw)]: + for density in (True, False): + test_bins = [ + 10, + "stone", + "auto", + "doane", + "fd", + "rice", + "scott", + "sqrt", + "sturges", + ] + for bins in test_bins: + hist = histogram(t, bins=bins, 
density=density)[0] + result = hist.execute().fetch() + expected = np.histogram(r, bins=bins, density=density)[0] + np.testing.assert_array_equal(result, expected) + + test_bins = [[0, 4, 8], tensor([0, 4, 8], chunk_size=2)] + for bins in test_bins: + hist = histogram(t, bins=bins, density=density)[0] + result = hist.execute().fetch() + expected = np.histogram(r, bins=[0, 4, 8], density=density)[0] + np.testing.assert_array_equal(result, expected) + + # test unknown shape + raw4 = rs.rand(10) + d = tensor(raw4, chunk_size=6) + d = d[d < 0.9] + hist = histogram(d) + result = hist.execute().fetch()[0] + expected = np.histogram(raw4[raw4 < 0.9])[0] + np.testing.assert_array_equal(result, expected) + + raw5 = np.arange(3, 10) + e = arange(10, chunk_size=6) + e = e[e >= 3] + hist = histogram(e) + result = hist.execute().fetch()[0] + expected = np.histogram(raw5)[0] + np.testing.assert_array_equal(result, expected) + + +def test_quantile_execution(setup): + # test 1 chunk, 1-d + raw = np.random.rand(20) + a = tensor(raw, chunk_size=20) + + raw2 = raw.copy() + raw2[np.random.RandomState(0).randint(raw.size, size=3)] = np.nan + a2 = tensor(raw2, chunk_size=20) + + for q in [np.random.RandomState(0).rand(), np.random.RandomState(0).rand(5)]: + for interpolation in INTERPOLATION_TYPES: + for keepdims in [True, False]: + r = quantile(a, q, interpolation=interpolation, keepdims=keepdims) + + result = r.execute().fetch() + expected = np.quantile( + raw, q, interpolation=interpolation, keepdims=keepdims + ) + + np.testing.assert_array_equal(result, expected) + + r2 = quantile(a2, q, interpolation=interpolation, keepdims=keepdims) + + result = r2.execute().fetch() + expected = np.quantile( + raw2, q, interpolation=interpolation, keepdims=keepdims + ) + + np.testing.assert_array_equal(result, expected) + + # test 1 chunk, 2-d + raw = np.random.rand(20, 10) + a = tensor(raw, chunk_size=20) + + raw2 = raw.copy() + raw2.flat[np.random.RandomState(0).randint(raw.size, size=3)] = np.nan + a2 = tensor(raw2, chunk_size=20) + + for q in [np.random.RandomState(0).rand(), np.random.RandomState(0).rand(5)]: + for interpolation in INTERPOLATION_TYPES: + for keepdims in [True, False]: + for axis in [None, 0, 1]: + r = quantile( + a, q, axis=axis, interpolation=interpolation, keepdims=keepdims + ) + + result = r.execute().fetch() + expected = np.quantile( + raw, + q, + axis=axis, + interpolation=interpolation, + keepdims=keepdims, + ) + + np.testing.assert_array_equal(result, expected) + + r2 = quantile( + a2, q, axis=axis, interpolation=interpolation, keepdims=keepdims + ) + + result = r2.execute().fetch() + expected = np.quantile( + raw2, + q, + axis=axis, + interpolation=interpolation, + keepdims=keepdims, + ) + + np.testing.assert_array_equal(result, expected) + + # test multi chunks, 1-d + raw = np.random.rand(20) + a = tensor(raw, chunk_size=6) + + raw2 = raw.copy() + raw2[np.random.RandomState(0).randint(raw.size, size=3)] = np.nan + a2 = tensor(raw2, chunk_size=20) + + for q in [np.random.RandomState(0).rand(), np.random.RandomState(0).rand(5)]: + for interpolation in INTERPOLATION_TYPES: + for keepdims in [True, False]: + r = quantile(a, q, interpolation=interpolation, keepdims=keepdims) + + result = r.execute().fetch() + expected = np.quantile( + raw, q, interpolation=interpolation, keepdims=keepdims + ) + + np.testing.assert_almost_equal(result, expected) + + r2 = quantile(a2, q, interpolation=interpolation, keepdims=keepdims) + + result = r2.execute().fetch() + expected = np.quantile( + raw2, q, 
interpolation=interpolation, keepdims=keepdims + ) + + np.testing.assert_almost_equal(result, expected) + + # test multi chunk, 2-d + raw = np.random.rand(20, 10) + a = tensor(raw, chunk_size=(12, 6)) + + raw2 = raw.copy() + raw2.flat[np.random.RandomState(0).randint(raw.size, size=3)] = np.nan + a2 = tensor(raw2, chunk_size=(12, 6)) + + for q in [np.random.RandomState(0).rand(), np.random.RandomState(0).rand(5)]: + for interpolation in INTERPOLATION_TYPES: + for keepdims in [True, False]: + for axis in [None, 0, 1]: + r = quantile( + a, q, axis=axis, interpolation=interpolation, keepdims=keepdims + ) + + result = r.execute().fetch() + expected = np.quantile( + raw, + q, + axis=axis, + interpolation=interpolation, + keepdims=keepdims, + ) + + np.testing.assert_almost_equal(result, expected) + + r2 = quantile( + a2, q, axis=axis, interpolation=interpolation, keepdims=keepdims + ) + + result = r2.execute().fetch() + expected = np.quantile( + raw2, + q, + axis=axis, + interpolation=interpolation, + keepdims=keepdims, + ) + + np.testing.assert_almost_equal(result, expected) + + # test out, 1 chunk + raw = np.random.rand(20) + q = np.random.rand(11) + a = tensor(raw, chunk_size=20) + out = empty((5, 11)) + quantile(a, q, out=out) + + result = out.execute().fetch() + expected = np.quantile(raw, q, out=np.empty((5, 11))) + np.testing.assert_array_equal(result, expected) + + # test out, multi chunks + raw = np.random.rand(20) + q = np.random.rand(11) + a = tensor(raw, chunk_size=6) + out = empty((5, 11)) + quantile(a, q, out=out) + + result = out.execute().fetch() + expected = np.quantile(raw, q, out=np.empty((5, 11))) + np.testing.assert_almost_equal(result, expected) + + # test q which is a tensor + q_raw = np.random.RandomState(0).rand(5) + q = tensor(q_raw, chunk_size=6) + + r = quantile(a, q, axis=None) + + result = r.execute().fetch() + expected = np.quantile(raw, q_raw, axis=None) + + np.testing.assert_almost_equal(result, expected) + + with pytest.raises(ValueError): + q[0] = 1.1 + r = quantile(a, q, axis=None) + _ = r.execute() + + +def test_percentile_execution(setup): + raw = np.random.rand(20, 10) + q = np.random.RandomState(0).randint(100, size=11) + a = tensor(raw, chunk_size=7) + r = percentile(a, q) + + result = r.execute().fetch() + expected = np.percentile(raw, q) + np.testing.assert_almost_equal(result, expected) + + mq = tensor(q) + + r = percentile(a, mq) + result = r.execute().fetch() + + np.testing.assert_almost_equal(result, expected) + + +def test_median_execution(setup): + raw = np.random.rand(20, 10) + a = tensor(raw, chunk_size=7) + r = median(a) + + result = r.execute().fetch() + expected = np.median(raw) + + np.testing.assert_array_equal(result, expected) + + r = median(a, axis=1) + + result = r.execute().fetch() + expected = np.median(raw, axis=1) + + np.testing.assert_array_equal(result, expected) + + +def test_bincount_execution(setup): + rs = np.random.RandomState(0) + raw = rs.randint(0, 9, (100,)) + raw[raw == 3] = 0 + raw_weights = rs.rand(100) + + # test non-chunked + a = tensor(raw) + result = bincount(a).execute().fetch() + expected = np.bincount(raw) + np.testing.assert_array_equal(result, expected) + + weights = tensor(raw_weights) + result = bincount(a, weights=weights).execute().fetch() + expected = np.bincount(raw, weights=raw_weights) + np.testing.assert_array_equal(result, expected) + + # test chunked + a = tensor(raw, chunk_size=13) + result = bincount(a, chunk_size_limit=5).execute().fetch() + expected = np.bincount(raw) + 
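+ # even with chunking and a small chunk_size_limit, the result must match NumPy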
np.testing.assert_array_equal(result, expected) + + # test minlength + a = tensor(raw, chunk_size=13) + result = bincount(a, chunk_size_limit=5, minlength=15).execute().fetch() + expected = np.bincount(raw, minlength=15) + np.testing.assert_array_equal(result, expected) + + # test with gap + raw1 = np.concatenate([raw, [20]]) + a = tensor(raw1, chunk_size=13) + result = bincount(a, chunk_size_limit=5).execute().fetch() + expected = np.bincount(raw1) + np.testing.assert_array_equal(result, expected) + + # test with weights + a = tensor(raw, chunk_size=13) + weights = tensor(raw_weights, chunk_size=15) + result = bincount(a, chunk_size_limit=5, weights=weights).execute().fetch() + expected = np.bincount(raw, weights=raw_weights) + np.testing.assert_array_almost_equal(result, expected) + + # test errors + a = tensor(raw, chunk_size=13) + with pytest.raises(TypeError, match="cast array data"): + bincount(a.astype(float)).execute() + with pytest.raises(ValueError, match="1 dimension"): + bincount(np.array([[1, 2], [3, 4]])).execute() + with pytest.raises(ValueError, match="be negative"): + bincount(a, minlength=-1).execute() + with pytest.raises(ValueError, match="the same length"): + bincount([-1, 1, 2, 3], weights=[3, 4]).execute() + with pytest.raises(ValueError, match="negative elements"): + bincount(tensor([-1, 1, 2, 3], chunk_size=2)).execute() diff --git a/python/xorbits/_mars/tensor/stats/__init__.py b/python/xorbits/_mars/tensor/stats/__init__.py new file mode 100644 index 000000000..5906e2601 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .chisquare import chisquare +from .entropy import entropy +from .ks import ks_1samp, ks_2samp +from .power_divergence import power_divergence +from .rankdata import rankdata +from .ttest import ttest_1samp, ttest_ind, ttest_ind_from_stats, ttest_rel diff --git a/python/xorbits/_mars/tensor/stats/chisquare.py b/python/xorbits/_mars/tensor/stats/chisquare.py new file mode 100644 index 000000000..98d0c0bb5 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/chisquare.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .power_divergence import power_divergence + + +def chisquare(f_obs, f_exp=None, ddof=0, axis=0): + """ + Calculate a one-way chi-square test. + + The chi-square test tests the null hypothesis that the categorical data + has the given frequencies. 
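+ The statistic is ``sum((f_obs - f_exp) ** 2 / f_exp)``; this is
+ `power_divergence` with ``lambda_='pearson'``.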
+ + Parameters + ---------- + f_obs : array_like + Observed frequencies in each category. + f_exp : array_like, optional + Expected frequencies in each category. By default the categories are + assumed to be equally likely. + ddof : int, optional + "Delta degrees of freedom": adjustment to the degrees of freedom + for the p-value. The p-value is computed using a chi-squared + distribution with ``k - 1 - ddof`` degrees of freedom, where `k` + is the number of observed frequencies. The default value of `ddof` + is 0. + axis : int or None, optional + The axis of the broadcast result of `f_obs` and `f_exp` along which to + apply the test. If axis is None, all values in `f_obs` are treated + as a single data set. Default is 0. + + Returns + ------- + chisq : float or ndarray + The chi-squared test statistic. The value is a float if `axis` is + None or `f_obs` and `f_exp` are 1-D. + p : float or ndarray + The p-value of the test. The value is a float if `ddof` and the + return value `chisq` are scalars. + + See Also + -------- + scipy.stats.power_divergence + + Notes + ----- + This test is invalid when the observed or expected frequencies in each + category are too small. A typical rule is that all of the observed + and expected frequencies should be at least 5. + + The default degrees of freedom, k-1, are for the case when no parameters + of the distribution are estimated. If p parameters are estimated by + efficient maximum likelihood then the correct degrees of freedom are + k-1-p. If the parameters are estimated in a different way, then the + dof can be between k-1-p and k-1. However, it is also possible that + the asymptotic distribution is not chi-square, in which case this test + is not appropriate. + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 8. + https://web.archive.org/web/20171022032306/http://vassarstats.net:80/textbook/ch8pt1.html + .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test + + Examples + -------- + When just `f_obs` is given, it is assumed that the expected frequencies + are uniform and given by the mean of the observed frequencies. + + >>> import mars.tensor as mt + >>> from mars.tensor.stats import chisquare + >>> chisquare([16, 18, 16, 14, 12, 12]) + (2.0, 0.84914503608460956) + + With `f_exp` the expected frequencies can be given. + + >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8]).execute() + (3.5, 0.62338762774958223) + + When `f_obs` is 2-D, by default the test is applied to each column. + + >>> obs = mt.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T + >>> obs.shape + (6, 2) + >>> chisquare(obs).execute() + (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415])) + + By setting ``axis=None``, the test is applied to all data in the array, + which is equivalent to applying the test to the flattened array. + + >>> chisquare(obs, axis=None).execute() + (23.31034482758621, 0.015975692534127565) + >>> chisquare(obs.ravel()).execute() + (23.31034482758621, 0.015975692534127565) + + `ddof` is the change to make to the default degrees of freedom. + + >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1).execute() + (2.0, 0.73575888234288467) + + `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has + shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting + `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared + statistics, we use ``axis=1``: + + >>> chisquare([16, 18, 16, 14, 12, 12], + ... 
f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]], + ... axis=1).execute() + (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) + + """ + return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis, lambda_="pearson") diff --git a/python/xorbits/_mars/tensor/stats/entropy.py b/python/xorbits/_mars/tensor/stats/entropy.py new file mode 100644 index 000000000..fea0d2f27 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/entropy.py @@ -0,0 +1,47 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import log + +try: + from scipy.stats import entropy as sp_entropy +except ImportError: + sp_entropy = None + +from ... import tensor as mt +from ...tensor import special as mt_special +from ..core import TENSOR_TYPE +from ..datasource import tensor as astensor +from ..utils import implement_scipy + + +@implement_scipy(sp_entropy) +def entropy(pk, qk=None, base=None): + pk = astensor(pk) + pk = 1.0 * pk / mt.sum(pk, axis=0) + if qk is None: + vec = mt_special.entr(pk) + else: + qk = astensor(qk) + if len(qk) != len(pk): + raise ValueError("qk and pk must have same length.") + qk = 1.0 * qk / mt.sum(qk, axis=0) + vec = mt_special.rel_entr(pk, qk) + S = mt.sum(vec, axis=0) + if base is not None: + if isinstance(base, TENSOR_TYPE): + S /= mt.log(base) + else: + S /= log(base) + return S diff --git a/python/xorbits/_mars/tensor/stats/ks.py b/python/xorbits/_mars/tensor/stats/ks.py new file mode 100644 index 000000000..3ae58959b --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/ks.py @@ -0,0 +1,689 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings +from collections import namedtuple +from math import gcd +from typing import Callable, Tuple, Union + +import numpy as np +from scipy import special +from scipy.stats import distributions + +from ... import tensor as mt +from ...core import ExecutableTuple +from ...typing import TileableType + +KstestResult = namedtuple("KstestResult", ("statistic", "pvalue")) +Ks_2sampResult = KstestResult + + +def _compute_prob_inside_method(m, n, g, h): # pragma: no cover + """ + Count the proportion of paths that stay strictly inside two diagonal lines. + + Parameters + ---------- + m : integer + m > 0 + n : integer + n > 0 + g : integer + g is greatest common divisor of m and n + h : integer + 0 <= h <= lcm(m,n) + + Returns + ------- + p : float + The proportion of paths that stay inside the two lines. 
+ + + Count the integer lattice paths from (0, 0) to (m, n) which satisfy + |x/m - y/n| < h / lcm(m, n). + The paths make steps of size +1 in either positive x or positive y directions. + + We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk. + Hodges, J.L. Jr., + "The Significance Probability of the Smirnov Two-Sample Test," + Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. + + """ + # Probability is symmetrical in m, n. Computation below uses m >= n. + if m < n: + m, n = n, m + mg = m // g + ng = n // g + + # Count the integer lattice paths from (0, 0) to (m, n) which satisfy + # |nx/g - my/g| < h. + # Compute matrix A such that: + # A(x, 0) = A(0, y) = 1 + # A(x, y) = A(x, y-1) + A(x-1, y), for x,y>=1, except that + # A(x, y) = 0 if |x/m - y/n|>= h + # Probability is A(m, n)/binom(m+n, n) + # Optimizations exist for m==n, m==n*p. + # Only need to preserve a single column of A, and only a sliding window of it. + # minj keeps track of the slide. + minj, maxj = 0, min(int(np.ceil(h / mg)), n + 1) + curlen = maxj - minj + # Make a vector long enough to hold maximum window needed. + lenA = min(2 * maxj + 2, n + 1) + # This is an integer calculation, but the entries are essentially + # binomial coefficients, hence grow quickly. + # Scaling after each column is computed avoids dividing by a + # large binomial coefficient at the end, but is not sufficient to avoid + # the large dynamic range which appears during the calculation. + # Instead we rescale based on the magnitude of the right most term in + # the column and keep track of an exponent separately and apply + # it at the end of the calculation. Similarly when multiplying by + # the binomial coefficient + dtype = np.float64 + A = np.zeros(lenA, dtype=dtype) + # Initialize the first column + A[minj:maxj] = 1 + expnt = 0 + for i in range(1, m + 1): + # Generate the next column. + # First calculate the sliding window + lastminj, lastlen = minj, curlen + minj = max(int(np.floor((ng * i - h) / mg)) + 1, 0) + minj = min(minj, n) + maxj = min(int(np.ceil((ng * i + h) / mg)), n + 1) + if maxj <= minj: + return 0 + # Now fill in the values + A[0 : maxj - minj] = np.cumsum(A[minj - lastminj : maxj - lastminj]) + curlen = maxj - minj + if lastlen > curlen: + # Set some carried-over elements to 0 + A[maxj - minj : maxj - minj + (lastlen - curlen)] = 0 + # Rescale if the right most value is over 2**900 + val = A[maxj - minj - 1] + _, valexpt = math.frexp(val) + if valexpt > 900: + # Scaling to bring down to about 2**800 appears + # sufficient for sizes under 10000. + valexpt -= 800 + A = np.ldexp(A, -valexpt) + expnt += valexpt + + val = A[maxj - minj - 1] + # Now divide by the binomial (m+n)!/m!/n! + for i in range(1, n + 1): + val = (val * i) / (m + i) + _, valexpt = math.frexp(val) + if valexpt < -128: + val = np.ldexp(val, -valexpt) + expnt += valexpt + # Finally scale if needed. + return np.ldexp(val, expnt) + + +def _compute_prob_outside_square(n, h): # pragma: no cover + """ + Compute the proportion of paths that pass outside the two diagonal lines. + + Parameters + ---------- + n : integer + n > 0 + h : integer + 0 <= h <= n + + Returns + ------- + p : float + The proportion of paths that pass outside the lines x-y = +/-h. + + """ + # Compute Pr(D_{n,n} >= h/n) + # Prob = 2 * ( binom(2n, n-h) - binom(2n, n-2a) + binom(2n, n-3a) - ... ) / binom(2n, n) + # This formulation exhibits subtractive cancellation. 
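+ # (the binomial terms are large and nearly equal, so the alternating sum
+ # loses most of its significant digits in floating point)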
+ # Instead divide each term by binom(2n, n), then factor common terms + # and use a Horner-like algorithm + # P = 2 * A0 * (1 - A1*(1 - A2*(1 - A3*(1 - A4*(...))))) + + P = 0.0 + k = int(np.floor(n / h)) + while k >= 0: + p1 = 1.0 + # Each of the Ai terms has numerator and denominator with h simple terms. + for j in range(h): + p1 = (n - k * h - j) * p1 / (n + k * h + j + 1) + P = p1 * (1.0 - P) + k -= 1 + return 2 * P + + +def _count_paths_outside_method(m, n, g, h): # pragma: no cover + """ + Count the number of paths that pass outside the specified diagonal. + + Parameters + ---------- + m : integer + m > 0 + n : integer + n > 0 + g : integer + g is greatest common divisor of m and n + h : integer + 0 <= h <= lcm(m,n) + + Returns + ------- + p : float + The number of paths that go low. + The calculation may overflow - check for a finite answer. + + Raises + ------ + FloatingPointError: Raised if the intermediate computation goes outside + the range of a float. + + Notes + ----- + Count the integer lattice paths from (0, 0) to (m, n), which at some + point (x, y) along the path, satisfy: + m*y <= n*x - h*g + The paths make steps of size +1 in either positive x or positive y directions. + + We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk. + Hodges, J.L. Jr., + "The Significance Probability of the Smirnov Two-Sample Test," + Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. + + """ + # Compute #paths which stay lower than x/m-y/n = h/lcm(m,n) + # B(x, y) = #{paths from (0,0) to (x,y) without previously crossing the boundary} + # = binom(x, y) - #{paths which already reached the boundary} + # Multiply by the number of path extensions going from (x, y) to (m, n) + # Sum. + + # Probability is symmetrical in m, n. Computation below assumes m >= n. + if m < n: + m, n = n, m + mg = m // g + ng = n // g + + # Not every x needs to be considered. + # xj holds the list of x values to be checked. + # Wherever n*x/m + ng*h crosses an integer + lxj = n + (mg - h) // mg + xj = [(h + mg * j + ng - 1) // ng for j in range(lxj)] + # B is an array just holding a few values of B(x,y), the ones needed. + # B[j] == B(x_j, j) + if lxj == 0: + return np.round(special.binom(m + n, n)) + B = np.zeros(lxj) + B[0] = 1 + # Compute the B(x, y) terms + # The binomial coefficient is an integer, but special.binom() may return a float. + # Round it to the nearest integer. + for j in range(1, lxj): + Bj = np.round(special.binom(xj[j] + j, j)) + if not np.isfinite(Bj): + raise FloatingPointError() + for i in range(j): + bin = np.round( + special.binom(xj[j] - xj[i] + j - i, j - i) + ) # pylint: disable=redefined-builtin + Bj -= bin * B[i] + B[j] = Bj + if not np.isfinite(Bj): + raise FloatingPointError() + # Compute the number of path extensions... + num_paths = 0 + for j in range(lxj): + bin = np.round(special.binom((m - xj[j]) + (n - j), n - j)) + term = B[j] * bin + if not np.isfinite(term): + raise FloatingPointError() + num_paths += term + return np.round(num_paths) + + +def _attempt_exact_2kssamp(n1, n2, g, d, alternative): # pragma: no cover + """Attempts to compute the exact 2sample probability. 
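+ Returns ``success=False`` so the caller can fall back to the asymptotic
+ formula when the exact computation overflows or yields an invalid
+ probability.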
+ + n1, n2 are the sample sizes + g is the gcd(n1, n2) + d is the computed max difference in ECDFs + + Returns (success, d, probability) + """ + lcm = (n1 // g) * n2 + h = int(np.round(d * lcm)) + d = h * 1.0 / lcm + if h == 0: + return True, d, 1.0 + saw_fp_error, prob = False, np.nan + try: + if alternative == "two-sided": + if n1 == n2: + prob = _compute_prob_outside_square(n1, h) + else: + prob = 1 - _compute_prob_inside_method(n1, n2, g, h) + else: + if n1 == n2: + # prob = binom(2n, n-h) / binom(2n, n) + # Evaluating in that form incurs roundoff errors + # from special.binom. Instead calculate directly + jrange = np.arange(h) + prob = np.prod((n1 - jrange) / (n1 + jrange + 1.0)) + else: + num_paths = _count_paths_outside_method(n1, n2, g, h) + bin = special.binom(n1 + n2, n1) # pylint: disable=redefined-builtin + if ( + not np.isfinite(bin) + or not np.isfinite(num_paths) + or num_paths > bin + ): + saw_fp_error = True + else: + prob = num_paths / bin + + except FloatingPointError: + saw_fp_error = True + + if saw_fp_error: + return False, d, np.nan + if not (0 <= prob <= 1): + return False, d, prob + return True, d, prob + + +def _calc_prob_2samp(d, n1, n2, alternative, mode): # pragma: no cover + MAX_AUTO_N = 10000 # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N + + g = gcd(n1, n2) + n1g = n1 // g + n2g = n2 // g + prob = -mt.inf + original_mode = mode + if mode == "auto": + mode = "exact" if max(n1, n2) <= MAX_AUTO_N else "asymp" + elif mode == "exact": + # If lcm(n1, n2) is too big, switch from exact to asymp + if n1g >= np.iinfo(np.int_).max / n2g: + mode = "asymp" + warnings.warn( + f"Exact ks_2samp calculation not possible with samples sizes " + f"{n1} and {n2}. Switching to 'asymp'.", + RuntimeWarning, + ) + + if mode == "exact": + success, d, prob = _attempt_exact_2kssamp(n1, n2, g, d, alternative) + if not success: + mode = "asymp" + if original_mode == "exact": + warnings.warn( + f"ks_2samp: Exact calculation unsuccessful. " + f"Switching to mode={mode}.", + RuntimeWarning, + ) + + if mode == "asymp": + # The product n1*n2 is large. Use Smirnov's asymptotic formula. + # Ensure float to avoid overflow in multiplication + # sorted because the one-sided formula is not symmetric in n1, n2 + m, n = sorted([float(n1), float(n2)], reverse=True) + en = m * n / (m + n) + if alternative == "two-sided": + prob = distributions.kstwo.sf(d, np.round(en)) + else: + z = np.sqrt(en) * d + # Use Hodges' suggested approximation Eqn 5.3 + # Requires m to be the larger of (n1, n2) + expt = -2 * z**2 - 2 * z * (m + 2 * n) / np.sqrt(m * n * (m + n)) / 3.0 + prob = np.exp(expt) + + return np.clip(prob, 0, 1) + + +def _compute_dplus(cdfvals, n): + """Computes D+ as used in the Kolmogorov-Smirnov test. + + Parameters + ---------- + cdfvals: array_like + Sorted array of CDF values between 0 and 1 + + Returns + ------- + Maximum distance of the CDF values below Uniform(0, 1) + """ + return (mt.arange(1.0, n + 1) / n - cdfvals).max() + + +def _compute_dminus(cdfvals, n): + """Computes D- as used in the Kolmogorov-Smirnov test. + + Parameters + ---------- + cdfvals: array_like + Sorted array of CDF values between 0 and 1 + + Returns + ------- + Maximum distance of the CDF values above Uniform(0, 1) + """ + return (cdfvals - mt.arange(0.0, n) / n).max() + + +def ks_1samp( + x: Union[np.ndarray, list, TileableType], + cdf: Callable, + args: Tuple = (), + alternative: str = "two-sided", + mode: str = "auto", +): + """ + Performs the one-sample Kolmogorov-Smirnov test for goodness of fit. 
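+
+ The statistic is the maximum distance between the empirical distribution
+ function of `x` and ``cdf`` evaluated at the sample points (``D``, or the
+ one-sided ``D+`` / ``D-`` variants, depending on `alternative`).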
+ + This test compares the underlying distribution F(x) of a sample + against a given continuous distribution G(x). See Notes for a description + of the available null and alternative hypotheses. + + Parameters + ---------- + x : array_like + a 1-D array of observations of iid random variables. + cdf : callable + callable used to calculate the cdf. + args : tuple, sequence, optional + Distribution parameters, used with `cdf`. + alternative : {'two-sided', 'less', 'greater'}, optional + Defines the null and alternative hypotheses. Default is 'two-sided'. + Please see explanations in the Notes below. + mode : {'auto', 'exact', 'approx', 'asymp'}, optional + Defines the distribution used for calculating the p-value. + The following options are available (default is 'auto'): + + * 'auto' : selects one of the other options. + * 'exact' : uses the exact distribution of test statistic. + * 'approx' : approximates the two-sided probability with twice + the one-sided probability + * 'asymp': uses asymptotic distribution of test statistic + + Returns + ------- + statistic : float + KS test statistic, either D, D+ or D- (depending on the value + of 'alternative') + pvalue : float + One-tailed or two-tailed p-value. + + See Also + -------- + ks_2samp, kstest + + Notes + ----- + There are three options for the null and corresponding alternative + hypothesis that can be selected using the `alternative` parameter. + + - `two-sided`: The null hypothesis is that the two distributions are + identical, F(x)=G(x) for all x; the alternative is that they are not + identical. + + - `less`: The null hypothesis is that F(x) >= G(x) for all x; the + alternative is that F(x) < G(x) for at least one x. + + - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the + alternative is that F(x) > G(x) for at least one x. + + Note that the alternative hypotheses describe the *CDFs* of the + underlying distributions, not the observed values. For example, + suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in + x1 tend to be less than those in x2. + + Examples + -------- + >>> import numpy as np + >>> from scipy import stats + >>> import mars.tensor as mt + >>> from mars.tensor.stats import ks_1samp + + >>> np.random.seed(12345678) #fix random seed to get the same result + >>> x = mt.linspace(-15, 15, 9, chunk_size=5) + >>> ks_1samp(x, stats.norm.cdf).execute() + (0.44435602715924361, 0.038850142705171065) + + >>> ks_1samp(stats.norm.rvs(size=100), stats.norm.cdf).execute() + KstestResult(statistic=0.165471391799..., pvalue=0.007331283245...) + + *Test against one-sided alternative hypothesis* + + Shift distribution to larger values, so that `` CDF(x) < norm.cdf(x)``: + + >>> x = stats.norm.rvs(loc=0.2, size=100) + >>> ks_1samp(x, stats.norm.cdf, alternative='less').execute() + KstestResult(statistic=0.235488541678..., pvalue=1.158315030683...) + + Reject null hypothesis in favor of alternative hypothesis: less + + >>> ks_1samp(x, stats.norm.cdf, alternative='greater').execute() + KstestResult(statistic=0.010167165616..., pvalue=0.972494973653...) + + Reject null hypothesis in favor of alternative hypothesis: greater + + >>> ks_1samp(x, stats.norm.cdf).execute() + KstestResult(statistic=0.235488541678..., pvalue=2.316630061366...) 
+ + Don't reject null hypothesis in favor of alternative hypothesis: two-sided + + *Testing t distributed random variables against normal distribution* + + With 100 degrees of freedom the t distribution looks close to the normal + distribution, and the K-S test does not reject the hypothesis that the + sample came from the normal distribution: + + >>> ks_1samp(stats.t.rvs(100, size=100), stats.norm.cdf).execute() + KstestResult(statistic=0.077844250253..., pvalue=0.553155412513...) + + With 3 degrees of freedom the t distribution looks sufficiently different + from the normal distribution, that we can reject the hypothesis that the + sample came from the normal distribution at the 10% level: + + >>> ks_1samp(stats.t.rvs(3, size=100), stats.norm.cdf).execute() + KstestResult(statistic=0.118967105356..., pvalue=0.108627114578...) + """ + alternative = {"t": "two-sided", "g": "greater", "l": "less"}.get( + alternative.lower()[0], alternative + ) + if alternative not in ["two-sided", "greater", "less"]: + raise ValueError("Unexpected alternative %s" % alternative) + + x = mt.asarray(x) + N = x.shape[0] + x = mt.sort(x) + cdfvals = x.map_chunk(cdf, args=args, elementwise=True) + + if alternative == "greater": + Dplus = _compute_dplus(cdfvals, N) + return ExecutableTuple( + KstestResult(Dplus, Dplus.map_chunk(distributions.ksone.sf, args=(N,))) + ) + + if alternative == "less": + Dminus = _compute_dminus(cdfvals, N) + return ExecutableTuple( + KstestResult(Dminus, Dminus.map_chunk(distributions.ksone.sf, args=(N,))) + ) + + # alternative == 'two-sided': + Dplus = _compute_dplus(cdfvals, N) + Dminus = _compute_dminus(cdfvals, N) + D = mt.stack([Dplus, Dminus]).max() + if mode == "auto": # Always select exact + mode = "exact" + if mode == "exact": + prob = D.map_chunk(distributions.kstwo.sf, args=(N,), elementwise=True) + elif mode == "asymp": + prob = (D * np.sqrt(N)).map_chunk(distributions.kstwobign.sf, elementwise=True) + else: + # mode == 'approx' + prob = 2 * D.map_chunk(distributions.ksone.sf, args=(N,), elementwise=True) + prob = mt.clip(prob, 0, 1) + return ExecutableTuple(KstestResult(D, prob)) + + +def ks_2samp( + data1: Union[np.ndarray, list, TileableType], + data2: Union[np.ndarray, list, TileableType], + alternative: str = "two-sided", + mode: str = "auto", +): + """ + Compute the Kolmogorov-Smirnov statistic on 2 samples. + + This is a two-sided test for the null hypothesis that 2 independent samples + are drawn from the same continuous distribution. The alternative hypothesis + can be either 'two-sided' (default), 'less' or 'greater'. + + Parameters + ---------- + data1, data2 : array_like, 1-Dimensional + Two arrays of sample observations assumed to be drawn from a continuous + distribution, sample sizes can be different. + alternative : {'two-sided', 'less', 'greater'}, optional + Defines the alternative hypothesis. + The following options are available (default is 'two-sided'): + + * 'two-sided' + * 'less': one-sided, see explanation in Notes + * 'greater': one-sided, see explanation in Notes + mode : {'auto', 'exact', 'asymp'}, optional + Defines the method used for calculating the p-value. + The following options are available (default is 'auto'): + + * 'auto' : use 'exact' for small size arrays, 'asymp' for large + * 'exact' : use exact distribution of test statistic + * 'asymp' : use asymptotic distribution of test statistic + + Returns + ------- + statistic : float + KS statistic. + pvalue : float + Two-tailed p-value. 
+ + See Also + -------- + kstest, ks_1samp, epps_singleton_2samp, anderson_ksamp + + Notes + ----- + This tests whether 2 samples are drawn from the same distribution. Note + that, like in the case of the one-sample KS test, the distribution is + assumed to be continuous. + + In the one-sided test, the alternative is that the empirical + cumulative distribution function F(x) of the data1 variable is "less" + or "greater" than the empirical cumulative distribution function G(x) + of the data2 variable, ``F(x)<=G(x)``, resp. ``F(x)>=G(x)``. + + If the KS statistic is small or the p-value is high, then we cannot + reject the hypothesis that the distributions of the two samples + are the same. + + If the mode is 'auto', the computation is exact if the sample sizes are + less than 10000. For larger sizes, the computation uses the + Kolmogorov-Smirnov distributions to compute an approximate value. + + The 'two-sided' 'exact' computation computes the complementary probability + and then subtracts from 1. As such, the minimum probability it can return + is about 1e-16. While the algorithm itself is exact, numerical + errors may accumulate for large sample sizes. It is most suited to + situations in which one of the sample sizes is only a few thousand. + + We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk [1]_. + + References + ---------- + .. [1] Hodges, J.L. Jr., "The Significance Probability of the Smirnov + Two-Sample Test," Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. + + + Examples + -------- + >>> import numpy as np + >>> from scipy import stats + >>> import mars.tensor as mt + >>> from mars.tensor.stats import ks_2samp + >>> np.random.seed(12345678) #fix random seed to get the same result + >>> n1 = 200 # size of first sample + >>> n2 = 300 # size of second sample + + For a different distribution, we can reject the null hypothesis since the + pvalue is below 1%: + + >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1) + >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5) + >>> ks_2samp(rvs1, rvs2).execute() + KstestResult(statistic=0.20833333333333337, pvalue=5.1292795978041816e-05) + + For a slightly different distribution, we cannot reject the null hypothesis + at a 10% or lower alpha since the p-value at 0.144 is higher than 10% + + >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0) + >>> ks_2samp(rvs1, rvs3).execute() + KstestResult(statistic=0.10333333333333333, pvalue=0.14691437867433788) + + For an identical distribution, we cannot reject the null hypothesis since + the p-value is high, 41%: + + >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0) + >>> ks_2samp(rvs1, rvs4).execute() + KstestResult(statistic=0.07999999999999996, pvalue=0.4115432028915931) + + """ + + if mode not in ["auto", "exact", "asymp"]: + raise ValueError(f"Invalid value for mode: {mode}") + alternative = {"t": "two-sided", "g": "greater", "l": "less"}.get( + alternative.lower()[0], alternative + ) + if alternative not in ["two-sided", "less", "greater"]: + raise ValueError(f"Invalid value for alternative: {alternative}") + data1 = mt.asarray(data1) + data2 = mt.asarray(data2) + data1 = mt.sort(data1) + data2 = mt.sort(data2) + n1 = data1.shape[0] + n2 = data2.shape[0] + if min(n1, n2) == 0: + raise ValueError("Data passed to ks_2samp must not be empty") + + data_all = mt.concatenate([data1, data2]) + # using searchsorted solves equal data problem + cdf1 = mt.searchsorted(data1, data_all, side="right") / n1 + cdf2 = mt.searchsorted(data2, data_all, side="right") / n2 + cddiffs = cdf1 
- cdf2 + minS = mt.clip(-mt.min(cddiffs), 0, 1) # Ensure sign of minS is not negative. + maxS = mt.max(cddiffs) + alt2Dvalue = {"less": minS, "greater": maxS, "two-sided": mt.maximum(minS, maxS)} + d = alt2Dvalue[alternative] + prob = d.map_chunk( + _calc_prob_2samp, + args=(n1, n2, alternative, mode), + elementwise=True, + dtype=d.dtype, + ) + + return ExecutableTuple(Ks_2sampResult(d, prob)) diff --git a/python/xorbits/_mars/tensor/stats/power_divergence.py b/python/xorbits/_mars/tensor/stats/power_divergence.py new file mode 100644 index 000000000..7b50c4b0a --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/power_divergence.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple + +import numpy as np + +try: + from scipy.stats import distributions as sp_distributions +except ImportError: + sp_distributions = None + +from ...core import ExecutableTuple +from ...utils import require_not_none +from .. import special +from ..datasource import asarray + +# Map from names to lambda_ values used in power_divergence(). +_power_div_lambda_names = { + "pearson": 1, + "log-likelihood": 0, + "freeman-tukey": -0.5, + "mod-log-likelihood": -1, + "neyman": -2, + "cressie-read": 2 / 3, +} + + +def _count(a, axis=None): + if axis is None: + return a.size + else: + return a.shape[axis] + + +Power_divergenceResult = namedtuple("Power_divergenceResult", ("statistic", "pvalue")) + + +@require_not_none(sp_distributions) +def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None): + """ + Cressie-Read power divergence statistic and goodness of fit test. + + This function tests the null hypothesis that the categorical data + has the given frequencies, using the Cressie-Read power divergence + statistic. + + Parameters + ---------- + f_obs : array_like + Observed frequencies in each category. + f_exp : array_like, optional + Expected frequencies in each category. By default the categories are + assumed to be equally likely. + ddof : int, optional + "Delta degrees of freedom": adjustment to the degrees of freedom + for the p-value. The p-value is computed using a chi-squared + distribution with ``k - 1 - ddof`` degrees of freedom, where `k` + is the number of observed frequencies. The default value of `ddof` + is 0. + axis : int or None, optional + The axis of the broadcast result of `f_obs` and `f_exp` along which to + apply the test. If axis is None, all values in `f_obs` are treated + as a single data set. Default is 0. + lambda_ : float or str, optional + The power in the Cressie-Read power divergence statistic. The default + is 1. For convenience, `lambda_` may be assigned one of the following + strings, in which case the corresponding numerical value is used:: + + String Value Description + "pearson" 1 Pearson's chi-squared statistic. + In this case, the function is + equivalent to `stats.chisquare`. + "log-likelihood" 0 Log-likelihood ratio. Also known as + the G-test [3]_. 
+ "freeman-tukey" -1/2 Freeman-Tukey statistic. + "mod-log-likelihood" -1 Modified log-likelihood ratio. + "neyman" -2 Neyman's statistic. + "cressie-read" 2/3 The power recommended in [5]_. + + Returns + ------- + statistic : float or ndarray + The Cressie-Read power divergence test statistic. The value is + a float if `axis` is None or if` `f_obs` and `f_exp` are 1-D. + pvalue : float or ndarray + The p-value of the test. The value is a float if `ddof` and the + return value `stat` are scalars. + + See Also + -------- + chisquare + + Notes + ----- + This test is invalid when the observed or expected frequencies in each + category are too small. A typical rule is that all of the observed + and expected frequencies should be at least 5. + + When `lambda_` is less than zero, the formula for the statistic involves + dividing by `f_obs`, so a warning or error may be generated if any value + in `f_obs` is 0. + + Similarly, a warning or error may be generated if any value in `f_exp` is + zero when `lambda_` >= 0. + + The default degrees of freedom, k-1, are for the case when no parameters + of the distribution are estimated. If p parameters are estimated by + efficient maximum likelihood then the correct degrees of freedom are + k-1-p. If the parameters are estimated in a different way, then the + dof can be between k-1-p and k-1. However, it is also possible that + the asymptotic distribution is not a chisquare, in which case this + test is not appropriate. + + This function handles masked arrays. If an element of `f_obs` or `f_exp` + is masked, then data at that position is ignored, and does not count + towards the size of the data set. + + .. versionadded:: 0.13.0 + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 8. + https://web.archive.org/web/20171015035606/http://faculty.vassar.edu/lowry/ch8pt1.html + .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test + .. [3] "G-test", https://en.wikipedia.org/wiki/G-test + .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and + practice of statistics in biological research", New York: Freeman + (1981) + .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit + Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), + pp. 440-464. + + Examples + -------- + (See `chisquare` for more examples.) + + When just `f_obs` is given, it is assumed that the expected frequencies + are uniform and given by the mean of the observed frequencies. Here we + perform a G-test (i.e. use the log-likelihood ratio statistic): + + >>> import mars.tensor as mt + >>> from mars.tensor.stats import power_divergence + >>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood').execute() + (2.006573162632538, 0.84823476779463769) + + The expected frequencies can be given with the `f_exp` argument: + + >>> power_divergence([16, 18, 16, 14, 12, 12], + ... f_exp=[16, 16, 16, 16, 16, 8], + ... lambda_='log-likelihood').execute() + (3.3281031458963746, 0.6495419288047497) + + When `f_obs` is 2-D, by default the test is applied to each column. + + >>> obs = mt.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T + >>> obs.shape + (6, 2) + >>> power_divergence(obs, lambda_="log-likelihood").execute() + (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225])) + + By setting ``axis=None``, the test is applied to all data in the array, + which is equivalent to applying the test to the flattened array. 
+ + >>> power_divergence(obs, axis=None).execute() + (23.31034482758621, 0.015975692534127565) + >>> power_divergence(obs.ravel()).execute() + (23.31034482758621, 0.015975692534127565) + + `ddof` is the change to make to the default degrees of freedom. + + >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1).execute() + (2.0, 0.73575888234288467) + + `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has + shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting + `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared + statistics, we must use ``axis=1``: + + >>> power_divergence([16, 18, 16, 14, 12, 12], + ... f_exp=[[16, 16, 16, 16, 16, 8], + ... [8, 20, 20, 16, 12, 12]], + ... axis=1) + (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) + """ + # Convert the input argument `lambda_` to a numerical value. + if isinstance(lambda_, str): + if lambda_ not in _power_div_lambda_names: + names = repr(list(_power_div_lambda_names.keys()))[1:-1] + raise ValueError( + "invalid string for lambda_: {0!r}. Valid strings " + "are {1}".format(lambda_, names) + ) + lambda_ = _power_div_lambda_names[lambda_] + elif lambda_ is None: + lambda_ = 1 + + f_obs = asarray(f_obs) + + if f_exp is not None: + f_exp = asarray(f_exp) + else: + f_exp = f_obs.mean(axis=axis, keepdims=True) + + # `terms` is the array of terms that are summed along `axis` to create + # the test statistic. We use some specialized code for a few special + # cases of lambda_. + if lambda_ == 1: + # Pearson's chi-squared statistic + terms = (f_obs.astype(np.float64) - f_exp) ** 2 / f_exp + elif lambda_ == 0: + # Log-likelihood ratio (i.e. G-test) + terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp) + elif lambda_ == -1: + # Modified log-likelihood ratio + terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs) + else: + # General Cressie-Read power divergence. + terms = f_obs * ((f_obs / f_exp) ** lambda_ - 1) + terms /= 0.5 * lambda_ * (lambda_ + 1) + + stat = terms.sum(axis=axis) + + num_obs = _count(terms, axis=axis) + # we decide not to support ddof for multiple dimensions + # ddof = asarray(ddof) + p = stat.map_chunk( + sp_distributions.chi2.sf, (num_obs - 1 - ddof,), elementwise=True + ) + + return ExecutableTuple(Power_divergenceResult(stat, p)) diff --git a/python/xorbits/_mars/tensor/stats/rankdata.py b/python/xorbits/_mars/tensor/stats/rankdata.py new file mode 100644 index 000000000..799b98c77 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/rankdata.py @@ -0,0 +1,113 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import tensor as mt + + +def rankdata(a, method="average", *, axis=None): + """Assign ranks to data, dealing with ties appropriately. + By default (``axis=None``), the data array is first flattened, and a flat + array of ranks is returned. Separately reshape the rank array to the + shape of the data array if desired (see Examples). + Ranks begin at 1. 
The `method` argument controls how ranks are assigned + to equal values. See [1]_ for further discussion of ranking methods. + Parameters + ---------- + a : array_like + The array of values to be ranked. + method : {'average', 'min', 'max', 'dense', 'ordinal'}, optional + The method used to assign ranks to tied elements. + The following methods are available (default is 'average'): + * 'average': The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + * 'min': The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also + referred to as "competition" ranking.) + * 'max': The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + * 'dense': Like 'min', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + * 'ordinal': All values are given a distinct rank, corresponding to + the order that the values occur in `a`. + axis : {None, int}, optional + Axis along which to perform the ranking. If ``None``, the data array + is first flattened. + Returns + ------- + ranks : ndarray + An array of size equal to the size of `a`, containing rank + scores. + References + ---------- + .. [1] "Ranking", https://en.wikipedia.org/wiki/Ranking + Examples + -------- + >>> from mars.tensor.stats import rankdata + >>> rankdata([0, 2, 3, 2]).execute() + array([ 1. , 2.5, 4. , 2.5]) + >>> rankdata([0, 2, 3, 2], method='min').execute() + array([ 1, 2, 4, 2]) + >>> rankdata([0, 2, 3, 2], method='max').execute() + array([ 1, 3, 4, 3]) + >>> rankdata([0, 2, 3, 2], method='dense').execute() + array([ 1, 2, 3, 2]) + >>> rankdata([0, 2, 3, 2], method='ordinal').execute() + array([ 1, 2, 4, 3]) + >>> rankdata([[0, 2], [3, 2]]).reshape(2,2).execute() + array([[1. , 2.5], + [4. , 2.5]]) + >>> rankdata([[0, 2, 2], [3, 2, 5]], axis=1).execute() + array([[1. , 2.5, 2.5], + [2. , 1. , 3. ]]) + """ + if method not in ("average", "min", "max", "dense", "ordinal"): + raise ValueError('unknown method "{0}"'.format(method)) + + if axis is not None: + a = np.asarray(a) + if a.size == 0: + np.core.multiarray.normalize_axis_index(axis, a.ndim) + dt = np.float64 if method == "average" else np.int_ + return mt.empty(a.shape, dtype=dt) + return mt.tensor(np.apply_along_axis(rankdata, axis, a, method)) + + arr = mt.ravel(mt.asarray(a)) + algo = "mergesort" if method == "ordinal" else "quicksort" + sorter = mt.argsort(arr, kind=algo) + + inv = mt.empty(sorter.size, dtype=np.intp) + inv[sorter] = mt.arange(sorter.size, dtype=np.intp) + + if method == "ordinal": + return inv + 1 + + arr = arr[sorter] + obs = mt.r_[True, arr[1:] != arr[:-1]] + dense = obs.cumsum()[inv] + + if method == "dense": + return dense + + count = mt.r_[mt.nonzero(obs)[0], len(obs)] + + if method == "max": + return count[dense] + + if method == "min": + return count[dense - 1] + 1 + + return 0.5 * (count[dense] + count[dense - 1] + 1) diff --git a/python/xorbits/_mars/tensor/stats/tests/__init__.py b/python/xorbits/_mars/tensor/stats/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/stats/tests/test_stats_execution.py b/python/xorbits/_mars/tensor/stats/tests/test_stats_execution.py new file mode 100644 index 000000000..93bb66bc7 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/tests/test_stats_execution.py @@ -0,0 +1,343 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools + +import numpy as np +import pytest +import scipy +from scipy.stats import chisquare as sp_chisquare +from scipy.stats import entropy as sp_entropy +from scipy.stats import ks_1samp as sp_ks_1samp +from scipy.stats import ks_2samp as sp_ks_2samp +from scipy.stats import norm as sp_norm +from scipy.stats import power_divergence as sp_power_divergence +from scipy.stats import rankdata as sp_rankdata +from scipy.stats import ttest_1samp as sp_ttest_1samp +from scipy.stats import ttest_ind as sp_ttest_ind +from scipy.stats import ttest_ind_from_stats as sp_ttest_ind_from_stats +from scipy.stats import ttest_rel as sp_ttest_rel + +from ....lib.version import parse as parse_version +from ... import tensor +from .. 
import ( + chisquare, + entropy, + ks_1samp, + ks_2samp, + power_divergence, + rankdata, + ttest_1samp, + ttest_ind, + ttest_ind_from_stats, + ttest_rel, +) + + +def test_entropy_execution(setup): + rs = np.random.RandomState(0) + a = rs.rand(10) + + t1 = tensor(a, chunk_size=4) + r = entropy(t1) + + result = r.execute().fetch() + expected = sp_entropy(a) + np.testing.assert_array_almost_equal(result, expected) + + b = rs.rand(10) + base = 3.1 + + t2 = tensor(b, chunk_size=4) + r = entropy(t1, t2, base) + + result = r.execute().fetch() + expected = sp_entropy(a, b, base) + np.testing.assert_array_almost_equal(result, expected) + + b = rs.rand(10) + base = 3.1 + + t2 = tensor(b, chunk_size=4) + r = entropy(t1, t2, base) + + result = r.execute().fetch() + expected = sp_entropy(a, b, base) + np.testing.assert_array_almost_equal(result, expected) + + r = entropy(t1, t2, t1.sum()) + + result = r.execute().fetch() + expected = sp_entropy(a, b, a.sum()) + np.testing.assert_array_almost_equal(result, expected) + + with pytest.raises(ValueError): + entropy(t1, t2[:7]) + + +def test_power_divergence_execution(setup): + f_obs_raw = np.array([16, 18, 16, 14, 12, 12]) + f_exp_raw = np.array([16, 16, 16, 16, 16, 8]) + + f_obs = tensor(f_obs_raw, chunk_size=4) + f_exp = tensor(f_exp_raw, chunk_size=4) + + with pytest.raises(ValueError): + power_divergence(f_obs, f_exp, lambda_="non-exist-lambda") + + r = power_divergence(f_obs, lambda_="pearson") + result = r.execute().fetch() + + expected = sp_power_divergence(f_obs_raw, lambda_="pearson") + np.testing.assert_almost_equal(expected[0], result[0]) + np.testing.assert_almost_equal(expected[1], result[1]) + + modes = [ + None, + "pearson", + "log-likelihood", + "mod-log-likelihood", + "neyman", + ] + + for mode in modes: + r = power_divergence(f_obs, f_exp, lambda_=mode) + result = r.execute().fetch() + + expected = sp_power_divergence(f_obs_raw, f_exp_raw, lambda_=mode) + np.testing.assert_almost_equal(expected[0], result[0]) + np.testing.assert_almost_equal(expected[1], result[1]) + + +def test_chisquare_execution(setup): + f_obs_raw = np.array([16, 18, 16, 14, 12, 12]) + f_exp_raw = np.array([16, 16, 16, 16, 16, 8]) + + f_obs = tensor(f_obs_raw, chunk_size=4) + f_exp = tensor(f_exp_raw, chunk_size=4) + + r = chisquare(f_obs, f_exp) + result = r.execute().fetch() + + expected = sp_chisquare(f_obs_raw, f_exp_raw) + np.testing.assert_almost_equal(expected[0], result[0]) + np.testing.assert_almost_equal(expected[1], result[1]) + + +def test_t_test_execution(setup): + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + alternatives = ["less", "greater", "two-sided"] + + mt_from_stats = ( + lambda a, b, alternative=None, equal_var=True: ttest_ind_from_stats( + a.mean(), + a.std(), + a.shape[0], + b.mean(), + b.std(), + b.shape[0], + alternative=alternative, + equal_var=equal_var, + ) + ) + sp_from_stats = ( + lambda a, b, alternative=None, equal_var=True: sp_ttest_ind_from_stats( + a.mean(), + a.std(), + a.shape[0], + b.mean(), + b.std(), + b.shape[0], + alternative=alternative, + equal_var=equal_var, + ) + ) + else: + alternatives = ["two-sided"] + + mt_from_stats = lambda a, b, equal_var=True: ttest_ind_from_stats( + a.mean(), + a.std(), + a.shape[0], + b.mean(), + b.std(), + b.shape[0], + equal_var=equal_var, + ) + sp_from_stats = lambda a, b, equal_var=True: sp_ttest_ind_from_stats( + a.mean(), + a.std(), + a.shape[0], + b.mean(), + b.std(), + b.shape[0], + equal_var=equal_var, + ) + + funcs = [ + (ttest_rel, sp_ttest_rel), + ( + 
functools.partial(ttest_ind, equal_var=True), + functools.partial(sp_ttest_ind, equal_var=True), + ), + ( + functools.partial(ttest_ind, equal_var=False), + functools.partial(sp_ttest_ind, equal_var=False), + ), + ( + functools.partial(mt_from_stats, equal_var=True), + functools.partial(sp_from_stats, equal_var=True), + ), + ( + functools.partial(mt_from_stats, equal_var=False), + functools.partial(sp_from_stats, equal_var=False), + ), + ] + + fa_raw = np.array([16, 18, 16, 14, 12, 12]) + fb_raw = np.array([16, 16, 16, 16, 16, 8]) + + fa = tensor(fa_raw, chunk_size=4) + fb = tensor(fb_raw, chunk_size=4) + + for mt_func, sp_func in funcs: + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + with pytest.raises(ValueError): + mt_func(fa, fb, alternative="illegal-alternative") + + for alt in alternatives: + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + r = mt_func(fa, fb, alternative=alt) + else: + r = mt_func(fa, fb) + result = r.execute().fetch() + + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + expected = sp_func(fa_raw, fb_raw, alternative=alt) + else: + expected = sp_func(fa_raw, fb_raw) + np.testing.assert_almost_equal(expected[0], result[0]) + np.testing.assert_almost_equal(expected[1], result[1]) + + # second param size must be 1 for ttest_1samp + fb_raw = np.array([16]) + fb = tensor(fb_raw) + for alt in alternatives: + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + r = ttest_1samp(fa, fb, alternative=alt) + else: + r = ttest_1samp(fa, fb) + result = r.execute().fetch() + + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + expected = sp_ttest_1samp(fa_raw, fb_raw, alternative=alt) + else: + expected = sp_ttest_1samp(fa_raw, fb_raw) + np.testing.assert_almost_equal(expected[0], result[0]) + np.testing.assert_almost_equal(expected[1], result[1]) + + +@pytest.mark.parametrize("chunk_size", [5, 15]) +@pytest.mark.parametrize( + "mode, alternative", + [ + ("auto", "greater"), + ("auto", "less"), + ("auto", "two-sided"), + ("asymp", "two-sided"), + ("approx", "two-sided"), + ], +) +def test_ks_1samp(setup, chunk_size, mode, alternative): + x = tensor(np.linspace(-15, 15, 9), chunk_size=5) + + result = ks_1samp(x, sp_norm.cdf, mode=mode).execute().fetch() + expected = sp_ks_1samp(x, sp_norm.cdf, mode=mode) + assert result == expected + + with pytest.raises(ValueError): + ks_1samp(x, sp_norm.cdf, alternative="unknown") + + +@pytest.mark.parametrize("chunk_size", [5, 15]) +def test_ks_2samp(setup, chunk_size): + n1 = 10 + n2 = 15 + rs = np.random.RandomState(0) + rvs1 = sp_norm.rvs(size=n1, loc=0.0, scale=1, random_state=rs) + rvs2 = sp_norm.rvs(size=n2, loc=0.5, scale=1.5, random_state=rs) + + d1 = tensor(rvs1, chunk_size=chunk_size) + d2 = tensor(rvs2, chunk_size=chunk_size) + + result = ks_2samp(d1, d2).execute().fetch() + expected = sp_ks_2samp(rvs1, rvs2) + assert result == expected + + with pytest.raises(ValueError): + ks_2samp(d1, d2, alternative="unknown") + + with pytest.raises(ValueError): + ks_2samp(d1, d2, mode="unknown") + + with pytest.raises(ValueError): + ks_2samp(d1, []) + + +def test_rankdata_execution(setup): + rs = np.random.RandomState(0) + a = rs.rand(4) + + t1 = tensor(a, chunk_size=5) + r = rankdata(t1) + + result = r.execute().fetch() + expected = sp_rankdata(a) + np.testing.assert_array_almost_equal(result, expected) + + b = rs.rand(4, 4) + + t2 = tensor(b, chunk_size=5) + r2 = rankdata(t2, axis=1) + + result = r2.execute().fetch() + expected = sp_rankdata(b, axis=1) + 
np.testing.assert_array_almost_equal(result, expected) + + c = rs.rand(0, 4) + + t3 = tensor(c, chunk_size=5) + r3 = rankdata(t3, axis=1) + + result = r3.execute().fetch() + expected = sp_rankdata(c, axis=1) + np.testing.assert_array_almost_equal(result, expected) + + methods = [ + "average", + "min", + "max", + "dense", + "ordinal", + ] + + for method in methods: + r = rankdata(t1, method=method) + result = r.execute().fetch() + + expected = sp_rankdata(a, method=method) + np.testing.assert_almost_equal(result, expected) + + with pytest.raises(ValueError): + r = rankdata(t1, method="unknown") diff --git a/python/xorbits/_mars/tensor/stats/ttest.py b/python/xorbits/_mars/tensor/stats/ttest.py new file mode 100644 index 000000000..f593cc505 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/ttest.py @@ -0,0 +1,165 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple + +import numpy as np +from scipy import __version__ as sp_version +from scipy.stats import distributions as sp_distributions +from scipy.stats import ttest_1samp as sp_ttest_1samp +from scipy.stats import ttest_ind as sp_ttest_ind +from scipy.stats import ttest_ind_from_stats as sp_ttest_ind_from_stats +from scipy.stats import ttest_rel as sp_ttest_rel + +from ...core import ExecutableTuple +from ...lib.version import parse as parse_version +from ..arithmetic import absolute as mt_abs +from ..arithmetic import divide as mt_divide +from ..arithmetic import isnan as mt_isnan +from ..arithmetic import sqrt as mt_sqrt +from ..base import where as mt_where +from ..reduction import mean as mt_mean +from ..reduction import var as mt_var +from ..utils import implement_scipy + + +def _equal_var_ttest_denom(v1, n1, v2, n2): + df = n1 + n2 - 2.0 + svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df + denom = mt_sqrt(svar * (1.0 / n1 + 1.0 / n2)) # XXX: np -> da + return df, denom + + +def _unequal_var_ttest_denom(v1, n1, v2, n2): + vn1 = v1 / n1 + vn2 = v2 / n2 + with np.errstate(divide="ignore", invalid="ignore"): + df = (vn1 + vn2) ** 2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1)) + + # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0). + # Hence it doesn't matter what df is as long as it's not NaN. 
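+    # The df formula above is the Welch-Satterthwaite approximation used by
+    # Welch's t-test; substitute a harmless placeholder where it came out NaN.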
+ df = mt_where(mt_isnan(df), 1, df) + denom = mt_sqrt(vn1 + vn2) + return df, denom + + +def _ttest_ind_from_stats(mean1, mean2, denom, df, alternative): + d = mean1 - mean2 + with np.errstate(divide="ignore", invalid="ignore"): + t = mt_divide(d, denom) + t, prob = _ttest_finish(df, t, alternative) + + return t, prob + + +def _ttest_finish(df, t, alternative): + """Common code between all 3 t-test functions.""" + if alternative != "two-sided" and parse_version(sp_version) < parse_version( + "1.6.0" + ): # pragma: no cover + raise ValueError("alternative must be 'two-sided' with scipy prior to 1.6.0") + + if alternative == "less": + prob = t.map_chunk(sp_distributions.t.cdf, args=(df,)) + elif alternative == "greater": + prob = t.map_chunk(sp_distributions.t.sf, args=(df,)) + elif alternative == "two-sided": + prob = mt_abs(t).map_chunk(sp_distributions.t.sf, args=(df,)) * 2 + else: + raise ValueError("alternative must be 'less', 'greater' or 'two-sided'") + if t.ndim == 0: + t = t[()] + return t, prob + + +Ttest_1sampResult = namedtuple("Ttest_1sampResult", ("statistic", "pvalue")) + + +@implement_scipy(sp_ttest_1samp) +def ttest_1samp(a, popmean, axis=0, nan_policy="propagate", alternative="two-sided"): + if nan_policy != "propagate": + raise NotImplementedError( + "`nan_policy` other than 'propagate' have not been implemented." + ) + n = a.shape[axis] + df = n - 1 + + d = a.mean(axis=axis) - popmean + v = a.var(axis=axis, ddof=1) + denom = mt_sqrt(v / float(n)) + + with np.errstate(divide="ignore", invalid="ignore"): + t = mt_divide(d, denom) + t, prob = _ttest_finish(df, t, alternative) + return ExecutableTuple(Ttest_1sampResult(t, prob)) + + +Ttest_indResult = namedtuple("Ttest_indResult", ("statistic", "pvalue")) + + +@implement_scipy(sp_ttest_ind) +def ttest_ind(a, b, axis=0, equal_var=True, alternative="two-sided"): + v1 = mt_var(a, axis, ddof=1) + v2 = mt_var(b, axis, ddof=1) + n1 = a.shape[axis] + n2 = b.shape[axis] + + if equal_var: + df, denom = _equal_var_ttest_denom(v1, n1, v2, n2) + else: + df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2) + + res = _ttest_ind_from_stats( + mt_mean(a, axis), mt_mean(b, axis), denom, df, alternative + ) + + return ExecutableTuple(Ttest_indResult(*res)) + + +@implement_scipy(sp_ttest_ind_from_stats) +def ttest_ind_from_stats( + mean1, std1, nobs1, mean2, std2, nobs2, equal_var=True, alternative="two-sided" +): + if equal_var: + df, denom = _equal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2) + else: + df, denom = _unequal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2) + + res = _ttest_ind_from_stats(mean1, mean2, denom, df, alternative) + return ExecutableTuple(Ttest_indResult(*res)) + + +Ttest_relResult = namedtuple("Ttest_relResult", ("statistic", "pvalue")) + + +@implement_scipy(sp_ttest_rel) +def ttest_rel(a, b, axis=0, nan_policy="propagate", alternative="two-sided"): + if nan_policy != "propagate": + raise NotImplementedError( + "`nan_policy` other than 'propagate' have not been implemented." 
+ ) + + n = a.shape[axis] + df = float(n - 1) + + d = (a - b).astype(np.float64) + v = mt_var(d, axis, ddof=1) + dm = mt_mean(d, axis) + denom = mt_sqrt(v / float(n)) + + with np.errstate(divide="ignore", invalid="ignore"): + t = mt_divide(dm, denom) + t, prob = _ttest_finish(df, t, alternative) + + return ExecutableTuple(Ttest_relResult(t, prob)) diff --git a/python/xorbits/_mars/tensor/tests/__init__.py b/python/xorbits/_mars/tensor/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/tests/test_core.py b/python/xorbits/_mars/tensor/tests/test_core.py new file mode 100644 index 000000000..f0d867272 --- /dev/null +++ b/python/xorbits/_mars/tensor/tests/test_core.py @@ -0,0 +1,36 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import tensor as mt +from ...core import tile + + +def test_params(): + raw = np.random.rand(10, 10) + a = mt.tensor(raw) + a = a[a[0] < 0.5] + a = tile(a) + c = a.chunks[0] + + assert any(np.isnan(s) for s in c.params["shape"]) + c.params = c.get_params_from_data(raw[raw[0] < 0.5]) + assert not any(np.isnan(s) for s in c.params["shape"]) + + params = c.params.copy() + params.pop("index", None) + a.params = params + assert np.prod(a.shape) > 0 + a.refresh_params() diff --git a/python/xorbits/_mars/tensor/tests/test_core_execution.py b/python/xorbits/_mars/tensor/tests/test_core_execution.py new file mode 100644 index 000000000..f53ed763e --- /dev/null +++ b/python/xorbits/_mars/tensor/tests/test_core_execution.py @@ -0,0 +1,282 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .. 
import ( + add, + atleast_1d, + atleast_2d, + atleast_3d, + moveaxis, + ones, + squeeze, + swapaxes, + tensor, +) + + +def test_array_function(setup): + a = ones((10, 20), chunk_size=8) + + # test sum + np.testing.assert_equal(np.sum(a).execute().fetch(), 200) + + # test qr + q, r = np.linalg.qr(a) + np.testing.assert_array_almost_equal(np.dot(q, r).execute().fetch(), a) + + +def test_view_data_on_slice(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=8) + b = a[:5, 5:10] + b[:3, :3] = 3 + + npa = data.copy() + npb = npa[:5, 5:10] + npb[:3, :3] = 3 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=8) + b = a[:7] + b += 1 + + npa = data.copy() + npb = npa[:7] + npb += 1 + + np.testing.assert_array_equal(a.execute(), npa) + np.testing.assert_array_equal(b.execute(), npb) + + +def test_set_item_on_view(setup): + a = ones((5, 8), dtype=int) + b = a[:3] + b[0, 0] = 2 + c = b.ravel() # create view + c[1] = 4 + + npa = np.ones((5, 8), dtype=int) + npb = npa[:3] + npb[0, 0] = 2 + npc = npb.ravel() # create view + npc[1] = 4 + + np.testing.assert_array_equal(a.execute(), npa) + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(c.execute(), npc) + + +def test_view_data_on_transpose(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=6) + b = a.T + add(b, 1, out=b) + + np.testing.assert_array_equal(b.execute(), data.T + 1) + np.testing.assert_array_equal(a.execute(), data + 1) + + +def test_view_data_on_swapaxes(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=6) + b = swapaxes(a, 1, 0) + a[1] = 10 + + npa = data.copy() + npb = np.swapaxes(npa, 1, 0) + npa[1] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + +def test_view_data_on_moveaxis(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=6) + b = moveaxis(a, 1, 0) + a[0][1] = 10 + + npa = data.copy() + npb = np.moveaxis(npa, 1, 0) + npa[0][1] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + +def test_view_data_on_atleast1d(setup): + a = atleast_1d(1) + b = a[:] + b[0] = 10 + + np.testing.assert_array_equal(b.execute(), np.array([10])) + np.testing.assert_array_equal(a.execute(), np.array([10])) + + +def test_view_data_on_atleast2d(setup): + data = np.random.rand(10) + a = atleast_2d(tensor(data, chunk_size=5)) + b = add(a[:, :5], 1, out=a[:, 5:]) + + npa = np.atleast_2d(data.copy()) + npb = np.add(npa[:, :5], 1, out=npa[:, 5:]) + + np.testing.assert_array_equal(a.execute(), npa) + np.testing.assert_array_equal(b.execute(), npb) + + +def test_view_data_on_atleast3d(setup): + data = np.random.rand(10, 20) + a = atleast_3d(tensor(data, chunk_size=5)) + b = a[:, :5, :10][0] + c = add(b[:4], b[1:], out=a[0, 16:]) + + npa = np.atleast_3d(data.copy()) + npb = npa[:, :5, :10][0] + npc = np.add(npb[:4], npb[1:], out=npa[0, 16:]) + + np.testing.assert_array_equal(a.execute(), npa) + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(c.execute(), npc) + + +def test_view_data_on_squeeze(setup): + data = np.random.rand(1, 4, 1) + a = tensor(data, chunk_size=2) + b = squeeze(a, axis=0) + b[:3] = 10 + + npa = data.copy() + npb = np.squeeze(npa, axis=0) + npb[:3] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + +def 
test_view_data_on_reshape(setup): + data = np.random.RandomState(0).random((3, 4, 5)) + a = tensor(data.copy(), chunk_size=2) + b = a.reshape((5, 4, 3)) + b[:3] = 10 + + npa = data.copy() + npb = npa.reshape((5, 4, 3)) + npb[:3] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + data = np.random.RandomState(0).random((4, 5)) + a2 = tensor(data.copy(), chunk_size=2) + b2 = a2.reshape((5, 4), order="F") + b2[:3] = 10 + + npa = data.copy() + npb = npa.reshape((5, 4), order="F") + npb[:3] = 10 + + b2_result = b2.execute() + np.testing.assert_array_equal(a2.execute(), npa) + np.testing.assert_array_equal(b2_result, npb) + + +def test_view_data_on_ravel(setup): + # ravel creates a view + data = np.random.rand(3, 4, 5) + a = tensor(data, chunk_size=2) + b = a.ravel() + b[:10] = 10 + + npa = data.copy() + npb = npa.ravel() + npb[:10] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + # flatten creates a copy + data = np.random.rand(3, 4, 5) + a = tensor(data, chunk_size=2) + b = a.flatten() + b[:10] = 10 + + npa = data.copy() + npb = npa.flatten() + npb[:10] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + +def test_copy_and_view(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=6) + b = a.view() + b[:5] = 10 + + npa = data.copy() + npb = npa.view() + npb[:5] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + data = np.random.rand(10, 20) + a = tensor(data.copy(), chunk_size=6) + b = a.copy() + b[:5] = 10 + + npa = data.copy() + npb = npa.copy() + npb[:5] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + a = tensor(data.copy(), chunk_size=6) + b = a[:5, :4] + c = b.copy() + c[0, 0] = 10 + + npa = data.copy() + npb = npa[:5, :4] + npc = npb.copy() + npc[0, 0] = 10 + + np.testing.assert_array_equal(c.execute(), npc) + np.testing.assert_array_equal(a.execute(), npa) + + +def test_flat(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=4) + fl = a.flat + fl[1:10] = 10 + b = fl[10:20] + b[0:4] = 20 + + npa = data.copy() + npfl = npa.flat + npfl[1:10] = 10 + npb = npfl[10:20] + npb[0:4] = 20 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) diff --git a/python/xorbits/_mars/tensor/tests/test_utils.py b/python/xorbits/_mars/tensor/tests/test_utils.py new file mode 100644 index 000000000..890806ca0 --- /dev/null +++ b/python/xorbits/_mars/tensor/tests/test_utils.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ... 
import tensor as mt +from ...lib.mmh3 import hash_from_buffer as mmh3_hash_from_buffer +from ..utils import fetch_corner_data, hash_on_axis, normalize_axis_tuple + + +def test_hash_on_axis(): + hash_from_buffer = lambda x: mmh3_hash_from_buffer(memoryview(x)) + + a = np.random.rand(10) + + result = hash_on_axis(a, 0, 3) + expected = np.array([mmh3_hash_from_buffer(element) % 3 for element in a]) + + np.testing.assert_array_equal(result, expected) + + result = hash_on_axis(a, 0, 1) + expected = np.array([0 for _ in a]) + + np.testing.assert_array_equal(result, expected) + + a = np.random.rand(10, 5) + + result = hash_on_axis(a, 0, 3) + expected = np.array([hash_from_buffer(a[i, :]) % 3 for i in range(a.shape[0])]) + + np.testing.assert_array_equal(result, expected) + + result = hash_on_axis(a, 1, 3) + expected = np.array([hash_from_buffer(a[:, i]) % 3 for i in range(a.shape[1])]) + + np.testing.assert_array_equal(result, expected) + + a = np.random.rand(10, 5, 4) + + result = hash_on_axis(a, 2, 3) + expected = np.array([hash_from_buffer(a[:, :, i]) % 3 for i in range(a.shape[2])]) + + np.testing.assert_array_equal(result, expected) + + +def test_normalize_axis_tuple(): + assert normalize_axis_tuple(-1, 3) == (2,) + assert normalize_axis_tuple([0, -2], 3) == (0, 1) + assert sorted(normalize_axis_tuple({0, -2}, 3)) == [0, 1] + + with pytest.raises(ValueError) as cm: + normalize_axis_tuple((1, -2), 3, argname="axes") + assert "axes" in str(cm.value) + + with pytest.raises(ValueError): + normalize_axis_tuple((1, -2), 3) + + +def test_fetch_tensor_corner_data(setup): + print_options = np.get_printoptions() + + # make sure numpy default option + assert print_options["edgeitems"] == 3 + assert print_options["threshold"] == 1000 + + size = 12 + for i in (2, 4, size - 3, size, size + 3): + arr = np.random.rand(i, i, i) + t = mt.tensor(arr, chunk_size=size // 2) + t.execute() + + corner_data = fetch_corner_data(t) + corner_threshold = 1000 if t.size < 1000 else corner_data.size - 1 + with np.printoptions(threshold=corner_threshold, suppress=True): + # when we repr corner data, we need to limit threshold that + # it's exactly less than the size + repr_corner_data = repr(corner_data) + with np.printoptions(suppress=True): + repr_result = repr(arr) + assert repr_corner_data == repr_result diff --git a/python/xorbits/_mars/tensor/ufunc/__init__.py b/python/xorbits/_mars/tensor/ufunc/__init__.py new file mode 100644 index 000000000..a8027eea4 --- /dev/null +++ b/python/xorbits/_mars/tensor/ufunc/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
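+
+# Installing ``_array_ufunc`` as ``Tensor.__array_ufunc__`` (see ``_install``
+# below) makes NumPy ufuncs applied to Mars tensors dispatch to the
+# corresponding lazy tensor operations instead of coercing to ndarrays.
+# A rough usage sketch (import path as used in the docstrings of this
+# package)::
+#
+#     import numpy as np
+#     import mars.tensor as mt
+#
+#     t = mt.ones((4, 4), chunk_size=2)
+#     np.add(t, 1)              # a lazy Mars tensor; evaluate with .execute()
+#     np.add.reduce(t, axis=1)  # dispatched to the tensor sum reduction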
+ + +def _install(): + from ..core import Tensor + from .ufunc import _array_ufunc + + Tensor.__array_ufunc__ = _array_ufunc + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/ufunc/tests/__init__.py b/python/xorbits/_mars/tensor/ufunc/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/ufunc/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/ufunc/tests/test_ufunc_execution.py b/python/xorbits/_mars/tensor/ufunc/tests/test_ufunc_execution.py new file mode 100644 index 000000000..3ef36850b --- /dev/null +++ b/python/xorbits/_mars/tensor/ufunc/tests/test_ufunc_execution.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from .... 
import tensor as mt +from ...core import Tensor + + +@pytest.mark.parametrize("ufunc_name", ["negative"]) +def test_unary_ufunc(setup, ufunc_name): + raw_data = np.random.rand(100, 100) + t = mt.tensor(raw_data.copy(), chunk_size=20) + + ufunc_obj = getattr(np, ufunc_name) + + res = ufunc_obj(t) + expected = ufunc_obj(raw_data) + assert isinstance(res, Tensor) + np.testing.assert_array_equal(res.execute().fetch(), expected) + + ufunc_obj.at(t, 3) + ufunc_obj.at(raw_data, 3) + np.testing.assert_array_equal(t.execute().fetch(), raw_data) + + +@pytest.mark.parametrize("ufunc_name", ["add", "multiply", "logaddexp", "logaddexp2"]) +def test_binary_ufunc(setup, ufunc_name): + raw_data1 = np.random.rand(100, 100) + t1 = mt.tensor(raw_data1.copy(), chunk_size=50) + raw_data2 = np.random.rand(100, 100) + t2 = mt.tensor(raw_data2.copy(), chunk_size=50) + + ufunc_obj = getattr(np, ufunc_name) + + res = ufunc_obj(t1, t2) + expected = ufunc_obj(raw_data1, raw_data2) + assert isinstance(res, Tensor) + np.testing.assert_array_equal(res.execute().fetch(), expected) + + ufunc_obj.at(t1, (3, 4), 2) + ufunc_obj.at(raw_data1, (3, 4), 2) + np.testing.assert_array_equal(t1.execute().fetch(), raw_data1) + + res = ufunc_obj.reduce(t1, axis=1) + expected = ufunc_obj.reduce(raw_data1, axis=1) + assert isinstance(res, Tensor) + np.testing.assert_almost_equal(res.execute().fetch(), expected) + + res = t1.copy() + ufunc_obj.reduce(t1, axis=1, out=res) + expected = ufunc_obj.reduce(raw_data1, axis=1) + assert isinstance(res, Tensor) + np.testing.assert_almost_equal(res.execute().fetch(), expected) + + res = ufunc_obj.accumulate(t1, axis=1) + expected = ufunc_obj.accumulate(raw_data1, axis=1) + assert isinstance(res, Tensor) + np.testing.assert_almost_equal(res.execute().fetch(), expected) + + res = t1.copy() + ufunc_obj.accumulate(t1, axis=1, out=res) + expected = ufunc_obj.accumulate(raw_data1, axis=1) + assert isinstance(res, Tensor) + np.testing.assert_almost_equal(res.execute().fetch(), expected) + + with pytest.raises(TypeError): + ufunc_obj.reduceat(t1, [(3, 4)]) diff --git a/python/xorbits/_mars/tensor/ufunc/ufunc.py b/python/xorbits/_mars/tensor/ufunc/ufunc.py new file mode 100644 index 000000000..00a079c04 --- /dev/null +++ b/python/xorbits/_mars/tensor/ufunc/ufunc.py @@ -0,0 +1,198 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Number + +import numpy as np + +from .. import arithmetic as arith +from .. 
import reduction +from ..datasource import tensor as astensor + + +class TensorUfuncDef: + def __init__( + self, method, aggregator=None, accumulator=None, pre_agg=None, post_agg=None + ): + self._method = method + self._aggregator = aggregator + self._accumulator = accumulator + self._pre_agg = pre_agg + self._post_agg = post_agg + + def __call__(self, *args, **kwargs): + return self._method(*args, **kwargs) + + def at(self, a, indices, b=None): + # todo handle setting duplicated keys, a separate operand may be needed + if b is None: + a[indices] = self(a[indices]) + else: + a[indices] = self(a[indices], b) + + def accumulate(self, array, axis=0, dtype=None, out=None): + if self._accumulator is None: + raise NotImplementedError + data = array if self._pre_agg is None else self._pre_agg(array) + result = self._accumulator(data, axis=axis, dtype=dtype) + result = result if self._post_agg is None else self._post_agg(result) + if out is not None: + out[0]._data = result._data + else: + return result + + def reduce(self, array, axis=0, dtype=None, out=None, keepdims=False): + if self._aggregator is None: + raise NotImplementedError + data = array if self._pre_agg is None else self._pre_agg(array) + result = self._aggregator(data, axis=axis, dtype=dtype, keepdims=keepdims) + result = result if self._post_agg is None else self._post_agg(result) + if out is not None: + out[0]._data = result._data + else: + return result + + +UFUNC_TO_TENSOR_FUNCS = { + np.add: TensorUfuncDef( + arith.add, + accumulator=reduction.cumsum, + aggregator=reduction.sum, + ), + np.subtract: TensorUfuncDef(arith.subtract), + np.multiply: TensorUfuncDef( + arith.multiply, + accumulator=reduction.cumprod, + aggregator=reduction.prod, + ), + np.divide: TensorUfuncDef(arith.divide), + np.logaddexp: TensorUfuncDef( + arith.logaddexp, + accumulator=reduction.cumsum, + aggregator=reduction.sum, + pre_agg=arith.exp, + post_agg=arith.log, + ), + np.logaddexp2: TensorUfuncDef( + arith.logaddexp2, + accumulator=reduction.cumsum, + aggregator=reduction.sum, + pre_agg=lambda x: arith.power(2, x), + post_agg=arith.log2, + ), + np.true_divide: TensorUfuncDef(arith.truediv), + np.floor_divide: TensorUfuncDef(arith.floordiv), + # unary + np.negative: TensorUfuncDef(arith.negative), + np.power: TensorUfuncDef(arith.power), + np.float_power: TensorUfuncDef(arith.float_power), + np.remainder: TensorUfuncDef(arith.remainder), + np.mod: TensorUfuncDef(arith.mod), + np.fmod: TensorUfuncDef(arith.fmod), + np.conj: TensorUfuncDef(arith.conj), + np.conjugate: TensorUfuncDef(arith.conjugate), + np.exp: TensorUfuncDef(arith.exp), + np.exp2: TensorUfuncDef(arith.exp2), + np.log: TensorUfuncDef(arith.log), + np.log2: TensorUfuncDef(arith.log2), + np.log10: TensorUfuncDef(arith.log10), + np.log1p: TensorUfuncDef(arith.log1p), + np.expm1: TensorUfuncDef(arith.expm1), + np.sqrt: TensorUfuncDef(arith.sqrt), + np.square: TensorUfuncDef(arith.square), + np.cbrt: TensorUfuncDef(arith.cbrt), + np.reciprocal: TensorUfuncDef(arith.reciprocal), + # trigonometric functions + np.sin: TensorUfuncDef(arith.sin), + np.cos: TensorUfuncDef(arith.cos), + np.tan: TensorUfuncDef(arith.tan), + np.arcsin: TensorUfuncDef(arith.arcsin), + np.arccos: TensorUfuncDef(arith.arccos), + np.arctan: TensorUfuncDef(arith.arctan), + np.arctan2: TensorUfuncDef(arith.arctan2), + np.hypot: TensorUfuncDef(arith.hypot), + np.sinh: TensorUfuncDef(arith.sinh), + np.cosh: TensorUfuncDef(arith.cosh), + np.tanh: TensorUfuncDef(arith.tanh), + np.arcsinh: TensorUfuncDef(arith.arcsinh), + 
np.arccosh: TensorUfuncDef(arith.arccosh), + np.arctanh: TensorUfuncDef(arith.arctanh), + np.deg2rad: TensorUfuncDef(arith.deg2rad), + np.rad2deg: TensorUfuncDef(arith.rad2deg), + # comparison functions + np.greater: TensorUfuncDef(arith.greater), + np.greater_equal: TensorUfuncDef(arith.greater_equal), + np.less: TensorUfuncDef(arith.less), + np.less_equal: TensorUfuncDef(arith.less_equal), + np.not_equal: TensorUfuncDef(arith.not_equal), + np.equal: TensorUfuncDef(arith.equal), + np.logical_and: TensorUfuncDef(arith.logical_and), + np.logical_or: TensorUfuncDef(arith.logical_or), + np.logical_xor: TensorUfuncDef(arith.logical_xor), + np.logical_not: TensorUfuncDef(arith.logical_not), + np.maximum: TensorUfuncDef(arith.maximum), + np.minimum: TensorUfuncDef(arith.minimum), + np.fmax: TensorUfuncDef(arith.fmax), + np.fmin: TensorUfuncDef(arith.fmin), + # floating functions + np.isfinite: TensorUfuncDef(arith.isfinite), + np.isinf: TensorUfuncDef(arith.isinf), + np.isnan: TensorUfuncDef(arith.isnan), + np.signbit: TensorUfuncDef(arith.signbit), + np.copysign: TensorUfuncDef(arith.copysign), + np.nextafter: TensorUfuncDef(arith.nextafter), + np.spacing: TensorUfuncDef(arith.spacing), + np.modf: TensorUfuncDef(arith.modf), + np.ldexp: TensorUfuncDef(arith.ldexp), + np.frexp: TensorUfuncDef(arith.frexp), + np.floor: TensorUfuncDef(arith.floor), + np.ceil: TensorUfuncDef(arith.ceil), + np.trunc: TensorUfuncDef(arith.trunc), + # more math functions + np.degrees: TensorUfuncDef(arith.degrees), + np.radians: TensorUfuncDef(arith.radians), + np.rint: TensorUfuncDef(arith.rint), + np.fabs: TensorUfuncDef(arith.fabs), + np.sign: TensorUfuncDef(arith.sign), + np.absolute: TensorUfuncDef(arith.absolute), +} + + +def _check_arg(arg): + if isinstance(arg, Number): + return True + + try: + astensor(arg) + return True + except ValueError: + return False + + +def _array_ufunc(_, ufunc, method, *inputs, **kwargs): + out = kwargs.get("out", tuple()) + for x in inputs + out: + if not _check_arg(x): + return NotImplemented + + if ufunc.signature is not None: + return NotImplemented + if ufunc not in UFUNC_TO_TENSOR_FUNCS: + return NotImplemented + + try: + tensor_func = getattr(UFUNC_TO_TENSOR_FUNCS[ufunc], method) + return tensor_func(*inputs, **kwargs) + except (AttributeError, NotImplementedError): + return NotImplemented diff --git a/python/xorbits/_mars/tensor/utils.py b/python/xorbits/_mars/tensor/utils.py new file mode 100644 index 000000000..111edf3c3 --- /dev/null +++ b/python/xorbits/_mars/tensor/utils.py @@ -0,0 +1,835 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
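Editor's note: the `_array_ufunc` hook above is what routes plain NumPy ufunc calls onto Mars tensor implementations: arguments are screened by `_check_arg`, generalized ufuncs (those with a `signature`) and unregistered ufuncs return `NotImplemented`, and everything else is looked up in `UFUNC_TO_TENSOR_FUNCS` and dispatched by method name. A minimal usage sketch, mirroring the tests above; the import path and the explicit `new_session()` call are assumptions, not part of this diff:

import numpy as np

from xorbits._mars import new_session, tensor as mt  # assumed import path

new_session()  # assumed: create a local default session for execution

t = mt.tensor(np.arange(6).reshape(2, 3), chunk_size=2)

# np.add goes through Tensor.__array_ufunc__ and yields a lazy Mars tensor
s = np.add(t, 1)
print(s.execute().fetch())

# reduce/accumulate work because np.add registers reduction.sum as its
# aggregator and reduction.cumsum as its accumulator
print(np.add.reduce(t, axis=0).execute().fetch())
print(np.add.accumulate(t, axis=1).execute().fetch())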
+ +import inspect +import itertools +import operator +from collections import OrderedDict +from collections.abc import Iterable +from functools import lru_cache, reduce, wraps +from math import ceil +from numbers import Integral +from typing import Dict, List, Union + +import numpy as np + +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None + +from ..core import ExecutableTuple, recursive_tile +from ..lib.mmh3 import hash_from_buffer +from ..utils import lazy_import + +cp = lazy_import("cupy", rename="cp") + + +def normalize_shape(shape): + if isinstance(shape, Iterable): + return tuple(shape) + else: + return (shape,) + + +def normalize_chunk_sizes(shape, chunk_size): + shape = normalize_shape(shape) + if not isinstance(chunk_size, tuple): + if isinstance(chunk_size, Iterable): + chunk_size = tuple(chunk_size) + elif isinstance(chunk_size, int): + chunk_size = (chunk_size,) * len(shape) + + if len(shape) != len(chunk_size): + raise ValueError( + "Chunks must have the same dimension, " + f"got shape: {shape}, chunks: {chunk_size}" + ) + + chunk_sizes = [] + for size, chunk in zip(shape, chunk_size): + if isinstance(chunk, Iterable): + if not isinstance(chunk, tuple): + chunk = tuple(chunk) + + # if chunk is (np.nan,), it means we need to concat + # all chunks together. + if chunk == (np.nan,): + chunk = (size,) + + if sum(chunk) != size: + raise ValueError( + "chunks shape should be of the same length, " + f"got shape: {size}, chunks: {chunk}" + ) + chunk_sizes.append(chunk) + else: + assert isinstance(chunk, int) + + if size == 0: + sizes = (0,) + else: + sizes = tuple(chunk for _ in range(int(size / chunk))) + ( + tuple() if size % chunk == 0 else (size % chunk,) + ) + chunk_sizes.append(sizes) + + return tuple(chunk_sizes) + + +def broadcast_shape(*shapes): + if len(shapes) == 1: + return shapes[0] + + out_shapes = [] + for ss in itertools.zip_longest(*[reversed(s) for s in shapes], fillvalue=-1): + shape = max(s for s in ss if s != -1) + if any(i != -1 and i != 1 and i != shape and not np.isnan(i) for i in ss): + raise ValueError( + "Operands could not be broadcast together " + "with shape {0}".format(" ".join(map(str, shapes))) + ) + out_shapes.append(shape) + return tuple(reversed(out_shapes)) + + +def get_chunk_slices(nsplits, idx): + return tuple( + slice(sum(nsplit[:idx]), sum(nsplit[: idx + 1])) + for idx, nsplit in zip(idx, nsplits) + ) + + +def gen_random_seeds(n, random_state): + assert isinstance(random_state, np.random.RandomState) + return tuple(np.frombuffer(random_state.bytes(n * 4), dtype=np.uint32).tolist()) + + +def validate_axis(ndim, axis, argname=None): + if axis >= ndim or axis < -ndim: + raise np.AxisError(axis, ndim=ndim, msg_prefix=argname) + + return axis if axis >= 0 else ndim + axis + + +def normalize_axis_tuple(axis, ndim, argname=None, allow_duplicate=False): + """ + Normalizes an axis argument into a tuple of non-negative integer axes. + + This handles shorthands such as ``1`` and converts them to ``(1,)``, + as well as performing the handling of negative indices covered by + `normalize_axis_index`. + + By default, this forbids axes from being specified multiple times. + + Used internally by multi-axis-checking logic. + + Parameters + ---------- + axis : int, iterable of int + The un-normalized index or indices of the axis. + ndim : int + The number of dimensions of the array that `axis` should be normalized + against.
+ argname : str, optional + A prefix to put before the error message, typically the name of the + argument. + allow_duplicate : bool, optional + If False, the default, disallow an axis from being specified twice. + + Returns + ------- + normalized_axes : tuple of int + The normalized axis index, such that `0 <= normalized_axis < ndim` + + Raises + ------ + AxisError + If any axis provided is out of range + ValueError + If an axis is repeated + + See also + -------- + normalize_axis_index : normalizing a single scalar axis + """ + # Optimization to speed-up the most common cases. + if type(axis) not in (tuple, list): + try: + axis = [operator.index(axis)] + except TypeError: + pass + # Going via an iterator directly is slower than via list comprehension. + axis = tuple([validate_axis(ndim, ax, argname) for ax in axis]) + if not allow_duplicate and len(set(axis)) != len(axis): + if argname: + raise ValueError(f"repeated axis in `{argname}` argument") + else: + raise ValueError("repeated axis") + return axis + + +def validate_order(dtype, order): + if getattr(dtype, "fields", None) is None: + if order is not None: + raise ValueError("Cannot specify order when the array has no fields") + else: + return + + need_check = True + if order is None: + order = list(dtype.names) + need_check = False + elif isinstance(order, (list, tuple)): + order = list(order) + else: + order = [order] + if need_check: + for o in order: + if o not in dtype.fields: + raise ValueError(f"unknown field name: {o}") + return order + + +def inject_dtype(dtype): + def inner(func): + @wraps(func) + def call(*tensors, **kw): + kw["dtype"] = np.dtype(dtype) + ret = func(*tensors, **kw) + if ret is NotImplemented: + reverse_func = getattr( + inspect.getmodule(func), f"r{func.__name__}", None + ) + if reverse_func is not None: + ret = reverse_func(*tensors[::-1], **kw) + if ret is NotImplemented: + raise TypeError( + "unsupported operand type(s) for {0}: '{1}' and '{2}".format( + func.__name__, *[type(t) for t in tensors] + ) + ) + return ret + + return call + + return inner + + +def infer_dtype(np_func, multi_outputs=False, empty=True, reverse=False, check=True): + def make_arg(arg): + if empty: + return np.empty((1,) * max(1, arg.ndim), dtype=arg.dtype) + else: + if hasattr(arg, "op") and hasattr(arg.op, "data"): + arg = arg.op.data + return arg[(0,) * max(1, arg.ndim)] + + tensor_ufunc = "__tensor_ufunc__" + + def is_arg(arg): + if hasattr(arg, tensor_ufunc): + return False + return hasattr(arg, "ndim") and hasattr(arg, "dtype") + + def inner(func): + @wraps(func) + def h(*tensors, **kw): + usr_dtype = np.dtype(kw.pop("dtype")) if "dtype" in kw else None + args = [make_arg(t) if is_arg(t) else t for t in tensors] + if reverse: + args = args[::-1] + np_kw = dict( + (k, make_arg(v) if hasattr(v, "op") else v) + for k, v in kw.items() + if is_arg(v) and k != "out" + ) + + dtype = None + if not any( + hasattr(arg, tensor_ufunc) + for arg in itertools.chain(args, np_kw.values()) + ): + # skip infer if encounter mars DataFrame etc + # that implements __tensor_ufunc__ + try: + with np.errstate(all="ignore"): + if multi_outputs: + dtype = np_func(*args, **np_kw)[0].dtype + else: + dtype = np_func(*args, **np_kw).dtype + except: # noqa: E722 + dtype = None + + if usr_dtype and dtype: + can_cast_kwargs = {} + if kw.get("casting") is not None: + can_cast_kwargs["casting"] = kw.get("casting") + if check and not np.can_cast(dtype, usr_dtype, **can_cast_kwargs): + raise TypeError( + "No loop matching the specified signature " + f"and 
casting was found for ufunc {np_func}" + ) + kw["dtype"] = usr_dtype + else: + kw["dtype"] = dtype + + ret = func(*tensors, **kw) + if ret is NotImplemented: + reverse_func = ( + getattr(inspect.getmodule(func), f"r{func.__name__}", None) + if not reverse + else None + ) + if reverse_func is not None: + ret = reverse_func(*tensors[::-1], **kw) + if ret is NotImplemented: + raise TypeError( + "unsupported operand type(s) for {0}: '{1}' and '{2}".format( + func.__name__, *[type(t) for t in tensors] + ) + ) + return ret + + return h + + return inner + + +def index_ndim(index): + from .core import Tensor + + if isinstance(index, Tensor) and index.dtype == np.bool_: + # boolean indexing will occupy the ndim + return index.ndim + + return 1 if index is not None else 0 + + +def replace_ellipsis(index, ndim): + all_illipsis = list(i for i, idx in enumerate(index) if idx is Ellipsis) + if len(all_illipsis) > 1: + raise IndexError("an index can only have a single ellipsis ('...')") + if not all_illipsis: + return index + + illipsis_index = all_illipsis[0] + n_extra = ndim - sum([index_ndim(i) for i in index]) + 1 + return ( + index[:illipsis_index] + (slice(None),) * n_extra + index[illipsis_index + 1 :] + ) + + +def calc_sliced_size(size: int, sliceobj: slice) -> int: + if np.isnan(size): + return np.nan + + start, stop, step = sliceobj.indices(size) + return int(ceil(abs((stop - start) / float(step)))) + + +def calc_object_length(obj, size=None): + if np.isscalar(obj): + return 1 + elif isinstance(obj, slice): + return calc_sliced_size(size, obj) + else: + return len(obj) + + +def slice_split( + index: Union[int, slice], sizes: List[int] +) -> Dict[int, Union[int, slice]]: + size = sum(sizes) + + if isinstance(index, Integral): + index = index if index >= 0 else size + index + i = 0 + ind = index + lens = list(sizes) + while ind >= lens[0]: + i += 1 + ind -= lens.pop(0) + return {i: ind} + + assert isinstance(index, slice) + start, stop, step = index.indices(size) + + slice_all = slice(None) + + if index == slice_all: + return dict((k, slice_all) for k in range(len(sizes))) + + d = dict() + if step > 0: + for i, length in enumerate(sizes): + if start < length and stop > 0: + d[i] = slice(start, min(stop, length), step) + start = (start - length) % step + else: + start = start - length + stop -= length + else: + rstart = start # running start + chunk_boundaries = np.cumsum(sizes) + for i, chunk_stop in reversed(list(enumerate(chunk_boundaries))): + # create a chunk start and stop + if i == 0: + chunk_start = 0 + else: + chunk_start = chunk_boundaries[i - 1] + + # if our slice is in this chunk + if (chunk_start <= rstart < chunk_stop) and (rstart > stop): + d[i] = slice( + rstart - chunk_stop, + max(chunk_start - chunk_stop - 1, stop - chunk_stop), + step, + ) + + # compute the next running start point, + offset = (rstart - (chunk_start - 1)) % step + rstart = chunk_start + offset - 1 + + # replace 0:20:1 with : if appropriate + for k, v in d.items(): + if v == slice(0, sizes[k], 1): + d[k] = slice(None, None, None) + + if not d: # special case x[:0] + d[0] = slice(0, 0, 1) + + return d + + +def is_asc_sorted(arr): + arr = np.asarray(arr) + if len(arr) == 0: + return True + return np.all(arr[:-1] <= arr[1:]) + + +def split_indexes_into_chunks(nsplits, indexes, ret_is_asc=True): + indexes = np.asarray(indexes) + chunk_idxes = np.empty_like(indexes) + cum_nsplits = [np.cumsum(nsplit) for nsplit in nsplits] + for i, cum_nsplit, index in zip(itertools.count(0), cum_nsplits, indexes): + # handle negative 
value in index + if hasattr(index, "flags") and not index.flags.writeable: + index = index.copy() + index = np.add(index, cum_nsplit[-1], out=index, where=index < 0) + sorted_idx = np.argsort(index) + + if np.any(index >= cum_nsplit[-1]): + idx = index[index >= cum_nsplit[-1]][0] + raise IndexError(f"index {idx} is out of bounds with size {cum_nsplit[-1]}") + + chunk_idx = np.searchsorted(cum_nsplit, index[sorted_idx], side="right") + chunk_idxes[i, sorted_idx] = chunk_idx + + chunk_idxes_asc = False + if ret_is_asc: + chunk_idxes_asc = is_asc_sorted(np.lexsort(chunk_idxes[::-1])) + + chunk_index_to_indexes = OrderedDict() + chunk_index_to_poses = OrderedDict() + poses = np.arange(len(indexes[0])) + for idx in itertools.product(*(range(len(nsplit)) for nsplit in nsplits)): + cond = (chunk_idxes == np.array(idx).reshape((len(idx), 1))).all(axis=0) + filtered = indexes[:, cond] + for i in range(len(indexes)): + filtered[i] = filtered[i] - ( + cum_nsplits[i][idx[i] - 1] if idx[i] > 0 else 0 + ) + chunk_index_to_indexes[idx] = filtered + chunk_index_to_poses[idx] = poses[cond] + + if ret_is_asc: + return chunk_index_to_indexes, chunk_index_to_poses, chunk_idxes_asc + return chunk_index_to_indexes, chunk_index_to_poses + + +def calc_pos(fancy_index_shape, pos, xp=np): + if isinstance(pos, dict): + pos = xp.concatenate(list(pos.values())) + select_pos = xp.empty(fancy_index_shape, dtype=int) + select_pos.flat[pos] = xp.arange(select_pos.size) + return select_pos + + +def decide_unify_split(*splits): + # TODO (jisheng): In the future, we need more sophisticated way to decide the rechunk split + # right now, for (2, 2) and (3, 1), we get the rechunk split as (2, 1, 1) + if not splits: + return () + raw_splits = splits + # support broadcasting rules + # decide_unify_splits((1,), (5,)) --> (5,) + splits = set(s for s in splits if ((len(s) > 1) or (len(s) == 1 and s[0] != 1))) + if len(splits) == 1: + return splits.pop() + if len(splits) == 0: + return raw_splits[0] + + if any(np.isnan(sum(s)) for s in splits): + raise ValueError(f"Tensor chunk sizes are unknown: {splits}") + if len(set(sum(s) for s in splits)) > 1: + raise ValueError(f"Splits not of same size: {splits}") + + q = [list(s) for s in splits] + size = sum(q[0]) + cum = 0 + + res = [] + while cum < size: + m = min(s[0] for s in q) + res.append(m) + for s in q: + s[0] -= m + if s[0] == 0: + s.pop(0) + + cum += m + + return tuple(res) + + +def unify_nsplits(*tensor_axes): + tensor_splits = [ + dict((a, split) for a, split in zip(axes, t.nsplits) if split != (1,)) + for t, axes in tensor_axes + if t.nsplits + ] + common_axes = ( + reduce(operator.and_, [set(ts.keys()) for ts in tensor_splits]) + if tensor_splits + else set() + ) + axes_unified_splits = dict( + (ax, decide_unify_split(*(t[ax] for t in tensor_splits))) for ax in common_axes + ) + + if len(common_axes) == 0: + return tuple(t[0] for t in tensor_axes) + + res = [] + for t, axes in tensor_axes: + new_chunk = dict( + (i, axes_unified_splits[ax]) + for ax, i in zip(axes, range(t.ndim)) + if ax in axes_unified_splits + ) + t = yield from recursive_tile(t.rechunk(new_chunk)) + res.append(t) + + return tuple(res) + + +def unify_chunks(*tensors): + tensor_axes = [ + (t, range(t.ndim)) if not isinstance(t, tuple) else t for t in tensors + ] + + if len(tensor_axes) < 2: + return tuple(t[0] if isinstance(t, tuple) else t for t in tensors) + + return (yield from unify_nsplits(*tensor_axes)) + + +def check_out_param(out, t, casting): + from .base import broadcast_to + + if not hasattr(out, 
"shape"): + raise TypeError("return arrays must be a tensor") + + try: + broadcast_to(t, out.shape) + except ValueError: + raise ValueError( + "operands could not be broadcast together " + "with shapes ({0}) ({1})".format( + ",".join(str(s) for s in t.shape), ",".join(str(s) for s in out.shape) + ) + ) + + if not np.can_cast(t.dtype, out.dtype, casting): + raise TypeError( + f"output (typecode '{t.dtype.char}') could not be coerced " + f"to provided output parameter (typecode '{out.dtype.char}') " + f"according to the casting rule ''{casting}''" + ) + + +def dictify_chunk_size(shape, chunk_size): + """ + Given chunk_size which may be a tuple or dict, return a dict type all the same. + + :param shape: tensor's shape + :param chunk_size: if dict provided, it's dimension id to chunk size; + if provided, it's the chunk size for each dimension. + :return: dict form of chunk_size + """ + if chunk_size is not None: + if isinstance(chunk_size, Iterable): + if not isinstance(chunk_size, dict): + chunk_size = {i: c for i, c in enumerate(chunk_size)} + elif isinstance(chunk_size, int): + chunk_size = {i: chunk_size for i in range(len(shape))} + else: + raise TypeError(f"chunks must be iterable, got {type(chunk_size)}") + + if chunk_size is None: + chunk_size = dict() + + return chunk_size + + +def decide_chunk_sizes(shape, chunk_size, itemsize): + """ + Decide how a given tensor can be split into chunk. + + :param shape: tensor's shape + :param chunk_size: if dict provided, it's dimension id to chunk size; + if provided, it's the chunk size for each dimension. + :param itemsize: element size + :return: the calculated chunk size for each dimension + :rtype: tuple + """ + + from ..config import options + + chunk_size = dictify_chunk_size(shape, chunk_size) + nleft = len(shape) - len(chunk_size) + if nleft < 0: + raise ValueError("chunks have more dimensions than input tensor") + if nleft == 0: + return normalize_chunk_sizes( + shape, tuple(chunk_size[j] for j in range(len(shape))) + ) + + max_chunk_size = options.chunk_store_limit + + # normalize the dimension which specified first + dim_to_normalized = { + i: normalize_chunk_sizes((shape[i],), (c,))[0] for i, c in chunk_size.items() + } + + left = {j: [] for j in range(len(shape)) if j not in dim_to_normalized} + left_unsplit = {j: shape[j] for j in left} + while True: + nbytes_occupied = ( + np.prod([max(c) for c in dim_to_normalized.values()]) * itemsize + ) + dim_size = np.maximum( + int(np.power(max_chunk_size / nbytes_occupied, 1 / float(len(left)))), 1 + ) + for j, ns in left.copy().items(): + unsplit = left_unsplit[j] + ns.append(int(np.minimum(unsplit, dim_size))) + left_unsplit[j] -= ns[-1] + if left_unsplit[j] <= 0: + dim_to_normalized[j] = tuple(ns) + del left[j] + + if len(left) == 0: + break + + return tuple(dim_to_normalized[i] for i in range(len(dim_to_normalized))) + + +def check_random_state(seed): + """ + Turn seed into a mt.random.RandomState instance + + :param seed: + If seed is None, return the RandomState singleton used by mt.random. + If seed is an int, return a new RandomState instance seeded with seed. + If seed is already a RandomState instance, return it. + Otherwise raise ValueError. + :return: + """ + from numpy import random as np_mtrand + + from . 
import random as mtrand + + if seed is None or seed is mtrand or seed is np_mtrand: + return mtrand._random_state + if isinstance(seed, (Integral, np.integer)): + return mtrand.RandomState(seed) + if isinstance(seed, np.random.RandomState): + return mtrand.RandomState.from_numpy(seed) + if isinstance(seed, mtrand.RandomState): + return seed + raise ValueError(f"{seed} cannot be used to seed a mt.random.RandomState instance") + + +def filter_inputs(inputs): + from ..core import ENTITY_TYPE + + return [inp for inp in inputs if isinstance(inp, ENTITY_TYPE)] + + +# As TileDB Ctx's creation is a bit time-consuming, +# we just cache the Ctx +# also remember the arguments should be hashable +@lru_cache(10) +def _create_tiledb_ctx(conf_tuple): + if conf_tuple is not None: + return tiledb.Ctx(dict(conf_tuple)) + return tiledb.Ctx() + + +def get_tiledb_ctx(conf): + key = tuple(conf.items()) if conf is not None else None + return _create_tiledb_ctx(key) + + +# this function is only used for pandas' compatibility +def to_numpy(pdf): + try: + return pdf.to_numpy() + except AttributeError: # pragma: no cover + return pdf.values + + +def check_order(order_str, available_options="KACF", err_msg="order not understood"): + order_str = order_str.upper() + if order_str not in available_options: + raise TypeError(err_msg) + + +def get_order( + order_str, to_keep_order, available_options="KACF", err_msg="order not understood" +): + from .core import TensorOrder + + check_order(order_str, available_options=available_options, err_msg=err_msg) + + if order_str in "KA": + return to_keep_order + elif order_str == "C": + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + + +def reverse_order(old_order): + from .core import TensorOrder + + assert isinstance(old_order, TensorOrder) + return ( + TensorOrder.C_ORDER if old_order == TensorOrder.F_ORDER else TensorOrder.F_ORDER + ) + + +def hash_on_axis(ar, axis, n_dest): + ar = np.asarray(ar) + # cannot be scalar + assert ar.ndim > 0 + axis = validate_axis(ar.ndim, axis) + + if n_dest == 1: + return np.zeros(ar.shape[axis], dtype=np.uint32) + + if ar.ndim > 2: + ret = np.empty(ar.shape[axis], dtype=np.uint32) + + def _hash_to_dest(data): + i = data[0] + idx = (slice(None),) * axis + (i,) + ret[i] = hash_from_buffer(memoryview(ar[idx])) % n_dest + + np.apply_along_axis(_hash_to_dest, 0, np.arange(ar.shape[axis])[np.newaxis, :]) + return ret + else: + + def _hash_to_dest(data): + return hash_from_buffer(memoryview(data)) % n_dest + + if ar.ndim == 1: + ar = ar.reshape(ar.size, 1) + return np.apply_along_axis(_hash_to_dest, 1 - axis, ar) + + +def fetch_corner_data(tensor, session=None): + print_option = np.get_printoptions() + # only fetch corner data when data > threshold + threshold = print_option["threshold"] + # number of edge items to print + edgeitems = print_option["edgeitems"] + + # we fetch corner data based on the fact that + # the tensor must have been executed, + # thus the size could not be NaN + if tensor.size > threshold: + # two edges for each exis + indices_iter = list(itertools.product(*(range(2) for _ in range(tensor.ndim)))) + corners = np.empty(shape=(2,) * tensor.ndim, dtype=object) + shape = [0 for _ in range(tensor.ndim)] + for indices in indices_iter: + slc = [] + for ax, i in enumerate(indices): + size = tensor.shape[ax] + if size > edgeitems * 2 + 2: + # fetch two more elements + if i == 0: + slc.append(slice(edgeitems + 1)) + else: + slc.append(slice(-edgeitems - 1, None)) + shape[ax] += edgeitems + 1 + else: + i_sep = size // 2 + 
if i == 0: + slc.append(slice(i_sep)) + shape[ax] += i_sep + else: + slc.append(slice(i_sep, None)) + shape[ax] += size - i_sep + corners[indices] = tensor[tuple(slc)] + # fetch together + fetched = ExecutableTuple(corners.flat).fetch(session=session) + for indices, f in zip(indices_iter, fetched): + corners[indices] = f + return np.block(corners.tolist()) + else: + return tensor.fetch(session=session) + + +def implement_scipy(scipy_fun): + import re + import textwrap + + def wrapper(fun): + if scipy_fun is None: + return None + if not fun.__doc__: + doc_str = textwrap.dedent(scipy_fun.__doc__) + lines = [] + for line in doc_str.splitlines(keepends=False): + # skip function headers + if line.startswith(scipy_fun.__name__ + "("): + continue + # skip version marks + if line.strip().startswith(".. versionadded::"): + continue + # skip examples + if line.strip() == "Examples": + break + lines.append(line) + doc_str = "\n".join(lines).strip() + # remove trailing empty sections + fun.__doc__ = re.sub(r"[A-Za-z]+\n-+$", "", doc_str).strip() + return fun + + return wrapper diff --git a/python/xorbits/_mars/tests/__init__.py b/python/xorbits/_mars/tests/__init__.py new file mode 100644 index 000000000..ce3fa5c45 --- /dev/null +++ b/python/xorbits/_mars/tests/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import flaky diff --git a/python/xorbits/_mars/tests/core.py b/python/xorbits/_mars/tests/core.py new file mode 100644 index 000000000..a76952088 --- /dev/null +++ b/python/xorbits/_mars/tests/core.py @@ -0,0 +1,585 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
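Editor's note: the chunking helpers in `tensor/utils.py` above are pure functions, so their behaviour is easy to check in isolation. A small sketch (the import path is an assumption; the expected values follow directly from the definitions above):

from xorbits._mars.tensor.utils import normalize_chunk_sizes, slice_split  # assumed path

# A 10 x 8 tensor split with chunk_size (3, 4): the remainder of each
# dimension ends up in a smaller trailing chunk.
assert normalize_chunk_sizes((10, 8), (3, 4)) == ((3, 3, 3, 1), (4, 4))

# slice_split maps a global slice onto per-chunk slices keyed by chunk index;
# a chunk that is fully covered collapses to slice(None).
assert slice_split(slice(2, 7), [3, 3, 3]) == {
    0: slice(2, 3, 1),
    1: slice(None, None, None),
    2: slice(0, 1, 1),
}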
+ +import fnmatch +import functools +import inspect +import itertools +import logging +import os +import sys +import time +import types +from typing import Dict + +import numpy as np +import pandas as pd +import pytest + +try: + from flaky import flaky as _raw_flaky +except ImportError: + _raw_flaky = None +try: + import mock +except ImportError: + from unittest import mock +_mock = mock + +from ..core.operand import OperandStage +from ..utils import lazy_import + +cupy = lazy_import("cupy") +cudf = lazy_import("cudf") +ray = lazy_import("ray") +ucx = lazy_import("ucp") + +logger = logging.getLogger(__name__) + + +def flaky(o=None, *args, **kwargs): + platform = kwargs.pop("platform", "") + if _raw_flaky is None or not sys.platform.startswith(platform): + if o is not None: + return o + + def ident(x): + return x + + return ident + elif o is not None: + return _raw_flaky(o, *args, **kwargs) + else: + return _raw_flaky(*args, **kwargs) + + +def patch_method(method, *args, **kwargs): + if hasattr(method, "__qualname__"): + return mock.patch( + method.__module__ + "." + method.__qualname__, *args, **kwargs + ) + elif hasattr(method, "im_class"): + return mock.patch( + ".".join( + [method.im_class.__module__, method.im_class.__name__, method.__name__] + ), + *args, + **kwargs, + ) + else: + return mock.patch(method.__module__ + "." + method.__name__, *args, **kwargs) + + +def patch_cls(target_cls): + def _wrapper(cls): + class Super(cls.__bases__[0]): + pass + + cls.__patch_super__ = Super + + target = target_cls.__module__ + "." + target_cls.__qualname__ + for name, obj in cls.__dict__.items(): + if name.startswith("__") and name != "__init__": + continue + p = mock.patch(target + "." + name, obj, create=True) + original, local = p.get_original() + setattr(cls.__patch_super__, name, original) + p.start() + + return cls + + return _wrapper + + +def patch_super(): + back = inspect.currentframe().f_back + if not back or "__class__" not in back.f_locals: + raise RuntimeError("Calling super() in the incorrect context.") + + patch_super_cls = back.f_locals["__class__"].__patch_super__ + patch_self = back.f_locals.get("self") + + class _SuperAccessor: + def __getattribute__(self, item): + func = getattr(patch_super_cls, item) + if func == mock.DEFAULT: + raise AttributeError(f"super object has no attribute '{item}'") + if patch_self: + return types.MethodType(func, patch_self) + return func + + return _SuperAccessor() + + +def print_entrance(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + print( + f"Start to execute function {func} with args {args} and kwargs {kwargs}" + ) + result = func(*args, **kwargs) + print( + f"Finished executing function {func} with args {args} and kwargs {kwargs}" + ) + return result + except NotImplementedError: + return NotImplemented + + return wrapper + + +def print_async_entrance(func): + @functools.wraps(func) + async def wrapper(*args, **kwargs): + try: + print( + f"Start to execute function {func} with args {args} and kwargs {kwargs}" + ) + result = await func(*args, **kwargs) + print( + f"Finished executing function {func} with args {args} and kwargs {kwargs}" + ) + return result + except NotImplementedError: + return NotImplemented + + return wrapper + + +def require_cupy(func): + if pytest: + func = pytest.mark.cuda(func) + func = pytest.mark.skipif(cupy is None, reason="cupy not installed")(func) + return func + + +def require_cudf(func): + if pytest: + func = pytest.mark.cuda(func) + func = pytest.mark.skipif(cudf is None, reason="cudf 
not installed")(func) + return func + + +def require_ray(func): + if pytest: + func = pytest.mark.ray(func) + func = pytest.mark.skipif(ray is None, reason="ray not installed")(func) + return func + + +def require_ucx(func): + if pytest: + func = pytest.mark.ucx(func) + func = pytest.mark.skipif(ucx is None, reason="ucx not installed")(func) + return func + + +def require_hadoop(func): + if pytest: + func = pytest.mark.hadoop(func) + func = pytest.mark.skipif( + not os.environ.get("WITH_HADOOP"), reason="Only run when hadoop is installed" + )(func) + return func + + +def assert_groupby_equal( + left, right, sort_keys=False, sort_index=True, with_selection=False +): + if hasattr(left, "groupby_obj"): + left = left.groupby_obj + if hasattr(right, "groupby_obj"): + right = right.groupby_obj + + if type(left) is not type(right): + raise AssertionError( + f"Type of groupby not consistent: {type(left)} != {type(right)}" + ) + + left_selection = getattr(left, "_selection", None) + right_selection = getattr(right, "_selection", None) + if sort_keys: + left = sorted(left, key=lambda p: p[0]) + right = sorted(right, key=lambda p: p[0]) + else: + left, right = list(left), list(right) + if sort_index: + left = [(k, v.sort_index()) for k, v in left] + right = [(k, v.sort_index()) for k, v in right] + + if len(left) != len(right): + raise AssertionError( + f"Count of groupby keys not consistent: {len(left)} != {len(right)}" + ) + + left_keys = [p[0] for p in left] + right_keys = [p[0] for p in right] + if left_keys != right_keys: + raise AssertionError( + f"Group keys not consistent: {left_keys!r} != {right_keys!r}" + ) + for (left_key, left_frame), (right_key, right_frame) in zip(left, right): + if with_selection: + if left_selection and isinstance(left_frame, pd.DataFrame): + left_frame = left_frame[left_selection] + if right_selection and isinstance(right_frame, pd.DataFrame): + right_frame = right_frame[right_selection] + + if isinstance(left_frame, pd.DataFrame): + pd.testing.assert_frame_equal(left_frame, right_frame) + else: + pd.testing.assert_series_equal(left_frame, right_frame) + + +_check_options = dict() +_check_args = [ + "check_all", + "check_series_name", + "check_index_name", + "check_dtypes", + "check_dtype", + "check_shape", + "check_nsplits", + "check_index_value", + "check_columns_value", +] + + +class ObjectCheckMixin: + _check_options: Dict + + @staticmethod + def adapt_index_value(value): + if hasattr(value, "to_pandas"): + return value.to_pandas() + return value + + def assert_shape_consistent(self, expected_shape, real_shape): + if not self._check_options["check_shape"] or not expected_shape: + return + + if len(expected_shape) != len(real_shape): + raise AssertionError( + f"ndim in metadata {len(expected_shape)} is not consistent " + f"with real ndim {len(real_shape)}" + ) + for e, r in zip(expected_shape, real_shape): + if not np.isnan(e) and e != r: + raise AssertionError( + f"shape in metadata {expected_shape!r} is not consistent " + f"with real shape {real_shape!r}" + ) + + @staticmethod + def assert_dtype_consistent(expected_dtype, real_dtype): + cate_dtypes = [pd.CategoricalDtype] + if cudf: + cate_dtypes.append(cudf.CategoricalDtype) + cate_dtypes = tuple(cate_dtypes) + + if isinstance(real_dtype, pd.DatetimeTZDtype): + real_dtype = real_dtype.base + if expected_dtype != real_dtype: + if expected_dtype == np.dtype("O") and real_dtype.type is np.str_: + # real dtype is string, this matches expectation + return + if expected_dtype is None: + raise AssertionError("Expected 
dtype cannot be None") + if isinstance(real_dtype, cate_dtypes) and isinstance( + expected_dtype, cate_dtypes + ): + return + if not np.can_cast(real_dtype, expected_dtype) and not np.can_cast( + expected_dtype, real_dtype + ): + raise AssertionError( + f"cannot cast between dtype of real dtype {real_dtype} " + f"and dtype {expected_dtype} defined in metadata" + ) + + def assert_tensor_consistent(self, expected, real): + from ..lib.sparse import SparseNDArray + + np_types = (np.generic, np.ndarray, pd.Timestamp, SparseNDArray) + if cupy is not None: + np_types += (cupy.ndarray,) + + if isinstance(real, tuple): + # allow returning a batch of chunks for some operands + real = real[0] + if isinstance(real, (str, int, bool, float, complex)): + real = np.array([real])[0] + if not isinstance(real, np_types): + raise AssertionError( + f"Type of real value ({type(real)}) not one of {np_types!r}" + ) + if not hasattr(expected, "dtype"): + return + if self._check_options["check_dtypes"]: + try: + self.assert_dtype_consistent(expected.dtype, real.dtype) + except AssertionError as ex: + if hasattr(expected, "op"): + raise AssertionError( + f"dtype assertion error: {ex}, source operand {expected.op}" + ) + else: + raise + if self._check_options["check_shape"]: + self.assert_shape_consistent(expected.shape, real.shape) + + @classmethod + def assert_index_value_consistent(cls, expected_index_value, real_index): + if expected_index_value is not None and expected_index_value.has_value(): + expected_index = expected_index_value.to_pandas() + try: + pd.testing.assert_index_equal( + expected_index, cls.adapt_index_value(real_index) + ) + except AssertionError as e: + raise AssertionError( + f"Index of real value ({real_index}) not equal to ({expected_index})" + ) from e + + def assert_dataframe_consistent(self, expected, real): + dataframe_types = (pd.DataFrame,) + if cudf is not None: + dataframe_types += (cudf.DataFrame,) + + if isinstance(real, tuple): + # allow returning a batch of chunks for some operands + real = real[0] + if not isinstance(real, dataframe_types): + raise AssertionError(f"Type of real value ({type(real)}) not DataFrame") + if expected.shape is None: + return + self.assert_shape_consistent(expected.shape, real.shape) + if not np.isnan(expected.shape[1]) and expected.dtypes is not None: + if self._check_options["check_dtypes"]: + # ignore check when columns length is nan or dtypes undefined + pd.testing.assert_index_equal( + expected.dtypes.index, self.adapt_index_value(real.dtypes.index) + ) + + try: + for expected_dtype, real_dtype in zip(expected.dtypes, real.dtypes): + self.assert_dtype_consistent(expected_dtype, real_dtype) + except AssertionError: + raise AssertionError( + f"dtypes in metadata {expected.dtype} cannot cast " + f"to real dtype {real.dtype}" + ) + + if self._check_options["check_columns_value"] and not np.isnan( + expected.shape[1] + ): + self.assert_index_value_consistent(expected.columns_value, real.columns) + if self._check_options["check_index_value"] and not np.isnan(expected.shape[0]): + self.assert_index_value_consistent(expected.index_value, real.index) + + def assert_series_consistent(self, expected, real): + series_types = (pd.Series,) + if cudf is not None: + series_types += (cudf.Series,) + + if not isinstance(real, series_types): + raise AssertionError(f"Type of real value ({type(real)}) not Series") + self.assert_shape_consistent(expected.shape, real.shape) + + if self._check_options["check_series_name"]: + if expected.name is not None and expected.name 
!= real.name: + raise AssertionError( + f"series name in metadata {expected.name} " + f"is not equal to real name {real.name}" + ) + + self.assert_dtype_consistent(expected.dtype, real.dtype) + if self._check_options["check_index_value"]: + self.assert_index_value_consistent(expected.index_value, real.index) + + def assert_groupby_consistent(self, expected, real): + from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy + + from ..dataframe.core import ( + DATAFRAME_GROUPBY_CHUNK_TYPE, + DATAFRAME_GROUPBY_TYPE, + SERIES_GROUPBY_CHUNK_TYPE, + SERIES_GROUPBY_TYPE, + ) + from ..lib.groupby_wrapper import GroupByWrapper + + df_groupby_types = (DataFrameGroupBy,) + series_groupby_types = (SeriesGroupBy,) + + try: + from cudf.core.groupby.groupby import DataFrameGroupBy as CUDataFrameGroupBy + from cudf.core.groupby.groupby import SeriesGroupBy as CUSeriesGroupBy + + df_groupby_types += (CUDataFrameGroupBy,) + series_groupby_types += (CUSeriesGroupBy,) + except ImportError: + pass + + if isinstance(real, GroupByWrapper): + real = real.groupby_obj + + if isinstance( + expected, (DATAFRAME_GROUPBY_TYPE, DATAFRAME_GROUPBY_CHUNK_TYPE) + ) and isinstance(real, df_groupby_types): + selection = getattr(real, "_selection", None) + if not selection: + self.assert_dataframe_consistent(expected, real.obj) + else: + self.assert_dataframe_consistent(expected, real.obj[selection]) + elif isinstance( + expected, (SERIES_GROUPBY_TYPE, SERIES_GROUPBY_CHUNK_TYPE) + ) and isinstance(real, series_groupby_types): + self.assert_series_consistent(expected, real.obj) + else: + raise AssertionError( + "GroupBy type not consistent. Expecting %r but receive %r" + % (type(expected), type(real)) + ) + + def assert_index_consistent(self, expected, real): + index_types = (pd.Index,) + if cudf is not None: + index_types += (cudf.Index,) + + if not isinstance(real, index_types): + raise AssertionError(f"Type of real value ({type(real)}) not Index") + self.assert_shape_consistent(expected.shape, real.shape) + + if self._check_options["check_series_name"] and expected.name != real.name: + raise AssertionError( + f"series name in metadata {expected.name} is not equal to " + f"real name {real.name}" + ) + + self.assert_dtype_consistent(expected.dtype, real.dtype) + self.assert_index_value_consistent(expected.index_value, real) + + def assert_categorical_consistent(self, expected, real): + if not isinstance(real, pd.Categorical): + raise AssertionError(f"Type of real value ({type(real)}) not Categorical") + self.assert_dtype_consistent(expected.dtype, real.dtype) + self.assert_shape_consistent(expected.shape, real.shape) + self.assert_index_value_consistent(expected.categories_value, real.categories) + + def assert_object_consistent(self, expected, real): + from ..dataframe.core import ( + CATEGORICAL_CHUNK_TYPE, + CATEGORICAL_TYPE, + DATAFRAME_CHUNK_TYPE, + DATAFRAME_TYPE, + GROUPBY_CHUNK_TYPE, + GROUPBY_TYPE, + INDEX_CHUNK_TYPE, + INDEX_TYPE, + SERIES_CHUNK_TYPE, + SERIES_TYPE, + ) + from ..tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE + + op = getattr(expected, "op", None) + if op and getattr(op, "stage", None) == OperandStage.map: + return + + if isinstance(expected, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self.assert_tensor_consistent(expected, real) + elif isinstance(expected, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE)): + self.assert_dataframe_consistent(expected, real) + elif isinstance(expected, (SERIES_TYPE, SERIES_CHUNK_TYPE)): + self.assert_series_consistent(expected, real) + elif isinstance(expected, 
(GROUPBY_TYPE, GROUPBY_CHUNK_TYPE)): + self.assert_groupby_consistent(expected, real) + elif isinstance(expected, (INDEX_TYPE, INDEX_CHUNK_TYPE)): + self.assert_index_consistent(expected, real) + elif isinstance(expected, (CATEGORICAL_TYPE, CATEGORICAL_CHUNK_TYPE)): + self.assert_categorical_consistent(expected, real) + + +DICT_NOT_EMPTY = type("DICT_NOT_EMPTY", (object,), {}) # is check works for deepcopy + + +def check_dict_structure_same(a, b, prefix=None): + def _p(k): + if prefix is None: + return k + return ".".join(str(i) for i in prefix + [k]) + + for ai, bi in itertools.zip_longest( + a.items(), b.items(), fillvalue=("_KEY_NOT_EXISTS_", None) + ): + if ai[0] != bi[0]: + if "*" in ai[0]: + pattern, target = ai[0], bi[0] + elif "*" in bi[0]: + pattern, target = bi[0], ai[0] + else: + raise KeyError(f"Key {_p(ai[0])} != {_p(bi[0])}") + if not fnmatch.fnmatch(target, pattern): + raise KeyError(f"Key {_p(target)} not match {_p(pattern)}") + + if ai[1] is DICT_NOT_EMPTY: + target = bi[1] + elif bi[1] is DICT_NOT_EMPTY: + target = ai[1] + else: + target = None + if target is not None: + if not isinstance(target, dict): + raise TypeError(f"Value type of {_p(ai[0])} is not a dict.") + if not target: + raise TypeError(f"Value of {_p(ai[0])} empty.") + continue + + if type(ai[1]) is not type(bi[1]): + raise TypeError(f"Value type of {_p(ai[0])} mismatch {ai[1]} != {bi[1]}") + if isinstance(ai[1], dict): + check_dict_structure_same( + ai[1], bi[1], [ai[0]] if prefix is None else prefix + [ai[0]] + ) + + +async def wait_for_condition( + condition_predictor, timeout=10, retry_interval_ms=100, **kwargs +): # pragma: no cover + """Wait until a condition is met or time out with an exception. + + Args: + condition_predictor: A function that predicts the condition. + timeout: Maximum timeout in seconds. + retry_interval_ms: Retry interval in milliseconds. + + Raises: + RuntimeError: If the condition is not met before the timeout expires. + """ + start = time.time() + last_ex = None + while time.time() - start <= timeout: + try: + pred = condition_predictor(**kwargs) + if inspect.isawaitable(pred): + pred = await pred + if pred: + return + except Exception as ex: + last_ex = ex + time.sleep(retry_interval_ms / 1000.0) + message = "The condition wasn't met before the timeout expired." + if last_ex is not None: + message += f" Last exception: {last_ex}" + raise RuntimeError(message) diff --git a/python/xorbits/_mars/tests/test_cluster.py b/python/xorbits/_mars/tests/test_cluster.py new file mode 100644 index 000000000..71cada2cd --- /dev/null +++ b/python/xorbits/_mars/tests/test_cluster.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import subprocess +import sys +import tempfile + +import psutil +import pytest + +from .. import new_session +from .. 
import tensor as mt +from ..services.cluster import NodeRole, WebClusterAPI +from ..utils import get_next_port + +CONFIG_CONTENT = """\ +"@inherits": "@mars/config.yml" +scheduling: + mem_hard_limit: null""" + + +def _terminate(pid: int): + proc = psutil.Process(pid) + sub_pids = [p.pid for p in proc.children(recursive=True)] + proc.terminate() + proc.wait(5) + for p in sub_pids: + try: + proc = psutil.Process(p) + proc.kill() + except psutil.NoSuchProcess: + continue + + +@pytest.mark.asyncio +async def test_cluster(): + port = get_next_port() + web_port = get_next_port() + supervisor_addr = f"127.0.0.1:{port}" + web_addr = f"http://127.0.0.1:{web_port}" + + # gen config file + fd, path = tempfile.mkstemp() + with os.fdopen(fd, mode="w") as f: + f.write(CONFIG_CONTENT) + + w = subprocess.Popen( + [sys.executable, "-m", "mars.worker", "-s", supervisor_addr, "-f", path] + ) + r = subprocess.Popen( + [ + sys.executable, + "-m", + "mars.supervisor", + "-H", + "127.0.0.1", + "-p", + str(port), + "-w", + str(web_port), + "-f", + path, + ], + stderr=subprocess.PIPE, + ) + + for p in [r, w]: + try: + retcode = p.wait(1) + except subprocess.TimeoutExpired: + # supervisor & worker will run forever, + # timeout means everything goes well, at least looks well, + continue + else: + if retcode: + std_err = p.communicate()[1].decode() + _terminate(r.pid) + _terminate(w.pid) + raise RuntimeError("Start cluster failed, stderr: \n" + std_err) + + try: + cluster_api = WebClusterAPI(web_addr) + while True: + try: + jsn = await cluster_api.get_nodes_info(role=NodeRole.WORKER) + except ConnectionError: + await asyncio.sleep(0.5) + continue + if not jsn: + await asyncio.sleep(0.5) + continue + if len(jsn) > 0: + break + + sess = new_session(web_addr, default=True) + a = mt.arange(10) + assert a.sum().to_numpy(show_progress=False) == 45 + + sess2 = new_session(web_addr, session_id=sess.session_id) + sess2.close() + finally: + _terminate(w.pid) + _terminate(r.pid) + + # test stderr + out = r.communicate()[1].decode() + assert f"Supervisor started at {supervisor_addr}, web address: {web_addr}" in out diff --git a/python/xorbits/_mars/tests/test_config.py b/python/xorbits/_mars/tests/test_config.py new file mode 100644 index 000000000..28c67d2cb --- /dev/null +++ b/python/xorbits/_mars/tests/test_config.py @@ -0,0 +1,113 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
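Editor's note: `wait_for_condition` in `tests/core.py` above polls a plain or async predicate until it returns something truthy, and raises `RuntimeError` (carrying the last swallowed exception, if any) once the timeout expires. A hypothetical usage sketch; the import path is assumed:

import asyncio

from xorbits._mars.tests.core import wait_for_condition  # assumed path

attempts = {"count": 0}

async def becomes_ready():
    # pretend the condition converges after a few polls
    attempts["count"] += 1
    return attempts["count"] >= 3

async def main():
    # polls every 50 ms and raises RuntimeError if still falsy after 5 s
    await wait_for_condition(becomes_ready, timeout=5, retry_interval_ms=50)
    print("condition met after", attempts["count"], "attempts")

asyncio.run(main())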
+ +import pickle +import threading + +import pytest + +from ..config import Config, is_integer, is_string, option_context, options + + +def test_config_context(): + with pytest.raises(AttributeError): + _ = options.a.b.c + + options.register_option("c.d.e", "a", is_string) + assert "c" in dir(options) + assert "d" in dir(options.c) + + try: + with option_context() as ctx: + ctx.register_option("a.b.c", 1, validator=is_integer) + assert ctx.a.b.c == 1 + + ctx.a.b.c = 2 + assert ctx.a.b.c == 2 + + with pytest.raises(ValueError): + ctx.a.b.c = "a" + + assert ctx.c.d.e == "a" + + ctx.c.d.e = "b" + + assert options.c.d.e == "a" + + options.c.d.e = "c" + + assert options.c.d.e == "c" + + with pytest.raises(AttributeError): + _ = options.a.b.c # noqa: F841 + finally: + options.unregister_option("c.d.e") + + +def test_multi_thread_config(): + options.register_option("a.b.c", 1) + + class T(threading.Thread): + def __init__(self, is_first, condition): + super().__init__() + self.is_first = is_first + self.condition = condition + + def run(self): + self.condition.acquire() + if self.is_first: + options.a.b.c = 2 + self.condition.notify() + else: + self.condition.wait() + assert options.a.b.c == 1 + self.condition.release() + + try: + cond = threading.Condition() + a = T(True, cond) + b = T(False, cond) + b.start() + a.start() + a.join() + b.join() + finally: + options.unregister_option("a.b.c") + + +def test_config_copy(): + cfg = Config() + cfg.register_option("a.b.c", 1) + cfg.redirect_option("a.c", "a.b.c") + + target_cfg = Config() + target_cfg.register_option("a.b.c", -1) + target_cfg.redirect_option("a.c", "a.b.c") + + src_cfg_dict = cfg.to_dict() + assert src_cfg_dict == {"a.b.c": 1} + + target_cfg.update(src_cfg_dict) + assert target_cfg.a.b.c == 1 + + +def test_pickle_config(): + cfg = Config() + cfg.register_option("a.b.c", 1) + cfg.redirect_option("a.c", "a.b.c") + + s = pickle.dumps(cfg) + new_cfg = pickle.loads(s) + assert new_cfg.a.b.c == 1 + assert new_cfg.a.c == 1 diff --git a/python/xorbits/_mars/tests/test_eager_mode.py b/python/xorbits/_mars/tests/test_eager_mode.py new file mode 100644 index 000000000..5d1513c4e --- /dev/null +++ b/python/xorbits/_mars/tests/test_eager_mode.py @@ -0,0 +1,177 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from .. import dataframe as md +from .. 
import tensor as mt +from ..config import option_context +from ..dataframe.datasource.dataframe import from_pandas + + +def test_base_execute(setup): + with option_context({"eager_mode": True}): + a_data = np.random.rand(10, 10) + a = mt.tensor(a_data, chunk_size=6) + np.testing.assert_array_equal(a.fetch(), a_data) + + r1 = a + 1 + np.testing.assert_array_equal(r1.fetch(), a_data + 1) + + r2 = 2 * r1 + np.testing.assert_array_equal(r2.fetch(), (a_data + 1) * 2) + + # test add with out + b = mt.ones((10, 10), chunk_size=6) + np.testing.assert_array_equal(b.fetch(), np.ones((10, 10))) + + mt.add(a, b, out=b) + np.testing.assert_array_equal(b.fetch(), a_data + 1) + + # test tensor dot + c_data1 = np.random.rand(10, 10) + c_data2 = np.random.rand(10, 10) + c1 = mt.tensor(c_data1, chunk_size=6) + c2 = mt.tensor(c_data2, chunk_size=6) + r3 = c1.dot(c2) + np.testing.assert_array_almost_equal(r3.fetch(), c_data1.dot(c_data2)) + + +def test_multiple_output_execute(setup): + with option_context({"eager_mode": True}): + data = np.random.random((5, 9)) + + arr1 = mt.tensor(data.copy(), chunk_size=3) + result = mt.modf(arr1) + expected = np.modf(data) + + np.testing.assert_array_equal(result[0].fetch(), expected[0]) + np.testing.assert_array_equal(result[1].fetch(), expected[1]) + + arr3 = mt.tensor(data.copy(), chunk_size=3) + result1, result2, result3 = mt.split(arr3, 3, axis=1) + expected = np.split(data, 3, axis=1) + + np.testing.assert_array_equal(result1.fetch(), expected[0]) + np.testing.assert_array_equal(result2.fetch(), expected[1]) + np.testing.assert_array_equal(result3.fetch(), expected[2]) + + +def test_mixed_config(setup): + a = mt.ones((10, 10), chunk_size=6) + with pytest.raises(ValueError): + a.fetch() + + with option_context({"eager_mode": True}): + b = mt.ones((10, 10), chunk_size=(6, 8)) + np.testing.assert_array_equal(b.fetch(), np.ones((10, 10))) + + r = b + 1 + np.testing.assert_array_equal(r.fetch(), np.ones((10, 10)) * 2) + + r2 = b.dot(b) + np.testing.assert_array_equal(r2.fetch(), np.ones((10, 10)) * 10) + + c = mt.ones((10, 10), chunk_size=6) + with pytest.raises(ValueError): + c.fetch() + np.testing.assert_array_equal(c.execute(), np.ones((10, 10))) + + r = c.dot(c) + with pytest.raises(ValueError): + r.fetch() + np.testing.assert_array_equal(r.execute(), np.ones((10, 10)) * 10) + + +@pytest.mark.ray_dag +def test_index(setup): + with option_context({"eager_mode": True}): + a = mt.random.rand(10, 5, chunk_size=5) + idx = slice(0, 5), slice(0, 5) + a[idx] = 1 + np.testing.assert_array_equal(a.fetch()[idx], np.ones((5, 5))) + + split1, split2 = mt.split(a, 2) + np.testing.assert_array_equal(split1.fetch(), np.ones((5, 5))) + + # test bool indexing + a = mt.random.rand(8, 8, chunk_size=4) + set_value = mt.ones((2, 2)) * 2 + a[4:6, 4:6] = set_value + b = a[a > 1] + assert b.shape == (4,) + np.testing.assert_array_equal(b.fetch(), np.ones((4,)) * 2) + + c = b.reshape((2, 2)) + assert c.shape == (2, 2) + np.testing.assert_array_equal(c.fetch(), np.ones((2, 2)) * 2) + + +def test_repr_tensor(setup): + a = mt.ones((10, 10), chunk_size=3) + assert a.key in repr(a) + + assert repr(np.ones((10, 10))) not in repr(a) + assert str(np.ones((10, 10))) not in str(a) + + with option_context({"eager_mode": True}): + a = mt.ones((10, 10)) + assert repr(np.ones((10, 10))) == repr(a) + assert str(np.ones((10, 10))) == str(a) + + +def test_repr_dataframe(setup): + x = pd.DataFrame(np.ones((10, 10))) + + with option_context({"eager_mode": True}): + a = md.DataFrame(np.ones((10, 10)), 
chunk_size=3) + assert repr(x) in repr(a) + assert str(x) in str(a) + + a = md.DataFrame(np.ones((10, 10)), chunk_size=3) + assert repr(x) not in repr(a) + assert str(x) not in str(a) + + +def test_view(setup): + with option_context({"eager_mode": True}): + data = np.random.rand(10, 20) + a = mt.tensor(data, chunk_size=5) + b = a[0][1:4] + b[1] = 10 + + npa = data.copy() + npb = npa[0][1:4] + npb[1] = 10 + + np.testing.assert_array_equal(a.fetch(), npa) + np.testing.assert_array_equal(b.fetch(), npb) + + +def test_dataframe(setup): + with option_context({"eager_mode": True}): + from ..dataframe.arithmetic import add + + data1 = pd.DataFrame(np.random.rand(10, 10)) + df1 = from_pandas(data1, chunk_size=5) + pd.testing.assert_frame_equal(df1.fetch(), data1) + + data2 = pd.DataFrame(np.random.rand(10, 10)) + df2 = from_pandas(data2, chunk_size=6) + pd.testing.assert_frame_equal(df2.fetch(), data2) + + df3 = add(df1, df2) + pd.testing.assert_frame_equal(df3.fetch(), data1 + data2) diff --git a/python/xorbits/_mars/tests/test_resource.py b/python/xorbits/_mars/tests/test_resource.py new file mode 100644 index 000000000..a0f35aeb9 --- /dev/null +++ b/python/xorbits/_mars/tests/test_resource.py @@ -0,0 +1,225 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
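+# The fixtures below mirror the two cgroup layouts the resource module reads:
+# the v1 CPU accounting file reports cumulative CPU time in nanoseconds while
+# the v2 cpu.stat `usage_usec` field is in microseconds, which is why the v1
+# sample values are roughly 1000x the v2 ones. As a rough sanity check of the
+# 50 < cpu_percent < 150 assertions in test_use_c_group_stats (assuming the
+# ~0.5 s sleep between the two samples):
+#
+#     delta_ns = 8679429771672 - 8678870951786  # ~5.59e8 ns of CPU time
+#     delta_s = delta_ns / 1e9                  # ~0.56 s
+#     cpu_percent ~ 100 * delta_s / 0.5         # ~112, i.e. between 50 and 150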
+
+import importlib
+import os
+import tempfile
+import time
+
+import pytest
+
+from ..resource import Resource, ZeroResource
+
+_v1_cpu_stat_first = "8678870951786"
+_v1_cpu_stat_last = "8679429771672"
+
+# just a fragment of real cpu.stat
+_v2_cpu_stat_first = """
+usage_usec 8678870951
+"""
+_v2_cpu_stat_last = """
+usage_usec 8679429771
+"""
+
+# just a fragment of real memory.stat
+_v1_memory_stat_content = """
+cache 489275392
+rss 218181632
+mapped_file 486768640
+swap 0
+inactive_anon 486744064
+active_anon 218103808
+inactive_file 2457600
+active_file 73728
+hierarchical_memory_limit 1073741824
+"""
+
+_v2_memory_current_content = "218181632\n"
+_v2_memory_max_content = "1073741824\n"
+
+
+def test_stats():
+    from .. import resource
+
+    resource = importlib.reload(resource)
+    resource.cpu_percent()
+
+    mem_stats = resource.virtual_memory()
+    assert mem_stats.available >= 0
+    assert mem_stats.total >= 0
+    assert mem_stats.percent >= 0
+    assert mem_stats.used >= 0
+    assert mem_stats.free >= 0
+
+    cpu_usage = resource.cpu_percent()
+    time.sleep(0.1)
+    assert cpu_usage >= 0
+
+    resource.disk_io_usage()
+    time.sleep(0.1)
+    recv_speed, send_speed = resource.disk_io_usage()
+    assert recv_speed >= 0
+    assert send_speed >= 0
+
+    curdir = os.path.dirname(os.path.abspath(__file__))
+    resource.disk_io_usage(curdir)
+    time.sleep(0.1)
+    usage = resource.disk_io_usage(curdir)
+    if usage is not None:
+        assert usage.reads >= 0
+        assert usage.writes >= 0
+
+    resource.net_io_usage()
+    time.sleep(0.1)
+    recv_speed, send_speed = resource.net_io_usage()
+    assert recv_speed >= 0
+    assert send_speed >= 0
+
+
+def test_use_process_stats():
+    from .. import resource
+
+    cpu_total = resource.cpu_count()
+    mem_total = resource.virtual_memory().total
+    try:
+        os.environ["MARS_USE_PROCESS_STAT"] = "1"
+        os.environ["MARS_CPU_TOTAL"] = str(cpu_total)
+        os.environ["MARS_MEMORY_TOTAL"] = str(mem_total)
+
+        resource = importlib.reload(resource)
+        resource.cpu_percent()
+        time.sleep(0.5)
+
+        mem_stats = resource.virtual_memory()
+        assert mem_stats.available >= 0
+        assert mem_stats.total >= 0
+        assert mem_stats.percent >= 0
+        assert mem_stats.used >= 0
+        assert mem_stats.free >= 0
+
+        cpu_usage = resource.cpu_percent()
+        assert cpu_usage >= 0
+        cpu_usage = resource.cpu_percent()
+        assert cpu_usage >= 0
+    finally:
+        del os.environ["MARS_USE_PROCESS_STAT"]
+        del os.environ["MARS_CPU_TOTAL"]
+        del os.environ["MARS_MEMORY_TOTAL"]
+        importlib.reload(resource)
+
+
+@pytest.mark.parametrize("cgroup_ver", ["v1", "v2"])
+def test_use_c_group_stats(cgroup_ver):
+    from .. import resource
+
+    def write_tmp_text_file(prefix, content):
+        fd, file_name = tempfile.mkstemp(prefix)
+        with os.fdopen(fd, "w") as f:
+            f.write(content)
+        return file_name
+
+    v1_cpu_acct_path = write_tmp_text_file(
+        "test-mars-res-cgroup-v1-cpu-", _v1_cpu_stat_first
+    )
+    v1_mem_stat_path = write_tmp_text_file(
+        "test-mars-res-cgroup-v1-mem-", _v1_memory_stat_content
+    )
+    v2_cpu_stat_path = write_tmp_text_file(
+        "test-mars-res-cgroup-v2-cpu-", _v2_cpu_stat_first
+    )
+    v2_mem_cur_path = write_tmp_text_file(
+        "test-mars-res-cgroup-v2-cpu-", _v2_memory_current_content
+    )
+    v2_mem_max_path = write_tmp_text_file(
+        "test-mars-res-cgroup-v2-cpu-", _v2_memory_max_content
+    )
+
+    old_is_cgroup_v2 = resource._is_cgroup_v2
+    old_v1_cpu_acct_file = resource.CGROUP_V1_CPU_ACCT_FILE
+    old_v1_mem_stat_file = resource.CGROUP_V1_MEM_STAT_FILE
+    old_v2_cpu_stat_file = resource.CGROUP_V2_CPU_STAT_FILE
+    old_v2_mem_current_file =
resource.CGROUP_V2_MEM_CURRENT_FILE + old_v2_mem_max_file = resource.CGROUP_V2_MEM_MAX_FILE + old_shm_path = resource._shm_path + try: + os.environ["MARS_USE_CGROUP_STAT"] = "1" + + resource = importlib.reload(resource) + if cgroup_ver == "v1": + resource.CGROUP_V1_CPU_ACCT_FILE = v1_cpu_acct_path + resource.CGROUP_V1_MEM_STAT_FILE = v1_mem_stat_path + resource._is_cgroup_v2 = False + else: + resource.CGROUP_V2_CPU_STAT_FILE = v2_cpu_stat_path + resource.CGROUP_V2_MEM_CURRENT_FILE = v2_mem_cur_path + resource.CGROUP_V2_MEM_MAX_FILE = v2_mem_max_path + resource._is_cgroup_v2 = True + resource._shm_path = None + + assert resource.cpu_percent() is None + time.sleep(0.5) + with open(v1_cpu_acct_path, "w") as f: + f.write(_v1_cpu_stat_last) + with open(v2_cpu_stat_path, "w") as f: + f.write(_v2_cpu_stat_last) + assert resource.cpu_percent() > 50 + assert resource.cpu_percent() < 150 + + mem_stats = resource.virtual_memory() + assert mem_stats.total == 1073741824 + assert mem_stats.used == 218181632 + finally: + resource._is_cgroup_v2 = old_is_cgroup_v2 + resource._shm_path = old_shm_path + resource.CGROUP_V1_CPU_ACCT_FILE = old_v1_cpu_acct_file + resource.CGROUP_V1_MEM_STAT_FILE = old_v1_mem_stat_file + resource.CGROUP_V2_CPU_STAT_FILE = old_v2_cpu_stat_file + resource.CGROUP_V2_MEM_CURRENT_FILE = old_v2_mem_current_file + resource.CGROUP_V2_MEM_MAX_FILE = old_v2_mem_max_file + + del os.environ["MARS_USE_CGROUP_STAT"] + + os.unlink(v1_cpu_acct_path) + os.unlink(v1_mem_stat_path) + os.unlink(v2_cpu_stat_path) + os.unlink(v2_mem_cur_path) + os.unlink(v2_mem_max_path) + + importlib.reload(resource) + + +def test_resource(): + assert Resource(num_cpus=1) + Resource(num_cpus=1) == Resource(num_cpus=2) + assert Resource(num_cpus=1) + Resource(num_gpus=1) + Resource( + mem_bytes=1024**3 + ) == Resource(num_cpus=1, num_gpus=1, mem_bytes=1024**3) + assert -Resource(num_cpus=1, num_gpus=1, mem_bytes=1024**3) == Resource( + num_cpus=-1, num_gpus=-1, mem_bytes=-(1024**3) + ) + assert Resource(num_cpus=-1) < ZeroResource + assert Resource(num_gpus=-1) < ZeroResource + assert Resource(mem_bytes=-1) < ZeroResource + assert Resource(num_cpus=1, num_gpus=1, mem_bytes=-(1024**3)) < ZeroResource + assert Resource(num_cpus=1, num_gpus=1, mem_bytes=1024**3) > Resource( + num_cpus=10, num_gpus=1, mem_bytes=1024 + ) + assert Resource(num_cpus=1, num_gpus=10, mem_bytes=1024**3) > Resource( + num_cpus=10, num_gpus=1, mem_bytes=1024**3 + ) + assert Resource(num_cpus=100, num_gpus=10, mem_bytes=1024**3) > Resource( + num_cpus=10, num_gpus=10, mem_bytes=1024**3 + ) + assert Resource(num_cpus=100, num_gpus=10, mem_bytes=1024) - Resource( + num_cpus=10, num_gpus=20, mem_bytes=512 + ) == Resource(num_cpus=90, num_gpus=-10, mem_bytes=512) diff --git a/python/xorbits/_mars/tests/test_session.py b/python/xorbits/_mars/tests/test_session.py new file mode 100644 index 000000000..b0de521b1 --- /dev/null +++ b/python/xorbits/_mars/tests/test_session.py @@ -0,0 +1,529 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import io +import os +import re +import sys +import tempfile +from collections import namedtuple + +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from .. import dataframe as md +from .. import remote as mr +from .. import tensor as mt +from ..config import option_context +from ..deploy.utils import load_service_config_file +from ..session import execute, fetch, fetch_log + +test_namedtuple_type = namedtuple("TestNamedTuple", "a b") + + +@pytest.fixture +def setup(): + from ..deploy.oscar.tests.session import new_test_session + + sess = new_test_session(address="127.0.0.1", init_local=True, default=True) + with option_context({"show_progress": False}): + try: + from .. import __version__ as mars_version + + assert sess.get_cluster_versions() == [mars_version] + yield sess + finally: + sess.stop_server() + + +def test_session_async_execute(setup): + raw_a = np.random.RandomState(0).rand(10, 20) + a = mt.tensor(raw_a) + + expected = raw_a.sum() + res = a.sum().to_numpy(wait=False).result() + assert expected == res + res = a.sum().execute(wait=False) + res = res.result().fetch() + assert expected == res + + raw_df = pd.DataFrame(raw_a) + + expected = raw_df.skew() + df = md.DataFrame(a) + res = df.skew().to_pandas(wait=False).result() + pd.testing.assert_series_equal(expected, res) + res = df.skew().execute(wait=False) + res = res.result().fetch() + pd.testing.assert_series_equal(expected, res) + + t = [df.sum(), a.sum()] + res = mt.ExecutableTuple(t).to_object(wait=False).result() + pd.testing.assert_series_equal(raw_df.sum(), res[0]) + assert raw_a.sum() == res[1] + res = mt.ExecutableTuple(t).execute(wait=False) + res = fetch(*res.result()) + pd.testing.assert_series_equal(raw_df.sum(), res[0]) + assert raw_a.sum() == res[1] + + +def test_executable_tuple_execute(setup): + raw_a = np.random.RandomState(0).rand(10, 20) + a = mt.tensor(raw_a) + + raw_df = pd.DataFrame(raw_a) + df = md.DataFrame(raw_df) + + tp = test_namedtuple_type(a, df) + executable_tp = mt.ExecutableTuple(tp) + + assert "a" in dir(executable_tp) + assert executable_tp.a is a + assert test_namedtuple_type.__name__ in repr(executable_tp) + with pytest.raises(AttributeError): + getattr(executable_tp, "c") + + res = mt.ExecutableTuple(tp).execute().fetch() + assert test_namedtuple_type is type(res) + + np.testing.assert_array_equal(raw_a, res.a) + pd.testing.assert_frame_equal(raw_df, res.b) + + +def test_multiple_output_execute(setup): + data = np.random.random((5, 9)) + + # test multiple outputs + arr1 = mt.tensor(data.copy(), chunk_size=3) + result = mt.modf(arr1).execute().fetch() + expected = np.modf(data) + + np.testing.assert_array_equal(result[0], expected[0]) + np.testing.assert_array_equal(result[1], expected[1]) + + # test 1 output + arr2 = mt.tensor(data.copy(), chunk_size=3) + result = ((arr2 + 1) * 2).to_numpy() + expected = (data + 1) * 2 + + np.testing.assert_array_equal(result, expected) + + # test multiple outputs, but only execute 1 + arr3 = mt.tensor(data.copy(), chunk_size=3) + arrs = mt.split(arr3, 3, axis=1) + result = arrs[0].to_numpy() + expected = np.split(data, 3, axis=1)[0] + + np.testing.assert_array_equal(result, expected) + + # test multiple outputs, but only execute 1 + data = np.random.randint(0, 10, (5, 5)) + arr3 = (mt.tensor(data) + 1) * 2 + arrs = mt.linalg.qr(arr3) + result = (arrs[0] + 
1).to_numpy() + expected = np.linalg.qr((data + 1) * 2)[0] + 1 + + np.testing.assert_array_almost_equal(result, expected) + + result = (arrs[0] + 2).to_numpy() + expected = np.linalg.qr((data + 1) * 2)[0] + 2 + + np.testing.assert_array_almost_equal(result, expected) + + s = mt.shape(0) + + result = s.execute().fetch() + expected = np.shape(0) + assert result == expected + + +def test_closed_session(): + from ..deploy.oscar.tests.session import new_test_session + + session = new_test_session(default=True) + with option_context({"show_progress": False}): + arr = mt.ones((10, 10)) + try: + result = session.execute(arr) + + np.testing.assert_array_equal(result, np.ones((10, 10))) + + # close session + session.close() + + with pytest.raises(RuntimeError): + session.execute(arr) + + with pytest.raises(RuntimeError): + session.execute(arr + 1) + finally: + session.stop_server() + + +def test_array_protocol(setup): + arr = mt.ones((10, 20)) + + result = np.asarray(arr) + np.testing.assert_array_equal(result, np.ones((10, 20))) + + arr2 = mt.ones((10, 20)) + + result = np.asarray(arr2, mt.bool) + np.testing.assert_array_equal(result, np.ones((10, 20), dtype=np.bool_)) + + arr3 = mt.ones((10, 20)).sum() + + result = np.asarray(arr3) + np.testing.assert_array_equal(result, np.asarray(200)) + + arr4 = mt.ones((10, 20)).sum() + + result = np.asarray(arr4, dtype=np.float_) + np.testing.assert_array_equal(result, np.asarray(200, dtype=np.float_)) + + +def test_without_fuse(setup): + arr1 = (mt.ones((10, 10), chunk_size=6) + 1) * 2 + r1 = arr1.execute(fuse_enabled=False).fetch() + arr2 = (mt.ones((10, 10), chunk_size=5) + 1) * 2 + r2 = arr2.execute(fuse_enabled=False).fetch() + np.testing.assert_array_equal(r1, r2) + + +@pytest.mark.ray_dag +def test_fetch_slices(setup): + arr1 = mt.random.rand(10, 8, chunk_size=3) + r1 = arr1.execute().fetch() + + r2 = arr1[:2, 3:9].fetch() + np.testing.assert_array_equal(r2, r1[:2, 3:9]) + + r3 = arr1[0].fetch() + np.testing.assert_array_equal(r3, r1[0]) + + +def test_fetch_dataframe_slices(setup): + arr1 = mt.random.rand(10, 8, chunk_size=3) + df1 = md.DataFrame(arr1) + r1 = df1.execute().fetch() + + r2 = df1.iloc[:, :].fetch() + pd.testing.assert_frame_equal(r2, r1.iloc[:, :]) + + r3 = df1.iloc[1].fetch(extra_config={"check_series_name": False}) + pd.testing.assert_series_equal(r3, r1.iloc[1]) + + r4 = df1.iloc[0, 2].fetch() + assert r4 == r1.iloc[0, 2] + + arr2 = mt.random.rand(10, 3, chunk_size=3) + df2 = md.DataFrame(arr2) + r5 = df2.execute().fetch() + + r6 = df2.iloc[:4].fetch(batch_size=3) + pd.testing.assert_frame_equal(r5.iloc[:4], r6) + + +def test_repr(setup): + # test tensor repr + with np.printoptions(threshold=100): + arr = np.random.randint(1000, size=(11, 4, 13)) + + t = mt.tensor(arr, chunk_size=3) + + result = repr(t.execute()) + expected = repr(arr) + assert result == expected + + for size in (5, 58, 60, 62, 64): + pdf = pd.DataFrame(np.random.randint(1000, size=(size, 10))) + + # test DataFrame repr + df = md.DataFrame(pdf, chunk_size=size // 2) + + result = repr(df.execute()) + expected = repr(pdf) + assert result == expected + + # test DataFrame _repr_html_ + result = df.execute()._repr_html_() + expected = pdf._repr_html_() + assert result == expected + + # test Series repr + ps = pdf[0] + s = md.Series(ps, chunk_size=size // 2) + + result = repr(s.execute()) + expected = repr(ps) + assert result == expected + + # test Index repr + pind = pd.date_range("2020-1-1", periods=10) + ind = md.Index(pind, chunk_size=5) + + assert "DatetimeIndex" in 
repr(ind.execute()) + + # test groupby repr + df = md.DataFrame(pd.DataFrame(np.random.rand(100, 3), columns=list("abc"))) + grouped = df.groupby(["a", "b"]).execute() + + assert "DataFrameGroupBy" in repr(grouped) + + # test Categorical repr + c = md.qcut(range(5), 3) + assert "Categorical" in repr(c) + assert "Categorical" in str(c) + assert repr(c.execute()) == repr(pd.qcut(range(5), 3)) + + +def test_iter(setup): + raw_data = pd.DataFrame(np.random.randint(1000, size=(20, 10))) + df = md.DataFrame(raw_data, chunk_size=5) + + for col, series in df.iteritems(): + pd.testing.assert_series_equal(series.execute().fetch(), raw_data[col]) + + for i, batch in enumerate(df.iterbatch(batch_size=15)): + pd.testing.assert_frame_equal(batch, raw_data.iloc[i * 15 : (i + 1) * 15]) + + i = 0 + for result_row, expect_row in zip(df.iterrows(batch_size=15), raw_data.iterrows()): + assert result_row[0] == expect_row[0] + pd.testing.assert_series_equal(result_row[1], expect_row[1]) + i += 1 + + assert i == len(raw_data) + + i = 0 + for result_tup, expect_tup in zip( + df.itertuples(batch_size=10), raw_data.itertuples() + ): + assert result_tup == expect_tup + i += 1 + + assert i == len(raw_data) + + raw_data = pd.Series(np.random.randint(1000, size=(20,))) + s = md.Series(raw_data, chunk_size=5) + + for i, batch in enumerate(s.iterbatch(batch_size=15)): + pd.testing.assert_series_equal(batch, raw_data.iloc[i * 15 : (i + 1) * 15]) + + i = 0 + for result_item, expect_item in zip( + s.iteritems(batch_size=15), raw_data.iteritems() + ): + assert result_item[0] == expect_item[0] + assert result_item[1] == expect_item[1] + i += 1 + + assert i == len(raw_data) + + # test to_dict + assert s.to_dict() == raw_data.to_dict() + + +CONFIG = """ +"@inherits": '@default' +session: + custom_log_dir: '{custom_log_dir}' +""" + + +@pytest.fixture +def fetch_log_setup(): + from ..deploy.oscar.tests.session import new_test_session + + with tempfile.TemporaryDirectory() as temp_dir: + config = io.StringIO(CONFIG.format(custom_log_dir=temp_dir)) + sess = new_test_session( + default=True, config=load_service_config_file(config), n_cpu=8 + ) + with option_context({"show_progress": False}): + try: + yield sess + finally: + sess.stop_server() + + +def test_fetch_log(fetch_log_setup): + def f(): + print("test") + + r = mr.spawn(f) + r.execute() + + log = r.fetch_log() + assert str(log).strip() == "test" + + # test multiple functions + def f1(size): + print("f1" * size) + sys.stdout.flush() + + fs = mr.ExecutableTuple([mr.spawn(f1, 30), mr.spawn(f1, 40)]) + execute(*fs) + log = fetch_log(*fs, offsets=20, sizes=10) + assert str(log[0]).strip() == ("f1" * 30)[20:30] + assert str(log[1]).strip() == ("f1" * 40)[20:30] + assert len(log[0].offsets) > 0 + assert all(s > 0 for s in log[0].offsets) + assert len(log[1].offsets) > 0 + assert all(s > 0 for s in log[1].offsets) + assert len(log[0].chunk_op_keys) > 0 + + # test negative offsets + log = fs.fetch_log(offsets=-20, sizes=10) + assert str(log[0]).strip() == ("f1" * 30 + os.linesep)[-20:-10] + assert str(log[1]).strip() == ("f1" * 40 + os.linesep)[-20:-10] + assert all(s > 0 for s in log[0].offsets) is True + assert len(log[1].offsets) > 0 + assert all(s > 0 for s in log[1].offsets) is True + assert len(log[0].chunk_op_keys) > 0 + + # test negative offsets which represented in string + log = fetch_log(*fs, offsets="-0.02K", sizes="0.01K") + assert str(log[0]).strip() == ("f1" * 30 + os.linesep)[-20:-10] + assert str(log[1]).strip() == ("f1" * 40 + os.linesep)[-20:-10] + assert all(s > 0 
for s in log[0].offsets) is True + assert len(log[1].offsets) > 0 + assert all(s > 0 for s in log[1].offsets) is True + assert len(log[0].chunk_op_keys) > 0 + + def test_nested(): + print("level0") + fr = mr.spawn(f1, 1) + fr.execute() + print(fr.fetch_log()) + + r = mr.spawn(test_nested) + r.execute() + log = str(r.fetch_log()) + assert "level0" in log + assert "f1" in log + + df = md.DataFrame(mt.random.rand(10, 3), chunk_size=5) + + def df_func(c): + print("df func") + return c + + df2 = df.map_chunk(df_func) + df2.execute() + log = df2.fetch_log() + assert "Chunk op key:" in str(log) + assert "df func" in repr(log) + assert len(str(df.fetch_log())) == 0 + + def test_host(rndf): + rm = mr.spawn(nested, rndf) + rm.execute() + print(rm.fetch_log()) + + def nested(_rndf): + print("log_content") + + ds = [mr.spawn(test_host, n, retry_when_fail=False) for n in np.random.rand(4)] + xtp = execute(ds) + for log in fetch_log(xtp): + assert str(log).strip() == "log_content" + + def test_threaded(): + import threading + + exc_info = None + + def print_fun(): + nonlocal exc_info + try: + print("inner") + except: # noqa: E722 # nosec # pylint: disable=bare-except + exc_info = sys.exc_info() + + print_thread = threading.Thread(target=print_fun) + print_thread.start() + print_thread.join() + + if exc_info is not None: + raise exc_info[1].with_traceback(exc_info[-1]) + + print("after") + + rm = mr.spawn(test_threaded) + rm.execute() + logs = str(rm.fetch_log()).strip() + assert logs == "inner\nafter" + + +def test_align_series(setup): + t = np.random.rand(10, 3) + pdf = pd.DataFrame(t) + df = md.DataFrame(pdf, chunk_size=(5, 3)) + r = df[0] != df.sort_index()[0].shift(-1) + expected = pdf[0] != pdf.sort_index()[0].shift(-1) + pd.testing.assert_series_equal(r.execute().fetch(), expected) + + +def test_cache_tileable(setup): + raw = np.random.rand(10, 3) + t = mt.tensor(raw) + t.cache = True + t2 = t + 1 + result = t2.execute().fetch() + np.testing.assert_array_equal(result, raw + 1) + np.testing.assert_array_equal(t.fetch(), raw) + + with option_context({"warn_duplicated_execution": True}): + t = mt.tensor(raw) + with pytest.warns( + RuntimeWarning, + match=re.escape(f"Tileable {repr(t)} has been submitted before"), + ): + (t + 1).execute() + (t + 2).execute() + + # should have no warning + t = mt.tensor(raw) + with pytest.raises(BaseException, match="DID NOT WARN"): + with pytest.warns( + RuntimeWarning, + match=re.escape(f"Tileable {repr(t)} has been submitted before"), + ): + (t + 1).execute() + + +@pytest.mark.parametrize("method", ["shuffle", "broadcast", None]) +@pytest.mark.parametrize("auto_merge", ["after", "before"]) +def test_merge_groupby(setup, method, auto_merge): + rs = np.random.RandomState(0) + raw1 = pd.DataFrame({"a": rs.randint(3, size=100), "b": rs.rand(100)}) + raw2 = pd.DataFrame({"a": rs.randint(3, size=10), "c": rs.rand(10)}) + df1 = md.DataFrame(raw1, chunk_size=10).execute() + df2 = md.DataFrame(raw2, chunk_size=10).execute() + # do not trigger auto merge + df3 = df1.merge( + df2, on="a", auto_merge_threshold=8, method=method, auto_merge=auto_merge + ) + df4 = df3.groupby("a").sum() + + result = df4.execute().fetch() + expected = raw1.merge(raw2, on="a").groupby("a").sum() + pd.testing.assert_frame_equal(result, expected) diff --git a/python/xorbits/_mars/tests/test_utils.py b/python/xorbits/_mars/tests/test_utils.py new file mode 100644 index 000000000..4fb9d9912 --- /dev/null +++ b/python/xorbits/_mars/tests/test_utils.py @@ -0,0 +1,669 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import copy +import logging +import multiprocessing +import os +import shutil +import sys +import tempfile +import textwrap +import time +from concurrent.futures import ThreadPoolExecutor +from enum import Enum +from functools import partial +from io import BytesIO + +import numpy as np +import pandas as pd +import pytest + +from .. import dataframe as md +from .. import tensor as mt +from .. import utils +from ..core import TileableGraph, tile +from .core import require_cudf, require_ray + + +def test_string_conversion(): + s = None + assert utils.to_binary(s) is None + assert utils.to_str(s) is None + assert utils.to_text(s) is None + + s = "abcdefg" + assert isinstance(utils.to_binary(s), bytes) + assert utils.to_binary(s) == b"abcdefg" + assert isinstance(utils.to_str(s), str) + assert utils.to_str(s) == "abcdefg" + assert isinstance(utils.to_text(s), str) + assert utils.to_text(s) == "abcdefg" + + ustr = type("ustr", (str,), {}) + assert isinstance(utils.to_str(ustr(s)), str) + assert utils.to_str(ustr(s)) == "abcdefg" + + s = b"abcdefg" + assert isinstance(utils.to_binary(s), bytes) + assert utils.to_binary(s) == b"abcdefg" + assert isinstance(utils.to_str(s), str) + assert utils.to_str(s) == "abcdefg" + assert isinstance(utils.to_text(s), str) + assert utils.to_text(s) == "abcdefg" + + ubytes = type("ubytes", (bytes,), {}) + assert isinstance(utils.to_binary(ubytes(s)), bytes) + assert utils.to_binary(ubytes(s)) == b"abcdefg" + + s = "abcdefg" + assert isinstance(utils.to_binary(s), bytes) + assert utils.to_binary(s) == b"abcdefg" + assert isinstance(utils.to_str(s), str) + assert utils.to_str(s) == "abcdefg" + assert isinstance(utils.to_text(s), str) + assert utils.to_text(s) == "abcdefg" + + uunicode = type("uunicode", (str,), {}) + assert isinstance(utils.to_text(uunicode(s)), str) + assert utils.to_text(uunicode(s)) == "abcdefg" + + with pytest.raises(TypeError): + utils.to_binary(utils) + with pytest.raises(TypeError): + utils.to_str(utils) + with pytest.raises(TypeError): + utils.to_text(utils) + + +def test_tokenize(): + import shutil + import tempfile + + class TestEnum(Enum): + VAL1 = "val1" + + tempdir = tempfile.mkdtemp("mars_test_utils_") + try: + filename = os.path.join(tempdir, "test_npa.dat") + mmp_array = np.memmap(filename, dtype=float, mode="w+", shape=(3, 4)) + mmp_array[:] = np.random.random((3, 4)).astype(float) + mmp_array.flush() + del mmp_array + + mmp_array1 = np.memmap(filename, dtype=float, shape=(3, 4)) + mmp_array2 = np.memmap(filename, dtype=float, shape=(3, 4)) + + try: + v = [ + 1, + 2.3, + "456", + "789", + b"101112", + 2147483649, + None, + np.ndarray, + [912, "uvw"], + np.arange(0, 10), + np.array(10), + np.array([b"\x01\x32\xff"]), + np.int64, + TestEnum.VAL1, + ] + copy_v = copy.deepcopy(v) + assert utils.tokenize(v + [mmp_array1], ext_data=1234) == utils.tokenize( + copy_v + [mmp_array2], ext_data=1234 + ) + finally: + del mmp_array1, mmp_array2 + finally: + shutil.rmtree(tempdir) + 
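+    # tokenize is content-based rather than identity-based: structurally equal
+    # values (including deep copies) are expected to hash to the same token,
+    # which is what the remaining checks in this test assert for sets, dicts,
+    # nested containers, pandas objects and functions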
+ v = {"a", "xyz", "uvw"} + assert utils.tokenize(v) == utils.tokenize(copy.deepcopy(v)) + + v = dict(x="abcd", y=98765) + assert utils.tokenize(v) == utils.tokenize(copy.deepcopy(v)) + + v = dict(x=dict(a=1, b=[1, 2, 3]), y=12345) + assert utils.tokenize(v) == utils.tokenize(copy.deepcopy(v)) + + # pandas relative + if pd is not None: + df = pd.DataFrame( + [[utils.to_binary("测试"), utils.to_text("数据")]], + index=["a"], + columns=["中文", "data"], + ) + v = [df, df.index, df.columns, df["data"], pd.Categorical(list("ABCD"))] + assert utils.tokenize(v) == utils.tokenize(copy.deepcopy(v)) + + class NonTokenizableCls: + def __getstate__(self): + raise SystemError + + with pytest.raises(TypeError): + utils.tokenize(NonTokenizableCls()) + + class CustomizedTokenize(object): + def __mars_tokenize__(self): + return id(type(self)), id(NonTokenizableCls) + + assert utils.tokenize(CustomizedTokenize()) == utils.tokenize(CustomizedTokenize()) + + v = lambda x: x + 1 + assert utils.tokenize(v) == utils.tokenize(copy.deepcopy(v)) + + def f(a, b): + return np.add(a, b) + + assert utils.tokenize(f) == utils.tokenize(copy.deepcopy(f)) + + partial_f = partial(f, 1, k=0) + partial_f2 = partial(f, 1, k=1) + assert utils.tokenize(partial_f) == utils.tokenize(copy.deepcopy(partial_f)) + assert utils.tokenize(partial_f) != utils.tokenize(partial_f2) + + +def test_lazy_import(): + old_sys_path = sys.path + mock_mod = textwrap.dedent( + """ + __version__ = '0.1.0b1' + """.strip() + ) + mock_mod2 = textwrap.dedent( + """ + from mars.utils import lazy_import + mock_mod = lazy_import("mock_mod") + + def get_version(): + return mock_mod.__version__ + """ + ) + + temp_dir = tempfile.mkdtemp(prefix="mars-utils-test-") + sys.path += [temp_dir] + try: + with open(os.path.join(temp_dir, "mock_mod.py"), "w") as outf: + outf.write(mock_mod) + with open(os.path.join(temp_dir, "mock_mod2.py"), "w") as outf: + outf.write(mock_mod2) + + non_exist_mod = utils.lazy_import("non_exist_mod", locals=locals()) + assert non_exist_mod is None + + non_exist_mod1 = utils.lazy_import("non_exist_mod1", placeholder=True) + with pytest.raises(AttributeError) as ex_data: + non_exist_mod1.meth() + assert "required" in str(ex_data.value) + + mod = utils.lazy_import( + "mock_mod", globals=globals(), locals=locals(), rename="mod" + ) + assert mod is not None + assert mod.__version__ == "0.1.0b1" + + glob = globals().copy() + mod = utils.lazy_import("mock_mod", globals=glob, locals=locals(), rename="mod") + glob["mod"] = mod + assert mod is not None + assert mod.__version__ == "0.1.0b1" + assert type(glob["mod"]).__name__ == "module" + + import mock_mod2 as mod2 + + assert type(mod2.mock_mod).__name__ != "module" + assert mod2.get_version() == "0.1.0b1" + assert type(mod2.mock_mod).__name__ == "module" + finally: + shutil.rmtree(temp_dir) + sys.path = old_sys_path + sys.modules.pop("mock_mod", None) + sys.modules.pop("mock_mod2", None) + + +def test_chunks_indexer(): + a = mt.ones((3, 4, 5), chunk_size=2) + a = tile(a) + + assert a.chunk_shape == (2, 2, 3) + + with pytest.raises(ValueError): + _ = a.cix[1] + with pytest.raises(ValueError): + _ = a.cix[1, :] + + chunk_key = a.cix[0, 0, 0].key + expected = a.chunks[0].key + assert chunk_key == expected + + # as chunks[9] and chunks[10] shares the same shape, + # their keys should be equal. 
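+    # (the flat position of index (1, 1, 1) in the (2, 2, 3) chunk grid is
+    # 1 * (2 * 3) + 1 * 3 + 1 = 10; chunks[9] at index (1, 1, 0) has the same
+    # (1, 2, 2) shape, hence the identical key)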
+ chunk_key = a.cix[1, 1, 1].key + expected = a.chunks[9].key + assert chunk_key == expected + + chunk_key = a.cix[1, 1, 2].key + expected = a.chunks[11].key + assert chunk_key == expected + + chunk_key = a.cix[0, -1, -1].key + expected = a.chunks[5].key + assert chunk_key == expected + + chunk_key = a.cix[0, -1, -1].key + expected = a.chunks[5].key + assert chunk_key == expected + + chunk_keys = [c.key for c in a.cix[0, 0, :]] + expected = [c.key for c in [a.cix[0, 0, 0], a.cix[0, 0, 1], a.cix[0, 0, 2]]] + assert chunk_keys == expected + + chunk_keys = [c.key for c in a.cix[:, 0, :]] + expected = [ + c.key + for c in [ + a.cix[0, 0, 0], + a.cix[0, 0, 1], + a.cix[0, 0, 2], + a.cix[1, 0, 0], + a.cix[1, 0, 1], + a.cix[1, 0, 2], + ] + ] + assert chunk_keys == expected + + chunk_keys = [c.key for c in a.cix[:, :, :]] + expected = [c.key for c in a.chunks] + assert chunk_keys == expected + + +def test_require_not_none(): + @utils.require_not_none(1) + def should_exist(): + pass + + assert should_exist is not None + + @utils.require_not_none(None) + def should_not_exist(): + pass + + assert should_not_exist is None + + @utils.require_module("numpy.fft") + def should_exist_np(): + pass + + assert should_exist_np is not None + + @utils.require_module("numpy.fft_error") + def should_not_exist_np(): + pass + + assert should_not_exist_np is None + + +def test_type_dispatcher(): + dispatcher = utils.TypeDispatcher() + + type1 = type("Type1", (), {}) + type2 = type("Type2", (type1,), {}) + type3 = type("Type3", (), {}) + type4 = type("Type4", (type2,), {}) + type5 = type("Type5", (type4,), {}) + + dispatcher.register(object, lambda x: "Object") + dispatcher.register(type1, lambda x: "Type1") + dispatcher.register(type4, lambda x: "Type4") + dispatcher.register("pandas.DataFrame", lambda x: "DataFrame") + dispatcher.register(utils.NamedType("ray", type1), lambda x: "RayType1") + + assert "Type1" == dispatcher(type2()) + assert "DataFrame" == dispatcher(pd.DataFrame()) + assert "Object" == dispatcher(type3()) + + tp = utils.NamedType("ray", type1) + assert dispatcher.get_handler(tp)(tp) == "RayType1" + tp = utils.NamedType("ray", type2) + assert dispatcher.get_handler(tp)(tp) == "RayType1" + tp = utils.NamedType("xxx", type2) + assert dispatcher.get_handler(tp)(tp) == "Type1" + assert "Type1" == dispatcher(type2()) + tp = utils.NamedType("ray", type5) + assert dispatcher.get_handler(tp)(tp) == "Type4" + + dispatcher.unregister(object) + with pytest.raises(KeyError): + dispatcher(type3()) + + +def test_fixed_size_file_object(): + arr = [str(i).encode() * 20 for i in range(10)] + bts = os.linesep.encode().join(arr) + bio = BytesIO(bts) + + ref_bio = BytesIO(bio.read(100)) + bio.seek(0) + ref_bio.seek(0) + fix_bio = utils.FixedSizeFileObject(bio, 100) + + assert ref_bio.readline() == fix_bio.readline() + assert ref_bio.tell() == fix_bio.tell() + pos = ref_bio.tell() + 10 + assert ref_bio.seek(pos) == fix_bio.seek(pos) + assert ref_bio.read(5) == fix_bio.read(5) + assert ref_bio.readlines(25) == fix_bio.readlines(25) + assert list(ref_bio) == list(fix_bio) + + +def test_timer(): + with utils.Timer() as timer: + time.sleep(0.1) + + assert timer.duration >= 0.1 + + +def test_quiet_stdio(): + old_stdout, old_stderr = sys.stdout, sys.stderr + + class _IOWrapper: + def __init__(self, name=None): + self.name = name + self.content = "" + + @staticmethod + def writable(): + return True + + def write(self, d): + self.content += d + return len(d) + + stdout_w = _IOWrapper("stdout") + stderr_w = _IOWrapper("stderr") + 
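+    # the single-worker executor prints from a separate thread; quiet_stdio is
+    # expected to silence only the thread that entered it, which is why the
+    # assertion below expects "LINE T" (printed by the worker) to reach the
+    # captured stdout while the main thread's prints inside the context do not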
executor = ThreadPoolExecutor(1) + try: + sys.stdout = stdout_w + sys.stderr = stderr_w + + with utils.quiet_stdio(): + with utils.quiet_stdio(): + assert sys.stdout.writable() + assert sys.stderr.writable() + + print("LINE 1", end="\n") + print("LINE 2", file=sys.stderr, end="\n") + executor.submit(print, "LINE T").result() + print("LINE 3", end="\n") + + print("LINE 1", end="\n") + print("LINE 2", file=sys.stderr, end="\n") + finally: + sys.stdout, sys.stderr = old_stdout, old_stderr + executor.shutdown(False) + + assert stdout_w.content == "LINE T\nLINE 1\n" + assert stderr_w.content == "LINE 2\n" + + +@pytest.mark.asyncio +@pytest.mark.skipif( + sys.version_info[:2] < (3, 7), + reason="asyncio task timeout detector is not supported on python versions below 3.7", +) +async def test_asyncio_task_timeout_detector(): + log_file_name = "test_asyncio_task_timeout_detector.log" + try: + os.environ["MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_CHECK_INTERVAL"] = "1" + p = multiprocessing.Process( + target=_run_task_timeout_detector, args=(log_file_name,) + ) + p.start() + while p.is_alive(): + await asyncio.sleep(0.1) + with open(log_file_name, "r") as f: + detector_log = f.read() + assert "timeout_func" in detector_log + finally: + os.environ.pop("MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_CHECK_INTERVAL") + if os.path.exists(log_file_name): + os.remove(log_file_name) + + +def _run_task_timeout_detector(log_file_name): + from ..utils import logger, register_asyncio_task_timeout_detector + + fh = logging.FileHandler(log_file_name) + fh.setLevel(logging.INFO) + logger.addHandler(fh) + + async def timeout_func(): + await asyncio.sleep(2) + + async def main(): + task = register_asyncio_task_timeout_detector() + await asyncio.create_task(timeout_func()) + task.cancel() + + asyncio.run(main()) + + +def test_module_placeholder(): + required_module = utils.ModulePlaceholder("required_module") + + with pytest.raises(AttributeError): + required_module() + with pytest.raises(AttributeError) as e: + required_module.method() + msg = e.value.args[0] + assert msg == "required_module is required but not installed." 
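+# A minimal usage sketch (module and function names purely illustrative) of how
+# the placeholder behaviour asserted above is typically relied on for optional
+# dependencies: the import site never fails, only the first attribute access
+# does.
+#
+#     from xorbits._mars.utils import lazy_import
+#
+#     cupy = lazy_import("cupy", placeholder=True)
+#
+#     def to_gpu(arr):
+#         # raises AttributeError("cupy is required but not installed.")
+#         # only when the optional dependency is actually touched
+#         return cupy.asarray(arr)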
+ + +def test_merge_dict(): + from ..utils import merge_dict + + assert merge_dict({}, {1: 2}) == {1: 2} + assert merge_dict({1: 2}, {}) == {1: 2} + assert merge_dict( + {"a": {1: 2}, "b": {2: 3}, "c": {1: {2: 3}}}, + {"a": {1: 3}, "b": {2: 3}, "c": {1: {2: 4}}}, + ) == {"a": {1: 3}, "b": {2: 3}, "c": {1: {2: 4}}} + with pytest.raises(ValueError): + merge_dict({"a": {1: 2}, "b": {2: 3}}, {"a": {1: 3}}, overwrite=False) + + +def test_flatten_dict_to_nested_dict(): + from ..utils import flatten_dict_to_nested_dict + + assert flatten_dict_to_nested_dict({}) == {} + with pytest.raises(ValueError): + flatten_dict_to_nested_dict({"a.b.c": 1, "a.b": 2}) + assert flatten_dict_to_nested_dict({"a.b.c": 1, "a.b.d": 2}) == { + "a": {"b": {"c": 1, "d": 2}} + } + + +def test_readable_size(): + assert utils.readable_size(32) == "32.00" + assert utils.readable_size(14354) == "14.02K" + assert utils.readable_size(14354000) == "13.69M" + assert utils.readable_size(14354000000) == "13.37G" + assert utils.readable_size(14354000000000) == "13.05T" + + +def test_estimate_pandas_size(): + df1 = pd.DataFrame(np.random.rand(50, 10)) + assert utils.estimate_pandas_size(df1) == sys.getsizeof(df1) + + df2 = pd.DataFrame(np.random.rand(1000, 10)) + assert utils.estimate_pandas_size(df2) == sys.getsizeof(df2) + + df3 = pd.DataFrame( + { + "A": np.random.choice(["abcd", "def", "gh"], size=(1000,)), + "B": np.random.rand(1000), + "C": np.random.rand(1000), + } + ) + assert utils.estimate_pandas_size(df3) != sys.getsizeof(df3) + + s1 = pd.Series(np.random.rand(1000)) + assert utils.estimate_pandas_size(s1) == sys.getsizeof(s1) + + from ..dataframe.arrays import ArrowStringArray + + array = ArrowStringArray(np.random.choice(["abcd", "def", "gh"], size=(1000,))) + s2 = pd.Series(array) + assert utils.estimate_pandas_size(s2) == sys.getsizeof(s2) + + s3 = pd.Series(np.random.choice(["abcd", "def", "gh"], size=(1000,))) + assert utils.estimate_pandas_size(s3) != sys.getsizeof(s3) + assert ( + pytest.approx(utils.estimate_pandas_size(s3) / sys.getsizeof(s3), abs=0.5) == 1 + ) + + idx1 = pd.MultiIndex.from_arrays( + [np.arange(0, 1000), np.random.choice(["abcd", "def", "gh"], size=(1000,))] + ) + assert utils.estimate_pandas_size(idx1) == sys.getsizeof(idx1) + + string_idx = pd.Index(np.random.choice(["a", "bb", "cc"], size=(1000,))) + assert utils.estimate_pandas_size(string_idx) != sys.getsizeof(string_idx) + assert ( + pytest.approx( + utils.estimate_pandas_size(string_idx) / sys.getsizeof(string_idx), abs=0.5 + ) + == 1 + ) + + # dataframe with multi index + idx2 = pd.MultiIndex.from_arrays( + [np.arange(0, 1000), np.random.choice(["abcd", "def", "gh"], size=(1000,))] + ) + df4 = pd.DataFrame( + { + "A": np.random.choice(["abcd", "def", "gh"], size=(1000,)), + "B": np.random.rand(1000), + "C": np.random.rand(1000), + }, + index=idx2, + ) + assert utils.estimate_pandas_size(df4) != sys.getsizeof(df4) + assert ( + pytest.approx(utils.estimate_pandas_size(df4) / sys.getsizeof(df4), abs=0.5) + == 1 + ) + + # series with multi index + idx3 = pd.MultiIndex.from_arrays( + [ + np.random.choice(["a1", "a2", "a3"], size=(1000,)), + np.random.choice(["abcd", "def", "gh"], size=(1000,)), + ] + ) + s4 = pd.Series(np.arange(1000), index=idx3) + + assert utils.estimate_pandas_size(s4) == sys.getsizeof(s4) + + +@require_ray +def test_web_serialize_lambda(): + df = md.DataFrame( + mt.random.rand(10_0000, 4, chunk_size=1_0000), columns=list("abcd") + ) + r = df.apply(lambda x: x) + graph = TileableGraph([r]) + s = 
utils.serialize_serializable(graph) + f = utils.deserialize_serializable(s) + assert isinstance(f, TileableGraph) + + +def test_get_func_token_values(): + from ..utils import _get_func_token_values + + assert _get_func_token_values(test_get_func_token_values) == [ + test_get_func_token_values.__code__.co_code + ] + captured_vars = [1, 2, 3] + + def closure_func(a, b): + return captured_vars + + assert _get_func_token_values(closure_func)[1][0] == captured_vars + assert _get_func_token_values(partial(closure_func, 1))[0][0] == 1 + assert _get_func_token_values(partial(closure_func, 1))[-1][0] == captured_vars + + from .._utils import ceildiv + + assert _get_func_token_values(ceildiv) == [ceildiv.__module__, ceildiv.__name__] + + class Func: + def __call__(self, *args, **kwargs): + pass + + func = Func() + assert _get_func_token_values(func) == [func] + + +@pytest.mark.parametrize("id_length", [0, 5, 32, 63]) +def test_gen_random_id(id_length): + rnd_id = utils.new_random_id(id_length) + assert len(rnd_id) == id_length + + +@pytest.mark.asyncio +async def test_retry_callable(): + assert utils.retry_callable(lambda x: x)(1) == 1 + assert utils.retry_callable(lambda x: 0)(1) == 0 + + class CustomException(BaseException): + pass + + def f1(x): + nonlocal num_retried + num_retried += 1 + if num_retried == 3: + return x + raise CustomException + + num_retried = 0 + with pytest.raises(CustomException): + utils.retry_callable(f1)(1) + assert utils.retry_callable(f1, ex_type=CustomException)(1) == 1 + num_retried = 0 + with pytest.raises(CustomException): + utils.retry_callable(f1, max_retries=2, ex_type=CustomException)(1) + num_retried = 0 + assert utils.retry_callable(f1, max_retries=3, ex_type=CustomException)(1) == 1 + + async def f2(x): + return f1(x) + + num_retried = 0 + with pytest.raises(CustomException): + await utils.retry_callable(f2)(1) + assert await utils.retry_callable(f2, ex_type=CustomException)(1) == 1 + + +@require_cudf +def test_calc_data_size_gpu(): + import cudf + + df = pd.DataFrame({"a": ["a", "b", "a"]}, dtype="category") + df = cudf.from_pandas(df) + assert utils.calc_data_size(df) > 0 diff --git a/python/xorbits/_mars/typing.py b/python/xorbits/_mars/typing.py new file mode 100644 index 000000000..9c987f655 --- /dev/null +++ b/python/xorbits/_mars/typing.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from typing import Tuple, TypeVar +except ImportError: # pragma: no cover + # in some scenario (for instance, pycharm debug), `mars.typing` + # could be mistakenly imported as builtin typing. Code below + # resolves this issue. 
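+    # (concretely: temporarily strip this source tree's paths from sys.path,
+    # evict the wrongly cached "typing" entry from sys.modules, re-import the
+    # standard-library module, then restore sys.path)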
+ import os + import sys + + _orig_sys_path = list(sys.path) + _mars_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + try: + sys.path = [p for p in sys.path if not p.startswith(_mars_path)] + sys.modules.pop("typing", None) + from typing import Tuple, TypeVar + finally: + sys.path = _orig_sys_path + del _orig_sys_path, _mars_path + +OperandType = TypeVar("OperandType") +TileableType = TypeVar("TileableType") +ChunkType = TypeVar("ChunkType") +EntityType = TypeVar("EntityType") +SessionType = TypeVar("SessionType") + +ClusterType = TypeVar("ClusterType") +ClientType = TypeVar("ClientType") + +BandType = Tuple[str, str] # (band address, resource_type) diff --git a/python/xorbits/_mars/utils.py b/python/xorbits/_mars/utils.py new file mode 100644 index 000000000..dc7f6ee8b --- /dev/null +++ b/python/xorbits/_mars/utils.py @@ -0,0 +1,1891 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import dataclasses +import datetime +import enum +import functools +import importlib +import inspect +import io +import itertools +import logging +import numbers +import operator +import os +import pkgutil +import random +import shutil +import socket +import struct +import sys +import threading +import time +import types +import uuid +import warnings +import weakref +import zlib +from abc import ABC +from contextlib import contextmanager +from types import TracebackType +from typing import ( + Any, + Callable, + Dict, + List, + Mapping, + NamedTuple, + Optional, + Set, + Tuple, + Type, + Union, +) +from urllib.parse import urlparse + +import cloudpickle as pickle +import numpy as np +import pandas as pd + +from ._utils import ( # noqa: F401 # pylint: disable=unused-import + NamedType, + Timer, + TypeDispatcher, + ceildiv, + new_random_id, + register_tokenizer, + reset_id_random_seed, + to_binary, + to_str, + to_text, + tokenize, + tokenize_int, +) +from .constants import MARS_LOG_PATH_KEY +from .lib.version import parse as parse_version +from .typing import ChunkType, EntityType, OperandType, TileableType + +logger = logging.getLogger(__name__) +random.seed(int(time.time()) * os.getpid()) +pd_release_version: Tuple[int] = parse_version(pd.__version__).release + +OBJECT_FIELD_OVERHEAD = 50 + +# make flake8 happy by referencing these imports +NamedType = NamedType +TypeDispatcher = TypeDispatcher +tokenize = tokenize +register_tokenizer = register_tokenizer +ceildiv = ceildiv +reset_id_random_seed = reset_id_random_seed +new_random_id = new_random_id +_create_task = asyncio.create_task +_is_ci = (os.environ.get("CI") or "0").lower() in ("1", "true") + + +# fix encoding conversion problem under windows +if sys.platform.startswith("win"): + + def _replace_default_encoding(func): + def _fun(s, encoding=None): + encoding = encoding or getattr(sys.stdout, "encoding", None) or "mbcs" + return func(s, encoding=encoding) + + _fun.__name__ = func.__name__ + _fun.__doc__ = func.__doc__ + return _fun + + to_binary = _replace_default_encoding(to_binary) 
+ to_text = _replace_default_encoding(to_text) + to_str = _replace_default_encoding(to_str) + + +try: + from pandas._libs import lib as _pd__libs_lib + from pandas._libs.lib import NoDefault, no_default + + _raw__reduce__ = type(NoDefault).__reduce__ + + def _no_default__reduce__(self): + if self is not NoDefault: + return _raw__reduce__(self) + else: # pragma: no cover + return getattr, (_pd__libs_lib, "NoDefault") + + if hasattr(_pd__libs_lib, "_NoDefault"): # pragma: no cover + # need to patch __reduce__ to make sure it can be properly unpickled + type(NoDefault).__reduce__ = _no_default__reduce__ + else: + # introduced in pandas 1.5.0 : register for pickle compatibility + _pd__libs_lib._NoDefault = NoDefault +except ImportError: # pragma: no cover + + class NoDefault(enum.Enum): + no_default = "NO_DEFAULT" + + def __repr__(self) -> str: + return "" + + no_default = NoDefault.no_default + + try: + # register for pickle compatibility + from pandas._libs import lib as _pd__libs_lib + + _pd__libs_lib.NoDefault = NoDefault + except (ImportError, AttributeError): + pass + + +class AttributeDict(dict): + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"'AttributeDict' object has no attribute {item}") + + +def get_bool_environ(var_name: str) -> Optional[bool]: + var_value = os.environ.get(var_name) + if not var_value: + return None + return bool(int(var_value)) + + +def on_serialize_shape(shape: Tuple[int]): + if shape: + return tuple(s if not np.isnan(s) else -1 for s in shape) + return shape + + +def on_deserialize_shape(shape: Tuple[int]): + if shape: + return tuple(s if s != -1 else np.nan for s in shape) + return shape + + +def on_serialize_numpy_type(value: np.dtype): + if value is pd.NaT: + value = None + return value.item() if isinstance(value, np.generic) else value + + +def on_serialize_nsplits(value: Tuple[Tuple[int]]): + if value is None: + return None + new_nsplits = [] + for dim_splits in value: + new_nsplits.append(tuple(None if pd.isna(v) else v for v in dim_splits)) + return tuple(new_nsplits) + + +_memory_size_indices = {"": 0, "k": 1, "m": 2, "g": 3, "t": 4} + + +def calc_size_by_str( + value: Union[str, int, None], total: Union[int, None] +) -> Optional[int]: + if value is None: + return None + if isinstance(value, int): + return value + mem_limit, is_percent = parse_readable_size(value) + if is_percent: + return int(total * mem_limit) if total is not None else None + else: + return int(mem_limit) + + +def parse_readable_size(value: Union[str, int, float]) -> Tuple[float, bool]: + if isinstance(value, numbers.Number): + return float(value), False + + value = value.strip().lower() + num_pos = 0 + while num_pos < len(value) and value[num_pos] in "0123456789.-": + num_pos += 1 + + value, suffix = value[:num_pos], value[num_pos:] + suffix = suffix.strip() + if suffix.endswith("%"): + return float(value) / 100, True + + try: + return float(value) * (1024 ** _memory_size_indices[suffix[:1]]), False + except (ValueError, KeyError): + raise ValueError(f"Unknown limitation value: {value}") + + +def readable_size(size: int, trunc: bool = False) -> str: + if size < 1024: + ret_size = size + size_unit = "" + elif 1024 <= size < 1024**2: + ret_size = size * 1.0 / 1024 + size_unit = "K" + elif 1024**2 <= size < 1024**3: + ret_size = size * 1.0 / (1024**2) + size_unit = "M" + elif 1024**3 <= size < 1024**4: + ret_size = size * 1.0 / (1024**3) + size_unit = "G" + else: + ret_size = size * 1.0 / (1024**4) + size_unit = "T" + + if not trunc: 
+ return "{0:.2f}{1}".format(ret_size, size_unit) + else: + return f"{int(ret_size)}{size_unit}" + + +_git_info = None + + +class GitInfo(NamedTuple): + commit_hash: str + commit_ref: str + + +def git_info(): + from ._version import get_versions + + global _git_info + + if _git_info is not None: + return _git_info + + versions = get_versions() + _git_info = GitInfo(versions["full-revisionid"], versions["version"]) + return _git_info + + +LOW_PORT_BOUND = 10000 +HIGH_PORT_BOUND = 65535 +_local_occupied_ports = set() + + +def _get_ports_from_netstat() -> Set[int]: + import subprocess + + while True: + p = subprocess.Popen("netstat -a -n -p tcp".split(), stdout=subprocess.PIPE) + try: + outs, _ = p.communicate(timeout=5) + outs = outs.split(to_binary(os.linesep)) + occupied = set() + for line in outs: + if b"." not in line: + continue + line = to_str(line) + for part in line.split(): + # in windows, netstat uses ':' to separate host and port + part = part.replace(":", ".") + if "." in part: + _, port_str = part.rsplit(".", 1) + if port_str == "*": + continue + port = int(port_str) + if LOW_PORT_BOUND <= port <= HIGH_PORT_BOUND: + occupied.add(int(port_str)) + break + return occupied + except subprocess.TimeoutExpired: + p.kill() + continue + + +def get_next_port(typ: int = None, occupy: bool = True) -> int: + import psutil + + if sys.platform.lower().startswith("win"): + occupied = _get_ports_from_netstat() + else: + try: + conns = psutil.net_connections() + typ = typ or socket.SOCK_STREAM + occupied = set( + sc.laddr.port + for sc in conns + if sc.type == typ and LOW_PORT_BOUND <= sc.laddr.port <= HIGH_PORT_BOUND + ) + except psutil.AccessDenied: + occupied = _get_ports_from_netstat() + + occupied.update(_local_occupied_ports) + random.seed(uuid.uuid1().bytes) + randn = random.randint(0, 100000000) + + idx = int(randn % (1 + HIGH_PORT_BOUND - LOW_PORT_BOUND - len(occupied))) + for i in range(LOW_PORT_BOUND, HIGH_PORT_BOUND + 1): + if i in occupied: + continue + if idx == 0: + if occupy: + _local_occupied_ports.add(i) + return i + idx -= 1 + raise SystemError("No ports available.") + + +@functools.lru_cache(200) +def mod_hash(val: Any, modulus: int): + return tokenize_int(val) % modulus + + +class classproperty: + def __init__(self, f): + self.f = f + + def __get__(self, obj, owner): + return self.f(owner) + + +def lazy_import( + name: str, + package: str = None, + globals: Dict = None, # pylint: disable=redefined-builtin + locals: Dict = None, # pylint: disable=redefined-builtin + rename: str = None, + placeholder: bool = False, +): + rename = rename or name + prefix_name = name.split(".", 1)[0] + globals = globals or inspect.currentframe().f_back.f_globals + + class LazyModule: + def __init__(self): + self._on_loads = [] + + def __getattr__(self, item): + if item.startswith("_pytest") or item in ("__bases__", "__test__"): + raise AttributeError(item) + + real_mod = importlib.import_module(name, package=package) + if rename in globals: + globals[rename] = real_mod + elif locals is not None: + locals[rename] = real_mod + ret = getattr(real_mod, item) + for on_load_func in self._on_loads: + on_load_func() + # make sure on_load hooks only executed once + self._on_loads = [] + return ret + + def add_load_handler(self, func: Callable): + self._on_loads.append(func) + return func + + if pkgutil.find_loader(prefix_name) is not None: + return LazyModule() + elif placeholder: + return ModulePlaceholder(prefix_name) + else: + return None + + +def lazy_import_on_load(lazy_mod): + def wrapper(fun): + if 
lazy_mod is not None and hasattr(lazy_mod, "add_load_handler"): + lazy_mod.add_load_handler(fun) + return fun + + return wrapper + + +class ModulePlaceholder: + def __init__(self, mod_name: str): + self._mod_name = mod_name + + def _raises(self): + raise AttributeError(f"{self._mod_name} is required but not installed.") + + def __getattr__(self, key): + self._raises() + + def __call__(self, *_args, **_kwargs): + self._raises() + + +def serialize_serializable(serializable, compress: bool = False): + from .serialization import serialize + + bio = io.BytesIO() + header, buffers = serialize(serializable) + buf_sizes = [getattr(buf, "nbytes", len(buf)) for buf in buffers] + header[0]["buf_sizes"] = buf_sizes + s_header = pickle.dumps(header) + bio.write(struct.pack("= 1.0 + try: + from ray.worker import global_worker + + global_worker.check_connected() + context = global_worker.get_serialization_context() + context.register_custom_serializer( + obj_type, serializer=serializer, deserializer=deserializer + ) + except AttributeError: # ray >= 1.2.0 + ray.util.register_serializer( + obj_type, serializer=serializer, deserializer=deserializer + ) + except ImportError: + pass + + +cudf = lazy_import("cudf") + + +def _get_dtype_itemsize(dt: Union[np.dtype, pd.api.extensions.ExtensionDtype]) -> int: + try: + return dt.itemsize + except AttributeError: + if cudf and isinstance(dt, cudf.CategoricalDtype): + return dt.to_pandas().itemsize + raise + + +def calc_data_size(dt: Any, shape: Tuple[int] = None) -> int: + from .dataframe.core import IndexValue + + if dt is None: + return 0 + + if isinstance(dt, tuple): + return sum(calc_data_size(c) for c in dt) + + shape = getattr(dt, "shape", None) or shape + if isinstance(dt, (pd.DataFrame, pd.Series)): + return estimate_pandas_size(dt) + if hasattr(dt, "estimate_size"): + return dt.estimate_size() + if hasattr(dt, "nbytes"): + return max(sys.getsizeof(dt), dt.nbytes) + if hasattr(dt, "shape") and len(dt.shape) == 0: + return 0 + if hasattr(dt, "dtypes") and shape is not None: + size = shape[0] * sum(_get_dtype_itemsize(dtype) for dtype in dt.dtypes) + try: + index_value_value = dt.index_value.value + if hasattr(index_value_value, "dtype") and not isinstance( + index_value_value, IndexValue.RangeIndex + ): + size += calc_data_size(index_value_value, shape=shape) + except AttributeError: + pass + return size + if hasattr(dt, "dtype") and shape is not None: + return shape[0] * dt.dtype.itemsize + + # object chunk + return sys.getsizeof(dt) + + +def estimate_pandas_size( + pd_obj, max_samples: int = 10, min_sample_rows: int = 100 +) -> int: + if len(pd_obj) <= min_sample_rows or isinstance(pd_obj, pd.RangeIndex): + return sys.getsizeof(pd_obj) + if isinstance(pd_obj, pd.MultiIndex): + # MultiIndex's sample size can't be used to estimate + return sys.getsizeof(pd_obj) + + from .dataframe.arrays import ArrowDtype + + def _is_fast_dtype(dtype): + if isinstance(dtype, np.dtype): + return np.issubdtype(dtype, np.number) + else: + return isinstance(dtype, ArrowDtype) + + dtypes = [] + is_series = False + if isinstance(pd_obj, pd.DataFrame): + dtypes.extend(pd_obj.dtypes) + index_obj = pd_obj.index + elif isinstance(pd_obj, pd.Series): + dtypes.append(pd_obj.dtype) + index_obj = pd_obj.index + is_series = True + else: + index_obj = pd_obj + + # handling possible MultiIndex + if hasattr(index_obj, "dtypes"): + dtypes.extend(index_obj.dtypes) + else: + dtypes.append(index_obj.dtype) + + if all(_is_fast_dtype(dtype) for dtype in dtypes): + return sys.getsizeof(pd_obj) + + 
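+    # slow path: object-like dtypes are involved, so estimate by sampling up to
+    # max_samples rows, measuring their deep memory usage and scaling by
+    # len(pd_obj) / max_samples; a MultiIndex is measured separately because
+    # its sampled per-row size would otherwise over-estimate the footprint
+    # (e.g. with max_samples=10 on a 1000-row frame the estimate is roughly
+    # sys.getsizeof(ten_sampled_rows) * 100)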
indices = np.sort(np.random.choice(len(pd_obj), size=max_samples, replace=False)) + iloc = pd_obj if isinstance(pd_obj, pd.Index) else pd_obj.iloc + if isinstance(index_obj, pd.MultiIndex): + # MultiIndex's sample size is much greater than expected, thus we calculate + # the size separately. + index_size = sys.getsizeof(pd_obj.index) + if is_series: + sample_frame_size = iloc[indices].memory_usage(deep=True, index=False) + else: + sample_frame_size = iloc[indices].memory_usage(deep=True, index=False).sum() + return index_size + sample_frame_size * len(pd_obj) // max_samples + else: + sample_size = sys.getsizeof(iloc[indices]) + return sample_size * len(pd_obj) // max_samples + + +def build_fetch_shuffle( + chunk: ChunkType, n_reducers=None, shuffle_fetch_type=None +) -> ChunkType: + from .core.operand import ShuffleFetchType, ShuffleProxy + + chunk_op = chunk.op + assert isinstance(chunk_op, ShuffleProxy) + params = chunk.params.copy() + n_mappers = len(chunk.inputs) + assert n_reducers > 0, n_reducers + # for shuffle nodes, we build FetchShuffle chunks + # to replace ShuffleProxy + if shuffle_fetch_type is ShuffleFetchType.FETCH_BY_INDEX: + # skip data keys info for `FETCH_BY_INDEX` + source_keys = None + else: + source_keys = [pinp.key for pinp in chunk.inputs] + op = chunk_op.get_fetch_op_cls(chunk)( + source_keys=source_keys, + n_mappers=n_mappers, + n_reducers=n_reducers, + shuffle_fetch_type=shuffle_fetch_type, + gpu=chunk.op.gpu, + ) + return op.new_chunk( + None, + is_broadcaster=chunk.is_broadcaster, + kws=[params], + _key=chunk.key, + _id=chunk.id, + ) + + +def build_fetch_chunk(chunk: ChunkType, **kwargs) -> ChunkType: + from .core.operand import ShuffleProxy + + chunk_op = chunk.op + params = chunk.params.copy() + assert not isinstance(chunk_op, ShuffleProxy) + # for non-shuffle nodes, we build Fetch chunks + # to replace original chunk + op = chunk_op.get_fetch_op_cls(chunk)(sparse=chunk.op.sparse, gpu=chunk.op.gpu) + return op.new_chunk( + None, + is_broadcaster=chunk.is_broadcaster, + kws=[params], + _key=chunk.key, + **kwargs, + ) + + +def build_fetch_tileable(tileable: TileableType) -> TileableType: + if tileable.is_coarse(): + chunks = None + else: + chunks = [] + for c in tileable.chunks: + fetch_chunk = build_fetch_chunk(c, index=c.index) + chunks.append(fetch_chunk) + + tileable_op = tileable.op + params = tileable.params.copy() + + new_op = tileable_op.get_fetch_op_cls(tileable)(_id=tileable_op.id) + return new_op.new_tileables( + None, + chunks=chunks, + nsplits=tileable.nsplits, + _key=tileable.key, + _id=tileable.id, + **params, + )[0] + + +def build_fetch(entity: EntityType) -> EntityType: + from .core import CHUNK_TYPE, ENTITY_TYPE + + if isinstance(entity, CHUNK_TYPE): + return build_fetch_chunk(entity) + elif isinstance(entity, ENTITY_TYPE): + return build_fetch_tileable(entity) + else: + raise TypeError(f"Type {type(entity)} not supported") + + +def get_chunk_reducer_index(chunk: ChunkType) -> Tuple[int]: + op = chunk.op + try: + return op.reducer_index + except AttributeError: + from .core.operand import Fuse + + if isinstance(op, Fuse): + return chunk.composed[0].op.reducer_index + else: # pragma: no cover + raise + + +def merge_chunks(chunk_results: List[Tuple[Tuple[int], Any]]) -> Any: + """ + Concatenate chunk results according to index. 
+ + Parameters + ---------- + chunk_results : list of tuple, {(chunk_idx, chunk_result), ...,} + + Returns + ------- + Data + """ + from sklearn.base import BaseEstimator + + from .dataframe.utils import ( + concat_on_columns, + get_xdf, + is_dataframe, + is_index, + is_series, + ) + from .lib.groupby_wrapper import GroupByWrapper + from .tensor.array_utils import get_array_module, is_array + + chunk_results = sorted(chunk_results, key=operator.itemgetter(0)) + v = chunk_results[0][1] + if len(chunk_results) == 1 and not (chunk_results[0][0]): + return v + if is_array(v): + xp = get_array_module(v) + ndim = v.ndim + for i in range(ndim - 1): + new_chunks = [] + for idx, cs in itertools.groupby(chunk_results, key=lambda t: t[0][:-1]): + new_chunks.append( + (idx, xp.concatenate([c[1] for c in cs], axis=ndim - i - 1)) + ) + chunk_results = new_chunks + to_concat = [c[1] for c in chunk_results] + if len(to_concat) == 1: + return to_concat[0] + concat_result = xp.concatenate(to_concat) + return concat_result + elif is_dataframe(v): + xdf = get_xdf(v) + concats = [] + for _, cs in itertools.groupby(chunk_results, key=lambda t: t[0][0]): + concats.append(concat_on_columns([c[1] for c in cs])) + return xdf.concat(concats, axis=0) + elif is_series(v): + xdf = get_xdf(v) + return xdf.concat([c[1] for c in chunk_results]) + elif is_index(v): + xdf = get_xdf(v) + df = xdf.concat([xdf.DataFrame(index=r[1]) for r in chunk_results]) + return df.index + elif isinstance(v, pd.Categorical): + categories = [r[1] for r in chunk_results] + arrays = [np.asarray(r) for r in categories] + array = np.concatenate(arrays) + return pd.Categorical( + array, categories=categories[0].categories, ordered=categories[0].ordered + ) + elif isinstance(v, GroupByWrapper): + df = pd.concat([r[1].obj for r in chunk_results], axis=0) + if not isinstance(v.keys, list): + keys = v.keys + else: + keys = [] + for idx, k in enumerate(v.keys): + if isinstance(k, pd.Series): + keys.append(pd.concat([r[1].keys[idx] for r in chunk_results])) + else: + keys.append(k) + grouped = GroupByWrapper( + df, + None, + keys=keys, + axis=v.axis, + level=v.level, + exclusions=v.exclusions, + selection=v.selection, + as_index=v.as_index, + sort=v.sort, + group_keys=v.group_keys, + squeeze=v.squeeze, + observed=v.observed, + mutated=v.mutated, + ) + return grouped.groupby_obj + elif isinstance(v, (str, bytes, memoryview, BaseEstimator)): + result = [r[1] for r in chunk_results] + if len(result) == 1: + return result[0] + return result + else: + result = None + for cr in chunk_results: + if cr[1] is None: + continue + if isinstance(cr[1], dict) and not cr[1]: + continue + if result is None: + result = cr[1] + result = result.item() if hasattr(result, "item") else result + else: + raise TypeError(f"unsupported type {type(v)}") + return result + + +def merged_chunk_as_tileable_type(merged, tileable: TileableType): + from .tensor.array_utils import get_array_module + from .tensor.core import TensorOrder + + if hasattr(tileable, "order") and tileable.ndim > 0: + module = get_array_module(merged) + if tileable.order == TensorOrder.F_ORDER and hasattr(module, "asfortranarray"): + merged = module.asfortranarray(merged) + elif tileable.order == TensorOrder.C_ORDER and hasattr( + module, "ascontiguousarray" + ): + merged = module.ascontiguousarray(merged) + if ( + hasattr(tileable, "isscalar") + and tileable.isscalar() + and getattr(merged, "size", None) == 1 + ): + merged = merged.item() + return merged + + +def calc_nsplits(chunk_idx_to_shape: 
Dict[Tuple[int], Tuple[int]]) -> Tuple[Tuple[int]]: + """ + Calculate a tiled entity's nsplits. + + Parameters + ---------- + chunk_idx_to_shape : Dict type, {chunk_idx: chunk_shape} + + Returns + ------- + nsplits + """ + ndim = len(next(iter(chunk_idx_to_shape))) + tileable_nsplits = [] + # for each dimension, record chunk shape whose index is zero on other dimensions + for i in range(ndim): + splits = [] + for index, shape in chunk_idx_to_shape.items(): + if all(idx == 0 for j, idx in enumerate(index) if j != i): + splits.append(shape[i]) + tileable_nsplits.append(tuple(splits)) + return tuple(tileable_nsplits) + + +def has_unknown_shape(*tiled_tileables: TileableType) -> bool: + for tileable in tiled_tileables: + if getattr(tileable, "shape", None) is None: + continue + if any(pd.isnull(s) for s in tileable.shape): + return True + if any(pd.isnull(s) for s in itertools.chain(*tileable.nsplits)): + return True + return False + + +def sbytes(x: Any) -> bytes: + # NB: bytes() in Python 3 has different semantic with Python 2, see: help(bytes) + from numbers import Number + + if x is None or isinstance(x, Number): + return bytes(str(x), encoding="ascii") + elif isinstance(x, list): + return bytes("[" + ", ".join([str(k) for k in x]) + "]", encoding="utf-8") + elif isinstance(x, tuple): + return bytes("(" + ", ".join([str(k) for k in x]) + ")", encoding="utf-8") + elif isinstance(x, str): + return bytes(x, encoding="utf-8") + else: + return bytes(x) + + +def copy_tileables(tileables: List[TileableType], **kwargs): + inputs = kwargs.pop("inputs", None) + copy_key = kwargs.pop("copy_key", True) + copy_id = kwargs.pop("copy_id", True) + if kwargs: + raise TypeError(f"got un unexpected keyword argument '{next(iter(kwargs))}'") + if len(tileables) > 1: + # cannot handle tileables with different operands here + # try to copy separately if so + if len({t.op for t in tileables}) != 1: + raise TypeError("All tileables' operands should be same.") + + op = tileables[0].op.copy().reset_key() + if copy_key: + op._key = tileables[0].op.key + kws = [] + for t in tileables: + params = t.params.copy() + if copy_key: + params["_key"] = t.key + if copy_id: + params["_id"] = t.id + params.update(t.extra_params) + kws.append(params) + inputs = inputs or op.inputs + return op.new_tileables(inputs, kws=kws, output_limit=len(kws)) + + +def require_not_none(obj: Any): + def wrap(func): + if obj is not None: + return func + else: + return + + return wrap + + +def require_module(module: str): + def wrap(func): + try: + importlib.import_module(module) + + @functools.wraps(func) + def inner(*args, **kwargs): + return func(*args, **kwargs) + + return inner + except ImportError: + return + + return wrap + + +def ignore_warning(func: Callable): + @functools.wraps(func) + def inner(*args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return func(*args, **kwargs) + + return inner + + +def flatten(nested_iterable: Union[List, Tuple]) -> List: + """ + Flatten a nested iterable into a list. 
+ + Parameters + ---------- + nested_iterable : list or tuple + an iterable which can contain other iterables + + Returns + ------- + flattened : list + + Examples + -------- + >>> flatten([[0, 1], [2, 3]]) + [0, 1, 2, 3] + >>> flatten([[0, 1], [[3], [4, 5]]]) + [0, 1, 3, 4, 5] + """ + + flattened = [] + stack = list(nested_iterable)[::-1] + while len(stack) > 0: + inp = stack.pop() + if isinstance(inp, (tuple, list)): + stack.extend(inp[::-1]) + else: + flattened.append(inp) + return flattened + + +def stack_back(flattened: List, raw: Union[List, Tuple]) -> Union[List, Tuple]: + """ + Organize a new iterable from a flattened list according to raw iterable. + + Parameters + ---------- + flattened : list + flattened list + raw: list + raw iterable + + Returns + ------- + ret : list + + Examples + -------- + >>> raw = [[0, 1], [2, [3, 4]]] + >>> flattened = flatten(raw) + >>> flattened + [0, 1, 2, 3, 4] + >>> a = [f + 1 for f in flattened] + >>> a + [1, 2, 3, 4, 5] + >>> stack_back(a, raw) + [[1, 2], [3, [4, 5]]] + """ + flattened_iter = iter(flattened) + result = list() + + def _stack(container, items): + for item in items: + if not isinstance(item, (list, tuple)): + container.append(next(flattened_iter)) + else: + new_container = list() + container.append(new_container) + _stack(new_container, item) + + return container + + return _stack(result, raw) + + +def build_fuse_chunk( + fused_chunks: List[ChunkType], + fuse_op_cls: Type[OperandType], + op_kw: Dict = None, + chunk_kw: Dict = None, +) -> ChunkType: + from .core.graph import ChunkGraph + + fuse_graph = ChunkGraph(fused_chunks) + for i, fuse_chunk in enumerate(fused_chunks): + fuse_graph.add_node(fuse_chunk) + if i > 0: + fuse_graph.add_edge(fused_chunks[i - 1], fuse_chunk) + + head_chunk = fused_chunks[0] + tail_chunk = fused_chunks[-1] + tail_chunk_op = tail_chunk.op + fuse_op = fuse_op_cls( + sparse=tail_chunk_op.sparse, + gpu=tail_chunk_op.gpu, + _key=tail_chunk_op.key, + fuse_graph=fuse_graph, + **(op_kw or dict()), + ) + return fuse_op.new_chunk( + head_chunk.inputs, + kws=[tail_chunk.params], + _key=tail_chunk.key, + _chunk=tail_chunk, + **(chunk_kw or dict()), + ) + + +def adapt_mars_docstring(doc: str) -> str: + """ + Adapt numpy-style docstrings to Mars docstring. + + This util function will add Mars imports, replace object references + and add execute calls. Note that check is needed after replacement. + """ + if doc is None: + return None + + lines = [] + first_prompt = True + prev_prompt = False + has_numpy = "np." in doc + has_pandas = "pd." 
in doc + + for line in doc.splitlines(): + sp = line.strip() + if sp.startswith(">>>") or sp.startswith("..."): + prev_prompt = True + if first_prompt: + first_prompt = False + indent = "".join(itertools.takewhile(lambda x: x in (" ", "\t"), line)) + if has_numpy: + lines.extend([indent + ">>> import mars.tensor as mt"]) + if has_pandas: + lines.extend([indent + ">>> import mars.dataframe as md"]) + line = line.replace("np.", "mt.").replace("pd.", "md.") + elif prev_prompt: + prev_prompt = False + if sp: + lines[-1] += ".execute()" + lines.append(line) + return "\n".join(lines) + + +class FixedSizeFileObject: + def __init__(self, file_obj, fixed_size): + self._file_obj = file_obj + self._cur = self._file_obj.tell() + self._size = fixed_size + self._end = self._cur + self._size + + def _get_size(self, size): + max_size = self._end - self._cur + if size is None: + return max_size + else: + return min(max_size, size) + + def read(self, size=None): + result = self._file_obj.read(self._get_size(size)) + self._cur = self._file_obj.tell() + return result + + def read1(self, size=None): + return self.read(size) + + def readline(self, size=None): + result = self._file_obj.readline(self._get_size(size)) + self._cur = self._file_obj.tell() + return result + + def readlines(self, size=None): + result = self._file_obj.readlines(self._get_size(size)) + self._cur = self._file_obj.tell() + return result + + def seek(self, offset): + self._cur = offset + return self._file_obj.seek(offset) + + def tell(self): + return self._file_obj.tell() + + def __next__(self): + while True: + result = self.readline() + if len(result) == 0: + raise StopIteration + else: + return result + + def __iter__(self): + while True: + try: + yield next(self) + except StopIteration: + return + + def __getattr__(self, item): # pragma: no cover + return getattr(self._file_obj, item) + + +def is_object_dtype(dtype: np.dtype) -> bool: + try: + return ( + np.issubdtype(dtype, np.object_) + or np.issubdtype(dtype, np.unicode_) + or np.issubdtype(dtype, np.bytes_) + ) + except TypeError: # pragma: no cover + return False + + +def get_dtype(dtype: Union[np.dtype, pd.api.extensions.ExtensionDtype]): + if pd.api.types.is_extension_array_dtype(dtype): + return dtype + elif dtype is pd.Timestamp or dtype is datetime.datetime: + return np.dtype("datetime64[ns]") + elif dtype is pd.Timedelta or dtype is datetime.timedelta: + return np.dtype("timedelta64[ns]") + else: + return np.dtype(dtype) + + +def calc_object_overhead(chunk: ChunkType, shape: Tuple[int]) -> int: + from .dataframe.core import ( + DATAFRAME_CHUNK_TYPE, + INDEX_CHUNK_TYPE, + SERIES_CHUNK_TYPE, + ) + + if not shape or np.isnan(shape[0]) or getattr(chunk, "dtypes", None) is None: + return 0 + + if isinstance(chunk, DATAFRAME_CHUNK_TYPE) and chunk.dtypes is not None: + n_strings = len([dt for dt in chunk.dtypes if is_object_dtype(dt)]) + if chunk.index_value and is_object_dtype( + getattr(chunk.index_value.value, "dtype", None) + ): + n_strings += 1 + elif isinstance(chunk, SERIES_CHUNK_TYPE) and chunk.dtype is not None: + n_strings = 1 if is_object_dtype(chunk.dtype) else 0 + if chunk.index_value and is_object_dtype( + getattr(chunk.index_value.value, "dtype", None) + ): + n_strings += 1 + elif isinstance(chunk, INDEX_CHUNK_TYPE) and chunk.dtype is not None: + n_strings = 1 if is_object_dtype(chunk.dtype) else 0 + else: + n_strings = 0 + return n_strings * shape[0] * OBJECT_FIELD_OVERHEAD + + +def arrow_array_to_objects( + obj: Union[pd.DataFrame, pd.Series] +) -> 
Union[pd.DataFrame, pd.Series]: + from .dataframe.arrays import ArrowDtype + + if isinstance(obj, pd.DataFrame): + if any(isinstance(dt, ArrowDtype) for dt in obj.dtypes): + # ArrowDtype exists + result = pd.DataFrame(columns=obj.columns) + for i, dtype in enumerate(obj.dtypes): + if isinstance(dtype, ArrowDtype): + result.iloc[:, i] = pd.Series( + obj.iloc[:, i].to_numpy(), index=obj.index + ) + else: + result.iloc[:, i] = obj.iloc[:, i] + obj = result + elif isinstance(obj, pd.Series): + if isinstance(obj.dtype, ArrowDtype): + obj = pd.Series(obj.to_numpy(), index=obj.index, name=obj.name) + return obj + + +_enter_counter = 0 +_initial_session = None + + +def enter_current_session(func: Callable): + @functools.wraps(func) + def wrapped(cls, ctx, op): + from .deploy.oscar.session import AbstractSession, get_default_session + + global _enter_counter, _initial_session + # skip in some test cases + if not hasattr(ctx, "get_current_session"): + return func(cls, ctx, op) + + with AbstractSession._lock: + if _enter_counter == 0: + # to handle nested call, only set initial session + # in first call + session = ctx.get_current_session() + _initial_session = get_default_session() + session.as_default() + _enter_counter += 1 + + try: + result = func(cls, ctx, op) + finally: + with AbstractSession._lock: + _enter_counter -= 1 + if _enter_counter == 0: + # set previous session when counter is 0 + if _initial_session: + _initial_session.as_default() + else: + AbstractSession.reset_default() + return result + + return wrapped + + +_io_quiet_local = threading.local() +_io_quiet_lock = threading.Lock() + + +class _QuietIOWrapper: + def __init__(self, wrapped): + self.wrapped = wrapped + + def __getattr__(self, item): + return getattr(self.wrapped, item) + + def write(self, d): + if getattr(_io_quiet_local, "is_wrapped", False): + return 0 + return self.wrapped.write(d) + + +@contextmanager +def quiet_stdio(): + """Quiets standard outputs when inferring types of functions""" + with _io_quiet_lock: + _io_quiet_local.is_wrapped = True + sys.stdout = _QuietIOWrapper(sys.stdout) + sys.stderr = _QuietIOWrapper(sys.stderr) + + try: + yield + finally: + with _io_quiet_lock: + sys.stdout = sys.stdout.wrapped + sys.stderr = sys.stderr.wrapped + if not isinstance(sys.stdout, _QuietIOWrapper): + _io_quiet_local.is_wrapped = False + + +def implements(f: Callable): + def decorator(g): + g.__doc__ = f.__doc__ + return g + + return decorator + + +def stringify_path(path: Union[str, os.PathLike]) -> str: + """ + Convert *path* to a string or unicode path if possible. 
+ """ + if isinstance(path, str): + return path + + # checking whether path implements the filesystem protocol + try: + return path.__fspath__() + except AttributeError: + raise TypeError("not a path-like object") + + +def find_objects(nested: Union[List, Dict], types: Union[Type, Tuple[Type]]) -> List: + found = [] + stack = [nested] + + while len(stack) > 0: + it = stack.pop() + if isinstance(it, types): + found.append(it) + continue + + if isinstance(it, (list, tuple, set)): + stack.extend(list(it)[::-1]) + elif isinstance(it, dict): + stack.extend(list(it.values())[::-1]) + + return found + + +def replace_objects(nested: Union[List, Dict], mapping: Mapping) -> Union[List, Dict]: + if not mapping: + return nested + + if isinstance(nested, dict): + vals = list(nested.values()) + else: + vals = list(nested) + + new_vals = [] + for val in vals: + if isinstance(val, (dict, list, tuple, set)): + new_val = replace_objects(val, mapping) + else: + try: + new_val = mapping.get(val, val) + except TypeError: + new_val = val + new_vals.append(new_val) + + if isinstance(nested, dict): + return type(nested)((k, v) for k, v in zip(nested.keys(), new_vals)) + else: + return type(nested)(new_vals) + + +# from https://github.com/ericvsmith/dataclasses/blob/master/dataclass_tools.py +# released under Apache License 2.0 +def dataslots(cls): + # Need to create a new class, since we can't set __slots__ + # after a class has been created. + + # Make sure __slots__ isn't already set. + if "__slots__" in cls.__dict__: # pragma: no cover + raise TypeError(f"{cls.__name__} already specifies __slots__") + + # Create a new dict for our new class. + cls_dict = dict(cls.__dict__) + field_names = tuple(f.name for f in dataclasses.fields(cls)) + cls_dict["__slots__"] = field_names + for field_name in field_names: + # Remove our attributes, if present. They'll still be + # available in _MARKER. + cls_dict.pop(field_name, None) + # Remove __dict__ itself. + cls_dict.pop("__dict__", None) + # And finally create the class. 
+ qualname = getattr(cls, "__qualname__", None) + cls = type(cls)(cls.__name__, cls.__bases__, cls_dict) + if qualname is not None: + cls.__qualname__ = qualname + return cls + + +def get_chunk_params(chunk): + from .dataframe.core import ( + DATAFRAME_CHUNK_TYPE, + DATAFRAME_GROUPBY_CHUNK_TYPE, + SERIES_GROUPBY_CHUNK_TYPE, + ) + + params = chunk.params.copy() + if isinstance( + chunk, + ( + DATAFRAME_CHUNK_TYPE, + DATAFRAME_GROUPBY_CHUNK_TYPE, + SERIES_GROUPBY_CHUNK_TYPE, + ), + ): + # dataframe chunk needs some special process for now + params.pop("columns_value", None) + params.pop("dtypes", None) + params.pop("key_dtypes", None) + return params + + +# Please refer to https://bugs.python.org/issue41451 +try: + + class _Dummy(ABC): + __slots__ = ("__weakref__",) + + abc_type_require_weakref_slot = True +except TypeError: + abc_type_require_weakref_slot = False + + +def patch_asyncio_task_create_time(): # pragma: no cover + new_loop = False + try: + loop = asyncio.get_running_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + new_loop = True + loop_class = loop.__class__ + # Save raw loop_class.create_task and make multiple apply idempotent + loop_create_task = getattr( + patch_asyncio_task_create_time, "loop_create_task", loop_class.create_task + ) + patch_asyncio_task_create_time.loop_create_task = loop_create_task + + def new_loop_create_task(*args, **kwargs): + task = loop_create_task(*args, **kwargs) + task.__mars_asyncio_task_create_time__ = time.time() + return task + + if loop_create_task is not new_loop_create_task: + loop_class.create_task = new_loop_create_task + if not new_loop and loop.create_task is not new_loop_create_task: + loop.create_task = functools.partial(new_loop_create_task, loop) + + +async def asyncio_task_timeout_detector( + check_interval: int, task_timeout_seconds: int, task_exclude_filters: List[str] +): + task_exclude_filters.append("asyncio_task_timeout_detector") + while True: # pragma: no cover + await asyncio.sleep(check_interval) + loop = asyncio.get_running_loop() + current_time = ( + time.time() + ) # avoid invoke `time.time()` frequently if we have plenty of unfinished tasks. + for task in asyncio.all_tasks(loop=loop): + # Some task may be create before `patch_asyncio_task_create_time` applied, take them as never timeout. + create_time = getattr( + task, "__mars_asyncio_task_create_time__", current_time + ) + if current_time - create_time >= task_timeout_seconds: + stack = io.StringIO() + task.print_stack(file=stack) + task_str = str(task) + if any( + excluded_task in task_str for excluded_task in task_exclude_filters + ): + continue + logger.warning( + """Task %s in event loop %s doesn't finish in %s seconds. %s""", + task, + loop, + time.time() - create_time, + stack.getvalue(), + ) + + +def register_asyncio_task_timeout_detector( + check_interval: int = None, + task_timeout_seconds: int = None, + task_exclude_filters: List[str] = None, +) -> Optional[asyncio.Task]: # pragma: no cover + """Register a asyncio task which print timeout task periodically.""" + check_interval = check_interval or int( + os.environ.get("MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_CHECK_INTERVAL", -1) + ) + if check_interval > 0: + patch_asyncio_task_create_time() + task_timeout_seconds = task_timeout_seconds or int( + os.environ.get("MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_SECONDS", check_interval) + ) + if not task_exclude_filters: + # Ignore mars/oscar by default since it has some long-running coroutines. 
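+# The timeout detector above is opt-in; a sketch of enabling it through the
+# environment variables read in this function (values are illustrative, and
+# the call must be made from a running event loop):
+#
+#     >>> os.environ["MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_CHECK_INTERVAL"] = "30"
+#     >>> os.environ["MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_SECONDS"] = "300"
+#     >>> register_asyncio_task_timeout_detector()  # logs tasks running > 300s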
+ task_exclude_filters = os.environ.get( + "MARS_DEBUG_ASYNCIO_TASK_EXCLUDE_FILTERS", "mars/oscar" + ) + task_exclude_filters = task_exclude_filters.split(";") + if sys.version_info[:2] < (3, 7): + logger.warning( + "asyncio tasks timeout detector is not supported under python %s", + sys.version, + ) + else: + loop = asyncio.get_running_loop() + logger.info( + "Create asyncio tasks timeout detector with check_interval %s task_timeout_seconds %s " + "task_exclude_filters %s", + check_interval, + task_timeout_seconds, + task_exclude_filters, + ) + return loop.create_task( + asyncio_task_timeout_detector( + check_interval, task_timeout_seconds, task_exclude_filters + ) + ) + else: + return None + + +def ensure_own_data(data: np.ndarray) -> np.ndarray: + if not isinstance(data, np.ndarray): + return data + if not data.flags["OWNDATA"]: + return data.copy() + else: + return data + + +def get_chunk_key_to_data_keys(chunk_graph): + from .core.operand import FetchShuffle, MapReduceOperand, OperandStage + + chunk_key_to_data_keys = dict() + for chunk in chunk_graph: + if chunk.key in chunk_key_to_data_keys: + continue + if not isinstance(chunk.op, FetchShuffle): + chunk_key_to_data_keys[chunk.key] = [chunk.key] + else: + keys = [] + for succ in chunk_graph.iter_successors(chunk): + if ( + isinstance(succ.op, MapReduceOperand) + and succ.op.stage == OperandStage.reduce + ): + for key in succ.op.get_dependent_data_keys(): + if key not in keys: + keys.append(key) + chunk_key_to_data_keys[chunk.key] = keys + return chunk_key_to_data_keys + + +def merge_dict(dest: Dict, src: Dict, path=None, overwrite=True): + """ + Merges src dict into dest dict. + + Parameters + ---------- + dest: Dict + dest dict + src: Dict + source dict + path: List + merge path + overwrite: bool + Whether overwrite dest dict when where is a conflict + Returns + ------- + Dict + Updated dest dict + """ + if path is None: + path = [] + for key in src: + if key in dest: + if isinstance(dest[key], Dict) and isinstance(src[key], Dict): + merge_dict(dest[key], src[key], path + [str(key)], overwrite=overwrite) + elif dest[key] == src[key]: + pass # same leaf value + elif overwrite: + dest[key] = src[key] + else: + raise ValueError("Conflict at %s" % ".".join(path + [str(key)])) + else: + dest[key] = src[key] + return dest + + +def flatten_dict_to_nested_dict(flatten_dict: Dict, sep=".") -> Dict: + """ + Return nested dict from flatten dict. + + Parameters + ---------- + flatten_dict: Dict + sep: str + flatten key separator + + Returns + ------- + Dict + Nested dict + """ + assert all(isinstance(k, str) for k in flatten_dict.keys()) + nested_dict = dict() + # longest path first to avoid shorter path has a leaf key with value dict + # as sub dict by mistake. 
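+# merge_dict defined above merges recursively; a hypothetical example:
+#
+#     >>> dest = {"a": {"x": 1}, "b": 2}
+#     >>> merge_dict(dest, {"a": {"y": 3}, "b": 20})
+#     {'a': {'x': 1, 'y': 3}, 'b': 20}
+#     >>> merge_dict(dest, {"b": 99}, overwrite=False)
+#     Traceback (most recent call last):
+#         ...
+#     ValueError: Conflict at b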
+ keys = sorted(flatten_dict.keys(), key=lambda k: -len(k.split(sep))) + for k in keys: + sub_keys = k.split(sep) + sub_nested_dict = nested_dict + for i, sub_key in enumerate(sub_keys): + if i == len(sub_keys) - 1: + if sub_key in sub_nested_dict: + raise ValueError(f"Key {k} conflict in sub key {sub_key}.") + sub_nested_dict[sub_key] = flatten_dict[k] + else: + if sub_key not in sub_nested_dict: + new_sub_nested_dict = dict() + sub_nested_dict[sub_key] = new_sub_nested_dict + sub_nested_dict = new_sub_nested_dict + else: + sub_nested_dict = sub_nested_dict[sub_key] + return nested_dict + + +def is_full_slice(slc: Any) -> bool: + """Check if the input is a full slice ((:) or (0:))""" + return ( + isinstance(slc, slice) + and (slc.start == 0 or slc.start is None) + and slc.stop is None + and slc.step is None + ) + + +def wrap_exception( + exc: Exception, + bases: Tuple[Type] = None, + wrap_name: str = None, + message: str = None, + traceback: Optional[TracebackType] = None, + attr_dict: dict = None, +): + """Generate an exception wraps the cause exception.""" + + def __init__(self): + pass + + def __getattr__(self, item): + return getattr(exc, item) + + def __str__(self): + return message or super(type(self), self).__str__() + + traceback = traceback or exc.__traceback__ + bases = bases or () + attr_dict = attr_dict or {} + attr_dict.update( + { + "__init__": __init__, + "__getattr__": __getattr__, + "__str__": __str__, + "__wrapname__": wrap_name, + "__wrapped__": exc, + "__module__": type(exc).__module__, + "__cause__": exc.__cause__, + "__context__": exc.__context__, + "__suppress_context__": exc.__suppress_context__, + "args": exc.args, + } + ) + new_exc_type = type(type(exc).__name__, bases + (type(exc),), attr_dict) + return new_exc_type().with_traceback(traceback) + + +_func_token_cache = weakref.WeakKeyDictionary() + + +def get_func_token(func): + try: + token = _func_token_cache.get(func) + if token is None: + fields = _get_func_token_values(func) + token = tokenize(*fields) + _func_token_cache[func] = token + return token + except TypeError: # cannot create weak reference to func like 'numpy.ufunc' + return tokenize(*_get_func_token_values(func)) + + +def _get_func_token_values(func): + if hasattr(func, "__code__"): + tokens = [func.__code__.co_code] + if func.__closure__ is not None: + cvars = tuple([x.cell_contents for x in func.__closure__]) + tokens.append(cvars) + return tokens + else: + tokens = [] + while isinstance(func, functools.partial): + tokens.extend([func.args, func.keywords]) + func = func.func + if hasattr(func, "__code__"): + tokens.extend(_get_func_token_values(func)) + elif isinstance(func, types.BuiltinFunctionType): + tokens.extend([func.__module__, func.__name__]) + else: + tokens.append(func) + return tokens + + +async def _run_task_with_error_log( + coro, call_site=None, exit_if_exception=False +): # pragma: no cover + try: + return await coro + except asyncio.CancelledError: + raise + except Exception as e: + logger.exception( + "Coroutine %r at call_site %s execution got exception %s.", + coro, + call_site, + e, + ) + if exit_if_exception: + logger.error("Exit because exit_if_exception=%s.", exit_if_exception) + os._exit(-1) # Use os._exit to ensure exit in non-main thread. 
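+# is_full_slice defined above accepts both spellings of a full slice:
+#
+#     >>> is_full_slice(slice(None))       # [:]
+#     True
+#     >>> is_full_slice(slice(0, None))    # [0:]
+#     True
+#     >>> is_full_slice(slice(1, None))
+#     False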
+ raise + + +def create_task_with_error_log(coro, *args, **kwargs): # pragma: no cover + frame = inspect.currentframe() + if frame and frame.f_back: + call_site = frame.f_back.f_code + else: + call_site = None + return _create_task(_run_task_with_error_log(coro, call_site), *args, **kwargs) + + +def aiotask_wrapper(_f=None, exit_if_exception=False): + def _wrapper(func): + @functools.wraps(func) + def _aiotask_wrapper(*args, **kwargs): + frame = inspect.currentframe() + if frame and frame.f_back: + call_site = frame.f_back.f_code + else: + call_site = None + return _run_task_with_error_log( + func(*args, **kwargs), + call_site=call_site, + exit_if_exception=exit_if_exception, + ) + + return _aiotask_wrapper + + if inspect.iscoroutinefunction(_f): + return _wrapper(_f) + else: + assert _f is None + return _wrapper + + +def is_ray_address(address: str) -> bool: + from .oscar.backends.ray.communication import RayServer + + if urlparse(address).scheme == RayServer.scheme: + return True + else: + return False + + +# TODO: clean_up_func, is_on_ray and restore_func functions may be +# removed or refactored in the future to calculate func size +# with more accuracy as well as address some serialization issues. +def is_on_ray(ctx): + from .services.task.execution.ray.context import ( + RayExecutionContext, + RayExecutionWorkerContext, + ) + + # There are three conditions + # a. mars backend + # b. ray backend(oscar), c. ray backend(dag) + # When a. or b. is selected, ctx is an instance of ThreadedServiceContext. + # The main difference between them is whether worker address matches ray scheme. + # To avoid duplicated checks, here we choose the first worker address. + # When c. is selected, ctx is an instance of RayExecutionContext or RayExecutionWorkerContext, + # while get_worker_addresses method isn't currently implemented in RayExecutionWorkerContext. 
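+# is_ray_address defined above keys off the URL scheme (assuming the Ray
+# backend's ``ray://`` scheme; addresses are hypothetical):
+#
+#     >>> is_ray_address("ray://ray-cluster/0")
+#     True
+#     >>> is_ray_address("127.0.0.1:16931")
+#     False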
+ try: + worker_addresses = ctx.get_worker_addresses() + except AttributeError: # pragma: no cover + assert isinstance(ctx, RayExecutionWorkerContext) + return True + return isinstance(ctx, RayExecutionContext) or is_ray_address(worker_addresses[0]) + + +def cache_tileables(*tileables): + from .core import ENTITY_TYPE + + if len(tileables) == 1 and isinstance(tileables[0], (tuple, list)): + tileables = tileables[0] + for t in tileables: + if isinstance(t, ENTITY_TYPE): + t.cache = True + + +class TreeReductionBuilder: + def __init__(self, combine_size=None): + from .config import options + + self._combine_size = combine_size or options.combine_size + + def _build_reduction(self, inputs, final=False): + raise NotImplementedError + + def build(self, inputs): + combine_size = self._combine_size + while len(inputs) > self._combine_size: + new_inputs = [] + for i in range(0, len(inputs), combine_size): + objs = inputs[i : i + combine_size] + if len(objs) == 1: + obj = objs[0] + else: + obj = self._build_reduction(objs, final=False) + new_inputs.append(obj) + inputs = new_inputs + + if len(inputs) == 1: + return inputs[0] + return self._build_reduction(inputs, final=True) + + +def ensure_coverage(): + # make sure coverage is handled when starting with subprocess.Popen + if ( + not sys.platform.startswith("win") and "COV_CORE_SOURCE" in os.environ + ): # pragma: no cover + try: + from pytest_cov.embed import cleanup_on_sigterm + except ImportError: + pass + else: + cleanup_on_sigterm() + + +@functools.lru_cache(100) +def sync_to_async(func): + if inspect.iscoroutinefunction(func): + return func + else: + # Wrap the sync call to thread to avoid blocking the + # asyncio event loop. e.g. acquiring a threading.Lock() + # in the sync call. + return functools.partial(asyncio.to_thread, func) + + +def retry_callable( + callable_, + ex_type: type = Exception, + wait_interval=1, + max_retries=-1, + sync: bool = None, +): + if inspect.iscoroutinefunction(callable_) or sync is False: + + @functools.wraps(callable) + async def retry_call(*args, **kwargs): + num_retried = 0 + while max_retries < 0 or num_retried < max_retries: + num_retried += 1 + try: + return await callable_(*args, **kwargs) + except ex_type: + await asyncio.sleep(wait_interval) + + else: + + @functools.wraps(callable) + def retry_call(*args, **kwargs): + num_retried = 0 + ex = None + while max_retries < 0 or num_retried < max_retries: + num_retried += 1 + try: + return callable_(*args, **kwargs) + except ex_type as e: + ex = e + time.sleep(wait_interval) + assert ex is not None + raise ex # pylint: disable-msg=E0702 + + return retry_call + + +def clean_mars_tmp_dir(): + # clean Mars log file and Mars tmp dir + filename = os.environ.get(MARS_LOG_PATH_KEY) + if filename is not None: + os.environ.pop(MARS_LOG_PATH_KEY) + if os.path.exists(filename): + mars_tmp_dir = os.path.dirname(filename) + if os.path.exists(mars_tmp_dir): + # on windows platform, raise Permission Error + _windows: bool = sys.platform.startswith("win") + shutil.rmtree(mars_tmp_dir, ignore_errors=_windows) diff --git a/python/xorbits/_mars/worker.py b/python/xorbits/_mars/worker.py new file mode 100644 index 000000000..3bb850485 --- /dev/null +++ b/python/xorbits/_mars/worker.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# shortcut to support +# python -m mars.worker + +from .deploy.oscar.worker import main +from .utils import ensure_coverage + +if __name__ == "__main__": + ensure_coverage() + main() diff --git a/python/xorbits/core/adapter.py b/python/xorbits/core/adapter.py index 2089035fd..a192226fe 100644 --- a/python/xorbits/core/adapter.py +++ b/python/xorbits/core/adapter.py @@ -243,7 +243,7 @@ def to_mars(inp: Union[DataRef, Tuple, List, Dict]): if isinstance(inp, DataRef): mars_entity = getattr(inp.data, "_mars_entity", None) if mars_entity is None: # pragma: no cover - raise TypeError(f"Can't covert {inp} to mars entity") + raise TypeError(f"Can't convert {inp} to mars entity") conditions = _TO_MARS_EXECUTION_CONDITION[type(mars_entity).__name__] for cond in conditions: if cond(mars_entity): diff --git a/python/xorbits/core/data.py b/python/xorbits/core/data.py index 47a8ed6ec..877b15547 100644 --- a/python/xorbits/core/data.py +++ b/python/xorbits/core/data.py @@ -185,7 +185,7 @@ def _own_data(self): def __iter__(self): # Mars entity hasn't implemented __iter__, however `iter(mars_entity)` # still works, it's because iteration is supported by `__getitem__` that - # accepts intergers 0,1,.., it can be seen as a "legacy feature" that not + # accepts integers 0,1,.., it can be seen as a "legacy feature" that not # recommended. Here we implement __iter__ for some data types, others keep # behaviors with Mars. if self._own_data(): diff --git a/python/xorbits/deploy/__init__.py b/python/xorbits/deploy/__init__.py index b43f33025..7f3e5d391 100644 --- a/python/xorbits/deploy/__init__.py +++ b/python/xorbits/deploy/__init__.py @@ -51,7 +51,7 @@ def init( session_id: str, optional Session ID, if not specified, a new ID will be auto generated. timeout: float - Timeout about creating a new runtime or connecting to an exising cluster. + Timeout about creating a new runtime or connecting to an existing cluster. n_worker: int, optional How many workers to start when creating a local runtime. diff --git a/third_party/_mars b/third_party/_mars deleted file mode 160000 index a99b5a1d2..000000000 --- a/third_party/_mars +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a99b5a1d2e1183a58d771d56a3aa57196417cb5c